使用这个德语到英语 Pytorch Seq2Seq 机器翻译,我一直在尝试使用自定义数据集创建中古英语到现代英语 Seq2Seq 机器翻译。除了创建自定义数据集之外,我实际上复制并粘贴了德语模板,并用我(中古英语)替换了德语单词,用 pdf(现在​​的英语)替换了英语,但是在尝试训练模型时我收到了两条错误消息. 当我运行德语版本的代码时,它训练得很好。以下是我的代码和两条错误消息。


# imports
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext import data, datasets
from torchtext.data import Field, BucketIterator, TabularDataset
import numpy as np # other useful math-related libraries and modules
import spacy # this is where he gets his nlp datasets
import spacy.cli
import random
from utils2 import translate_sentence, bleu, save_checkpoint, load_checkpoint

spacy_eng = spacy.load('en') # loading up the English tokenizer
def tokenizer_eng(text): # English tokenizer function for the fields
  return [tok.text for tok in spacy_eng.tokenizer(text)]

# constructing the fields for pde and me
me = Field(sequential=True, use_vocab=True, tokenize=tokenizer_eng, lower=True, init_token='sos', eos_token='<eos>')
pde = Field(sequential=True, use_vocab=True, tokenize=tokenizer_eng, lower=True, init_token='sos', eos_token='<eos>')

fields = {'Middle English': ('me', me), 'Present Day English': ('pde', pde)}

train_data, validation_data, test_data = TabularDataset.splits(


# Building the vocabulary

# Building the seq2seq and encoder decoder models
class Encoder(nn.Module): # first LSTM
    def __init__(self, input_size, embedding_size, hidden_size, num_layers, p):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = nn.Dropout(p)
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=p)
    def forward(self, x): # defines the vector of indices
        # x vector shape: (seq_length, N)
        embedding = self.dropout(self.embedding(x))
        # embedding vector shape: (seq_length, N, embedding_size)
        outputs, (hidden, cell) = self.rnn(embedding)
        return hidden, cell

class Decoder(nn.Module):
  def __init__(self, input_size, embedding_size, hidden_size, output_size, num_layers, p):
      # output_size should be the same as input_size, because the size of the vocabulary should change.
      super(Decoder, self).__init__()
      self.hidden_size = hidden_size
      self.num_layers = num_layers

      self.dropout = nn.Dropout(p)
      self.embedding = nn.Embedding(input_size, embedding_size)
      self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=p)
      self.fc = nn.Linear(hidden_size, output_size) # fc stands for fully connected

  def forward(self, x, hidden, cell):
      x = x.unsqueeze(0) #this adds another dimension

      embedding = self.dropout(self.embedding(x))
      # embedding shape: (1, N, embedding_size)

      outputs, (hidden, cell) = self.rnn(embedding, (hidden, cell))
      # outputs is what we think this next word should be
      # shape of the outputs: (1, N, hidden_size)

      predictions = self.fc(outputs)
      # shape of predictions: (1, N, length_of_vocab)
      # this will be sent to the loss function

      predictions = predictions.squeeze(0)

      return predictions, hidden, cell

class Seq2Seq(nn.Module): # combines the encoder and decoder
  def __init__(self, encoder, decoder):
      super(Seq2Seq, self).__init__()
      self.encoder = encoder
      self.decoder = decoder

  def forward(self, source, target, teacher_force_ratio=0.5):
      batch_size = source.shape[1]
      # batch_size should look like (trg_len, N)
      target_len = target.shape[0]
      target_vocab_size = len(pde.vocab) #make sure to change this to pde

      outputs = torch.zeros(target_len, batch_size, target_vocab_size).to(device)
      # predicts one word at a time, but each word predicts an entire batch and every prediction is a vector of the entire vocabulary size

      hidden, cell = self.encoder(source)

      x = target[0]

      for t in range(1, target_len):
          output, hidden, cell = self.decoder(x, hidden, cell)

          outputs[t] = output

          # the output will look like (N, english_vocab_size)
          best_guess = output.argmax(1)

          x = target[t] if random.random() < teacher_force_ratio else best_guess

      return outputs

# hyperparameters for the training model
num_epochs = 2
learning_rate = 3e-4
batch_size = 32 # batch size must be smaller than total amount of data

# model hyperparameters
save_model = True
load_model = False
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
input_size_encoder = len(me.vocab)
input_size_decoder = len(pde.vocab)
output_size = len(me.vocab)
encoder_embedding_size = 50
decoder_embedding_size = 50
hidden_size = 100
num_layers = 2
enc_dropout = 0.5
dec_dropout = 0.5

# writing iterators
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, validation_data, test_data), # make sure these are in same order as declared variables
    sort_within_batch = True,
    sort_key = lambda x: len(x.me), # formerly x.src

# running the encoder decoder models
encoder_net = Encoder(input_size_encoder, encoder_embedding_size, hidden_size, num_layers, enc_dropout).to(device)
decoder_net = Decoder(input_size_decoder, decoder_embedding_size, hidden_size, output_size, num_layers, dec_dropout).to(device)

model = Seq2Seq(encoder_net, decoder_net).to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

pad_idx = pde.vocab.stoi['<pad>']
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

if load_model:
    load_checkpoint(torch.load("my_checkpoint2.pth.tar"), model, optimizer)

# This is an example sentence.
sentence = (
    "Is owr dyner dyght?"

for epoch in range(num_epochs):
    print(f'Epoch [{epoch} / {num_epochs}]')

    if save_model:
        checkpoint = {
            "state_dict": model.state_dict(),
            "optimizer": optimizer.state_dict(),


    translated_sentence = translate_sentence(
        model, sentence, me, pde, device, max_length=50

    print(f"Translated example sentence: \n {translated_sentence}")


    for batch_idx, batch in enumerate(train_iterator):
        inp_data = batch.me.to(device)
        target = batch.pde.to(device)
        # this was formerly batch.src and batch.trg

        output = model(inp_data, target)
        # Output is of shape (trg_len, batch_size, output_dim) but Cross Entropy Loss
        # doesn't take input in that form. Here we can view it in a similar
        # way so we have output_words * batch_size that we want to send in into
        # our cost function, so we need to do some reshaping."
        output = output[1:].reshape(-1, output.shape[2])
        target = target[1:].reshape(-1)

        loss = criterion(output, target)



print("run complete")

错误一我认为此错误主要与输出未按应有的形状有关,但我不知道如何使其成为正确的形状。我认为问题可能在于我如何创建数据集、如何创建 Seq2Seq2 类或程序的最后一个 for 循环。

Traceback (most recent call last):
  File "/Users/user/PycharmProjects/pythonProject/ME-PDE.py", line 250, in <module>
    output = model(inp_data, target)
  File "/Users/user/.conda/envs/pythonProject/lib/python3.8/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/Users/user/PycharmProjects/pythonProject/ME-PDE.py", line 150, in forward
    outputs[t] = output
RuntimeError: The expanded size of the tensor (118) must match the existing size (135) at non-singleton dimension 1.  Target sizes: [20, 118].  Tensor sizes: [20, 135]

错误二 这个错误有时会随机出现。我认为这可能是因为我的超参数对于我的样本量来说太大了,但我不确定为什么有时只会出现此错误消息而不是第一个错误消息。

Traceback (most recent call last):
  File "/Users/user/PycharmProjects/pythonProject/ME-PDE.py", line 235, in <module>
    translated_sentence = translate_sentence(
  File "/Users/user/PycharmProjects/pythonProject/utils2.py", line 53, in translate_sentence
    output, hidden, cell = model.decoder(previous_word, hidden, cell)
  File "/Users/user/.conda/envs/pythonProject/lib/python3.8/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/Users/user/PycharmProjects/pythonProject/ME-PDE.py", line 105, in forward
    embedding = self.dropout(self.embedding(x))
  File "/Users/user/.conda/envs/pythonProject/lib/python3.8/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/Users/user/.conda/envs/pythonProject/lib/python3.8/site-packages/torch/nn/modules/sparse.py", line 124, in forward
    return F.embedding(
  File "/Users/user/.conda/envs/pythonProject/lib/python3.8/site-packages/torch/nn/functional.py", line 1852, in embedding
    return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
IndexError: index out of range in self

到目前为止,我在 test.tsv 中只有 25 个翻译,在 train.tsv 中只有 20 个,在 validation.tsv 中只有 5 个。我会得到更多,但我希望程序能够首先成功运行。来自validation.tsv 的示例

Middle English  Present Day English
Than make I buter ferther on the day    Then I make butter later in the day.
Ye wold say, 'they be prowde!'  You would say, ‘they are proud!’
Whyll yow slepe fulle stylle,   While you sleep soundly,
Kype wylle owr chelderne and let them not wepe. Keep our children well and don’t let them weep.
Yet I have not a feyr word whan that I have done.   Yet I don’t get any kind words when I have done that.


