python - “RuntimeError: 张量 (118) 的扩展大小必须与非单维 1 处的现有大小 (135) 匹配” Pytorch 机器翻译
问题描述
使用这个德语到英语 Pytorch Seq2Seq 机器翻译,我一直在尝试使用自定义数据集创建中古英语到现代英语 Seq2Seq 机器翻译。除了创建自定义数据集之外,我实际上复制并粘贴了德语模板,并用我(中古英语)替换了德语单词,用 pdf(现在的英语)替换了英语,但是在尝试训练模型时我收到了两条错误消息. 当我运行德语版本的代码时,它训练得很好。以下是我的代码和两条错误消息。
我的代码:
# imports
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext import data, datasets
from torchtext.data import Field, BucketIterator, TabularDataset
import numpy as np # other useful math-related libraries and modules
import spacy # this is where he gets his nlp datasets
import spacy.cli
import random
from utils2 import translate_sentence, bleu, save_checkpoint, load_checkpoint
spacy_eng = spacy.load('en') # loading up the English tokenizer
def tokenizer_eng(text): # English tokenizer function for the fields
return [tok.text for tok in spacy_eng.tokenizer(text)]
# constructing the fields for pde and me
me = Field(sequential=True, use_vocab=True, tokenize=tokenizer_eng, lower=True, init_token='sos', eos_token='<eos>')
pde = Field(sequential=True, use_vocab=True, tokenize=tokenizer_eng, lower=True, init_token='sos', eos_token='<eos>')
fields = {'Middle English': ('me', me), 'Present Day English': ('pde', pde)}
train_data, validation_data, test_data = TabularDataset.splits(
path='/Users/user/Desktop/Code/datasets',
train='train.tsv',
test='test.tsv',
validation='validation.tsv',
format='tsv',
fields=fields
)
#print(test_data[0].__dict__.keys())
#print(validation_data[1].__dict__.values())
# Building the vocabulary
me.build_vocab(train_data)
pde.build_vocab(train_data)
# Building the seq2seq and encoder decoder models
class Encoder(nn.Module): # first LSTM
def __init__(self, input_size, embedding_size, hidden_size, num_layers, p):
super(Encoder, self).__init__()
self.hidden_size = hidden_size
self.num_layers = num_layers
self.dropout = nn.Dropout(p)
self.embedding = nn.Embedding(input_size, embedding_size)
self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=p)
def forward(self, x): # defines the vector of indices
# x vector shape: (seq_length, N)
embedding = self.dropout(self.embedding(x))
# embedding vector shape: (seq_length, N, embedding_size)
outputs, (hidden, cell) = self.rnn(embedding)
return hidden, cell
class Decoder(nn.Module):
def __init__(self, input_size, embedding_size, hidden_size, output_size, num_layers, p):
# output_size should be the same as input_size, because the size of the vocabulary should change.
super(Decoder, self).__init__()
self.hidden_size = hidden_size
self.num_layers = num_layers
self.dropout = nn.Dropout(p)
self.embedding = nn.Embedding(input_size, embedding_size)
self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=p)
self.fc = nn.Linear(hidden_size, output_size) # fc stands for fully connected
def forward(self, x, hidden, cell):
x = x.unsqueeze(0) #this adds another dimension
embedding = self.dropout(self.embedding(x))
# embedding shape: (1, N, embedding_size)
outputs, (hidden, cell) = self.rnn(embedding, (hidden, cell))
# outputs is what we think this next word should be
# shape of the outputs: (1, N, hidden_size)
predictions = self.fc(outputs)
# shape of predictions: (1, N, length_of_vocab)
# this will be sent to the loss function
predictions = predictions.squeeze(0)
return predictions, hidden, cell
class Seq2Seq(nn.Module): # combines the encoder and decoder
def __init__(self, encoder, decoder):
super(Seq2Seq, self).__init__()
self.encoder = encoder
self.decoder = decoder
def forward(self, source, target, teacher_force_ratio=0.5):
batch_size = source.shape[1]
# batch_size should look like (trg_len, N)
target_len = target.shape[0]
target_vocab_size = len(pde.vocab) #make sure to change this to pde
outputs = torch.zeros(target_len, batch_size, target_vocab_size).to(device)
# predicts one word at a time, but each word predicts an entire batch and every prediction is a vector of the entire vocabulary size
hidden, cell = self.encoder(source)
x = target[0]
for t in range(1, target_len):
output, hidden, cell = self.decoder(x, hidden, cell)
outputs[t] = output
# the output will look like (N, english_vocab_size)
best_guess = output.argmax(1)
x = target[t] if random.random() < teacher_force_ratio else best_guess
return outputs
# hyperparameters for the training model
num_epochs = 2
learning_rate = 3e-4
batch_size = 32 # batch size must be smaller than total amount of data
# model hyperparameters
save_model = True
load_model = False
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
input_size_encoder = len(me.vocab)
input_size_decoder = len(pde.vocab)
output_size = len(me.vocab)
encoder_embedding_size = 50
decoder_embedding_size = 50
hidden_size = 100
num_layers = 2
enc_dropout = 0.5
dec_dropout = 0.5
# writing iterators
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
(train_data, validation_data, test_data), # make sure these are in same order as declared variables
batch_size=batch_size,
sort_within_batch = True,
sort_key = lambda x: len(x.me), # formerly x.src
device=device
)
# running the encoder decoder models
encoder_net = Encoder(input_size_encoder, encoder_embedding_size, hidden_size, num_layers, enc_dropout).to(device)
decoder_net = Decoder(input_size_decoder, decoder_embedding_size, hidden_size, output_size, num_layers, dec_dropout).to(device)
model = Seq2Seq(encoder_net, decoder_net).to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
pad_idx = pde.vocab.stoi['<pad>']
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
if load_model:
load_checkpoint(torch.load("my_checkpoint2.pth.tar"), model, optimizer)
# This is an example sentence.
sentence = (
"Is owr dyner dyght?"
)
for epoch in range(num_epochs):
print(f'Epoch [{epoch} / {num_epochs}]')
if save_model:
checkpoint = {
"state_dict": model.state_dict(),
"optimizer": optimizer.state_dict(),
}
save_checkpoint(checkpoint)
model.eval()
translated_sentence = translate_sentence(
model, sentence, me, pde, device, max_length=50
)
print(f"Translated example sentence: \n {translated_sentence}")
model.train()
for batch_idx, batch in enumerate(train_iterator):
inp_data = batch.me.to(device)
target = batch.pde.to(device)
# this was formerly batch.src and batch.trg
output = model(inp_data, target)
# Output is of shape (trg_len, batch_size, output_dim) but Cross Entropy Loss
# doesn't take input in that form. Here we can view it in a similar
# way so we have output_words * batch_size that we want to send in into
# our cost function, so we need to do some reshaping."
output = output[1:].reshape(-1, output.shape[2])
target = target[1:].reshape(-1)
optimizer.zero_grad()
loss = criterion(output, target)
loss.backward()
optimizer.step()
print("run complete")
错误一我认为此错误主要与输出未按应有的形状有关,但我不知道如何使其成为正确的形状。我认为问题可能在于我如何创建数据集、如何创建 Seq2Seq2 类或程序的最后一个 for 循环。
Traceback (most recent call last):
File "/Users/user/PycharmProjects/pythonProject/ME-PDE.py", line 250, in <module>
output = model(inp_data, target)
File "/Users/user/.conda/envs/pythonProject/lib/python3.8/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/Users/user/PycharmProjects/pythonProject/ME-PDE.py", line 150, in forward
outputs[t] = output
RuntimeError: The expanded size of the tensor (118) must match the existing size (135) at non-singleton dimension 1. Target sizes: [20, 118]. Tensor sizes: [20, 135]
错误二 这个错误有时会随机出现。我认为这可能是因为我的超参数对于我的样本量来说太大了,但我不确定为什么有时只会出现此错误消息而不是第一个错误消息。
Traceback (most recent call last):
File "/Users/user/PycharmProjects/pythonProject/ME-PDE.py", line 235, in <module>
translated_sentence = translate_sentence(
File "/Users/user/PycharmProjects/pythonProject/utils2.py", line 53, in translate_sentence
output, hidden, cell = model.decoder(previous_word, hidden, cell)
File "/Users/user/.conda/envs/pythonProject/lib/python3.8/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/Users/user/PycharmProjects/pythonProject/ME-PDE.py", line 105, in forward
embedding = self.dropout(self.embedding(x))
File "/Users/user/.conda/envs/pythonProject/lib/python3.8/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/Users/user/.conda/envs/pythonProject/lib/python3.8/site-packages/torch/nn/modules/sparse.py", line 124, in forward
return F.embedding(
File "/Users/user/.conda/envs/pythonProject/lib/python3.8/site-packages/torch/nn/functional.py", line 1852, in embedding
return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
IndexError: index out of range in self
到目前为止,我在 test.tsv 中只有 25 个翻译,在 train.tsv 中只有 20 个,在 validation.tsv 中只有 5 个。我会得到更多,但我希望程序能够首先成功运行。来自validation.tsv 的示例
Middle English Present Day English
Than make I buter ferther on the day Then I make butter later in the day.
Ye wold say, 'they be prowde!' You would say, ‘they are proud!’
Whyll yow slepe fulle stylle, While you sleep soundly,
Kype wylle owr chelderne and let them not wepe. Keep our children well and don’t let them weep.
Yet I have not a feyr word whan that I have done. Yet I don’t get any kind words when I have done that.
非常感谢您提供的任何帮助!
解决方案
推荐阅读
- javascript - 使用 map() 从 firebase 渲染对象的内容
- javascript - 我可以在使用 ChartJS 的 OHLC/烛台图表中使用十字准线吗
- swift - 如何获取用于构建扩展的数组的 ElementType?
- php - 解析来自 SOAP 的 curl 响应
- java - 通过 AWS Java SDK 访问 S3 的问题
- python-3.x - 我如何编写一个程序来读取一个数字,然后使用嵌套循环打印出一个三角形的#s?
- bash - 在 Bash 脚本的 For 循环中保持 PBS 作业
- flutter - 如何使用颤振获取实时相机的颜色代码?
- python - 如何从数组 discord.py 中对用户进行 dm
- google-chrome - 如何通过启动在 Devtools 中打开自定义 Chrome 扩展?