pytorch - pytorch LSTM 的损失没有减少
问题描述
我是 pytorch 的新手,并在 lstm 实施方面寻求您的帮助。我有一个单层 LSTM,后跟一个全连接层和 sigmoid(实现深度知识跟踪)。
示例输入输出对如下,输入 =
[[1, 0, 0, 0],
[0, 1, 0, 0],
[0, 0, 1, 0],
[0, 0, 0, 1]]
输出= [[1, 0],
[0, 0],
[0, 1],
[-1, -1]]
我有以下 LSTM 代码,并希望将二进制交叉熵计算为损失。然而,损失并没有随着时间的推移而减少。我已经关注了几个博客来实现这一点,我认为这是正确的。我的代码有什么问题吗?如果您能花几分钟查看代码并帮助提出问题是否有问题,那就太好了。
class BuildModel(nn.Module):
on_gpu = False
def __init__(self, num_skills, batch_size = 2, lstm_units = 200):
super(BuildModel, self).__init__()
self.lstm_units = lstm_units
self.batch_size = batch_size
self.output_dim = num_skills
self.input_dim = num_skills * 2
self.seq_len = seq_len
self.__build_model()
def __build_model(self):
self.lstm = nn.LSTM(
input_size = self.input_dim,
hidden_size = self.lstm_units,
num_layers = 1,
batch_first = True,
)
self.hidden_to_skills = nn.Linear(self.lstm_units, self.output_dim)
def init_hidden(self):
hidden_a = torch.randn(1, self.batch_size, self.lstm_units)
hidden_b = torch.randn(1, self.batch_size, self.lstm_units)
if self.on_gpu:
hidden_a = hidden_a.cuda()
hidden_b = hidden_b.cuda()
hidden_a = Variable(hidden_a, requires_grad=True)
hidden_b = Variable(hidden_b, requires_grad=True)
return (hidden_a, hidden_b)
def forward(self, X, X_lengths):
self.hidden = self.init_hidden()
batch_size, seq_len, _ = X.size()
## X is of shape batch_size, sequence_length, num_skills*2
X = torch.nn.utils.rnn.pack_padded_sequence(X, X_lengths, batch_first=True, enforce_sorted=False)
X, self.hidden = self.lstm(X, self.hidden)
X, _ = torch.nn.utils.rnn.pad_packed_sequence(X, batch_first=True)
# Transfer data from (batch_size, seq+len, lstm_units) --> (batch_size * seq_len, lstm_units)
X = X.contiguous()
X = X.view(-1, X.shape[2])
X = self.hidden_to_skills(X)
X = torch.nn.functional.sigmoid(X)
# return the predictions
return X
def loss(self, Y_hat, Y, threshold, seq_len):
#flatten labels
Y = Y.view(-1)
Y_hat = Y_hat.view(-1, seq_len * self.output_dim * self.batch_size)
mask = (Y > -1).float()
mask_long = (Y > -1).long()
nb_tokens = int(torch.sum(mask).item())
Y_hat = Y_hat[range(Y_hat.shape[0])] * mask
Y_hat = Y_hat.reshape(-1)
Y = Y[range(Y.shape[0])] * mask_long
Y = Y.float()
Y_hat = (Y_hat > 0.0).float() * 1.0
loss = torch.nn.BCELoss()
ce_loss = loss(Y_hat, Y)
return Variable(ce_loss, requires_grad = True)
### For training
learning_rate = .1
total_step = len(loader)
model = BuildModel(num_skills, 2, 200)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
for epoch in range(num_epochs):
overall_loss = 0.0
for i, (X, y, lengths, seq_len) in enumerate(loader):
X = torch.from_numpy(X).float()
y = torch.from_numpy(y).long()
outputs = model(X, lengths)
loss = model.loss(outputs, y, 0.5, seq_len)
optimizer.zero_grad()
loss.backward()
optimizer.step()
解决方案
推荐阅读
- json - 弹性搜索如何索引嵌套列表
- c++ - C ++定义一个宏,它接受一个函数并将其传递给其他函数,这可能吗?
- java - Java 8 PriorityQueue 比较器做错了什么?
- html - 如何使图像网格适合所有屏幕分辨率?
- python - Python - 将两个单列列表合并为一个双列列表并打印
- google-cloud-platform - 带有 JWT 令牌的 Google Cloud Storage JSON API
- javascript - Javascript:单击后更改按钮文本更改
- awk - 如何使用 awk 仅打印在某个字段具有最小值的匹配项?
- r - 在 R 中使用循环创建多个图
- linq-to-sql - Linq to entity 添加 Where() 子句会中断查询