python - 为什么 PyTorch 会出现这个 CUDA 错误?
问题描述
在构建并训练 RNN 模型的过程中,我遇到了一个 CUDA 运行时错误(完整的错误信息见下文)。以下是我的部分代码:
class RNN(nn.Module):
    """Single-layer bidirectional LSTM classifier over embedded token ids.

    The vocabulary size, embedding width and dropout probability are read
    from the module-level names ``emb_num``, ``emb_size`` and
    ``dropout_rate``, which are defined outside this block.
    """

    def __init__(self):
        super().__init__()
        self.embedding = nn.Embedding(emb_num, emb_size)
        self.dropout1 = nn.Dropout(dropout_rate)
        # The LSTM is fed the embedding output directly, so its input size
        # must equal emb_size (this was previously hard-coded to 50).
        self.LSTM = nn.LSTM(emb_size, 128, 1, bidirectional=True)
        self.dropout2 = nn.Dropout(dropout_rate)
        self.full_connect = nn.Linear(256, 5)  # biLSTM state * 2

    def forward(self, x):
        """Return the 5-way class scores for a batch of token-id sequences.

        Args:
            x: Integer tensor of token ids; presumably laid out as
                (batch, seq_len) -- confirm against the data loader.

        Returns:
            A 2-D tensor with one row of 5 scores per sample. The batch
            dimension is always kept, even for a batch of one sample.
        """
        x = self.embedding(x)
        # nn.LSTM defaults to batch_first=False, so move the sequence axis
        # to the front.
        x = x.permute(1, 0, 2)
        x = self.dropout1(x)
        _, (hn, _) = self.LSTM(x)
        out = self.dropout2(hn)
        # hn stacks the final hidden state of each direction along dim 0;
        # join them feature-wise into one (batch, 256) tensor.
        out = torch.cat([out[0], out[1]], dim=1)
        # No squeeze() here: the tensor is already 2-D, and squeezing would
        # drop the batch dimension whenever a batch contains one sample.
        return self.full_connect(out)
def train():
    """Train the global ``model`` and checkpoint it along the way.

    Runs ``epochs`` passes over ``trainloader`` with Adam and cross-entropy
    loss, evaluates on ``validloader`` after every epoch via ``valid``, and
    writes three checkpoints: ``'epoch50.pt'`` after the 50th epoch,
    ``'max_acc model.pt'`` whenever the validation accuracy improves on the
    best value seen so far, and ``'final model.pt'`` once training ends.

    Relies on the module-level names ``model``, ``device``, ``epochs``,
    ``learning_rate``, ``trainloader`` and ``validloader``.
    """
    optimizer = torch.optim.Adam(
        model.parameters(), lr=learning_rate, weight_decay=0.001
    )
    criterion = nn.CrossEntropyLoss()
    # Track the best accuracy across ALL epochs. Resetting it inside the
    # loop would overwrite the "best" checkpoint on nearly every epoch.
    max_acc = 0
    for epoch in range(epochs):
        model.train()
        print('epoch:{}'.format(epoch + 1))
        for X_train, y_train in trainloader:
            optimizer.zero_grad()
            X_train = X_train.long().to(device)
            y_train = y_train.long().to(device)
            output = model(X_train)
            loss = criterion(output, y_train)
            loss.backward()
            optimizer.step()
        # NOTE(review): the original indentation was lost; the loss of the
        # last batch is reported once per epoch here -- confirm this is the
        # intended placement.
        print('loss:{:3f}'.format(loss))
        model.eval()
        acc = valid(validloader)
        print('epoch:{} acc:{}'.format(epoch + 1, acc))
        if epoch + 1 == 50:
            torch.save(model.state_dict(), 'epoch50.pt')
        if acc > max_acc:
            max_acc = acc
            torch.save(model.state_dict(), 'max_acc model.pt')
    torch.save(model.state_dict(), 'final model.pt')
def valid(dataloader):
    """Return the accuracy of the global ``model`` on *dataloader*.

    Each item of *dataloader* is unpacked as an ``(inputs, labels)`` pair;
    both are cast to ``long`` and moved to the global ``device``. Gradients
    are disabled during evaluation. The function does not switch the model
    between train and eval mode; the caller is expected to do that.

    Args:
        dataloader: Iterable of ``(inputs, labels)`` batches.

    Returns:
        The fraction of samples whose arg-max prediction equals the label,
        or ``0.0`` when the dataloader yields no samples.
    """
    correct = 0
    total = 0
    with torch.no_grad():
        for X_batch, y_batch in dataloader:
            X_batch = X_batch.long().to(device)
            y_batch = y_batch.long().to(device)
            output = model(X_batch)
            predictions = torch.argmax(output, dim=1)
            correct += (predictions == y_batch).sum().item()
            total += y_batch.shape[0]
    # Avoid a ZeroDivisionError on an empty validation set.
    if total == 0:
        return 0.0
    return correct / total
在上面的代码中,我划分了一个验证集(开发集),用来在训练过程中评估模型。但是在训练 4 个或更多 epoch 之后,出现了以下错误:
Traceback (most recent call last):
File "c:\Users\hhhh\Desktop\NLP-beginner\task2\task2.py", line 287, in <module>
train()
File "c:\Users\hhhh\Desktop\NLP-beginner\task2\task2.py", line 185, in train
acc = valid(validloader)
File "c:\Users\hhhh\Desktop\NLP-beginner\task2\task2.py", line 207, in valid
correct += (torch.argmax(output, dim = 1) == y_train).sum().item()
RuntimeError: CUDA error: unspecified launch failure
我曾尝试切换到 CPU 上训练模型,但这样即使只训练 1 个 epoch 也非常慢。是不是因为我的电脑配置不足以运行这个模型?
解决方案
要检查您的系统上是否有可用的 CUDA,并据此选择运行设备:
import argparse

import torch
from torch.cuda import is_available


def main():
    """Pick the compute device and move a fresh ``RNN`` model onto it.

    CUDA is used only when it is available and the ``--no-cuda``
    command-line flag was not given; otherwise the model stays on the CPU.
    ``RNN`` is expected to be defined elsewhere in the same file.
    """
    parser = argparse.ArgumentParser(description="Train the RNN classifier")
    parser.add_argument(
        "--no-cuda",
        action="store_true",
        help="disable CUDA even when it is available",
    )
    args = parser.parse_args()

    use_cuda = not args.no_cuda and is_available()
    dev = torch.device("cuda" if use_cuda else "cpu")
    model = RNN().to(device=dev)
    # Call train and test methods below


if __name__ == '__main__':
    main()
推荐阅读
- google-cloud-platform - Google Cloud CDN 从日志中获取边缘位置
- vba - VBA - 数据验证空单元格
- model - 从 tensorflow 模型中保存权重和偏差
- javascript - 访问对象属性中的数组,Sequelize
- soap - 如何从 Sabre SOAP API 中的队列中删除特定 PNR
- c# - Asp Net Core MVC RedirectToAction 不工作(无 TempData)
- sqlite - SQLITE : 表中有多少列
- c# - 如何在 C# 中调试异常?
- java - MNIST 的缩小图像
- ansible - Ansible 说参数不存在