machine-learning - Pytorch 中的多 GPU 训练导致我的系统死机
问题描述
当我用 nn.DataParallel(model) 包裹模型并开始训练时,屏幕就会冻结,每次都必须手动重新启动计算机。
我尝试了一些变体,比如不对每个 x 和 y 调用 .to(device),但只要使用 nn.DataParallel,计算机似乎就会死机。
import os
import random

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data
from torchvision import models, datasets, transforms
# Rank of this process; the download step further down only runs when it is 0.
global_rank = 0

# Short module-level aliases for the dataset class and the data utilities.
MNIST = datasets.MNIST
DataLoader, random_split = (
    torch.utils.data.DataLoader,
    torch.utils.data.random_split,
)
class MLPClassifier(nn.Module):
    """Fully connected classifier for flattened 28x28 inputs.

    Maps 784 input features through hidden widths 128, 444 and 333 (each
    followed by ReLU) to 10 output logits.
    """

    def __init__(self):
        super().__init__()
        # Layers are created in the same order as before so that parameter
        # names and the order of random initialisation stay unchanged.
        self.layer_1 = nn.Linear(28 * 28, 128)
        self.layer_2 = nn.Linear(128, 444)
        self.layer_3 = nn.Linear(444, 333)
        self.layer_4 = nn.Linear(333, 10)

    def forward(self, x):
        """Return the raw (un-normalised) class logits for a batch.

        Args:
            x: Tensor whose first dimension is the batch; all remaining
                dimensions are flattened and must total 784 elements.

        Returns:
            Tensor of shape ``(batch, 10)``.
        """
        # Collapse everything except the batch dimension into one vector.
        hidden = x.view(x.size(0), -1)
        for layer in (self.layer_1, self.layer_2, self.layer_3):
            hidden = F.relu(layer(hidden))
        # The final layer has no activation: its output is returned as logits.
        return self.layer_4(hidden)
# Download data
# NOTE(review): the rank check and the disabled barrier look like leftovers
# from a multi-process launch; with global_rank fixed at 0 the download
# always runs. Confirm before re-enabling the barrier.
data_root = os.getcwd()
if global_rank == 0:
    mnist_train = MNIST(data_root, train=True, download=True)
    mnist_test = MNIST(data_root, train=False, download=True)
# dist.barrier()

# Transforms: convert images to tensors, then normalise with
# mean 0.1307 and std 0.3081.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,)),
])
mnist_train = MNIST(data_root, train=True, transform=transform)

# Split dataset into 55000 training and 5000 validation samples.
mnist_train, mnist_val = random_split(mnist_train, [55000, 5000])

# The test set gets the same transform as the training data: without it the
# dataset yields PIL images, which the DataLoader's default collate function
# cannot assemble into a batch.
mnist_test = MNIST(data_root, train=False, download=True, transform=transform)

# Build dataloaders (the dataset names are rebound to their loaders, which is
# what the training and evaluation loops iterate over).
mnist_train = DataLoader(mnist_train, batch_size=256)
mnist_val = DataLoader(mnist_val, batch_size=256)
mnist_test = DataLoader(mnist_test, batch_size=256)
# Use the first CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Wrap the classifier in DataParallel, then move the wrapper to `device`.
model = nn.DataParallel(MLPClassifier())
model.to(device)

# Adam over every parameter of the wrapped model, learning rate 1e-3.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
# Train loop
model.train()
num_epochs = 1
for epoch in range(num_epochs):
    for x, y in mnist_train:
        # Inputs and targets are moved to `device` before the forward pass.
        logits = model(x.to(device))
        loss = F.cross_entropy(logits, y.to(device))
        print('train loss: ', loss.item())
        loss.backward()
        optimizer.step()
        # Clear gradients after the step so the next batch starts from zero.
        optimizer.zero_grad()

    # EVAL LOOP
    # Runs once per epoch, with gradient tracking disabled.
    model.eval()
    with torch.no_grad():
        val_loss_a = []
        for x, y in mnist_val:
            logits = model(x.to(device))
            val_loss = F.cross_entropy(logits, y.to(device))
            val_loss_a.append(val_loss)
        # Mean of the per-batch losses; a smaller final batch is weighted the
        # same as a full one.
        avg_val_loss = torch.stack(val_loss_a).mean()
    # Back to training mode for the next epoch.
    model.train()
解决方案
推荐阅读
- go - 基于 Cobra 的 CLI 中的 OnInitialize 与 PersistentPreRun
- angularjs - Angular 调用自签名外部服务
- python-3.x - 如何在 python 中正确模拟 gcp 客户端库调用
- python - 如何从命令行在 Zenodo 中创建帐户?
- excel - 计算日期和匹配关键字之间的单元格总和
- go - 将 pflags.Flag.Value 转换为任意类型
- python - 谷歌云 cron.yaml 间隔
- python - 导入库不同的方法
- kotlin - Kotlin 长文字:值超出范围
- python - 如何在python中将列表转换为数据框?