首页 > 解决方案 > Pytorch 中的多 GPU 训练导致我的系统死机

问题描述

当我将模型包裹在 nn.DataParallel(model) 中并开始训练时,我的屏幕冻结了,我每次都必须手动重新启动计算机。

我尝试了一些变体,比如不对每个 x 和 y 调用 .to(device),但只要使用 nn.DataParallel,计算机似乎就会死机。

import os
import random

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.utils.data
from torchvision import models, datasets, transforms

# Short names for the data utilities used throughout the script.
from torch.utils.data import DataLoader, random_split

# Single-process run: this process acts as rank 0 and performs the download.
global_rank = 0
MNIST = datasets.MNIST

class MLPClassifier(nn.Module):
    """Four-layer fully connected classifier for flattened 28x28 inputs.

    Architecture: 784 -> 128 -> 444 -> 333 -> 10, with ReLU between layers
    and raw logits out of the final layer.
    """

    def __init__(self):
        super(MLPClassifier, self).__init__()
        self.layer_1 = torch.nn.Linear(28 * 28, 128)
        self.layer_2 = torch.nn.Linear(128, 444)
        self.layer_3 = torch.nn.Linear(444, 333)
        self.layer_4 = torch.nn.Linear(333, 10)

    def forward(self, x):
        # Flatten everything except the batch dimension.
        out = x.view(x.size(0), -1)
        # ReLU after each hidden layer; the last layer emits logits.
        for hidden in (self.layer_1, self.layer_2, self.layer_3):
            out = F.relu(hidden(out))
        return self.layer_4(out)

# Download the raw MNIST files. In a multi-process setup only rank 0 should
# download; the commented-out dist.barrier() below is where other ranks
# would wait for the download to finish.
if global_rank == 0:
    mnist_train = MNIST(os.getcwd(), train=True, download=True)
    mnist_test = MNIST(os.getcwd(), train=False, download=True)

# dist.barrier()

# Standard MNIST normalization constants (mean, std of the training set).
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,)),
])

# Re-create the datasets with the tensor transform attached.
# download=True keeps this robust even if the guarded download above is
# skipped (e.g. on a non-zero rank).
mnist_train = MNIST(os.getcwd(), train=True, download=True, transform=transform)
# FIX: the test set previously had no transform, so it would yield PIL
# images instead of normalized tensors like the train/val sets.
mnist_test = MNIST(os.getcwd(), train=False, download=True, transform=transform)

# Hold out 5k of the 60k training images for validation.
mnist_train, mnist_val = random_split(mnist_train, [55000, 5000])

# Build dataloaders (batch size 256, no shuffling — matches the original script).
mnist_train = DataLoader(mnist_train, batch_size=256)
mnist_val = DataLoader(mnist_val, batch_size=256)
mnist_test = DataLoader(mnist_test, batch_size=256)

# Run on the first GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Replicate the classifier across all visible GPUs with DataParallel, then
# move the wrapped model onto the chosen device (Module.to is in-place).
model = nn.DataParallel(MLPClassifier())
model.to(device)

# Adam over all model parameters with the common 1e-3 learning rate.
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Train loop: one optimization step per batch, one validation pass per epoch.
model.train()
num_epochs = 1
for epoch in range(num_epochs):
    for train_batch in mnist_train:
        x, y = train_batch
        # Move the batch to the device; DataParallel scatters it across GPUs.
        logits = model(x.to(device))
        loss = F.cross_entropy(logits, y.to(device))
        # FIX: message previously read 'rain loss'.
        print('train loss: ', loss.item())
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    # EVAL LOOP — FIX: this block was indented inside the batch loop, so the
    # entire validation set was evaluated after every single training batch.
    # It now runs once per epoch, as the comment structure intended.
    model.eval()
    with torch.no_grad():
        val_loss_a = []
        for val_batch in mnist_val:
            x, y = val_batch
            logits = model(x.to(device))
            val_loss = F.cross_entropy(logits, y.to(device))
            val_loss_a.append(val_loss)
        avg_val_loss = torch.stack(val_loss_a).mean()
        # FIX: avg_val_loss was computed but never reported.
        print('val loss: ', avg_val_loss.item())
    model.train()

标签: machine-learning, parallel-processing, pytorch

解决方案


推荐阅读