首页 > 解决方案 > 预期的设备 cuda:0 但在我已经将设备分配为 cuda 时在 PyTorch 中获得了设备 cpu

问题描述

我有以下神经网络代码,我得到“预期设备 cuda:0 但在 PyTorch 中得到设备 cpu”错误,我不知道为什么。我将设备指定为 cuda,打印行返回 cuda。我已经尝试将设备分配为 device = cuda:0 以防万一,但这没有效果。这是代码:

def run():
    torch.multiprocessing.freeze_support()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(device)  

    metabolites = pd.read_excel("testmetabolitedata.xlsx")
    subject_metadata = pd.read_excel("testsubj.xlsx")
    metabolitesdf = pd.DataFrame(data=metabolites)
    metabolitesdf = metabolitesdf.iloc[:, 1:9153] 
    subjectsdf = pd.DataFrame(data=subject_metadata)

    n_samples, n_metabolites = metabolitesdf.shape
    print(n_samples)

    #genotypes of the target gene
    print(subjectsdf['SLCO1B1_rs4149056'])
    genotypes = subjectsdf['SLCO1B1_rs4149056']
    print(genotypes)
    # print('{} unique genotypes'.format(len(set(genotypes))))

    labels = [1 if g == 1 else 0 for g in genotypes]
    print('{} samples with genotype 1 out of {}  samples ({:.1%})'.format(sum(labels), len(labels),
                                                    sum(labels) / len(labels)))


    #Insert 0 into index 0 (first) into the list for the first row with column names
    labels.insert(0, 0)
    #log transform                                                 
    log_metabol = np.log10(metabolitesdf + 1)

    #Split data into training and validation 70% / 30%
    data = torch.utils.data.TensorDataset(torch.Tensor(np.array(log_metabol)),
                                                    torch.Tensor(labels))
    train, val = torch.utils.data.random_split(data, [int(0.7 * len(data)), 
                                                                len(data) - int(0.7 * len(data))])

    print('{:.0f}/{} training/total ({:.1%}) in training set, {:.0f}/{} val/total ({:.1%}) in validation set'.format(\
        train[:][1].sum(), len(train), train[:][1].sum() / len(train),
        val[:][1].sum(), len(val), val[:][1].sum() / len(val)))




    class MultiLayerPredictor(torch.nn.Module):
        def __init__(self, input_shape, output_shape=1, hidden_dim=1024, **kwargs):
            super().__init__()
            self.fc1 = torch.nn.Linear(in_features=input_shape, out_features=hidden_dim)
            self.bn1 = torch.nn.BatchNorm1d(hidden_dim)
            self.fc2 = torch.nn.Linear(in_features=hidden_dim, out_features=hidden_dim)
            self.bn2 = torch.nn.BatchNorm1d(hidden_dim)
            self.fc3 = torch.nn.Linear(in_features=hidden_dim, out_features=output_shape)

        def forward(self, x):
            l1 = torch.relu(self.bn1(self.fc1(x)))
            l2 = torch.relu(self.bn2(self.fc2(l1)))
            return torch.sigmoid(self.fc3(l2)).reshape(-1)


    #load the training and validation sets       
    print("Load training and validation data ")
    train_loader = torch.utils.data.DataLoader(train, batch_size=128, 
                                                shuffle=True, num_workers=10, pin_memory=True)
    val_loader = torch.utils.data.DataLoader(val, batch_size=128, 
                                                shuffle=False, num_workers=10, pin_memory=True)

    print("Loading complete, create model")
    model3 = MultiLayerPredictor(input_shape=n_metabolites).to(device)
    print("Model created! Moving to optimizer")
    optimizer3 = torch.optim.SGD(model3.parameters(), lr=1e-2)
    print("Optimizer done")
    objective3 = torch.nn.BCELoss()
    epochs = 30


    print_stats_interval = 10
    log3 = []
    print("Moving to training loop")

    for epoch in range(epochs):
        loss = n_correct = 0
        model3.train()
        for batch, target in train_loader:
            batch = batch.view(-1, n_metabolites).to(device)
            optimizer3.zero_grad()
            outputs = model3(batch) # stack trace shows the issue being either on this line 
            train_loss = objective3(outputs, target) # or this line
            loss += train_loss.item()
            n_correct += (target == (outputs.reshape(-1) > 0.5).float()).sum()
            train_loss.backward()
            optimizer3.step()
        
        loss = loss / len(train_loader)    
        acc = (n_correct.float() / len(train)).numpy()
        epoch += 1
            
        model3.eval();
        val_loss = val_n_correct = 0
        with torch.no_grad():
            for batch, target in val_loader:
                batch = batch.view(-1, n_metabolites).to(device)
                outputs = model3(batch)
                val_loss += objective3(outputs, target)
                val_n_correct += (target == (outputs.reshape(-1) > 0.5).float()).sum()
        val_loss = (val_loss / len(val_loader)).numpy()
        val_acc = (val_n_correct.float() / len(val)).numpy()
        
        if (epoch % print_stats_interval) == 0 or epoch == epochs:
            print(f'epoch={epoch:.0f}, loss={loss:.5f}, val_loss={np.round(val_loss,5):.5f}, acc={np.round(acc,5):.5f}, val_acc={np.round(val_acc,5):.5f}')
        log3.append((epoch, loss, val_loss, acc, val_acc))
    log3 = pd.DataFrame(log3, columns=['epoch', 'loss', 'val_loss', 'acc', 'val_acc'])

    plt.figure(figsize=(6, 3))
    plt.plot(log3['epoch'], log3['loss'], label='Training');
    plt.plot(log3['epoch'], log3['val_loss'], label='Validation');
    plt.xlabel('Epoch'); plt.ylabel('Loss')
    plt.legend();

    val_log_mutations = val_hcc[:][0].numpy().reshape(-1)
    val_true_labels = val_hcc[:][1].numpy() + 0

    res = model3(val_hcc[:][0])
    predictions = (res.detach().numpy().reshape(-1) > 0.5) + 0
    correct = (val_true_labels == predictions) + 0
    n_correct = correct.sum()
    print('{}/{} ({:.1%}) in the validation set'.format(n_correct, len(correct), n_correct / len(correct)))
    print('Majority classifier accuracy: {:.1%}'.format((len(correct) - val_true_labels.sum()) / len(correct)))

if __name__ == '__main__':
    run()

这里发生了什么?这里的堆栈跟踪:

Traceback (most recent call last):
  File "//ad..fi/home/h/h/Desktop/neuralnet/neuralnet_train.py", line 142, in <module>
    run()
  File "//ad..fi/home/h/h/Desktop/neuralnet/neuralnet_train.py", line 99, in run
    train_loss = objective3(outputs, target)
  File "C:\Users\h\AppData\Roaming\Python\Python38\site-packages\torch\nn\modules\module.py", line 550, in __call__
    result = self.forward(*input, **kwargs)
  File "C:\Users\h\AppData\Roaming\Python\Python38\site-packages\torch\nn\modules\loss.py", line 516, in forward
    return F.binary_cross_entropy(input, target, weight=self.weight, reduction=self.reduction)
  File "C:\Users\h\AppData\Roaming\Python\Python38\site-packages\torch\nn\functional.py", line 2378, in binary_cross_entropy
    return torch._C._nn.binary_cross_entropy(
RuntimeError: expected device cuda:0 but got device cpu
PS Microsoft.PowerShell.Core\FileSystem::\\ad..fi\home\h\h\Desktop\neuralnet>

标签: pythonpytorch

解决方案


在循环的训练和验证中也将目标移动到 CUDA。

 for batch, target in train_loader:
            batch,target = batch.view(-1, n_metabolites).to(device),target.to(device)
                                  .
                                  .
                                  .

for batch, target in val_loader:
            batch,target = batch.view(-1, n_metabolites).to(device),target.to(device)``
                                  .
                                  .
                                  .

推荐阅读