How to fix a PyTorch multiprocessing error on Windows 10

Problem Description

I was attempting the Kaggle MNIST challenge and used torch's custom Dataset module to load the CSV file. Whenever I train the network with a DataLoader whose num_workers is set greater than 0, I get a BrokenPipeError. I have followed many tutorials online and even put my code under the if __name__ == "__main__" guard, but nothing seems to fix the error. With num_workers=0 there is no error, only a UserWarning about named tensors.
Here is the code:

import torch.nn as nn 
import torch 
import torch.nn.functional as F
import torch.optim as optim 
import pandas as pd
import numpy as np 
import torchvision.transforms as transforms

class mnistdataset(torch.utils.data.Dataset):
    def __init__(self, file, transform=None):

        self.file = pd.read_csv(file)
        self.labels = self.file["label"].values
        self.transform = transform  
        

    def __len__(self):
        return self.file.shape[0]
    
    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        
        # Column 0 is the label; the remaining 784 columns are the pixels.
        im = self.file.iloc[idx, 1:].to_numpy(dtype="uint8").reshape(28, 28)
        if self.transform:
            im = self.transform(im)
        return im, self.labels[idx]

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 64, 5, stride=1, padding=1)
        self.bn1 = nn.BatchNorm2d(64)
        self.conv2 = nn.Conv2d(64, 64, 5, stride=1, padding=1)
        self.bn2 = nn.BatchNorm2d(64)
        self.m1 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.d1 = nn.Dropout2d(0.25)
        self.conv3 = nn.Conv2d(64,64,3, stride=1, padding=1)
        self.bn3 = nn.BatchNorm2d(64)
        self.conv4 = nn.Conv2d(64,64,3, stride=1, padding=1)
        self.bn4 = nn.BatchNorm2d(64)
        self.conv5 = nn.Conv2d(64, 64, 3, stride=1, padding=1)
        self.bn5 = nn.BatchNorm2d(64)
        self.m2 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.d2 = nn.Dropout2d(0.25)
        self.conv6 = nn.Conv2d(64, 128, 3, stride=1, padding=1)
        self.bn6 = nn.BatchNorm2d(128)
        self.d3 = nn.Dropout2d(0.25)
        self.lin1 = nn.Linear(4608, 400)
        self.d4 = nn.Dropout(0.4)
        self.lin2 = nn.Linear(400, 28)
        self.d5 = nn.Dropout(0.2)
        self.lin3 = nn.Linear(28, 10)


    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = self.bn1(x)
        x = F.relu(self.conv2(x))
        x = self.bn2(x)
        x = torch.max_pool2d(x, kernel_size=2, stride=2)
        x = self.d1(x)
        x = F.relu(self.conv3(x))
        x = self.bn3(x)
        x = F.relu(self.conv4(x))
        x = self.bn4(x)
        x = F.relu(self.conv5(x))
        x = self.bn5(x)
        x = torch.max_pool2d(x, kernel_size=2, stride=2)
        x = self.d2(x)
        x = F.relu(self.conv6(x))
        x = self.bn6(x)
        x = self.d3(x)

        x = x.view(x.size(0), -1)
        
        x = F.relu(self.lin1(x))
        x = self.d4(x)  # 1-D dropout on the flattened features (d1/d2 are Dropout2d)
        x = F.relu(self.lin2(x))
        x = self.d5(x)
        x = self.lin3(x)

        return x

def get_dataloaders():
    train_transform = transforms.Compose([
        transforms.ToPILImage(),
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,)),  # standard MNIST mean/std
    ])
    train = mnistdataset("train.csv", transform=train_transform)
    return torch.utils.data.DataLoader(train, batch_size=20, shuffle=True, num_workers=2)


def train_network(train_loader):
    net = Net().cuda()
    opt = optim.SGD(net.parameters(), lr=0.01, momentum=0.5)
    loss = nn.CrossEntropyLoss().cuda()

    epochs = 2
    for epoch in range(epochs):
        
        net.train()
        for batch_id, (im, target) in enumerate(train_loader):
        
            im = im.to('cuda', non_blocking=True)
            target = target.to('cuda', non_blocking=True).long()
            opt.zero_grad()
            pred = net(im)
            l = loss(pred, target)
            l.backward()
            opt.step()
            if (batch_id + 1) % 100 == 0:
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                    epoch, (batch_id + 1) * len(im), len(train_loader.dataset),
                    100. * (batch_id + 1) / len(train_loader), l.item()))

if __name__ == '__main__':
    
    train_loader = get_dataloaders()
    train_network(train_loader)

The error I get is:

---------------------------------------------------------------------------
BrokenPipeError                           Traceback (most recent call last)
<ipython-input-8-5af6b8b22e93> in <module>
      2 
      3     train_loader = get_dataloaders()
----> 4     train_network(train_loader)

<ipython-input-4-24f1b1c4c822> in train_network(train_loader)
      8 
      9         net.train()
---> 10         for batch_id, (im, target) in enumerate(train_loader):
     11 
     12             im = im.to('cuda', non_blocking=True)

~\AppData\Local\Programs\Python\Python36\lib\site-packages\torch\utils\data\dataloader.py in __iter__(self)
    357             return self._iterator
    358         else:
--> 359             return self._get_iterator()
    360 
    361     @property

~\AppData\Local\Programs\Python\Python36\lib\site-packages\torch\utils\data\dataloader.py in _get_iterator(self)
    303         else:
    304             self.check_worker_number_rationality()
--> 305             return _MultiProcessingDataLoaderIter(self)
    306 
    307     @property

~\AppData\Local\Programs\Python\Python36\lib\site-packages\torch\utils\data\dataloader.py in __init__(self, loader)
    916             #     before it starts, and __del__ tries to join but will get:
    917             #     AssertionError: can only join a started process.
--> 918             w.start()
    919             self._index_queues.append(index_queue)
    920             self._workers.append(w)

~\AppData\Local\Programs\Python\Python36\lib\multiprocessing\process.py in start(self)
    103                'daemonic processes are not allowed to have children'
    104         _cleanup()
--> 105         self._popen = self._Popen(self)
    106         self._sentinel = self._popen.sentinel
    107         # Avoid a refcycle if the target function holds an indirect

~\AppData\Local\Programs\Python\Python36\lib\multiprocessing\context.py in _Popen(process_obj)
    221     @staticmethod
    222     def _Popen(process_obj):
--> 223         return _default_context.get_context().Process._Popen(process_obj)
    224 
    225 class DefaultContext(BaseContext):

~\AppData\Local\Programs\Python\Python36\lib\multiprocessing\context.py in _Popen(process_obj)
    320         def _Popen(process_obj):
    321             from .popen_spawn_win32 import Popen
--> 322             return Popen(process_obj)
    323 
    324     class SpawnContext(BaseContext):

~\AppData\Local\Programs\Python\Python36\lib\multiprocessing\popen_spawn_win32.py in __init__(self, process_obj)
     63             try:
     64                 reduction.dump(prep_data, to_child)
---> 65                 reduction.dump(process_obj, to_child)
     66             finally:
     67                 set_spawning_popen(None)

~\AppData\Local\Programs\Python\Python36\lib\multiprocessing\reduction.py in dump(obj, file, protocol)
     58 def dump(obj, file, protocol=None):
     59     '''Replacement for pickle.dump() using ForkingPickler.'''
---> 60     ForkingPickler(file, protocol).dump(obj)
     61 
     62 #

BrokenPipeError: [Errno 32] Broken pipe
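
For context, the tail of the traceback shows where things break: on Windows, multiprocessing uses the spawn start method, and reduction.dump pickles the new worker's state into a pipe as each DataLoader worker starts. If the freshly spawned child dies during startup, the parent sees exactly this BrokenPipeError. For reference, a minimal, self-contained sketch of the pattern that works from a plain .py script (hypothetical toy data, not the question's CSV):

# windows_loader_sketch.py -- minimal sketch of the Windows-safe pattern.
# spawn starts a fresh interpreter that re-imports this module, so all
# top-level work must sit behind the __main__ guard.
import torch
from torch.utils.data import DataLoader, TensorDataset

def main():
    ds = TensorDataset(torch.arange(100, dtype=torch.float32).unsqueeze(1))
    loader = DataLoader(ds, batch_size=10, num_workers=2)
    for (batch,) in loader:
        print(batch.shape)

if __name__ == "__main__":  # required when num_workers > 0 on Windows
    main()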

The warning I get with num_workers set to 0 is:

ipykernel_launcher:33: UserWarning: Named tensors and all their associated APIs are an experimental feature and subject to change. Please do not use them for anything important until they are released as stable. (Triggered internally at  ..\c10/core/TensorImpl.h:1156.)

The model does still train with num_workers set to 0.
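
As far as I can tell, that warning is raised internally by PyTorch itself (the TensorImpl.h path matches the 1.9.x releases, where pooling ops trigger it), not by anything in the code above, so it is harmless. If the noise bothers you, it can be silenced; a minimal sketch:

import warnings

# Hide only the experimental named-tensor warning; training is unaffected.
warnings.filterwarnings(
    "ignore",
    message="Named tensors and all their associated APIs are an experimental feature",
)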

My environment details:
Windows 10 Home, PyTorch for CUDA 11.2 (installed with pip, not conda), Python 3.6.7 for Windows, GTX 1050 Ti GPU, Intel i5 9th gen

Edit: The code seems to work when I run it as a Python file, but it does not seem to run when using a Jupyter notebook.
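
That observation matches the usual cause: under spawn, every DataLoader worker must re-import the module that defines the dataset. A class defined in a notebook cell lives in a __main__ that the spawned child cannot re-import, so the workers die and the parent reports a broken pipe. The common workaround is to move the Dataset class into a real module next to the notebook; a sketch, assuming a hypothetical file name mnist_dataset.py:

# mnist_dataset.py -- hypothetical module saved next to the notebook so
# that spawned DataLoader workers can import the class.
import pandas as pd
import torch

class mnistdataset(torch.utils.data.Dataset):
    def __init__(self, file, transform=None):
        self.file = pd.read_csv(file)
        self.labels = self.file["label"].values
        self.transform = transform

    def __len__(self):
        return self.file.shape[0]

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        im = self.file.iloc[idx, 1:].to_numpy(dtype="uint8").reshape(28, 28)
        if self.transform:
            im = self.transform(im)
        return im, self.labels[idx]

The notebook can then use from mnist_dataset import mnistdataset and keep num_workers=2; alternatively, simply keep num_workers=0 inside Jupyter, as the edit already suggests.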

Tags: python-3.x, windows, pytorch, pytorch-dataloader
