python-3.x - 如何解决 Windows 10 中 PyTorch 的多进程(multiprocessing)问题
问题描述
我在尝试 Kaggle 的 MNIST 挑战,并使用 torch 的自定义数据集(Dataset)模块来加载 csv 文件。每当我用 num_workers 大于 0 的 DataLoader 训练网络时,都会出现 BrokenPipeError。我按照网上的许多教程做了尝试,甚至把代码放在了 if __name__ == "__main__": 这一行之下,但似乎都无法解决这个错误。
将 num_workers 设为 0 时不会报错,只会给出一个关于命名张量(named tensors)的 UserWarning。
下面是代码
import torch.nn as nn
import torch
import torch.nn.functional as F
import torch.optim as optim
import pandas as pd
import numpy as np
import torchvision.transforms as transforms
class mnistdataset(torch.utils.data.Dataset):
    """Map-style dataset over an MNIST-style CSV file.

    The CSV is read with ``pandas.read_csv``. It must contain a ``label``
    column; every column after the first one is treated as pixel data, and
    each row must hold exactly 28 * 28 = 784 pixel values.

    NOTE(review): this layout (label first, then 784 pixels) presumably
    matches Kaggle's ``train.csv`` -- confirm against the actual file.

    Args:
        file: Path or file-like object accepted by ``pandas.read_csv``.
        transform: Optional callable applied to each ``(28, 28)`` uint8
            image before it is returned.
    """

    def __init__(self, file, transform=None):
        self.file = pd.read_csv(file)
        self.labels = self.file["label"].values
        # Convert the pixel columns once up front so __getitem__ is a plain
        # NumPy row lookup instead of a pandas slice + dtype cast per sample.
        self.pixels = self.file.iloc[:, 1:].to_numpy(dtype="uint8")
        self.transform = transform

    def __len__(self):
        """Return the number of rows (samples) in the CSV."""
        return self.file.shape[0]

    def __getitem__(self, idx):
        """Return ``(image, label)`` for the sample at ``idx``.

        ``idx`` may be a plain index or a tensor (converted via ``tolist``).
        The image is a ``(28, 28)`` uint8 array, or whatever ``transform``
        returns for it when a transform was given.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()
        # Copy so callers and transforms cannot mutate the cached pixel table.
        im = self.pixels[idx].reshape(28, 28).copy()
        if self.transform is not None:
            im = self.transform(im)
        return im, self.labels[idx]
class Net(nn.Module):
    """Convolutional classifier mapping a 1-channel image to 10 logits.

    Shape walk-through for a ``(N, 1, 28, 28)`` input (the only spatial size
    for which the flattened features match ``lin1``'s 4608 inputs):

    * conv1 / conv2 (5x5, padding=1): 28 -> 26 -> 24
    * m1 (2x2 max-pool): 24 -> 12
    * conv3 / conv4 / conv5 (3x3, padding=1): stay at 12
    * m2 (2x2 max-pool): 12 -> 6
    * conv6 (3x3, padding=1): 128 channels at 6x6 -> 128 * 6 * 6 = 4608

    Every convolution is followed by ReLU and then batch norm. The output
    is raw logits of shape ``(N, 10)``; no softmax is applied here.
    """

    def __init__(self):
        super().__init__()
        # Stage 1: two 5x5 convolutions, then pool + spatial dropout.
        self.conv1 = nn.Conv2d(1, 64, 5, stride=1, padding=1)
        self.bn1 = nn.BatchNorm2d(64)
        self.conv2 = nn.Conv2d(64, 64, 5, stride=1, padding=1)
        self.bn2 = nn.BatchNorm2d(64)
        self.m1 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.d1 = nn.Dropout2d(0.25)
        # Stage 2: three 3x3 convolutions, then pool + spatial dropout.
        self.conv3 = nn.Conv2d(64, 64, 3, stride=1, padding=1)
        self.bn3 = nn.BatchNorm2d(64)
        self.conv4 = nn.Conv2d(64, 64, 3, stride=1, padding=1)
        self.bn4 = nn.BatchNorm2d(64)
        self.conv5 = nn.Conv2d(64, 64, 3, stride=1, padding=1)
        self.bn5 = nn.BatchNorm2d(64)
        self.m2 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.d2 = nn.Dropout2d(0.25)
        # Stage 3: widen to 128 channels.
        self.conv6 = nn.Conv2d(64, 128, 3, stride=1, padding=1)
        self.bn6 = nn.BatchNorm2d(128)
        self.d3 = nn.Dropout2d(0.25)
        # Classifier head on the flattened 128 * 6 * 6 features.
        self.lin1 = nn.Linear(4608, 400)
        self.d4 = nn.Dropout(0.4)
        self.lin2 = nn.Linear(400, 28)
        self.d5 = nn.Dropout(0.2)
        self.lin3 = nn.Linear(28, 10)

    def forward(self, x):
        """Compute class logits for a batch of images.

        Args:
            x: Float tensor of shape ``(N, 1, 28, 28)``.

        Returns:
            Tensor of shape ``(N, 10)`` with unnormalised class scores.
        """
        x = self.bn1(F.relu(self.conv1(x)))
        x = self.bn2(F.relu(self.conv2(x)))
        # Use the pooling modules declared in __init__ rather than calling
        # torch.max_pool2d with duplicated hyper-parameters.
        x = self.d1(self.m1(x))

        x = self.bn3(F.relu(self.conv3(x)))
        x = self.bn4(F.relu(self.conv4(x)))
        x = self.bn5(F.relu(self.conv5(x)))
        x = self.d2(self.m2(x))

        x = self.d3(self.bn6(F.relu(self.conv6(x))))

        x = x.view(x.size(0), -1)
        # The head must use the element-wise Dropout layers d4/d5 that were
        # declared for it. Reusing the Dropout2d layers d1/d2 here applied
        # the wrong rate and a channel-wise dropout to flat features.
        x = self.d4(F.relu(self.lin1(x)))
        x = self.d5(F.relu(self.lin2(x)))
        return self.lin3(x)
def get_dataloaders(csv_path="train.csv", batch_size=20, num_workers=2):
    """Build the shuffled training ``DataLoader`` over the MNIST CSV.

    Each sample goes through ``ToPILImage`` -> ``ToTensor`` -> ``Normalize``
    before batching.

    Args:
        csv_path: CSV file handed to ``mnistdataset``. Defaults to
            ``"train.csv"`` in the current working directory.
        batch_size: Number of samples per batch.
        num_workers: Number of loader worker processes; ``0`` loads data in
            the calling process.
            NOTE(review): the report accompanying this code says values
            above 0 raise BrokenPipeError when run from a Jupyter notebook
            on Windows, while the same code works as a .py script -- use
            ``num_workers=0`` in that setting.

    Returns:
        A ``torch.utils.data.DataLoader`` over the training set.
    """
    train_transform = transforms.Compose([
        transforms.ToPILImage(),
        transforms.ToTensor(),
        # One-element tuples: a bare `(0.1307)` is just a float, not a
        # per-channel sequence.
        transforms.Normalize((0.1307,), (0.3081,)),
    ])
    train = mnistdataset(csv_path, transform=train_transform)
    return torch.utils.data.DataLoader(
        train, batch_size=batch_size, shuffle=True, num_workers=num_workers
    )
def train_network(train_loader, epochs=2, device=None):
    """Train a fresh ``Net`` on ``train_loader`` with SGD and return it.

    Uses ``SGD(lr=0.01, momentum=0.5)`` and cross-entropy loss, and prints
    a progress line every 100 batches.

    Args:
        train_loader: Iterable yielding ``(images, targets)`` batches and
            exposing ``len()`` and a ``.dataset`` with ``len()`` (both are
            used for the progress line). Targets are cast to ``long``.
        epochs: Number of passes over ``train_loader``.
        device: Device to train on. Defaults to ``'cuda'`` when available,
            otherwise ``'cpu'``, so the function no longer crashes on
            machines without a GPU.

    Returns:
        The trained ``Net`` instance, left on ``device``.
    """
    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'

    net = Net().to(device)
    opt = optim.SGD(net.parameters(), lr=0.01, momentum=0.5)
    # CrossEntropyLoss holds no parameters, so it needs no device move.
    criterion = nn.CrossEntropyLoss()

    for epoch in range(epochs):
        net.train()
        seen = 0  # samples processed so far in this epoch
        for batch_id, (im, target) in enumerate(train_loader):
            im = im.to(device, non_blocking=True)
            target = target.to(device, non_blocking=True).long()

            opt.zero_grad()
            batch_loss = criterion(net(im), target)
            batch_loss.backward()
            opt.step()

            # Running count stays exact even if a batch is smaller than the
            # others, unlike (batch_id + 1) * len(im).
            seen += len(im)
            if (batch_id + 1) % 100 == 0:
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                    epoch, seen, len(train_loader.dataset),
                    100. * (batch_id + 1) / len(train_loader),
                    batch_loss.item()))
    return net
if __name__ == '__main__':
    # Script entry point: build the training loader, then train on it.
    # Keeping this under the __main__ guard means that merely importing the
    # module does not start a training run.
    train_loader = get_dataloaders()
    train_network(train_loader)
我得到的错误是
---------------------------------------------------------------------------
BrokenPipeError Traceback (most recent call last)
<ipython-input-8-5af6b8b22e93> in <module>
2
3 train_loader = get_dataloaders()
----> 4 train_network(train_loader)
<ipython-input-4-24f1b1c4c822> in train_network(train_loader)
8
9 net.train()
---> 10 for batch_id, (im, target) in enumerate(train_loader):
11
12 im = im.to('cuda', non_blocking=True)
~\AppData\Local\Programs\Python\Python36\lib\site-packages\torch\utils\data\dataloader.py in __iter__(self)
357 return self._iterator
358 else:
--> 359 return self._get_iterator()
360
361 @property
~\AppData\Local\Programs\Python\Python36\lib\site-packages\torch\utils\data\dataloader.py in _get_iterator(self)
303 else:
304 self.check_worker_number_rationality()
--> 305 return _MultiProcessingDataLoaderIter(self)
306
307 @property
~\AppData\Local\Programs\Python\Python36\lib\site-packages\torch\utils\data\dataloader.py in __init__(self, loader)
916 # before it starts, and __del__ tries to join but will get:
917 # AssertionError: can only join a started process.
--> 918 w.start()
919 self._index_queues.append(index_queue)
920 self._workers.append(w)
~\AppData\Local\Programs\Python\Python36\lib\multiprocessing\process.py in start(self)
103 'daemonic processes are not allowed to have children'
104 _cleanup()
--> 105 self._popen = self._Popen(self)
106 self._sentinel = self._popen.sentinel
107 # Avoid a refcycle if the target function holds an indirect
~\AppData\Local\Programs\Python\Python36\lib\multiprocessing\context.py in _Popen(process_obj)
221 @staticmethod
222 def _Popen(process_obj):
--> 223 return _default_context.get_context().Process._Popen(process_obj)
224
225 class DefaultContext(BaseContext):
~\AppData\Local\Programs\Python\Python36\lib\multiprocessing\context.py in _Popen(process_obj)
320 def _Popen(process_obj):
321 from .popen_spawn_win32 import Popen
--> 322 return Popen(process_obj)
323
324 class SpawnContext(BaseContext):
~\AppData\Local\Programs\Python\Python36\lib\multiprocessing\popen_spawn_win32.py in __init__(self, process_obj)
63 try:
64 reduction.dump(prep_data, to_child)
---> 65 reduction.dump(process_obj, to_child)
66 finally:
67 set_spawning_popen(None)
~\AppData\Local\Programs\Python\Python36\lib\multiprocessing\reduction.py in dump(obj, file, protocol)
58 def dump(obj, file, protocol=None):
59 '''Replacement for pickle.dump() using ForkingPickler.'''
---> 60 ForkingPickler(file, protocol).dump(obj)
61
62 #
BrokenPipeError: [Errno 32] Broken pipe
我在 num_workers 设置为 0 时收到的警告是
ipykernel_launcher:33: UserWarning: Named tensors and all their associated APIs are an experimental feature and subject to change. Please do not use them for anything important until they are released as stable. (Triggered internally at ..\c10/core/TensorImpl.h:1156.)
在 num_workers 设置为 0 的情况下,模型仍然可以正常训练。
我的环境详细信息:
Windows 10 家庭版、适用于 CUDA 11.2 的 PyTorch(通过 pip 安装,未使用 conda)、Windows 版 Python 3.6.7、GTX 1050 Ti GPU、Intel i5 第 9 代
编辑:把代码放在 .py 文件中运行时似乎可以正常工作,但在 Jupyter 笔记本中却无法运行。
解决方案
推荐阅读
- angular - Toast 消息在 Angular 项目中不起作用
- arrays - 如何从路径列表构建多维数组或字典
- inheritance - Lua 中基于类的继承如何工作?
- java - 将活动复制到片段
- build - 用于构建和发布 .NET Core 控制台应用程序的命令
- c++ - CMAKE:构建可执行文件和库并链接它们
- excel - 删除 Excel 表格中字段具有特定内容的整行
- lua - 我需要使用什么编译器来构建托管在 github 上的软件?[视窗]
- c++ - c++中pair class stl有什么用,它是像数据类型还是容器?
- javascript - 可能的未处理承诺拒绝(id:2)