PyTorch on Windows: DataLoader problems with num_workers

Problem description

I have just bought a new computer running Windows 10 with a GPU, so I wanted to see whether I could sensibly use it for machine learning. I tried running an old model that I had previously trained on Google Colab. The answer is that it works well, but I found I could not use more than one worker in the DataLoader. Googling revealed that this is a known problem with PyTorch on Windows inside Jupyter Notebooks, so I tried running it as an ordinary Python program. That does work, but creating the DataIterator takes a very long time. Here are the times in seconds for 1, 2 and 6 workers, each run twice:

[screenshot of the timing output]

I noticed that 2 workers seemed to be the fastest, and that there was a lot of variation, which surprised me since the machine was doing nothing else.
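For anyone who wants to reproduce the measurement, here is a minimal sketch of the timing loop. DummyDataset is a made-up stand-in (with the same (5, 7, 7) item shape as the real data further down), not the actual dataset I used; the point is only to show where the time goes for different num_workers values.

# timing sketch with a hypothetical DummyDataset (not the real C4Dataset below)
import time
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader

class DummyDataset(Dataset):
    def __init__(self, n=50000):
        # random data shaped like one (5, 7, 7) board encoding per item
        self.data = np.random.rand(n, 5, 7, 7).astype(np.float32)
    def __len__(self):
        return len(self.data)
    def __getitem__(self, idx):
        return torch.from_numpy(self.data[idx]), 0

if __name__ == "__main__":  # required on Windows: workers are started with spawn
    ds = DummyDataset()
    for workers in (0, 1, 2, 6):
        loader = DataLoader(ds, batch_size=1024, shuffle=True, num_workers=workers)
        t0 = time.time()
        it = iter(loader)       # the worker processes are created here
        next(it)                # fetch one batch
        print('num_workers =', workers, 'first batch after', time.time() - t0, 's')
        del it                  # shut the workers down before the next run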

So the first question is:

The second question is:

The code I am running is below, with the relevant part coming after if __name__ == "__main__":

# -*- coding: utf-8 -*-


import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset,DataLoader

import os
import numpy as np
#import gym
import pickle
import matplotlib.pyplot as plt
import time


class BasicBlock(nn.Module):
    expansion = 1

    def __init__(self, in_planes, planes, stride=1):
        super(BasicBlock, self).__init__()
        self.conv1 = nn.Conv2d(
            in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3,
                               stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)

        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != self.expansion*planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, self.expansion*planes,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(self.expansion*planes)
            )

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        out += self.shortcut(x)
        out = F.relu(out)
        return out

# create dataset
class C4Dataset(Dataset):
    '''
        the data for the first 12 moves is held in a pickled list
        as (key, val)
        The key has to be converted to the pos and mask, which can then be converted to the ones, twos and zeros
        Val is the value for the player playing, so it needs to be changed to minus val when the number of moves is odd
    '''
   
    fileName = r'C:\Users\alan\Desktop\Python\Python36\connect4\Layers\ListAllKeyVal19'
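    # bitboard constants: bottom_mask has one bit set at the bottom of each of the
    # seven columns (bits 0, 7, 14, ..., 42); board_mask covers the six playable rows per column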
    bottom_mask = 4432676798593    
    board_mask  = bottom_mask * ((1 << 6) - 1)
    bNos = 2**np.arange(49, dtype = np.uint64)
    
    def getData(fileName):
        with open(fileName,'rb') as inFile:
            dict = pickle.load(inFile)
        return dict
    
    def oneHot(x):
        return np.eye(37,dtype = np.float32)[x]
        
    def getNoMoves(ones,twos) :
        return np.sum(ones+twos)
    
    def getPosMask(key):
        
        binary = ('{:049b}'.format(key))[::-1]    
        arr    = np.frombuffer(binary.encode(encoding='utf-8', errors='strict'),'u1') - ord('0')
        outArr = np.reshape(arr,(7,7),order = 'F')
        
        arr = np.flipud(outArr)
        pos = arr.copy()
        mask =arr.copy()
        
        for col in range(7):            
            res = np.where(arr[:,col]==1)              
            topPos = res[0][0]
            pos[topPos,col]  = 0
            mask[topPos,col] = 0
            if topPos<6:
                mask[topPos+1:,col] = 1 
        
        msk = np.flipud(mask)
        msk = np.reshape(msk,(49),order = 'F')       
        maskNo  = np.array(msk.dot(C4Dataset.bNos),dtype = np.uint64).item()
       

        return pos.astype('float32'),(pos ^ mask).astype('float32'),(np.logical_not(mask)).astype('float32'),maskNo

    def possible(mask) :
        poss = (mask + C4Dataset.bottom_mask) & C4Dataset.board_mask
        binary = ('{:049b}'.format(poss))[::-1]    
        arr    = np.frombuffer(binary.encode(encoding='utf-8', errors='strict'),'u1') - ord('0')
        outArr = np.reshape(arr,(7,7),order = 'F')
         
        arr = np.flipud(outArr)
         
        return arr

    

    def __init__(self):        
        self.lst = C4Dataset.getData(C4Dataset.fileName)

    def __len__(self):
        return len(self.lst)

    def __getitem__(self, idx): 
        key,val = self.lst[idx]
        val = int(val)
        ones,twos,zeros,mask = C4Dataset.getPosMask(key)
        
        arr = np.zeros((5,7,7),dtype = np.float32)
        arr[0,:6,:7] = ones[1:,:]
        arr[1,:6,:7] = twos[1:,:]
        arr[2,:6,:7] = zeros[1:,:]
         
        moves = int(C4Dataset.getNoMoves(ones,twos))

        p = (moves % 2) + 3
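        # p is channel 3 when an even number of moves have been played, channel 4 when odd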
        
        arr[p,:6,:7] = C4Dataset.possible(mask)[1:,:]
        
               
        return arr,val+18 #C4Dataset.oneHot(val+18)


class C4Net(nn.Module):
    def __init__(self, inFilters,outFilters):
        super(C4Net, self).__init__()        

        self.conv1 = nn.Conv2d(inFilters, 32, kernel_size=3,
                               stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(32)
        self.layer1 = BasicBlock(32,32)
        self.layer2 = BasicBlock(32,32)
        self.layer3 = BasicBlock(32,32)
        self.layer4 = BasicBlock(32,32)
        self.layer5 = BasicBlock(32,32)
        self.layer6 = BasicBlock(32,32)
        self.layer7 = BasicBlock(32,32)
        self.layer8 = BasicBlock(32,32)
        
        self.linear = nn.Linear(32*7*7,outFilters)#1568

    
    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
        out = self.layer5(out)
        out = self.layer6(out)
        out = self.layer7(out)
        out = self.layer8(out)
   
        #out = F.avg_pool2d(out, 2)
        out = out.view(out.size(0), -1)
        out = self.linear(out)
        return out


# show some images
def show(img):
    npimg = img.numpy()[:3,:,:]
    plt.imshow(np.transpose(npimg, (1,2,0)), interpolation='nearest')
# get some random training images
    
if __name__ == "__main__": 
    
    dirName =r'C:\Users\alan\Desktop\Python\Python36\connect4\Layers'
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # Assuming that we are on a CUDA machine, this should print a CUDA device:

    print(device)
    # create dataloader
    max_epochs = 1
    batchSize = 1024#512#256
    learningRate = .00003
    # Parameters
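    # num_workers=2 asks the DataLoader for two worker processes; on Windows they are
    # started with the spawn method, which is why the __main__ guard above is needed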
    params = {'batch_size': batchSize,'shuffle': True,'num_workers': 2}
    # Generators
    dataset = C4Dataset()
    start = time.time()
    dataloader = DataLoader(dataset, **params)
    middle = time.time()
    
    print('create dataloader',middle-start)
    
    
    dataiter = iter(dataloader)
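    # note: creating the iterator above is what actually launches the worker processes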
    end = time.time()
    print('create data iterator',end-middle)
    
    images, labels = next(dataiter)  # built-in next(); newer PyTorch iterators have no .next() method
    final = time.time()
    print('get one batch',final-end)
    # show images
    show(torchvision.utils.make_grid(images[:16]))

    #create the weights
    wts =np.array([59, 963, 12406, 148920, 62551, 47281, 55136, 54312, 44465, 31688,
          27912, 37907, 114778, 242800, 394530, 495237, 582174, 163370, 480850,
          201152, 690905, 633937, 721340, 372479, 193375, 84648, 76576, 91087, 130428,
          154184, 157339, 156453, 227696, 1705325, 548155, 44315, 2082],dtype = np.float32)
    
    maxwt = wts.max()
    weights = wts/maxwt
    weights = torch.from_numpy(weights)
    weights = weights.to(device)  # .to() returns a new tensor, so assign the result back
    
    # create the network
    
    net = C4Net(5,37)
    net.to(device)
    PATH = r'C:\Users\alan\Desktop\Python\connectX\c4Net37Weights00003.pth'
    net.load_state_dict(torch.load(PATH,map_location=torch.device(device)))
    
    #create the loss function and optimiser
    criterion = nn.CrossEntropyLoss(weight = weights.to(device) )
    optimizer = optim.Adam(net.parameters(), lr=learningRate)
    
    #train the network
    start = time.time()
    for epoch in range(max_epochs):  # loop over the dataset multiple times
    
        running_loss = 0.0
        for i, data in enumerate(dataloader, 0):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data[0].to(device), data[1].to(device)
    
            # zero the parameter gradients
            optimizer.zero_grad()
    
            # forward + backward + optimize
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
    
            # print statistics
            running_loss += loss.item()
            if i % 2000 == 1999:    # print every 2000 mini-batches
                print('[%d, %5d] loss: %.3f' %
                      (epoch + 1, i + 1, running_loss / 2000))
                running_loss = 0.0
                torch.save(net.state_dict(),r'C:\Users\alan\Desktop\Python\connectX\tempWeights')
    print('Finished Training')
    # save the weights
    PATH = r'C:\Users\alan\Desktop\Python\connectX\c4Net37Weights00004.pth'
    
    torch.save(net.state_dict(), PATH)
    end = time.time()
    print('elapsed time',end-start)

PS The machine is a Dell XPS 17 with an 8-core Intel Core i9-10885H, and the GPU is an NVIDIA GeForce RTX 2060 with Max-Q. On this test it runs 4 times faster than on Google Colab, although I do not know which GPU I was allocated there.

Tags: windows-10, pytorch

Solution

