GPU memory is not released after evaluating data on a PyTorch model in parallel processes

Problem description

For my optimization algorithm I need to evaluate a few hundred images in every iteration. To speed this up I want to make full use of my 3 GPUs.

My process: load one model checkpoint per GPU, then in every iteration of the optimization split the images across the GPUs and evaluate them in parallel worker processes (the full test code is below).

Running this once and then ending the program works fine. However, when I repeat it, GPU memory fills up a little more on every iteration until I get "RuntimeError: CUDA error: out of memory".

My question: why is the GPU memory not released between iterations, and how can I make sure it gets freed?

Update: Taking the points raised in this thread into account, I rewrote the code. Instantiating Pool() outside of any loop in the program does not solve the GPU out-of-memory problem; it does, however, stop CPU memory from accumulating over time.

'''
Test GPU Memory Leak
Description: Shows how GPU memory does not get freed when running a PyTorch model forward pass with multiprocessing
'''
import torch
import torch.nn as nn
import torch.multiprocessing as mp
from PIL import Image
from skimage.color import rgb2gray
from skimage.io._plugins.pil_plugin import pil_to_ndarray

# Convolutional neural network (two convolutional layers)
class ConvNet(nn.Module):
    def __init__(self, num_classes=10, num_img_layers = 1, img_res = 128):
        super(ConvNet, self).__init__()
        self.layer1 = nn.Sequential(
            #torch.nn.Conv2d(in_channels, out_channels, kernel_size, stride=1,  
            # padding=0, dilation=1, groups=1, bias=True, padding_mode='zeros')
            nn.Conv2d(num_img_layers, 64, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(64),
            nn.LeakyReLU())
        self.layer2 = nn.Sequential(
            nn.Conv2d(64, 32, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(32),
            nn.LeakyReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2))
        self.fc1 = nn.Linear(32*int(img_res/2)*int(img_res/2), 32*32)
        self.fc2 = nn.Linear(32*32, num_classes)

    def forward(self, x):
        #print(x.shape)
        out = self.layer1(x)
        #print(out.shape)
        out = self.layer2(out)
        #print(out.shape)
        out = out.reshape(out.size(0), -1)
        out = self.fc1(out)
        out = self.fc2(out)
        return out

class NNEvaluator:
    def __init__(self, model_dict, GPU, img_res = 128, num_img_layers = 1, num_classes = None):
        # Load the model checkpoint
        gpu_id = 'cuda:' + str(GPU)
        self.device = torch.device(gpu_id if torch.cuda.is_available() else 'cpu')
        self.model_state_dict = model_dict['model_state_dict']
        self.model = ConvNet(num_classes=num_classes, num_img_layers=num_img_layers, img_res=img_res).to(self.device)
        self.model.load_state_dict(self.model_state_dict)

        self.epsilon = torch.tensor(1e-12, dtype = torch.float)
    def evaluate(self, img):
        self.model.eval() 
        with torch.no_grad():
            img = img.to(self.device)
            out = self.model(img)
            out = out.to('cpu')
        return out
def loadImage(filename):
    # load the image as a grayscale float tensor of shape (1, 1, H, W)
    im = Image.open(filename)
    im = pil_to_ndarray(im)
    im = rgb2gray(im)
    image = im.transpose((0, 1))
    im = torch.from_numpy(image).float()
    im = torch.unsqueeze(im, 0)
    im = torch.unsqueeze(im, 1)
    return im

def _worker(workload, evaluator):
    # evaluate every image in this worker's share of the workload
    results = []
    for img in workload:
        results.append(evaluator.evaluate(img))
    return results

def main():
    # load a model for each GPU
    model_dict = torch.load('model_dict.ckpt')
    GPUs = [0,1,2] # available GPUs in the system
    evaluators = []
    for gpu_id in GPUs:
        evaluators.append(NNEvaluator(model_dict, gpu_id, num_classes=3))    
    # instantiate multiprocessing pool
    mp.set_start_method('spawn')
    mypool = mp.Pool()

    # evaluate all datapoints 20 times
    im = loadImage('test.jpg')
    total_nr_iterations = 20
    for i in range(total_nr_iterations):
        # run a subset of the workload on each GPU in a separate process
        nr_datapoints = 99
        dp_per_evaluator = int(nr_datapoints/len(evaluators))
        workload = [im for i in range(dp_per_evaluator)]
        jobslist = [(workload, evaluator) for evaluator in evaluators]
        mypool.starmap(_worker, jobslist)
        print("Finished iteration {}".format(i))
if __name__ == '__main__':
    main()

Output when running the code:

Finished iteration 0
Finished iteration 1
Finished iteration 2
Process SpawnPoolWorker-10:
Process SpawnPoolWorker-12:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/ron/miniconda3/envs/PyTorchNN/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/home/ron/miniconda3/envs/PyTorchNN/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/home/ron/miniconda3/envs/PyTorchNN/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/home/ron/miniconda3/envs/PyTorchNN/lib/python3.7/multiprocessing/pool.py", line 110, in worker
    task = get()
  File "/home/ron/miniconda3/envs/PyTorchNN/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/home/ron/miniconda3/envs/PyTorchNN/lib/python3.7/multiprocessing/queues.py", line 354, in get
    return _ForkingPickler.loads(res)
  File "/home/ron/miniconda3/envs/PyTorchNN/lib/python3.7/multiprocessing/pool.py", line 110, in worker
    task = get()
  File "/home/ron/miniconda3/envs/PyTorchNN/lib/python3.7/site-packages/torch/multiprocessing/reductions.py", line 119, in rebuild_cuda_tensor
    event_sync_required)
  File "/home/ron/miniconda3/envs/PyTorchNN/lib/python3.7/multiprocessing/queues.py", line 354, in get
    return _ForkingPickler.loads(res)
  File "/home/ron/miniconda3/envs/PyTorchNN/lib/python3.7/site-packages/torch/multiprocessing/reductions.py", line 119, in rebuild_cuda_tensor
    event_sync_required)
RuntimeError: CUDA error: out of memory
RuntimeError: CUDA error: out of memory
Process SpawnPoolWorker-11:
Traceback (most recent call last):
  File "/home/ron/miniconda3/envs/PyTorchNN/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/home/ron/miniconda3/envs/PyTorchNN/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/home/ron/miniconda3/envs/PyTorchNN/lib/python3.7/multiprocessing/pool.py", line 110, in worker
    task = get()
  File "/home/ron/miniconda3/envs/PyTorchNN/lib/python3.7/multiprocessing/queues.py", line 354, in get
    return _ForkingPickler.loads(res)
  File "/home/ron/miniconda3/envs/PyTorchNN/lib/python3.7/site-packages/torch/multiprocessing/reductions.py", line 119, in rebuild_cuda_tensor
    event_sync_required)
RuntimeError: CUDA error: out of memory

Tags: python-3.x, pytorch, python-multiprocessing

Solution


I found this similar thread, where the memory leak happened because Pool() was instantiated inside a loop rather than outside of it.

The question above also instantiates Pool() inside a function, without using the `with` notation that ensures all spawned processes have returned.

For example, the bad way (with a stub _workerfunction added here so the snippet runs):

from multiprocessing import Pool

def _workerfunction(x, y):
    # stand-in for the real per-job work
    return x * y

def evaluation(workload):
    jobslist = [job for job in workload]
    # a new Pool is created (and torn down) on every call
    with Pool() as mypool:
        mypool.starmap(_workerfunction, jobslist)

if __name__ == '__main__':
    # pseudo data: each job is a tuple of arguments for _workerfunction
    workload = [(100, 200) for i in range(1000)]
    for i in range(100):
        evaluation(workload)
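
Every call to evaluation() here creates and destroys an entire pool of worker processes, which is presumably where the accumulating resource usage described in the linked thread comes from.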

The correct way to do this is to instantiate the pool outside the loop and pass a reference to it into the function that does the processing, i.e. (reusing _workerfunction from the snippet above):

def evaluation(workload, mypool):
    jobslist = [job for job in workload]
    mypool.starmap(_workerfunction, jobslist)

if __name__ == '__main__':
    # the Pool is created once, outside the loop, and passed in by reference
    with Pool() as mypool:
        # same pseudo data as in the bad example
        workload = [(100, 200) for i in range(1000)]
        for i in range(100):
            evaluation(workload, mypool)
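
Applied to the GPU setup from the question, one way to follow the same advice (only a sketch, not something from the original answer) is to create the pool once and let every worker process build its own evaluator in a pool initializer, so the CUDA model and tensors never have to be pickled to the workers on each starmap call, which is what the rebuild_cuda_tensor frames in the traceback point to. The helpers _init_worker and _gpu_worker below are made-up names, and the sketch assumes the ConvNet, NNEvaluator and loadImage definitions from the question are in the same file:

import torch
import torch.multiprocessing as mp

# one evaluator per worker process, built once by the pool initializer
_evaluator = None

def _init_worker(model_path, gpu_queue):
    # runs once in every worker: take a GPU id and build the model on that GPU
    global _evaluator
    gpu_id = gpu_queue.get()
    model_dict = torch.load(model_path, map_location='cpu')
    _evaluator = NNEvaluator(model_dict, gpu_id, num_classes=3)

def _gpu_worker(workload):
    # evaluate this worker's share of the images on its own GPU
    return [_evaluator.evaluate(img) for img in workload]

if __name__ == '__main__':
    mp.set_start_method('spawn')
    GPUs = [0, 1, 2]
    gpu_queue = mp.Manager().Queue()
    for g in GPUs:
        gpu_queue.put(g)

    im = loadImage('test.jpg')
    with mp.Pool(processes=len(GPUs),
                 initializer=_init_worker,
                 initargs=('model_dict.ckpt', gpu_queue)) as mypool:
        for i in range(20):
            workload = [im for _ in range(33)]
            results = mypool.map(_gpu_worker, [workload] * len(GPUs))
            print("Finished iteration {}".format(i))

With this pattern each GPU is claimed exactly once from the queue, and only CPU tensors (the inputs and the outputs of evaluate()) ever cross process boundaries.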

I suspect the GPU memory leak is caused by leftover references in the parallel processes that have not been cleaned up yet.
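
If that suspicion is correct, one mitigation worth trying (again just a sketch, not a verified fix) is to drop the GPU-related references explicitly at the end of every task and ask PyTorch's caching allocator to release the memory it no longer needs, for example in a variant of the question's _worker:

import gc
import torch

def _worker(workload, evaluator):
    # hypothetical variant of the question's _worker that tries not to leave
    # CUDA references behind when the task finishes
    results = [evaluator.evaluate(img) for img in workload]  # outputs are already on the CPU
    del evaluator               # drop this task's copy of the evaluator and its CUDA model
    gc.collect()                # collect any unreachable objects still holding GPU tensors
    torch.cuda.empty_cache()    # let the caching allocator return unused GPU memory to the driver
    return results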

