python-3.x - 在并行过程中评估 PyTorch 模型上的数据后,GPU 内存不会释放
问题描述
对于我的优化算法,我每次迭代都需要评估几百张图像。为了加快这个过程,我想充分利用我的 3 个 GPU。
我的过程:
- 在我的每个 GPU 上加载我的深度学习模型实例
- 然后将工作负载分成与 GPU 一样多的部分
- 将元组中的每个工作负载与应在其上处理的 GPU 加载模型的实例配对
- 运行 starmap(_runDataThroughModel, sub_workload) 并行处理所有 sub_workload
现在,执行一次然后结束程序是没有问题的;但是,当我重复执行此操作时,GPU 内存会在每次迭代中不断填满,直到我收到 “RuntimeError: CUDA error: out of memory”。
我的问题:
- 这样做的正确方法是什么?
- 为什么没有释放 GPU 内存?由于我在“starmap”命令之外预先实例化了 GPU 模型并始终传递相同的实例,为什么会有累积?
更新 考虑到此线程中出现的问题,我重新编写了代码。在程序中的任何循环之外实例化 Pool() 并不能解决 GPU 内存溢出问题,但是,它会阻止 CPU 内存随着时间的推移而累积。
'''
Test GPU Memory Leak
Description: Tests how the memory doesn't get freed up when running multiprocessing with PyTorch Model forward pass
'''
import torch
import torch.multiprocessing as mp
import importlib
from PIL import Image
from skimage import io, transform
from skimage.color import rgb2gray
from skimage.io._plugins.pil_plugin import *
import torch
import torch.nn as nn
# Convolutional neural network (two convolutional layers)
class ConvNet(nn.Module):
    """Small CNN: two convolutional stages followed by two fully connected layers.

    The attribute names (layer1, layer2, fc1, fc2) are part of the state_dict
    contract used by NNEvaluator's checkpoint loading and must not be renamed.
    """

    def __init__(self, num_classes=10, num_img_layers=1, img_res=128):
        super(ConvNet, self).__init__()
        # Spatial resolution after the single 2x2 max-pool halves H and W.
        half_res = int(img_res / 2)
        # Stage 1: 5x5 conv (padding=2 keeps resolution) -> batch norm -> LeakyReLU
        self.layer1 = nn.Sequential(
            nn.Conv2d(num_img_layers, 64, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(64),
            nn.LeakyReLU(),
        )
        # Stage 2: 5x5 conv -> batch norm -> LeakyReLU -> 2x2 max-pool
        self.layer2 = nn.Sequential(
            nn.Conv2d(64, 32, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(32),
            nn.LeakyReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )
        self.fc1 = nn.Linear(32 * half_res * half_res, 32 * 32)
        self.fc2 = nn.Linear(32 * 32, num_classes)

    def forward(self, x):
        """Map a (batch, num_img_layers, img_res, img_res) tensor to class scores."""
        features = self.layer2(self.layer1(x))
        flat = features.reshape(features.size(0), -1)
        return self.fc2(self.fc1(flat))
class NNEvaluator:
    """Wraps one ConvNet instance pinned to a single GPU for inference-only use.

    One NNEvaluator is created per GPU; `evaluate` moves the input to that
    device, runs a forward pass without autograd, and returns the result on CPU.
    """

    def __init__(self, model_dict, GPU, img_res=128, num_img_layers=1, num_classes=None):
        # Fall back to CPU when CUDA is unavailable so the class stays usable
        # on machines without GPUs.
        gpu_id = 'cuda:' + str(GPU)
        self.device = torch.device(gpu_id if torch.cuda.is_available() else 'cpu')
        self.model_state_dict = model_dict['model_state_dict']
        # Fix: the original called .to(self.device) twice in a row (once in the
        # constructor chain and once as a separate statement); once suffices.
        self.model = ConvNet(
            num_classes=num_classes,
            num_img_layers=num_img_layers,
            img_res=img_res,
        ).to(self.device)
        self.model.load_state_dict(self.model_state_dict)
        # NOTE(review): epsilon is never used in this class — presumably meant
        # as a numeric guard elsewhere; kept for interface compatibility.
        self.epsilon = torch.tensor(1e-12, dtype=torch.float)

    def evaluate(self, img):
        """Forward `img` through the model with autograd disabled.

        Returns the network output moved back to CPU memory so the caller
        never holds a CUDA tensor (important when results cross process
        boundaries).
        """
        self.model.eval()
        with torch.no_grad():
            img = img.to(self.device)
            out = self.model(img)
            out = out.to('cpu')
        return out
def loadImage(filename):
    """Load an image file as a grayscale float tensor of shape (1, 1, H, W).

    Fix: the original ignored the `filename` argument and always opened
    the hard-coded "test.jpg".
    """
    im = Image.open(filename)
    im = io._plugins.pil_plugin.pil_to_ndarray(im)
    im = rgb2gray(im)
    # transpose((0, 1)) on a 2-D array is the identity; kept for behavior parity.
    image = im.transpose((0, 1))
    im = torch.from_numpy(image).float()
    # Add batch and channel dimensions: (H, W) -> (1, 1, H, W).
    im = torch.unsqueeze(im, 0)
    im = torch.unsqueeze(im, 1)
    return im
def _worker(workload, evaluator):
results = []
for img in workload:
results.append(evaluator.evaluate(img))
def main():
    """Spread a fixed evaluation workload across one model instance per GPU.

    Loads the checkpoint once, builds one NNEvaluator per GPU, and repeatedly
    fans the workload out over a single multiprocessing pool.
    """
    # Load the trained weights once; each NNEvaluator copies them to its GPU.
    model_dict = torch.load('model_dict.ckpt')
    GPUs = [0, 1, 2]  # available GPUs in the system
    evaluators = [NNEvaluator(model_dict, gpu_id, num_classes=3) for gpu_id in GPUs]
    # 'spawn' is required to share CUDA tensors/models with worker processes.
    mp.set_start_method('spawn')
    im = loadImage('test.jpg')
    total_nr_iterations = 20
    nr_datapoints = 99
    # Loop-invariant: split the workload evenly across evaluators once.
    dp_per_evaluator = int(nr_datapoints / len(evaluators))
    # Fix: create the pool ONCE and close it deterministically via `with`.
    # The original never called close()/join(), leaking worker processes and
    # their memory for the life of the program.
    with mp.Pool() as mypool:
        for i in range(total_nr_iterations):
            # Run a subset of the workload on each GPU in a separate process.
            workload = [im for _ in range(dp_per_evaluator)]
            jobslist = [(workload, evaluator) for evaluator in evaluators]
            mypool.starmap(_worker, jobslist)
            print("Finished iteration {}".format(i))

if __name__ == '__main__':
    main()
运行代码时输出:
Finished iteration 0
Finished iteration 1
Finished iteration 2
Process SpawnPoolWorker-10:
Process SpawnPoolWorker-12:
Traceback (most recent call last):
Traceback (most recent call last):
File "/home/ron/miniconda3/envs/PyTorchNN/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
self.run()
File "/home/ron/miniconda3/envs/PyTorchNN/lib/python3.7/multiprocessing/process.py", line 99, in run
self._target(*self._args, **self._kwargs)
File "/home/ron/miniconda3/envs/PyTorchNN/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
self.run()
File "/home/ron/miniconda3/envs/PyTorchNN/lib/python3.7/multiprocessing/pool.py", line 110, in worker
task = get()
File "/home/ron/miniconda3/envs/PyTorchNN/lib/python3.7/multiprocessing/process.py", line 99, in run
self._target(*self._args, **self._kwargs)
File "/home/ron/miniconda3/envs/PyTorchNN/lib/python3.7/multiprocessing/queues.py", line 354, in get
return _ForkingPickler.loads(res)
File "/home/ron/miniconda3/envs/PyTorchNN/lib/python3.7/multiprocessing/pool.py", line 110, in worker
task = get()
File "/home/ron/miniconda3/envs/PyTorchNN/lib/python3.7/site-packages/torch/multiprocessing/reductions.py", line 119, in rebuild_cuda_tensor
event_sync_required)
File "/home/ron/miniconda3/envs/PyTorchNN/lib/python3.7/multiprocessing/queues.py", line 354, in get
return _ForkingPickler.loads(res)
File "/home/ron/miniconda3/envs/PyTorchNN/lib/python3.7/site-packages/torch/multiprocessing/reductions.py", line 119, in rebuild_cuda_tensor
event_sync_required)
RuntimeError: CUDA error: out of memory
RuntimeError: CUDA error: out of memory
Process SpawnPoolWorker-11:
Traceback (most recent call last):
File "/home/ron/miniconda3/envs/PyTorchNN/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
self.run()
File "/home/ron/miniconda3/envs/PyTorchNN/lib/python3.7/multiprocessing/process.py", line 99, in run
self._target(*self._args, **self._kwargs)
File "/home/ron/miniconda3/envs/PyTorchNN/lib/python3.7/multiprocessing/pool.py", line 110, in worker
task = get()
File "/home/ron/miniconda3/envs/PyTorchNN/lib/python3.7/multiprocessing/queues.py", line 354, in get
return _ForkingPickler.loads(res)
File "/home/ron/miniconda3/envs/PyTorchNN/lib/python3.7/site-packages/torch/multiprocessing/reductions.py", line 119, in rebuild_cuda_tensor
event_sync_required)
RuntimeError: CUDA error: out of memory
解决方案
我发现了这个类似的线程,由于在循环中而不是在外部实例化 Pool() 而发生内存泄漏。
上述问题中的代码还在函数内部实例化 Pool(),并且没有使用 with 语句来确保所有启动的工作进程在使用后都被正确关闭和回收(close/join)。
例如不好的方式:
def evaluation(workload):
jobslist = [job for job in workload]
with Pool() as mypool:
mypool.starmap(_workerfunction, jobslist)
if __name__ == '__main__':
# pseudo data
workload = [[(100,200) for i in range(1000)] for i in range(50)]
for i in range(100):
evaluation(workload)
这样做的正确方法是在循环外实例化池,并将对池的引用传递给函数进行处理,即:
def evaluation(workload, mypool):
jobslist = [job for job in workload]
mypool.starmap(_workerfunction, jobslist)
if __name__ == '__main__':
# pseudo data
with Pool() as mypool:
workload = [[(100,200) for i in range(1000)] for i in range(50)]
for i in range(100):
evaluation(workload, mypool)
我怀疑由于并行进程中尚未清理的剩余引用而导致 GPU 内存泄漏。
推荐阅读
- python - 如何使用 gekko 估计 FOPDT 方程中的 theta 值?
- haskell - 在 Haskell 的 do 上下文中应用构造函数
- python - 我使用 scipy 的 python 代码抛出了一个值错误
- javascript - 从 JSON 获取 D3 的日期
- amazon-s3 - SSE-KMS 和 SSE-C 的区别
- python - 如何从嵌套字典中按值获取字典键?
- javascript - Angular - 如何使用 API 显示元素并使用复选框进行排序
- seo - 使用动态 slug 时,服务器端渲染如何在 Nuxt 上工作?
- c++ - 如何用矩形制作 2 点角并保持正确的厚度?
- database - 建模多个子表数据库的最佳方法