python - 如何修复“异常:进程 0 以信号 SIGSEGV 终止”错误以及单个 gpu 代码是否正常工作?
问题描述
我启动了 2 个进程,因为我只有 2 个 gpu,但它给了我一个错误:Exception: process 0 terminated with signal SIGSEGV。此代码确实适用于多个 cpu(或至少没有引发错误)。此外,它也适用于单个 GPU。除此之外,当 world_size > 0 且存在多个 cuda/gpu 时,它会失败。
我的错误信息是这样的:
(automl-meta-learning) miranda9~/ML4Coq $ python playground/multiprocessing_playground/ddp_hello_world.py
world_size=2
Traceback (most recent call last):
File "playground/multiprocessing_playground/ddp_hello_world.py", line 49, in <module>
main()
File "playground/multiprocessing_playground/ddp_hello_world.py", line 43, in main
mp.spawn(example,
File "/home/miranda9/miniconda3/envs/automl-meta-learning/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 199, in spawn
return start_processes(fn, args, nprocs, join, daemon, start_method='spawn')
File "/home/miranda9/miniconda3/envs/automl-meta-learning/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 157, in start_processes
while not context.join():
File "/home/miranda9/miniconda3/envs/automl-meta-learning/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 105, in join
raise Exception(
Exception: process 0 terminated with signal SIGSEGV
这是给出错误的代码:
import os
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
import torch.nn as nn
import torch.optim as optim
from torch.nn.parallel import DistributedDataParallel as DDP
def example(rank, world_size):
    """Run one forward/backward/optimizer step of a DDP-wrapped linear layer.

    Used as the per-process target passed to ``mp.spawn``.

    Args:
        rank: Index of this process in the group; also used as the CUDA
            device index for the model and the tensors.
        world_size: Total number of processes in the group.
    """
    # Rendezvous settings for the default process group. ``setdefault``
    # keeps localhost:8888 unless the caller already exported other values
    # (for instance because the port is taken on a shared machine).
    os.environ.setdefault('MASTER_ADDR', 'localhost')
    os.environ.setdefault('MASTER_PORT', '8888')
    dist.init_process_group("gloo", rank=rank, world_size=world_size)
    try:
        # create local model on this process's device
        model = nn.Linear(10, 10).to(rank)
        # construct DDP model
        ddp_model = DDP(model, device_ids=[rank])
        # define loss function and optimizer
        loss_fn = nn.MSELoss()
        optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)
        # forward pass
        outputs = ddp_model(torch.randn(20, 10).to(rank))
        labels = torch.randn(20, 10).to(rank)
        # backward pass
        loss_fn(outputs, labels).backward()
        # update parameters
        optimizer.step()
    finally:
        # Tear the process group down even if the step above raised.
        dist.destroy_process_group()
def main():
    """Spawn one ``example`` process per visible CUDA device and wait for them.

    Raises:
        RuntimeError: If no CUDA device is visible.
    """
    world_size = torch.cuda.device_count()
    if world_size < 1:
        # With zero devices there is nothing to spawn; fail loudly instead
        # of letting the script report success without running anything.
        raise RuntimeError(
            'No CUDA devices are visible; this example needs at least one GPU.'
        )
    mp.spawn(example,
             args=(world_size,),
             nprocs=world_size,
             join=True)
# Script entry point: run the demo, then print a completion marker
# (the trailing '\a' is the terminal bell character).
if __name__=="__main__":
    main()
    print('Done\n\a')
[可选]更大的独立示例(给出相同的错误)
但是请注意,这个稍微更完整的示例(仅缺少分布式数据加载器)也给了我同样的问题:
"""
Based on: https://pytorch.org/tutorials/intermediate/ddp_tutorial.html
Correctness of code: https://stackoverflow.com/questions/66226135/how-to-parallelize-a-training-loop-ever-samples-of-a-batch-when-cpu-is-only-avai
Note: as opposed to the multiprocessing (torch.multiprocessing) package, processes can use
different communication backends and are not restricted to being executed on the same machine.
"""
import time
from typing import Tuple
import torch
from torch import nn, optim
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP
import os
# Hyper-parameters and layer sizes of the toy problem.
num_epochs = 5
batch_size = 8
Din = 10
Dout = 5

# A single random (input, target) pair; the batch for step ``scale`` is that
# pair multiplied by ``scale`` (so the first batch is all zeros).
data_x = torch.randn(batch_size, Din)
data_y = torch.randn(batch_size, Dout)
data = [
    (scale * data_x, scale * data_y)
    for scale in range(num_epochs)
]
class PerDeviceModel(nn.Module):
    """Toy two-layer network: Linear -> ReLU -> Linear.

    The model is run in parallel (one copy per process, each with its own
    GPU or other hardware); it is not split across devices.

    Args:
        d_in: Input feature size. ``None`` (the default) uses the
            module-level ``Din``.
        d_out: Output feature size. ``None`` (the default) uses the
            module-level ``Dout``.
    """

    def __init__(self, d_in=None, d_out=None):
        super().__init__()
        # Resolve the defaults at call time so the class does not require
        # the module constants when explicit sizes are given.
        d_in = Din if d_in is None else d_in
        d_out = Dout if d_out is None else d_out
        self.net1 = nn.Linear(d_in, d_in)
        self.relu = nn.ReLU()
        self.net2 = nn.Linear(d_in, d_out)

    def forward(self, x):
        """Apply net1, ReLU and net2 to ``x`` and return the result."""
        return self.net2(self.relu(self.net1(x)))
def setup_process(rank, world_size, backend='gloo'):
    """Initialize the distributed environment for the calling process.

    Args:
        rank: Index of this process in the group.
        world_size: Total number of processes in the group.
        backend: Communication backend used when CUDA is NOT available.
            When CUDA is available this argument is ignored and 'nccl'
            is used instead.

    gloo is a collective communications library
    (https://github.com/facebookincubator/gloo).
    """
    # Address/port of the master so this child process can coordinate.
    # ``setdefault`` keeps localhost:12355 unless the caller already exported
    # other values (e.g. because the port is in use on a shared machine).
    os.environ.setdefault('MASTER_ADDR', 'localhost')
    os.environ.setdefault('MASTER_PORT', '12355')
    # - use NCCL if you are using gpus: https://pytorch.org/tutorials/intermediate/dist_tuto.html#communication-backends
    if torch.cuda.is_available():
        backend = 'nccl'
    # Initializes the default distributed process group, and this will also
    # initialize the distributed package.
    dist.init_process_group(backend, rank=rank, world_size=world_size)
def cleanup():
    """Destroy the default process group if one is initialized.

    Does nothing when the distributed package is unavailable or no group
    has been initialized, so it is safe to call from ``finally`` blocks
    and more than once.
    """
    if dist.is_available() and dist.is_initialized():
        dist.destroy_process_group()
def get_batch(batch: Tuple[torch.Tensor, torch.Tensor], rank) -> Tuple[torch.Tensor, torch.Tensor]:
    """Place one ``(x, y)`` batch where the process with this rank needs it.

    Args:
        batch: Pair of input and target tensors.
        rank: Rank of the calling process; used as the CUDA device index
            when CUDA is available.

    Returns:
        The ``(x, y)`` pair moved to device ``rank`` when CUDA is available;
        otherwise the same two tensors after ``share_memory_()`` has been
        called on them.
    """
    x, y = batch
    if torch.cuda.is_available():
        x, y = x.to(rank), y.to(rank)
    else:
        x, y = x.share_memory_(), y.share_memory_()
    return x, y
def get_ddp_model(model: nn.Module, rank) -> DDP:
    """Wrap ``model`` in DistributedDataParallel for the process with this rank.

    Args:
        model: The local model to wrap.
        rank: Rank of the calling process; used as the CUDA device index
            when CUDA is available.

    Returns:
        The DDP-wrapped model. With CUDA the model is first moved to device
        ``rank`` and DDP is given ``device_ids=[rank]``; without CUDA
        ``model.share_memory()`` is called and DDP is built with no device ids.

    TODO: does this have to be done outside or inside the process?
    """
    if torch.cuda.is_available():
        # create model and move it to GPU with id rank
        model = model.to(rank)
        ddp_model = DDP(model, device_ids=[rank])
    else:
        # CPU case: share_memory() is a no-op if the storage is already in
        # shared memory.
        model = model.share_memory()
        ddp_model = DDP(model)
    return ddp_model
def run_parallel_training_loop(rank, world_size):
    """Train the toy model with DDP; this is the function run in each process.

    Sets up the process group, wraps a fresh ``PerDeviceModel`` in DDP, runs
    one optimizer step per entry of ``data`` and finally tears the process
    group down again.

    Note: as DDP broadcasts model states from rank 0 process to all other
    processes in the DDP constructor, you don't need to worry about different
    DDP processes starting from different model parameter initial values.

    Args:
        rank: Index of this process in the group.
        world_size: Total number of processes in the group.
    """
    setup_process(rank, world_size)
    try:
        print()
        print(f"Start running DDP with model parallel example on rank: {rank}.")
        print(f'current process: {mp.current_process()}')
        print(f'pid: {os.getpid()}')

        # get ddp model
        model = PerDeviceModel()
        ddp_model = get_ddp_model(model, rank)

        # The loss and the optimizer do not depend on the batch, so build
        # them once instead of once per iteration.
        loss_fn = nn.MSELoss()
        optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)

        # do training
        for batch in data:
            x, y = get_batch(batch, rank)
            optimizer.zero_grad()
            outputs = ddp_model(x)
            # Gradient synchronization communications take place during the
            # backward pass and overlap with the backward computation. When
            # backward() returns, param.grad already contains the
            # synchronized gradient tensor.
            loss_fn(outputs, y).backward()
            # Each process steps its own optimizer on its own replica.
            optimizer.step()

        # NOTE(review): the message below repeats the start message; it is
        # kept unchanged so the script's output stays the same.
        print()
        print(f"Start running DDP with model parallel example on rank: {rank}.")
        print(f'current process: {mp.current_process()}')
        print(f'pid: {os.getpid()}')
    finally:
        # Destroy the process group even if training raised.
        cleanup()
def main():
    """Spawn one training process per CUDA device, or per CPU without CUDA."""
    print()
    print('running main()')
    print(f'current process: {mp.current_process()}')
    print(f'pid: {os.getpid()}')
    # One process per GPU when CUDA is available, otherwise one per CPU.
    world_size = torch.cuda.device_count() if torch.cuda.is_available() else mp.cpu_count()
    print(f'world_size={world_size}')
    mp.spawn(run_parallel_training_loop, args=(world_size,), nprocs=world_size)
# Script entry point: time the whole run and report the elapsed seconds.
if __name__ == "__main__":
    print('starting __main__')
    start = time.time()
    main()
    print(f'execution length = {time.time() - start}')
    print('Done!\a\n')
交叉发布:https://discuss.pytorch.org/t/why-is-mp-spawn-spawning-4-processes-when-i-only-want-2/112299
解决方案
我在具有 4 个 GPU(python 版本:3.6.9 和 pytorch 版本:1.5.0+cu101)的服务器上运行了您的“(最小)代码示例”,没有任何更改和任何错误。
运行最小代码示例时问题是否仍然存在?
如果是这样,并且如果您使用的是 linux 机器,您能否改为运行以下代码,并告诉我您得到了什么输出:
import os
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
import torch.nn as nn
import torch.optim as optim
from torch.nn.parallel import DistributedDataParallel as DDP
def get_visible_gpus():
    """Return the GPU indices reported by ``nvidia-smi``.

    Asks ``nvidia-smi`` for the index column only instead of scraping its
    human-readable table.

    Returns:
        A list of int GPU indices in the order ``nvidia-smi`` prints them,
        or an empty list when ``nvidia-smi`` cannot be run or exits with
        an error.
    """
    import subprocess  # local import: only this helper needs it

    cmd = ['nvidia-smi', '--query-gpu=index', '--format=csv,noheader']
    try:
        out = subprocess.check_output(
            cmd, stderr=subprocess.DEVNULL, universal_newlines=True)
    except (OSError, subprocess.CalledProcessError):
        # nvidia-smi is not installed, or it failed (e.g. no driver loaded).
        return []
    # One index per line is expected; skip anything that is not a plain number.
    tokens = (line.strip() for line in out.splitlines())
    return [int(tok) for tok in tokens if tok.isdigit()]
def example(rank, world_size):
    """Run one forward/backward/optimizer step of a DDP-wrapped linear layer.

    Used as the per-process target passed to ``mp.spawn``; prints the rank
    first so it is visible which processes actually started.

    Args:
        rank: Index of this process in the group; also used as the CUDA
            device index for the model and the tensors.
        world_size: Total number of processes in the group.
    """
    print('rank:{}'.format(rank))
    # Rendezvous settings for the default process group. ``setdefault``
    # keeps localhost:8888 unless the caller already exported other values
    # (for instance because the port is taken on a shared machine).
    os.environ.setdefault('MASTER_ADDR', 'localhost')
    os.environ.setdefault('MASTER_PORT', '8888')
    dist.init_process_group("gloo", rank=rank, world_size=world_size)
    try:
        # create local model on this process's device
        model = nn.Linear(10, 10).to(rank)
        # construct DDP model
        ddp_model = DDP(model, device_ids=[rank])
        # define loss function and optimizer
        loss_fn = nn.MSELoss()
        optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)
        # forward pass
        outputs = ddp_model(torch.randn(20, 10).to(rank))
        labels = torch.randn(20, 10).to(rank)
        # backward pass
        loss_fn(outputs, labels).backward()
        # update parameters
        optimizer.step()
    finally:
        # Tear the process group down even if the step above raised.
        dist.destroy_process_group()
def main():
    """Print GPU diagnostics, then spawn one ``example`` process per CUDA device.

    Raises:
        RuntimeError: If no CUDA device is visible.
    """
    world_size = torch.cuda.device_count()
    print('world_size:{}'.format(world_size))
    print('get_visible_gpus():{}'.format(get_visible_gpus()))
    if world_size < 1:
        # With zero devices there is nothing to spawn; fail loudly (after
        # printing the diagnostics above) instead of reporting success.
        raise RuntimeError(
            'No CUDA devices are visible; this example needs at least one GPU.'
        )
    mp.spawn(example,
             args=(world_size,),
             nprocs=world_size,
             join=True)
# Script entry point: print the installed torch version before running the
# demo so it appears at the top of the output.
if __name__ == "__main__":
    print(torch.__version__)
    main()
    print('Done\n\a')
就我而言,我只是得到:
1.5.0+cu101
world_size:4
get_visible_gpus():[0, 1, 2, 3]
rank:1
rank:3
rank:0
rank:2
Done
get_visible_gpus() 只是对 shell 命令 nvidia-smi 的输出做文本解析,以获取 cuda 可以看到的 gpu 的 id。
注意:请见谅,我本想发表评论而不是“回答”——因为我并没有直接解决您的问题,而是在询问更多细节——但我的声誉值还不够 T_T
推荐阅读
- spring - 用 Spring Transaction 包装一个 Callable 和 Runnable
- api - 如何通过adobe(acrobat)以编程方式将pdf文件转换为excel文件格式?
- powershell - Powershell:Get-ChildItem 路径中的多个通配符
- javascript - ServiceWorker 通知单击 - 如何刷新页面并跳转到锚链接?
- javascript - 努力使用 jQuery 更改来显示特定的 div/h2
- javascript - 为什么表单主体在 PUT 请求中返回 null?
- ios - 2020 年 4 月 Fabric Crashlytics 消失后,如何在 iOS 中跟踪 OOM 关闭?
- python - Python - 从列表创建字典的函数
- python - 为什么我的 add_pass 函数不起作用?我是新手程序员所以请不要评判
- python - 如何在excel文件中附加for循环的结果?