What accumulates while training a PyTorch model?

Problem description

Training runs normally until it reaches about 3% of the epoch, and then CUDA runs out of memory.
What puzzles me about this error is that training starts off fine and only fails later, as if something keeps building up as training progresses. What exactly is accumulating?
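For context on what can pile up: whenever a tensor that still requires grad (for example, the loss returned by the model) is kept across iterations, PyTorch keeps the whole autograd graph behind it alive, including every intermediate activation. Collecting such tensors step after step therefore grows GPU memory even though each individual step fits. Whether this is what happens inside train_epoch is an assumption, since that function is not part of the snippet; the toy sketch below only illustrates the pattern.

import torch
import torch.nn as nn

model = nn.Linear(1000, 1000).cuda()
criterion = nn.MSELoss()

running_loss = 0.0
for step in range(1000):
    x = torch.randn(64, 1000, device="cuda")
    loss = criterion(model(x), torch.zeros(64, 1000, device="cuda"))

    # Leaky: `loss` still carries its grad_fn, so adding the tensor itself
    # keeps every step's activations referenced and GPU memory keeps growing.
    # running_loss += loss

    # Safe: .item() (or .detach()) returns a plain value and lets the graph be freed.
    running_loss += loss.item()

    loss.backward()
    model.zero_grad()

The training setup from the question: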

import argparse
import os

import torch
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel

# MovieDataset, movie_collate_fn, MovieClassification and config come from the project modules.


def init_process_group(rank, world_size):
    # One process per GPU; all processes rendezvous at the same master address/port.
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'
    dist.init_process_group("gloo", rank=rank, world_size=world_size)


def build_data_loader(vocab, infile, args, shuffle=True):
    dataset = MovieDataset(vocab, infile)
    if 1 < args.n_gpu and shuffle:
        # DistributedSampler shards the data across ranks; shuffling is handled by the sampler.
        sampler = torch.utils.data.distributed.DistributedSampler(dataset)
        loader = torch.utils.data.DataLoader(dataset, batch_size=config.tconfig.batch_size,
                                             sampler=sampler, collate_fn=movie_collate_fn)
    else:
        sampler = None
        loader = torch.utils.data.DataLoader(dataset, batch_size=config.tconfig.batch_size,
                                             sampler=sampler, shuffle=shuffle,
                                             collate_fn=movie_collate_fn)
    return loader, sampler


def train_model(rank, world_size, args):
    if 1 < args.n_gpu:
        init_process_group(rank, world_size)
    master = (world_size == 0 or rank % world_size == 0)

    config.tconfig.device = torch.device(f"cuda:{rank}" if torch.cuda.is_available() else "cpu")
    print(config.tconfig)

    best_epoch, best_loss, best_score = 0, 0, 0
    model = MovieClassification(config.tconfig)

    if os.path.isfile(args.save):
        print(f"rank:{rank} load state dict from : {args.save}")
    if 1 < args.n_gpu:
        model.to(config.tconfig.device)
        model = DistributedDataParallel(model, device_ids=[rank], find_unused_parameters=True)
    # ... the optimizer setup and the train_epoch loop referenced in the traceback are omitted here.


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--gpu", default=None, type=int, required=False)
    args = parser.parse_args()
    config.tconfig.device = config.device

    if torch.cuda.is_available():
        args.n_gpu = torch.cuda.device_count() if args.gpu is None else 1
    else:
        args.n_gpu = 0
    print("available GPU : ", args.n_gpu)
    if 1 < args.n_gpu:
        # Spawn one training process per GPU; each process receives its rank as the first argument.
        mp.spawn(train_model, args=(args.n_gpu, args), nprocs=args.n_gpu, join=True)
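One DDP-specific way for a single GPU to fill up, independent of the model itself, is checkpoint loading: the branch above only prints a message, so how the state dict is actually restored is not visible, but if each spawned process calls torch.load on a checkpoint that was saved from CUDA tensors without a map_location, every rank deserializes its copy onto cuda:0. A hedged sketch of the usual precaution (the load call itself is an assumption; only the surrounding lines come from the snippet):

    if os.path.isfile(args.save):
        print(f"rank:{rank} load state dict from : {args.save}")
        # Deserialize to CPU so ranks 1..N do not place their copies on cuda:0,
        # then move the weights to this rank's own device.
        state_dict = torch.load(args.save, map_location="cpu")
        model.load_state_dict(state_dict)

    if 1 < args.n_gpu:
        # Pin the process to its own GPU before any CUDA allocation or DDP setup.
        torch.cuda.set_device(rank)
        model.to(config.tconfig.device)
        model = DistributedDataParallel(model, device_ids=[rank], find_unused_parameters=True)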

The OOM error message is below:

-- Process 0 terminated with the following error:
Traceback (most recent call last):
  File "/home/fightnyy/anaconda3/envs/selfresearch/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 19, in _wrap
    fn(i, *args)
  File "/home/fightnyy/workspace/kotransformer/main.py", line 145, in train_model
    loss = train_epoch(config.tconfig, rank, epoch, model, criterion, optimizer, scheduler, train_loader)
  File "/home/fightnyy/workspace/kotransformer/main.py", line 88, in train_epoch
    outputs = model.forward(enc_inputs, dec_inputs)
  File "/home/fightnyy/anaconda3/envs/selfresearch/lib/python3.8/site-packages/torch/nn/parallel/distributed.py", line 619, in forward
    output = self.module(*inputs[0], **kwargs[0])
  File "/home/fightnyy/anaconda3/envs/selfresearch/lib/python3.8/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/home/fightnyy/workspace/kotransformer/movieclassifier.py", line 24, in forward
    dec_outputs, enc_self_attn_probs, self_attn_probs, dec_enc_attn_probs = self.transformer.forward(enc_inputs, dec_inputs)
  File "/home/fightnyy/workspace/kotransformer/transformer.py", line 19, in forward
    enc_outputs, enc_self_attn_probs = self.encoder.forward(enc_inputs)
  File "/home/fightnyy/workspace/kotransformer/encoder.py", line 64, in forward
    outputs, attn_prob = layer.forward(outputs, attn_mask)
  File "/home/fightnyy/workspace/kotransformer/encoderlayer.py", line 24, in forward
    pos_output = self.pos_ffnn(attn_output)
  File "/home/fightnyy/anaconda3/envs/selfresearch/lib/python3.8/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/home/fightnyy/workspace/kotransformer/PFFN.py", line 23, in forward
    output = self.activation(self.conv1(inputs.transpose(1,2)))
  File "/home/fightnyy/anaconda3/envs/selfresearch/lib/python3.8/site-packages/torch/nn/functional.py", line 1383, in gelu
    return torch._C._nn.gelu(input)
RuntimeError: CUDA out of memory. Tried to allocate 118.00 MiB (GPU 0; 10.76 GiB total capacity; 5.05 GiB already allocated; 77.44 MiB free; 5.31 GiB reserved in total by PyTorch)
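The traceback only shows where the allocation finally fails (the GELU inside the position-wise feed-forward), not where memory is being retained. Logging the allocator counters periodically makes accumulation visible: if torch.cuda.memory_allocated keeps climbing from batch to batch after optimizer.step(), some tensors are being kept alive across iterations. The helper below is a hypothetical addition, not part of the original code:

import torch

def log_cuda_memory(tag, device=0):
    # memory_allocated: bytes currently occupied by live tensors.
    # memory_reserved: bytes held by PyTorch's caching allocator (the "reserved" figure in the error).
    allocated = torch.cuda.memory_allocated(device) / 2**20
    reserved = torch.cuda.memory_reserved(device) / 2**20
    print(f"{tag}: allocated {allocated:.1f} MiB, reserved {reserved:.1f} MiB")

# e.g. inside the batch loop of train_epoch:
#     if step % 100 == 0:
#         log_cuda_memory(f"epoch {epoch} step {step}", device=rank)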

Tags: python, pytorch

Solution
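The memory growth is decided inside train_epoch (main.py, line 88 in the traceback), which is not shown, so the loop below is a generic, hypothetical pattern rather than the code from the post. The points that matter for accumulation: keep only plain Python numbers between iterations (loss.item() instead of the loss tensor), and do not collect the attention-probability tensors that the forward pass produces, since each of them drags its autograd graph along.

def train_epoch(cfg, rank, epoch, model, criterion, optimizer, scheduler, loader):
    model.train()
    losses = []
    for enc_inputs, dec_inputs, labels in loader:
        enc_inputs = enc_inputs.to(cfg.device)
        dec_inputs = dec_inputs.to(cfg.device)
        labels = labels.to(cfg.device)

        optimizer.zero_grad()
        # The attention probabilities are returned but deliberately not stored;
        # appending them to a list every batch would retain their graphs.
        outputs, *_ = model(enc_inputs, dec_inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()

        # A float, not the tensor, so this step's graph and activations can be freed.
        losses.append(loss.item())
    return sum(losses) / len(losses)

If there is a separate validation pass each epoch, wrapping it in with torch.no_grad(): prevents it from building graphs at all, which removes another common source of creeping GPU memory.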

