首页 > 解决方案 > 函数“CudnnConvolutionBackward”在其第 1 个输出中返回 nan 值

问题描述

我正在尝试训练这个简单的卷积模型:

class Modello1(nn.Module):
    """1-D convolutional classifier: three Conv1d+ReLU+MaxPool stages feeding
    three fully connected layers.

    NOTE(review): each pooled feature map is reshaped to length-1 sequences
    between conv stages, so every Conv1d effectively sees a single time step
    — presumably intentional for this data layout, but worth confirming.
    """

    def __init__(self, in_size, hidden_size1, hidden_size2, hidden_size3,
                 hidden_size4, hidden_size5, out_size):
        super().__init__()

        # Shared hyperparameters for all three convolutions.
        kernel_size = 3  # 3
        stride = 3       # 2
        padding = 3      # 3

        self.conv1 = nn.Conv1d(in_size, hidden_size1, kernel_size, stride, padding)
        self.pool = nn.MaxPool1d(kernel_size, padding=1, stride=3)

        self.conv2 = nn.Conv1d(hidden_size1, hidden_size2, kernel_size, stride, padding)

        self.conv3 = nn.Conv1d(hidden_size2, hidden_size3, kernel_size, stride, padding)

        # NOTE(review, translated): "IMPORTANT — should a flattening layer be
        # added here?" (original author's open question)

        self.linear1 = nn.Linear(hidden_size3, hidden_size4)
        self.linear2 = nn.Linear(hidden_size4, hidden_size5)
        self.linear3 = nn.Linear(hidden_size5, out_size)

    def forward(self, input_data):
        """Run one forward pass; input is (batch, features), output is logits."""
        # Add a trailing length-1 sequence axis and force float dtype.
        x = torch.reshape(input_data, (input_data.shape[0], input_data.shape[1], 1))
        x = x.float()

        x = self.pool(F.relu(self.conv1(x)))            # 70 neurons
        x = torch.reshape(x, (x.shape[0], x.shape[1], 1))

        x = self.pool(F.relu(self.conv2(x)))            # 33 neurons
        x = torch.reshape(x, (x.shape[0], x.shape[1], 1))

        x = self.pool(F.relu(self.conv3(x)))            # 33 neurons
        x = torch.reshape(x, (x.shape[0], x.shape[1]))  # drop the length axis

        x = F.relu(self.linear1(x))
        x = F.relu(self.linear2(x))
        return self.linear3(x)

    def validation_step(self, batch):
        """Compute loss and accuracy for a single validation batch.

        Relies on the module-level `criterion` and `accuracy` definitions.
        """
        input_data, targets = batch
        out = self(input_data)
        targets = targets.view(targets.shape[0])
        targets = targets.long()
        loss = criterion(out, targets)
        acc = accuracy(out, targets)

        return {'val_loss': loss, 'val_acc': acc}

    def validation_epoch_end(self, outputs):
        """Average the per-batch validation results into per-epoch metrics."""
        losses = [result['val_loss'] for result in outputs]
        epoch_loss = torch.stack(losses).mean()  # mean over the batch losses
        accs = torch.tensor([result['val_acc'] for result in outputs])
        epoch_acc = torch.mean(accs)

        return {'val_loss': epoch_loss.item(), 'val_acc': epoch_acc}

我使用 torch.autograd.set_detect_anomaly(True) 函数来检查 loss.backward() 函数中的异常,一旦我开始训练过程,我就会得到这个错误:函数'CudnnConvolutionBackward'在其第一个输出中返回了 nan 值。有谁知道它为什么会出现?

这是其余的代码:

精度函数

def accuracy(outputs, targets):
    """Return the top-1 accuracy of `outputs` against `targets`, in percent.

    Args:
        outputs: (batch, num_classes) tensor of raw class scores/logits.
        targets: (batch,) tensor of class indices (cast to long internally).

    Returns:
        float in [0, 100].
    """
    # BUG FIX: the original allocated `torch.cuda.FloatTensor(dim).fill_(0)`
    # into `preds` and immediately overwrote it — dead code that also crashed
    # on CPU-only machines. Removed.
    _, preds = torch.max(outputs, dim=1)
    # Vectorized replacement for the original Python counting loop.
    correct = (preds == targets.long()).sum().item()
    return correct / targets.shape[0] * 100

标准和优化器

# Cross-entropy loss for multi-class classification (expects raw logits
# and long-typed class-index targets).
criterion = nn.CrossEntropyLoss()
# NOTE(review): `model` must already exist when this line runs — in the
# order shown in this snippet it is only created further down, which would
# raise a NameError if executed top to bottom.
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

评估和拟合函数

def evaluate(model, val_loader):
    """Run the model's validation step over every batch in `val_loader`
    and aggregate the per-batch results into epoch-level metrics."""
    step_results = []
    for batch in val_loader:
        step_results.append(model.validation_step(batch))
    return model.validation_epoch_end(step_results)

def fit(epochs, model, train_loader, val_loader):
    """Train `model` for `epochs` epochs, validating after each one.

    Args:
        epochs: number of training epochs.
        model: module exposing validation_step/validation_epoch_end.
        train_loader: iterable of (inputs, targets) training batches.
        val_loader: iterable of validation batches, passed to `evaluate`.

    Returns:
        list of {'val_loss', 'val_acc'} dicts, one per epoch.

    Relies on the module-level `optimizer` and `criterion`.
    """
    history = []  # stores loss and accuracy for each epoch

    for epoch in range(epochs):
        since = time.time()

        running_loss = 0.0
        model.train()  # enable training-mode behavior (dropout, batchnorm)

        for batch in train_loader:
            inputs, targets = batch

            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = model(inputs)
            targets = targets.view(targets.shape[0]).long()
            loss = criterion(outputs, targets)

            loss.backward()
            optimizer.step()

            # BUG FIX: accumulate per batch. The original added only the last
            # batch's loss, once per epoch, after the loop — and then zeroed it
            # without ever reporting it.
            running_loss += loss.item()

        # validation phase — no gradients needed, eval-mode behavior
        model.eval()
        with torch.no_grad():
            result = evaluate(model, val_loader)
        history.append(result)

        time_elapsed = time.time() - since

        print("Epoch [{}], val_loss: {:.4f}, val_acc: {:.4f}%".format(epoch, result['val_loss'], result['val_acc']))
        print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
        print('-' * 10)

    return history

训练

# Instantiate the network. The size hyperparameters (input_size,
# hidden_size1..5, out_size) are assumed to be defined earlier in the
# author's script — they are not visible in this snippet.
model = Modello1(input_size, hidden_size1, hidden_size2, hidden_size3, hidden_size4, hidden_size5, out_size)
# Train for 10 epochs and keep the per-epoch validation metrics.
history = fit(10, model, train_loader, val_loader) 

标签: python、machine-learning、pytorch、conv-neural-network

解决方案


推荐阅读