首页 > 解决方案 > 如何在 Huggingface 的 BERT 之上添加 BiLSTM + CUDA out of memory. Tried to allocate 16.00 MiB

问题描述

我有下面的二分类代码,它工作正常,但我想修改 nn.Sequential 的参数并添加一个 BiLSTM 层。我有以下代码:

class BertClassifier(nn.Module):
    """BERT encoder followed by a small feed-forward classification head.

    The head consumes the last-layer hidden state of the first token
    (`[CLS]`) and produces one logit per label.
    """

    def __init__(self, freeze_bert=False):
        """Build the encoder and the classification head.

        @param    freeze_bert (bool): when True, every BERT parameter gets
                  `requires_grad = False`, so only the head is trained.
        """
        super().__init__()
        # BERT hidden size, hidden size of the head, number of labels
        bert_dim, head_dim, num_labels = 768, 50, 2

        # Pretrained encoder
        self.bert = BertModel.from_pretrained('bert-base-multilingual-uncased')

        # One-hidden-layer feed-forward head
        head_layers = [
            nn.Linear(bert_dim, head_dim),
            nn.ReLU(),
            nn.Linear(head_dim, num_labels),
        ]
        self.classifier = nn.Sequential(*head_layers)

        # Optionally keep the encoder weights fixed
        if freeze_bert:
            for weight in self.bert.parameters():
                weight.requires_grad = False

    def forward(self, input_ids, attention_mask):
        """Return the classification logits for a batch.

        @param    input_ids: token ids passed to BERT as `input_ids`.
        @param    attention_mask: mask passed to BERT as `attention_mask`.
        @return   logits: output of the classification head.
        """
        encoder_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # First element of the BERT output holds the per-token hidden states;
        # position 0 along the sequence axis is the `[CLS]` token.
        token_states = encoder_out[0]
        cls_state = token_states[:, 0, :]
        return self.classifier(cls_state)

我曾尝试像这样修改 nn.Sequential:self.classifier = nn.Sequential(nn.LSTM(D_in, H, batch_first=True, bidirectional=True),nn.ReLU(),nn.Linear(H, D_out)),但随后它会在 logits = self.classifier(last_hidden_state_cls) 这一行抛出错误 RuntimeError: input must have 3 dimensions, got 2。我发现可以使用 nn.ModuleDict 代替 nn.Sequential,于是做了如下修改:

  # BiLSTM + linear head. A bidirectional LSTM concatenates the forward and
  # backward states, so it emits 2 * H features; the linear layer must
  # therefore accept 2 * H inputs (not H).
  self.classifier = nn.ModuleDict({
   'lstm': nn.LSTM(input_size=D_in, hidden_size=H, batch_first=True, bidirectional=True),
   'linear': nn.Linear(in_features=2 * H, out_features=D_out)})

但是现在我在用它编写 forward 函数时遇到了问题。有人可以建议我如何正确修改 forward 函数吗?

更新:我还安装了 CUDA,现在运行代码时会返回错误 CUDA out of memory. Tried to allocate 16.00 MiB。我尝试过减小批量大小(batch size),但这并不能解决问题。我还尝试了下面的做法,同样没有解决。请问有什么建议吗?

import gc

import torch

# Run Python's garbage collector first, then call torch.cuda.empty_cache()
gc.collect()
torch.cuda.empty_cache()

更新后的完整代码:

    MAX_LEN = 64
    # For fine-tuning BERT, the authors recommend a batch size of 16 or 32.
    batch_size = 32
    VALID_BATCH_SIZE = 4

    # Every non-blank line of MH.txt is expected to contain a comment together
    # with the word 'positive' or 'negative'; the first such word is taken as
    # the label and all occurrences are blanked out of the text.
    possible_labels = 'positive|negative'
    list_com = []
    list_label = []
    # `with` closes the file even if a line fails to parse.
    with open('MH.txt', 'r') as file1:
        for line in file1:
            # Blank lines carry no sample and would otherwise raise IndexError
            # on `label[0]` below.
            if not line.strip():
                continue
            label = re.findall(possible_labels, line)
            line = re.sub(possible_labels, ' ', line)
            line = re.sub('\n', ' ', line)
            list_com.append(line)
            list_label.append(label[0])

    list_tuples = list(zip(list_com, list_label))
    labels = ['positive', 'negative']
    df = pd.DataFrame(list_tuples, columns=['text', 'label'])
    # Encode the string labels as integers: positive -> 1, negative -> 0
    df['label'] = df['label'].map({'positive': 1, 'negative': 0})
    # Mirror the encoded labels back into list_label (in place)
    list_label[:] = df['label'].to_numpy()

    X = df.text.values
    y = df.label.values
    # Hold out 10% of the samples for validation; fixed seed for a stable split
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.1, random_state=2020)
    
    def text_preprocessing(text):
        """Clean a raw comment before tokenization.

        - Removes '@name' mentions that are followed by a whitespace character.
        - Replaces the HTML entity '&amp;' with '&'.
        - Collapses runs of whitespace into one space and strips both ends.

        @param    text (str): a string to be processed.
        @return   text (str): the processed string.
        """
        # Remove '@name'
        text = re.sub(r'(@.*?)[\s]', ' ', text)
        # Replace '&amp;' with '&'. The pattern had been HTML-unescaped to a
        # bare '&', which turned this substitution into a no-op.
        text = re.sub(r'&amp;', '&', text)
        # Collapse whitespace and remove leading/trailing whitespace
        text = re.sub(r'\s+', ' ', text).strip()
        return text
    tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased', do_lower_case=True)

    # Create a function to tokenize a set of texts
    def preprocessing_for_bert(data):
        """Tokenize and encode a collection of texts for BERT.

        Each text is cleaned with `text_preprocessing`, encoded with the
        module-level `tokenizer`, and padded/truncated to `MAX_LEN` tokens.

        @param    data: iterable of texts to be processed.
        @return   input_ids (torch.Tensor): token ids, one row per text.
        @return   attention_masks (torch.Tensor): attention masks, one row
                  per text.
        """
        input_ids = []
        attention_masks = []

        for sent in data:
            encoded_sent = tokenizer.encode_plus(
                text=text_preprocessing(sent),  # Preprocess sentence
                add_special_tokens=True,  # Add `[CLS]` and `[SEP]`
                max_length=MAX_LEN,  # Max length to truncate/pad
                # `pad_to_max_length=True` is deprecated; request the padding
                # and truncation strategies explicitly so every row has exactly
                # MAX_LEN entries and torch.tensor() below gets a rectangular list.
                padding='max_length',
                truncation=True,
                return_attention_mask=True,  # Return attention mask
            )

            # Add the outputs to the lists
            input_ids.append(encoded_sent['input_ids'])
            attention_masks.append(encoded_sent['attention_mask'])

        # Convert lists to tensors
        return torch.tensor(input_ids), torch.tensor(attention_masks)
    
    # Tokenize both splits
    train_inputs, train_masks = preprocessing_for_bert(X_train)
    val_inputs, val_masks = preprocessing_for_bert(X_val)

    # Labels as tensors
    train_labels, val_labels = torch.tensor(y_train), torch.tensor(y_val)

    # Training set: batches are drawn through a RandomSampler
    train_data = TensorDataset(train_inputs, train_masks, train_labels)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(
        dataset=train_data,
        batch_size=batch_size,
        sampler=train_sampler,
    )

    # Validation set: batches are drawn through a SequentialSampler.
    # NOTE(review): this loader uses `batch_size`; VALID_BATCH_SIZE is not used here.
    val_data = TensorDataset(val_inputs, val_masks, val_labels)
    val_sampler = SequentialSampler(val_data)
    val_dataloader = DataLoader(
        dataset=val_data,
        batch_size=batch_size,
        sampler=val_sampler,
    )
    
    # Create the BertClassfier class
    class BertClassifier(nn.Module):
        """BERT encoder with a BiLSTM + linear head for classification tasks."""

        def __init__(self, freeze_bert=False):
            """
            @param    freeze_bert (bool): Set `False` to fine-tune the BERT model
            """
            super(BertClassifier, self).__init__()
            # Specify hidden size of BERT, hidden size of the LSTM, and number of labels
            D_in, H, D_out = 768, 50, 2

            # Instantiate BERT model
            self.bert = BertModel.from_pretrained('bert-base-multilingual-uncased')
            # BiLSTM over the token states followed by a linear layer. The LSTM
            # is bidirectional, so the sentence vector has 2 * H features.
            self.classifier = nn.ModuleDict({
                'lstm': nn.LSTM(input_size=D_in, hidden_size=H, batch_first=True, bidirectional=True),
                'linear': nn.Linear(in_features=2 * H, out_features=D_out)})

            # Freeze the BERT model
            if freeze_bert:
                for param in self.bert.parameters():
                    param.requires_grad = False

        def forward(self, input_ids, attention_mask):
            """Return the classification logits for a batch.

            @param    input_ids: token ids passed to BERT as `input_ids`.
            @param    attention_mask: mask passed to BERT as `attention_mask`;
                      its row sums are used as the sequence lengths.
                      # assumes padding is on the right — TODO confirm
            @return   logits: output of the linear layer, one row per sample.
            """
            outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
            sequence_output = outputs[0]

            # Pack the batch so the LSTM stops at each sample's last real token
            # instead of running over padding. Lengths must live on the CPU;
            # clamp guards against an all-zero mask row.
            lengths = attention_mask.sum(dim=1).clamp(min=1).long().cpu()
            packed = nn.utils.rnn.pack_padded_sequence(
                sequence_output, lengths, batch_first=True, enforce_sorted=False)

            # The sub-modules live inside the ModuleDict, not directly on self.
            _, (h_n, _) = self.classifier['lstm'](packed)
            # h_n[-2] / h_n[-1] are the final forward / backward hidden states.
            sentence_repr = torch.cat((h_n[-2], h_n[-1]), dim=1)
            return self.classifier['linear'](sentence_repr)
    
    def initialize_model(epochs=4):
        """Create the classifier, its optimizer and its learning-rate scheduler.

        @param    epochs (int): number of epochs, used to size the scheduler
                  (`len(train_dataloader) * epochs` training steps).
        @return   (model, optimizer, scheduler)
        """
        # Build the model, show its structure, and move it to `device`
        model = BertClassifier(freeze_bert=False)
        print(model)
        model.to(device)

        # AdamW over every model parameter
        adamw = AdamW(model.parameters(), lr=5e-5)

        # Linear schedule with no warmup, spanning all training steps
        num_steps = len(train_dataloader) * epochs
        lr_schedule = get_linear_schedule_with_warmup(
            adamw, num_warmup_steps=0, num_training_steps=num_steps)

        return model, adamw, lr_schedule
    
    # Loss function shared by train() and evaluate(): cross-entropy applied to
    # the model's logits and the batch labels.
    loss_fn = nn.CrossEntropyLoss()
    
    def set_seed(seed_value=42):
        """Seed Python's `random`, NumPy and PyTorch (CPU and CUDA) RNGs.

        @param    seed_value (int): seed handed to every generator.
        """
        seeders = (
            random.seed,
            np.random.seed,
            torch.manual_seed,
            torch.cuda.manual_seed_all,
        )
        for seed_rng in seeders:
            seed_rng(seed_value)
    
    
    def train(model, train_dataloader, val_dataloader=None, epochs=4, evaluation=False):
        """Train the BertClassifier model.

        Uses the module-level `optimizer`, `scheduler`, `loss_fn` and `device`.

        @param    model: the model to train.
        @param    train_dataloader: iterable of (input_ids, attention_mask, labels) batches.
        @param    val_dataloader: batches passed to `evaluate` when `evaluation` is set.
        @param    epochs (int): number of passes over `train_dataloader`.
        @param    evaluation (bool): evaluate on `val_dataloader` after each epoch.
        @raise    ValueError: if `evaluation` is set but `val_dataloader` is None.
        """
        # Fail fast: otherwise the error only shows up after a full epoch of training.
        if evaluation and val_dataloader is None:
            raise ValueError("evaluation=True requires a val_dataloader")

        # Start training loop
        print("Start training...\n")
        for epoch_i in range(epochs):
            # Print the header of the result table
            print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Acc':^9} | {'Elapsed':^9}")
            print("-" * 70)
            # Measure the elapsed time of each epoch
            t0_epoch, t0_batch = time.time(), time.time()
            # Reset tracking variables at the beginning of each epoch
            total_loss, batch_loss, batch_counts = 0, 0, 0
            # Put the model into the training mode
            model.train()
            # For each batch of training data...
            for step, batch in enumerate(train_dataloader):
                batch_counts += 1
                # Load batch to the device
                b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)
                # Zero out any previously calculated gradients
                model.zero_grad()
                # Perform a forward pass. This will return logits.
                logits = model(b_input_ids, b_attn_mask)
                # Compute loss and accumulate the loss values
                loss = loss_fn(logits, b_labels)
                batch_loss += loss.item()
                total_loss += loss.item()
                # Perform a backward pass to calculate gradients
                loss.backward()
                # Clip the norm of the gradients to 1.0 to prevent "exploding gradients"
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                # Update parameters and the learning rate
                optimizer.step()
                scheduler.step()
                # Print the loss values and time elapsed every 20 batches and on the last batch
                if (step % 20 == 0 and step != 0) or (step == len(train_dataloader) - 1):
                    # Time elapsed since the previous report
                    time_elapsed = time.time() - t0_batch
                    # Print training results
                    print(
                        f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {'-':^10} | {'-':^9} | {time_elapsed:^9.2f}")
                    # Reset batch tracking variables
                    batch_loss, batch_counts = 0, 0
                    t0_batch = time.time()
            # Calculate the average loss over the entire training data
            avg_train_loss = total_loss / len(train_dataloader)

            print("-" * 70)
            # Evaluation
            if evaluation:
                # After the completion of each training epoch, measure the model's
                # performance on the validation set.
                val_loss, val_accuracy = evaluate(model, val_dataloader)

                # Time elapsed for the whole epoch, evaluation included
                time_elapsed = time.time() - t0_epoch

                print(
                    f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {val_loss:^10.6f} | {val_accuracy:^9.2f} | {time_elapsed:^9.2f}")
                print("-" * 70)
            print("\n")

        print("Training complete!")
    
    
    def evaluate(model, val_dataloader):
        """Measure the model's loss and accuracy on the validation set.

        Uses the module-level `loss_fn` and `device`.

        @param    model: the model to evaluate; switched to eval mode here.
        @param    val_dataloader: iterable of (input_ids, attention_mask, labels) batches.
        @return   (val_loss, val_accuracy): mean per-batch loss and mean
                  per-batch accuracy, the latter in percent.
        """
        # Evaluation mode
        model.eval()

        batch_losses, batch_accuracies = [], []

        for batch in val_dataloader:
            # Move every tensor of the batch to the device
            b_input_ids, b_attn_mask, b_labels = (t.to(device) for t in batch)

            # Forward pass without gradient tracking
            with torch.no_grad():
                logits = model(b_input_ids, b_attn_mask)

            # Loss of this batch
            batch_losses.append(loss_fn(logits, b_labels).item())

            # Predicted class = index of the largest logit
            preds = torch.argmax(logits, dim=1).flatten()

            # Share of correct predictions in this batch, in percent
            hits = (preds == b_labels).cpu().numpy()
            batch_accuracies.append(hits.mean() * 100)

        # Average over the batches
        return np.mean(batch_losses), np.mean(batch_accuracies)
    
    def accuracy(probs, y_true):
        """Print the accuracy on the test set.

        The AUC report (roc_curve / auc) is not computed by this function.

        @params    probs (np.array): an array of predicted probabilities with shape (len(y_true), 2)
        @params    y_true (np.array): an array of the true values with shape (len(y_true),)
        """
        # Probability assigned to class 1
        positive_probs = probs[:, 1]
        # Predict class 1 when its probability reaches 0.5, else class 0
        y_pred = (positive_probs >= 0.5).astype(int)
        score = accuracy_score(y_true, y_pred)
        print(f'Accuracy: {score * 100:.2f}%')
    
    def bert_predict(model, test_dataloader):
        """Run the trained model over a dataloader and return class probabilities.

        Uses the module-level `device`.

        @param    model: the model to run; switched to eval mode here.
        @param    test_dataloader: iterable of batches whose first two tensors
                  are the input ids and the attention mask.
        @return   probs (np.array): softmax over dim 1 of the concatenated logits.
        """
        model.eval()

        batch_logits = []
        for batch in test_dataloader:
            # Move the batch to the device; only the first two tensors are used
            moved = [t.to(device) for t in batch]
            b_input_ids, b_attn_mask = moved[:2]

            # Forward pass without gradient tracking
            with torch.no_grad():
                logits = model(b_input_ids, b_attn_mask)
            batch_logits.append(logits)

        # Stack the per-batch logits and turn them into probabilities
        stacked = torch.cat(batch_logits, dim=0)
        return F.softmax(stacked, dim=1).cpu().numpy()
    
    # Seed the RNGs for reproducibility
    set_seed(42)
    # `optimizer` and `scheduler` must stay module-level: train() reads them as globals
    bert_classifier, optimizer, scheduler = initialize_model(epochs=3)
    # Train for three epochs, evaluating on the validation set after each one
    train(
        model=bert_classifier,
        train_dataloader=train_dataloader,
        val_dataloader=val_dataloader,
        epochs=3,
        evaluation=True,
    )
    # Predicted probabilities on the validation set
    probs = bert_predict(bert_classifier, val_dataloader)
    # Report the classifier's accuracy against the validation labels
    accuracy(probs, y_val)

标签: python lstm bert-language-model huggingface-transformers

解决方案


推荐阅读