Wrong tensor type when trying to follow the HuggingFace tutorial (PyTorch)

Problem description

I have recently been trying to get some hands-on experience with the transformers library from Hugging Face. Since I am an absolute beginner with PyTorch (and with deep learning in general), I started with the introduction that can be found here.

Here is the code that installs the dependencies:

#!pip install transformers
!pip install transformers[sentencepiece] # includes transformers dependencies
!pip install datasets # datasets from huggingface hub
!pip install tqdm

Here is the code they suggest for fine-tuning BERT on the MRPC dataset (part of the GLUE benchmark). Each "sample" in this dataset contains two sentences, so the tokenizer has to be fed both sentence1 and sentence2:

from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification
from transformers import AdamW
from transformers import get_scheduler
import torch
from tqdm.auto import tqdm

raw_datasets = load_dataset("glue", "mrpc")
checkpoint = "bert-base-uncased"

# functions defining how the tokenizer works
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(example):
  return tokenizer(example["sentence1"], example["sentence2"], truncation=True)

# tokenizer will use dynamic padding (https://huggingface.co/course/chapter3/2?fw=pt)
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# remove unnecessary columns from the data and format as torch tensors
tokenized_datasets = tokenized_datasets.remove_columns(
  ["sentence1", "sentence2", "idx"]
)
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")

train_dataloader = DataLoader(
  tokenized_datasets["train"], shuffle=True, batch_size=8, collate_fn=data_collator
)
eval_dataloader = DataLoader(
  tokenized_datasets["validation"], batch_size=8, collate_fn=data_collator
)

# loading model and training requirements
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

optimizer = AdamW(model.parameters(), lr=5e-5)

num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
  "linear",
  optimizer=optimizer,
  num_warmup_steps=0,
  num_training_steps=num_training_steps
)
print(num_training_steps)

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

progress_bar = tqdm(range(num_training_steps))

# training loop:
model.train()
for epoch in range(num_epochs):
  for batch in train_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    outputs = model(**batch)
    loss = outputs.loss
    loss.backward()

    optimizer.step()
    lr_scheduler.step()
    optimizer.zero_grad()
    progress_bar.update(1)
    # assert 1==0
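
As a quick sanity check (my own addition, not part of the course code), you can peek at one batch produced by the DataLoader and the collator to see which keys and dtypes actually reach the model:

# Inspect a single dynamically padded batch (my own sanity check, not from the course).
batch = next(iter(train_dataloader))
print({k: (tuple(v.shape), v.dtype) for k, v in batch.items()})
# Expected keys for a BERT sentence-pair task: input_ids, token_type_ids,
# attention_mask (all torch.int64) and labels (torch.int64 class indices).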

This works perfectly for me in Google Colab. I then wanted to do the same thing with another dataset, sst2. The code I use is very similar to the one above; the only lines that change are the one that loads the data and the tokenizer call (each example has a single sentence instead of two). I have double-checked that the tokenizer works fine. Here is my code:

# imports
import torch
from datasets import load_dataset # datasets from huggingface
# tokenization
from transformers import AutoTokenizer, DataCollatorWithPadding
from torch.utils.data import DataLoader
# training
from transformers import AdamW, AutoModelForSequenceClassification, get_scheduler
from tqdm.auto import tqdm

# Hyperparameters
batch_size = 8
learning_rate = 5e-5
num_epochs = 3
num_warmup_steps = 0

# load dataset and choose checkpoint
raw_datasets = load_dataset("glue", "sst2")
checkpoint = "bert-base-uncased"
# load tokenizer
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# tokenization of dataset
def tokenize_function(example):
  return tokenizer(example["sentence"], truncation=True)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(["sentence", "idx"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels") 
tokenized_datasets.set_format("torch")

# setting DataLoader
train_dataloader = DataLoader(
  tokenized_datasets["train"], shuffle=True, batch_size=batch_size, collate_fn=data_collator
)
eval_dataloader = DataLoader(
  tokenized_datasets["validation"], batch_size=batch_size, collate_fn=data_collator
)

# import model
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=1)

# setup training loop
optimizer = AdamW(model.parameters(), lr=learning_rate)

num_training_steps = num_epochs * len(train_dataloader)
print(num_training_steps)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps
)
# choose device (GPU or CPU)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
  for batch in train_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()} 
    for k,v in batch.items():
      print(f"key={k},v.dtype={v.dtype}, type(v)={type(v)}")
    outputs = model(**batch)
    loss = outputs.loss
    loss.backward()
        
    optimizer.step()
    lr_scheduler.step()
    optimizer.zero_grad()
    progress_bar.update(1)
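
(For reference, the "double check" of the tokenizer mentioned above can be done with a quick sketch like the following; this is my own addition, not part of the script:)

# Quick check that tokenizing a single sst2 sentence behaves as expected
# (my own addition; "sentence" is the text field of the sst2 split of GLUE).
sample = raw_datasets["train"][0]
print(sample["sentence"])
print(tokenizer(sample["sentence"], truncation=True))
# -> a dict with input_ids, token_type_ids and attention_mask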

Here is the error I get:

RuntimeError                              Traceback (most recent call last)
<ipython-input-11-7893d7715ac2> in <module>()
     69     outputs = model(**batch)
     70     loss = outputs.loss
---> 71     loss.backward()
     72 
     73     optimizer.step()

1 frames
/usr/local/lib/python3.7/dist-packages/torch/autograd/__init__.py in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)
    147     Variable._execution_engine.run_backward(
    148         tensors, grad_tensors_, retain_graph, create_graph, inputs,
--> 149         allow_unreachable=True, accumulate_grad=True)  # allow_unreachable flag
    150 
    151 

RuntimeError: Found dtype Long but expected Float

This seems like a really silly mistake, but as I said, I am an absolute PyTorch beginner and I have a hard time knowing where to start fixing it. I have checked the types of the values in batch.items(), and in both cases they are torch.int64 (i.e. torch.long). I tried changing the attention_mask and input_ids values to torch.float32, but I got the same error message.

Thanks in advance.

Python version and packages:

Tags: python, nlp, pytorch, huggingface-transformers

Solution


I found the source of the problem. The issue comes from the line

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=1)

Since the dataset has 2 classes, the correct way to instantiate the model is

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

With this change, my code now works.
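
For context on why num_labels=1 produced exactly this dtype error: the sequence-classification heads in transformers choose their loss based on the number of labels. With a single label the model takes the regression branch and applies MSELoss, which needs float targets, while the GLUE labels arrive as torch.long; with two (or more) labels it applies CrossEntropyLoss, which is what expects integer class indices. The following is a simplified sketch of that selection logic, not the actual library source (check the version you have installed for the exact code):

import torch
from torch.nn import CrossEntropyLoss, MSELoss

# Simplified sketch of the loss selection inside *ForSequenceClassification
# (illustrative only, not the real transformers source).
def classification_loss(logits, labels, num_labels):
    if num_labels == 1:
        # Regression branch: MSELoss wants float targets, so the long (int64)
        # GLUE labels trigger "Found dtype Long but expected Float".
        return MSELoss()(logits.view(-1), labels.view(-1))
    # Classification branch: CrossEntropyLoss expects long class indices.
    return CrossEntropyLoss()(logits.view(-1, num_labels), labels.view(-1))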

