python - 尝试执行 HuggingFace 教程 (pytorch) 时错误的张量类型
问题描述
我最近一直在尝试从 Hugging Face 获得变压器库的经验。由于我在使用 Pytorch(以及一般的深度学习)方面绝对是个菜鸟,所以我从可以在此处找到的介绍开始。
这是安装依赖项的代码:
#!pip install transformers
!pip install transformers[sentencepiece] # includes transformers dependencies
!pip install datasets # datasets from huggingface hub
!pip install tqdm
这是他们建议用来微调 BERT MNPR 数据集(用于 GLUE 基准测试)的代码。该数据集每个“样本”包含两个句子,因此在分词器中我们必须使用sentence1
和sentence2
。
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification
from transformers import AdamW
from transformers import get_scheduler
import torch
from tqdm.auto import tqdm
raw_datasets = load_dataset("glue", "mrpc")
checkpoint = "bert-base-uncased"
# functions defining how the tokenizer works
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
def tokenize_function(example):
return tokenizer(example["sentence1"], example["sentence2"], truncation=True)
# tokenizer will use dynamic padding (https://huggingface.co/course/chapter3/2?fw=pt)
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# remove unecessary columns from data and format in torch tensors
tokenized_datasets = tokenized_datasets.remove_columns(
["sentence1", "sentence2", "idx"]
)
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")
train_dataloader = DataLoader(
tokenized_datasets["train"], shuffle=True, batch_size=8, collate_fn=data_collator
)
eval_dataloader = DataLoader(
tokenized_datasets["validation"], batch_size=8, collate_fn=data_collator
)
# loading model and training requirements
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
"linear",
optimizer=optimizer,
num_warmup_steps=0,
num_training_steps=num_training_steps
)
print(num_training_steps)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
progress_bar = tqdm(range(num_training_steps))
# training loop:
model.train()
for epoch in range(num_epochs):
for batch in train_dataloader:
batch = {k: v.to(device) for k, v in batch.items()}
outputs = model(**batch)
loss = outputs.loss
loss.backward()
optimizer.step()
lr_scheduler.step()
optimizer.zero_grad()
progress_bar.update(1)
# assert 1==0
这对我在 Google Colab 中非常有效。我想对另一个数据集做同样的事情sst2
。我使用的代码与上面的代码非常相似。唯一改变的几行代码是导入数据的行和分词器(我们每个特征只有一个句子,而不是两个)。我已经仔细检查过,标记器工作正常。这是我的代码:
# imports
import torch
from datasets import load_dataset # datasets from huggingface
# tokenization
from transformers import AutoTokenizer, DataCollatorWithPadding
from torch.utils.data import DataLoader
# training
from transformers import AdamW, AutoModelForSequenceClassification, get_scheduler
from tqdm.auto import tqdm
# Hyperparameters
batch_size = 8
learning_rate = 5e-5
num_epochs = 3
num_warmup_steps = 0
# load dataset and choosing checkpoint
raw_datasets = load_dataset("glue", "sst2")
checkpoint = "bert-base-uncased"
# load tokenizer
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# tokenization of dataset
def tokenize_function(example):
return tokenizer(example["sentence"], truncation=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(["sentence", "idx"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")
# setting DataLoader
train_dataloader = DataLoader(
tokenized_datasets["train"], shuffle=True, batch_size=batch_size, collate_fn=data_collator
)
eval_dataloader = DataLoader(
tokenized_datasets["validation"], batch_size=batch_size, collate_fn=data_collator
)
# import model
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=1)
# setup training loop
optimizer = AdamW(model.parameters(), lr=learning_rate)
num_training_steps = num_epochs * len(train_dataloader)
print(num_training_steps)
lr_scheduler = get_scheduler(
"linear",
optimizer=optimizer,
num_warmup_steps=num_warmup_steps,
num_training_steps=num_training_steps
)
# chose device (GPU or CPU)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
progress_bar = tqdm(range(num_training_steps))
model.train()
for epoch in range(num_epochs):
for batch in train_dataloader:
batch = {k: v.to(device) for k, v in batch.items()}
for k,v in batch.items():
print(f"key={k},v.dtype={v.dtype}, type(v)={type(v)}")
outputs = model(**batch)
loss = outputs.loss
loss.backward()
optimizer.step()
lr_scheduler.step()
optimizer.zero_grad()
progress_bar.update(1)
这是我得到的错误:
RuntimeError Traceback (most recent call last)
<ipython-input-11-7893d7715ac2> in <module>()
69 outputs = model(**batch)
70 loss = outputs.loss
---> 71 loss.backward()
72
73 optimizer.step()
1 frames
/usr/local/lib/python3.7/dist-packages/torch/autograd/__init__.py in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)
147 Variable._execution_engine.run_backward(
148 tensors, grad_tensors_, retain_graph, create_graph, inputs,
--> 149 allow_unreachable=True, accumulate_grad=True) # allow_unreachable flag
150
151
RuntimeError: Found dtype Long but expected Float
这似乎是一个非常愚蠢的错误,但就像我说的那样,我是一个绝对的 pytorch 菜鸟,我很难知道从哪里开始解决这个问题。我已经检查了值的类型,batch.items()
在这两种情况下,它们都是torch.int64
(或torch.long
)。我试图将attention_mask
和input_ids
值更改为torch.float32
,但我收到了相同的错误消息。
提前致谢。
Python版本和包:
- 蟒蛇 3.7.20
- Pytorch 1.9.0+cu102
- 变压器 4.8.2
- GPU:Tesla T4(也尝试使用 tesla P4)
解决方案
我找到了问题的根源。问题来自于线路
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=1)
由于数据集有2个类,调用模型的正确方法应该是
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
通过这个修改,我的代码现在可以工作了。
推荐阅读
- ringcentral - RingCentral Meetings API - 用户需要有 [Meetings] 权限
- javascript - 以下场景的编码标准需要明确或建议
- reactjs - React Redux - 如何在 url 更改时使用 React 和 Redux 进行正确的加载屏幕
- ios - PickerView 快速删除占位符标签
- google-apps-script - 是否可以在 AppsScript 中获取新的 Google 幻灯片演示数据?
- python - Pyinstaller:不能包含 .mp3 文件
- apache-superset - 如何在apache超集中添加大于过滤器(过滤器框)的动态?
- cefsharp - URL的初始加载缓慢
- ios - 如何判断 iCloud 文件是否已下载?
- drools - 加载持久会话时所有事件的 Drools 触发规则