python - Huggingface GPT2 and T5 model APIs for sentence classification?
Problem description
I have successfully used the Huggingface Transformers BERT model for sentence classification via the BertForSequenceClassification class and API. I have used it for single-sentence sentiment analysis and 2-sentence NLI.
I can see that other models have analogous classes, e.g. XLNetForSequenceClassification and RobertaForSequenceClassification. This kind of sentence classification usually involves placing a classifier layer on top of a dense vector that represents the entire sentence.
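For reference, the BERT usage described above looks roughly like this. This is only a minimal sketch assuming a recent transformers version; the checkpoint name, label count, and example sentences are illustrative, not part of the original question:

import torch
from transformers import BertTokenizer, BertForSequenceClassification

# Illustrative checkpoint and label count; in practice you would load a fine-tuned model.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)

# 2-sentence (NLI-style) input: the tokenizer builds [CLS] premise [SEP] hypothesis [SEP]
inputs = tokenizer("A man is playing guitar.", "A person is making music.", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits  # (1, num_labels): classifier head over the pooled [CLS] vector
predicted_class = logits.argmax(dim=-1).item()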
Now I am trying to use the GPT2 and T5 models. However, when I look at the classes and APIs available for each of them, there is no equivalent "ForSequenceClassification" class. For example, for GPT2 there are the GPT2Model, GPT2LMHeadModel, and GPT2DoubleHeadsModel classes. Perhaps I am not familiar enough with the research on GPT2 and T5, but I am certain that both models are capable of sentence classification.
So my questions are:
Which Huggingface classes should I use for 1-sentence classification with GPT2 and T5?
Which classes should I use for 2-sentence (sentence-pair) classification, such as natural language inference?
Thanks for any help.
Solution
You need to use the GPT2Model class to generate the sentence embeddings of the text. Once you have the embeddings, feed them to a linear NN and a softmax function to obtain the logits. Below is a component I am working on for text classification using GPT2 (still a work in progress, so I'm open to suggestions); it follows the logic just described:
from torch_model_base import TorchModelBase
import torch
import torch.nn as nn
import torch.utils.data
from transformers import GPT2Tokenizer, GPT2Model
import random
from spacy.util import minibatch, compounding
import numpy as np
from sklearn.base import TransformerMixin, BaseEstimator
import pandas as pd
from typing import List, Tuple


def mean_across_all_tokens(hidden_states):
    return torch.mean(hidden_states[-1], dim=1)


def sum_all_tokens(hidden_states):
    return torch.sum(hidden_states[-1], dim=1)


def concat_all_tokens(hidden_states):
    batch_size, max_tokens, emb_dim = hidden_states[-1].shape
    return torch.reshape(hidden_states[-1], (batch_size, max_tokens * emb_dim))


class GPT2SequenceClassifierModel(nn.Module):
    def __init__(
            self,
            hidden_size: int,
            num_classes: int,
            gpt_model_name: str,
            max_seq_length: int = 280,
            embedding_func=mean_across_all_tokens,
            combine_sentence_tokens=True
    ):
        super(GPT2SequenceClassifierModel, self).__init__()
        self.hidden_size = hidden_size
        self.fc1 = nn.Linear(hidden_size, num_classes)
        self.model = GPT2Model.from_pretrained(
            gpt_model_name,
            output_hidden_states=True
        )
        self.tokenizer = GPT2Tokenizer.from_pretrained(gpt_model_name)
        self.combine_sentence_tokens = combine_sentence_tokens
        self.embedding_func = embedding_func
        self.model.eval()
        self.max_length = max_seq_length

    def _tokenize(self, text_list: List[str]) -> Tuple[torch.tensor, torch.tensor]:
        # Tokenize the text with the provided tokenizer
        # self.tokenizer.pad_token = self.tokenizer.eos_token
        self.tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        self.tokenizer.add_special_tokens({'cls_token': '[CLS]'})
        self.model.resize_token_embeddings(len(self.tokenizer))
        input_ids = self.tokenizer.batch_encode_plus(text_list,
                                                     add_special_tokens=True,
                                                     max_length=self.max_length,
                                                     pad_to_max_length=True
                                                     )["input_ids"]
        return torch.LongTensor(input_ids)

    def _tokenize_and_predict(self, text_list: List[str]) -> torch.tensor:
        input_ids_tensor = self._tokenize(text_list)
        out = self.model(input_ids=input_ids_tensor)
        hidden_states = out[2]
        if self.combine_sentence_tokens:
            return self.embedding_func(hidden_states)
        else:
            return hidden_states[-1]

    def forward(self, text_list: List[str]):
        """
        :param text_list: list of raw text strings
        :return: logits for class
        """
        if isinstance(text_list, pd.Series):
            text_list = text_list.tolist()
        with torch.no_grad():
            # fine tuning GPT2 model is too expensive, so won't do it
            gpt_out = self._tokenize_and_predict(text_list)
        batch_size = len(text_list)
        assert gpt_out.shape == (batch_size, self.hidden_size)
        prediction_vector = self.fc1(gpt_out)  # (batch_size, num_classes)
        logits = torch.softmax(prediction_vector, dim=1)
        return logits


class GPT2Classifier(TorchModelBase):
    """GPT2 + NN head for classification problems.
    The network will work for any kind of classification task.

    Parameters
    ----------
    embed_dim: dimension of the byte-pair/token embeddings generated by the model;
        check the model card (the n_embd property), since each model works with
        only one embedding dimension
    max_seq_length: max tokens in a sequence (the n_positions param in the Hugging
        Face model config); shorter sequences will get padded
    """
    def __init__(self,
                 model_name="distilgpt2",
                 embed_dim=768,
                 max_seq_length=1024,
                 **kwargs
                 ):
        self.model_name = model_name
        self.embed_dim = embed_dim
        self.max_seq_length = max_seq_length
        self.model = None  # call fit() to set this
        self.tokenizer = None  # call fit() to set this
        self.classes = None  # call fit() to set this
        super(GPT2Classifier, self).__init__(**kwargs)
        self.params += ['model_name']

    def fit(self, X, y):
        """Standard `fit` method.

        Parameters
        ----------
        X : np.array
        y : array-like

        Returns
        -------
        self
        """
        self.classes = list(set(y))
        self.model = GPT2SequenceClassifierModel(
            hidden_size=self.embed_dim,
            num_classes=len(self.classes),
            gpt_model_name=self.model_name,
            max_seq_length=self.max_seq_length
        )
        self.opt = self.optimizer(
            self.model.parameters()
        )
        self.model.train()
        loss = nn.CrossEntropyLoss()
        print("Training... max iters: ", self.max_iter)
        for epoch in range(self.max_iter):
            print("epoch no: ", epoch)
            zipped_data = list(zip(X, y))
            random.shuffle(zipped_data)
            batches = minibatch(zipped_data, size=self.batch_size)
            for batch in batches:
                X_batch, y_batch = zip(*batch)
                batch_preds = self.model(X_batch)
                err = loss(batch_preds, torch.LongTensor(y_batch))
                # Backprop:
                self.opt.zero_grad()
                err.backward()
                self.opt.step()
        return self

    def predict_proba(self, X):
        """Predicted probabilities for the examples in `X`.

        Parameters
        ----------
        X : np.array

        Returns
        -------
        np.array with shape (len(X), self.n_classes_)
        """
        self.model.eval()
        with torch.no_grad():
            preds = self.model(X)
            preds = preds.numpy()
            return preds

    def predict(self, X):
        """Predicted labels for the examples in `X`. These are converted
        from the integers that PyTorch needs back to their original
        values in `self.classes_`.

        Parameters
        ----------
        X : np.array

        Returns
        -------
        list of length len(X)
        """
        probs = self.predict_proba(X)
        return [self.classes[i] for i in probs.argmax(axis=1)]
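For completeness, here is a hypothetical usage sketch of GPT2Classifier. It assumes that the TorchModelBase base class imported above supplies self.optimizer, self.batch_size, and self.max_iter (which the code relies on); the texts and labels below are made up, and labels should already be integer class ids because fit() passes them straight to torch.LongTensor:

# Illustrative training data; not from the original answer.
train_texts = ["the movie was great", "terrible plot and acting",
               "loved every minute", "a complete waste of time"]
train_labels = [1, 0, 1, 0]  # integer class ids

clf = GPT2Classifier(model_name="distilgpt2", embed_dim=768, max_seq_length=1024)
clf.fit(train_texts, train_labels)
print(clf.predict(["an absolute delight"]))        # predicted label per input text
print(clf.predict_proba(["an absolute delight"]))  # class probabilities (softmax output)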