python - spaCy 空白 NER 模型拟合不足,即使在大型数据集上进行训练
问题描述
我正在尝试创建一个自定义 NER 模型来识别网络安全相关实体(其中 27 个)。我决定使用空白模型,因为我认为我有足够大(不确定)的训练数据集(从 Wikipedia 中提取的大约 11k 个句子)。
为了创建 spaCy 所需的训练数据,我使用了PhraseMatcher实用程序。这个想法是匹配与我想要识别的实体相关的某些预定义单词/短语,如下图所示:
import spacy
from spacy.matcher import PhraseMatcher
nlp = spacy.load("en")
import pandas as pd
from tqdm import tqdm
from collections import defaultdict
指定匹配器标签
users_pattern = [nlp(text) for text in ("user", "human", "person", "people", "end user")]
devices_pattern = [nlp(text) for text in ("device", "peripheral", "appliance", "component", "accesory", "equipment", "machine")]
accounts_pattern = [nlp(text) for text in ("account", "user account", "username", "user name", "loginname", "login name", "screenname", "screen name", "account name")]
identifiers_pattern = [nlp(text) for text in ("attribute", "id", "ID", "code", "ID code")]
authentication_pattern = [nlp(text) for text in ("authentication", "authenticity", "certification", "verification", "attestation", "authenticator", "authenticators")]
time_pattern = [nlp(text) for text in ("time", "date", "moment", "present", "pace", "moment")]
unauthorized_pattern = [nlp(text) for text in ("unauthorized", "illegal", "illegitimate", "pirated", "unapproved", "unjustified", "unofficial")]
disclosure_pattern = [nlp(text) for text in ("disclosure", "acknowledgment", "admission", "exposure", "advertisement", "divulgation")]
network_pattern = [nlp(text) for text in ("network", "net", "networking", "internet", "Internet")]
wireless_pattern = [nlp(text) for text in ("wireless", "wifi", "Wi-Fi", "wireless networking")]
password_pattern = [nlp(text) for text in ("password", "passwords", "passcode", "passphrase")]
configuration_pattern = [nlp(text) for text in ("configuration", "composition")]
signatures_pattern = [nlp(text) for text in ("signature", "signatures", "digital signature", "electronic signature")]
certificates_pattern = [nlp(text) for text in ("certificate", "digital certificates", "authorization certificate", "public key certificates", "PKI", "X509", "X.509")]
revocation_pattern = [nlp(text) for text in ("revocation", "annulment", "cancellation")]
keys_pattern = [nlp(text) for text in ("key", "keys")]
algorithms_pattern = [nlp(text) for text in ("algorithm", "algorithms", "formula", "program")]
standard_pattern = [nlp(text) for text in ("standard", "standards", "specification", "specifications", "norm", "rule", "rules", "RFC")]
invalid_pattern = [nlp(text) for text in ("invalid", "false", "unreasonable", "inoperative")]
access_pattern = [nlp(text) for text in ("access", "connection", "entry", "entrance")]
blocking_pattern = [nlp(text) for text in ("blocking", "block", "blacklist", "blocklist", "close", "cut off", "deter", "prevent", "stop")]
notification_pattern = [nlp(text) for text in ("notification", "notifications", "notice", "warning")]
messages_pattern = [nlp(text) for text in ("message", "messages", "note", "news")]
untrusted_pattern = [nlp(text) for text in ("untrusted", "malicious", "unsafe")]
security_pattern = [nlp(text) for text in ("security", "secure", "securely", "protect", "defend", "guard")]
symmetric_pattern = [nlp(text) for text in ("symmetric", "symmetric crypto")]
asymmetric_pattern = [nlp(text) for text in ("asymmetric", "asymmetric crypto")]
matcher = PhraseMatcher(nlp.vocab)
matcher.add("USER", None, *users_pattern)
matcher.add("DEVICE", None, *devices_pattern)
matcher.add("ACCOUNT", None, *accounts_pattern)
matcher.add("IDENTIFIER", None, *identifiers_pattern)
matcher.add("AUTHENTICATION", None, *authentication_pattern)
matcher.add("TIME", None, *time_pattern)
matcher.add("UNAUTHORIZED", None, *unauthorized_pattern)
matcher.add("DISCLOSURE", None, *disclosure_pattern)
matcher.add("NETWORK", None, *network_pattern)
matcher.add("WIRELESS", None, *wireless_pattern)
matcher.add("PASSWORD", None, *password_pattern)
matcher.add("CONFIGURATION", None, *configuration_pattern)
matcher.add("SIGNATURE", None, *signatures_pattern)
matcher.add("CERTIFICATE", None, *certificates_pattern)
matcher.add("REVOCATION", None, *revocation_pattern)
matcher.add("KEY", None, *keys_pattern)
matcher.add("ALGORITHM", None, *algorithms_pattern)
matcher.add("STANDARD", None, *standard_pattern)
matcher.add("INVALID", None, *invalid_pattern)
matcher.add("ACCESS", None, *access_pattern)
matcher.add("BLOCKING", None, *blocking_pattern)
matcher.add("NOTIFICATION", None, *notification_pattern)
matcher.add("MESSAGE", None, *messages_pattern)
matcher.add("UNTRUSTED", None, *untrusted_pattern)
matcher.add("SECURITY", None, *security_pattern)
matcher.add("SYMMETRIC", None, *symmetric_pattern)
matcher.add("ASYMMETRIC", None, *asymmetric_pattern)
准备训练数据
def offsetter(lbl, doc, matchitem):
"""
Convert PhaseMatcher result to the format required in training (start, end, label)
"""
o_one = len(str(doc[0:matchitem[1]]))
subdoc = doc[matchitem[1]:matchitem[2]]
o_two = o_one + len(str(subdoc))
return (o_one, o_two, lbl)
to_train_ents = []
count_dic = defaultdict(int)
# Load the original sentences
df = pd.read_csv("sentences.csv", index_col=False)
phrases = df["sentence"].values
for line in tqdm(phrases):
nlp_line = nlp(line)
matches = matcher(nlp_line)
if matches:
for match in matches:
match_id = match[0]
start = match[1]
end = match[2]
label = nlp.vocab.strings[match_id] # get the unicode ID, i.e. 'COLOR'
span = nlp_line[start:end] # get the matched slice of the doc
count_dic[label] += 1
res = [offsetter(label, nlp_line, match)]
to_train_ents.append((line, dict(entities=res)))
count_dic = dict(count_dic)
TRAIN_DATA = to_train_ents
执行上述代码后,我得到了 spaCy 所需格式的训练数据。这些句子包含我感兴趣的实体,分布如下所示:
print(sorted(count_dic.items(), key=lambda x:x[1], reverse=True), len(count_dic))
sum(count_dic.values())
[('NETWORK', 1962), ('TIME', 1489), ('USER', 1206), ('SECURITY', 981), ('DEVICE', 884), ('STANDARD', 796), ('ACCESS', 652), ('ALGORITHM', 651), ('MESSAGE', 605), ('KEY', 423), ('IDENTIFIER', 389), ('BLOCKING', 354), ('AUTHENTICATION', 141), ('WIRELESS', 109), ('UNAUTHORIZED', 99), ('CONFIGURATION', 89), ('ACCOUNT', 86), ('UNTRUSTED', 77), ('PASSWORD', 62), ('DISCLOSURE', 58), ('NOTIFICATION', 55), ('INVALID', 44), ('SIGNATURE', 41), ('SYMMETRIC', 23), ('ASYMMETRIC', 11), ('CERTIFICATE', 10), ('REVOCATION', 9)] 27
11306
然后,我使用标准训练程序在 spaCy 中训练了一个空白 NER 模型,如下所示。
训练空白模型
# define variables
model = None
n_iter = 100
if model is not None:
nlp_new = spacy.load(model) # load existing spaCy model
print("Loaded model '%s'" % model)
else:
nlp_new = spacy.blank("en") # create blank Language class
print("Created blank 'en' model")
# Add entity recognizer to model if it's not in the pipeline
# nlp.create_pipe works for built-ins that are registered with spaCy
if "ner" not in nlp_new.pipe_names:
ner = nlp_new.create_pipe("ner")
nlp_new.add_pipe(ner)
# otherwise, get it, so we can add labels to it
else:
ner = nlp_new.get_pipe("ner")
# add labels
for _, annotations in TRAIN_DATA:
for ent in annotations.get("entities"):
ner.add_label(ent[2])
# get names of other pipes to disable them during training
other_pipes = [pipe for pipe in nlp_new.pipe_names if pipe != "ner"]
with nlp_new.disable_pipes(*other_pipes): # only train NER
if model is None:
optimizer = nlp_new.begin_training()
else:
optimizer = nlp_new.resume_training()
# Set this based on this resource: spacy compounding batch size
sizes = compounding(1, 16, 1.001)
# batch up the examples using spaCy's minibatch
for itn in tqdm(range(n_iter)):
losses = {}
random.shuffle(TRAIN_DATA)
batches = minibatch(TRAIN_DATA, size=sizes)
for batch in batches:
texts, annotations = zip(*batch)
nlp_new.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
print("Losses", losses)
在此之后的最终损失约为500。
最后,我使用训练数据测试了新模型的性能。我希望恢复与训练数据集中最初指定的实体一样多的实体。然而,在运行下面的代码之后,我总共只得到了大约 600 个实例,其中大约 11k。
测试训练模型
count_dic = defaultdict(int)
for text, _ in TRAIN_DATA:
doc = nlp_new(text)
for ent in doc.ents:
count_dic[ent.label_] += 1
print(sorted(count_dic.items(), key=lambda x:x[1], reverse=True), len(count_dic))
sum(count_dic.values())
[('TIME', 369), ('NETWORK', 47), ('IDENTIFIER', 41), ('BLOCKING', 28), ('USER', 22), ('STANDARD', 22), ('SECURITY', 15), ('MESSAGE', 15), ('ACCESS', 7), ('CONFIGURATION', 7), ('DEVICE', 7), ('KEY', 4), ('ALGORITHM', 3), ('SYMMETRIC', 2), ('UNAUTHORIZED', 2), ('SIGNATURE', 2), ('WIRELESS', 1), ('DISCLOSURE', 1), ('INVALID', 1), ('PASSWORD', 1), ('NOTIFICATION', 1)] 21
598
我想知道为什么这个过程会产生一个具有这种欠拟合行为的模型。我知道这些帖子中的评论:NER training using Spacy and SPACY custom NER is not return any entity but they didn't resolve my issue。
我希望您可以就我所做的工作以及如何改进对训练集中实体的检测提供任何反馈。我认为 11k 句子就足够了,除非我做错了什么。我正在使用 Python 3.6.9 和 spaCy 2.2.4。
非常感谢你的帮助。
更新
我决定训练包含正样本和负样本的模型。现在训练数据有超过 40k 的句子。然而,这种变化确实改善了训练集中的分类结果。还有其他建议吗?
训练数据集
完整的训练数据集可以从这里下载。
解决方案
欠拟合可能是由于 spacy 空白模型太小而无法在您的情况下表现良好。根据我的经验,spacy 空白模型大约 5Mb,这很小(特别是如果我们将它与大约 500 Mb 的 spacy 预训练模型的大小进行比较)。
事实上,您有 27 个不同的标签和大量数据。
我不知道是否有可能从头开始创建更大的空间模型。欢迎回答。
推荐阅读
- pine-script - 有没有办法在不同的窗格中绘制策略中的不同图?
- postgresql - postgress 极光导出到 s3
- laravel - 为什么我的下拉列表没有显示在 Select2 中?
- database-design - 订单交付系统的关系和基数
- json - 道具值作为 JSON 文件中的索引 - React Hooks
- sql - 查找查询以根据月/年区分记录
- webrtc - 正确拆解 WebRTC RTCPeerConnection
- javascript - 投影上下文中连接对象的轮廓着色器?
- python - Pyautogui随机自动按住shift?我猜?
- google-bigquery - Bigquery - 如何组合连续日期范围内的数据