spacy - 如何在 spaCy 上训练伪投影解析器?
问题描述
我正在尝试按照https://raw.githubusercontent.com/explosion/spaCy/master/examples/training/train_intent_parser.py中的示例代码训练自定义语义解析器
这个想法是得到一个非投影解析所以当我传递了这样的文本:ROOT AAAA BBBB 12 21
12 成为 AAAA 的孩子,21 成为 BBBB 的孩子。为了测试这一点,我只训练这个案例并测试同样的案例,但它似乎不起作用,我得到的回应是:
[('ROOT', 'ROOT', 'ROOT'), ('AAAA', 'LETTERS', 'ROOT'), ('BBBB', 'LETTERS', 'ROOT'), ('12', 'NUMBERS', 'BBBB'), ('21', 'NUMBERS', 'BBBB')]
如您所见,这两个数字都依赖于 BBBB,而 12 应该依赖于 AAAA。
我用来训练和测试的代码是:
import plac
import random
import spacy
from spacy.util import minibatch, compounding

# Build the training set: the same annotated sentence repeated `samples` times.
# NOTE(review): heads [0, 0, 0, 1, 2] describe a NON-projective tree — the
# arcs AAAA->12 and BBBB->21 cross, which is exactly what this question is about.
samples = 1000
TRAIN_DATA = [
    (
        'ROOT AAAA BBBB 12 21',
        {
            'heads': [0, 0, 0, 1, 2],
            'deps': ['ROOT', 'LETTERS', 'LETTERS', 'NUMBERS', 'NUMBERS'],
        },
    )
    for _ in range(samples)
]
def test_model(nlp):
    """Parse the toy sentence and print each token's dependency and head."""
    sample_texts = ['ROOT AAAA BBBB 12 21']
    for parsed in nlp.pipe(sample_texts):
        print(parsed.text)
        # Skip tokens the parser left unattached (dep_ == "-").
        print([(tok.text, tok.dep_, tok.head.text) for tok in parsed if tok.dep_ != "-"])
@plac.annotations(
    model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
    n_iter=("Number of training iterations", "option", "n", int),
)
# Just in case I am using the german model since it supports pseudo-projective
# parsing (https://explosion.ai/blog/german-model#word-order)
def main(model='de_core_news_sm', n_iter=15):
    """Load the model, set up the pipeline and train the parser."""
    if model is None:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")
    else:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)

    # We'll use the built-in dependency parser class, but we want to create a
    # fresh instance – just in case.
    if "parser" in nlp.pipe_names:
        nlp.remove_pipe("parser")
    parser = nlp.create_pipe("parser")
    nlp.add_pipe(parser, first=True)

    # Register every dependency label that appears in the gold annotations.
    for _text, annotations in TRAIN_DATA:
        for dep in annotations.get("deps", []):
            parser.add_label(dep)

    pipe_exceptions = ["parser", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
    with nlp.disable_pipes(*other_pipes):  # only train parser
        optimizer = nlp.begin_training()
        for epoch in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            # batch up the examples using spaCy's minibatch
            for batch in minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001)):
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, losses=losses)
            print("Losses", losses)

    # test the trained model
    test_model(nlp)


if __name__ == "__main__":
    plac.call(main)
那么,我做错了什么?
提前感谢您对此提供的任何帮助!
解决方案
问题在于这个简单的示例训练脚本在初始化和训练模型时没有对训练实例进行投影化处理。解析算法本身只能处理投影化的解析树,但如果解析器组件在其输出中发现投影化的标签,会在后处理步骤中将它们反投影化还原。您不需要修改任何解析器设置(因此从德语模型开始并没有区别),只需以正确的格式提供投影化的输入即可。
初始的投影化由 train CLI 自动处理:GoldCorpus.train_docs() 会为 nlp.update() 准备训练示例,并在创建 GoldParse 时设置 make_projective=True。一般来说,我建议切换到 train CLI(这也需要切换到内部 JSON 训练格式,这无疑是一个小麻烦),因为 train CLI 设置了很多更好的默认值。
但是,只要您创建投影化的训练示例(使用 GoldParse(make_projective=True)),将所有投影化后的依赖标签添加到解析器,并用 Doc 和投影化的 GoldParse 作为输入(而不是文本/注释)进行训练,这个玩具示例也可以正常工作:
# tested with spaCy v2.2.4
# NOTE(review): `random` and `plac` were missing from the original listing but
# are used below (random.shuffle in main, @plac.annotations / plac.call) —
# without them the script fails with NameError.
import random

import plac
import spacy
from spacy.gold import GoldParse
from spacy.util import minibatch, compounding

# A single training sentence with a NON-projective gold tree:
# '12' attaches to 'AAAA' and '21' to 'BBBB', so the two arcs cross.
TRAIN_DATA = [
    (
        'ROOT AAAA BBBB 12 21',
        {
            'heads': [0, 0, 0, 1, 2],
            'deps': ['ROOT', 'LETTERS', 'LETTERS', 'NUMBERS', 'NUMBERS']
        }
    )
]
# How many duplicated copies of the (doc, gold) pair to train on.
samples = 200
def test_model(nlp):
    """Parse the toy sentence, print token dependencies, and visualize them."""
    sample_texts = ["ROOT AAAA BBBB 12 21"]
    for parsed in nlp.pipe(sample_texts):
        print(parsed.text)
        print([(tok.text, tok.dep_, tok.head.text) for tok in parsed if tok.dep_ != "-"])
        # NOTE(review): displacy.serve starts a web server and blocks here
        # until interrupted.
        spacy.displacy.serve(parsed)
@plac.annotations(
    n_iter=("Number of training iterations", "option", "n", int),
)
def main(n_iter=10):
    """Load the model, set up the pipeline and train the parser."""
    nlp = spacy.blank("xx")
    parser = nlp.create_pipe("parser")
    nlp.add_pipe(parser)

    # Projectivize every gold parse up front (make_projective=True); the
    # parser deprojectivizes its own output in a postprocessing step.
    docs_golds = []
    for raw_text, annotation in TRAIN_DATA:
        doc = nlp.make_doc(raw_text)
        gold = GoldParse(doc, **annotation, make_projective=True)
        # add the projectivized labels
        for dep in gold.labels:
            parser.add_label(dep)
        docs_golds.append((doc, gold))

    # duplicate the training instances
    docs_golds = docs_golds * samples

    pipe_exceptions = ["parser", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
    with nlp.disable_pipes(*other_pipes):  # only train parser
        optimizer = nlp.begin_training(min_action_freq=1)
        for epoch in range(n_iter):
            random.shuffle(docs_golds)
            losses = {}
            # batch up the examples using spaCy's minibatch
            for chunk in minibatch(docs_golds, size=compounding(4.0, 32.0, 1.001)):
                docs, golds = zip(*chunk)
                nlp.update(docs, golds, sgd=optimizer, losses=losses)
            print("Losses", losses)

    # test the trained model
    test_model(nlp)


if __name__ == "__main__":
    plac.call(main)
推荐阅读
- javascript - 获取 MongoDB 中具有特定属性的所有文档
- html - mix-blend-mode:hue 在 safari 中不起作用 - 我该如何解决这个问题?
- javascript - React Context API 可能存在异步/等待问题?
- flutter - Flutter/Flame 限制游戏中的拖动区域
- java - 使用字符串流的函数不适用于单个字符串?
- python - 在没有 SAS-IML 许可证的情况下执行 R 脚本的最简单的简单 SAS 代码是什么?
- vagrant - 如何更改 vagrant ssh 上的欢迎启动画面?
- ionic3 - 用户在 AWS 用户池中更改密码时未通过身份验证
- php - WhatsApp URL 组链接的正则表达式
- c# - 物理对象在连续接触时“漂移”