python - 训练现有 Spacy Ner 管道会忘记之前的示例
问题描述
我正在为命名实体识别创建一个新模型。我有一个训练数据集,如下所示:
[[
"world leading global energy trading company has an exciting
opportunity for a senior csharp application platform developer to
join its systematic trading division developing new trading
applications tools and solutions for its successful front office
trading team",
{"entities": [[80, 86, "RNK"]]}
]]
所以我运行以下函数来训练模型。
@plac.annotations(
    model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
    new_model_name=("New model name for model meta.", "option", "nm", str),
    output_dir=("Optional output directory", "option", "o", Path),
    n_iter=("Number of training iterations", "option", "n", int),
    entity=("Name of the entity to be trained", "option", "e", str),
    label=("The label to be given to the trained entity", "option", "l", str),
)
def main(model=None, new_model_name=("ner%s" % str(datetime.now())),
         output_dir=None, n_iter=20, entity=None, label=None):
    """Train a spaCy NER pipe on ``TRAIN_DATA`` and optionally save the model.

    Args:
        model: Name or path of an existing spaCy model to continue training.
            When ``None`` a blank ``"en"`` pipeline is created instead.
        new_model_name: Value written to ``nlp.meta["name"]`` before saving.
            The default is built once, when the function is defined.
        output_dir: Directory the trained pipeline is written to. Nothing is
            saved when ``None``.
        n_iter: Number of passes over ``TRAIN_DATA``.
        entity: Name of the entity being trained; only used for logging here.
        label: Label for the entity; only used for logging here. The labels
            actually added to the NER pipe are read from ``TRAIN_DATA``.

    Returns:
        None. Returns early, without training, when ``entity`` or ``label``
        is missing.
    """
    if entity is None or label is None:
        log.info("Entity and Label must both be supplied")
        log.info("Bailing out as nothing to do ...... :-(")
        return

    log.info(
        "Fetching training data for entity [%s] to be trained with label [%s]",
        entity, label,
    )
    log.info("Training data retrieved and the first row is : ")
    log.info(TRAIN_DATA[0])
    log.info("There are %d rows to be trained", len(TRAIN_DATA))

    if model is not None:
        # Load the model that was asked for; loading ``output_dir`` here
        # would fail (or load something else) whenever the two differ.
        nlp = spacy.load(model)
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    if "ner" not in nlp.pipe_names:
        log.info("ner not in pipe names, adding it in now ....")
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner)
    else:
        # otherwise, get it, so we can add labels to it
        log.info("retrieving previous ner pipe now ....")
        ner = nlp.get_pipe("ner")

    # Register every label that occurs in the training annotations.
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get('entities'):
            ner.add_label(ent[2])
    # Snapshot taken after all labels are added; compared again after reload.
    move_names = list(ner.move_names)

    # get names of other pipes to disable them during training
    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [
        pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions
    ]
    with nlp.disable_pipes(*other_pipes):  # only train NER
        # reset and initialize the weights randomly - but only if we're
        # training a new model
        if model is None:
            optimizer = nlp.begin_training()
        else:
            # NOTE(review): resuming keeps the existing weights, but updating
            # only on examples of a new label can still overwrite what was
            # learned for earlier labels. TRAIN_DATA presumably needs to
            # contain examples of the old labels too - confirm against the
            # spaCy training guide.
            optimizer = nlp.resume_training()
        for _ in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(
                TRAIN_DATA, size=compounding(4.0, 32.0, 1.001)
            )
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    sgd=optimizer,
                    losses=losses,
                )
            print("Losses", losses)

    # test the trained model
    for text, _ in TRAIN_DATA:
        doc = nlp(text)
        print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
        print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])

    if output_dir is not None:
        output_dir = Path(output_dir)
        # Also creates missing parent directories; no-op if it exists.
        output_dir.mkdir(parents=True, exist_ok=True)
        nlp.meta["name"] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        # Check the classes have loaded back consistently
        assert nlp2.get_pipe("ner").move_names == move_names
if __name__ == "__main__":
    # Script entry point. Build a console handler that passes DEBUG and above,
    # attach it to the module logger (main() reads it as the global ``log``),
    # then let plac parse the command line and invoke main().
    console_handler = logging.StreamHandler()
    console_handler.setLevel(logging.DEBUG)

    log = logging.getLogger(__name__)
    log.setLevel(logging.DEBUG)
    log.addHandler(console_handler)

    plac.call(main)
这很好用,然后我可以使用以下函数成功测试模型
@plac.annotations(
    model_dir=("Optional output directory", "option", "o", Path),
    test_text=("The test text to be used to test the model", "option",
               "t", str),
    entity=("Name of the entity to be trained", "option", "e", str),
    label=("The label to be given to the trained entity", "option", "l", str),
)
def main(model_dir=None, test_text=None, entity=None, label=None):
    """Load a saved spaCy model and print the entities it finds in a text.

    Args:
        model_dir: Directory of the saved model passed to ``spacy.load``.
        test_text: Text to run through the pipeline. When ``None`` a default
            sentence mentioning ``entity`` is used.
        entity: Name of the entity under test; required, and used in the
            default test sentence.
        label: Label of the entity under test; required, but not otherwise
            used by this function.

    Returns:
        None. Returns early when ``entity``, ``label`` or ``model_dir`` is
        missing.
    """
    if entity is None or label is None:
        log.info("Entity and Label must both be supplied")
        log.info("Bailing out as nothing to do ...... :-(")
        return
    if model_dir is None:
        # spacy.load(None) would fail with an unhelpful error; bail out
        # the same way as for the other required options.
        log.info("Model directory must be supplied")
        return
    if test_text is None:
        test_text = (
            "Using default test string which is not optimal to look for %s"
            % entity
        )

    nlp = spacy.load(model_dir)
    log.info("Loaded model %s", nlp.meta["name"])
    log.info("Testing the string %s", test_text)

    ner = nlp.get_pipe("ner")
    # Distinct loop name so the ``label`` parameter is not overwritten.
    for ner_label in ner.labels:
        log.info("NER Label : %s found in model", ner_label)

    doc = nlp(test_text)
    print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
    print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])
if __name__ == "__main__":
    # Script entry point. Configure a DEBUG-level console handler on the
    # module logger (main() reads it as the global ``log``), then hand the
    # command line to plac, which calls main() with the parsed options.
    console_handler = logging.StreamHandler()
    console_handler.setLevel(logging.DEBUG)

    log = logging.getLogger(__name__)
    log.setLevel(logging.DEBUG)
    log.addHandler(console_handler)

    plac.call(main)
但是,如果我再次运行代码来训练一个新标签(例如 "SKILL")并提供新的训练数据,它会成功加载旧模型、检索 NER 管道并将优化器设置为恢复训练(resume_training),但是当我再次测试时,模型已经忘记了之前针对 RNK 标签所做的全部训练。
我原以为 resume_training(恢复训练)会以某种方式获取先前的模型状态,并保留先前学到的标注。它确实保留了 NER 标签。
为什么会这样?
我认为这可能与灾难性遗忘(catastrophic forgetting)问题有关,因此创建了一大组同时包含两个类别示例的训练数据,如下所示:
'TOKEN' >>> 'NER Annotation'
-----------------------------
'senior' >>> 'RNK'
'csharp' >>> 'SKILL'
'sql' >>> 'SKILL'
这些是损失
Losses {'ner': 721.8737016180717}
Losses {'ner': 5.999976082008388}
Losses {'ner': 5.970323057037423}
Losses {'ner': 5.996330579093365}
Losses {'ner': 6.028536462566022}
Losses {'ner': 12.043830573641666}
Losses {'ner': 10.001897952651317}
Losses {'ner': 6.016950026187274}
Losses {'ner': 6.624311646328313}
Losses {'ner': 10.602919933949224}
Losses {'ner': 6.1062697231067995}
Losses {'ner': 8.792055106010444}
Losses {'ner': 13.302123281119345}
Losses {'ner': 6.068028368915684}
Losses {'ner': 8.026694430880903}
Losses {'ner': 8.961434860193798}
Losses {'ner': 6.02721516249698}
Losses {'ner': 9.714660156853073}
Losses {'ner': 4.108544494319015}
Losses {'ner': 6.023105974059858}
Losses {'ner': 7.357760648981275}
Losses {'ner': 6.295292869532734}
Losses {'ner': 3.8088561052881995}
Losses {'ner': 6.059279332644757}
Losses {'ner': 7.024559462190113}
Losses {'ner': 4.784358718788942}
Losses {'ner': 5.935101364429172}
Losses {'ner': 4.027772727507415}
Losses {'ner': 2.1748163004265884}
Losses {'ner': 5.993975825343896}
我在 pastebin 中包含了我的训练数据,可以在这里找到:
解决方案
推荐阅读
- qt - 如何在 Qt Designer 中以图形方式拖动布局中的多个布局?
- javascript - Window.onbeforeunload 页面刷新后未设置
- paypal - 在沙盒模式下调用 PayPal API 时出错
- python - 关闭消息框后,Tkinter 程序自动关闭且无错误
- kotlin - 计算字符串需要的像素数
- jms - Spring JMS DefaultMessageListenerContainer 轮询频率
- android - Android:当您取消应用程序而不是关闭它时,SharedPreferences 添加了额外的 4 个空格
- node.js - 如何围绕远程 github 页面 JSON 对象构建 server.js 快递服务器
- laravel - 如果验证器在表单请求中失败,则添加数据
- python - 使用“scipy.optimize.brute”最小化两个变量函数的问题