首页 > 解决方案 > 特定域的 PoS 标记器模型

问题描述

我正在尝试在 spaCy v3.1 中构建一个带有.pos_特定域属性的标记器模型。下面的代码设法编译,但是,它没有返回.pos_属性。我怎样才能提取它们?

import plac
import random
from pathlib import Path
import spacy
from spacy.training import Example

TAG_MAP = {
'N': {'pos': 'NOUN'},
'V': {'pos': 'VERB'},
'J': {'pos': 'ADJ'}
}

 TRAIN_DATA = [
('Eu gosto ovos cozidos', {'tags': ['N', 'V', 'N', 'J']}),
('Comer presunto azul', {'tags': ['V', 'N', 'J']})
]
@plac.annotations(
lang=("ISO Code of language to use", "option", "1", str),
output_dir=("Optional output directory", "option", "o", Path),
n_iter=("Number of training iterations", "option", "n", int),)

def main(lang="pt", output_dir="./output_2", n_iter=25):
    """Main function to create a new model, set up the pipeline and train 
    the tagger. In order to train the tagger with a custom tag map, 
    we're creating a new Language instance with a custom vocab.
    """
    nlp = spacy.blank(lang)
    tagger = nlp.add_pipe("tagger")

    for tag, values in TAG_MAP.items():
        tagger.add_label(tag) # tagger.add_label(tag, values) -> gives erro

    optimizer = nlp.begin_training()
    #optimizer = nlp.initialize()   
    for i in range(n_iter):
        random.shuffle(TRAIN_DATA)
        losses = {}
        for text, annotations in TRAIN_DATA:    
            example = Example.from_dict(nlp.make_doc(text), annotations)
            nlp.update([example], sgd=optimizer, losses=losses)
        print(losses)
    
    test_text = "Eu gosto ovos passados"        

    # Save model to output directory        
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)
    
        # test the save model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc = nlp2(test_text)
        print("Tags", [(t.text, t.tag_, t.pos_) for t in doc])

if __name__ == "__main__":
    plac.call(main)

最后print返回:

Tags [('Eu', 'N', ''), ('gosto', 'V', ''), ('ovos', 'N', ''), ('passados', 'J', '')]

标签: python-3.xnlpspacy-3

解决方案


推荐阅读