BERT tagger does not predict #-split words

Problem description

I get the result below: BERT fails to predict a tag for the word that was split into ##-prefixed sub-tokens. The X should be DRUG.

I am using the pytorch_pretrained_bert library, with the code from here:
https://github.com/Louis-udm/NER-BERT-CRF

Word in BERT layer  | Initial word   : Predicted NER-tag
-------------------------------------------------------------
holy                | holy           : O              
shit                | shit           : O              
that                | that           : O              
##one               | trazodone      : X              
actually            | actually       : O              
knocked             | knocked        : B-ADR          
me                  | me             : I-ADR          
the                 | the            : I-ADR          
fuck                | fuck           : I-ADR          
out                 | out            : I-ADR          
and                 | and            : O              
took                | took           : O              
me                  | me             : O              
for                 | for            : O              
a                   | a              : O              
ride                | ride           : O   
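
The ##one row is a side effect of BERT's WordPiece tokenization: words that are not in the vocabulary, such as trazodone, are split into sub-tokens, and every piece after the first carries a ## prefix. A minimal sketch of what the tokenizer does, assuming the bert-base-uncased vocabulary (the model actually used, and therefore the exact sub-pieces, may differ):

from pytorch_pretrained_bert import BertTokenizer

# Assumption: bert-base-uncased vocabulary; the question's model may differ.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

for word in ["holy", "trazodone", "knocked"]:
    # Common words stay whole; rare words are split into "##"-prefixed pieces.
    print(word, "->", tokenizer.tokenize(word))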

Tags: deep-learning, bert-language-model

Solution


Here is the code. The conversion below keeps exactly one WordPiece per original word (the last sub-token, recorded in orig_to_tok_map), so every word lines up with exactly one label and one prediction:

import numpy as np
import torch
from tqdm import tqdm_notebook

# InputExample, tag2int, int2tag, tokenizer, model, max_seq_length and sentence_ini
# are assumed to be defined earlier in the notebook.

class PaddingInputExample(object):
    """Fake example so that the number of input examples is a multiple of the batch size.

    When running eval/predict on the TPU, we need to pad the number of examples
    to be a multiple of the batch size, because the TPU requires a fixed batch
    size. The alternative is to drop the last batch, which is bad because it means
    the entire output data won't be generated.
    We use this class instead of `None` because treating `None` as padding
    batches could cause silent errors."""
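
# PaddingInputExample is only needed when the example list is padded to a multiple
# of the batch size (it is not used for that anywhere else in this snippet); a
# rough sketch, with a hypothetical batch_size, would be:
#
#     batch_size = 32
#     while len(examples) % batch_size != 0:
#         examples.append(PaddingInputExample())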

def convert_text_to_examples(texts, labels):
    """Create InputExamples"""
    InputExamples = []
    for text, label in zip(texts, labels):
        InputExamples.append(
            InputExample(guid=None, words=text, labels=label)
        )
    return InputExamples


def convert_examples_to_features(tokenizer, examples, max_seq_length=66):
    """Convert a set of `InputExample`s to a list of `InputFeatures`."""

    input_ids, input_masks, segment_ids, labels = [], [], [], []
    for example in tqdm_notebook(examples, desc="Converting examples to features"):
        input_id, input_mask, segment_id, label = convert_single_example(
            tokenizer, example, max_seq_length
        )
        input_ids.append(input_id)
        input_masks.append(input_mask)
        segment_ids.append(segment_id)
        labels.append(label)
    return (
        np.array(input_ids),
        np.array(input_masks),
        np.array(segment_ids),
        np.array(labels),
    )

def convert_single_example(tokenizer, example, max_seq_length=256):
    """Converts a single `InputExample` into a single `InputFeatures`."""

    if isinstance(example, PaddingInputExample):
        input_ids = [0] * max_seq_length
        input_mask = [0] * max_seq_length
        segment_ids = [0] * max_seq_length
        label_ids = [0] * max_seq_length
        return input_ids, input_mask, segment_ids, label_ids
    tokens_a = example.words
    if len(tokens_a) > max_seq_length - 2:
        tokens_a = tokens_a[0 : (max_seq_length - 2)]

    # orig_to_tok_map is an int -> int mapping from each original word's index to
    # the index of that word's *last* WordPiece in `tokens` (the [CLS] and [SEP]
    # positions are included as well), e.g. for
    # orig_tokens     == ["john", "johanson", "'s", "house"]
    # tokens          == ["[CLS]", "john", "johan", "##son", "'", "s", "house", "[SEP]"]
    # orig_to_tok_map == [0, 1, 3, 5, 6, 7]
    orig_to_tok_map = []
    tokens = []
    segment_ids = []
    
    tokens.append("[CLS]")
    segment_ids.append(0)
    orig_to_tok_map.append(len(tokens)-1)
    #print(len(tokens_a))
    for token in tokens_a:       
        tokens.extend(tokenizer.tokenize(token))
        orig_to_tok_map.append(len(tokens)-1)
        segment_ids.append(0)
    tokens.append("[SEP]")
    segment_ids.append(0)
    orig_to_tok_map.append(len(tokens)-1)
    # Keep exactly one WordPiece per original word (the last one recorded in
    # orig_to_tok_map), plus the [CLS] and [SEP] positions.
    input_ids = tokenizer.convert_tokens_to_ids([tokens[i] for i in orig_to_tok_map])
    #print(len(orig_to_tok_map), len(tokens), len(input_ids), len(segment_ids)) #for debugging

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [1] * len(input_ids)
    # One label id per original word, with 0 at the [CLS] and [SEP] positions.
    label_ids = []
    labels = example.labels
    label_ids.append(0)
    label_ids.extend([tag2int[label] for label in labels])
    label_ids.append(0)
    #print(len(label_ids)) #for debugging
    # Zero-pad up to the sequence length.
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)
        label_ids.append(0)

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length
    assert len(label_ids) == max_seq_length

    return input_ids, input_mask, segment_ids, label_ids



# sentence_ini is the input sentence as a list of words; the 'O' labels are
# placeholders, since only the predictions are needed here.
test_example = convert_text_to_examples([sentence_ini], [['O'] * len(sentence_ini)])
(input_ids, input_masks, segment_ids, _) = convert_examples_to_features(tokenizer, test_example, max_seq_length)

# Take the single example and add a batch dimension.
input_ids = torch.tensor([input_ids[0]])
input_masks = torch.tensor([input_masks[0]])
segment_ids = torch.tensor([segment_ids[0]])

model.eval()
with torch.no_grad():
    # predictions[0] holds per-position scores over the tag set;
    # the argmax at each position is the predicted tag id.
    predictions = model(input_ids, segment_ids, input_masks)
    _, predicted = torch.max(predictions[0], -1)

print("\n{:20}| {:15}: {:15}".format("Word in BERT layer", 'Initial word', "Predicted NER-tag"))
print(61*'-')
k = 0
for i, pred in enumerate(predicted):
#     print(pred)
    try:
        if pred.item()!=1:
            print("{:20}| {:15}: {:15}".format([tokens[i] for i in orig_to_tok_map][i], sentence_ini[i-1], int2tag[pred.item()]))            
            k+=1            
    except:
        pass
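
For reference, here is a minimal, self-contained sketch of the alignment that convert_single_example builds. The WordPiece split of trazodone below is only illustrative (the real split depends on the vocabulary; the output above only shows that its last piece is ##one), but it makes clear why each original word contributes exactly one position, and therefore one predicted tag, to the model input:

# Toy illustration of orig_to_tok_map as built in convert_single_example.
# The sub-pieces of "trazodone" are hypothetical; only the mechanism matters.
words = ["that", "trazodone", "actually"]
pieces_per_word = [["that"], ["tr", "##az", "##od", "##one"], ["actually"]]

tokens, orig_to_tok_map = ["[CLS]"], [0]
for pieces in pieces_per_word:
    tokens.extend(pieces)
    orig_to_tok_map.append(len(tokens) - 1)   # index of the word's last piece
tokens.append("[SEP]")
orig_to_tok_map.append(len(tokens) - 1)

print(tokens)                                # every WordPiece plus [CLS]/[SEP]
print(orig_to_tok_map)                       # [0, 1, 5, 6, 7]
print([tokens[i] for i in orig_to_tok_map])  # ['[CLS]', 'that', '##one', 'actually', '[SEP]']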

