python - PhraseMatcher 在尝试标记文档中的多个项目时返回空列表
问题描述
PhraseMatcher 在尝试标记文档中的多个项目时返回空列表。我从 csv 中的列传递值以标记文档内的产品名称,但 PhraseMatcher 仅标记具有一个字符串的行。当有多个标签元素时,它会返回一个空列表。这个有什么解决办法
import spacy
import re
import csv
from spacy.matcher import PhraseMatcher
#Function to convert PhraseMatcher return value to string indexes
def str_index_conversion(lbl, doc, matchitem):
o_one = len(str(doc[0:matchitem[1]]))
subdoc = doc[matchitem[1]:matchitem[2]]
o_two = o_one + len(str(subdoc))
return (o_one, o_two, lbl)
#nlp = spacy.blank('en')
nlp = spacy.load('en')
if 'ner' not in nlp.pipe_names:
ner = nlp.create_pipe('ner')
nlp.add_pipe(ner)
else:
ner = nlp.get_pipe('ner')
ner.add_label('PRODUCT')
DIR = 'C:\\Users\\Lenovo\\.spyder-py3\\smoke\\'
matcher = PhraseMatcher(nlp.vocab)
list_str_index = []
to_train_ents = []
with open('qq1.csv', newline='') as myFile:
reader = csv.reader(myFile)
for row in reader:
try:
product = row[0].lower()
filename = row[1]
#print(product)
file = open(DIR+filename, "r", encoding ='utf-8')
filecontents = file.read()
#print(product)
for s in filecontents:
filecontents = re.sub(r'\[[0-9]*\]', ' ', filecontents)
filecontents = re.sub(r'\s+', ' ', filecontents)
#filecontents = filecontents.encode().decode('unicode-escape')
filecontents = ''.join([line.lower() for line in filecontents])
print(product.split())
matcher.add('PRODUCT', None, nlp(product))
doc = nlp(filecontents)
matches = matcher(doc)
#print(matches)
list_str_index = [str_index_conversion('PRODUCT', doc, x) for x in matches]
to_train_ents.append((filecontents, dict(entities=list_str_index)))
break
except Exception as e:
#print(e)
pass`
当文档中有多个要标记的元素时,“to_train_ents”列表将返回一个空列表。要标记的元素以 csv 的形式提供,如下所示:`SAMPLE CSV:
PRODUCT FILES
ABC XXXX
ABC2, ABC3, BCA3 XXXX
BC2 XXXX
因此,在第二行的情况下,它在 to_train_ents 列表中返回一个空列表。在文档中标记此类案例的解决方案是什么
删除尝试和除块后:这次我只在我的 csv 中保留了产品名称和包含标记文件的目录中的文件名。但是,当我发送一个产品名称进行标记时,它在文档中有两个要标记的名称,它返回一个空列表。现在示例代码:
import spacy
import re
import csv
from spacy.matcher import PhraseMatcher
#Function to convert PhraseMatcher return value to string indexes
def str_index_conversion(lbl, doc, matchitem):
o_one = len(str(doc[0:matchitem[1]]))
subdoc = doc[matchitem[1]:matchitem[2]]
o_two = o_one + len(str(subdoc))
return (o_one, o_two, lbl)
#nlp = spacy.blank('en')
nlp = spacy.load('en')
if 'ner' not in nlp.pipe_names:
ner = nlp.create_pipe('ner')
nlp.add_pipe(ner)
else:
ner = nlp.get_pipe('ner')
ner.add_label('PRODUCT')
DIR = 'C:\\Users\\Lenovo\\.spyder-py3\\sanity\\'
matcher = PhraseMatcher(nlp.vocab)
list_str_index = []
to_train_ents = []
with open('qq2.csv', newline='') as myFile:
reader = csv.reader(myFile)
for row in reader:
product = row[0].lower()
filename = row[1]
#print(product)
file = open(DIR+filename, "r", encoding ='utf-8')
filecontents = file.read()
#print(product)
for s in filecontents:
filecontents = re.sub(r'\[[0-9]*\]', ' ', filecontents)
filecontents = re.sub(r'\s+', ' ', filecontents)
#filecontents = filecontents.encode().decode('unicode-escape')
filecontents = ''.join([line.lower() for line in filecontents])
#print(product.split())
#product_patterns = [nlp(text) for text in product.split()]
matcher.add('PRODUCT', None, nlp(product))
doc = nlp(filecontents)
matches = matcher(doc)
#print(matches)
list_str_index = [str_index_conversion('PRODUCT', doc, x) for x in matches]
to_train_ents.append((filecontents, dict(entities=list_str_index)))
break
解决方案
推荐阅读
- javascript - 监听函数执行
- c# - Hangfire:有没有办法将超链接注入到 DisplayName 属性中?
- docker - 有没有办法限制 Docker 中用户的容器访问?
- reactjs - ReduxForm - 不为 submit() 函数调用 onSubmit 方法
- c# - 从列表中创建一个嵌套的 Treeview 结构
- sql - 如何使用 TO_DATE 函数处理空值?
- java - Java 到 Kotlin_Gson.Json 转换错误
- java - 基于 Java 的 SIP 端点
- php - 链接到 zip 文件会在 Google Search Console 中创建“软 404”
- c# - 没有参考更新,Web 服务无法工作