python - Building an IR system gives an error on the second-to-last line: name is not defined
Problem Description
I am building an IR system that searches a collection for queries and ranks the results by TF-IDF. The code below fails with the following error:

File "test.py", line 223, in <module>
    document_freq = reduce(addto,DOC_Term,document_freq)
NameError: name 'reduce' is not defined
import os,math,re,time,multiprocessing,itertools,argparse,sys
from collections import defaultdict
import operator
from pyparsing import *

# Term Frequency
# Wrapper for handling multi arguments
def wrap_getTF(a_b_c):
    return getTF(*a_b_c)

# TF calculation for the given document. Log-normalisation is applied for TF.
# Doc -> Words
def getTF(doc,max_w_length,stopwords):
    term_id=0
    tf=defaultdict(float)
    map_term_id=defaultdict(int)
    map_id_term=defaultdict(str)
    wfreq=defaultdict(int)
    with open(doc,'r') as f:
        for line in f:
            if len(line.strip()) > 0:
                list_terms=filter(lambda x: (1 < len(x) <= max_w_length) and (x not in stopwords), word_clean(re.split(r'\s+', line)))
                for w in list_terms:
                    if not w in map_term_id.keys():
                        map_term_id[w]=term_id
                        map_id_term[term_id]=w
                    wfreq[map_term_id[w]]+=1
                    term_id+=1
    for k in set(wfreq.keys()):
        # Term Frequency: 1 + log(tf)
        tf[map_id_term[k]]=1+math.log(wfreq[k],2)
    return (doc,tf)
# Word Cleaning
def word_clean(words):
    return map(lambda x: x.lower(), map(lambda x: re.sub("([^a-zA-Z]+$|^[^a-zA-Z]+)", "", x), words))

# Document Frequency
def addto(d,l):
    for (x,y) in l:
        d[y].append(x)
    return d

def get_tf_dic(pair_doc_tf):
    dic_ft=defaultdict(dict)
    for (d,tf) in pair_doc_tf:
        dic_ft[d]=tf
    return dic_ft
# Query Parsing
# Classes for Query Parsing
class Unary(object):
    def __init__(self, t):
        self.op, self.a = t[0]

class Binary(object):
    def __init__(self, t):
        self.op = t[0][1]
        self.operands = t[0][0::2]

class SearchAnd(Binary):
    def generateSetExpression(self,docFreq):
        return "(%s)" % " & ".join(oper.generateSetExpression(docFreq) for oper in self.operands)
    def __repr__(self):
        return "AND:(%s)" % (",".join(str(oper) for oper in self.operands))

class SearchOr(Binary):
    def generateSetExpression(self,docFreq):
        return "(%s)" % " | ".join(oper.generateSetExpression(docFreq) for oper in self.operands)
    def __repr__(self):
        return "OR:(%s)" % (",".join(str(oper) for oper in self.operands))

class SearchNot(Unary):
    def generateSetExpression(self,docFreq):
        return "(set(recipes) - %s)" % self.a.generateSetExpression(docFreq)
    def __repr__(self):
        return "NOT:(%s)" % str(self.a)

class SearchTerm(object):
    def __init__(self, tokens):
        self.term = tokens[0]
    def __repr__(self):
        return self.term
    def generateSetExpression(self,docFreq):
        if self.term in docFreq:
            return "set(docFreq['%s'])" % self.term
        else:
            return "set()"
def query_parsing(path_query):
    # define the grammar
    and_=CaselessLiteral("and")
    or_=CaselessLiteral("or")
    not_=CaselessLiteral("not")
    searchTerm=Word(alphas) | quotedString.setParseAction(removeQuotes)
    searchTerm.setParseAction(SearchTerm)
    searchExpr=operatorPrecedence(searchTerm,
        [
            (not_, 1, opAssoc.RIGHT, SearchNot),
            (or_, 2, opAssoc.LEFT, SearchOr),
            (Optional(and_,default="and"), 2, opAssoc.LEFT, SearchAnd),
            #(and_, 2, opAssoc.LEFT, SearchAnd),
        ])
    test_query=list()
    try:
        with open(path_query,'rb') as f:
            for line in f:
                if len(line.strip()) > 0:
                    test_query.append(line.strip())
    except:
        print ('cannot find the query file. Use the default query:'), test_query
        pass
    return (test_query,searchExpr)
# Search, TF-IDF, and Ranking
def query(tf,docFreq,pathquery):
    # parsing the given queries
    (list_queries,searchExpr)=query_parsing(pathquery)
    # searching queries
    for t in list_queries:
        #
        # Parse the given query
        #
        print ("-----------------")
        print ("Search Query:"), t
        try:
            evalStack = (searchExpr+stringEnd).parseString(t)[0]
        except ParseException as pe:
            print ("Invalid search string"), t
            continue
        # Search Documents
        evalExpr = evalStack.generateSetExpression(docFreq)
        list_terms=evalExpr.split("'")[1::2]
        print ("Search Query Logic:"), evalExpr
        print ("Search Terms:"), list_terms
        start = time.time()
        matched_docs = eval(evalExpr)
        if not matched_docs:
            print (" (none)")
        elapsed_time=time.time()-start
        print ('Search Result: Found',len(matched_docs),'documents in', ("elapsed_time:{0}".format(elapsed_time)),'[sec]')
        print ("\nSearch Result Ranking (document name, score)")
        matched_doc_freq=defaultdict(list)
        start = time.time()
        # Document Frequency
        # Calculating Doc Freq for each query term. The intersection of the logically
        # matched docs and the pre-computed ground document frequency is computed using 'set' intersection.
        for t in set(list_terms):
            matched_doc_freq[t]=list(set(docFreq[t]).intersection(matched_docs))
        # Scoring Algorithm: Accumulate TF-IDF scores for given query terms
        scores=defaultdict(float)
        for doc in matched_docs:
            scores[doc] = reduce(lambda sum,x: sum + tf[doc][x] * math.log(1.0+1.0*(len(matched_docs))/(len(matched_doc_freq[x])+1),2),set(list_terms),0)
        # Top 10 Documents
        sorted_tfidf = sorted(scores.items(), key=operator.itemgetter(1),reverse=True)
        for (doc_id,s) in sorted_tfidf[:10]:
            print (doc_id+'\t'+str(s))
        elapsed_time = time.time() - start
        print ("Searched in:{0}".format(elapsed_time)) + "[sec]"
# Main
if __name__ == "__main__":
    #(Step1) User Input
    parser = argparse.ArgumentParser(description='Example: python test.py 20_newsgroups -max 15 -q query1.txt')
    parser.add_argument('path_data_file',type=str,action='store',help='Path to data file')
    parser.add_argument('-max','--word_length',nargs='?',default=15,const=15,type=int,action='store',help='Max Word Length')
    parser.add_argument('-q','--path_queries',nargs='?',default='query1.txt',const='query1.txt',type=str,action='store',help='Path to query file')
    args = parser.parse_args(sys.argv[1:])
    data_path=args.path_data_file
    max_word=args.word_length
    path_query=args.path_queries
    #(Step2) Read Doc Path
    files=map(lambda x: zip([x[0]]*len(x[2]),x[2]), os.walk(data_path))
    files=[y for x in files for y in x]
    path_docs=map(lambda x: [os.path.join(x[0],x[1])][0] if len(x)==2 else None,files)
    #(Step3) Term Frequency by multiprocessing
    stops=[]
    pool=multiprocessing.Pool(processes=50)
    DOC_TF=pool.map(wrap_getTF,zip(path_docs,itertools.repeat(max_word),itertools.repeat(stops)))
    #(Step4) Document Frequency
    DOC_Term=map(lambda x: zip([x[0]]*len(x[1]),x[1]),DOC_TF)
    document_freq=defaultdict(list)
    document_freq = reduce(addto,DOC_Term,document_freq)
    #(Step5) Making query and computing TF-IDF per query
    query(get_tf_dic(DOC_TF),document_freq,path_query)
I am using reduce, but please check the code and point out where I went wrong. It says reduce is not defined, yet I also use it on another line earlier in the file.
Solution
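This error almost always means the script is being run with Python 3. In Python 2, reduce is a built-in, but in Python 3 it was moved into the functools module, so it has to be imported explicitly before it can be used:

from functools import reduce

The reason the earlier use of reduce did not complain first is that it sits inside the query() function. A function body is only compiled when the module loads; the name reduce is not looked up until query() is actually called. The line the traceback points to (document_freq = reduce(addto,DOC_Term,document_freq)) is module-level code in the __main__ block, so it is the first reduce call that actually executes, and therefore the first place the NameError can surface.

Below is a minimal, self-contained sketch of the document-frequency step with the import in place, assuming Python 3; the doc_term pairs are made up purely for illustration and stand in for DOC_Term:

from collections import defaultdict
from functools import reduce

def addto(d, l):
    # Append the document name to the posting list of each term it contains.
    for (x, y) in l:
        d[y].append(x)
    return d

# Hypothetical (document, term) pairs standing in for DOC_Term.
doc_term = [[('doc1', 'apple'), ('doc1', 'pear')],
            [('doc2', 'apple')]]

document_freq = reduce(addto, doc_term, defaultdict(list))
print(dict(document_freq))   # {'apple': ['doc1', 'doc2'], 'pear': ['doc1']}

If you would rather not import reduce at all, the same accumulation can be written as a plain loop over DOC_Term:

document_freq = defaultdict(list)
for pairs in DOC_Term:
    addto(document_freq, pairs)

One side note while you are converting to Python 3: calls written in the Python 2 style, such as print ("Search Query:"), t, only print the first string and silently discard t, and print ("Searched in:{0}".format(elapsed_time)) + "[sec]" will raise a TypeError when it runs, because print() returns None. Passing everything as arguments to a single print() call avoids both problems.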