首页 > 解决方案 > Python:显示字典单词的匹配键

问题描述

我想在我的项目中显示字典单词的匹配键。我的代码当前输出键,但对于您输入的任何单词,键都是相同的。例如,如果我'england played well'输入返回的键将是[737, 736, 735, 734, 733, 732, 731, 730, 729, 728]. 如果我把'Hello'相同的钥匙将被退回。请查看下面的代码,如果我做错了什么,请告诉我

import re
import os
import math
import heapq

def readfile(path, docid):
    files = sorted(os.listdir(path))
    f = open(os.path.join(path, files[docid]), 'r',encoding='latin-1')
    s = f.read()
    f.close()
    return s

DELIM = '[ \n\t0123456789;:.,/\(\)\"\'-]+'

def tokenize(text):
    return re.split(DELIM, text.lower())

N = len(sorted(os.listdir('docs')))

def indextextfiles_RR(path):
    postings={}
    docLength = {}
    term_in_document = {}
    for docID in range(N):
        s = readfile(path, docID)
        words = tokenize(s)
        length = 0
        for w in words:
            if w!='':
                length += (math.log10(words.count(w)))**2
        docLength[docID] = math.sqrt(length)
        for w in words:
            if w!='':
                doc_length = math.log10(words.count(w))/docLength[docID]
                term_in_document.setdefault(doc_length, set()).add(docID)
                postings[w] = term_in_document
    return postings


def query_RR(postings, qtext):
    words = tokenize(qtext)
    doc_scores = {}
    for docID in range(N):
        score = 0
        for w in words:
            tf = words.count(w)
            df = len(postings[w])
            idf = math.log10(N / (df+1))
            query_weights = tf * idf
        for w in words:
            if w in postings:
                score = score + query_weights
        doc_scores[docID] = score
    res = heapq.nlargest(10, doc_scores)
    return res

postings = indextextfiles_RR('docs')
print(query_RR(postings, 'hello'))

当我运行帖子时,它应该返回 hello 和与之关联的键列表。

标签: pythondictionarykey

解决方案


很可能,您的错误来自于term_in_document您对每个文件中的所有单词使用相同的字典。

多条评论

  1. len(sorted(...))它浪费资源排序不需要排序的东西(排序并不便宜),因为你只得到长度。
  2. 按编号读取文件根本没有意义,为此,您最终会多次调用文件系统来读取整个目录的文件名,因为每次读取文件时都会列出文件。
  3. 文件应该在with为我们处理关闭文件的语句中打开。
  4. 变量和函数应该使用this_notation,而类应该使用ThisNotation.
  5. 您在单词列表上迭代两次只是为了获得十进制对数。

之后的逻辑非常令人困惑,您似乎正在对每个单词出现的次数的十进制对数进行 RMS(均方根),但您没有将其除以单词数。然后你再次得到对数。您可能应该更好地定义您的问题。当我得到新信息时,我会编辑我的答案。

import re
import os
import math
import heapq

def read_file(path):
    with open(path, 'r', encoding='latin-1') as f:
        return f.read()

DELIM = '[ \n\t0123456789;:.,/\(\)\"\'-]+'

def tokenize(text):
    return re.split(DELIM, text.lower())

def index_text_files_rr(path):
    postings = {}
    doc_lengths = {}
    term_in_document = {}
    files = sorted(os.listdir(path))
    for i, file in enumerate(files):
        file_path = os.path.join(path, file)
        s = read_file(file_path)
        words = tokenize(s)
        length = 0
        # We will store pairs of the word with the decimal logarithm of
        # the word count here to use it later
        words_and_logs = []
        for word in words:
            # Discard empty words
            if word != '':
                # Compute the decimal logarithm of the word count
                log = math.log10(words.count(word))
                # Add the square of the decimal logarithm to the length
                length += log**2
                # Store the word and decimal logarithm pair
                words_and_logs.append((word, log))
        # Compute the square root of the sum of the squares
        # of the decimal logarithms of the words count
        doc_lengths[i] = math.sqrt(length)
        # Iterate over our stored pairs where we already have the
        # decimal logarithms computed so we do not have to do it again
        for word, log in words_and_logs:
            # No need to discard empty words here as we discarded them before
            # so words_and_logs will not have the empty word
            term_in_document.setdefault(log / doc_lengths[i], set()).add(i)
            postings[w] = term_in_document
    return postings


def query_rr(postings, qtext):
    words = tokenize(qtext)
    doc_scores = {}
    for i in range(N):
        score = 0
        for w in words:
            tf = words.count(w)
            df = len(postings[w])
            idf = math.log10(N / (df+1))
            query_weights = tf * idf
        for w in words:
            if w in postings:
                score = score + query_weights
        doc_scores[i] = score
    res = heapq.nlargest(10, doc_scores)
    return res

postings = index_text_files_rr('docs')
print(query_rr(postings, 'hello'))

推荐阅读