Ranking words across multiple text files by TFIDF

Problem Description

I have crawled a Wikipedia article, extracted several climate-change-related URLs from it, and saved each page's content to a text file named after its URL. Now I want to find the most popular words across this corpus using TF-IDF. This is my code:

import re
from math import log

import nltk
from nltk.corpus import stopwords as stop

def termFrequency(term, doc):
    """
    Input: term: Term in the Document, doc: Document
    Return: Normalized tf: Number of times term occurs
      in document / Total number of terms in the document
    """
    # Splitting the document into individual terms
    terms = doc.lower().split()

    # Number of times the term occurs in the document
    term_in_document = terms.count(term.lower())

    # Total number of terms in the document
    len_of_document = float(len(terms))

    # Normalized Term Frequency
    normalized_tf = term_in_document / len_of_document
    print("TFVAL", normalized_tf)

    return normalized_tf
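# A worked example (hypothetical numbers): for the document
# "global warming is warming the planet", termFrequency("warming", doc)
# counts 2 occurrences out of 6 tokens and returns 2/6 ≈ 0.333.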


def inverseDocumentFrequency(term, allDocs):
    """
    Input: term: Term in the Document,
           allDocs: List of all documents
    Return: Inverse Document Frequency (idf) for term
            = Logarithm ((Total Number of Documents) /
            (Number of documents containing the term))
    """
    num_docs_with_given_term = 0

    # Iterate through all the documents
    for doc in allDocs:
        # Read the current document and keep only purely
        # alphabetic tokens
        with open(doc, "r", encoding="utf-8") as f:
            words = [w for w in f.read().split()
                     if re.match(r'^[a-zA-Z]+$', w)]

        # If the term is present in the document (after stopword
        # removal), increment "num_docs_with_given_term"
        words = [w for w in words if w not in stop.words('english')]
        if term.lower() in (w.lower() for w in words):
            num_docs_with_given_term += 1
    if num_docs_with_given_term > 0: 
        # Total number of documents 
        total_num_docs = len(allDocs)  

        # Calculating the IDF  
        idf_val = log(float(total_num_docs) / num_docs_with_given_term)
        print("IDF_VALUE:",idf_val) 
        return idf_val 
    else: 
        return 0
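# A worked example (hypothetical numbers): if the corpus has 10 documents
# and 4 of them contain the term, inverseDocumentFrequency returns
# log(10 / 4) ≈ 0.916.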

def start(file, alldocs):
    document = []

    with open(file, "r", encoding="utf-8") as f:
        doc = f.read().split()
        for word in doc:
            # Keep only purely alphabetic tokens
            if re.match(r'^[a-zA-Z]+$', word):
                document.append(word)

    # Remove stopwords and rebuild the document as a single string
    document = ' '.join([i for i in document if i not in stop.words('english')])
    words = nltk.tokenize.word_tokenize(document)
    fdist = nltk.FreqDist(words)

    for term in fdist:
        tf-idf(term, file) = tf(term, file) * idf(term, alldocs)


My code raises an error at this portion:

for term in fdist:
    tf-idf(term, file) = tf(term, file) * idf(term, alldocs)

here "file" refers to as filename and alldocs contains list of all the climate change related text file present.

Tags: python, nlp, rank, tf-idf

Solution
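The failing line is not valid Python, for two reasons: a hyphen cannot appear in an identifier, so "tf-idf" parses as the subtraction "tf - idf" of two undefined names, and the left-hand side of an assignment cannot be a function call. (There are also no functions named "tf" and "idf" in your code; they are termFrequency and inverseDocumentFrequency.) Store each term's score in a dictionary keyed by the term instead. A minimal sketch of a corrected start(), assuming the two helper functions above are in scope:

def start(file, alldocs):
    # Read the file and keep only purely alphabetic tokens
    with open(file, "r", encoding="utf-8") as f:
        words = [w for w in f.read().split()
                 if re.match(r'^[a-zA-Z]+$', w)]

    # stop.words('english') is a plain list; a set makes the
    # membership tests below much faster
    stop_words = set(stop.words('english'))
    words = [w for w in words if w.lower() not in stop_words]

    document = ' '.join(words)
    fdist = nltk.FreqDist(words)

    # Valid replacement for the broken assignment: a dict keyed by term
    tfidf_scores = {}
    for term in fdist:
        tfidf_scores[term] = (termFrequency(term, document)
                              * inverseDocumentFrequency(term, alldocs))
    return tfidf_scores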


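To rank words across the whole corpus, which is the original goal, sum each term's score over every file and sort. A sketch, assuming the crawled pages were saved as .txt files in the working directory (the glob pattern is only a stand-in for however you list your files):

import glob
from collections import defaultdict

alldocs = glob.glob('*.txt')

corpus_scores = defaultdict(float)
for file in alldocs:
    for term, score in start(file, alldocs).items():
        corpus_scores[term] += score

# Print the 20 highest-scoring words
for term, score in sorted(corpus_scores.items(),
                          key=lambda kv: kv[1], reverse=True)[:20]:
    print(term, score)

Note that inverseDocumentFrequency re-reads every file for every term, so this grows quadratically with corpus size; for more than a handful of documents, precompute each document's token set once, or use a ready-made implementation such as scikit-learn's TfidfVectorizer.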