首页 > 解决方案 > KeyError: 1 尝试通过 python 进行情绪分析时

问题描述

这是错误信息 + 回溯(most recent call last):

KeyError                                  Traceback (most recent call last)
D:\python\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
   3079             try:
-> 3080                 return self._engine.get_loc(casted_key)
   3081             except KeyError as err:

pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 1

The above exception was the direct cause of the following exception:

这是我在 Github 上用于练习的代码

#Define the main sentiment analysis function
#Define the main sentiment analysis function
def sentiment_check(file):
    """Compute a sentiment score for one management-discussion text file.

    Parameters
    ----------
    file : str
        Path to a ``.txt`` file containing the text to analyse.

    Returns
    -------
    float
        ``(positive words - negative words) / total words`` based on the
        Loughran-McDonald finance dictionaries (POSITIVE.txt / NEGATIVE.txt
        must exist in the working directory).
    """
    # Read the file once through the context manager.  The original code
    # opened the file a second time inside the `with` block and leaked
    # that second handle.
    with open(file, 'r') as myfile:
        file_content = myfile.read()

    #Tokenise the management discussion using NLTK
    file_content_tokenized = nltk.word_tokenize(file_content)

    #Create a frequency distribution table of word tokens
    freq = pd.Series(nltk.FreqDist(file_content_tokenized)).sort_values(ascending=False)

    #The top 10 most common words have been identified as stop words.
    #These are words like: 'The', 'Ok', etc.
    stopwords = pd.Series(freq.iloc[0:10].index)

    #Remove stopwords, then lower-case the remaining tokens
    file_content_tokenized = pd.Series(
        [x for x in file_content_tokenized if x not in stopwords.values]
    ).str.lower()

    # Load Loughran and McDonald dictionaries.
    # These dictionaries are specially used for textual analysis of
    # financial statements; more details in the README.md.
    # .squeeze('columns') replaces read_csv(..., squeeze=True), which was
    # deprecated in pandas 1.4 and removed in 2.0.
    pos = pd.read_csv('POSITIVE.txt').squeeze('columns').str.lower()
    neg = pd.read_csv('NEGATIVE.txt').squeeze('columns').str.lower()

    positive_words = file_content_tokenized.isin(pos).sum()
    negative_words = file_content_tokenized.isin(neg).sum()

    #Total Positive & Negative words in the statement
    sentiment_score = (positive_words - negative_words) / file_content_tokenized.count()

    # os.path.splitext drops the '.txt' extension correctly.  The original
    # file.rstrip('.txt') strips *characters* from the set {'.', 't', 'x'}
    # and would mangle any name ending in those letters.
    label = os.path.splitext(file)[0]
    print("for", label, "(positive words - negative words)/total words:", sentiment_score)
    print("for", label, "negative words/total words:", (negative_words) / file_content_tokenized.count())

    # Locate negators ('no'/'not'/'never') and record their positions so we
    # can inspect the words that follow them.
    nnn_words = pd.DataFrame(file_content_tokenized.isin(['no', 'not', 'never']))
    nnn_words = nnn_words[nnn_words.iloc[:, 0]]
    nnn_words['idx'] = nnn_words.index.values
    nnn_words['words'] = file_content_tokenized[nnn_words['idx']]

    pos_after_neg = nnn_words.apply(
        pos_after_negator, axis=1, args=(pos.values, file_content_tokenized)
    ).dropna()
    print('+ve words after a negator:', pos_after_neg.values)
    print('')
    return sentiment_score

def pos_after_negator(row, pos, file_content_tokenized):
    """Check the three tokens after a negator for a positive word.

    Parameters
    ----------
    row : mapping
        A row with keys ``'words'`` (the negator token) and ``'idx'``
        (the negator's integer position in ``file_content_tokenized``).
    pos : container
        Positive words (lower-case) to look for.
    file_content_tokenized : pd.Series
        Tokens indexed by integer position.

    Returns
    -------
    str or None
        The phrase ``'<negator> <tokens...>'`` ending at the first positive
        word found within three tokens, or ``None`` if there is none.
    """
    # BUG FIX: the original accessed row[1].  After boolean filtering the
    # DataFrame's columns are [0, 'idx', 'words'], so the label-based
    # lookup row[1] raised `KeyError: 1`.  Use the 'idx' column instead.
    start = row['idx']
    string = row['words']
    # Look ahead up to three tokens; Series.get returns '' past the end.
    for offset in (1, 2, 3):
        nxt = file_content_tokenized.get(start + offset, '')
        string += ' ' + str(nxt)
        if nxt in pos:
            return string
    return None

def driver():
    """Run sentiment_check over every .txt annual-report extract.

    Returns
    -------
    tuple[pd.Series, pd.Series]
        The years (parsed from the file names) and the corresponding
        sentiment scores, in matching order.
    """
    #I have extracted Management Discussion section from last 5 10K annual reports and placed them in data folder
    # Raw string: backslashes in a Windows path must not be treated as
    # escape sequences.
    path = r"D:\history data\Dissertation\MDA copy"
    files = [s for s in os.listdir(path) if s.endswith('.txt')]
    # Accumulate in plain lists and build the Series once at the end:
    # Series.append was deprecated and removed in pandas 2.0, and
    # re-appending per iteration was quadratic anyway.
    years = []
    scores = []
    for file in files:
        years.append(int(file.split('.')[0]))
        scores.append(sentiment_check(os.path.join(path, file)))
    return (pd.Series(years), pd.Series(scores))

#Run for last 5 years
if __name__ == "__main__":
    # Guard the entry point so importing this module for reuse or testing
    # does not immediately trigger the full file-system scan and analysis.
    year, sentiment = driver()

我是 python 新手,这个错误已经困扰了我好几个小时 T_T 请帮忙!我真的不知道这段代码哪里出错了,所以我把我所有的代码都放在这里,以防我错过真正的原因。(对不起我凌乱的格式)

标签: python-3.x, nlp, sentiment-analysis

解决方案


推荐阅读