python-3.x - KeyError: 1 尝试通过 python 进行情绪分析时
问题描述
这是错误信息+ 最近的通话
KeyError Traceback (most recent call last)
D:\python\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
3079 try:
-> 3080 return self._engine.get_loc(casted_key)
3081 except KeyError as err:
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 1
The above exception was the direct cause of the following exception:
这是我在 Github 上用于练习的代码
#Define the main sentiment analysis function
def sentiment_check(file):
with open(file, 'r') as myfile:
file_content = open(file, 'r').read()
#Tokenise the management discussion using NLTK
file_content_tokenized= nltk.word_tokenize(file_content)
#Create a frequence distribution table of word tokens
freq=pd.Series(nltk.FreqDist(file_content_tokenized)).sort_values(ascending=False)
#print('Most popular 10 stop words',freq.iloc[0:10])
#print('fraction of total word count that are stop words:',freq.iloc[0:10].sum()/freq.sum())
#The top 10 most common words have been identified as stop words.
#These are words like: 'The', 'Ok', etc.
stopwords=pd.Series(freq.iloc[0:10].index)
#Remove stopwords
file_content_tokenized=pd.Series([x for x in file_content_tokenized if x not in stopwords.values]).str.lower()
# Load Loughran and McDonald dictionaries
#these dictionaries are specially used for textual analysis of financial statements
#More details on this in the README.md
pos = pd.read_csv('POSITIVE.txt', squeeze=True).str.lower()
neg = pd.read_csv('NEGATIVE.txt', squeeze=True).str.lower()
positive_words= file_content_tokenized.isin(pos).sum()
negative_words= file_content_tokenized.isin(neg).sum()
#Total Positive & Negative words in the statement
#("Positive Words:",positive_words)
#print("Negative Words:",negative_words)
sentiment_score = (positive_words-negative_words)/file_content_tokenized.count()
print("for",file.rstrip('.txt'),"(positive words - negative words)/total words:",sentiment_score)
print("for",file.rstrip('.txt'),"negative words/total words:",(negative_words)/file_content_tokenized.count())
#print((positive_words-negative_words)/file_content_tokenized.count())
nnn_words= pd.DataFrame(file_content_tokenized.isin(['no', 'not', 'never']))
nnn_words=nnn_words[nnn_words.iloc[:,0]]
nnn_words['idx']=nnn_words.index.values
nnn_words['words']=file_content_tokenized[nnn_words['idx']]
pos_after_neg=nnn_words.apply(pos_after_negator,axis=1,args=(pos.values,file_content_tokenized)).dropna()
print('+ve words after a negator:',pos_after_neg.values)
print('')
return sentiment_score;
def pos_after_negator(row,pos,file_content_tokenized):
#pos = pd.read_csv('LM_pos_words.txt', squeeze=True).str.lower()
#print(row)
string = row['words']
#print(file_content_tokenized.get(row[1]+1,''))
string+=' '+ str(file_content_tokenized.get(row[1]+1,''))
if file_content_tokenized.get(row[1]+1,'') in pos:
return string
string+=' '+ str(file_content_tokenized.get(row[1]+2,''))
if file_content_tokenized.get(row[1]+2,'') in pos:
return string
string+=' '+ str(file_content_tokenized.get(row[1]+3,''))
if file_content_tokenized.get(row[1]+3,'') in pos:
return string
# print(string)
return None
def driver():
#I have extracted Management Discussion section from last 5 10K annual reports and placed them in data folder
path = "D:\history data\Dissertation\MDA copy"
files = [s for s in os.listdir(path) if s.endswith('.txt')]
year = pd.Series([],dtype=pd.StringDtype())
sentiment = pd.Series([],dtype=pd.StringDtype())
for file in files:
year = year.append(pd.Series([int(file.split('.')[0])]))
sentiment = sentiment.append(pd.Series([sentiment_check(path+'\\'+file)]))
return (year, sentiment)
#Run for last 5 years
year, sentiment = driver()
我是 python 新手,这个错误已经困扰了我好几个小时 T_T 请帮忙!我真的不知道这段代码哪里出错了,所以我把我所有的代码都放在这里,以防我错过真正的原因。(对不起我凌乱的格式)
解决方案
推荐阅读
- python - python正则表达式匹配有效的c表达式
- php - 通知php
- python - 熊猫将标量值添加到数字列?
- asp.net - 根据 asp.net core 中的权限隐藏菜单
- c# - C# 为什么 string.format 将浮点值舍入到最接近的第 10 位?
- java - 为什么键盘给出的循环迭代范围本身算作一次迭代?
- selenium - Selenium java - 我们可以在服务器上同时运行 selenium 多个实例吗
- jmeter - JMeter 加速与持续时间。
- c# - 有没有办法获得通用类型信息
- maven - Selenide 中的“无法访问 com.google.common.base.Predicate”