Trying to plot the 25 most frequently occurring words in a list and getting extra apostrophes and commas

Problem description

I am testing some NLP code. I hacked this together.

import pandas as pd
import numpy as np
import re
import nltk 
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
global str


df = pd.read_csv('C:\\path_to_data\\Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products.csv')


pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
print(df)


list(df)

df.info()
df.shape


# find percentage of missing data points
# if some arbitrary share of the data points is missing, say over 50%, we won't use this feature
df_missing = df.isna()
df_num_missing = df_missing.sum()
print(df_num_missing / len(df))
print(df.isna().mean().round(4) * 100)

# these fields have over 50% missing values:
# reviews.dateAdded
# reviews.id 


len(df)



# let's experiment with some sentiment analysis concepts
# first we need to clean up the stuff in the independent field of the DF we are working with
df.replace('\'','', regex=True, inplace=True) 
df['review_title'] = df[['reviews.title']].astype(str)
df['review_text'] = df[['reviews.text']].astype(str)
df['review_title'] = df['reviews.title'].str.replace('\d+', '')
df['review_text'] = df['reviews.text'].str.replace('\d+', '')


# get rid of special characters
df['review_title'] = df['reviews.title'].str.replace(r'[^\w\s]+', '')
df['review_text'] = df['reviews.text'].str.replace(r'[^\w\s]+', '')

# get rid of double spaces
df['review_title'] = df['reviews.title'].str.replace(r'\^[a-zA-Z]\s+', '')
df['review_text'] = df['reviews.text'].str.replace(r'\^[a-zA-Z]\s+', '')

# convert all case to lower
df['review_title'] = df['reviews.title'].str.lower()
df['review_text'] = df['reviews.text'].str.lower()


# let's do some exploratory data analysis
# we can plot counts of items in one column to get a visual clue about what's going on
categories = df.groupby("primaryCategories")
plt.figure(figsize=(15,10))
categories.size().sort_values(ascending=False).plot.bar()
plt.xticks(rotation=50)
plt.xlabel("Categories")
plt.ylabel("Number of Categories")
plt.show()


# let's check out the results of a word cloud to view frequencies of word occurrences
from wordcloud import WordCloud
import matplotlib.pyplot as plt
wordcloud = WordCloud().generate(' '.join(df['primaryCategories']))
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.margins(x=0, y=0)
plt.show()


# perhaps a pie chart would look more appealing and be more useful too
# get top 10 categories; plotting all of them would give too many slices under 1% of the total and the chart would look wacky
s = df['primaryCategories'].value_counts().nlargest(10)
s.plot(kind='pie', autopct='%1.0f%%')


# get top 10 reviews
s = df['reviews.title'].value_counts().nlargest(10)
s.plot(kind='pie', autopct='%1.0f%%')


# 25 most common words in body
reviews_list = df['review_title'].tolist()
from collections import Counter 
Counter = Counter(reviews_list) 
most_occur = Counter.most_common(25) 
print(most_occur) 


# filter out stop words
# these are the most common words such as: "the", "a", and "is".
import nltk
from nltk.corpus import stopwords
from nltk import FreqDist
nltk.download('stopwords') # run this one time
stop_words = stopwords.words('english')
import re
import spacy
import gensim
from gensim import corpora

english_stopwords = stopwords.words('english')
print(len(english_stopwords))
text=str(reviews_list)

# split into words
from nltk.tokenize import word_tokenize
tokens = word_tokenize(text)
# convert to lower case
tokens = [w.lower() for w in tokens]
# remove punctuation from each word
import string
table = str.maketrans('', '', string.punctuation)
stripped = [w.translate(table) for w in tokens]
# remove remaining tokens that are not alphabetic
words = [word for word in stripped if word.isalpha()]
# filter out stop words
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
words = [w for w in words if not w in stop_words]
print(words[:100])
   

# plot the 25 most common words, with stop words stripped out
from nltk.tokenize import sent_tokenize
text=str(words)
tokenized_text = sent_tokenize(text)
#print(tokenized_text)
from nltk.tokenize import word_tokenize
tokenized_word = word_tokenize(text)
#print(tokenized_word)
from nltk.probability import FreqDist
fdist = FreqDist(tokenized_word)
#print(fdist)
# Frequency Distribution Plot: most commonly occurring words
import matplotlib.pyplot as plt
#fdist.plot(25,cumulative=False)
fdist = FreqDist(word for word in tokenized_word if word.isalpha())
plt.show()

Here is a small sample of the tokenized text variable:

tokenized_text
Out[109]: ["['small', 'great', 'light', 'reader', 'easy', 'use', 'beach', 'great', 'price', 'great', 'buy', 'solid', 'entrylevel', 'kindle', 'great', 'kids', 'good', 'ebook', 'light', 'weight', 'makes', 'world', 'difference', 'taking', 'books', 'go', 'good', 'quality', 'best', 'ebook', 'great', 'product', 'good', 'price', 'excellent', 'reader', 'feels', 'like', 'real',
etc.
etc.
etc.

My chart looks like this:

[frequency distribution plot of the tokenized words]

It seems I am getting an apostrophe and a comma as the top two characters in the chart, and for some reason every string is prefixed with an apostrophe.

The fdist variable looks like this:

FreqDist({"'": 12277, ',': 12264, "'great": 1747, "'tablet": 900, "'love": 427, "'good": 407, "'product": 351, "'kids": 336, "'kindle": 238, "'echo": 197, ...})

Something must be wrong with the code I posted above, because the raw data does not look like this, and I got rid of all the special characters before the step where I try to plot the results. Maybe there is a simpler way to do this. Any idea what I am doing wrong here? Thanks.

Tags: python, python-3.x, matplotlib

Solution


I think the input to text=str(words) is a list. str(words) turns that list into a single string that looks like "['a', 'b', 'c', ',']", brackets, quotes and commas included, and that is what causes your error.
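A quick sketch with made-up tokens to show the difference between the two ways of turning the list into text:

words = ['great', 'tablet', 'love']
print(str(words))       # ['great', 'tablet', 'love']  <- brackets, quotes and commas end up as tokens
print(' '.join(words))  # great tablet love            <- plain whitespace-separated text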

The simple fix is to replace str(words) with " ".join(words).
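With that change, the last block of your code would look roughly like this (a minimal sketch that assumes words is the cleaned, stop-word-filtered token list built earlier in your script):

from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
import matplotlib.pyplot as plt

text = ' '.join(words)                 # join the tokens with spaces instead of str(words)
tokenized_word = word_tokenize(text)   # re-tokenize the joined text
fdist = FreqDist(word for word in tokenized_word if word.isalpha())
fdist.plot(25, cumulative=False)       # 25 most common words, no stray quotes or commas
plt.show()

Since words is already a list of individual tokens, you could also skip the join-and-retokenize round trip entirely and pass it straight to FreqDist, e.g. FreqDist(words).most_common(25) or FreqDist(words).plot(25).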

