首页 > 解决方案 > 调用 LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, corpus, id2word) 时报错:具有多个元素的数组的真值不明确。请使用 a.any() 或 a.all()

问题描述

import pandas as pd 
# Load the scraped reviews and drop the columns this script does not use.
papers = pd.read_csv("scraped_google_reviews.csv").drop(
    columns=['ratings', 'Unnamed: 4', 'Unnamed: 5'],
)

加载正则表达式库

import re

删除标点符号

# Strip commas, periods, exclamation marks and question marks from each review.
# The pattern is a raw string so that `\.` reaches the regex engine verbatim
# instead of being an invalid string-literal escape sequence.
papers['paper_text_processed'] = \
papers['review'].map(lambda x: re.sub(r'[,\.!?]', '', x))

将评论文本转换为小写

# Lower-case every processed review in place.
processed_reviews = papers['paper_text_processed']
papers['paper_text_processed'] = processed_reviews.map(lambda text: text.lower())
del processed_reviews

导入 wordcloud 库

from wordcloud import WordCloud

# Concatenate all processed reviews into one comma-separated string.
long_string = ','.join(papers['paper_text_processed'].values)

# Configure the word cloud, then feed it the combined text.
wordcloud = WordCloud(
    background_color="white",
    max_words=5000,
    contour_width=3,
    contour_color='steelblue',
)
wordcloud.generate(long_string)

import gensim
from gensim.utils import simple_preprocess
import nltk
# Download the NLTK stop-word corpus so that `stopwords.words` can read it.
nltk.download('stopwords')
from nltk.corpus import stopwords
# English stop words plus a few extra tokens that should also be filtered out.
stop_words = stopwords.words('english') + ['from', 'subject', 're', 'edu', 'use']
def sent_to_words(sentences):
    """Yield the tokens of each sentence, one list per sentence.

    Args:
        sentences: Iterable of sentences; each item is converted with
            ``str()`` before tokenization.

    Yields:
        The list returned by ``gensim.utils.simple_preprocess`` for the
        sentence.
    """
    for sentence in sentences:
        # deacc=True removes accent marks from the tokens; punctuation is
        # already discarded by the tokenizer itself.
        yield gensim.utils.simple_preprocess(str(sentence), deacc=True)
def remove_stopwords(texts):
    """Return every document in ``texts`` with the stop words filtered out.

    Each document is converted with ``str()`` and tokenized again with
    ``simple_preprocess`` before filtering.

    Args:
        texts: Iterable of documents.

    Returns:
        A list holding, for each input document, the list of tokens that are
        not in the module-level ``stop_words``.
    """
    # Build the lookup once: set membership is O(1), whereas testing against
    # the list scans it for every single token.
    blocked = set(stop_words)
    return [
        [token for token in simple_preprocess(str(doc)) if token not in blocked]
        for doc in texts
    ]
# Collect the processed reviews as a plain Python list.
data = papers['paper_text_processed'].values.tolist()
# Tokenize every review, then remove stop words.
data_words = remove_stopwords(list(sent_to_words(data)))
# Preview the first 30 tokens of the first review.
print(data_words[0][:30])

import gensim.corpora as corpora

创建字典

# Build the gensim Dictionary from the tokenized, stop-word-free reviews.
id2word = corpora.Dictionary(data_words)

创建语料库

# The corpus is built from the same token lists (alias, not a copy).
texts = data_words

术语文档频率

# Convert every token list into its bag-of-words representation.
corpus = list(map(id2word.doc2bow, texts))

查看

# Preview the first 30 bag-of-words entries of the first document.
print(corpus[0][:30])
from pprint import pprint

主题数

# Number of topics requested from the LDA model; also used to name the output files.
num_topics = 4

建立 LDA 模型

# Train the LDA model on the bag-of-words corpus.
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics)
# Print the keywords of the topics (num_topics topics were requested, not 10)
pprint(lda_model.print_topics())
# Index the model with the corpus; the result is not used in the visible code.
doc_lda = lda_model[corpus]
import pyLDAvis.gensim
import pickle 
import pyLDAvis

可视化主题

# Render pyLDAvis output inline in the notebook.
pyLDAvis.enable_notebook()
# File that caches the prepared visualisation data, named after the topic
# count. The original wrapped this in os.path.join(), but `os` is never
# imported here and joining a single component returns it unchanged.
LDAvis_data_filepath = str(num_topics)

if 1 == 1:  # always-true guard kept from the original script
    # NOTE: the next line is where the reported error is raised:
    # "The truth value of an array with more than one element is ambiguous.
    # Use a.any() or a.all()"
    LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)

    # Cache the prepared visualisation data on disk.
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)

从磁盘加载预先准备好的pyLDAvis数据

# Read the cached visualisation data back from disk.
with open(LDAvis_data_filepath, mode='rb') as cache_file:
    LDAvis_prepared = pickle.load(cache_file)

# Export a standalone HTML page named after the topic count.
pyLDAvis.save_html(LDAvis_prepared, f'{num_topics}.html')

# Bare expression: the value of the last statement, shown by a notebook cell.
LDAvis_prepared

标签: python-3.x, pandas, nlp, sentiment-analysis, lda

解决方案


推荐阅读