python - 在 Python 中使用 NLTK 对单词进行标记的问题。返回单个字母而不是单词的列表
问题描述
我的 NLP python 程序遇到了一些问题,我正在尝试创建一个正面和负面推文的数据集,但是当我运行代码时,它只返回似乎被标记化的单个字母。我是 Python 和 NLP 的新手,所以如果这是基本的或者我对自己的解释很糟糕,我深表歉意。我在下面添加了我的代码:
import csv
import random
import re
import string
import mysql.connector
from nltk import FreqDist, classify, NaiveBayesClassifier
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
def remove_noise(tweet_tokens, stop_words=()):
    """Clean a single tweet's token list for classification.

    Removes URLs and @-mentions, lemmatizes each token according to its
    POS tag, and drops punctuation and stopwords.

    Args:
        tweet_tokens: list of word tokens for one tweet.
        stop_words: container of lowercase stopwords to exclude.

    Returns:
        List of cleaned, lowercased tokens.
    """
    # Hoisted out of the loop: compiling the regexes and building the
    # lemmatizer once per call instead of once per token.
    url_re = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'
                        r'(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_re = re.compile(r"(@[A-Za-z0-9_]+)")
    lemmatizer = WordNetLemmatizer()

    cleaned_tokens = []
    for token, tag in pos_tag(tweet_tokens):
        token = url_re.sub('', token)
        token = mention_re.sub('', token)
        # Map the Penn Treebank tag prefix to the WordNet POS letter.
        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        token = lemmatizer.lemmatize(token, pos)
        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
            cleaned_tokens.append(token.lower())
            print(token)  # debug trace of kept tokens
    return cleaned_tokens
def get_all_words(cleaned_tokens_list):
    """Yield every token from a list of per-tweet token lists, in order."""
    for token_list in cleaned_tokens_list:
        yield from token_list
def get_tweets_for_model(cleaned_tokens_list):
    """Yield one NLTK feature dict ({token: True, ...}) per tweet."""
    for tweet_tokens in cleaned_tokens_list:
        yield {token: True for token in tweet_tokens}
if __name__ == "__main__":
with open('positive_tweets.csv') as csv_file:
positive_tweets = csv.reader(csv_file, delimiter=',')
with open('negative_tweets.csv') as csv_file:
negative_tweets = csv.reader(csv_file, delimiter=',')
stop_words = stopwords.words('english')
positive_tweet_tokens = word_tokenize(positive_tweets)
negative_tweet_tokens = word_tokenize(negative_tweets)
positive_cleaned_tokens_list = []
negative_cleaned_tokens_list = []
for tokens in positive_tweet_tokens:
positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))
for tokens in negative_tweet_tokens:
negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))
all_pos_words = get_all_words(positive_cleaned_tokens_list)
all_neg_words = get_all_words(negative_cleaned_tokens_list)
freq_dist_pos = FreqDist(all_pos_words)
freq_dist_neg = FreqDist(all_neg_words)
print(freq_dist_pos.most_common(10))
print(freq_dist_neg.most_common(10))
positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)
positive_dataset = [(tweet_dict, 'positive')
for tweet_dict in positive_tokens_for_model]
negative_dataset = [(tweet_dict, 'negative')
for tweet_dict in negative_tokens_for_model]
dataset = positive_dataset + negative_dataset
random.shuffle(dataset)
train_data = dataset[:7000]
test_data = dataset[7000:]
classifier = NaiveBayesClassifier.train(train_data)
print("Accuracy is:", classify.accuracy(classifier, test_data))
来自 CSV 文件的片段以供参考:
"tweetid","username","created_at","tweet","location","place","classification"
"1285666943073161216","MeFixerr","2020-07-21 20:04:20+00:00","Overwhelmed by all the calls, msgs and tweets. I apologize for getting lost without prior notice. Did not expect to be missed with such fervor.
I am good & taking a break. Lots of love and dua's for everyone of you in #PTIFamily ❤","Pakistan, Quetta",,"positive"
解决方案
您的令牌来自文件名('positive_tweets.csv'),而不是文件中的数据。添加如下打印语句。你会看到问题。
positive_tweet_tokens = word_tokenize(positive_tweets)
negative_tweet_tokens = word_tokenize(negative_tweets)
print("tokens=", positive_tweet_tokens) # add this line
完整脚本的输出
tokens= ['positive_tweets.csv']
v
v
[('e', 3), ('v', 2), ('p', 1), ('w', 1), ('c', 1)]
[('e', 4), ('v', 2), ('n', 1), ('g', 1), ('w', 1), ('c', 1)]
Accuracy is: 0
关于第二个错误,替换这个
with open('positive_tweets.csv') as csv_file:
positive_tweets = csv.reader(csv_file, delimiter=',')
with open('negative_tweets.csv') as csv_file:
negative_tweets = csv.reader(csv_file, delimiter=',')
有了这个
positive_tweets = negative_tweets = ""
with open('positive_tweets.csv') as csv_file:
positive_tweets_rdr = csv.reader(csv_file, delimiter=',')
all = list(positive_tweets_rdr)
for lst in all[1:]: positive_tweets += ' ' + lst[3] #tweet column
with open('negative_tweets.csv') as csv_file:
negative_tweets_rdr = csv.reader(csv_file, delimiter=',')
all = list(negative_tweets_rdr)
for lst in all[1:]: negative_tweets += ' ' + lst[3] #tweet column
推荐阅读
- ios - 如何在 iOS 中拨打电话时隐藏 *DTMF* 号码
- c++ - 以下代码使用构造函数对字符串进行排序有什么问题?
- reactjs - 如何在 ReactTable V7 中管理 noDataComponent 或显示未找到的行
- amazon-web-services - 将 Kong 与 AWS EKS 结合使用
- python - flask-migrate:在迁移中运行自定义 SQL?
- javascript - JavaScript 正在返回一个重复的小数
- odoo-13 - 什么是 context_today?
- ubuntu - Ansible 出现奇怪的文件权限/身份验证错误
- javascript - 从 JavaScript 中的输入数组动态生成一个列表
- python - 如何将带有 .Net 应用程序的 Python 作为一个安装可执行文件分发?