python - 首先训练模型并多次测试
问题描述
我一直在尝试将 python 的 NLP 脚本与基于 QT GUI 的 C++ 应用程序一起使用。基本上在应用程序中,我试图通过命令行访问 NLP 脚本:
QString path = "D:/DS Project/Treegramming";
QString command("py");
QStringList params = QStringList() << "nlp.py";
params << text;
QProcess *process = new QProcess();
process->setWorkingDirectory(path);
process->start(command, params);
process->waitForFinished();
QString result = process->readAll();
以上工作完美。但问题是,它需要大约 40-50 秒来执行,因为它首先训练模型然后进行测试。但我想先训练模型并像在 Jupyter Notebook 中那样对其进行多次测试。为此,我制作了一个单独的函数来测试并尝试使用命令行访问它:
PS D:\DS Project\Treegramming> py nlp.py "test('这太棒了')"
但是这件事又是先执行整个脚本,然后再执行函数。我能做些什么来解决这个问题吗?
蟒蛇脚本:
# -*- coding: utf-8 -*-
"""
Created on Fri Dec 6 16:18:01 2019
@author: Muhammad Ahmed
"""
import nltk
import sys
import random
import re,string
from nltk.corpus import twitter_samples
from nltk.corpus import stopwords
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from nltk.corpus import twitter_samples
from nltk import classify
from nltk import NaiveBayesClassifier
from nltk import FreqDist
from nltk.stem.wordnet import WordNetLemmatizer
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')
text = twitter_samples.strings('tweets.20150430-223406.json')
tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
def lemmatize_sentence(tokens):
sentence = []
lematizer = WordNetLemmatizer()
for word, tag in pos_tag(tokens):
if tag.startswith('NN'):
pos = 'n'
elif tag.startswith('VB'):
pos = 'v'
else:
pos = 'a'
sentence.append( lematizer.lemmatize( word , pos ) )
return sentence
def remove_noise(tokens , stop_words = ()):
sentence = []
for token, tag in pos_tag( tokens ):
token = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+' , '',token)
token = re.sub("(@[A-Za-z0-9_]+)","",token)
if tag.startswith("NN"):
pos = 'n'
elif tag.startswith('VB'):
pos = 'v'
else:
pos = 'a'
lemmatizer = WordNetLemmatizer()
token = lemmatizer.lemmatize(token, pos)
if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
sentence.append( token.lower() )
return sentence
def get_all_words(tokens_list):
for tokens in tokens_list:
for token in tokens:
yield token
def get_tweets_for_model(tokens_list):
for tweets in tokens_list:
yield dict([token,True] for token in tweets)
stop_words = stopwords.words('english')
positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')
positive_cleaned_tokens_list = []
negative_cleaned_tokens_list = []
for tokens in positive_tweet_tokens:
positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))
for tokens in negative_tweet_tokens:
negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))
all_pos_words = get_all_words( positive_cleaned_tokens_list )
all_neg_words = get_all_words( negative_cleaned_tokens_list )
freq_dis_pos = FreqDist( all_pos_words )
freq_dis_neg = FreqDist( all_neg_words )
positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)
pos_dataset = [(tweets,"Positive") for tweets in positive_tokens_for_model]
neg_dataset = [(tweets,"Negative") for tweets in negative_tokens_for_model]
dataset = pos_dataset + neg_dataset
random.shuffle(dataset)
train_data = dataset[:7000]
test_data = dataset[7000:]
classifier = NaiveBayesClassifier.train(train_data)
def test( custom_tweet ):
custom_tokens = remove_noise(word_tokenize(custom_tweet))
res = classifier.classify(dict([token, True] for token in custom_tokens))
print(res)
f = open( "result.txt" , "w" )
f.write(res)
f.close()
eval( sys.argv[1] );
解决方案
您需要创建两个 python 脚本:
- 首先训练并保存NaiveBayesClassifier
- 其次是加载和测试模型。
为了防止重复代码,我将为有用的函数创建一个脚本,我会调用它utils.py
,它应该如下所示:
import re
import string
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
def lemmatize_sentence(tokens):
sentence = []
lematizer = WordNetLemmatizer()
for word, tag in pos_tag(tokens):
if tag.startswith('NN'):
pos = 'n'
elif tag.startswith('VB'):
pos = 'v'
else:
pos = 'a'
sentence.append( lematizer.lemmatize( word , pos ) )
return sentence
def remove_noise(tokens , stop_words = ()):
sentence = []
for token, tag in pos_tag( tokens ):
token = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+' , '',token)
token = re.sub("(@[A-Za-z0-9_]+)","",token)
if tag.startswith("NN"):
pos = 'n'
elif tag.startswith('VB'):
pos = 'v'
else:
pos = 'a'
lemmatizer = WordNetLemmatizer()
token = lemmatizer.lemmatize(token, pos)
if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
sentence.append( token.lower() )
return sentence
def get_all_words(tokens_list):
for tokens in tokens_list:
for token in tokens:
yield token
def get_tweets_for_model(tokens_list):
for tweets in tokens_list:
yield dict([token,True] for token in tweets)
然后让我们创建训练脚本,我将调用它train.py
,它应该如下所示:
import random
import pickle
from utils import *
from nltk import FreqDist
from nltk.corpus import stopwords
from nltk import NaiveBayesClassifier
from nltk.corpus import twitter_samples
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')
text = twitter_samples.strings('tweets.20150430-223406.json')
tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
stop_words = stopwords.words('english')
positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')
positive_cleaned_tokens_list = []
negative_cleaned_tokens_list = []
for tokens in positive_tweet_tokens:
positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))
for tokens in negative_tweet_tokens:
negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))
all_pos_words = get_all_words( positive_cleaned_tokens_list )
all_neg_words = get_all_words( negative_cleaned_tokens_list )
freq_dis_pos = FreqDist( all_pos_words )
freq_dis_neg = FreqDist( all_neg_words )
positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)
pos_dataset = [(tweets,"Positive") for tweets in positive_tokens_for_model]
neg_dataset = [(tweets,"Negative") for tweets in negative_tokens_for_model]
dataset = pos_dataset + neg_dataset
random.shuffle(dataset)
train_data = dataset[:7000]
test_data = dataset[7000:]
classifier = NaiveBayesClassifier.train(train_data)
#### ADD THESE TO SAVE THE CLASSIFIER ####
with open("model.pickle", "wb") as fout:
pickle.dump(classifier, fout)
test.py
最后,应该如下所示的测试脚本:
import sys
import pickle
from nltk import classify
from nltk.tokenize import word_tokenize
from utils import remove_noise
#### ADD THESE TO LOAD THE CLASSIFIER ####
with open('model.pickle', 'rb') as fin:
classifier = pickle.load(fin)
def test( custom_tweet ):
custom_tokens = remove_noise(word_tokenize(custom_tweet))
res = classifier.classify(dict([token, True] for token in custom_tokens))
print(res)
f = open( "result.txt" , "w" )
f.write(res)
f.close()
eval( sys.argv[1] );
现在,运行train.py
一次以训练朴素贝叶斯分类器,该分类器将创建一个名为的新文件,该文件model.pickle
包含经过训练的分类器。然后test.py
在您的自定义推文上从您的 C++ 应用程序运行。test.py
应该加载经过训练的模型model.pickle
并将其用于给定的自定义推文。
推荐阅读
- python-3.x - 在 lambda 中使用 boto3 从 s3 读取和写入 excel 文件
- listview - Flutter:SimpleDialog 中的 ListView
- ios - 具有开发依赖性的 podspec
- c# - 服务中的文件系统观察程序未处理文件
- javascript - AWS 版本 4 签名片段代码以及 javascript 中的示例
- php - SQL 查询在插入特定行时持续存在
- c++ - 我可以制作一个 64 位驱动程序并仅在我的 PC 上使用它而不购买证书吗?
- textures - UE4 贴花始终具有相同的材质
- angularjs - 在 AngularJS 中动态更改选项值时,选择选项未正确反映
- perl - DateTime::Event::Cron::Quartz 无法处理 */2 小时