python - 机器学习垃圾邮件分类
问题描述
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import os
dataset = pd.read_csv(r'emails.csv')
dataset.columns #Index(['text', 'spam'], dtype='object')
dataset.shape #(5728, 2)
#Checking for duplicates and removing them
dataset.drop_duplicates(inplace = True)
dataset.shape #(5695, 2)
#Checking for any null entries in the dataset
print (pd.DataFrame(dataset.isnull().sum()))
'''
text 0
spam 0
'''
#Using Natural Language Processing to cleaning the text to make one corpus
# Cleaning the texts
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
#Every mail starts with 'Subject :' will remove this from each text
dataset['text']=dataset['text'].map(lambda text: text[1:])
dataset['text'] = dataset['text'].map(lambda text:re.sub('[^a-zA-Z0-9]+', ' ',text)).apply(lambda x: (x.lower()).split())
ps = PorterStemmer()
corpus=dataset['text'].apply(lambda text_list:' '.join(list(map(lambda word:ps.stem(word),(list(filter(lambda text:text not in set(stopwords.words('english')),text_list)))))))
# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
X = cv.fit_transform(corpus.values).toarray()
y = dataset.iloc[:, 1].values
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20)
# Fitting Naive Bayes classifier to the Training set
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
classifier.fit(X_train , y_train)
# Predicting the Test set results
y_pred = classifier.predict(X_test)
我练习了没有。就在今天,我转移了文本,所以我准备好这个模型,精度为 0.98,但是当我试图预测新的文本输入时,我遇到了错误。
subject = "hello this is a test"
classifier.predict([[subject]])
我得到的错误是
FutureWarning:从 0.22 版开始,如果 dtype='numeric',字节/字符串数组将被转换为十进制数。建议您在 scikit-learn 中使用之前将数组转换为 float dtype,例如使用 your_array = your_array.astype(np.float64)。返回 f(**kwargs)
ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 29223 is different from 1)
我可以尝试的任何建议或可能的解决方案
我也尝试转换句子
from sklearn.feature_extraction.text import TfidfTransformer
vectorizer = CountVectorizer()
tfidfconverter = TfidfTransformer()
text = "Hello world!"
text = vectorizer.transform([text]).toarray()
text = tfidfconverter.transform(text).toarray()
label = classifier.predict(text)[0]```
but got a NotFittedError: Vocabulary not fitted or provided.
解决方案
推荐阅读
- db2 - 用于创建用户的 Db2 查询
- mysql-workbench - 如何更改 Windows 的选项文件 (my.ini) 位置?
- javascript - Phaser 3 创建一个带有分和秒的游戏时钟
- selenium - 无法从网站 https://demoqa.com/webtables 的 web 表中检索数据
- r - RMarkdown 投影仪和目录
- graphviz - 如何只写一次长标签,并在图中使用对它们的引用?
- angular - 一个又一个http调用的角度合并图
- pytorch - 解码器总是预测相同的标记
- javascript - Material-UI 选择器在模糊而不是更改时触发验证?
- python - KivyMD:无法绑定函数