python - TypeError:逻辑回归模型中的“float”和“str”实例之间不支持“<”
问题描述
我想在 python 中使用机器学习将恶意网址分类为好或坏,一切正常,但出现此错误。下面的代码用于对数据集进行建模,最后写的是问题。此外,我写下了完整的 Traceback 错误,我尝试删除所有空值但注意到工作
[# EDA Packages
import pandas as pd
import numpy as np
import random
# Machine Learning Packages
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
# Load Url Data
urls_data = pd.read_csv('C:/Users/user/Desktop/iot/data/dataot.csv',error_bad_lines=False, encoding = 'unicode_escape')
type(urls_data)
urls_data.head()
def makeTokens(f):
tkns_BySlash = str(f.encode('utf-8')).split('/') # make tokens after splitting by slash
total_Tokens = \[\]
for i in tkns_BySlash:
tokens = str(i).split('-') # make tokens after splitting by dash
tkns_ByDot = \[\]
for j in range(0,len(tokens)):
temp_Tokens = str(tokens\[j\]).split('.') # make tokens after splitting by dot
tkns_ByDot = tkns_ByDot + temp_Tokens
total_Tokens = total_Tokens + tokens + tkns_ByDot
total_Tokens = list(set(total_Tokens)) #remove redundant tokens
if 'com' in total_Tokens:
total_Tokens.remove('com') #removing .com since it occurs a lot of times and it should not be included in our features
return total_Tokens
# Labels
y = urls_data\["label"\]
# Features
url_list = urls_data\["url"\]
# Using Default Tokenizer
#vectorizer = TfidfVectorizer()
# Using Custom Tokenizer
vectorizer = TfidfVectorizer(tokenizer=makeTokens)
# Store vectors into X variable as Our XFeatures
X = vectorizer.fit_transform(url_list)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Model Building
#using logistic regression
logit = LogisticRegression()
logit.fit(X_train, y_train, errors='coerce')]
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-84-73b6ea97c396> in <module>
2 #using logistic regression
3 logit = LogisticRegression()
----> 4 logit.fit(X_train, y_train)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\logistic.py in fit(self, X, y, sample_weight)
1284 X, y = check_X_y(X, y, accept_sparse='csr', dtype=_dtype, order="C",
1285 accept_large_sparse=solver != 'liblinear')
-> 1286 check_classification_targets(y)
1287 self.classes_ = np.unique(y)
1288 n_samples, n_features = X.shape
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\multiclass.py in check_classification_targets(y)
166 y : array-like
167 """
--> 168 y_type = type_of_target(y)
169 if y_type not in ['binary', 'multiclass', 'multiclass-multioutput',
170 'multilabel-indicator', 'multilabel-sequences']:
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\multiclass.py in type_of_target(y)
285 return 'continuous' + suffix
286
--> 287 if (len(np.unique(y)) > 2) or (y.ndim >= 2 and len(y[0]) > 1):
288 return 'multiclass' + suffix # [1, 2, 3] or [[1., 2., 3]] or [[1, 2]]
289 else:
C:\ProgramData\Anaconda3\lib\site-packages\numpy\lib\arraysetops.py in unique(ar, return_index, return_inverse, return_counts, axis)
231 ar = np.asanyarray(ar)
232 if axis is None:
--> 233 ret = _unique1d(ar, return_index, return_inverse, return_counts)
234 return _unpack_tuple(ret)
235
C:\ProgramData\Anaconda3\lib\site-packages\numpy\lib\arraysetops.py in _unique1d(ar, return_index, return_inverse, return_counts)
279 aux = ar[perm]
280 else:
--> 281 ar.sort()
282 aux = ar
283 mask = np.empty(aux.shape, dtype=np.bool_)
TypeError: '<' not supported between instances of 'float' and 'str'
解决方案
推荐阅读
- laravel - 对于我的 laravel 项目,api 中的验证在邮递员中不起作用
- python - 在 python 中打印方法时,十六进制值是什么意思?
- asp.net-mvc - 我的带有 MVC css 的 asp.net 项目不工作
- javascript - 01 与整数规则中的 1 相同吗?
- java - Box Java sdk - API 返回错误代码 404 - 未找到
- r - 将数据从 ftp 文件直接获取到环境中或作为文件 r
- python - 在 Tkinter 中使用 askdirectory() 选择文件夹时显示所有文件
- django - Docker + Django + Vue.js + Webpack 如何正确配置 nginx?
- c# - 如何保护 .Net Core 上的 Web api 控制器仅供本地计算机使用
- postgresql - 使用 postgresql 在 K8s 中配置超集的问题