python - 使用 sklearn 管道对 tf-idf 向量进行 K 折逻辑回归
问题描述
我正在尝试使用 sklearn 管道(Pipeline),对以 tf-idf 向量作为输入的逻辑回归模型进行 K 折交叉验证。我找到了几个采用类似做法的示例,但我的代码无法运行,报错为“ValueError:找到样本数量不一致的输入变量:[1, 200]”。如果把逻辑回归模型从管道中去掉,tfidf 向量化器本身可以正常工作。这是我的代码:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
# Toy corpus: two labelled documents, each a comma-separated token string.
rows = [['1', 'buy, help, useful'], ['0', 'buy,bad, useful']]
data = pd.DataFrame(rows, columns=['polarity', 'tokens'])
# Replicate the two rows 150 times (300 samples in total) so that
# cross-validation has enough samples of each class in every fold.
data = pd.concat([data] * 150).sort_index()

# The identity preprocessor replaces the vectorizer's default preprocessing
# stage; tokenization still runs on the raw string.
tvec = TfidfVectorizer(preprocessor=lambda x: x, max_features=10000)
lr = LogisticRegression()

# Pass the text column itself (a 1-D Series of strings), NOT a DataFrame.
# TfidfVectorizer iterates over its input, and iterating a DataFrame yields
# its column labels -- i.e. a single "document" ('tokens') -- which is what
# produced "Found input variables with inconsistent numbers of samples:
# [1, 200]" when the classifier was fitted against 200 training labels.
X = data['tokens']
y = data.polarity
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_score, recall_score, f1_score
def lgr_cv(splits, X, y, pipeline, average_method):
    """Cross-validate ``pipeline`` with stratified K-fold and print metrics.

    For every fold the pipeline is fitted on the training part and evaluated
    on the held-out part; per-class precision/recall/F1 are printed for each
    fold, followed by the mean and standard deviation of every metric across
    all folds.

    Args:
        splits: Number of folds passed to ``StratifiedKFold``.
        X: Samples. Indexed positionally with ``.iloc``, so it must be a
            pandas object; for a text vectorizer it should be a 1-D Series
            of documents rather than a DataFrame.
        y: Labels, also indexed positionally with ``.iloc``.
        pipeline: Estimator exposing ``fit``, ``predict`` and ``score``.
        average_method: Value forwarded as ``average=`` to the precision,
            recall and F1 scorers for the aggregated figures.

    Returns:
        None. All results are printed.
    """
    # Local import: the summary below needs numpy, which the surrounding
    # script never imports (the original raised NameError on ``np``).
    import numpy as np

    kfold = StratifiedKFold(n_splits=splits, shuffle=True, random_state=777)
    accuracy = []
    precision = []
    recall = []
    f1 = []
    for train_index, test_index in kfold.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        lr_fit = pipeline.fit(X_train, y_train)
        prediction = lr_fit.predict(X_test)
        # score() is whatever the final estimator reports (mean accuracy for
        # scikit-learn classifiers); stored as a percentage.
        scores = lr_fit.score(X_test, y_test)
        accuracy.append(scores * 100)

        precision.append(precision_score(y_test, prediction, average=average_method)*100)
        # NOTE(review): this header assumes three classes in this order; the
        # arrays printed below have one entry per class actually present.
        print(' negative neutral positive')
        print('precision:',precision_score(y_test, prediction, average=None))
        recall.append(recall_score(y_test, prediction, average=average_method)*100)
        print('recall: ',recall_score(y_test, prediction, average=None))
        f1.append(f1_score(y_test, prediction, average=average_method)*100)
        print('f1 score: ',f1_score(y_test, prediction, average=None))

    # Summary over all folds: mean and spread of each metric, in percent.
    print("accuracy: %.2f%% (+/- %.2f%%)" % (np.mean(accuracy), np.std(accuracy)))
    print("precision: %.2f%% (+/- %.2f%%)" % (np.mean(precision), np.std(precision)))
    print("recall: %.2f%% (+/- %.2f%%)" % (np.mean(recall), np.std(recall)))
    print("f1 score: %.2f%% (+/- %.2f%%)" % (np.mean(f1), np.std(f1)))
from sklearn.pipeline import Pipeline
# Chain the tf-idf vectorizer and the logistic-regression classifier so that
# the vectorizer is re-fitted on the training part of every fold.
original_pipeline = Pipeline(steps=[('vectorizer', tvec), ('classifier', lr)])

# 3-fold stratified cross-validation with macro-averaged summary metrics.
lgr_cv(
    splits=3,
    X=X,
    y=y,
    pipeline=original_pipeline,
    average_method='macro',
)
感谢任何帮助。
解决方案
推荐阅读
- oracle - 获取无效标识符 SQL 查询
- xcode - 集成 React Native 推送通知时找不到 “React/RCTDefines.h” 文件
- postgresql - postgres - 创建带参数的触发函数
- java - 拆分后如何访问每个元素
- python - 使用具有 NaN 的无监督最近邻
- angularjs - 使用 karma 和 jamis 进行 Angular 单元测试
- css - 是否可以通过 CSS 禁用输入文本字段中的删除十字?
- r - R中检查(英语)语法的功能
- java - Hazelcast 节点没有收到第一个发布的消息
- indy10 - 向 TIdTCPClient 发送文本时如何修复 Indy TIdTCPServer 冻结?