python - 测试多个 ML 模型 - 内核每次都会崩溃
问题描述
我的数据接近 100 万行。我是这样定义想要测试的模型的:
# Feature frame: the three free-text columns consumed by the vectorizers below.
X = df_final[['short_description', 'details', 'root_cause']]

# Shuffled, stratified 5-fold splitter with a fixed seed.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# TF-IDF over uni- to tri-grams, capped at 1500 features.
tfidf_pipeline = Pipeline(
    steps=[
        (
            'tfidf',
            TfidfVectorizer(
                max_features=1500,
                ngram_range=(1, 3),
                stop_words='english',
                strip_accents='ascii',
            ),
        ),
    ]
)

# Binary (presence/absence) unigram counts, capped at 1500 features.
countvec_pipeline = Pipeline(
    steps=[
        (
            'countvec',
            CountVectorizer(
                max_features=1500,
                ngram_range=(1, 1),
                stop_words='english',
                strip_accents='ascii',
                binary=True,
            ),
        ),
    ]
)

# Route each text column to its own vectorizer.
preprocessor_pipeline = ColumnTransformer(
    transformers=[
        ('short_description', countvec_pipeline, 'short_description'),
        ('details', tfidf_pipeline, 'details'),
        ('root_cause', countvec_pipeline, 'root_cause'),
    ]
)

# Candidate classifiers as (name, estimator) pairs.
models = [
    (
        'rf',
        RandomForestClassifier(
            n_estimators=100,
            max_depth=3,
            random_state=0,
            n_jobs=-1,
        ),
    ),
    ('svc', LinearSVC()),
    ('nb', MultinomialNB()),
    (
        'lr',
        LogisticRegression(
            random_state=0,
            solver='saga',
            n_jobs=-1,
        ),
    ),
]
然后我这样做 fit_transform:
# Fit the ColumnTransformer on X and transform X with it; the vectorized
# features are stored in X_prepped.
X_prepped = preprocessor_pipeline.fit_transform(X)
下面这部分代码一运行到第一个模型(随机森林),我的内核就会崩溃。
# Cross-validate every candidate model on the pre-vectorized features.
# Each model's scores are stored under its name so that one iteration does
# not overwrite the results of the previous one. results_dict / results_df
# still hold the last model's scores after the loop, as before.
results_by_model = {}
for model_name, model in models:
    print(model_name)
    results_dict = cross_validate(
        model,
        X_prepped,
        labels,
        cv=cv,
        scoring='accuracy',
        return_train_score=True,
    )
    results_df = pd.DataFrame(results_dict)
    results_by_model[model_name] = results_df
我在这里做错了什么吗?
解决方案
import pandas as pd
from sklearn.preprocessing import FunctionTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
import nltk
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt
from sklearn.multiclass import OneVsRestClassifier
from sklearn.feature_selection import chi2, SelectKBest
import spacy
from sklearn.preprocessing import MaxAbsScaler
# Load the dataset from a CSV file; the relative path is resolved against
# the current working directory.
df=pd.read_csv('input_data.csv')
# Column groups used throughout the script.
NUMERIC = ['Cost', 'Field2']
TEXT = ['Text1', 'Text2']


def combine_text_columns(data_frame, text_labels=TEXT):
    """Join the text columns of each row into one space-separated string.

    Args:
        data_frame: DataFrame containing at least the columns in text_labels.
        text_labels: Names of the columns to combine; defaults to TEXT.

    Returns:
        A Series with one combined string per row of data_frame.
    """
    # fillna() returns a new frame, so the caller's data is never touched and
    # no SettingWithCopyWarning is raised. astype(str) lets non-string cells
    # (e.g. numbers) be joined instead of raising TypeError in str.join.
    text_data = data_frame[text_labels].fillna("").astype(str)
    # Join all text items in a row with a single space in between.
    return text_data.apply(" ".join, axis=1)
# Load spaCy's small English model and take the English stop-word set.
nlp = spacy.load('en_core_web_sm')
stopwords = spacy.lang.en.stop_words.STOP_WORDS

# Presumably the k for chi-squared feature selection (SelectKBest); it is not
# referenced by any active code visible here.
chi_k = 300

# Token pattern: runs of ASCII letters/digits that are followed by whitespace.
TOKENS_ALPHANUMERIC = '[A-Za-z0-9]+(?=\\s+)'


def _select_numeric_columns(frame):
    """Return only the NUMERIC columns of frame."""
    return frame[NUMERIC]


# Selector steps: one yields the combined text, the other the numeric columns.
get_text_data = FunctionTransformer(combine_text_columns, validate=False)
get_numeric_data = FunctionTransformer(_select_numeric_columns, validate=False)
# Numeric branch: select the numeric columns, then impute missing values
# with SimpleImputer's default settings.
numeric_features = Pipeline(steps=[
    ('selector', get_numeric_data),
    ('imputer', SimpleImputer()),
])

# Text branch: select the text via get_text_data, then TF-IDF vectorize it.
# An alternative that was left disabled here: CountVectorizer with
# stop_words=stopwords, token_pattern=TOKENS_ALPHANUMERIC and
# ngram_range=(1, 2), followed by SelectKBest(chi2, chi_k).
text_features = Pipeline(steps=[
    ('selector', get_text_data),
    ('vectorizer', TfidfVectorizer(stop_words='english')),
])

# Full model: concatenate both branches, scale, then one-vs-rest logistic
# regression. Other final estimators that were left disabled: LinearSVC(),
# MultinomialNB(), OneVsRestClassifier(RandomForestClassifier(n_estimators=15)).
pl = Pipeline(steps=[
    ('union', FeatureUnion(transformer_list=[
        ('numeric_features', numeric_features),
        ('text_features', text_features),
    ])),
    ('scale', MaxAbsScaler()),
    ('lr', OneVsRestClassifier(LogisticRegression(C=100))),
])
TARGET = ['IsApartment']

# Label-encode the opportunity name. The Series is named so that every column
# of the frame handed to the scaler has a string label (an unnamed Series
# would contribute an integer column label 0).
encoder = LabelEncoder()
label_enc = pd.Series(
    encoder.fit_transform(df['OpportunityName']),
    name='opportunityName_enc',
)

# Numeric column names plus the encoded column, in the order they are scaled.
NUMERIC2 = NUMERIC + ['opportunityName_enc']

# Standardize the numeric columns together with the encoded column.
sc_X = StandardScaler()
X2 = pd.concat([df[NUMERIC], label_enc], axis=1)
X2 = pd.DataFrame(sc_X.fit_transform(X2), columns=NUMERIC2)

# Build the feature frame from the scaled numeric block and the raw text
# columns. X is created here rather than concatenated onto a previous X,
# because no X exists at this point in the script.
X = pd.concat([X2, df[TEXT]], axis=1)

# Use a 1-D target (a Series) instead of a one-column DataFrame.
y = df[TARGET[0]]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.3, random_state=42
)
pl.fit(X_train, y_train)

# NOTE: the metrics below are computed on the training split, not on X_test.
predictions = pl.predict(X_train)
accuracy = pl.score(X_train, y_train)
# score() returns a fraction in [0, 1]; the % format type scales it to percent.
print("\nAccuracy on sample data - numeric, no nans:{:.2%} ".format(accuracy))

# Probability of the positive class (second column of predict_proba).
ytrain_pred_probas = pl.predict_proba(X_train)[:, 1]
fpr, tpr, thresholds = roc_curve(y_train, ytrain_pred_probas)  # precision_recall_curve
roc = pd.DataFrame({'FPR': fpr, 'TPR': tpr, 'Thresholds': thresholds})

# Plot the ROC curve with a dashed vertical marker at FPR = 0.1.
_ = plt.figure()
plt.plot(roc.FPR, roc.TPR)
plt.axvline(0.1, color='#00C851', linestyle='--')
plt.xlabel("FPR")
plt.ylabel("TPR")
推荐阅读
- mapbox - 是否可以使用样式规范在 Mapbox Gl Native (Android) 中添加线/圆/多边形层?
- python - 如何找到python lib目录?
- reactjs - req.body.something 返回未定义
- rest-assured - expectBody("", is(2)) 与 expectBody(is(2))
- python - 如何让主题指令显示在目录中?
- rust - 如何在 Rust 中使用来自 web_sys 的 WebGL 扩展
- python - 如何在纯 Django 模板中处理图像,以便它们像在 Wagtail 中一样工作
- angular - 如何动态修改角度分量输入值
- windows-installer - 带有 GUI 安装程序的 InstallShield 安装程序错误 2006
- mfc - 使用 MFC 在不更改 GUI 的情况下动态读取卫星 DLL