Testing multiple ML models - kernel dies every time

Problem description

The dataset has close to 1 million rows. This is how I define the models I want to test:

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

X = df_final[['short_description', 'details', 'root_cause']]
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

tfidf_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=1500, ngram_range=(1, 3),
                              stop_words='english', strip_accents='ascii'))])

countvec_pipeline = Pipeline([
    ('countvec', CountVectorizer(max_features=1500, ngram_range=(1, 1),
                                 stop_words='english', strip_accents='ascii',
                                 binary=True))])

preprocessor_pipeline = ColumnTransformer(
    transformers=[
        ('short_description', countvec_pipeline, 'short_description'),
        ('details', tfidf_pipeline, 'details'),
        ('root_cause', countvec_pipeline, 'root_cause'),
    ])

models = [
    ('rf', RandomForestClassifier(n_estimators=100,
                                  max_depth=3,
                                  random_state=0,
                                  n_jobs=-1)),
    ('svc', LinearSVC()),
    ('nb', MultinomialNB()),
    ('lr', LogisticRegression(random_state=0,
                              solver='saga',
                              n_jobs=-1))
]

Then I fit_transform like this:

X_prepped = preprocessor_pipeline.fit_transform(X)

The part below is where my kernel dies as soon as it hits the first model, the random forest:

for model_name, model in models:
    print(model_name)
    results_dict = cross_validate(model, X_prepped, labels, cv=cv,
                                  scoring='accuracy', return_train_score=True)
    results_df = pd.DataFrame(results_dict)

Am I doing something wrong here?

Tags: python, machine-learning, scikit-learn

Solution
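The end-to-end example below shows the same kind of mixed numeric/text preprocessing, but with the classifier placed inside the pipeline itself, so fitting, scoring, and plotting a ROC curve all run against the assembled features in one object: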


import matplotlib.pyplot as plt
import pandas as pd
import spacy
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import (FunctionTransformer, LabelEncoder,
                                   MaxAbsScaler, StandardScaler)
from sklearn.svm import LinearSVC

df = pd.read_csv('input_data.csv')

NUMERIC = ['Cost', 'Field2']
TEXT = ['Text1', 'Text2']


def combine_text_columns(data_frame, text_labels=TEXT):
    """Concatenate the text columns of each row into a single string."""
    text_data = data_frame[text_labels]

    # Replace NaNs with empty strings (avoids fillna(inplace=True) on a slice)
    text_data = text_data.fillna("")

    # Join all text items in a row with a space in between
    return text_data.apply(lambda x: " ".join(x), axis=1)
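As a quick sanity check (not part of the original answer), combine_text_columns behaves like this on a hypothetical toy frame whose column names match TEXT:

toy = pd.DataFrame({'Text1': ['red house', None],
                    'Text2': ['blue door', 'green roof'],
                    'Cost': [1.0, 2.0]})
print(combine_text_columns(toy).tolist())
# -> ['red house blue door', ' green roof']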

nlp = spacy.load('en_core_web_sm')
stopwords = spacy.lang.en.stop_words.STOP_WORDS
chi_k = 300
# Token pattern for the (commented-out) CountVectorizer alternative below
TOKENS_ALPHANUMERIC = r'[A-Za-z0-9]+(?=\s+)'

# Wrap the column selectors so they can be used as pipeline steps
get_text_data = FunctionTransformer(combine_text_columns, validate=False)

# Select the numeric columns: get_numeric_data
get_numeric_data = FunctionTransformer(lambda x: x[NUMERIC], validate=False)

pl = Pipeline([
    ('union', FeatureUnion(
        transformer_list=[
            ('numeric_features', Pipeline([
                ('selector', get_numeric_data),
                ('imputer', SimpleImputer())
            ])),
            ('text_features', Pipeline([
                ('selector', get_text_data),
                ('vectorizer', TfidfVectorizer(stop_words='english')),
                # Alternative text branch:
                # ('vectorizer', CountVectorizer(stop_words=stopwords,
                #                                token_pattern=TOKENS_ALPHANUMERIC,
                #                                ngram_range=(1, 2))),
                # ('dim_red', SelectKBest(chi2, chi_k)),
            ]))
        ]
    )),
    ('scale', MaxAbsScaler()),
    # Other classifiers that were tried:
    # ('svc', LinearSVC()),
    # ('nb', MultinomialNB()),
    # ('clf', OneVsRestClassifier(RandomForestClassifier(n_estimators=15))),
    ('lr', OneVsRestClassifier(LogisticRegression(C=100)))
])
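A note on the scaler choice (my reading, not spelled out in the original answer): TfidfVectorizer produces a sparse matrix, and MaxAbsScaler scales each feature by its maximum absolute value without centering, so it accepts sparse input and keeps it sparse. A centering scaler such as StandardScaler with its defaults would reject or densify the matrix, which matters when the row count is large.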

TARGET = ['IsApartment']
sc_X = StandardScaler()
encoder = LabelEncoder()
label_enc = pd.Series(encoder.fit_transform(df['OpportunityName']),
                      name='opportunityName_enc')

NUMERIC2 = NUMERIC + ['opportunityName_enc']
X2 = pd.concat([df[NUMERIC], label_enc], axis=1)
X2 = pd.DataFrame(sc_X.fit_transform(X2), columns=NUMERIC2)

# Assemble the feature frame: scaled numerics plus raw text columns
X = pd.concat([X2, df[TEXT]], axis=1)
y = df[TARGET[0]]  # 1-D target, so sklearn does not treat it as multilabel

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3,
                                                    random_state=42)

pl.fit(X_train, y_train)
predictions = pl.predict(X_train)
accuracy = pl.score(X_train, y_train)
print("\nAccuracy on sample data - numeric, no nans: {:.2f}%".format(accuracy * 100))

# Probability of predicting class 1 for each training row
ytrain_pred_probas = pl.predict_proba(X_train)[:, 1]
fpr, tpr, thresholds = roc_curve(y_train, ytrain_pred_probas)  # or precision_recall_curve
roc = pd.DataFrame({'FPR': fpr, 'TPR': tpr, 'Thresholds': thresholds})

_ = plt.figure()
plt.plot(roc.FPR, roc.TPR)
plt.axvline(0.1, color='#00C851', linestyle='--')
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.show()