import numpy as np
import pandas as pd
# import pandas_profiling as pp
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, roc_auc_score, recall_score, precision_score, make_scorer
from sklearn import datasets
# import joblib
import warnings

# Load scikit-learn's bundled breast-cancer dataset into a DataFrame whose
# last column, 'target', holds the class label.
cancer = datasets.load_breast_cancer()
df = pd.DataFrame(cancer.data, columns=cancer.feature_names).assign(target=cancer.target)
target = df['target']

# Hold out 40% of the rows for testing; stratify so both splits keep the
# class balance of `target`, and fix the seed so the split is repeatable.
features = df.drop(columns='target')
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.4,
    random_state=13,
    stratify=target,
)

def build_model(model_name, model_class, params=None):
    """Instantiate the estimator registered under ``model_name``.

    The display name selects the construction recipe:

    * contains ``'Ridge'``    -> ``model_class(penalty='l2')``
    * contains ``'Lasso'``    -> ``model_class(penalty='l1', solver='liblinear')``
    * contains ``'Ensemble'`` -> hard-voting ensemble of a random forest
      (``'rf'``) and a gradient-boosting model (``'gbm'``)
    * anything else           -> ``model_class()`` with library defaults

    Parameters
    ----------
    model_name : str
        Display name of the model; also the key looked up in ``params``.
    model_class : callable
        Estimator class (or factory) to instantiate.
    params : dict or None, optional
        Mapping ``model_name -> search space``. When it holds an entry for
        ``model_name`` the estimator is wrapped in a ``RandomizedSearchCV``.

    Returns
    -------
    The bare estimator, or a ``RandomizedSearchCV`` around it when a search
    space was supplied for this model. Neither is fitted yet.
    """
    if 'Ridge' in model_name:
        model = model_class(penalty='l2')
    elif 'Lasso' in model_name:
        # The default 'lbfgs' solver rejects an L1 penalty at fit time;
        # 'liblinear' supports it.
        model = model_class(penalty='l1', solver='liblinear')
    elif 'Ensemble' in model_name:
        # The step names 'rf' / 'gbm' are what the 'rf__*' / 'gbm__*' keys of a
        # search space refer to.
        model = model_class(
            estimators=[('rf', RandomForestClassifier()), ('gbm', GradientBoostingClassifier())],
            voting='hard',
        )
    else:
        model = model_class()

    # A params dict without an entry for this model is treated like "no params"
    # instead of failing with a KeyError.
    search_space = params.get(model_name) if params is not None else None

    if search_space is not None:
        print('Custom Model Parameters provided. Implementing Randomized Search for {} model'.format(model_name))
        return RandomizedSearchCV(
            estimator=model,
            param_distributions=search_space,
            random_state=22,
            n_iter=10,
            cv=5,
            verbose=1,
            n_jobs=-1,
            scoring=make_scorer(f1_score),
            error_score=0.0,  # a failing candidate scores 0 instead of aborting the search
        )

    print('No model parameters provided. Using sklearn default values for {} model'.format(model_name))
    return model

def fit_model(model_name, model_instance, xTrain, yTrain):
    """Fit ``model_instance`` on the training data and return the fitted model.

    For the ``'SVM'`` entry the estimator is placed behind a ``StandardScaler``
    in a pipeline, so the scaling statistics learned from ``xTrain`` are stored
    with the model and reused unchanged at prediction time. Every other model
    is fitted on the raw features.

    Parameters
    ----------
    model_name : str
        Display name of the model; only the exact value ``'SVM'`` triggers scaling.
    model_instance : estimator
        Unfitted estimator (or search wrapper) exposing ``fit(X, y)``.
    xTrain, yTrain
        Training features and labels.

    Returns
    -------
    The fitted model: a pipeline for ``'SVM'``, otherwise whatever
    ``model_instance.fit`` returns.
    """
    if model_name == 'SVM':
        # Local import: the file's import block does not pull in sklearn.pipeline.
        from sklearn.pipeline import make_pipeline

        model = make_pipeline(StandardScaler(), model_instance).fit(xTrain, yTrain)
    else:
        model = model_instance.fit(xTrain, yTrain)

    return model

def predict_vals(fitted_model, xTest):
    """Return the predictions of ``fitted_model`` for ``xTest``.

    No model-specific preprocessing happens here: a model that needs scaled
    inputs is expected to carry its own fitted scaler (as the pipeline built
    by ``fit_model`` does), so the test data is never used to fit a scaler.
    """
    return fitted_model.predict(xTest)

def get_metrics(yTest, y_prediction):
    """Score predictions against the true labels.

    Parameters
    ----------
    yTest
        True labels.
    y_prediction
        Predicted labels, aligned with ``yTest``.

    Returns
    -------
    list
        ``[recall, precision, f1, roc_auc]``, in that order.
    """
    return [
        recall_score(yTest, y_prediction),
        precision_score(yTest, y_prediction),
        f1_score(yTest, y_prediction),
        roc_auc_score(yTest, y_prediction),
    ]

def model_report(list_of_metrics):
    """Turn the collected metric rows into a report table.

    Parameters
    ----------
    list_of_metrics : list of list
        One row per model, laid out as
        ``[model name, recall, precision, f1, roc_auc]``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Model, Recall, Precision, f1, roc_auc`` with the numeric
        values rounded to three decimals.
    """
    # Named `report` rather than `df` so the module-level DataFrame is not shadowed.
    report = pd.DataFrame(
        list_of_metrics,
        columns=['Model', 'Recall', 'Precision', 'f1', 'roc_auc'],
    )
    return report.round(3)

# Display name -> estimator class.
# The names are significant: build_model looks for the substrings 'Ridge',
# 'Lasso' and 'Ensemble', and fit_model matches 'SVM' exactly.
models = {
    'Logistic Regression Ridge': LogisticRegression,
    'Logistic Regression Lasso': LogisticRegression,
    'Random Forest': RandomForestClassifier,
    'SVM': SVC,
    'GBM': GradientBoostingClassifier,
    'EnsembleRFGBM': VotingClassifier,
}

# Search spaces for the randomized search, keyed by the same display names as
# `models`. Each inner dict maps an estimator parameter to its candidate values.
model_parameters = {
    'SVM': {
        # 25 candidates drawn uniformly from [1, 50); low must come before high.
        # A fixed grid such as [1, 10, 100, 1000] is an alternative.
        'C': np.random.uniform(1, 50, 25),
        'class_weight': ['balanced'],
        # gamma has no effect with the linear kernel; it only matters if other
        # kernels are added to the list below.
        'gamma': [0.0001, 0.001],
        'kernel': ['linear'],
    },
    'Random Forest': {
        'n_estimators': [5, 10, 50, 100, 200],
        'max_depth': [3, 5, 10, 20, 40],
        'criterion': ['gini', 'entropy'],
        'bootstrap': [True, False],
        # Offer the whole 1..9 range to the search instead of a single value
        # picked at random when this dict is built.
        'min_samples_leaf': list(range(1, 10)),
    },
    'Logistic Regression Ridge': {
        'C': np.random.rand(25),
        'class_weight': ['balanced'],
    },
    'Logistic Regression Lasso': {
        'C': np.random.rand(25),
        'class_weight': ['balanced'],
    },
    'GBM': {
        'n_estimators': [10, 50, 100, 200, 500],
        'max_depth': [3, 5, 10, None],
        'min_samples_leaf': list(range(1, 10)),
    },
    # The 'rf__' / 'gbm__' prefixes address the named members of the voting ensemble.
    'EnsembleRFGBM': {
        'rf__n_estimators': [5, 10, 50, 100, 200],
        'rf__max_depth': [3, 5, 10, 20, 40],
        'rf__min_samples_leaf': list(range(1, 10)),
        'gbm__n_estimators': [10, 50, 100, 200, 500],
        'gbm__max_depth': [3, 5, 10, None],
        'gbm__min_samples_leaf': list(range(1, 10)),
    },
}


# without parameters
# Baseline pass: no search space is handed to build_model, so every entry of
# `models` is built with its default settings, fitted on the training split
# and scored on the test split.
lst = []  # one row per model: [name, recall, precision, f1, roc_auc]
for model_name, model_class in models.items():
    model_instance = build_model(model_name, model_class)
    fitted_model = fit_model(model_name, model_instance, X_train, y_train)
    y_predicted = predict_vals(fitted_model, X_test)
    metrics = get_metrics(y_test, y_predicted)

    # Prepend the display name so the row matches the columns model_report expects.
    lst.append([model_name] + metrics)




# with parameters
# Tuned pass: same steps as the baseline, but `model_parameters` is passed to
# build_model, which then wraps each estimator in a randomized search.
# NOTE: `lst` is reset here, so the rows collected by the baseline pass are
# lost unless they were consumed before this point.
lst = []  # one row per model: [name, recall, precision, f1, roc_auc]
for model_name, model_class in models.items():
    model_instance = build_model(model_name, model_class, model_parameters)
    fitted_model = fit_model(model_name, model_instance, X_train, y_train)
    y_predicted = predict_vals(fitted_model, X_test)
    metrics = get_metrics(y_test, y_predicted)

    # Prepend the display name so the row matches the columns model_report expects.
    lst.append([model_name] + metrics)




  1. 从用户那里获取模型字典及其参数。如果未提供参数,则使用模型的默认值。
  2. 将报告作为输出提供(如图所示)

有人告诉我应该将这些函数改写为类,并尽可能避免 for 循环。


  1. 如何将所有函数更改为类和方法?基本上,我的前辈想要类似下面这样的用法:

report.getReport # gives the dataFrame of the report


customReport(whatever inputs I'd like to give) # gives df of report
  2. 在遍历用户输入的各个模型时,如何避免 for 循环?我的想法是使用 sklearn 管道(Pipeline):根据我的理解,管道是一系列步骤,所以可以从用户那里获取参数和模型,并将它们作为一系列步骤执行。这样就避免了 for 循环。


customPipeline = Pipeline([ ('rf', RandomForestClassifier(with relevant params from params dict),
                             'SVC', SVC(with relevant params from params dict)) ] )

我在这里找到了类似的解决方案,但我想避免像这样使用 for 循环。

这里的另一个相关解决方案是使用一个可以在不同模型之间切换的类。但在这里我会要求用户能够选择是否要执行 Gridsearch/RandomizedSearch/CV/None。我的想法是我使用这个类,然后将它继承到另一个类,用户可以提供输入以选择 Gridsearch/RandomizedSearch/CV/None 等。我不确定我的想法是否正确。


您可以考虑使用 map(),详细信息在这里:https://www.geeksforgeeks.org/python-map-function/

一些程序员有避免原始循环的习惯——“原始循环是函数内部的任何循环,其中函数的用途大于循环实现的算法”。更多详细信息:https://sean-parent.stlab.cc/presentations/2013-09-11-cpp-seasoning/cpp-seasoning.pdf

我认为这就是要求您删除 for 循环的原因。
