1. A Code Example: Feature Selection Can Reduce Overfitting
This example comes from Chapter 4 of Machine Learning in Action. It builds a small Sequential Backward Selection (SBS) class: starting from all 13 features, it greedily drops the one feature whose removal costs the least validation accuracy, and repeats until only k_features remain.
#coding=utf-8
'''
We use KNN to show that feature selection may reduce overfitting.
'''
from sklearn.base import clone
from itertools import combinations
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


class SBS():
    '''Sequential Backward Selection: greedily remove features until k_features remain.'''
    def __init__(self, estimator, k_features, scoring=accuracy_score,
                 test_size=0.25, random_state=1):
        self.scoring = scoring
        self.estimator = clone(estimator)
        self.k_features = k_features
        self.test_size = test_size
        self.random_state = random_state

    def fit(self, X, y):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=self.test_size, random_state=self.random_state)
        dim = X_train.shape[1]
        self.indices_ = tuple(range(dim))
        self.subsets_ = [self.indices_]
        score = self._calc_score(X_train, y_train, X_test, y_test, self.indices_)
        self.scores_ = [score]
        while dim > self.k_features:
            scores = []
            subsets = []
            # Try every subset that drops exactly one of the current features.
            for p in combinations(self.indices_, r=dim - 1):
                score = self._calc_score(X_train, y_train, X_test, y_test, p)
                scores.append(score)
                subsets.append(p)
            best = np.argmax(scores)
            self.indices_ = subsets[best]
            self.subsets_.append(self.indices_)
            dim -= 1
            self.scores_.append(scores[best])
        self.k_score_ = self.scores_[-1]
        return self

    def transform(self, X):
        return X[:, self.indices_]

    def _calc_score(self, X_train, y_train, X_test, y_test, indices):
        self.estimator.fit(X_train[:, indices], y_train)
        y_pred = self.estimator.predict(X_test[:, indices])
        score = self.scoring(y_test, y_pred)
        return score


import pandas as pd

df_wine = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data',
                      header=None)
df_wine.columns = ['Class label', 'Alcohol', 'Malic acid', 'Ash',
                   'Alcalinity of ash', 'Magnesium', 'Total phenols',
                   'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins',
                   'Color intensity', 'Hue', 'OD280/OD315 of diluted wines',
                   'Proline']
X, y = df_wine.iloc[:, 1:].values, df_wine.iloc[:, 0].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

from sklearn.preprocessing import StandardScaler
stdsc = StandardScaler()
X_train_std = stdsc.fit_transform(X_train)
X_test_std = stdsc.transform(X_test)

from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt

knn = KNeighborsClassifier(n_neighbors=2)
sbs = SBS(knn, k_features=1)
sbs.fit(X_train_std, y_train)

k_feat = [len(k) for k in sbs.subsets_]
plt.figure(figsize=(8, 10))  # figsize must be a tuple
plt.subplot(2, 1, 1)
plt.plot(k_feat, sbs.scores_, marker='o')
plt.ylim([0.7, 1.1])
plt.ylabel('Accuracy')
plt.xlabel('Number of features')
plt.grid()
# plt.show()

# Let's see which five features yield such good performance on the validation dataset.
# The ninth element of subsets_ is the step that kept 5 of the 13 features.
k5 = list(sbs.subsets_[8])
print(df_wine.columns[1:][k5])
'''
Index(['Alcohol', 'Malic acid', 'Alcalinity of ash', 'Hue', 'Proline'], dtype='object')
'''

# Let's evaluate the performance of the KNN classifier on the original test set,
# first with all 13 features.
knn.fit(X_train_std, y_train)
print("Training Accuracy:", knn.score(X_train_std, y_train))
print("Test Accuracy:", knn.score(X_test_std, y_test))
'''
Training Accuracy: 0.9838709677419355
Test Accuracy: 0.9444444444444444
'''

# We see a slight degree of overfitting when all 13 features are used for training.
knn.fit(X_train_std[:, k5], y_train)
print("Training Accuracy:", knn.score(X_train_std[:, k5], y_train))
print("Test Accuracy:", knn.score(X_test_std[:, k5], y_test))
'''
Training Accuracy: 0.9596774193548387
Test Accuracy: 0.9629629629629629
'''
# With only the 5 selected features, overfitting is reduced and test accuracy improves.

# Use a random forest to show feature importances.
from sklearn.ensemble import RandomForestClassifier

feat_labels = df_wine.columns[1:]
forest = RandomForestClassifier(n_estimators=10000, random_state=0, n_jobs=-1)
forest.fit(X_train, y_train)
importances = forest.feature_importances_
indices = np.argsort(importances)[::-1]  # feature indices, sorted by descending importance
for f in range(X_train.shape[1]):
    print("%2d) %-*s %f" % (f + 1, 30, feat_labels[indices[f]], importances[indices[f]]))

plt.subplot(2, 1, 2)
plt.title("Feature Importances")
plt.bar(range(X_train.shape[1]), importances[indices], color='lightblue', align='center')
plt.xticks(range(X_train.shape[1]), feat_labels[indices], rotation=90)
plt.xlim([-1, X_train.shape[1]])
plt.tight_layout()
plt.show()
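
For comparison, scikit-learn ships a built-in sequential selector that does the same kind of backward elimination. Below is a minimal sketch, assuming scikit-learn >= 0.24 (where SequentialFeatureSelector was added) and reusing the X_train_std, y_train, and df_wine defined above. Note one difference from the SBS class: it scores candidate subsets with cross-validation rather than a single validation split, so the selected features may differ.

from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.neighbors import KNeighborsClassifier

# Backward elimination down to 5 features, scored by 5-fold CV (the default).
knn = KNeighborsClassifier(n_neighbors=2)
sfs = SequentialFeatureSelector(knn, n_features_to_select=5, direction='backward')
sfs.fit(X_train_std, y_train)                  # reuses X_train_std / y_train from above
print(df_wine.columns[1:][sfs.get_support()])  # names of the selected features
X_train_sel = sfs.transform(X_train_std)       # training matrix reduced to 5 columns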
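
The random-forest importances can also drive the selection directly, instead of just being printed. A minimal sketch using scikit-learn's SelectFromModel on the forest fitted above; the 0.1 importance threshold is an illustrative assumption, not a recommendation:

from sklearn.feature_selection import SelectFromModel

# Keep only the features whose importance exceeds the chosen threshold.
sfm = SelectFromModel(forest, threshold=0.1, prefit=True)  # forest is already fitted above
X_train_reduced = sfm.transform(X_train)
print('Features above threshold:', X_train_reduced.shape[1])
print(feat_labels[sfm.get_support()])  # names of the retained features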