python - 用于交叉验证时的 Python 管道错误
问题描述
我有一个无法解释也无法弄清楚的复杂错误。显然,我的预处理管道在简单地拟合模型时有效,但在我尝试交叉验证时失败。我无法解读错误并且不理解问题。请帮忙。
预处理
我创建了一个对数据执行一些预处理任务的管道。有用。包括一些客户变压器。这是代码。
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
class column_selector(BaseEstimator,TransformerMixin):
def __init__(self, columns: list):
self.cols = columns
def fit(self,X,y=None):
return self
def transform(self, X, y=None):
return X.loc[:, self.cols]
class dummy_creator(BaseEstimator,TransformerMixin):
def __init__(self):
pass
def fit(self, X, y=None):
# stateless transformer
return self
def transform(self, X):
X_categorical_scaled_df = pd.get_dummies(X)
return X_categorical_scaled_df
class DFStandardScaler(BaseEstimator,TransformerMixin):
def __init__(self):
self.ss = None
def fit(self,X,y=None):
self.ss = StandardScaler().fit(X)
return self
def transform(self, X):
Xss = self.ss.transform(X)
X_continuous_scaled_df = pd.DataFrame(Xss, index=X.index, columns=X.columns)
return X_continuous_scaled_df
pipeline_categorical = Pipeline(steps = [
('column_selector', column_selector(categorical_features)),
('create_dummies', dummy_creator())
])
pipeline_continuous = Pipeline(steps = [
('column_selector', column_selector(numeric_features)),
('scaler',DFStandardScaler())
])
feature_union = FeatureUnion([('cat', pipeline_categorical),
('cont', pipeline_continuous)])
如果我fit_transform
使用管道,我会得到很好的结果:
X_train_enc = feature_union.fit_transform(X_train)
X_train_enc
>>>array([[ 0. , 1. , 0. , ..., -0.05977797,
-0.21011127, -0.24460191],
[ 1. , 0. , 0. , ..., -0.68765273,
-0.00946558, -0.82457039],
[ 0. , 1. , 0. , ..., -1.06122696,
没有交叉验证的模型
如果我现在使用上述预处理管道和模型(在这种情况下为线性回归)制作管道,我仍然会得到很好的结果(只是下面显示的预测以指示正确预处理和模型拟合的数据):
from sklearn.pipeline import make_pipeline
pipe = make_pipeline(feature_union, LinearRegression())
pipe.fit(X_train, y_train)
pipe.predict(X_validation)
>>>array([ 9.17773438, 9.38226318, 8.35693359, 10.62176514, 11.29095459,
7.45025635, 6.03497314, 10.04321289, 10.57568359, 9.86663818,
7.01202393, 8.08374023, 8.80700684, 10.80102539, 12.32678223,
6.7588501 , 10.44604492, 6.86547852, 9.20465088, 9.04406738,
具有交叉验证的模型
现在我尝试使用交叉验证来测试相同的模型。您会注意到我将管道放入列表(`pipelines')中,并在循环中进行交叉验证。这是因为我打算使用不同的模型创建与此类似的管道列表并遍历它们,但这超出了我的问题范围(但以防万一你想知道我为什么要这样编码)
seed = 7
pipelines = []
pipelines.append(('ScaledLR',Pipeline([('Preprocess', feature_union),('LR', LinearRegression())])))
results=[]
names=[]
scoring='neg_mean_squared_error'
for name, model in pipelines:
kfold=KFold(n_splits=10, random_state=7)
cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
results.append(cv_results)
names.append(name)
print("%s %f (%r)" % (name, cv_results.mean(), cv_results.std()))
我收到以下错误:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
~\Anaconda2\envs\py36\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
2655 try:
-> 2656 return self._engine.get_loc(key)
2657 except KeyError:
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: None
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
<ipython-input-70-1aa4a50ac843> in <module>
29
30 kfold=KFold(n_splits=10, random_state=7)
---> 31 cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
32 results.append(cv_results)
33 names.append(name)
~\Anaconda2\envs\py36\lib\site-packages\sklearn\model_selection\_validation.py in cross_val_score(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, error_score)
400 fit_params=fit_params,
401 pre_dispatch=pre_dispatch,
--> 402 error_score=error_score)
403 return cv_results['test_score']
404
~\Anaconda2\envs\py36\lib\site-packages\sklearn\model_selection\_validation.py in cross_validate(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, return_train_score, return_estimator, error_score)
238 return_times=True, return_estimator=return_estimator,
239 error_score=error_score)
--> 240 for train, test in cv.split(X, y, groups))
241
242 zipped_scores = list(zip(*scores))
~\Anaconda2\envs\py36\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self, iterable)
915 # remaining jobs.
916 self._iterating = False
--> 917 if self.dispatch_one_batch(iterator):
918 self._iterating = self._original_iterator is not None
919
~\Anaconda2\envs\py36\lib\site-packages\sklearn\externals\joblib\parallel.py in dispatch_one_batch(self, iterator)
757 return False
758 else:
--> 759 self._dispatch(tasks)
760 return True
761
~\Anaconda2\envs\py36\lib\site-packages\sklearn\externals\joblib\parallel.py in _dispatch(self, batch)
714 with self._lock:
715 job_idx = len(self._jobs)
--> 716 job = self._backend.apply_async(batch, callback=cb)
717 # A job can complete so quickly than its callback is
718 # called before we get here, causing self._jobs to
~\Anaconda2\envs\py36\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in apply_async(self, func, callback)
180 def apply_async(self, func, callback=None):
181 """Schedule a func to be run"""
--> 182 result = ImmediateResult(func)
183 if callback:
184 callback(result)
~\Anaconda2\envs\py36\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in __init__(self, batch)
547 # Don't delay the application, to avoid keeping the input
548 # arguments in memory
--> 549 self.results = batch()
550
551 def get(self):
~\Anaconda2\envs\py36\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~\Anaconda2\envs\py36\lib\site-packages\sklearn\externals\joblib\parallel.py in <listcomp>(.0)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~\Anaconda2\envs\py36\lib\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)
526 estimator.fit(X_train, **fit_params)
527 else:
--> 528 estimator.fit(X_train, y_train, **fit_params)
529
530 except Exception as e:
~\Anaconda2\envs\py36\lib\site-packages\sklearn\pipeline.py in fit(self, X, y, **fit_params)
263 This estimator
264 """
--> 265 Xt, fit_params = self._fit(X, y, **fit_params)
266 if self._final_estimator is not None:
267 self._final_estimator.fit(Xt, y, **fit_params)
~\Anaconda2\envs\py36\lib\site-packages\sklearn\pipeline.py in _fit(self, X, y, **fit_params)
228 Xt, fitted_transformer = fit_transform_one_cached(
229 cloned_transformer, Xt, y, None,
--> 230 **fit_params_steps[name])
231 # Replace the transformer of the step with the fitted
232 # transformer. This is necessary when loading the transformer
~\Anaconda2\envs\py36\lib\site-packages\sklearn\externals\joblib\memory.py in __call__(self, *args, **kwargs)
340
341 def __call__(self, *args, **kwargs):
--> 342 return self.func(*args, **kwargs)
343
344 def call_and_shelve(self, *args, **kwargs):
~\Anaconda2\envs\py36\lib\site-packages\sklearn\pipeline.py in _fit_transform_one(transformer, X, y, weight, **fit_params)
612 def _fit_transform_one(transformer, X, y, weight, **fit_params):
613 if hasattr(transformer, 'fit_transform'):
--> 614 res = transformer.fit_transform(X, y, **fit_params)
615 else:
616 res = transformer.fit(X, y, **fit_params).transform(X)
~\Anaconda2\envs\py36\lib\site-packages\sklearn\pipeline.py in fit_transform(self, X, y, **fit_params)
791 delayed(_fit_transform_one)(trans, X, y, weight,
792 **fit_params)
--> 793 for name, trans, weight in self._iter())
794
795 if not result:
~\Anaconda2\envs\py36\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self, iterable)
915 # remaining jobs.
916 self._iterating = False
--> 917 if self.dispatch_one_batch(iterator):
918 self._iterating = self._original_iterator is not None
919
~\Anaconda2\envs\py36\lib\site-packages\sklearn\externals\joblib\parallel.py in dispatch_one_batch(self, iterator)
757 return False
758 else:
--> 759 self._dispatch(tasks)
760 return True
761
~\Anaconda2\envs\py36\lib\site-packages\sklearn\externals\joblib\parallel.py in _dispatch(self, batch)
714 with self._lock:
715 job_idx = len(self._jobs)
--> 716 job = self._backend.apply_async(batch, callback=cb)
717 # A job can complete so quickly than its callback is
718 # called before we get here, causing self._jobs to
~\Anaconda2\envs\py36\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in apply_async(self, func, callback)
180 def apply_async(self, func, callback=None):
181 """Schedule a func to be run"""
--> 182 result = ImmediateResult(func)
183 if callback:
184 callback(result)
~\Anaconda2\envs\py36\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in __init__(self, batch)
547 # Don't delay the application, to avoid keeping the input
548 # arguments in memory
--> 549 self.results = batch()
550
551 def get(self):
~\Anaconda2\envs\py36\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~\Anaconda2\envs\py36\lib\site-packages\sklearn\externals\joblib\parallel.py in <listcomp>(.0)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~\Anaconda2\envs\py36\lib\site-packages\sklearn\pipeline.py in _fit_transform_one(transformer, X, y, weight, **fit_params)
612 def _fit_transform_one(transformer, X, y, weight, **fit_params):
613 if hasattr(transformer, 'fit_transform'):
--> 614 res = transformer.fit_transform(X, y, **fit_params)
615 else:
616 res = transformer.fit(X, y, **fit_params).transform(X)
~\Anaconda2\envs\py36\lib\site-packages\sklearn\pipeline.py in fit_transform(self, X, y, **fit_params)
296 """
297 last_step = self._final_estimator
--> 298 Xt, fit_params = self._fit(X, y, **fit_params)
299 if hasattr(last_step, 'fit_transform'):
300 return last_step.fit_transform(Xt, y, **fit_params)
~\Anaconda2\envs\py36\lib\site-packages\sklearn\pipeline.py in _fit(self, X, y, **fit_params)
228 Xt, fitted_transformer = fit_transform_one_cached(
229 cloned_transformer, Xt, y, None,
--> 230 **fit_params_steps[name])
231 # Replace the transformer of the step with the fitted
232 # transformer. This is necessary when loading the transformer
~\Anaconda2\envs\py36\lib\site-packages\sklearn\externals\joblib\memory.py in __call__(self, *args, **kwargs)
340
341 def __call__(self, *args, **kwargs):
--> 342 return self.func(*args, **kwargs)
343
344 def call_and_shelve(self, *args, **kwargs):
~\Anaconda2\envs\py36\lib\site-packages\sklearn\pipeline.py in _fit_transform_one(transformer, X, y, weight, **fit_params)
612 def _fit_transform_one(transformer, X, y, weight, **fit_params):
613 if hasattr(transformer, 'fit_transform'):
--> 614 res = transformer.fit_transform(X, y, **fit_params)
615 else:
616 res = transformer.fit(X, y, **fit_params).transform(X)
~\Anaconda2\envs\py36\lib\site-packages\sklearn\base.py in fit_transform(self, X, y, **fit_params)
463 else:
464 # fit method of arity 2 (supervised transformation)
--> 465 return self.fit(X, y, **fit_params).transform(X)
466
467
<ipython-input-24-666c2228e73d> in transform(self, X, y)
13
14 def transform(self, X, y=None):
---> 15 return X.loc[:, self.cols]
16
17
~\Anaconda2\envs\py36\lib\site-packages\pandas\core\indexing.py in __getitem__(self, key)
1492 except (KeyError, IndexError, AttributeError):
1493 pass
-> 1494 return self._getitem_tuple(key)
1495 else:
1496 # we by definition only have the 0th axis
~\Anaconda2\envs\py36\lib\site-packages\pandas\core\indexing.py in _getitem_tuple(self, tup)
866 def _getitem_tuple(self, tup):
867 try:
--> 868 return self._getitem_lowerdim(tup)
869 except IndexingError:
870 pass
~\Anaconda2\envs\py36\lib\site-packages\pandas\core\indexing.py in _getitem_lowerdim(self, tup)
986 for i, key in enumerate(tup):
987 if is_label_like(key) or isinstance(key, tuple):
--> 988 section = self._getitem_axis(key, axis=i)
989
990 # we have yielded a scalar ?
~\Anaconda2\envs\py36\lib\site-packages\pandas\core\indexing.py in _getitem_axis(self, key, axis)
1911 # fall thru to straight lookup
1912 self._validate_key(key, axis)
-> 1913 return self._get_label(key, axis=axis)
1914
1915
~\Anaconda2\envs\py36\lib\site-packages\pandas\core\indexing.py in _get_label(self, label, axis)
139 raise IndexingError('no slices here, handle elsewhere')
140
--> 141 return self.obj._xs(label, axis=axis)
142
143 def _get_loc(self, key, axis=None):
~\Anaconda2\envs\py36\lib\site-packages\pandas\core\generic.py in xs(self, key, axis, level, drop_level)
3574
3575 if axis == 1:
-> 3576 return self[key]
3577
3578 self._consolidate_inplace()
~\Anaconda2\envs\py36\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
2925 if self.columns.nlevels > 1:
2926 return self._getitem_multilevel(key)
-> 2927 indexer = self.columns.get_loc(key)
2928 if is_integer(indexer):
2929 indexer = [indexer]
~\Anaconda2\envs\py36\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
2656 return self._engine.get_loc(key)
2657 except KeyError:
-> 2658 return self._engine.get_loc(self._maybe_cast_indexer(key))
2659 indexer = self.get_indexer([key], method=method, tolerance=tolerance)
2660 if indexer.ndim > 1 or indexer.size > 1:
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: None
该错误似乎位于预处理功能联合中 - 但我不确定确切的位置或原因。我相信它可能在 pd.get_dummies() 函数周围的 create_dummies 类中,但不确定。
有人可以建议发生了什么吗?
解决方案
我知道这是一个老问题,但是当我遇到同样的问题时,它是第一个出现的问题(代码在没有交叉验证的情况下工作并且失败了)。
失败的原因是您的自定义估算器的签名column_selector
与属性不匹配。来自关于开发估算器的sklearn文档:
此外,__init__ 接受的每个关键字参数都应该对应于实例上的一个属性。在进行模型选择时,Scikit-learn 依靠此来找到要在估计器上设置的相关属性。
如果您将其更改为,它应该可以正常工作:
class column_selector(BaseEstimator,TransformerMixin):
def __init__(self, columns: list):
self.columns = columns
def fit(self,X,y=None):
return self
def transform(self, X, y=None):
return X.loc[:, self.columns]
推荐阅读
- shell - iSeries 上的 shell 解释器路径是什么?
- android - Android 8 nullpointer 异常与片段中的函数调用(仅在某些设备上!!!)
- c# - 使用 C# 和 ASP.NET MVC 在 OneDrive 上读取 Excel (xlsx) 文件
- html - 当列表项在 d-flex 中向左浮动时对齐列表中心
- java - 春季批量回滚所有步骤以防出现异常
- javascript - 单击“加载更多”按钮时如何找到下一个页面链接?
- javascript - JQuery / Laravel - 启用输入文本
- reactjs - 显示索引目录列表而不是应用程序
- html - 始终定位底部页脚
- java - Android Studio 上带有 mXparser.jar 的 DexArchiveBuilderException