python - Scikit-Learn Pipeline ValueError: Input contains NaN, infinity or a value too large for dtype('float64') when fitting model
问题描述
我想拟合一个机器学习模型。尽管已经删除了 NaN 和无限值,但我的代码在拟合模型时仍然会引发 ValueError: Input contains NaN, infinity or a value too large for dtype('float64') 错误。
# Question code: build a ColumnTransformer pipeline (impute + scale numerics,
# one-hot categoricals) for the suicide-rates dataset, then fit_transform it.
# NOTE(review): this is the failing code under discussion; its defects are
# annotated below rather than fixed, because the answer section explains them.
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
import pandas as pd
import numpy as np
# Locate and load the data file
df = pd.read_csv('C:/Users/User/Downloads/suicide_rates.csv')
print(f'#rows={len(df)} #columns={len(df.columns)}')
# Delete duplicate features
# NOTE(review): df.drop returns a NEW frame, so the in-place cleanup applied
# to df further below never reaches this X.
X = df.drop(['suicides/100k pop', 'country-year', 'suicides_no', 'population'], axis=1)
y = df['suicides/100k pop']
# NOTE(review): per the accepted answer, the raw CSV header is
# ' gdp_for_year ($) ' (stray spaces) and its values contain thousands
# separators, so this column is not numeric as listed here — confirm
# against the actual file.
numeric_features = ['year', 'HDI for year', 'gdp_for_year ($)', 'gdp_per_capita ($)']
categorical_features = ['country', 'sex', 'age', 'generation']
# Transform numeric values
numeric_transformer = Pipeline(steps=[
# BUG: missing_values='NaN' is the literal string 'NaN', which never matches
# float NaN in numeric data — it must be np.nan (see the solution below).
('imputer', SimpleImputer(missing_values='NaN', strategy='mean')),
('scaler', StandardScaler())
])
# Remove empty values
# NOTE(review): runs AFTER X was extracted, so X keeps its NaNs; also only
# numeric columns get a fill value here.
df.fillna(df.select_dtypes(include='number').mean(), inplace=True)
# Remove infinite values
# NOTE(review): ordering bug — infs are turned into NaN only after fillna
# already ran, so these NaNs are never imputed.
df.replace([np.inf, -np.inf], np.nan, inplace=True)
# Transform nominal values
categorical_transformer = Pipeline(
steps=[
('onehot', OneHotEncoder())
])
# Preprocessing
preprocessor = ColumnTransformer(
transformers = [
('num', numeric_transformer, numeric_features),
('cat', categorical_transformer, categorical_features)
])
clf = Pipeline(steps=[('preprocessor', preprocessor)])
# Raises ValueError here (traceback quoted below).
X = clf.fit_transform(X)
> --------------------------------------------------------------------------- ValueError Traceback (most recent call
> last) <ipython-input-22-8e046ad092d2> in <module>
> 1 clf = Pipeline(steps=[('preprocessor', preprocessor)])
> ----> 2 X = clf.fit_transform(X)
>
> ~\anaconda3\lib\site-packages\sklearn\pipeline.py in
> fit_transform(self, X, y, **fit_params)
> 385 fit_params_last_step = fit_params_steps[self.steps[-1][0]]
> 386 if hasattr(last_step, 'fit_transform'):
> --> 387 return last_step.fit_transform(Xt, y, **fit_params_last_step)
> 388 else:
> 389 return last_step.fit(Xt, y,
>
> ~\anaconda3\lib\site-packages\sklearn\compose\_column_transformer.py
> in fit_transform(self, X, y)
> 506 self._validate_remainder(X)
> 507
> --> 508 result = self._fit_transform(X, y, _fit_transform_one)
> 509
> 510 if not result:
>
> ~\anaconda3\lib\site-packages\sklearn\compose\_column_transformer.py
> in _fit_transform(self, X, y, func, fitted)
> 433 self._iter(fitted=fitted, replace_strings=True))
> 434 try:
> --> 435 return Parallel(n_jobs=self.n_jobs)(
> 436 delayed(func)(
> 437 transformer=clone(trans) if not fitted else trans,
>
> ~\anaconda3\lib\site-packages\joblib\parallel.py in __call__(self,
> iterable) 1039 # remaining jobs. 1040
> self._iterating = False
> -> 1041 if self.dispatch_one_batch(iterator): 1042 self._iterating = self._original_iterator is not None 1043
>
> ~\anaconda3\lib\site-packages\joblib\parallel.py in
> dispatch_one_batch(self, iterator)
> 857 return False
> 858 else:
> --> 859 self._dispatch(tasks)
> 860 return True
> 861
>
> ~\anaconda3\lib\site-packages\joblib\parallel.py in _dispatch(self,
> batch)
> 775 with self._lock:
> 776 job_idx = len(self._jobs)
> --> 777 job = self._backend.apply_async(batch, callback=cb)
> 778 # A job can complete so quickly than its callback is
> 779 # called before we get here, causing self._jobs to
>
> ~\anaconda3\lib\site-packages\joblib\_parallel_backends.py in
> apply_async(self, func, callback)
> 206 def apply_async(self, func, callback=None):
> 207 """Schedule a func to be run"""
> --> 208 result = ImmediateResult(func)
> 209 if callback:
> 210 callback(result)
>
> ~\anaconda3\lib\site-packages\joblib\_parallel_backends.py in
> __init__(self, batch)
> 570 # Don't delay the application, to avoid keeping the input
> 571 # arguments in memory
> --> 572 self.results = batch()
> 573
> 574 def get(self):
>
> ~\anaconda3\lib\site-packages\joblib\parallel.py in __call__(self)
> 260 # change the default number of processes to -1
> 261 with parallel_backend(self._backend, n_jobs=self._n_jobs):
> --> 262 return [func(*args, **kwargs)
> 263 for func, args, kwargs in self.items]
> 264
>
> ~\anaconda3\lib\site-packages\joblib\parallel.py in <listcomp>(.0)
> 260 # change the default number of processes to -1
> 261 with parallel_backend(self._backend, n_jobs=self._n_jobs):
> --> 262 return [func(*args, **kwargs)
> 263 for func, args, kwargs in self.items]
> 264
>
> ~\anaconda3\lib\site-packages\sklearn\utils\fixes.py in __call__(self,
> *args, **kwargs)
> 220 def __call__(self, *args, **kwargs):
> 221 with config_context(**self.config):
> --> 222 return self.function(*args, **kwargs)
>
> ~\anaconda3\lib\site-packages\sklearn\pipeline.py in
> _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
> 752 with _print_elapsed_time(message_clsname, message):
> 753 if hasattr(transformer, 'fit_transform'):
> --> 754 res = transformer.fit_transform(X, y, **fit_params)
> 755 else:
> 756 res = transformer.fit(X, y, **fit_params).transform(X)
>
> ~\anaconda3\lib\site-packages\sklearn\pipeline.py in
> fit_transform(self, X, y, **fit_params)
> 376 """
> 377 fit_params_steps = self._check_fit_params(**fit_params)
> --> 378 Xt = self._fit(X, y, **fit_params_steps)
> 379
> 380 last_step = self._final_estimator
>
> ~\anaconda3\lib\site-packages\sklearn\pipeline.py in _fit(self, X, y,
> **fit_params_steps)
> 301 cloned_transformer = clone(transformer)
> 302 # Fit or load from cache the current transformer
> --> 303 X, fitted_transformer = fit_transform_one_cached(
> 304 cloned_transformer, X, y, None,
> 305 message_clsname='Pipeline',
>
> ~\anaconda3\lib\site-packages\joblib\memory.py in __call__(self,
> *args, **kwargs)
> 350
> 351 def __call__(self, *args, **kwargs):
> --> 352 return self.func(*args, **kwargs)
> 353
> 354 def call_and_shelve(self, *args, **kwargs):
>
> ~\anaconda3\lib\site-packages\sklearn\pipeline.py in
> _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
> 752 with _print_elapsed_time(message_clsname, message):
> 753 if hasattr(transformer, 'fit_transform'):
> --> 754 res = transformer.fit_transform(X, y, **fit_params)
> 755 else:
> 756 res = transformer.fit(X, y, **fit_params).transform(X)
>
> ~\anaconda3\lib\site-packages\sklearn\base.py in fit_transform(self,
> X, y, **fit_params)
> 697 if y is None:
> 698 # fit method of arity 1 (unsupervised transformation)
> --> 699 return self.fit(X, **fit_params).transform(X)
> 700 else:
> 701 # fit method of arity 2 (supervised transformation)
>
> ~\anaconda3\lib\site-packages\sklearn\impute\_base.py in fit(self, X,
> y)
> 286 self : SimpleImputer
> 287 """
> --> 288 X = self._validate_input(X, in_fit=True)
> 289
> 290 # default fill_value is 0 for numerical input and "missing_value"
>
> ~\anaconda3\lib\site-packages\sklearn\impute\_base.py in
> _validate_input(self, X, in_fit)
> 260 raise new_ve from None
> 261 else:
> --> 262 raise ve
> 263
> 264 _check_inputs_dtype(X, self.missing_values)
>
> ~\anaconda3\lib\site-packages\sklearn\impute\_base.py in
> _validate_input(self, X, in_fit)
> 250
> 251 try:
> --> 252 X = self._validate_data(X, reset=in_fit,
> 253 accept_sparse='csc', dtype=dtype,
> 254 force_all_finite=force_all_finite,
>
> ~\anaconda3\lib\site-packages\sklearn\base.py in _validate_data(self,
> X, y, reset, validate_separately, **check_params)
> 419 out = X
> 420 elif isinstance(y, str) and y == 'no_validation':
> --> 421 X = check_array(X, **check_params)
> 422 out = X
> 423 else:
>
> ~\anaconda3\lib\site-packages\sklearn\utils\validation.py in
> inner_f(*args, **kwargs)
> 61 extra_args = len(args) - len(all_args)
> 62 if extra_args <= 0:
> ---> 63 return f(*args, **kwargs)
> 64
> 65 # extra_args > 0
>
> ~\anaconda3\lib\site-packages\sklearn\utils\validation.py in
> check_array(array, accept_sparse, accept_large_sparse, dtype, order,
> copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples,
> ensure_min_features, estimator)
> 661
> 662 if force_all_finite:
> --> 663 _assert_all_finite(array,
> 664 allow_nan=force_all_finite == 'allow-nan')
> 665
>
> ~\anaconda3\lib\site-packages\sklearn\utils\validation.py in
> _assert_all_finite(X, allow_nan, msg_dtype)
> 101 not allow_nan and not np.isfinite(X).all()):
> 102 type_err = 'infinity' if allow_nan else 'NaN, infinity'
> --> 103 raise ValueError(
> 104 msg_err.format
> 105 (type_err,
>
> ValueError: Input contains NaN, infinity or a value too large for
> dtype('float64').
解决方案
如果您在代码中将 'gdp_for_year ($)' 转换为数字,并将 SimpleImputer 的 missing_values='NaN' 替换为 missing_values=np.nan,它应该会按预期工作。
# Solution: cast the comma-formatted GDP column to float and give SimpleImputer
# np.nan (not the string 'NaN') as its missing-value marker.
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
import pandas as pd
import numpy as np

# Read the dataset.
df = pd.read_csv('suicide_rates.csv')
# The raw CSV header carries stray spaces around the GDP column; normalize it.
df.rename(columns={' gdp_for_year ($) ': 'gdp_for_year ($)'}, inplace=True)
# Strip the thousands separators so the column can become numeric.
df['gdp_for_year ($)'] = df['gdp_for_year ($)'].map(lambda s: s.replace(',', '')).astype(float)

# Target and feature matrix (drop target-derived / duplicate columns).
y = df['suicides/100k pop']
X = df.drop(['suicides/100k pop', 'country-year', 'suicides_no', 'population'], axis=1)

numeric_features = ['year', 'HDI for year', 'gdp_for_year ($)', 'gdp_per_capita ($)']
categorical_features = ['country', 'sex', 'age', 'generation']

# Numeric columns: mean-impute real NaNs, then standardize.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='mean')),
    ('scaler', StandardScaler()),
])
# Categorical columns: one-hot encode.
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder()),
])

# Combine both branches into a single preprocessing pipeline and run it.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])
clf = Pipeline(steps=[('preprocessor', preprocessor)])
X_ = clf.fit_transform(X)

# Missing values before preprocessing
print(np.sum(X.isna().values))
# 19456
# Missing values after preprocessing
print(np.sum(np.isnan(X_.toarray())))
# 0
推荐阅读
- angular - 如何正确使用 mat-sidenav?
- recursion - Could someone explain this erlang function to round a number to a specific amount of decimal places?
- amazon-s3 - 无法从 S3 存储桶加载 Amazon Neptune 中的数据
- android - Android Studio 应用程序安装问题
- java - 使用 Java 11 和 Cassandra Spring Unit 可能与 lombok 和 gradle 一起使用时 Eclipse 出现问题
- javascript - 如何在网页中实现自定义 Tensorflow.js 模型?
- python - 如何处理批处理脚本中的退出代码?
- sql - 在 sqlite (Swift) 中选择 count(col_name) 不起作用
- c# - 选项卡式页面未显示,但我收到 0 个错误
- kubernetes - Traefik v2 IngressRoute CRD 到非 docker 服务