首页 > 解决方案 > Scikit-Learn Pipeline ValueError: Input contains NaN, infinity or a value too large for dtype('float64') when fitting model

问题描述

我想拟合一个机器学习模型。尽管我已经删除了NaN和无限值,但我的代码仍然会引发 ValueError: Input contains NaN, infinity or a value too large for dtype('float64') when fitting model 错误。

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
import pandas as pd
import numpy as np

# Locate and load the data file
df = pd.read_csv('C:/Users/User/Downloads/suicide_rates.csv')
print(f'#rows={len(df)} #columns={len(df.columns)}')

# Delete duplicate features
# NOTE(review): df.drop returns a COPY, so the fillna/replace calls on `df`
# further down never touch X — X still carries the original NaNs.
X = df.drop(['suicides/100k pop', 'country-year', 'suicides_no', 'population'], axis=1)
y = df['suicides/100k pop']

# NOTE(review): in the raw CSV this column is named ' gdp_for_year ($) '
# (with surrounding spaces) and holds comma-formatted strings, not numbers —
# presumably one of the reasons the numeric pipeline fails; verify against the file.
numeric_features = ['year', 'HDI for year', 'gdp_for_year ($)', 'gdp_per_capita ($)']
categorical_features = ['country', 'sex', 'age', 'generation']

# Transform numeric values
numeric_transformer = Pipeline(steps=[
    # NOTE(review): missing_values should be np.nan, not the string 'NaN';
    # with the string, actual np.nan cells are never imputed and the
    # downstream validation raises the ValueError shown in the traceback.
    ('imputer', SimpleImputer(missing_values='NaN', strategy='mean')),
    ('scaler', StandardScaler())
])
# Remove empty values
# NOTE(review): operates on df, not on X, so it has no effect on the fit below.
df.fillna(df.select_dtypes(include='number').mean(), inplace=True)
# Remove infinite values
# NOTE(review): turning inf into NaN AFTER fillna reintroduces NaN into df;
# the two steps are in the wrong order even for df itself.
df.replace([np.inf, -np.inf], np.nan, inplace=True)

# Transform nominal values
categorical_transformer = Pipeline(
steps=[
    ('onehot', OneHotEncoder())
])

# Preprocessing
preprocessor = ColumnTransformer(
transformers = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

clf = Pipeline(steps=[('preprocessor', preprocessor)])
X = clf.fit_transform(X)


> --------------------------------------------------------------------------- ValueError                                Traceback (most recent call
> last) <ipython-input-22-8e046ad092d2> in <module>
>       1 clf = Pipeline(steps=[('preprocessor', preprocessor)])
> ----> 2 X = clf.fit_transform(X)
> 
> ~\anaconda3\lib\site-packages\sklearn\pipeline.py in
> fit_transform(self, X, y, **fit_params)
>     385             fit_params_last_step = fit_params_steps[self.steps[-1][0]]
>     386             if hasattr(last_step, 'fit_transform'):
> --> 387                 return last_step.fit_transform(Xt, y, **fit_params_last_step)
>     388             else:
>     389                 return last_step.fit(Xt, y,
> 
> ~\anaconda3\lib\site-packages\sklearn\compose\_column_transformer.py
> in fit_transform(self, X, y)
>     506         self._validate_remainder(X)
>     507 
> --> 508         result = self._fit_transform(X, y, _fit_transform_one)
>     509 
>     510         if not result:
> 
> ~\anaconda3\lib\site-packages\sklearn\compose\_column_transformer.py
> in _fit_transform(self, X, y, func, fitted)
>     433             self._iter(fitted=fitted, replace_strings=True))
>     434         try:
> --> 435             return Parallel(n_jobs=self.n_jobs)(
>     436                 delayed(func)(
>     437                     transformer=clone(trans) if not fitted else trans,
> 
> ~\anaconda3\lib\site-packages\joblib\parallel.py in __call__(self,
> iterable)    1039             # remaining jobs.    1040            
> self._iterating = False
> -> 1041             if self.dispatch_one_batch(iterator):    1042                 self._iterating = self._original_iterator is not None    1043 
> 
> ~\anaconda3\lib\site-packages\joblib\parallel.py in
> dispatch_one_batch(self, iterator)
>     857                 return False
>     858             else:
> --> 859                 self._dispatch(tasks)
>     860                 return True
>     861 
> 
> ~\anaconda3\lib\site-packages\joblib\parallel.py in _dispatch(self,
> batch)
>     775         with self._lock:
>     776             job_idx = len(self._jobs)
> --> 777             job = self._backend.apply_async(batch, callback=cb)
>     778             # A job can complete so quickly than its callback is
>     779             # called before we get here, causing self._jobs to
> 
> ~\anaconda3\lib\site-packages\joblib\_parallel_backends.py in
> apply_async(self, func, callback)
>     206     def apply_async(self, func, callback=None):
>     207         """Schedule a func to be run"""
> --> 208         result = ImmediateResult(func)
>     209         if callback:
>     210             callback(result)
> 
> ~\anaconda3\lib\site-packages\joblib\_parallel_backends.py in
> __init__(self, batch)
>     570         # Don't delay the application, to avoid keeping the input
>     571         # arguments in memory
> --> 572         self.results = batch()
>     573 
>     574     def get(self):
> 
> ~\anaconda3\lib\site-packages\joblib\parallel.py in __call__(self)
>     260         # change the default number of processes to -1
>     261         with parallel_backend(self._backend, n_jobs=self._n_jobs):
> --> 262             return [func(*args, **kwargs)
>     263                     for func, args, kwargs in self.items]
>     264 
> 
> ~\anaconda3\lib\site-packages\joblib\parallel.py in <listcomp>(.0)
>     260         # change the default number of processes to -1
>     261         with parallel_backend(self._backend, n_jobs=self._n_jobs):
> --> 262             return [func(*args, **kwargs)
>     263                     for func, args, kwargs in self.items]
>     264 
> 
> ~\anaconda3\lib\site-packages\sklearn\utils\fixes.py in __call__(self,
> *args, **kwargs)
>     220     def __call__(self, *args, **kwargs):
>     221         with config_context(**self.config):
> --> 222             return self.function(*args, **kwargs)
> 
> ~\anaconda3\lib\site-packages\sklearn\pipeline.py in
> _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
>     752     with _print_elapsed_time(message_clsname, message):
>     753         if hasattr(transformer, 'fit_transform'):
> --> 754             res = transformer.fit_transform(X, y, **fit_params)
>     755         else:
>     756             res = transformer.fit(X, y, **fit_params).transform(X)
> 
> ~\anaconda3\lib\site-packages\sklearn\pipeline.py in
> fit_transform(self, X, y, **fit_params)
>     376         """
>     377         fit_params_steps = self._check_fit_params(**fit_params)
> --> 378         Xt = self._fit(X, y, **fit_params_steps)
>     379 
>     380         last_step = self._final_estimator
> 
> ~\anaconda3\lib\site-packages\sklearn\pipeline.py in _fit(self, X, y,
> **fit_params_steps)
>     301                 cloned_transformer = clone(transformer)
>     302             # Fit or load from cache the current transformer
> --> 303             X, fitted_transformer = fit_transform_one_cached(
>     304                 cloned_transformer, X, y, None,
>     305                 message_clsname='Pipeline',
> 
> ~\anaconda3\lib\site-packages\joblib\memory.py in __call__(self,
> *args, **kwargs)
>     350 
>     351     def __call__(self, *args, **kwargs):
> --> 352         return self.func(*args, **kwargs)
>     353 
>     354     def call_and_shelve(self, *args, **kwargs):
> 
> ~\anaconda3\lib\site-packages\sklearn\pipeline.py in
> _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
>     752     with _print_elapsed_time(message_clsname, message):
>     753         if hasattr(transformer, 'fit_transform'):
> --> 754             res = transformer.fit_transform(X, y, **fit_params)
>     755         else:
>     756             res = transformer.fit(X, y, **fit_params).transform(X)
> 
> ~\anaconda3\lib\site-packages\sklearn\base.py in fit_transform(self,
> X, y, **fit_params)
>     697         if y is None:
>     698             # fit method of arity 1 (unsupervised transformation)
> --> 699             return self.fit(X, **fit_params).transform(X)
>     700         else:
>     701             # fit method of arity 2 (supervised transformation)
> 
> ~\anaconda3\lib\site-packages\sklearn\impute\_base.py in fit(self, X,
> y)
>     286         self : SimpleImputer
>     287         """
> --> 288         X = self._validate_input(X, in_fit=True)
>     289 
>     290         # default fill_value is 0 for numerical input and "missing_value"
> 
> ~\anaconda3\lib\site-packages\sklearn\impute\_base.py in
> _validate_input(self, X, in_fit)
>     260                 raise new_ve from None
>     261             else:
> --> 262                 raise ve
>     263 
>     264         _check_inputs_dtype(X, self.missing_values)
> 
> ~\anaconda3\lib\site-packages\sklearn\impute\_base.py in
> _validate_input(self, X, in_fit)
>     250 
>     251         try:
> --> 252             X = self._validate_data(X, reset=in_fit,
>     253                                     accept_sparse='csc', dtype=dtype,
>     254                                     force_all_finite=force_all_finite,
> 
> ~\anaconda3\lib\site-packages\sklearn\base.py in _validate_data(self,
> X, y, reset, validate_separately, **check_params)
>     419             out = X
>     420         elif isinstance(y, str) and y == 'no_validation':
> --> 421             X = check_array(X, **check_params)
>     422             out = X
>     423         else:
> 
> ~\anaconda3\lib\site-packages\sklearn\utils\validation.py in
> inner_f(*args, **kwargs)
>      61             extra_args = len(args) - len(all_args)
>      62             if extra_args <= 0:
> ---> 63                 return f(*args, **kwargs)
>      64 
>      65             # extra_args > 0
> 
> ~\anaconda3\lib\site-packages\sklearn\utils\validation.py in
> check_array(array, accept_sparse, accept_large_sparse, dtype, order,
> copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples,
> ensure_min_features, estimator)
>     661 
>     662         if force_all_finite:
> --> 663             _assert_all_finite(array,
>     664                                allow_nan=force_all_finite == 'allow-nan')
>     665 
> 
> ~\anaconda3\lib\site-packages\sklearn\utils\validation.py in
> _assert_all_finite(X, allow_nan, msg_dtype)
>     101                 not allow_nan and not np.isfinite(X).all()):
>     102             type_err = 'infinity' if allow_nan else 'NaN, infinity'
> --> 103             raise ValueError(
>     104                     msg_err.format
>     105                     (type_err,
> 
> ValueError: Input contains NaN, infinity or a value too large for
> dtype('float64').

标签: python, pandas, numpy, machine-learning, scikit-learn

解决方案


如果您在代码中将 'gdp_for_year ($)' 列转换为数字类型,并把 SimpleImputer 的 missing_values='NaN' 替换为 missing_values=np.nan,代码应该就能按预期工作。

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
import pandas as pd
import numpy as np

# Load the data file
df = pd.read_csv('suicide_rates.csv')

# The raw CSV names this column ' gdp_for_year ($) ' (with surrounding
# spaces); normalize the name so the feature lists below can refer to it.
df.rename(columns={' gdp_for_year ($) ': 'gdp_for_year ($)'}, inplace=True)

# Cast 'gdp_for_year ($)' to numeric. The cells are strings with thousands
# separators (e.g. '2,156,624,900'). Use the vectorized .str accessor rather
# than .apply(lambda x: x.replace(...)): it is faster and, unlike calling
# .replace on a float, does not raise AttributeError if a cell is NaN.
df['gdp_for_year ($)'] = df['gdp_for_year ($)'].str.replace(',', '', regex=False).astype(float)

# Extract the features and target
X = df.drop(['suicides/100k pop', 'country-year', 'suicides_no', 'population'], axis=1)
y = df['suicides/100k pop']

numeric_features = ['year', 'HDI for year', 'gdp_for_year ($)', 'gdp_per_capita ($)']
categorical_features = ['country', 'sex', 'age', 'generation']

# Numeric features: mean-impute missing values, then standardize.
# sklearn expects np.nan here, not the string 'NaN'.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='mean')),
    ('scaler', StandardScaler())
])

# Nominal features: one-hot encode.
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder())
])

# Route each sub-pipeline to its own column subset.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

clf = Pipeline(steps=[('preprocessor', preprocessor)])
X_ = clf.fit_transform(X)

# Missing values before preprocessing
print(np.sum(X.isna().values))
# 19456

# Missing values after preprocessing (OneHotEncoder returns sparse output)
print(np.sum(np.isnan(X_.toarray())))
# 0

推荐阅读