首页 > 解决方案 > TF 预处理:分类值的插补类

问题描述

这里的目标是为浮点值和分类值设置插补层,我不想更改/预处理用作插补的数组类型。问题是,当从包含 np.nan 的 pandas df 读取数组时,我有一个数组,其中包含字符串和 nan,TF 将其解释为 float 并且在尝试创建张量时具有不匹配的 dtype。所以我想我可能需要 Lambda 的某种转换层。我不确定。

ENV 和数据

import numpy as np
from scipy.stats import mode
import tensorflow as tf
from tensorflow import keras
import pandas as pd

train = pd.read_csv('https://gist.githubusercontent.com/michhar/2dfd2de0d4f8727f873422c5d959fff5/raw/fa71405126017e6a37bea592440b4bee94bf7b9e/titanic.csv') 

有问题的数组:

cabin = train['Cabin'].values

数组([南,'C85',南,'C123',南,南,'E46',南,南,南,'G6','C103',南,南,南,南,南,南,南,南,南,'D56',南,'A6',南,南,南,'C23 C25 C27',南,南,南,'B78',南,南,南,南,南,南,南,南,南,南,南,南,南,南,南,南,南,南,南,南,'D33',南,'B30','C52',南,南,南,南,南,'B28','C83',南,南,南,'F33',南,南,南,南,南,南,南,南,'F G73',南,南,南,南,南,南,南,南,南,南,南,南,'C23 C25 C27' ..]

插补层

class Imputation(keras.layers.Layer):
    
    def adapt(self, data_sample):
        
      if data_sample.dtype in ('float32', 'float64'):
          self.means_ = np.nanmean(data_sample, axis=0, keepdims=0)
          print(self.means_)
      if data_sample.dtype == 'object':
          self.mode_ = mode(data_sample, axis=0)[0]
          print(self.mode_)
        
    def call(self, input):

      if input.dtype in ('float32', 'float64'):
        input = tf.where(tf.math.is_nan(input),
                         tf.constant(self.means_, dtype=tf.float32),
                         input)
          
      if input.dtype == tf.string:

        # HERE i need help
        input = tf.where(input == 'NaN',
                         tf.constant(self.mode_, dtype=tf.string),
                         input)
          
      return input

该层适用于例如整数的示例

imputed = Imputation()
imputed.adapt(train['Age'].values)
imputed(train['Age'].values)

该层与预处理的分类数组一起使用的示例(我不想做的事情)

imputed = Imputation()
imputed.adapt(train['Cabin'].values)
cabin2 = train['Cabin'].replace(np.nan, 'NaN').values
imputed(cabin2)

数组([b'C23 C25 C27',b'C85',b'C23 C25 C27',b'C123',b'C23 C25 C27',b'C23 C25 C27',b'E46',b'C23 C25 C27'、b'C23 C25 C27'、b'C23 C25 C27'、b'G6'、b'C103'、b'C23 C25 C27'、b'C23 C25 C27'、b'C23 C25 C27'、b' C23 C25 C27'、b'C23 C25 C27'、b'C23 C25 C27'、b'C23 C25 C27'、b'C23 C25 C27'、b'C23 C25 C27'、b'D56'、b'C23 C25 C27 ', b'A6', b'C23 C25 C27', b'C23 C25 C27', b'C23 C25 C27', b'C23 C25 C27', b'C23 C25 C27', b'C23 C25 C27', b 'C23 C25 C27'、b'B78'、b'C23 C25 C27'、b'C23 C25 C27'、b'C23 C25 C27'、b'C23 C25 C27'、b'C23 C25 C27'、b'C23 C25 C27', ...])

我有问题的案例:

imputed = Imputation()
imputed.adapt(train['Cabin'].values)
cabin = train['Cabin'].values
imputed(cabin)

错误:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-46-ce3b039382a7> in <module>()
      2 imputed.adapt(train['Cabin'].values)
      3 cabin = train['Cabin'].values
----> 4 imputed(cabin)

10 frames
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/base_layer.py in __call__(self, *args, **kwargs)
    931     # Accept NumPy and scalar inputs by converting to Tensors.
    932     if any(isinstance(x, (np.ndarray, float, int)) for x in input_list):
--> 933       inputs = nest.map_structure(_convert_numpy_or_python_types, inputs)
    934       input_list = nest.flatten(inputs)
    935 

/usr/local/lib/python3.6/dist-packages/tensorflow/python/util/nest.py in map_structure(func, *structure, **kwargs)
    633 
    634   return pack_sequence_as(
--> 635       structure[0], [func(*x) for x in entries],
    636       expand_composites=expand_composites)
    637 

/usr/local/lib/python3.6/dist-packages/tensorflow/python/util/nest.py in <listcomp>(.0)
    633 
    634   return pack_sequence_as(
--> 635       structure[0], [func(*x) for x in entries],
    636       expand_composites=expand_composites)
    637 

/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/base_layer.py in _convert_numpy_or_python_types(x)
   3237 def _convert_numpy_or_python_types(x):
   3238   if isinstance(x, (np.ndarray, float, int)):
-> 3239     return ops.convert_to_tensor_v2(x)
   3240   return x
   3241 

/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/ops.py in convert_to_tensor_v2(value, dtype, dtype_hint, name)
   1380       name=name,
   1381       preferred_dtype=dtype_hint,
-> 1382       as_ref=False)
   1383 
   1384 

/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/ops.py in convert_to_tensor(value, dtype, name, as_ref, preferred_dtype, dtype_hint, ctx, accepted_result_types)
   1497 
   1498     if ret is None:
-> 1499       ret = conversion_func(value, dtype=dtype, name=name, as_ref=as_ref)
   1500 
   1501     if ret is NotImplemented:

/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/tensor_conversion_registry.py in _default_conversion_function(***failed resolving arguments***)
     50 def _default_conversion_function(value, dtype, name, as_ref):
     51   del as_ref  # Unused.
---> 52   return constant_op.constant(value, dtype, name=name)
     53 
     54 

/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/constant_op.py in constant(value, dtype, shape, name)
    262   """
    263   return _constant_impl(value, dtype, shape, name, verify_shape=False,
--> 264                         allow_broadcast=True)
    265 
    266 

/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/constant_op.py in _constant_impl(value, dtype, shape, name, verify_shape, allow_broadcast)
    273       with trace.Trace("tf.constant"):
    274         return _constant_eager_impl(ctx, value, dtype, shape, verify_shape)
--> 275     return _constant_eager_impl(ctx, value, dtype, shape, verify_shape)
    276 
    277   g = ops.get_default_graph()

/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/constant_op.py in _constant_eager_impl(ctx, value, dtype, shape, verify_shape)
    298 def _constant_eager_impl(ctx, value, dtype, shape, verify_shape):
    299   """Implementation of eager constant."""
--> 300   t = convert_to_eager_tensor(value, ctx, dtype)
    301   if shape is None:
    302     return t

/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/constant_op.py in convert_to_eager_tensor(value, ctx, dtype)
     96       dtype = dtypes.as_dtype(dtype).as_datatype_enum
     97   ctx.ensure_initialized()
---> 98   return ops.EagerTensor(value, ctx.device_name, dtype)
     99 
    100 

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type float).

标签: pythonpython-3.xnumpytensorflowtensorflow2.0

解决方案


嗨,我认为这是因为 pandas 在一个列系列中支持不同的类型,并将“object”dtype 分配给列。

如果您想要更简化的管道,请尝试使用 tensorflow csv loader tf.data.experimental.make_csv_dataset(根据我的经验,它的实验性比 pandas read_csv 方法慢)这将使数据集符合张量。

空字符串将是 b'',因此您可以轻松地在 tf.where 中检查它们,就像这样

 input = tf.where(input == b'',
                  tf.constant(self.mode_, dtype=tf.string),
                  input)

推荐阅读