python - 是否可以将每行都包含一个 numpy 数组的数据帧传递给 TF/Keras？

问题描述

我有一个可以正常运行的回归模型，但为了改善结果，我想添加一个 numpy 数组（它代表我在应用程序之外预处理好的用户属性）。

这是我的数据示例：

MPG Cylinders   Displacement    Horsepower  Weight  Acceleration    Model Year  Origin  NumpyColumn
0   18.0    8   307.0   130.0   3504.0  12.0    70  1   [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
1   15.0    8   350.0   165.0   3693.0  11.5    70  1   [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2   18.0    8   318.0   150.0   3436.0  11.0    70  1   [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
3   16.0    8   304.0   150.0   3433.0  12.0    70  1   [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
4   17.0    8   302.0   140.0   3449.0  10.5    70  1   [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
... ... ... ... ... ... ... ... ... ...
393 27.0    4   140.0   86.0    2790.0  15.6    82  1   [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
394 44.0    4   97.0    52.0    2130.0  24.6    82  2   [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
395 32.0    4   135.0   84.0    2295.0  11.6    82  1   [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
396 28.0    4   120.0   79.0    2625.0  18.6    82  1   [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
397 31.0    4   119.0   82.0    2720.0  19.4    82  1   [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...

以下是它的生成方法：

import numpy as np
import pandas as pd
import scipy.sparse as sparse

# Fetch the Auto MPG dataset from the UCI repository.
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'
column_names = [
    'MPG', 'Cylinders', 'Displacement', 'Horsepower',
    'Weight', 'Acceleration', 'Model Year', 'Origin',
]
# The file has no header row, so the names are supplied explicitly.
# '?' is read as a missing value and a tab starts a trailing comment.
df = pd.read_csv(
    url,
    names=column_names,
    na_values='?',
    comment='\t',
    sep=' ',
    skipinitialspace=True,
)

lenOfDF = len(df)

# Attach one list per row: a square (lenOfDF x lenOfDF) matrix that is zero
# everywhere except for ones at positions (0, 1), (1, 2) and (2, 0).
arr = sparse.coo_matrix(
    ([1, 1, 1], ([0, 1, 2], [1, 2, 0])),
    shape=(lenOfDF, lenOfDF),
)
df['NumpyColumn'] = arr.toarray().tolist()

然后我的模型是这样的：

# Fully connected network built with the functional API: a stack of
# Dense + ReLU pairs narrowing from 512 to 32 units, then a single output unit.
# NOTE(review): Input, Dense, Activation, Model, Xtrain and opt are not defined
# in this snippet — presumably Keras names and objects from the asker's
# surrounding code; confirm.
# The input width is taken from the number of columns of Xtrain.
g_input = Input(shape=[Xtrain.shape[1]])
H1 = Dense(512)(g_input)
H1r = Activation('relu')(H1)
H2 = Dense(256)(H1r)
H2r = Activation('relu')(H2)
H3 = Dense(256)(H2r)
H3r = Activation('relu')(H3)
H4 = Dense(128)(H3r)
H4r = Activation('relu')(H4)
H5 = Dense(128)(H4r)

H5r = Activation('relu')(H5)
H6 = Dense(64)(H5r)
H6r = Activation('relu')(H6)
H7 = Dense(32)(H6r)
Hr = Activation('relu')(H7)
# Single output unit with no activation argument.
g_V = Dense(1)(Hr)

generator = Model(g_input,g_V)
# NOTE(review): the question describes a regression, yet the loss is
# 'binary_crossentropy' on a Dense(1) output created without an activation —
# a regression loss may have been intended; confirm with the author.
generator.compile(loss='binary_crossentropy', optimizer=opt)

当我使用带有 NumpyColumn 的数据集调用它时（x_batch 只是上述数据帧经过拆分和缩放后的数据集，其中的 numpy 数组列未经处理直接传入，因此保持不变），我收到以下错误：

# Alternative call left commented out by the author: pass the batch as-is.
# generated = generator.predict(x_batch)
# Active call: convert the batch to a tensor before predicting with the generator.
generated = generator.predict(tf.convert_to_tensor(x_batch))

错误：

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type float).

我在这里做错了什么？我的想法是拥有一个数组可以提供模型信息以做出更好的预测，所以我正在尝试对其进行测试。是否可以将 numpy 数组添加到数据框中进行训练？或者有没有我应该做的替代方法？

编辑 1

以上是一个示例，可帮助您快速了解问题。就我而言，在编码/缩放数据帧之后，我有一个看起来像这样的 numpy 数组（它是代表分类编码的数字 + 最后的两个 numpy 数组）：

array([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 9921.0,
       20.0, 0.40457918757980704, 0.11369258150627903, 0.868421052631579,
       0.47368421052631576, 0.894736842105263, 0.06688034531010473,
       0.16160188713280013, 0.7368421052631579, 0.1673332894736842,
       0.2099143206854345, 0.3690644464300929, 0.07097828135799109,
       0.8157894736842104, 0.9210526315789473, 0.23091420289239645,
       0.08623506024464939, 0.5789473684210527, 0.763157894736842, 0.0,
       0.18421052631578946, 0.07949239000059796, 0.18763907099960708,
       0.7368421052631579, 0.2668740256483197, 0.6842105263157894,
       0.13699219747488295, 0.868421052631579, 0.868421052631579,
       0.052631349139178094, 0.6842105263157894, 0.5526315789473684,
       0.6842105263157894, 0.6842105263157894, 0.6842105263157894,
       0.7105263157894737, 0.7105263157894737, 0.7105263157894737,
       0.23684210526315788, 0.0, 0.7105263157894737, 0.5789473684210527,
       0.763157894736842, 0.5263157894736842, 0.6578947368421052,
       0.6842105263157894, 0.7105263157894737, 0.0, 0.5789473684210527,
       0.2631578947368421, 0.6842105263157894, 0.6578947368421052,
       0.42105263157894735, 0.5789473684210527, 0.42105263157894735,
       0.7368421052631579, 0.7368421052631579, 0.15207999030227856,
       0.8445892232119124, 0.2683721567016762, 0.3142850329243405,
       0.18421052631578946, 0.19132292433056333, 0.20615136344079915,
       0.14475710664724623, 0.1624920232728424, 0.6989826700898587,
       0.18421052631578946, 0.21052631578947367, 0.4793448772543646,
       0.7894736842105263, 0.682967263567459, 0.37139592674256894,
       0.21123755190149363, 0.18421052631578946, 0.6578947368421052,
       0.39473684210526316, 0.631578947368421, 0.7894736842105263,
       0.36842105263157887, 0.1863353145721346, 0.7368421052631579,
       0.26809396092240706, 0.22492185003691062, 0.1460488284639197,
       0.631578947368421, 0.15347526114630458, 0.763157894736842,
       0.2097323620058104, 0.3684210526315789, 0.631578947368421,
       0.631578947368421, 0.631578947368421, 0.6842105263157894,
       0.36842105263157887, 0.10507952765043811, 0.22418515695024185,
       0.23755698619020282, 0.22226500126902, 0.530004040377794,
       0.3421052631578947, 0.19018711711349692, 0.19629244102133708,
       0.5789473684210527, 0.10526315789473684, 0.49999999999999994,
       0.5263157894736842, 0.5263157894736842, 0.49999999999999994,
       0.1052631578947368, 0.10526315789473678, 0.5263157894736842,
       0.4736842105263157, 2013.0,
       array([0.        , 0.        , 0.        , 0.62235785, 0.        ,
       0.27049118, 0.        , 0.31094068, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.4330532 , 0.        ,
       0.        , 0.2515796 , 0.        , 0.        , 0.        ,
       0.40683705, 0.01569915, 0.        , 0.        , 0.        ,
       0.13090582, 0.        , 0.49955425, 0.06970194, 0.29155406,
       0.        , 0.        , 0.27342197, 0.        , 0.        ,
       0.        , 0.04415211, 0.        , 0.03908829, 0.        ,
       0.07673171, 0.33199945, 0.        , 0.51759815, 0.        ,
       0.4719149 , 0.4538082 , 0.13475986, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.08000553,
       0.        , 0.02991109, 0.        , 0.5051543 , 0.        ,
       0.24663273, 0.        , 0.50839704, 0.        , 0.        ,
       0.05281948, 0.44884402, 0.        , 0.44542992, 0.15376966,
       0.        , 0.        , 0.        , 0.39128256, 0.49497205,
       0.        , 0.        ], dtype=float32),
       array([0.        , 0.        , 0.        , 0.62235785, 0.        ,
       0.27049118, 0.        , 0.31094068, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.4330532 , 0.        ,
       0.        , 0.25157961, 0.        , 0.        , 0.        ,
       0.40683705, 0.01569915, 0.        , 0.        , 0.        ,
       0.13090582, 0.        , 0.49955425, 0.06970194, 0.29155406,
       0.        , 0.        , 0.27342197, 0.        , 0.        ,
       0.        , 0.04415211, 0.        , 0.03908829, 0.        ,
       0.07673171, 0.33199945, 0.        , 0.51759815, 0.        ,
       0.47191489, 0.45380819, 0.13475986, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.08000553,
       0.        , 0.02991109, 0.        , 0.50515431, 0.        ,
       0.24663273, 0.        , 0.50839704, 0.        , 0.        ,
       0.05281948, 0.44884402, 0.        , 0.44542992, 0.15376966,
       0.        , 0.        , 0.        , 0.39128256, 0.49497205,
       0.        , 0.        ])], dtype=object)

标签： python pandas numpy tensorflow

解决方案

问题：

您正在尝试把嵌套的列表/数组对象作为特征传入，并让它被转换为张量，这就是出错的原因。您可以通过 pandas 简单地将长度为 n 的列表/数组展开为 n 列来处理它（见解决方案 2）。但是，通常在使用这类列时，理想情况下您会希望在网络中以不同的方式处理它们（例如将该列传入 LSTM）。因此，理想的做法是使用多输入模型（multi-input model），这也是业界通常使用这类特征的方式（见解决方案 1）。

解决方案 1：通过多输入解决此问题

这是一个相当普遍的问题，尤其是当我们处理多个数据序列或多个编码时。

解决此问题的一种直接方法是为每种编码创建单独的输入。

（假设 X_train 有 9 列）将 9 列中的 8 列传给第一个输入，并将编码（即包含列表/数组的那一列）作为单独的第二个输入。
将这两个输入拼接起来，得到一个长度为 8+398 的张量，再让它流经计算图。
包含列表的单个 Series 可以通过 np.array(df.column.tolist()) 转换为张量/np.array。这会把形状为 (398,)、每个元素都是列表的 Series 转换为形状为 (398, 398) 的 NumPy 数组。
此外，您也可以在拼接并送入全连接层之前，分别处理普通特征和编码。例如，先让第二个输入通过一个 LSTM 层。

from tensorflow.keras import layers, Model, utils, activations

# Two-input model: the plain feature columns and the per-row array column
# enter through separate Input layers and are concatenated before the
# dense stack.
g_input = layers.Input(shape=(8,))       #<-------- the 8 plain feature columns
np_input = layers.Input(shape=(398,))    #<-------- one length-398 array per row
x = layers.concatenate([g_input, np_input])
x = layers.Dense(512, activation='relu')(x)
x = layers.Dense(256, activation='relu')(x)
x = layers.Dense(128, activation='relu')(x)
x = layers.Dense(64, activation='relu')(x)
x = layers.Dense(32, activation='relu')(x)
g_V = layers.Dense(1, activation='sigmoid')(x)

# The model takes its inputs as a list in the same order as declared here.
generator = Model([g_input,np_input],g_V)
generator.compile(loss='binary_crossentropy', optimizer='adam')

utils.plot_model(generator, show_layer_names=False, show_shapes=True)

print('')
print('RESHAPING DATA TO - (398,8) and (398,398)')
# Drop the array column by keyword: the positional-axis form
# df.drop('NumpyColumn', 1) is deprecated since pandas 1.3 and no longer
# accepted by pandas 2.x.
# The array column becomes a 2-D array by stacking its per-row lists.
# NOTE(review): df was read with na_values='?', so any missing entries would
# reach the model as NaN — consider dropping or imputing them first.
generator.predict([df.drop(columns='NumpyColumn').to_numpy(),
                   np.array(df['NumpyColumn'].tolist())]).shape

RESHAPING DATA TO - (398,8) and (398,398)
(398, 1)

解决方案 2：通过 Pandas 解决这个问题

但是，如果您不希望把编码作为单独的输入，而只是想把它展平后与其他特征一起使用，那么您可以简单地将数据帧沿 axis=1 展开，得到 8+398 列，然后将其转换为张量。

import tensorflow as tf
from tensorflow.keras import layers, Model, utils, activations

# Single-input model: every row is one flat vector made of the 8 plain
# feature columns followed by the 398 values of the expanded array column.
g_input = layers.Input(shape=(406,))       #<--------- 8 + 398 columns
x = layers.Dense(512, activation='relu')(g_input)
x = layers.Dense(256, activation='relu')(x)
x = layers.Dense(128, activation='relu')(x)
x = layers.Dense(64, activation='relu')(x)
x = layers.Dense(32, activation='relu')(x)
g_V = layers.Dense(1, activation='sigmoid')(x)

generator = Model(g_input,g_V)
generator.compile(loss='binary_crossentropy', optimizer='adam')

utils.plot_model(generator, show_layer_names=False, show_shapes=True)

print('')
print('RESHAPING DATA TO - (398, 406)')
# Expand the array column into one column per element in a single DataFrame
# construction; .apply(pd.Series) would build a separate Series for every row,
# which is much slower. index=df.index keeps the rows aligned for the concat.
# The column is dropped by name so the code does not rely on it being last.
ddf = pd.concat(
    [df.drop(columns='NumpyColumn'),
     pd.DataFrame(df['NumpyColumn'].tolist(), index=df.index)],
    axis=1,
)  #<-----
generator.predict(tf.convert_to_tensor(ddf)).shape #<-----

RESHAPING DATA TO - (398, 406)
(398, 1)