首页 > 解决方案 > 如何在机器学习模型之前转换、编码或标准化每行包含(项目)列表的列?

问题描述

数据中包含多种类型的值。如下所示,对于分类列,我可以应用 `OneHotEncoder`。但是,当我对每行都是一个子字符串(token)列表的列(例如 `SUBSTRING_4L` 和 `SUBSTRING_5L` 列)进行同样的操作时,会得到错误:`TypeError: argument must be a string or number`。

我已经在 google、stackoverflow 和 scikit-learn 文档上搜索了很长一段时间,但没有找到任何有用的东西。

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.impute import SimpleImputer

# Toy dataset: one entry per column, eight rows each.
data = dict(
    # Numeric feature with two missing values.
    AGE=[39, np.nan, 21, 13, 45, 26, np.nan, 48],
    # Plain categorical (string) features, each with one missing value.
    URBAN=['urban', np.nan, 'urban', 'rural', 'urban', 'rural', 'urban', 'urban'],
    NAME=['jack', 'juste', 'ann', np.nan, 'jack', 'gil', 'phil', 'tyler'],
    # List-valued features: every cell holds a (possibly empty) list of tokens.
    SUBSTRING_4L=[['jack'], ['just', 'uste'], [], [], ['jack'], [], ['phil'], ['tyle', 'yler']],
    SUBSTRING_5L=[[], ['juste'], [], [], [], [], [], ['tyler']],
    # Column used as the target (y) by the transform_* functions.
    DISEASE=['healthy', 'cancer', 'cancer', 'dementia', 'cancer', 'heart', 'healthy', 'cancer'],
)

df = pd.DataFrame(data)

def transform_numerical():
    """Standardize the AGE column and print the raw and scaled splits.

    The module-level ``df`` is split in half with a fixed seed, a
    ``StandardScaler`` is fitted on the training half only, and both halves
    are transformed with it. Each raw split is printed followed by its
    scaled counterpart.
    """
    age_train, age_test, _, _ = train_test_split(
        df[['AGE']], df['DISEASE'], test_size=0.5, random_state=3)

    age_scaler = preprocessing.StandardScaler()
    age_scaler.fit(age_train)
    # Same fitted scaler for both halves, training half first.
    scaled_train, scaled_test = (
        age_scaler.transform(part) for part in (age_train, age_test))

    print(age_train)
    print(scaled_train)
    print()
    print(age_test)
    print(scaled_test)
    print('/////////////////////////', '\n')

transform_numerical()

def transform_categorical():
    """One-hot encode the URBAN and NAME columns and print the result.

    The module-level ``df`` is split in half with a fixed seed. Missing
    values are replaced by the empty string, then a ``OneHotEncoder`` fitted
    on the training half encodes both halves; categories that were not seen
    during fitting are ignored. Each encoded matrix is printed (as a dense
    array) followed by the raw split it came from.
    """
    raw_train, raw_test, _, _ = train_test_split(
        df[['URBAN', 'NAME']], df['DISEASE'], test_size=0.5, random_state=3)

    # Step 1: fill missing cells with a constant so the encoder sees strings.
    filler = SimpleImputer(strategy='constant', fill_value='')
    filled_train = filler.fit_transform(raw_train)
    filled_test = filler.transform(raw_test)

    # Step 2: one-hot encode, learning the categories from the training half.
    one_hot = preprocessing.OneHotEncoder(handle_unknown='ignore')
    encoded_train = one_hot.fit_transform(filled_train)
    encoded_test = one_hot.transform(filled_test)

    print(encoded_train.toarray())
    print(raw_train)
    print()
    print(encoded_test.toarray())
    print(raw_test)
    print('/////////////////////////', '\n')

transform_categorical()

def transform_list():
    """Multi-hot encode the list-valued columns and print the result.

    ``SimpleImputer`` and ``OneHotEncoder`` expect scalar cells and raise
    ``TypeError: argument must be a string or number`` when a cell is a
    list. Each list column is therefore encoded with its own
    ``MultiLabelBinarizer``: one output column per distinct token, set to 1
    when the row's list contains that token.

    The module-level ``df`` is split in half with a fixed seed and every
    binarizer is fitted on the training half only. Each encoded matrix is
    printed followed by the raw split it came from.
    """
    list_cols = ['SUBSTRING_4L', 'SUBSTRING_5L']
    x_train, x_test, _, _ = train_test_split(
        df[list_cols], df['DISEASE'], test_size=0.5, random_state=3)

    def as_token_lists(column):
        # A missing cell (anything that is not a list) counts as "no tokens";
        # this replaces the constant-fill imputation used for scalar columns.
        return [cell if isinstance(cell, list) else [] for cell in column]

    train_blocks, test_blocks = [], []
    for col in list_cols:
        binarizer = preprocessing.MultiLabelBinarizer()
        train_blocks.append(
            binarizer.fit_transform(as_token_lists(x_train[col])))
        # Tokens that were not seen while fitting are dropped from the test
        # encoding (scikit-learn emits a warning when that happens).
        test_blocks.append(binarizer.transform(as_token_lists(x_test[col])))

    # Place the per-column indicator blocks side by side.
    x_trainT = np.hstack(train_blocks)
    x_testT = np.hstack(test_blocks)

    print(x_trainT)
    print(x_train)
    print()
    print(x_testT)
    print(x_test)
    print('/////////////////////////', '\n')

transform_list()

标签: python-3.x, pandas, scikit-learn

解决方案


您可以使用 `df[col].apply(lambda x: pd.Series(x))` 将每行的 list 对象展开为多列,然后使用 `pd.get_dummies()` 对展开后的 object 类型列进行编码。

object_cols = ["URBAN", "NAME"]
list_cols = ["SUBSTRING_4L", "SUBSTRING_5L"]

# Start from the columns that need no encoding: everything except the
# target, the plain categorical columns and the list-valued columns.
features = df.drop(columns=["DISEASE", *object_cols, *list_cols])

encoded_parts = [features]
for col in list_cols:
    # Expand each list into positional columns (0, 1, ...), then stack them
    # into one long Series indexed by (row, position).
    expanded = df[col].apply(lambda tokens: pd.Series(tokens, dtype=object))
    stacked = expanded.stack()
    # Build the dummies on the stacked tokens and collapse back to one row
    # per original row. Encoding the positional columns directly would emit
    # the same "<col>_<token>" column once per position the token occurs in.
    dummies = pd.get_dummies(stacked, prefix=col, dtype=int).groupby(level=0).max()
    # Rows whose list was empty may be absent after stacking; restore them
    # as all-zero rows.
    encoded_parts.append(dummies.reindex(df.index, fill_value=0))
for col in object_cols:
    # dtype=int keeps the indicators as 0/1 regardless of pandas version.
    encoded_parts.append(pd.get_dummies(df[col], prefix=col, dtype=int))

features = pd.concat(encoded_parts, axis=1)

输出结果如下:

    AGE SUBSTRING_4L_jack   SUBSTRING_4L_just   SUBSTRING_4L_phil   SUBSTRING_4L_tyle   SUBSTRING_4L_uste   SUBSTRING_4L_yler   SUBSTRING_5L_juste  SUBSTRING_5L_tyler  URBAN_rural URBAN_urban NAME_ann    NAME_gil    NAME_jack   NAME_juste  NAME_phil   NAME_tyler
0   39.0    1   0   0   0   0   0   0   0   0   1   0   0   1   0   0   0
1   NaN 0   1   0   0   1   0   1   0   0   0   0   0   0   1   0   0
2   21.0    0   0   0   0   0   0   0   0   0   1   1   0   0   0   0   0
3   13.0    0   0   0   0   0   0   0   0   1   0   0   0   0   0   0   0
4   45.0    1   0   0   0   0   0   0   0   0   1   0   0   1   0   0   0
5   26.0    0   0   0   0   0   0   0   0   1   0   0   1   0   0   0   0
6   NaN 0   0   1   0   0   0   0   0   0   1   0   0   0   0   1   0
7   48.0    0   0   0   1   0   1   0   1   0   1   0   0   0   0   0   1

推荐阅读