LSTM model only predicts one class in multi-class classification

Problem description

I wrote Python code using an LSTM for imbalanced multi-class classification on a rainfall dataset. The input variables are temperature, sunshine and humidity in numeric form. The target has four classes: no rain, light rain, moderate rain and heavy rain, but the model only ever predicts a single class in the confusion matrix, as shown by the code below.

I also tried SMOTE and class-weight techniques to balance the classes, but the results did not change.

Can anyone help me with complete LSTM code for imbalanced multi-class classification?

Confusion matrix (screenshot)

Dataset screenshot

import numpy as np
import tensorflow as tf
from tensorflow import keras
from sklearn.utils import resample
import pandas as pd
import seaborn as sns
from numpy import array
from numpy import argmax
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from keras.regularizers import l1,l2,l1_l2
from pandas.plotting import register_matplotlib_converters
from sklearn.preprocessing import RobustScaler, MinMaxScaler, StandardScaler
from tensorflow.keras.layers import Dense, SimpleRNN, LSTM, Dropout, GRU, Bidirectional, Activation
from scipy import stats
from tensorflow.keras import Sequential
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import confusion_matrix
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import NearMiss
from sklearn.utils import class_weight
%matplotlib inline
%config InlineBackend.figure_format='retina'
from dateutil.parser import parse
register_matplotlib_converters()
sns.set(style='whitegrid', palette='muted', font_scale=1.5)
rcParams['figure.figsize'] = 22, 10
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)

# load dataset
df = pd.read_csv("Arbaminch_Rainfall_All_In_One_Classiffication.csv")
#df['Date'] =  pd.to_datetime(df['Date'], dayfirst=True, format='%d-%m-%Y', errors='coerce')
#df = df.set_index('Date').rename_axis('Rainfall', axis=1)
df = df.drop('Date', axis=1)
#df.plot()
df.head()



columns = df.columns.tolist()
# filter the columns to remove data we do not want
columns = [c for c in columns if c not in['Rainfall9AM']]
target = df.Rainfall9AM
state = np.random.RandomState(42)

X = df[columns] # independent Variable
Y = target      # dependent Variable

X.shape, Y.shape

X = X.fillna(X.mean())
Y = Y.fillna(method='ffill')


from sklearn import preprocessing 
label_encoder = preprocessing.LabelEncoder()
Y = label_encoder.fit_transform(Y)
Y = pd.DataFrame(Y)
Y.columns = ['Rainfall9AM']





from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=1, stratify=Y)
X_train.shape, y_train.shape, X_test.shape, y_test.shape


# All variables
scale_columns = ['MaxTemp6PM', 'MinTemp9AM', 'AVGTemp', 'Sunshine6PM', 'Humidity6AM', 'Humidity9AM', 'Humidity12PM', 
                'Humidity3PM', 'Humidity6PM', 'MaxHumid', 'MinHumid', 'AVGHumid']

# Granger selected features 
#'Rainfall9AM','MaxTemp6PM','Sunshine6PM','Humidity6AM','Humidity12PM',
#    'Humidity6PM','MaxHumid','MinHumid','AVGHumid'

#scale_columns = ['MaxTemp6PM','Sunshine6PM','Humidity6AM','Humidity12PM','Humidity6PM','MaxHumid','MinHumid','AVGHumid']

scaler = MinMaxScaler(feature_range = (0,1))
scaler = scaler.fit(X_train[scale_columns])

X_train.loc[:, scale_columns] = scaler.transform(X_train[scale_columns].to_numpy())
X_test.loc[:, scale_columns] = scaler.transform(X_test[scale_columns].to_numpy())




#from sklearn.utils.class_weight import compute_class_weight
#classes = np.array([0,1,2,3])
#weights = compute_class_weight('balanced', classes, y_for_train).all()
class_weights = {
    0: 4.,
    1: 5.,
    2: 5.,
    3: 1.
}





# Create 3D (samples, time_steps, features) windows for the LSTM
def create_dataset(X, y, time_steps=1, step=1):
    Xs, ys = [], []
    for i in range(0, len(X) - time_steps, step):
        v = X.iloc[i:(i + time_steps)].values
        labels = y.iloc[i: i + time_steps]
        Xs.append(v)
        # label each window with its most frequent class
        ys.append(stats.mode(labels)[0][0])
    # the return must sit outside the loop, otherwise only one window is built
    return np.array(Xs), np.array(ys).reshape(-1, 1)
TIME_STEPS = 30
STEP = 1
X_train, y_train = create_dataset(X_train, y_train, TIME_STEPS, STEP)
X_test, y_test = create_dataset(X_test, y_test, TIME_STEPS, STEP)



# One Hot Encode
enc = OneHotEncoder(handle_unknown='ignore', sparse=False)
enc = enc.fit(y_train)
y_train = enc.transform(y_train)
y_test = enc.transform(y_test)



# Define model
optimizer = keras.optimizers.RMSprop(learning_rate=0.001, momentum=0.0)
model = Sequential()
model.add(Bidirectional(LSTM(100, dropout=0.2, recurrent_dropout=0.2, activation='relu'),
                        input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dense(y_train.shape[1], activation='softmax', bias_regularizer=l2(1e-2)))
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['acc'])
model.summary()




history = model.fit(X_train, y_train, epochs=10, class_weight=class_weights, batch_size=32, validation_data=(X_test, y_test), shuffle=True)


model.evaluate(X_test, y_test)
y_pred = model.predict(X_test)
from sklearn.metrics import accuracy_score
# Creates a confusion matrix
num_classes = 4
y_true = tf.argmax(y_test, axis = 1)
y_pred = tf.argmax(y_pred, axis = 1)




cm = confusion_matrix(y_true, y_pred, labels=[0,1,2,3]) 
cm_df = pd.DataFrame(cm,index = ['No_Rain', 'Light_Rain', 'Moderate', 'Heavy_Rain'], columns = ['No_Rain', 'Light_Rain', 'Moderate', 'Heavy_Rain'])

plt.figure(figsize=(15,5))
sns.heatmap(cm_df, annot=True, fmt="d", cmap='Blues') 
plt.title('Bidirectional_LSTM_Model\nAccuracy:{0:.3f}'.format(accuracy_score(y_true, y_pred)))
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()

Tags: python, tensorflow, machine-learning

Solution


My belief is that there are some bugs in your code. One thing you have not explained is, for example, why you call stats.mode(labels) on the window labels. Also, I cannot tell whether you applied SMOTE and the like correctly.
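
For reference, a frequent pitfall is to resample after the data has already been windowed into 3D sequences. Here is a minimal sketch of how SMOTE is usually applied, on the flat 2D training split only and before building sequences (assuming imblearn is installed; X_train and y_train stand in for your own split):

from imblearn.over_sampling import SMOTE

# Oversample the *flat* training split only, so synthetic rows never
# leak into the test set; X_train is 2D (n_samples, n_features) and
# y_train holds 1D integer labels.
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
# Only afterwards window X_train_res into (samples, time_steps, features);
# fit_resample does not accept 3D arrays.

Note that oversampling also scrambles the temporal order of the rows, so for sequence models class weights are often the safer first choice.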

However, since you could not share any data on Stack Overflow, I took the Rain in Australia dataset from Kaggle.

It looks quite similar to yours. The biggest difference is that there are only two classes for "rain tomorrow": yes and no. But it is also quite imbalanced:

no    77%
yes   23%
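
You can verify this split yourself once the file is downloaded (a quick check, assuming the standard weatherAUS.csv column names):

import pandas as pd

df = pd.read_csv('weatherAUS.csv')
# fraction of each class; roughly 0.78 'No' vs 0.22 'Yes'
print(df.RainTomorrow.value_counts(normalize=True))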

I implemented a simple model with some very lazy preprocessing (so all of this could probably be improved a lot). Still, it should be enough for you to learn from this example and compare it against yours. Just download the data here and give it a go.

Without any tuning or balancing, I was able to get about 82% accuracy on the test data. On Kaggle I have seen people getting 88% on this, so we can definitely still improve our model.

All you need to do is make sure weatherAUS.csv is available on your system, and that you have Tensorflow (I used 2.4.1) and the other dependencies installed.

After that, you should be able to adapt the code to run on your own dataset. At the moment it does not take the imbalanced data into account. Once your model starts predicting something more sensible, we can try to improve it with some balancing techniques.
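
As a starting point for that last step, here is a sketch of deriving balanced class weights with scikit-learn and handing them to Keras (compute_class_weight is the standard sklearn utility; y_train stands in for your integer-encoded labels, y_train_onehot for their one-hot form, and model for your compiled Keras model):

import numpy as np
from sklearn.utils.class_weight import compute_class_weight

# y_train: 1D array of integer class labels, e.g. 0..3
classes = np.unique(y_train)
weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weight = dict(zip(classes, weights))

# Keras scales each sample's loss by the weight of its class; the dict
# is keyed by class index even when the targets are one-hot encoded.
model.fit(X_train, y_train_onehot, epochs=10, class_weight=class_weight)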

Code (weatherAUS.csv)

import os

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from tensorflow import keras
from tensorflow.keras import layers


def preprocess(df_, split, num_pipeline, cat_pipeline, targets_encoder, numeric_columns, categorical_columns):
    """Quick, dirty and lazy preprocessing."""
    df_split = df_.loc[split]

    nums = num_pipeline.transform(df_split[numeric_columns])
    cats = cat_pipeline.transform(df_split[categorical_columns].drop(columns='RainTomorrow')).todense()

    df_num = pd.DataFrame(nums, index=df_split[numeric_columns].index, columns=df_split[numeric_columns].columns)
    df_cat = pd.DataFrame(cats, index=df_split[categorical_columns].index, columns=list(np.concatenate(cat_pipeline.categories_)))

    df_X = df_num.join(df_cat).sort_index()

    df_Y = targets_encoder.transform(df_split.RainTomorrow.values.reshape(-1, 1)).todense()
    df_Y = pd.DataFrame(df_Y, index=df_split.index, columns=targets_encoder.categories_[0]).sort_index()

    return df_X, df_Y


def sample_generator(df_X, df_Y, nb_days: int, seed: int = None, max_samples: int = None):
    rnd = np.random.RandomState(seed)
    locations = df_X.index.get_level_values(0)
    loc_indices = list(range(len(locations)))

    count = 0
    while True:

        if max_samples is not None and count >= max_samples:
            break

        count += 1

        # We cannot simply sample a series arbitrarily. Need to make sure
        # that samples come from one particular location (city). Thus, we pick a location first ...
        location = locations[rnd.choice(loc_indices)]

        # .. and now we pick nb_days for that location
        offs = rnd.randint(0, len(df_X.loc[location]) - nb_days)
        start = offs
        end = offs + nb_days

        x = df_X.loc[location].iloc[start:end]
        y = df_Y.loc[location].iloc[end - 1]

        yield x, y


def fill_missing_values(df_raw, numeric_columns, categorical_columns):
    """Quick and dirty missing-value preprocessing."""

    locations = set(df_raw.Location.unique())

    # Fill missing values

    dfs = list()
    for location in locations:
        df_loc = df_raw[df_raw.Location == location].set_index('Date')

        # We resample and interpolate ..
        df_num = df_loc[numeric_columns].resample('1D').mean().interpolate()
        # .. and fill the rest with mean values
        df_num = df_num.fillna(df_num.mean())

        df_cat = df_loc[categorical_columns.union({'Location'})]
        # For categorical features we simply take the mode
        df_cat = df_cat.fillna(df_cat.mode())
        df_ = df_num.join(df_cat)
        df_ = df_.reset_index().set_index(['Location', 'Date'])
        dfs.append(df_)

    df_ = pd.concat(dfs)
    df_ = df_[~df_.RainTomorrow.isnull()]
    # There could still be some missing values. Again, out of laziness, I just fill nan values with averages and modes
    df_num = df_[numeric_columns].fillna(df_[numeric_columns].mean())
    df_cat = df_[categorical_columns]
    df_cat = df_cat.fillna(df_cat.mode().iloc[0])

    df = df_num.join(df_cat)
    df = df[sorted(df.columns)]

    return df


def main():

    file_path = 'weatherAUS.csv'
    assert os.path.exists(file_path), f'Cannot find data-file: {file_path}'

    seed = 42

    df_raw = pd.read_csv(file_path)

    # Convert to datetime
    df_raw.Date = pd.to_datetime(df_raw.Date, infer_datetime_format=True)

    # If a column contains more than 15% null-values we do not consider it as feature.
    # I only do this out of laziness. We could probably do better.
    null_threshold = 0.15
    null_perc = df_raw.isnull().sum() / len(df_raw)
    useful_columns = set(null_perc[null_perc < null_threshold].index)

    # Separate (useful) numeric and categorical columns

    numeric_columns = useful_columns & set(df_raw.select_dtypes(exclude=['object', 'datetime64[ns]']).columns)
    categorical_columns = useful_columns & set(df_raw.select_dtypes(exclude=['float64', 'datetime64[ns]']).columns)
    categorical_columns = categorical_columns - {'Location'}

    df = fill_missing_values(df_raw, numeric_columns=numeric_columns, categorical_columns=categorical_columns)

    # Create train/test split by location

    train, test = train_test_split(df.index.get_level_values(0).unique(), test_size=0.2, random_state=seed)

    # Create preprocessing pipeline for numeric and categorical data

    num_pipeline = StandardScaler()
    num_pipeline.fit(df.loc[train][numeric_columns])

    cat_pipeline = OneHotEncoder()
    cat_pipeline.fit(df.loc[train][categorical_columns].drop(columns='RainTomorrow'))

    # Targets encoder

    targets_encoder = OneHotEncoder()
    targets_encoder.fit(df.loc[train].RainTomorrow.dropna().unique().reshape(-1, 1))

    # Preprocess

    df_train_X, df_train_Y = preprocess(
        df, train,
        num_pipeline=num_pipeline,
        cat_pipeline=cat_pipeline,
        targets_encoder=targets_encoder,
        numeric_columns=numeric_columns,
        categorical_columns=categorical_columns
    )
    df_test_X, df_test_Y = preprocess(
        df, test,
        num_pipeline=num_pipeline,
        cat_pipeline=cat_pipeline,
        targets_encoder=targets_encoder,
        numeric_columns=numeric_columns,
        categorical_columns=categorical_columns
    )

    # Just checking that we have indeed separated locations
    assert len(set(df_train_X.index.get_level_values(0)).intersection(set(df_test_X.index.get_level_values(0)))) == 0

    # Create Tensorflow dataset

    nb_days = 7  # The number of days we consider
    batch_size = 50
    nb_features = len(df_train_X.columns)
    nb_classes = len(targets_encoder.categories_[0])

    train_data = tf.data.Dataset.from_generator(
        lambda: sample_generator(df_train_X, df_train_Y, nb_days, seed=seed),
        output_shapes=((None, nb_features), (nb_classes,)),
        output_types=(tf.float32, tf.float32)
    ).prefetch(batch_size).padded_batch(batch_size)

    test_data = tf.data.Dataset.from_generator(
        lambda: sample_generator(df_test_X, df_test_Y, nb_days, seed=seed, max_samples=1000),
        output_shapes=((None, nb_features), (nb_classes,)),
        output_types=(tf.float32, tf.float32)
    ).prefetch(batch_size).padded_batch(batch_size)

    # Create model

    inputs = layers.Input(shape=(None, nb_features,))
    x = inputs
    x = layers.Bidirectional(layers.LSTM(128, return_sequences=True, dropout=0.1))(x)
    x = layers.Bidirectional(layers.LSTM(128, return_sequences=False, dropout=0.1))(x)
    x = layers.Dense(nb_classes, activation='softmax')(x)

    model = keras.Model(inputs=inputs, outputs=x)

    model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics='accuracy'
    )

    # Train

    model.fit(
        train_data,
        epochs=20,
        steps_per_epoch=100,
        validation_data=test_data.repeat(),
        validation_steps=50
    )

    _, accuracy = model.evaluate(test_data)

    random = (df.RainTomorrow.value_counts() / df.RainTomorrow.value_counts().sum()).max()

    print(f'Accuracy: {100 * accuracy:.2f} %')
    print(f'Random:   {100 * random:.2f} %')


if __name__ == '__main__':
    main()

Output

Epoch 1/5
100/100 [==============================] - 50s 469ms/step - loss: 0.4868 - accuracy: 0.7843 - val_loss: 0.5015 - val_accuracy: 0.7708
Epoch 2/5
100/100 [==============================] - 45s 459ms/step - loss: 0.4142 - accuracy: 0.8202 - val_loss: 0.4592 - val_accuracy: 0.7972
Epoch 3/5
100/100 [==============================] - 46s 462ms/step - loss: 0.3692 - accuracy: 0.8383 - val_loss: 0.4622 - val_accuracy: 0.8036
Epoch 4/5
100/100 [==============================] - 46s 461ms/step - loss: 0.3535 - accuracy: 0.8469 - val_loss: 0.4463 - val_accuracy: 0.8204
Epoch 5/5
100/100 [==============================] - 46s 463ms/step - loss: 0.3505 - accuracy: 0.8427 - val_loss: 0.4347 - val_accuracy: 0.8132
20/20 [==============================] - 3s 145ms/step - loss: 0.4384 - accuracy: 0.8140
Accuracy: 81.40 %
Random:   77.58 %

Code and results on the OP's data

Note: the provided dataset is very small (~3,000 samples).

from operator import itemgetter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from tensorflow import keras
from tensorflow.keras import layers


def preprocess(df: pd.DataFrame, inputs_pipeline, targets_encoder):

    numeric_columns = df.select_dtypes(exclude=['object']).columns

    inputs = inputs_pipeline.transform(df[numeric_columns])
    targets = targets_encoder.transform(df.Rainfall9AM.values.reshape(-1, 1)).todense()

    df_inputs = pd.DataFrame(inputs, index=df.index, columns=numeric_columns)
    df_targets = pd.DataFrame(targets, index=df.index, columns=targets_encoder.categories_[0])

    return df_inputs, df_targets


def sample_generator(df_inputs: pd.DataFrame, df_targets: pd.DataFrame, nb_days: int, max_samples: int = None, seed=42):

    rnd = np.random.RandomState(seed)
    count = 0

    while True:

        if max_samples is not None and count >= max_samples:
            break

        count += 1

        offs = rnd.randint(0, len(df_inputs) - nb_days)

        start = offs
        end = offs + nb_days

        x = df_inputs.iloc[start:end]
        y = df_targets.iloc[end]

        yield x, y


def fill_missing_values(df: pd.DataFrame) -> pd.DataFrame:
    """Very lazy and dirty preprocessing. Replace this by something more sophisticated."""
    df_targets = df[['Rainfall9AM']]
    df = df.resample('1D').mean().interpolate()
    df = df.fillna(df.mean())
    df = df[sorted(df.columns)]
    df = df.join(df_targets)
    df = df[~df.Rainfall9AM.isnull()]
    df = df.dropna(axis=1)
    assert not df.isnull().any().any()
    df = df.sort_index()
    return df


def get_model(nb_features: int, nb_classes: int):
    inputs = layers.Input(shape=(None, nb_features,))
    x = inputs
    x = layers.Bidirectional(layers.LSTM(128, return_sequences=True, dropout=0.1))(x)
    x = layers.Bidirectional(layers.LSTM(128, return_sequences=False, dropout=0.1))(x)
    x = layers.Dense(nb_classes, activation='softmax')(x)

    model = keras.Model(inputs=inputs, outputs=x)

    model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics='accuracy'
    )

    return model


def main():
    fp = 'Arbaminch_Rainfall_Classiffication.csv'

    df_raw = pd.read_csv(fp)

    # Convert to datetime
    df_raw.Date = pd.to_datetime(
        df_raw.Date,
        infer_datetime_format=True,
        # FIXME The provided data contains errors (invalid dates e.g. 2/29/2009)
        errors='coerce'
    )
    df_raw = df_raw.set_index('Date')

    # Fill missing values
    # FIXME The preprocessing is lazy and should be improved!
    df = fill_missing_values(df_raw)

    # Create train/test split

    nb_train = int(len(df) * 0.8)
    df_train_data, df_test_data = df[:nb_train], df[nb_train:]

    print(f'Training samples: {len(df_train_data)}')
    print(f'Test samples:     {len(df_test_data)}')

    # Create preprocessing pipeline for numeric and categorical data

    numeric_columns = df_train_data.select_dtypes(exclude=['object']).columns
    inputs_pipeline = StandardScaler()
    inputs_pipeline.fit(df_train_data[numeric_columns])

    targets_encoder = OneHotEncoder()
    targets_encoder.fit(df_train_data.Rainfall9AM.dropna().unique().reshape(-1, 1))

    # Preprocess

    df_train_X, df_train_Y = preprocess(
        df_train_data,
        inputs_pipeline=inputs_pipeline,
        targets_encoder=targets_encoder
    )

    df_test_X, df_test_Y = preprocess(
        df_test_data,
        inputs_pipeline=inputs_pipeline,
        targets_encoder=targets_encoder
    )

    assert len(set(df_train_X.index.get_level_values(0)).intersection(set(df_test_X.index.get_level_values(0)))) == 0

    # Create Tensorflow dataset

    nb_days = 7
    batch_size = 100
    nb_features = len(df_train_X.columns)
    nb_classes = len(targets_encoder.categories_[0])

    train_data = tf.data.Dataset.from_generator(
        lambda: sample_generator(df_train_X, df_train_Y, nb_days=nb_days),
        output_shapes=((None, nb_features), (nb_classes,)),
        output_types=(tf.float32, tf.float32)
    ).prefetch(batch_size).padded_batch(batch_size)

    test_data = tf.data.Dataset.from_generator(
        lambda: sample_generator(df_test_X, df_test_Y, nb_days=nb_days, max_samples=1000),
        output_shapes=((None, nb_features), (nb_classes,)),
        output_types=(tf.float32, tf.float32)
    ).prefetch(batch_size).padded_batch(batch_size)

    # Get model

    model = get_model(nb_features=nb_features, nb_classes=nb_classes)

    # Start training

    class_names = list(targets_encoder.categories_[0])
    class_counts = dict(df_train_data.Rainfall9AM.value_counts())
    total = float(len(df_train_data))
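    # Log of the inverse class frequency: rarer classes get larger weights,
    # while the log keeps the weight ratios from becoming extreme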
    class_weight = dict([(i, np.log(total / class_counts[cname])) for i, cname in enumerate(class_names)])

    model.fit(
        train_data,
        epochs=5,
        steps_per_epoch=100,
        validation_data=test_data.repeat(),
        validation_steps=50,
        class_weight=class_weight
    )

    # Evaluate model

    y_pred = np.argmax(model.predict(test_data), axis=1)
    y_true = np.argmax(np.concatenate(list(map(itemgetter(1), list(test_data)))), axis=1)

    confusion = pd.DataFrame(confusion_matrix(y_true, y_pred), index=class_names, columns=class_names)

    plt.figure()
    sns.heatmap(confusion, annot=True, fmt='d')
    plt.show()

    print(classification_report(y_true, y_pred, target_names=class_names))


if __name__ == '__main__':
    main()


Confusion matrix (heatmap)

              precision    recall  f1-score   support

  Heavy_Rain       0.25      0.25      0.25       170
  Light_Rain       0.12      0.31      0.17       126
    Moderate       0.18      0.37      0.24       140
     No_Rain       0.79      0.30      0.43       564

    accuracy                           0.30      1000
   macro avg       0.33      0.31      0.27      1000
weighted avg       0.53      0.30      0.34      1000
