python - 在不平衡数据上验证准确率无法提升
问题描述
尝试使用 Kaggle 糖尿病视网膜病变数据集和 CNN 模型进行预测。有五类要预测。数据标签的分布百分比如下。
0 0.73
2 0.15
1 0.07
3 0.02
4 0.02
Name: level, dtype: float64
下面提供了相关的重要代码块。
# Network training parameters
EPOCHS = 25
BATCH_SIZE =50
VERBOSE = 1
# NOTE(review): lowercase 'lr' is inconsistent with the other UPPER_CASE constants
lr=0.0001
OPTIMIZER = tf.keras.optimizers.Adam(lr)
# Generators below resize every image to 256x256
target_size =(256, 256)
# Five severity levels (0-4) in the retinopathy labels
NB_CLASSES = 5
图像生成器类和预处理代码如下。
# One ImageDataGenerator for both splits: rescales pixels to [0,1], applies
# rotation/flip augmentation, and reserves 20% of the rows for validation.
data_gen=tf.keras.preprocessing.image.ImageDataGenerator(rotation_range=45,
horizontal_flip=True,
vertical_flip=True,
rescale=1./255,
validation_split=0.2)
# Training split: filenames come from the 'image' column, class labels from
# 'level'; class_mode='categorical' yields one-hot labels in shuffled batches.
train_gen=data_gen.flow_from_dataframe(
dataframe=label_csv, directory=IMAGE_FOLDER_PATH,
x_col='image', y_col='level',
target_size=target_size,
class_mode='categorical',
batch_size=BATCH_SIZE, shuffle=True,
subset='training',
validate_filenames=True
)
Found 28101 validated image filenames belonging to 5 classes.
# Validation split taken from the SAME augmenting generator.
# NOTE(review): data_gen applies rotation/flips, so validation images are
# augmented too; validation normally uses a rescale-only generator — confirm intended.
validation_gen=data_gen.flow_from_dataframe(
dataframe=label_csv, directory=IMAGE_FOLDER_PATH,
x_col='image', y_col='level',
target_size=target_size,
class_mode='categorical',
batch_size=BATCH_SIZE, shuffle=True,
subset='validation',
validate_filenames=True
)
Found 7025 validated image filenames belonging to 5 classes.
train_gen.image_shape
(256, 256, 3)
模型构建代码块如下。
# Architect your CNN model1
# Four Conv2D(3x3)+MaxPool(2x2) stages with filter counts 256->128->64->32,
# then Flatten and a fully-connected head ending in a 5-way softmax.
model1=tf.keras.models.Sequential()
model1.add(tf.keras.layers.Conv2D(256,(3,3),input_shape=INPUT_SHAPE,activation='relu'))
model1.add(tf.keras.layers.MaxPool2D(pool_size=(2,2)))
model1.add(tf.keras.layers.Conv2D(128,(3,3),activation='relu'))
model1.add(tf.keras.layers.MaxPool2D(pool_size=(2,2)))
model1.add(tf.keras.layers.Conv2D(64,(3,3),activation='relu'))
model1.add(tf.keras.layers.MaxPool2D(pool_size=(2,2)))
model1.add(tf.keras.layers.Conv2D(32,(3,3),activation='relu'))
model1.add(tf.keras.layers.MaxPool2D(pool_size=(2,2)))
model1.add(tf.keras.layers.Flatten())
# Dense head: 512 -> 256 -> 128 -> 64 -> 32 -> NB_CLASSES
model1.add(tf.keras.layers.Dense(units=512,activation='relu'))
model1.add(tf.keras.layers.Dense(units=256,activation='relu'))
model1.add(tf.keras.layers.Dense(units=128,activation='relu'))
model1.add(tf.keras.layers.Dense(units=64,activation='relu'))
model1.add(tf.keras.layers.Dense(units=32,activation='relu'))
model1.add(tf.keras.layers.Dense(units=NB_CLASSES,activation='softmax'))
model1.summary()
Model: "sequential"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
conv2d (Conv2D) (None, 254, 254, 256) 7168
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 127, 127, 256) 0
_________________________________________________________________
conv2d_1 (Conv2D) (None, 125, 125, 128) 295040
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 62, 62, 128) 0
_________________________________________________________________
conv2d_2 (Conv2D) (None, 60, 60, 64) 73792
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 30, 30, 64) 0
_________________________________________________________________
conv2d_3 (Conv2D) (None, 28, 28, 32) 18464
_________________________________________________________________
max_pooling2d_3 (MaxPooling2 (None, 14, 14, 32) 0
_________________________________________________________________
flatten (Flatten) (None, 6272) 0
_________________________________________________________________
dense (Dense) (None, 512) 3211776
_________________________________________________________________
dense_1 (Dense) (None, 256) 131328
_________________________________________________________________
dense_2 (Dense) (None, 128) 32896
_________________________________________________________________
dense_3 (Dense) (None, 64) 8256
_________________________________________________________________
dense_4 (Dense) (None, 32) 2080
_________________________________________________________________
dense_5 (Dense) (None, 5) 165
=================================================================
Total params: 3,780,965
Trainable params: 3,780,965
Non-trainable params: 0
# Compile model1
# One-hot labels (class_mode='categorical') pair with categorical_crossentropy.
model1.compile(optimizer=OPTIMIZER,metrics=['accuracy'],loss='categorical_crossentropy')
print (train_gen.n,train_gen.batch_size)
28101 50
# Steps per epoch = floor(sample count / batch size); the final partial batch is dropped.
STEP_SIZE_TRAIN=train_gen.n//train_gen.batch_size
STEP_SIZE_VALID=validation_gen.n//validation_gen.batch_size
print(STEP_SIZE_TRAIN)
print(STEP_SIZE_VALID)
562
140
# Fit the model1
# Trains for EPOCHS epochs, validating on the augmented validation generator each epoch.
history1=model1.fit(train_gen,
steps_per_epoch=STEP_SIZE_TRAIN,
validation_data=validation_gen,
validation_steps=STEP_SIZE_VALID,
epochs=EPOCHS,verbose=1)
Epoch 训练历史如下所示;由于没有观察到任何改善,训练在第 14 个 epoch 停止。
Epoch 1/25
562/562 [==============================] - 1484s 3s/step - loss: 0.9437 - accuracy: 0.7290 - val_loss: 0.8678 - val_accuracy: 0.7309
Epoch 2/25
562/562 [==============================] - 1463s 3s/step - loss: 0.8748 - accuracy: 0.7337 - val_loss: 0.8673 - val_accuracy: 0.7309
Epoch 3/25
562/562 [==============================] - 1463s 3s/step - loss: 0.8681 - accuracy: 0.7367 - val_loss: 0.8614 - val_accuracy: 0.7306
Epoch 4/25
562/562 [==============================] - 1463s 3s/step - loss: 0.8619 - accuracy: 0.7333 - val_loss: 0.8592 - val_accuracy: 0.7306
Epoch 5/25
562/562 [==============================] - 1463s 3s/step - loss: 0.8565 - accuracy: 0.7375 - val_loss: 0.8625 - val_accuracy: 0.7304
Epoch 6/25
562/562 [==============================] - 1463s 3s/step - loss: 0.8608 - accuracy: 0.7357 - val_loss: 0.8556 - val_accuracy: 0.7310
Epoch 7/25
562/562 [==============================] - 1463s 3s/step - loss: 0.8568 - accuracy: 0.7335 - val_loss: 0.8614 - val_accuracy: 0.7304
Epoch 8/25
562/562 [==============================] - 1463s 3s/step - loss: 0.8541 - accuracy: 0.7349 - val_loss: 0.8591 - val_accuracy: 0.7301
Epoch 9/25
562/562 [==============================] - 1463s 3s/step - loss: 0.8582 - accuracy: 0.7321 - val_loss: 0.8583 - val_accuracy: 0.7303
Epoch 10/25
562/562 [==============================] - 1463s 3s/step - loss: 0.8509 - accuracy: 0.7354 - val_loss: 0.8599 - val_accuracy: 0.7311
Epoch 11/25
562/562 [==============================] - 1463s 3s/step - loss: 0.8521 - accuracy: 0.7325 - val_loss: 0.8584 - val_accuracy: 0.7304
Epoch 12/25
562/562 [==============================] - 1463s 3s/step - loss: 0.8422 - accuracy: 0.7352 - val_loss: 0.8481 - val_accuracy: 0.7307
Epoch 13/25
562/562 [==============================] - 1463s 3s/step - loss: 0.8511 - accuracy: 0.7345 - val_loss: 0.8477 - val_accuracy: 0.7307
Epoch 14/25
562/562 [==============================] - 1462s 3s/step - loss: 0.8314 - accuracy: 0.7387 - val_loss: 0.8528 - val_accuracy: 0.7300
Epoch 15/25
73/562 [==>...........................] - ETA: 17:12 - loss: 0.8388 - accuracy: 0.7344
即使在几个 epoch 之后,验证准确度也没有提高超过 73%。在早期的试验中,我尝试了 0.001 的学习率,但情况相同,没有任何改进。
- 请求建议以提高模型的准确性。
- 另外,当我们使用图像生成器进行预处理时,如何结合使用网格搜索?同样欢迎相关建议。提前非常感谢。
解决方案
您的问题很可能是由过拟合造成的:您的数据非常不平衡。因此,除了寻找更好的模型、更好的学习率或更好的优化器之外,您还可以创建一个自定义生成器,以更平衡的方式扩充和选择您的数据。
我在工作中的大多数模型都使用自定义生成器。我无法分享生成器的完整代码,但下面用一个伪代码示例演示如何创建一个。动手调整并添加更多步骤其实很有趣——你可以(而且可能应该)添加预处理和后处理步骤。希望这段代码能让你对整个流程有一个整体的了解。
import random
import numpy as np
class myCostumGenerator:
    """Class-balanced data generator skeleton for Keras.

    Rather than sampling with the dataset's (imbalanced) class frequencies,
    yieldData() round-robins over the class names and draws one random
    example per class, so every class is equally represented in a batch.
    This is pseudo-code: loadData(), labelBinarizer() and augment() must
    be filled in by the user.

    Fixes vs. the original snippet: the method was defined as 'yeildData'
    but called as 'self.yieldData()', and getEmptyBatch read 'self.dataset'
    while __init__ set 'self.dataSet' — both raised AttributeError.
    """

    def __init__(self) -> None:
        # Load the dataset into a dict; if it's too big, store filenames
        # and load the images lazily at runtime.
        # Each key is a class name; each value is a list of images (or filenames).
        self.dataSet, self.imageHeight, self.imageWidth, self.imageChannels = loadData()

    def labelBinarizer(self, label):
        # Convert a class name into the target vector Y (e.g. one-hot).
        pass

    def augment(self, image):
        # Apply augmentation to a single image.
        pass

    def yieldData(self):
        # Keras generators must run forever, hence the infinite loop.
        while True:
            # Cycling over classes gives each class equal representation.
            for className, data in self.dataSet.items():
                yield self.augment(random.choice(data)), self.labelBinarizer(className)

    def getEmptyBatch(self, batchSize):
        # Returns (X buffer, Y buffer, write index starting at 0).
        return (
            np.empty([batchSize, self.imageHeight, self.imageWidth, self.imageChannels]),
            np.empty([batchSize, len(self.dataSet.keys())]),
            0)

    def getBatches(self, batchSize):
        # Pack the per-sample stream from yieldData() into fixed-size batches.
        X, Y, i = self.getEmptyBatch(batchSize)
        for image, label in self.yieldData():
            X[i, ...] = image
            Y[i, ...] = label
            i += 1
            if i == batchSize:
                yield X, Y
                X, Y, i = self.getEmptyBatch(batchSize)
# your model definition and other stuff
# ...
# ...
# ...
# with this method of defining a generator, you have to set number of steps per epoch
# (the generator is infinite, so Keras cannot infer an epoch length by itself)
generator = myCostumGenerator()
model.fit(
generator.getBatches(batchSize=256),
steps_per_epoch = 500
# other params
)