tensorflow - tf.keras fit() 方法不打印预期反馈
问题描述
我创建了自定义 ResNet 模型,如下面的代码片段所示:
class ResNet:
    """Factory for a ResNet-style Keras model assembled from residual modules."""

    @staticmethod
    def residual_module(data, K, stride, chanDim, reduce=False, reg=0.0001, bnEps=2e-5, bnMom=0.9):
        """Apply one residual module: three BN => ReLU => CONV blocks plus a shortcut.

        Args:
            data: input tensor of the module.
            K: number of filters of the final 1x1 CONV; the first two CONVs
                use ``int(K * 0.25)`` filters. Inside this method the name
                shadows any module-level ``K``.
            stride: strides of the 3x3 CONV (and of the shortcut CONV when
                ``reduce`` is true).
            chanDim: axis handed to every BatchNormalization layer.
            reduce: when true, the shortcut becomes a strided 1x1 CONV applied
                to the first activation instead of the raw input.
            reg: L2 regularization factor applied to every CONV kernel.
            bnEps: epsilon of the BatchNormalization layers.
            bnMom: momentum of the BatchNormalization layers.

        Returns:
            The sum of the last CONV output and the shortcut branch.
        """
        # the shortcut branch of the ResNet module is initialized
        # as the input (identity) data
        shortcut = data

        # the first block of the ResNet module are the 1x1 CONVs
        bn1 = BatchNormalization(axis=chanDim, epsilon=bnEps, momentum=bnMom)(data)
        act1 = Activation("relu")(bn1)
        conv1 = Conv2D(int(K * 0.25), (1, 1), use_bias=False,
                       kernel_regularizer=l2(reg))(act1)

        # the second block of the ResNet module are the 3x3 CONVs
        bn2 = BatchNormalization(axis=chanDim, epsilon=bnEps, momentum=bnMom)(conv1)
        act2 = Activation("relu")(bn2)
        conv2 = Conv2D(int(K * 0.25), (3, 3), strides=stride, padding="same",
                       use_bias=False, kernel_regularizer=l2(reg))(act2)

        # the third block of the ResNet module is another set of 1x1 CONVs
        bn3 = BatchNormalization(axis=chanDim, epsilon=bnEps, momentum=bnMom)(conv2)
        act3 = Activation("relu")(bn3)
        conv3 = Conv2D(K, (1, 1), use_bias=False, kernel_regularizer=l2(reg))(act3)

        # if we are to reduce the spatial size, apply a CONV layer to the
        # shortcut so its shape matches the main branch
        if reduce:
            shortcut = Conv2D(K, (1, 1), strides=stride, use_bias=False,
                              kernel_regularizer=l2(reg))(act1)

        # add together the shortcut and the final CONV and return the result
        return add([conv3, shortcut])

    @staticmethod
    def build(width, height, depth, classes, stages, filters, reg=0.0001, bnEps=2e-5, bnMom=0.9):
        """Construct the full network and return it as a Keras ``Model``.

        Args:
            width: input image width.
            height: input image height.
            depth: number of input channels.
            classes: number of units of the final softmax classifier.
            stages: sequence whose i-th entry is the number of residual
                modules in stage i.
            filters: sequence of filter counts; ``filters[0]`` is used by the
                first CONV layer and ``filters[i + 1]`` by stage i.
            reg: L2 regularization factor, applied to the first CONV, to every
                residual module and to the final Dense layer.
            bnEps: epsilon of the BatchNormalization layers.
            bnMom: momentum of the BatchNormalization layers.

        Returns:
            A ``Model`` named "resnet" mapping the input to class probabilities.
        """
        # initialize the input shape to be "channels last" and the
        # channels dimension itself
        inputShape = (height, width, depth)
        chanDim = -1

        # if we are using "channels first", update the input shape and
        # channels dimension
        if K.image_data_format() == "channels_first":
            inputShape = (depth, height, width)
            chanDim = 1

        # set the input and apply BN
        inputs = Input(shape=inputShape)
        x = BatchNormalization(axis=chanDim, epsilon=bnEps, momentum=bnMom)(inputs)

        # apply a single CONV layer
        x = Conv2D(filters[0], (3, 3), use_bias=False, padding="same",
                   kernel_regularizer=l2(reg))(x)

        # loop over the stages
        for i, num_modules in enumerate(stages):
            # the first stage keeps the spatial size; later stages halve it
            stride = (1, 1) if i == 0 else (2, 2)

            # the first module of a stage uses a CONV shortcut; `reg` is
            # forwarded so the caller's regularization strength is honored
            # inside the residual modules too
            x = ResNet.residual_module(x, filters[i + 1], stride, chanDim,
                                       reduce=True, reg=reg, bnEps=bnEps, bnMom=bnMom)

            # remaining modules of the stage use the identity shortcut
            for _ in range(num_modules - 1):
                x = ResNet.residual_module(x, filters[i + 1], (1, 1), chanDim,
                                           reg=reg, bnEps=bnEps, bnMom=bnMom)

        # apply BN => ACT => POOL
        x = BatchNormalization(axis=chanDim, epsilon=bnEps, momentum=bnMom)(x)
        x = Activation("relu")(x)
        x = AveragePooling2D((8, 8))(x)

        # softmax classifier
        x = Flatten()(x)
        x = Dense(classes, kernel_regularizer=l2(reg))(x)
        x = Activation("softmax")(x)

        # create the model and return the constructed network architecture
        return Model(inputs, x, name="resnet")
我正在尝试以下列方式训练它:
# Number of examples in each split and the batch size fed to the model.
TRAINING_SIZE = 9287
VALIDATION_SIZE = 1194
BATCH_SIZE = 32

# Autotune setting handed to the tf.data.Dataset pipeline stages.
AUTO = tf.data.experimental.AUTOTUNE

# TFRecord files holding the training and validation examples.
training_data_path = os.path.join(
    "TFRecords", "Training", "patch_classifier_0.tfrecords"
)
validation_data_path = os.path.join(
    "TFRecords", "Validation", "patch_classifier_0.tfrecords"
)
def get_batched_dataset(filenames):
    """Build the input pipeline for the TFRecord file(s) matching *filenames*.

    The records are read through a parallel interleave with deterministic
    ordering disabled, decoded with ``_decode``, repeated indefinitely,
    shuffled with a 1024-element buffer, grouped into full batches of
    ``BATCH_SIZE`` and prefetched.

    Returns:
        The resulting ``tf.data.Dataset``.
    """
    # ordering is not needed, so allow tf.data to emit elements out of order
    unordered = tf.data.Options()
    unordered.experimental_deterministic = False

    return (
        tf.data.Dataset.list_files(filenames)
        .with_options(unordered)
        .interleave(tf.data.TFRecordDataset, cycle_length=16, num_parallel_calls=AUTO)
        .map(_decode, num_parallel_calls=AUTO)
        .repeat()
        .shuffle(1024)
        .batch(BATCH_SIZE, drop_remainder=True)
        .prefetch(AUTO)
    )
# `learning_rate` is the supported keyword of tf.keras optimizers; `lr` is a
# deprecated alias.
opt = SGD(learning_rate=1e-1)
model = ResNet.build(299, 299, 1, 5, (9, 9, 9), (64, 64, 128, 256), reg=0.005)
model.compile(loss="categorical_crossentropy", optimizer=opt, metrics=["accuracy"])

# directory receiving the checkpoints; created if it does not exist yet
model_checkpoint_path = "patch_classifier_checkpoint"
os.makedirs(model_checkpoint_path, exist_ok=True)

# keep only the checkpoints that improve the validation loss
callbacks = [
    keras.callbacks.ModelCheckpoint(
        filepath=os.path.join(model_checkpoint_path, 'path_classifier_{epoch}'),
        save_best_only=True,
        monitor='val_loss',
        verbose=1),
]


def compute_steps_per_epoch(x):
    """Return the number of batches of BATCH_SIZE needed to cover *x* examples."""
    return int(math.ceil(1. * x / BATCH_SIZE))


steps_per_epoch = compute_steps_per_epoch(TRAINING_SIZE)
val_steps = compute_steps_per_epoch(VALIDATION_SIZE)

# the callbacks must be handed to fit(); otherwise ModelCheckpoint never runs
history = model.fit(
    get_batched_dataset(training_data_path),
    steps_per_epoch=steps_per_epoch,
    epochs=10,
    validation_data=get_batched_dataset(validation_data_path),
    validation_steps=val_steps,
    callbacks=callbacks,
)
但是,每次开始训练时,我只能看到下面这一行输出,大约 3 分钟后 Jupyter Notebook 内核就会崩溃:
Epoch 1/10
我期望看到当前处理到的批次编号、模型准确率等信息。有人知道可能出了什么问题吗?
顺便说一句,我使用的是最新的 Tensorflow 版本。
解决方案
我也遇到过这种问题。第一次出现时,我注意到 Notebook 中还有其他内核在占用 GPU。后来问题再次出现,这次我发现是批量大小(batch size)导致的。总而言之,是我的电脑算力不足(MSI GTX 1080 Ti)。如果 Notebook 中没有打开其他内核,您可以把超参数(例如批量大小)设置得非常低,以判断问题是否由计算能力不足引起。希望对您有所帮助。
推荐阅读
- laravel - Laravel 默认认证登录请求
- python - 有助于 Flask 分页
- javascript - 在 moch 单元测试文件中重置 module.exports 对象
- javascript - 字符串操作,删除字符直到在 java 脚本中第二次出现
- node.js - 如何在node-rdkafka中一一读取消息
- javascript - 每次单击时更改按钮颜色 [Javascript]
- c# - 如何在 Automapper 中从源代码制作部分地图
- qt - Rapsbian 和 Qt5
- kubernetes - 如何仅由操作员的一个实例处理 K8S CRD CRUD 请求
- javascript - 离子 4 基本 ngClass 没有按预期工作