首页 > 解决方案 > tf.keras fit() 方法不打印预期反馈

问题描述

我创建了自定义 ResNet 模型,如下面的代码片段所示:

class ResNet:
    """Namespace for building a ResNet-style Keras model from bottleneck residual modules."""

    @staticmethod
    def residual_module(data, K, stride, chanDim, reduce=False, reg=0.0001, bnEps=2e-5, bnMom=0.9):
        """Apply one bottleneck residual module to ``data`` and return the result.

        The main branch is three BN => ReLU => CONV blocks (1x1, 3x3, 1x1).
        The first two CONVs use ``int(K * 0.25)`` filters and the last one
        uses ``K`` filters. The branch output is added to the shortcut.

        Args:
            data: input tensor of the module.
            K: number of filters learned by the final 1x1 CONV (and by the
                shortcut CONV when ``reduce`` is set).
            stride: strides of the 3x3 CONV and of the shortcut CONV.
            chanDim: axis handed to every BatchNormalization layer.
            reduce: when true, the shortcut is a strided 1x1 CONV applied to
                the first activation instead of the unmodified input.
            reg: L2 regularization factor for every CONV kernel.
            bnEps: epsilon for every BatchNormalization layer.
            bnMom: momentum for every BatchNormalization layer.

        Returns:
            The element-wise sum of the main branch and the shortcut.
        """
        # the shortcut branch of the ResNet module is initialized
        # as the input (identity) data
        shortcut = data

        # the first block of the ResNet module is the 1x1 CONVs
        bn1 = BatchNormalization(axis=chanDim, epsilon=bnEps, momentum=bnMom)(data)
        act1 = Activation("relu")(bn1)
        conv1 = Conv2D(int(K * 0.25), (1, 1), use_bias=False, kernel_regularizer=l2(reg))(act1)

        # the second block of the ResNet module is the 3x3 CONVs
        bn2 = BatchNormalization(axis=chanDim, epsilon=bnEps, momentum=bnMom)(conv1)
        act2 = Activation("relu")(bn2)
        conv2 = Conv2D(int(K * 0.25), (3, 3), strides=stride, padding="same", use_bias=False,
                       kernel_regularizer=l2(reg))(act2)

        # the third block of the ResNet module is another set of 1x1 CONVs
        bn3 = BatchNormalization(axis=chanDim, epsilon=bnEps, momentum=bnMom)(conv2)
        act3 = Activation("relu")(bn3)
        conv3 = Conv2D(K, (1, 1), use_bias=False, kernel_regularizer=l2(reg))(act3)

        # if we are to reduce the spatial size, apply a CONV layer to the
        # shortcut so that it matches the main branch before the addition
        if reduce:
            shortcut = Conv2D(K, (1, 1), strides=stride, use_bias=False, kernel_regularizer=l2(reg))(act1)

        # add together the shortcut and the final CONV and return the sum
        # as the output of the ResNet module
        return add([conv3, shortcut])

    @staticmethod
    def build(width, height, depth, classes, stages, filters, reg=0.0001, bnEps=2e-5, bnMom=0.9):
        """Construct the full network and return it as a Keras ``Model``.

        Args:
            width: input image width.
            height: input image height.
            depth: number of input channels.
            classes: number of units in the final softmax classifier.
            stages: sequence giving the number of residual modules per stage.
            filters: sequence of filter counts; ``filters[0]`` is used by the
                initial CONV and ``filters[i + 1]`` by stage ``i``, so it
                needs ``len(stages) + 1`` entries.
            reg: L2 regularization factor applied to every CONV and Dense
                kernel, including those inside the residual modules.
            bnEps: epsilon for every BatchNormalization layer.
            bnMom: momentum for every BatchNormalization layer.

        Returns:
            The constructed ``Model`` named ``"resnet"``.
        """
        # initialize the input shape to be "channels last" and the
        # channels dimension itself
        inputShape = (height, width, depth)
        chanDim = -1

        # if we are using "channels first", update the input shape and channels dimension
        if K.image_data_format() == "channels_first":
            inputShape = (depth, height, width)
            chanDim = 1

        # set the input and apply BN
        inputs = Input(shape=inputShape)
        x = BatchNormalization(axis=chanDim, epsilon=bnEps, momentum=bnMom)(inputs)
        # apply a single CONV layer
        x = Conv2D(filters[0], (3, 3), use_bias=False, padding="same", kernel_regularizer=l2(reg))(x)

        # loop over the stages
        for i, numLayers in enumerate(stages):
            # the first module of every stage uses a CONV shortcut; from the
            # second stage on it also halves the spatial size via its stride
            stride = (1, 1) if i == 0 else (2, 2)
            # forward `reg` explicitly: otherwise the residual modules fall
            # back to their own default and ignore the value given to build()
            x = ResNet.residual_module(x, filters[i + 1], stride, chanDim, reduce=True,
                                       reg=reg, bnEps=bnEps, bnMom=bnMom)

            # the remaining modules of the stage keep the spatial size
            for _ in range(numLayers - 1):
                x = ResNet.residual_module(x, filters[i + 1], (1, 1), chanDim,
                                           reg=reg, bnEps=bnEps, bnMom=bnMom)

        # apply BN => ACT => POOL
        x = BatchNormalization(axis=chanDim, epsilon=bnEps, momentum=bnMom)(x)
        x = Activation("relu")(x)
        x = AveragePooling2D((8, 8))(x)

        # softmax classifier
        x = Flatten()(x)
        x = Dense(classes, kernel_regularizer=l2(reg))(x)
        x = Activation("softmax")(x)

        # create the model and return the constructed network architecture
        return Model(inputs, x, name="resnet")

我正在尝试以下列方式训练它:

# TFRecord files that hold the training and validation examples.
training_data_path = os.path.join("TFRecords", "Training", "patch_classifier_0.tfrecords")
validation_data_path = os.path.join("TFRecords", "Validation", "patch_classifier_0.tfrecords")

# Example counts per split; used to derive the number of steps per epoch.
# NOTE(review): presumably the number of records in the files above -- confirm.
TRAINING_SIZE = 9287
VALIDATION_SIZE = 1194

BATCH_SIZE = 32
# Lets the tf.data.Dataset API tune parallelism and buffer sizes at runtime.
AUTO = tf.data.experimental.AUTOTUNE

def get_batched_dataset(filenames):
    """Build the batched input pipeline for the TFRecord files matching ``filenames``.

    The pipeline lists the files, reads them interleaved without enforcing a
    deterministic order, decodes each record with ``_decode``, repeats
    indefinitely, shuffles with a 1024-element buffer, groups the examples
    into full batches of ``BATCH_SIZE`` and prefetches.

    Because of ``repeat()`` the returned dataset never ends, so the consumer
    has to bound each epoch itself (e.g. with a step count).
    """
    # Trade ordering guarantees for throughput when reading in parallel.
    unordered = tf.data.Options()
    unordered.experimental_deterministic = False

    return (
        tf.data.Dataset.list_files(filenames)
        .with_options(unordered)
        .interleave(tf.data.TFRecordDataset, cycle_length=16, num_parallel_calls=AUTO)
        .map(_decode, num_parallel_calls=AUTO)
        .repeat()
        .shuffle(1024)
        # drop_remainder=True keeps every batch at exactly BATCH_SIZE examples
        .batch(BATCH_SIZE, drop_remainder=True)
        .prefetch(AUTO)
    )

# `lr` is a deprecated alias that newer Keras releases reject;
# `learning_rate` is the supported argument name.
opt = SGD(learning_rate=1e-1)

model = ResNet.build(299, 299, 1, 5, (9, 9, 9), (64, 64, 128, 256), reg=0.005)
model.compile(loss="categorical_crossentropy", optimizer=opt, metrics=["accuracy"])

# Directory that receives the checkpoints; created if it does not exist yet.
model_checkpoint_path = "patch_classifier_checkpoint"
os.makedirs(model_checkpoint_path, exist_ok=True)

# Save the model whenever the validation loss improves.
callbacks = [
    keras.callbacks.ModelCheckpoint(
        filepath=os.path.join(model_checkpoint_path, 'path_classifier_{epoch}'),
        save_best_only=True,
        monitor='val_loss',
        verbose=1),
]


def compute_steps_per_epoch(x):
    """Return how many batches of BATCH_SIZE are needed to cover ``x`` examples."""
    return int(math.ceil(x / BATCH_SIZE))


steps_per_epoch = compute_steps_per_epoch(TRAINING_SIZE)
val_steps = compute_steps_per_epoch(VALIDATION_SIZE)

# Both datasets repeat forever, so the step counts define the epoch length.
# The callbacks list must be passed explicitly, otherwise no checkpoint is
# ever written.
history = model.fit(
    get_batched_dataset(training_data_path),
    steps_per_epoch=steps_per_epoch,
    epochs=10,
    validation_data=get_batched_dataset(validation_data_path),
    validation_steps=val_steps,
    callbacks=callbacks,
)

但是,每次开始训练时,我只能看到下面这一行输出,大约 3 分钟后 Jupyter Notebook 内核就会崩溃:

Epoch 1/10

我期望看到当前正在处理的批次编号、模型准确率等信息。有人知道可能出了什么问题吗?

顺便说一句,我使用的是最新的 Tensorflow 版本。

标签: tensorflow, keras, deep-learning

解决方案


我也遇到过这种问题。第一次出现时,我注意到 Notebook 中还有其他内核在占用 GPU 资源。后来问题再次出现,这次我发现是由批量大小(batch size)引起的。总而言之,是我的电脑算力不足(MSI GTX 1080 Ti)。如果 Notebook 中没有打开其他内核,您可以把超参数(例如批量大小)设置到非常低的水平,以确认问题是否是由计算能力不足引起的。希望对您有所帮助。


推荐阅读