python - 如何让这个 OCR 模型与可变长度示例一起工作？

问题描述

这是keras 文档中 OCR示例的修改版本。首先，您需要下载输入数据，这是一个包含 1000 张具有固定长度 (5) 验证码的照片的文件夹。

curl -LO https://github.com/AakashKumarNain/CaptchaCracker/raw/master/captcha_images_v2.zip
unzip -qq captcha_images_v2.zip

这是我需要调整的版本

from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import (LSTM, Bidirectional, Conv2D, Dense,
                                     Dropout, Input, Layer, MaxPooling2D,
                                     Reshape)
from tensorflow.keras.optimizers import Adam
from tensorflow.python.keras.layers import StringLookup


class TrainManager:
    def __init__(self, src, image_width=200, image_height=50, batch_size=16):
        src = Path(src)
        self.image_width = image_width
        self.image_height = image_height
        self.batch_size = batch_size
        self.images = []
        self.labels = []
        for image in src.glob('*.png'):
            self.labels.append(image.stem)
            self.images.append(image.as_posix())
        self.max_label_length = len(max(self.labels, key=len))
        self.characters = sorted(set(''.join(self.labels)))
        self.char_to_num = StringLookup(vocabulary=self.characters, mask_token=None)
        self.num_to_char = StringLookup(
            vocabulary=self.char_to_num.get_vocabulary(), mask_token=None, invert=True
        )

    def encode_sample(self, img_path, label):
        img = tf.io.read_file(img_path)
        img = tf.io.decode_png(img, channels=1)
        img = tf.image.convert_image_dtype(img, tf.float32)
        img = tf.image.resize(img, [self.image_height, self.image_width])
        img = tf.transpose(img, perm=[1, 0, 2])
        label = self.char_to_num(
            tf.strings.unicode_split(label, input_encoding='UTF-8')
        )
        return {'image': img, 'label': label}

    def create_dataset(self, x, y, batch_size):
        dataset = tf.data.Dataset.from_tensor_slices((x, y))
        return (
            dataset.map(self.encode_sample, num_parallel_calls=tf.data.AUTOTUNE)
            .batch(batch_size)
            .prefetch(buffer_size=tf.data.AUTOTUNE)
        )

    def create_datasets(self, train_size=0.9, shuffle=True):
        images, labels = np.array(self.images), np.array(self.labels)
        size = len(images)
        indices = np.arange(size)
        if shuffle:
            np.random.shuffle(indices)
        train_samples = int(size * train_size)
        x_train, y_train = (
            images[indices[:train_samples]],
            labels[indices[:train_samples]],
        )
        x_valid, y_valid = (
            images[indices[train_samples:]],
            labels[indices[train_samples:]],
        )
        train_dataset = self.create_dataset(x_train, y_train, self.batch_size)
        valid_dataset = self.create_dataset(x_valid, y_valid, self.batch_size)
        return train_dataset, valid_dataset

    def display_dataset(self, dataset, n_rows=1, n_cols=1, fig_size=(10, 5)):
        _, ax = plt.subplots(n_rows, n_cols, figsize=fig_size)
        for batch in dataset.take(1):
            images = batch['image']
            labels = batch['label']
            for i in range(n_rows * n_cols):
                img = (images[i] * 255).numpy().astype('uint8')
                label = (
                    tf.strings.reduce_join(self.num_to_char(labels[i]))
                    .numpy()
                    .decode('utf-8')
                )
                row = i // n_rows
                col = i % n_cols
                ax[row, col].imshow(img[:, :, 0].T, cmap='gray')
                ax[row, col].set_title(label)
                ax[row, col].axis('off')

    def create_model(self, training=True):
        x0 = Input(
            shape=(self.image_width, self.image_height, 1),
            name='image',
            dtype='float32',
        )
        x = Conv2D(
            32,
            (3, 3),
            activation='relu',
            kernel_initializer='he_normal',
            padding='same',
            name='Conv1',
        )(x0)
        x = MaxPooling2D((2, 2), name='pool1')(x)
        x = Conv2D(
            64,
            (3, 3),
            activation='relu',
            kernel_initializer='he_normal',
            padding='same',
            name='Conv2',
        )(x)
        x = MaxPooling2D((2, 2), name='pool2')(x)
        new_shape = ((self.image_width // 4), (self.image_height // 4) * 64)
        x = Reshape(target_shape=new_shape, name='reshape')(x)
        x = Dense(64, activation='relu', name='dense1')(x)
        x = Dropout(0.2)(x)
        x = Bidirectional(LSTM(128, return_sequences=True, dropout=0.25))(x)
        x = Bidirectional(LSTM(64, return_sequences=True, dropout=0.25))(x)
        x = Dense(
            len(self.char_to_num.get_vocabulary()) + 1,
            activation='softmax',
            name='dense2',
        )(x)
        if not training:
            return Model(x0, x)
        labels = Input(name='label', shape=(None,), dtype='float32')
        output = CTCLayer(name='ctc_loss')(labels, x)
        model = Model(inputs=[x0, labels], outputs=output, name='ocr_model_v1')
        return model

    def decode_batch_predictions(self, pred):
        input_len = np.ones(pred.shape[0]) * pred.shape[1]
        results = tf.keras.backend.ctc_decode(
            pred, input_length=input_len, greedy=True
        )[0][0][:, : self.max_label_length]
        output_text = []
        for result in results:
            result = (
                tf.strings.reduce_join(self.num_to_char(result)).numpy().decode('utf-8')
            )
            output_text.append(result)
        return output_text


class CTCLayer(Layer):
    def __init__(self, name=None):
        super().__init__(name=name)
        self.loss_fn = tf.keras.backend.ctc_batch_cost

    def call(self, y_true, *args, **kwargs):
        y_pred = args[0]
        batch_length = tf.cast(tf.shape(y_true)[0], dtype='int64')
        input_length = tf.cast(tf.shape(y_pred)[1], dtype='int64')
        label_length = tf.cast(tf.shape(y_true)[1], dtype='int64')
        input_length = input_length * tf.ones(shape=(batch_length, 1), dtype='int64')
        label_length = label_length * tf.ones(shape=(batch_length, 1), dtype='int64')
        loss = self.loss_fn(y_true, y_pred, input_length, label_length)
        self.add_loss(loss)
        return y_pred


def main():
    w, h = 200, 50
    manager = TrainManager('captcha_images_v2', w, h)
    print('Number of images found: ', len(manager.images))
    print('Number of labels found: ', len(manager.labels))
    print('Number of unique characters: ', len(manager.characters))
    print('Characters present: ', manager.characters)
    optimizer = Adam()
    m = manager.create_model()
    m.compile(optimizer)
    m.summary()
    early_stopping = EarlyStopping(
        monitor='val_loss', patience=10, restore_best_weights=True
    )
    tr_dataset, val_dataset = manager.create_datasets()
    history = m.fit(
        tr_dataset,
        validation_data=val_dataset,
        epochs=100,
        callbacks=[early_stopping],
    )

它适用于固定长度的验证码，并注意文件的名称是2b827.png, 2bg48.png, 2cegf.png, ...它们各自图像中包含的标签。

如果我修改2b827.png为2b827abcde.png，我会收到以下错误：

2021-10-12 09:14:41.276269: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)
Epoch 1/100
18/59 [========>.....................] - ETA: 8s - loss: 29.0998Traceback (most recent call last):
  File "/Users/user/Desktop/ocr_example.py", line 216, in <module>
    main()
  File "/Users/user/Desktop/ocr_example.py", line 179, in main
    history = m.fit(
  File "/usr/local/lib/python3.9/site-packages/keras/engine/training.py", line 1184, in fit
    tmp_logs = self.train_function(iterator)
  File "/usr/local/lib/python3.9/site-packages/tensorflow/python/eager/def_function.py", line 885, in __call__
    result = self._call(*args, **kwds)
  File "/usr/local/lib/python3.9/site-packages/tensorflow/python/eager/def_function.py", line 917, in _call
    return self._stateless_fn(*args, **kwds)  # pylint: disable=not-callable
  File "/usr/local/lib/python3.9/site-packages/tensorflow/python/eager/function.py", line 3039, in __call__
    return graph_function._call_flat(
  File "/usr/local/lib/python3.9/site-packages/tensorflow/python/eager/function.py", line 1963, in _call_flat
    return self._build_call_outputs(self._inference_function.call(
  File "/usr/local/lib/python3.9/site-packages/tensorflow/python/eager/function.py", line 591, in call
    outputs = execute.execute(
  File "/usr/local/lib/python3.9/site-packages/tensorflow/python/eager/execute.py", line 59, in quick_execute
    tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name,
tensorflow.python.framework.errors_impl.InvalidArgumentError:  Cannot add tensor to the batch: number of elements does not match. Shapes are: [tensor]: [10], [batch]: [5]
     [[node IteratorGetNext (defined at usr/local/lib/python3.9/site-packages/keras/engine/training.py:841) ]] [Op:__inference_train_function_11873]

Errors may have originated from an input operation.
Input Source operations connected to node IteratorGetNext:
 iterator (defined at usr/local/lib/python3.9/site-packages/keras/engine/training.py:1184)

Function call stack:
train_function

我需要修改它以接受和输出可变长度的输入/输出。我认为输入需要根据数据集中包含的最长标签进行填充。

这是一个示例来说明我认为可能有效的方法：假设我们有abc.png、abcde.png和abcdefghij.png、输入以及可能的输出，它们应该具有类似于以下的形式：

abc.png
abcde.png
abcdefghij.png

但是这种方法将仅限于长度为 10 的示例。我预计超过 10 个标签会出现问题。理想的解决方案应该接受任何长度并输出任何长度。这是一个解决相同问题的问题，通过填充标签解决了这个问题，我认为出于我提到的相同原因，这些标签具有不可预见的缺点。

标签： pythontensorflowkerasocrcaptcha

因此，假设您要使用与当前标签不同的更长的标签，其固定长度为 5，正如我在打印时看到的那样max_length：

max_length = max([len(label) for label in labels])
# 5

然后，您需要调整您的功能def encode_sample(self, img_path, label)，create_dataset(self, x, y, batch_size)使标签短于或等于最大长度，这可以是任意的。在这里，我假设 amax_length=20和 0 保留为填充字符：

train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
max_length = 20
for data in train_dataset:
  x,y = data
  data_dict = encode_single_sample(x, y)
  difference = max_length -  data_dict['label'].shape[0]
  if difference != 0:
    padding = np.zeros(difference)
    data_dict['label'] = np.concatenate((data_dict['label'], padding))

我想你应该已经明白了。请注意，您可能还需要调整模型中的输入形状。如果您想避免以这种方式填充，您只需要确保每个单独的批次（即该批次中的所有标签）具有相同的形状。在推理期间，如果您的输入形状保持不变，您可以有任何长度shape=(None,)。

python - 如何让这个 OCR 模型与可变长度示例一起工作？

问题描述

解决方案

推荐阅读