python - 如何让这个 OCR 模型与可变长度示例一起工作?
问题描述
这是keras 文档中 OCR示例的修改版本。首先,您需要下载输入数据,这是一个包含 1000 张具有固定长度 (5) 验证码的照片的文件夹。
curl -LO https://github.com/AakashKumarNain/CaptchaCracker/raw/master/captcha_images_v2.zip
unzip -qq captcha_images_v2.zip
这是我需要调整的版本
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import (LSTM, Bidirectional, Conv2D, Dense,
Dropout, Input, Layer, MaxPooling2D,
Reshape)
from tensorflow.keras.optimizers import Adam
from tensorflow.python.keras.layers import StringLookup
class TrainManager:
def __init__(self, src, image_width=200, image_height=50, batch_size=16):
src = Path(src)
self.image_width = image_width
self.image_height = image_height
self.batch_size = batch_size
self.images = []
self.labels = []
for image in src.glob('*.png'):
self.labels.append(image.stem)
self.images.append(image.as_posix())
self.max_label_length = len(max(self.labels, key=len))
self.characters = sorted(set(''.join(self.labels)))
self.char_to_num = StringLookup(vocabulary=self.characters, mask_token=None)
self.num_to_char = StringLookup(
vocabulary=self.char_to_num.get_vocabulary(), mask_token=None, invert=True
)
def encode_sample(self, img_path, label):
img = tf.io.read_file(img_path)
img = tf.io.decode_png(img, channels=1)
img = tf.image.convert_image_dtype(img, tf.float32)
img = tf.image.resize(img, [self.image_height, self.image_width])
img = tf.transpose(img, perm=[1, 0, 2])
label = self.char_to_num(
tf.strings.unicode_split(label, input_encoding='UTF-8')
)
return {'image': img, 'label': label}
def create_dataset(self, x, y, batch_size):
dataset = tf.data.Dataset.from_tensor_slices((x, y))
return (
dataset.map(self.encode_sample, num_parallel_calls=tf.data.AUTOTUNE)
.batch(batch_size)
.prefetch(buffer_size=tf.data.AUTOTUNE)
)
def create_datasets(self, train_size=0.9, shuffle=True):
images, labels = np.array(self.images), np.array(self.labels)
size = len(images)
indices = np.arange(size)
if shuffle:
np.random.shuffle(indices)
train_samples = int(size * train_size)
x_train, y_train = (
images[indices[:train_samples]],
labels[indices[:train_samples]],
)
x_valid, y_valid = (
images[indices[train_samples:]],
labels[indices[train_samples:]],
)
train_dataset = self.create_dataset(x_train, y_train, self.batch_size)
valid_dataset = self.create_dataset(x_valid, y_valid, self.batch_size)
return train_dataset, valid_dataset
def display_dataset(self, dataset, n_rows=1, n_cols=1, fig_size=(10, 5)):
_, ax = plt.subplots(n_rows, n_cols, figsize=fig_size)
for batch in dataset.take(1):
images = batch['image']
labels = batch['label']
for i in range(n_rows * n_cols):
img = (images[i] * 255).numpy().astype('uint8')
label = (
tf.strings.reduce_join(self.num_to_char(labels[i]))
.numpy()
.decode('utf-8')
)
row = i // n_rows
col = i % n_cols
ax[row, col].imshow(img[:, :, 0].T, cmap='gray')
ax[row, col].set_title(label)
ax[row, col].axis('off')
def create_model(self, training=True):
x0 = Input(
shape=(self.image_width, self.image_height, 1),
name='image',
dtype='float32',
)
x = Conv2D(
32,
(3, 3),
activation='relu',
kernel_initializer='he_normal',
padding='same',
name='Conv1',
)(x0)
x = MaxPooling2D((2, 2), name='pool1')(x)
x = Conv2D(
64,
(3, 3),
activation='relu',
kernel_initializer='he_normal',
padding='same',
name='Conv2',
)(x)
x = MaxPooling2D((2, 2), name='pool2')(x)
new_shape = ((self.image_width // 4), (self.image_height // 4) * 64)
x = Reshape(target_shape=new_shape, name='reshape')(x)
x = Dense(64, activation='relu', name='dense1')(x)
x = Dropout(0.2)(x)
x = Bidirectional(LSTM(128, return_sequences=True, dropout=0.25))(x)
x = Bidirectional(LSTM(64, return_sequences=True, dropout=0.25))(x)
x = Dense(
len(self.char_to_num.get_vocabulary()) + 1,
activation='softmax',
name='dense2',
)(x)
if not training:
return Model(x0, x)
labels = Input(name='label', shape=(None,), dtype='float32')
output = CTCLayer(name='ctc_loss')(labels, x)
model = Model(inputs=[x0, labels], outputs=output, name='ocr_model_v1')
return model
def decode_batch_predictions(self, pred):
input_len = np.ones(pred.shape[0]) * pred.shape[1]
results = tf.keras.backend.ctc_decode(
pred, input_length=input_len, greedy=True
)[0][0][:, : self.max_label_length]
output_text = []
for result in results:
result = (
tf.strings.reduce_join(self.num_to_char(result)).numpy().decode('utf-8')
)
output_text.append(result)
return output_text
class CTCLayer(Layer):
def __init__(self, name=None):
super().__init__(name=name)
self.loss_fn = tf.keras.backend.ctc_batch_cost
def call(self, y_true, *args, **kwargs):
y_pred = args[0]
batch_length = tf.cast(tf.shape(y_true)[0], dtype='int64')
input_length = tf.cast(tf.shape(y_pred)[1], dtype='int64')
label_length = tf.cast(tf.shape(y_true)[1], dtype='int64')
input_length = input_length * tf.ones(shape=(batch_length, 1), dtype='int64')
label_length = label_length * tf.ones(shape=(batch_length, 1), dtype='int64')
loss = self.loss_fn(y_true, y_pred, input_length, label_length)
self.add_loss(loss)
return y_pred
def main():
w, h = 200, 50
manager = TrainManager('captcha_images_v2', w, h)
print('Number of images found: ', len(manager.images))
print('Number of labels found: ', len(manager.labels))
print('Number of unique characters: ', len(manager.characters))
print('Characters present: ', manager.characters)
optimizer = Adam()
m = manager.create_model()
m.compile(optimizer)
m.summary()
early_stopping = EarlyStopping(
monitor='val_loss', patience=10, restore_best_weights=True
)
tr_dataset, val_dataset = manager.create_datasets()
history = m.fit(
tr_dataset,
validation_data=val_dataset,
epochs=100,
callbacks=[early_stopping],
)
它适用于固定长度的验证码,并注意文件的名称是2b827.png, 2bg48.png, 2cegf.png, ...
它们各自图像中包含的标签。
如果我修改2b827.png
为2b827abcde.png
,我会收到以下错误:
2021-10-12 09:14:41.276269: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)
Epoch 1/100
18/59 [========>.....................] - ETA: 8s - loss: 29.0998Traceback (most recent call last):
File "/Users/user/Desktop/ocr_example.py", line 216, in <module>
main()
File "/Users/user/Desktop/ocr_example.py", line 179, in main
history = m.fit(
File "/usr/local/lib/python3.9/site-packages/keras/engine/training.py", line 1184, in fit
tmp_logs = self.train_function(iterator)
File "/usr/local/lib/python3.9/site-packages/tensorflow/python/eager/def_function.py", line 885, in __call__
result = self._call(*args, **kwds)
File "/usr/local/lib/python3.9/site-packages/tensorflow/python/eager/def_function.py", line 917, in _call
return self._stateless_fn(*args, **kwds) # pylint: disable=not-callable
File "/usr/local/lib/python3.9/site-packages/tensorflow/python/eager/function.py", line 3039, in __call__
return graph_function._call_flat(
File "/usr/local/lib/python3.9/site-packages/tensorflow/python/eager/function.py", line 1963, in _call_flat
return self._build_call_outputs(self._inference_function.call(
File "/usr/local/lib/python3.9/site-packages/tensorflow/python/eager/function.py", line 591, in call
outputs = execute.execute(
File "/usr/local/lib/python3.9/site-packages/tensorflow/python/eager/execute.py", line 59, in quick_execute
tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name,
tensorflow.python.framework.errors_impl.InvalidArgumentError: Cannot add tensor to the batch: number of elements does not match. Shapes are: [tensor]: [10], [batch]: [5]
[[node IteratorGetNext (defined at usr/local/lib/python3.9/site-packages/keras/engine/training.py:841) ]] [Op:__inference_train_function_11873]
Errors may have originated from an input operation.
Input Source operations connected to node IteratorGetNext:
iterator (defined at usr/local/lib/python3.9/site-packages/keras/engine/training.py:1184)
Function call stack:
train_function
我需要修改它以接受和输出可变长度的输入/输出。我认为输入需要根据数据集中包含的最长标签进行填充。
这是一个示例来说明我认为可能有效的方法:假设我们有abc.png
、abcde.png
和abcdefghij.png
、 输入以及可能的输出,它们应该具有类似于以下的形式:
abc.png
abcde.png
abcdefghij.png
但是这种方法将仅限于长度为 10 的示例。我预计超过 10 个标签会出现问题。理想的解决方案应该接受任何长度并输出任何长度。这是一个解决相同问题的问题,通过填充标签解决了这个问题,我认为出于我提到的相同原因,这些标签具有不可预见的缺点。
解决方案
因此,假设您要使用与当前标签不同的更长的标签,其固定长度为 5,正如我在打印时看到的那样max_length
:
max_length = max([len(label) for label in labels])
# 5
然后,您需要调整您的功能def encode_sample(self, img_path, label)
,create_dataset(self, x, y, batch_size)
使标签短于或等于最大长度,这可以是任意的。在这里,我假设 amax_length=20
和 0 保留为填充字符:
train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
max_length = 20
for data in train_dataset:
x,y = data
data_dict = encode_single_sample(x, y)
difference = max_length - data_dict['label'].shape[0]
if difference != 0:
padding = np.zeros(difference)
data_dict['label'] = np.concatenate((data_dict['label'], padding))
我想你应该已经明白了。请注意,您可能还需要调整模型中的输入形状。如果您想避免以这种方式填充,您只需要确保每个单独的批次(即该批次中的所有标签)具有相同的形状。在推理期间,如果您的输入形状保持不变,您可以有任何长度shape=(None,)
。
推荐阅读
- syntax - ansible playbook中的if else语法
- java - 使用 Spring Webclient 下载文件,文件为空
- c - 用 C 编程时出现跟踪/断点陷阱问题
- python - 如果在 PYTHON 3.8 中使用缩进编写,为什么无法读取 JSON
- anylogic - “服务器没有响应。连接超时”有什么影响。Anylogic中的错误?
- geocoding - Google - 将纬度、经度翻译成 Geotarget 位置
- java - 创建空数据框 Java Spark
- google-apps-script - 如何解决自动分配号码和电子邮件触发器的错误?
- python - 我应该如何通过 python django 从数据库中读取特定用户的特定数据
- class - libGDX 类关系