python - CRNN model for word data learns nothing after 10 epochs on 932 training images
Problem Description
I am trying to predict words from images of Bangla words using a CRNN approach. I am training it in a local environment on 932 images for 10 epochs. The loss is decreasing, but the accuracy is 0%; the model appears to be making random guesses rather than learning anything. Here is my model:
import torch.nn as nn

class BidirectionalLSTM(nn.Module):
    def __init__(self, number_of_input, number_of_hidden, number_of_out):
        super(BidirectionalLSTM, self).__init__()
        self.rnn = nn.LSTM(number_of_input, number_of_hidden, bidirectional=True)
        self.embedding = nn.Linear(number_of_hidden * 2, number_of_out)

    def forward(self, input):
        recurrent, _ = self.rnn(input)
        T, b, h = recurrent.size()
        t_rec = recurrent.view(T * b, h)  # flatten time and batch for the linear layer
        output = self.embedding(t_rec)    # [T * b, nOut]
        output = output.view(T, b, -1)    # restore [T, b, nOut]
        return output
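For reference, the module maps a [T, b, nIn] sequence to [T, b, nOut]. A minimal shape check (the sizes here are made up purely for illustration):

import torch

# illustrative sizes only: 26 time steps, batch of 2, 512 input features
rnn = BidirectionalLSTM(512, 256, 64)
x = torch.randn(26, 2, 512)  # [T, b, nIn]
y = rnn(x)
print(y.size())              # expected: torch.Size([26, 2, 64])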
class CRNN(nn.Module):
    def __init__(self, imgH, nc, nclass, nh, n_rnn=2, leakyRelu=False):
        super(CRNN, self).__init__()
        # assert imgH % 16 == 0, 'imgH has to be a multiple of 16'
        ks = [3, 3, 3, 3, 3, 3, 2]               # kernel sizes (original [3, 3, 3, 3, 3, 3, 2])
        ps = [1, 1, 1, 1, 1, 1, 0]               # paddings
        ss = [1, 1, 1, 1, 1, 1, 1]               # strides
        nm = [64, 128, 256, 256, 512, 512, 512]  # output channels per conv layer
        cnn = nn.Sequential()

        def conv_relu(i, batchNormalization=False):
            nIn = nc if i == 0 else nm[i - 1]
            nOut = nm[i]
            cnn.add_module('conv{0}'.format(i),
                           nn.Conv2d(nIn, nOut, ks[i], ss[i], ps[i]))
            if batchNormalization:
                cnn.add_module('batchnorm{0}'.format(i), nn.BatchNorm2d(nOut))
            if leakyRelu:
                cnn.add_module('relu{0}'.format(i),
                               nn.LeakyReLU(0.2, inplace=True))
            else:
                cnn.add_module('relu{0}'.format(i), nn.ReLU(True))

        conv_relu(0)
        cnn.add_module('pooling{0}'.format(0), nn.MaxPool2d(2, 2))  # 64x16x64 original
        conv_relu(1)
        cnn.add_module('pooling{0}'.format(1), nn.MaxPool2d(2, 2))  # 128x8x32 original
        conv_relu(2, True)
        conv_relu(3)
        cnn.add_module('pooling{0}'.format(2),
                       nn.MaxPool2d((2, 2), (2, 1), (0, 1)))  # 256x4x16 original
        conv_relu(4, True)
        conv_relu(5)
        cnn.add_module('pooling{0}'.format(3),
                       nn.MaxPool2d((2, 2), (2, 1), (0, 1)))  # 512x2x16 original
        conv_relu(6, True)  # 512x1x16
        # extra max pool to bring the feature-map height down to 1
        cnn.add_module('pooling{0}'.format(4),
                       nn.MaxPool2d((3, 3), (2, 4), (0, 1)))  # final size 512x2x32

        self.cnn = cnn
        self.rnn = nn.Sequential(
            BidirectionalLSTM(512, nh, nh),
            BidirectionalLSTM(nh, nh, nclass))
    def forward(self, input):
        # conv features
        conv = self.cnn(input)
        b, c, h, w = conv.size()
        # assert h == 1, "the height of conv must be 1"
        conv = conv.squeeze(2)        # drop the height dimension: [b, c, w]
        conv = conv.permute(2, 0, 1)  # [w, b, c]: width becomes the time axis
        # rnn features
        output = self.rnn(conv)
        return output
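For CTC decoding over the width axis, the CNN must reduce the feature-map height to exactly 1 before the squeeze(2) / permute(2, 0, 1) in forward, otherwise the RNN receives a malformed tensor. A quick probe of the feature shape (the 90 x 752 input size is assumed from the padding described at the end of this question, and nclass=100 is a placeholder):

import torch

net = CRNN(imgH=90, nc=3, nclass=100, nh=256)  # nclass=100 is a placeholder
dummy = torch.randn(1, 3, 90, 752)             # [b, c, H, W]
feat = net.cnn(dummy)
print(feat.size())  # dim 2 (height) must be 1 for forward() to work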
Here is my main script:
import torch
from data_preparation.dataloader import DataSetOCR
from utills.dataloader_services import *
from torch.utils.data import DataLoader
import parameters
from models import crnn
import torch.optim as optim
from torch.autograd import Variable
from torch.nn import CTCLoss
from utills.string_label_converter import averager, StrLabelConverter
import torch.nn.functional as F

train_dataset = DataSetOCR(
    csv_file_path=parameters.train_csv_path,
    text_file_path=parameters.text_file_path,
    root_directory=parameters.train_root)
assert train_dataset

test_dataset = DataSetOCR(
    csv_file_path=parameters.test_csv_path,
    text_file_path=parameters.text_file_path,
    root_directory=parameters.test_root)
assert test_dataset

dataloader_params = {
    'batch_size': 2,
    'shuffle': True,
    'collate_fn': my_collate
}
train_loader = DataLoader(train_dataset, **dataloader_params)
train_iter = iter(train_loader)

# custom weights initialization called on crnn
def weights_init(m):
    classname = m.__class__.__name__
    if classname.find('Conv') != -1:
        m.weight.data.normal_(0.0, 0.02)
    elif classname.find('BatchNorm') != -1:
        m.weight.data.normal_(1.0, 0.02)
        m.bias.data.fill_(0)

crnn = crnn.CRNN(parameters.max_image_height, 3, parameters.number_of_classes, 256)
crnn.apply(weights_init)

criterion = CTCLoss()
optimizer = optim.Adam(crnn.parameters(), lr=0.001)
loss_avg = averager()

# note: nn.Conv2d expects (N, C, H, W) ordering
image = torch.FloatTensor(2, 3, parameters.max_image_width, parameters.max_image_height)
text = torch.LongTensor(2 * 5)
length = torch.LongTensor(2)
string_converter = StrLabelConverter()
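It is worth noting the shape contract of nn.CTCLoss, since the cost below is computed as criterion(preds, text, preds_size, length): it expects log-probabilities of shape (T, N, C), targets as a 1-D concatenation or an (N, S) padded tensor, plus per-sample input and target lengths, with index 0 reserved for the blank by default. A minimal self-contained sketch with illustrative sizes:

import torch
from torch.nn import CTCLoss

T, N, C = 48, 2, 100  # time steps, batch size, classes (blank is index 0 by default)
log_probs = torch.randn(T, N, C).log_softmax(2)
targets = torch.randint(1, C, (N, 10), dtype=torch.long)  # real labels must avoid index 0
input_lengths = torch.full((N,), T, dtype=torch.long)
target_lengths = torch.full((N,), 10, dtype=torch.long)
loss = CTCLoss()(log_probs, targets, input_lengths, target_lengths)
print(loss.item())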
def val(net, dataset, criterion, max_iter=100):
    print('Start val')
    for p in crnn.parameters():
        p.requires_grad = False
    net.eval()
    data_loader = torch.utils.data.DataLoader(
        dataset, shuffle=True, batch_size=2, collate_fn=my_collate)
    val_iter = iter(data_loader)
    n_correct = 0
    loss_avg = averager()
    max_iter = min(max_iter, len(data_loader))
    for i in range(max_iter):
        data = next(val_iter)
        images, texts = data
        batch_size = images.size(0)
        loadData(image, images)
        t, l = string_converter.convert_string_to_integer(texts, [])
        loadData(text, t)
        loadData(length, l)
        preds = crnn(image)
        preds = F.log_softmax(preds, 2)
        preds_size = Variable(torch.IntTensor([preds.size(0)] * batch_size))
        cost = criterion(preds, text, preds_size, length) / batch_size
        print('validation {}'.format(cost))
        loss_avg.add(cost)
        _, preds = preds.max(2)  # greedy choice per time step
        preds = preds.squeeze(1)  # original had squeeze(2)
        preds = preds.transpose(1, 0).contiguous().view(-1)
        sim_preds = string_converter.convert_integer_to_string(preds.data, preds_size.data)
        cpu_texts = string_converter.convert_integer_to_string(text, length)
        # for pred, target in zip(sim_preds, cpu_texts):
        #     if pred == target:
        #         n_correct += 1
        #     print((pred, target))
        #
        # raw_preds = string_converter.convert_integer_to_string(preds.data, preds_size.data)[:2]
        # for raw_pred, pred, gt in zip(raw_preds, sim_preds, cpu_texts):
        #     print('%-20s => %-20s, gt: %-20s' % (raw_pred, pred, gt))
        #
        # accuracy = n_correct / float(max_iter * 100)
        # print('Test loss: %f, accuracy: %f' % (loss_avg.val(), accuracy))
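The reported accuracy depends on convert_integer_to_string performing a proper greedy CTC decode. Its implementation is not shown here, but the standard procedure (assuming blank = 0, as nn.CTCLoss defaults to) is: argmax per time step, collapse consecutive repeats, then drop blanks:

def greedy_ctc_decode(indices, blank=0):
    """Collapse consecutive repeats, then drop blanks (standard CTC best-path decoding)."""
    decoded, prev = [], None
    for idx in indices:
        if idx != prev and idx != blank:
            decoded.append(idx)
        prev = idx
    return decoded

# e.g. greedy_ctc_decode([0, 5, 5, 0, 3, 3, 3, 0]) returns [5, 3]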
def trainBatch(train_iter, criterion, optimizer):
    try:
        data = next(train_iter)
    except StopIteration:
        # note: this rebinds only the local name; the caller's iterator stays exhausted
        train_iter = iter(train_loader)
        data = next(train_iter)
    images, texts = data
    loadData(image, images)
    t, l = string_converter.convert_string_to_integer(texts, [])
    loadData(text, t)
    loadData(length, l)
    batch_size = dataloader_params['batch_size']
    optimizer.zero_grad()
    preds = crnn(image)
    preds = F.log_softmax(preds, 2)
    preds_size = Variable(torch.IntTensor([preds.size(0)] * batch_size))
    cost = criterion(preds, text, preds_size, length) / batch_size
    crnn.zero_grad()  # redundant with optimizer.zero_grad() above
    cost.backward()
    optimizer.step()
    return cost
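One caveat with the except branch above: rebinding train_iter inside the function does not affect the caller's variable, so once the loader is exhausted, every later call hits StopIteration again and builds a fresh iterator. A hedged sketch of one way around this (not the original code) is a helper that hands the refreshed iterator back to the caller:

# minimal sketch: return the (possibly refreshed) iterator along with the batch
def next_batch(it, loader):
    """Return (batch, iterator); builds a fresh iterator when the old one is exhausted."""
    try:
        return next(it), it
    except StopIteration:
        it = iter(loader)
        return next(it), it

# usage: data, train_iter = next_batch(train_iter, train_loader)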
for epoch in range(25):
    i = 0
    while i < len(train_loader):
        for p in crnn.parameters():
            p.requires_grad = True
        crnn.train()
        cost = trainBatch(train_iter, criterion, optimizer)
        loss_avg.add(cost)
        i += 1
        if i % 50 == 0:
            print('[%d/%d][%d/%d] Loss: %f' % (epoch, 25, i, len(train_loader), loss_avg.val()))
            loss_avg.reset()
        if i % 50 == 0:
            val(crnn, test_dataset, criterion)
        # do checkpointing
        # if i % 1 == 0:
        #     torch.save(crnn.state_dict(), '{0}/netCRNN_{1}_{2}.pth'.format('/home/bjit-531/PycharmProjects/python/bangla-ocr-version-2/bangla-ocr-version-2/weights/', epoch, i))
I generate the images in the dataset with a variety of fonts, so they have different heights and widths. I pad them during data loading, so the model receives every image at a size of 90 x 752 (a sketch of such a collate function is below). What am I doing wrong here? Why is the model's accuracy not improving?
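For completeness, my_collate itself is not shown above; a minimal sketch of what a padding collate_fn along these lines could look like (the per-sample layout and the 90 x 752 target size are assumptions taken from this question, not the actual implementation):

import torch
import torch.nn.functional as F

def my_collate(batch, target_h=90, target_w=752):
    # assumes each sample is an (image_tensor [C, H, W], text) pair,
    # with every image no larger than the target size
    images, texts = zip(*batch)
    padded = []
    for img in images:
        pad_h, pad_w = target_h - img.size(1), target_w - img.size(2)
        padded.append(F.pad(img, (0, pad_w, 0, pad_h)))  # pad right / bottom edges
    return torch.stack(padded), list(texts)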