CRNN model for word data learns nothing after 10 epochs on 932 training images

Problem description

I am trying to predict words from images of Bengali words using a CRNN approach. I am training it in my local environment on 932 images for 10 epochs. The loss is decreasing, but the accuracy is 0%: the model seems to be making random guesses rather than learning anything. Here is my model:

import torch.nn as nn


class BidirectionalLSTM(nn.Module):

    def __init__(self, number_of_input, number_of_hidden, number_of_out):
        super(BidirectionalLSTM, self).__init__()

        self.rnn = nn.LSTM(number_of_input, number_of_hidden, bidirectional=True)
        self.embedding = nn.Linear(number_of_hidden * 2, number_of_out)

    def forward(self, input):
        recurrent, _ = self.rnn(input)
        T, b, h = recurrent.size()
        # print("size from recurrent forward :{} {} {}".format(T,b,h))
        t_rec = recurrent.view(T * b, h)
        # print("size after recurrent view : {}".format(t_rec.size()))
        output = self.embedding(t_rec)  # [T * b, nOut]

        output = output.view(T, b, -1)
        return output


class CRNN(nn.Module):

    def __init__(self, imgH, nc, nclass, nh, n_rnn=2, leakyRelu=False):
        super(CRNN, self).__init__()
        # assert imgH % 16 == 0, 'imgH has to be a multiple of 16'

        ks = [3, 3, 3, 3, 3, 3, 2]  # original [3, 3, 3, 3, 3, 3, 2]
        ps = [1, 1, 1, 1, 1, 1, 0]
        ss = [1, 1, 1, 1, 1, 1, 1]
        nm = [64, 128, 256, 256, 512, 512, 512]

        cnn = nn.Sequential()

        def conv_relu(i, batchNormalization=False):
            nIn = nc if i == 0 else nm[i - 1]
            nOut = nm[i]
            cnn.add_module('conv{0}'.format(i),
                           nn.Conv2d(nIn, nOut, ks[i], ss[i], ps[i]))
            if batchNormalization:
                cnn.add_module('batchnorm{0}'.format(i), nn.BatchNorm2d(nOut))
            if leakyRelu:
                cnn.add_module('relu{0}'.format(i),
                               nn.LeakyReLU(0.2, inplace=True))
            else:
                cnn.add_module('relu{0}'.format(i), nn.ReLU(True))

        conv_relu(0)
        cnn.add_module('pooling{0}'.format(0), nn.MaxPool2d(2, 2))  # 64x16x64 original
        conv_relu(1)
        cnn.add_module('pooling{0}'.format(1), nn.MaxPool2d(2, 2))  # 128x8x32 original
        conv_relu(2, True)
        conv_relu(3)
        cnn.add_module('pooling{0}'.format(2),
                       nn.MaxPool2d((2, 2), (2, 1), (0, 1)))  # 256x4x16 original
        conv_relu(4, True)
        conv_relu(5)
        cnn.add_module('pooling{0}'.format(3),
                       nn.MaxPool2d((2, 2), (2, 1), (0, 1)))  # 512x2x16 original
        conv_relu(6, True)  # 512x1x16
        # Extra max pool to bring the height down to 1
        cnn.add_module('pooling{0}'.format(4),
                       nn.MaxPool2d((3, 3), (2, 4), (0, 1))) # final size 512x2x32
        self.cnn = cnn
        self.rnn = nn.Sequential(
            BidirectionalLSTM(512, nh, nh),
            BidirectionalLSTM(nh, nh, nclass))

    def forward(self, input):
        # conv features
        conv = self.cnn(input)
        b, c, h, w = conv.size()
        # assert h == 1, "the height of conv must be 1"
        # print('size before squeeze')
        # print(conv.size())
        conv = conv.squeeze(2)
        # print('size after squeeze')
        # print(conv.size())
        conv = conv.permute(2, 0, 1)  # [w, b, c]

        # rnn features
        output = self.rnn(conv)
        # print("from model {}".format(output))
        return output
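
For context, the tensor shapes coming out of this network can be checked with a dummy batch before training. The sketch below is only illustrative: it assumes the 90 x 752 padded inputs described at the end of the question and a made-up class count of 100, and imports the model the same way the main script below does. The height of the CNN feature map has to be exactly 1, otherwise conv.squeeze(2) does nothing and the following permute fails on a 4-D tensor.

import torch
from models import crnn

nclass = 100  # placeholder; use parameters.number_of_classes in practice
model = crnn.CRNN(imgH=90, nc=3, nclass=nclass, nh=256)

# Batch of 2 RGB images padded to height 90 and width 752 (an assumption).
dummy = torch.randn(2, 3, 90, 752)

features = model.cnn(dummy)
print(features.size())  # expect [2, 512, 1, W']; the height must be 1

output = model(dummy)
print(output.size())    # expect [W', 2, nclass], i.e. the (T, N, C) layout CTC needs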

Here is my main script:

import torch
from data_preparation.dataloader import DataSetOCR
from utills.dataloader_services import *
from torch.utils.data import DataLoader
import parameters
from models import crnn
import torch.optim as optim
from torch.autograd import Variable
from torch.nn import CTCLoss
from utills.string_label_converter import averager, StrLabelConverter
import torch.nn.functional as F

train_dataset = DataSetOCR(
    csv_file_path=parameters.train_csv_path,
    text_file_path=parameters.text_file_path,
    root_directory=parameters.train_root)
assert train_dataset

test_dataset = DataSetOCR(
    csv_file_path=parameters.test_csv_path,
    text_file_path=parameters.text_file_path,
    root_directory=parameters.test_root)
assert test_dataset

dataloader_params = {
    'batch_size': 2,
    'shuffle': True,
    'collate_fn': my_collate
}

train_loader = DataLoader(train_dataset, **dataloader_params)
train_iter = iter(train_loader)

# custom weights initialization called on crnn
def weights_init(m):
    classname = m.__class__.__name__
    if classname.find('Conv') != -1:
        m.weight.data.normal_(0.0, 0.02)
    elif classname.find('BatchNorm') != -1:
        m.weight.data.normal_(1.0, 0.02)
        m.bias.data.fill_(0)


crnn = crnn.CRNN(parameters.max_image_height, 3, parameters.number_of_classes, 256)
crnn.apply(weights_init)
criterion = CTCLoss()
optimizer = optim.Adam(crnn.parameters(), lr=0.001)
loss_avg = averager()

image = torch.FloatTensor(2, 3, parameters.max_image_width, parameters.max_image_height)
text = torch.LongTensor(2 * 5)
length = torch.LongTensor(2)


string_converter = StrLabelConverter()
# string_converter.convert_integer_to_string()

def val(net, dataset, criterion, max_iter=100):
    print('Start val')

    for p in crnn.parameters():
        p.requires_grad = False

    net.eval()
    data_loader = torch.utils.data.DataLoader(
        dataset, shuffle=True, batch_size=2, collate_fn=my_collate)
    val_iter = iter(data_loader)

    i = 0
    n_correct = 0
    loss_avg = averager()

    max_iter = min(max_iter, len(data_loader))
    # print('length of dataloader')
    # print(len(data_loader))
    for i in range(max_iter):
        data = next(val_iter)
        i += 1
        images, texts = data
        batch_size = images.size(0)
        # print(images)
        # print(image)
        loadData(image, images)
        t, l = string_converter.convert_string_to_integer(texts, [])
        loadData(text, t)
        loadData(length, l)
        # print('actual label')
        # print(text)
        # print(text.size())


        # print('input from val')
        # print(i)
        print(image)
        preds = crnn(image)
        preds = F.log_softmax(preds, 2)
        print(preds)
        preds_size = Variable(torch.IntTensor([preds.size(0)] * batch_size))
        # print('predictions')
        # print(preds.size())
        cost = criterion(preds, text, preds_size, length) / batch_size
        print('validation {}'.format(cost))
        loss_avg.add(cost)

        # print('predictions before max')
        # print(preds.size())
        _, preds = preds.max(2)
        # print('after max function')
        # print(preds)
        # print(preds.size())
        preds = preds.squeeze(1)  # original 2
        # print('after squeeze')
        # print(preds.size())
        preds = preds.transpose(1, 0).contiguous().view(-1)
        # print('after transpose')
        # print(preds.size())
        sim_preds = string_converter.convert_integer_to_string(preds.data, preds_size.data)

        cpu_texts = string_converter.convert_integer_to_string(text, length)

        # for pred, target in zip(sim_preds, cpu_texts):
        #     # if pred == target:
        #     #     n_correct += 1
        #     print((pred, target))
    #
    # raw_preds = string_converter.convert_integer_to_string(preds.data, preds_size.data)[:2]
    # for raw_pred, pred, gt in zip(raw_preds, sim_preds, cpu_texts):
    #     print('%-20s => %-20s, gt: %-20s' % (raw_pred, pred, gt))
    #
    # accuracy = n_correct / float(max_iter * 100)
    # print('Test loss: %f, accuray: %f' % (loss_avg.val(), accuracy))

def trainBatch(train_iter, criterion, optimizer):
    try:
        data = next(train_iter)
    except StopIteration:
        train_iter = iter(train_loader)
        data = next(train_iter)

    images, texts = data
    loadData(image, images)
    t, l = string_converter.convert_string_to_integer(texts, [])
    loadData(text, t)
    loadData(length, l)
    batch_size = dataloader_params['batch_size']
    optimizer.zero_grad()
    # # test_util.loadData(image, data['image'])
    # text = Variable(data['integer_sequence_label'])
    # print("from train {}".format(image))
    preds = crnn(image)
    preds = F.log_softmax(preds, 2)
    preds_size = Variable(torch.IntTensor([preds.size(0)] * batch_size))



    cost = criterion(preds, text, preds_size, length) / batch_size
    # print('cost from train {}'.format(cost))
    crnn.zero_grad()
    cost.backward()
    optimizer.step()
    return cost

for epoch in range(25):
    i = 0
    while i < len(train_loader):
        for p in crnn.parameters():
            p.requires_grad = True
        crnn.train()

        cost = trainBatch(train_iter, criterion, optimizer)
        loss_avg.add(cost)
        i += 1

        if i % 50 == 0:
            print('[%d/%d][%d/%d] Loss: %f' % (epoch, 25, i, len(train_loader), loss_avg.val()))
            loss_avg.reset()

        if i % 50 == 0:
            val(crnn, test_dataset, criterion)

        # #do checkpointing
        # if i % 1 == 0:
        #     torch.save(crnn.state_dict(), '{0}/netCRNN_{1}_{2}.pth'.format('/home/bjit-531/PycharmProjects/python/bangla-ocr-version-2/bangla-ocr-version-2/weights/', epoch, i))
        #
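
For reference, nn.CTCLoss expects log-probabilities of shape (T, N, C), a targets tensor of concatenated label indices, per-sample input lengths, and per-sample target lengths, and it reserves index 0 for the blank by default. Below is a minimal, self-contained sketch of that call with made-up sizes, independent of the converter and loaders above:

import torch
import torch.nn.functional as F
from torch.nn import CTCLoss

T, N, C = 48, 2, 100              # time steps, batch size, classes (index 0 = blank)
logits = torch.randn(T, N, C)     # stand-in for the CRNN output
log_probs = F.log_softmax(logits, dim=2)

targets = torch.randint(1, C, (7,), dtype=torch.long)    # concatenated labels, no zeros
input_lengths = torch.full((N,), T, dtype=torch.long)    # all T time steps are valid
target_lengths = torch.tensor([3, 4], dtype=torch.long)  # must sum to len(targets)

criterion = CTCLoss(zero_infinity=True)
loss = criterion(log_probs, targets, input_lengths, target_lengths)
print(loss.item())

The labels therefore have to be encoded so that no real character maps to index 0, unless the loss is constructed with a different blank index.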

I generate the images in my dataset with a variety of fonts, so they have different heights and widths. I pad them during data loading, so the model receives every image at a size of 90 x 752. What am I doing wrong here? Why is the model's accuracy not improving?
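
my_collate is not shown here; purely for illustration, a padding collate of this kind could look like the hypothetical sketch below (this is an assumption about its behaviour, not the actual implementation, and it assumes every image is no larger than 90 x 752):

import torch
import torch.nn.functional as F

def pad_collate(batch, target_h=90, target_w=752):
    # Each item is assumed to be a (C x H x W image tensor, label string) pair.
    images, texts = zip(*batch)
    padded = []
    for img in images:
        _, h, w = img.size()
        # F.pad takes (left, right, top, bottom) for the last two dimensions.
        padded.append(F.pad(img, (0, target_w - w, 0, target_h - h), value=0))
    return torch.stack(padded), list(texts)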

Tags: python, neural-network, conv-neural-network, pytorch, recurrent-neural-network

Solution

