NN from scratch works for a simple problem, but not for MNIST digits

Problem Description

I have been working on a neural network with one hidden layer, where each of the three layers has a flexible number of nodes. Here is the code:

import time
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.datasets import mnist

class_names = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']

class NeuralNetwork():

    correct = 0
    num_predictions = 10
    epochs = 100
    sizeOfEpoch = 5000
    Lambda = 10
    learningRate = 0.00001

    def __init__(self, sizes):
        self.dimensions = sizes

        self.x = np.arange(1,self.epochs+1)
        self.y = np.empty(self.epochs)

        self.secondLayerNeurons = np.empty(sizes[1])
        self.outputNeurons = np.empty(sizes[2])

        self.firstLayerWeights = np.random.rand(sizes[1], sizes[0])
        self.secondLayerWeights = np.random.rand(sizes[2], sizes[1])
        self.firstLayerBiases = np.random.rand(sizes[1])
        self.secondLayerBiases = np.random.rand(sizes[2])

        self.firstLayerWeightsSummations = np.zeros([sizes[1], sizes[0]])
        self.secondLayerWeightsSummations = np.zeros([sizes[2], sizes[1]])
        self.firstLayerBiasesSummations = np.zeros([sizes[1]])
        self.secondLayerBiasesSummations = np.zeros([sizes[2]])

        self.hiddenLayerErrors = np.empty(sizes[1])
        self.outputLayerErrors = np.empty(sizes[2])

    def sigmoid(self, x):
        return 1/(1+np.exp(-x))

    def sigmoidDerivative(self, x):
        return np.multiply(x,(1-x))

    def forwardProp(self, inputs):
        for i in range (self.dimensions[1]):
            self.secondLayerNeurons[i] = self.sigmoid(np.dot(self.firstLayerWeights[i], inputs)+self.firstLayerBiases[i])
        for i in range (self.dimensions[2]):
            self.outputNeurons[i] = self.sigmoid(np.dot(self.secondLayerWeights[i], self.secondLayerNeurons)+self.secondLayerBiases[i])

    def backProp(self, inputs, correct_output):
        self.outputLayerErrors = np.subtract(self.outputNeurons, correct_output)
        self.hiddenLayerErrors = np.multiply(np.dot(self.secondLayerWeights.T, self.outputLayerErrors), self.sigmoidDerivative(self.secondLayerNeurons))

        for i in range (self.dimensions[2]):
            for j in range (self.dimensions[1]):
                if j==0:
                    self.secondLayerBiasesSummations[i] += self.outputLayerErrors[i]
                self.secondLayerWeightsSummations[i][j] += self.outputLayerErrors[i]*self.secondLayerNeurons[j]
        for i in range (self.dimensions[1]):
            for j in range (self.dimensions[0]):
                if j==0:
                    self.firstLayerBiasesSummations[i] += self.hiddenLayerErrors[i]
                self.firstLayerWeightsSummations[i][j] += self.hiddenLayerErrors[i]*inputs[j]

    def train(self, trainImages, trainLabels):
        size = str(self.sizeOfEpoch)
        greatestError = 0.0
        start_time2 = time.time()

        for m in range (self.sizeOfEpoch):
            correct_output = np.zeros([self.dimensions[2]])
            correct_output[int(class_names[trainLabels[m]])] = 1.0

            self.forwardProp(trainImages[m].flatten())
            self.backProp(trainImages[m].flatten(), correct_output)

            if np.argmax(self.outputNeurons) == int(trainLabels[m]):
                self.correct+=1

            if m%200 == 0:
                error = np.amax(np.absolute(self.outputLayerErrors))
                if error > greatestError:
                    greatestError = error
                accuracy = str(int((self.correct/(m+1))*100)) + '%'
                percent = str(int((m/self.sizeOfEpoch)*100)) + '%'
                print ("Progress: " + percent + " -- Accuracy: " + accuracy + " -- Error: " + str(greatestError), end="\r")
        self.change()

        time2 = str(round((time.time() - start_time2), 2))
        print (size + '/' + size + " -- " + time2 + "s" + " -- Accuracy: " + accuracy + " -- Error: " + str(greatestError), end="\r")
        return greatestError

    def change(self):
        for i in range (self.dimensions[2]):
            for j in range (self.dimensions[1]):
                if j == 0:
                    self.secondLayerBiases[i] -= self.learningRate*self.secondLayerBiasesSummations[i]
                self.secondLayerWeights[i][j] -= self.learningRate*(self.secondLayerWeightsSummations[i][j]+self.Lambda*self.secondLayerWeights[i][j])
        for i in range (self.dimensions[1]):
            for j in range (self.dimensions[0]):
                if j == 0:
                    self.firstLayerBiases[i] -= self.learningRate*self.firstLayerBiasesSummations[i]
                self.firstLayerWeights[i][j] -= self.learningRate*(self.firstLayerWeightsSummations[i][j]+self.Lambda*self.firstLayerWeights[i][j])

        self.firstLayerSummations = np.zeros([self.dimensions[1], self.dimensions[0]])
        self.secondLayerSummations = np.zeros([self.dimensions[2], self.dimensions[1]])
        self.firstLayerBiasesSummations = np.zeros(self.dimensions[1])
        self.secondLayerBiasesSummations = np.zeros(self.dimensions[2])
        self.correct = 0
            
    def predict(self, testImage):
        secondLayerAnsNodes = np.empty([self.dimensions[1]])
        outputAns = np.empty([self.dimensions[2]])
        for i in range (self.dimensions[1]):
            secondLayerAnsNodes[i] = self.sigmoid(np.dot(self.firstLayerWeights[i], testImage)+self.firstLayerBiases[i])
        for i in range (self.dimensions[2]):
            outputAns[i] = self.sigmoid(np.dot(self.secondLayerWeights[i], secondLayerAnsNodes)+self.secondLayerBiases[i])
        return np.argmax(outputAns)

if __name__ == "__main__":

    (train_images, train_labels), (test_images, test_labels) = mnist.load_data()
    train_images = train_images/255.0
    test_images = test_images/255.0

    neural_network = NeuralNetwork([784, 16, 10])

    start_time = time.time()
    for i in range (neural_network.epochs):
        print ("\nEpoch", str(i+1) + "/" + str(neural_network.epochs))
        neural_network.y[i]=neural_network.train(train_images, train_labels)
    time = time.time() - start_time

    plt.plot(neural_network.x, neural_network.y, 'b')
    plt.ylabel('Error Change')
    plt.xlabel('Epochs')
    plt.show()

    print("\n\n\nTotal Time Used")
    if time/60 < 60:
        print("Minutes: %s" % round((time/60),2))
    else:
        print("Seconds: %s" % round(time,2))

    for i in range (neural_network.num_predictions):
        prediction = neural_network.predict(test_images[i].flatten())
        plt.grid(False)
        plt.imshow(test_images[i], cmap=plt.cm.binary)
        plt.title("Prediction: " + str(prediction) + " -- Actual: " + class_names[test_labels[i]] + "\n" + str(i+1) + "/" + str(neural_network.num_predictions))
        plt.show()

For some reason, this code does not work on more complex problems. The error is not minimized and the accuracy stays the same. This exact code works for the XOR problem and another similar problem, but when I try to give it the MNIST digits dataset it does not work. The only difference is that each layer has more nodes; the algorithm is the same.

What could be the problem here?

Here is the graph after running 20 epochs with a learning rate of 0.000001 and a lambda of 10. It shows the error for each epoch. The y label should say Error, not Error Change. https://i.stack.imgur.com/fLXzz.png

Tags: python, machine-learning, deep-learning, neural-network, regression

Solution


There is nothing technically wrong with your implementation. However, there are several things to pay attention to, all of which have a significant impact on the performance you are seeing. This is a long answer, but each part reflects an important change I made to your code to get it working as intended, so please read it carefully.

First, you should not initialize your weights in (0, 1), which is what np.random.rand does by default. Specifically, if you are going to pick uniform random weights, the uniform distribution should be centered at zero, for example random numbers in the range (-1, 1) or (-0.1, 0.1). Otherwise your MLP is immediately biased: many hidden-layer neurons are mapped straight to values close to 1 by the sigmoid activation. After all, the sigmoid activation is centered at zero (along the x-axis), so your pre-activations should be as well by default. This issue alone can easily prevent your MLP from converging at all (and in your case, it does). There are better weight-initialization schemes than sampling from a uniform distribution, but that is not to say this approach cannot work if done properly.
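
For illustration, here is a minimal sketch of what a centered uniform initialization looks like (the layer sizes and the 0.1 scale below are arbitrary choices for the sketch, not values from your code):

import numpy as np

n_in, n_out = 784, 16  # example layer sizes

# Default: np.random.rand draws from (0, 1), so every weight starts out positive.
w_uncentered = np.random.rand(n_out, n_in)

# Centered alternatives: rescale to (-1, 1), or to a smaller range such as (-0.1, 0.1).
w_centered = np.random.rand(n_out, n_in) * 2 - 1
w_small = (np.random.rand(n_out, n_in) * 2 - 1) * 0.1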

Second, you should probably normalize your image data. Neural networks do not work well with inputs between 0 and 255, which is how the image data comes out of keras by default. You can fix this by dividing every input feature by 255. The reason is that the derivative of the sigmoid is very small in the high-magnitude part of its domain; in other words, when x is very large or very small (very negative), the derivative of sigmoid(x) with respect to x is very close to zero. When you multiply some weight by a very large value (such as 255), you are very likely to land in this high-magnitude region of the sigmoid right away. That does not necessarily prevent your network from converging, but it certainly slows it down at the beginning, because small derivatives lead to small gradients, which in turn lead to small weight updates. You could raise the learning rate, but that can make the network overshoot (and possibly diverge) once it leaves the low-derivative regions of the sigmoid. Again, I tested (and fixed) this issue in your particular program, and it does make a noticeable difference (final accuracy around 0.8 instead of 0.6).
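
To see the saturation effect numerically, here is a small self-contained demo (the weight value 0.05 is just an arbitrary example):

import numpy as np

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def sigmoid_grad(x):
    s = sigmoid(x)
    return s * (1 - s)

# Pre-activation from a raw pixel value vs. the same pixel normalized to [0, 1].
raw = 255 * 0.05              # raw pixel 255 times a modest weight
scaled = (255 / 255.0) * 0.05

print(sigmoid_grad(raw))      # ~3e-6: the sigmoid is saturated, almost no gradient flows back
print(sigmoid_grad(scaled))   # ~0.25: a healthy gradient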

Next, the way you compute the "error" is a bit odd: it takes the maximum error over the entire epoch and prints that. The maximum error over an epoch is rarely a useful metric; even a well-designed, well-trained deep convolutional network will occasionally do poorly on at least one data point per epoch. Your accuracy measurement is probably enough to gauge how well the model is converging, but I also added an "average error" by simply adapting your current error computation. Since you are effectively using a cross-entropy loss (at least, that is what your gradient calculation corresponds to), I would suggest writing a function that actually computes the cross-entropy loss (in your case, the sum of negative log-likelihoods). Keep in mind, when interpreting such a loss, that the negative log-likelihood of a sigmoid output is bounded in (0, infinity), and so is the cross-entropy loss.
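
A minimal sketch of such a loss function (the function name and the clipping constant eps are my own choices, not part of your code):

import numpy as np

def cross_entropy_loss(output, target, eps=1e-12):
    # Sum of negative log-likelihoods for sigmoid outputs against a one-hot target.
    output = np.clip(output, eps, 1 - eps)  # avoid log(0)
    return -np.sum(target * np.log(output) + (1 - target) * np.log(1 - output))

# Example: a 10-way one-hot target and a made-up set of network outputs.
target = np.zeros(10)
target[3] = 1.0
output = np.full(10, 0.1)
output[3] = 0.7
print(cross_entropy_loss(output, target))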

Of course, another issue could be the learning rate. In fact, most people would say the learning rate is the single most important hyperparameter to tune. I ended up using 0.00001, although I did not do much of a grid search.

Next, you are using full-batch learning. That means you compute the sum of the gradients over every data point and then update the weights once; in other words, you perform only one weight update per epoch. If that is what you want, you will need a lot of epochs to get decent results. If you have the time and compute, that may be fine; if not, consider mini-batches. Compared with online/stochastic learning, mini-batches are still fairly robust to sample ordering (although in theory you should still shuffle the data every epoch). The idea is to split the full dataset into "batches" of some predefined size. For each batch you compute the sum of the model gradients over every data point in that batch and then perform a weight update (by calling change()). Going through every batch once constitutes one epoch. I used mini-batches with a batch size of 1,000.
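
As a rough, self-contained sketch of that loop (dummy arrays stand in for the MNIST data, and the call into your network is indicated in a comment):

import numpy as np

# Dummy stand-ins for (train_images, train_labels), smaller than MNIST to keep the demo light.
train_images = np.random.rand(10000, 784)
train_labels = np.random.randint(0, 10, size=10000)

batch_size = 1000
num_epochs = 2  # small number, just for the sketch

for epoch in range(num_epochs):
    # Optional: reshuffle every epoch so the batches differ between epochs.
    order = np.random.permutation(len(train_images))
    images, labels = train_images[order], train_labels[order]

    for start in range(0, len(images), batch_size):
        batch_images = images[start:start + batch_size]
        batch_labels = labels[start:start + batch_size]
        # In your code this is where neural_network.train(batch_images, batch_labels)
        # accumulates the batch gradients and calls change() once.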

Finally (and I would call this the most important point, although the other issues I mentioned were also blocking convergence), you are not training on all of the training data (8,000 / 60,000); you are not training for enough epochs (5 is probably not enough, especially when you only train on a fraction of the data); and your model is probably too simple (too few hidden-layer nodes). The biggest problem, however, is that the implementation does not use vectorized operations where it should, so actually training on all the training data with enough epochs and enough model complexity is far too slow.
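
To see why vectorization matters so much here, compare the nested-loop accumulation with the equivalent single np.outer call (a toy example with made-up sizes, not your exact arrays):

import numpy as np

errors = np.random.rand(10)       # stands in for outputLayerErrors
activations = np.random.rand(16)  # stands in for secondLayerNeurons

# Nested-loop version: what the original backProp does, element by element in Python.
summ_loop = np.zeros((10, 16))
for i in range(10):
    for j in range(16):
        summ_loop[i][j] += errors[i] * activations[j]

# Vectorized version: one outer product does the same work in compiled code.
summ_vec = np.outer(errors, activations)

print(np.allclose(summ_loop, summ_vec))  # True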

I updated your implementation (most notably backProp() and change()) to use numpy's vectorized operations wherever possible. This sped it up by several orders of magnitude, and I don't believe it changes the semantics of the code at all. I also applied the other changes suggested in this post. After just 20 epochs, with only 32 hidden nodes in the hidden layer, I get an average training accuracy of around 85% (although it varies by +/- 6% from batch to batch). I did not run it against the test set, so I also did not play with the regularization parameter (I simply set Lambda to zero). Here is the updated code (I have edited out parts such as the predict() function for brevity):

import numpy as np
from tensorflow.keras.datasets import mnist

class_names = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']

class NeuralNetwork():

    correct = 0
    epochs = 20
    Lambda = 0
    learningRate = 0.00001

    def __init__(self, sizes, batchSize):
        self.batchSize = batchSize
        self.dimensions = sizes

        self.secondLayerNeurons = np.empty(sizes[1])
        self.outputNeurons = np.empty(sizes[2])

        # Draw weights and biases from (-1, 1) by multiplying the (0, 1)
        # values by 2 and subtracting 1. There are better ways of doing this,
        # but this works just fine.
        self.firstLayerWeights = np.random.rand(sizes[1], sizes[0]) * 2 - 1
        self.secondLayerWeights = np.random.rand(sizes[2], sizes[1]) * 2 - 1
        self.firstLayerBiases = np.random.rand(sizes[1]) * 2 - 1
        self.secondLayerBiases = np.random.rand(sizes[2]) * 2 - 1

        self.firstLayerWeightsSummations = np.zeros([sizes[1], sizes[0]])
        self.secondLayerWeightsSummations = np.zeros([sizes[2], sizes[1]])
        self.firstLayerBiasesSummations = np.zeros([sizes[1]])
        self.secondLayerBiasesSummations = np.zeros([sizes[2]])

        self.hiddenLayerErrors = np.empty(sizes[1])
        self.outputLayerErrors = np.empty(sizes[2])

    def sigmoid(self, x):
        return 1/(1+np.exp(-x))

    def sigmoidDerivative(self, x):
        return np.multiply(x,(1-x))


    def forwardProp(self, inputs):
        for i in range (self.dimensions[1]):
            self.secondLayerNeurons[i] = self.sigmoid(np.dot(self.firstLayerWeights[i], inputs)+self.firstLayerBiases[i])
        for i in range (self.dimensions[2]):
            self.outputNeurons[i] = self.sigmoid(np.dot(self.secondLayerWeights[i], self.secondLayerNeurons)+self.secondLayerBiases[i])

    def backProp(self, inputs, correct_output):
        self.outputLayerErrors = np.subtract(self.outputNeurons, correct_output)
        self.hiddenLayerErrors = np.multiply(np.dot(self.secondLayerWeights.T, self.outputLayerErrors), self.sigmoidDerivative(self.secondLayerNeurons))

        self.secondLayerBiasesSummations += self.outputLayerErrors
        self.secondLayerWeightsSummations += np.outer(self.outputLayerErrors, self.secondLayerNeurons)

        self.firstLayerBiasesSummations += self.hiddenLayerErrors
        self.firstLayerWeightsSummations += np.outer(self.hiddenLayerErrors, inputs)

    def train(self, trainImages, trainLabels):
        size = str(self.batchSize)
        err_sum = 0.0
        err_count = 0
        avg_err = 0.0

        for m in range (self.batchSize):
            correct_output = np.zeros([self.dimensions[2]])
            correct_output[trainLabels[m]] = 1.0

            self.forwardProp(trainImages[m].flatten())
            self.backProp(trainImages[m].flatten(), correct_output)

            if np.argmax(self.outputNeurons) == int(trainLabels[m]):
                self.correct+=1

            if m%150 == 0:
                error = np.amax(np.absolute(self.outputLayerErrors))
                err_sum += error
                err_count += 1
                avg_err = err_sum / err_count
                accuracy = str(int((self.correct/(m+1))*100)) + '%'
                percent = str(int((m/self.batchSize)*100)) + '%'
                print ("Progress: " + percent + " -- Accuracy: " + accuracy + " -- Error: " + str(avg_err), end="\r")

        self.change()
        print (size + '/' + size + " -- Accuracy: " + accuracy + " -- Error: " + str(avg_err), end="\r")
        self.correct = 0

    def change(self):

        self.secondLayerBiases -= self.learningRate * self.secondLayerBiasesSummations
        self.secondLayerWeights -= self.learningRate * self.secondLayerWeightsSummations
        self.firstLayerBiases -= self.learningRate * self.firstLayerBiasesSummations
        self.firstLayerWeights -= self.learningRate * self.firstLayerWeightsSummations

        # Reset the accumulated gradients for the next batch (these must be the
        # same attributes that backProp() accumulates into).
        self.firstLayerWeightsSummations = np.zeros([self.dimensions[1], self.dimensions[0]])
        self.secondLayerWeightsSummations = np.zeros([self.dimensions[2], self.dimensions[1]])
        self.firstLayerBiasesSummations = np.zeros(self.dimensions[1])
        self.secondLayerBiasesSummations = np.zeros(self.dimensions[2])

if __name__ == "__main__":

    (train_images, train_labels), (test_images, test_labels) = mnist.load_data()
    train_images = train_images / 255 # Normalize image data

    num_using = 60000 # Amount of data points to use. It's fast now, so we may as well use the full 60,000
    bs = 1000 # Batch size. 60,000 is full batch. Consider trying mini-batch
    neural_network = NeuralNetwork([784, 32, 10], bs)

    for i in range (neural_network.epochs):
        print ("\nEpoch", str(i+1) + "/" + str(neural_network.epochs))
        for j in range(int(num_using / bs)):
            print("Batch", str(j+1) + "/" + str(int(60000 / bs)))
            neural_network.train(train_images[int(j * bs):int(j * bs) + bs], train_labels[int(j * bs):int(j * bs) + bs])

For further improvement with minimal effort, I suggest trying more hidden nodes (maybe even 128), tuning the learning rate and the regularization parameter further, experimenting with different batch sizes, and adjusting the number of epochs.
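
If you want to make that tuning a bit more systematic, a small grid search along the following lines is enough. This is only a sketch: the candidate values are arbitrary, and you would still need to evaluate each configuration (for example on a held-out split) to pick a winner:

import itertools

hidden_sizes = [32, 64, 128]
learning_rates = [1e-5, 1e-4]
batch_sizes = [500, 1000]

for hidden, lr, bs in itertools.product(hidden_sizes, learning_rates, batch_sizes):
    nn = NeuralNetwork([784, hidden, 10], bs)
    nn.learningRate = lr
    for epoch in range(nn.epochs):
        for j in range(int(num_using / bs)):
            nn.train(train_images[j * bs:(j + 1) * bs],
                     train_labels[j * bs:(j + 1) * bs])
    # Evaluate this configuration here and keep track of the best combination.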

Let me know if you have any questions.

