python - 从头开始的 NN 适用于简单的问题,但不适用于 MNIST 数字
问题描述
我一直在研究一个带有一个隐藏层的神经网络,三层中的每一层都有灵活数量的节点。这是代码:
import time
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.datasets import mnist
# MNIST digit class labels; index i maps to the digit string str(i).
class_names = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
class NeuralNetwork():
    """A 3-layer MLP (input, one hidden layer, output) with sigmoid
    activations. backProp() accumulates gradients over a whole epoch and
    change() applies them once (full-batch gradient descent).
    """

    correct = 0             # correct predictions within the current epoch
    num_predictions = 10    # test images to visualize after training
    epochs = 100
    sizeOfEpoch = 5000      # training examples processed per epoch
    Lambda = 10             # L2 regularization strength
    learningRate = 0.00001

    def __init__(self, sizes):
        """sizes: [input_dim, hidden_dim, output_dim]."""
        self.dimensions = sizes
        # Per-epoch error curve for plotting (x = epoch index, y = error).
        self.x = np.arange(1, self.epochs + 1)
        self.y = np.empty(self.epochs)
        self.secondLayerNeurons = np.empty(sizes[1])
        self.outputNeurons = np.empty(sizes[2])
        # NOTE(review): weights/biases are drawn from (0, 1); a zero-centered
        # init such as (-1, 1) generally behaves better with sigmoid units.
        self.firstLayerWeights = np.random.rand(sizes[1], sizes[0])
        self.secondLayerWeights = np.random.rand(sizes[2], sizes[1])
        self.firstLayerBiases = np.random.rand(sizes[1])
        self.secondLayerBiases = np.random.rand(sizes[2])
        # Gradient accumulators: summed in backProp(), applied/cleared in change().
        self.firstLayerWeightsSummations = np.zeros([sizes[1], sizes[0]])
        self.secondLayerWeightsSummations = np.zeros([sizes[2], sizes[1]])
        self.firstLayerBiasesSummations = np.zeros([sizes[1]])
        self.secondLayerBiasesSummations = np.zeros([sizes[2]])
        self.hiddenLayerErrors = np.empty(sizes[1])
        self.outputLayerErrors = np.empty(sizes[2])

    def sigmoid(self, x):
        """Logistic activation: maps any real x into (0, 1)."""
        return 1 / (1 + np.exp(-x))

    def sigmoidDerivative(self, x):
        """Sigmoid derivative expressed in terms of the activation value x."""
        return np.multiply(x, (1 - x))

    def forwardProp(self, inputs):
        """Compute hidden and output activations for one flattened input."""
        for i in range(self.dimensions[1]):
            self.secondLayerNeurons[i] = self.sigmoid(
                np.dot(self.firstLayerWeights[i], inputs) + self.firstLayerBiases[i])
        for i in range(self.dimensions[2]):
            self.outputNeurons[i] = self.sigmoid(
                np.dot(self.secondLayerWeights[i], self.secondLayerNeurons) + self.secondLayerBiases[i])

    def backProp(self, inputs, correct_output):
        """Accumulate gradients for one example (sigmoid + cross-entropy
        form: output error is simply prediction - target)."""
        self.outputLayerErrors = np.subtract(self.outputNeurons, correct_output)
        self.hiddenLayerErrors = np.multiply(
            np.dot(self.secondLayerWeights.T, self.outputLayerErrors),
            self.sigmoidDerivative(self.secondLayerNeurons))
        for i in range(self.dimensions[2]):
            # Bias gradient is accumulated once per output neuron.
            self.secondLayerBiasesSummations[i] += self.outputLayerErrors[i]
            for j in range(self.dimensions[1]):
                self.secondLayerWeightsSummations[i][j] += self.outputLayerErrors[i] * self.secondLayerNeurons[j]
        for i in range(self.dimensions[1]):
            self.firstLayerBiasesSummations[i] += self.hiddenLayerErrors[i]
            for j in range(self.dimensions[0]):
                self.firstLayerWeightsSummations[i][j] += self.hiddenLayerErrors[i] * inputs[j]

    def train(self, trainImages, trainLabels):
        """Run one epoch over the first sizeOfEpoch examples, apply a single
        weight update, and return the greatest sampled per-example error."""
        size = str(self.sizeOfEpoch)
        greatestError = 0.0
        start_time2 = time.time()
        for m in range(self.sizeOfEpoch):
            # One-hot target for the true digit (labels are already 0-9,
            # so the round-trip through class_names was unnecessary).
            correct_output = np.zeros([self.dimensions[2]])
            correct_output[int(trainLabels[m])] = 1.0
            self.forwardProp(trainImages[m].flatten())
            self.backProp(trainImages[m].flatten(), correct_output)
            if np.argmax(self.outputNeurons) == int(trainLabels[m]):
                self.correct += 1
            if m % 200 == 0:
                error = np.amax(np.absolute(self.outputLayerErrors))
                if error > greatestError:
                    greatestError = error
                accuracy = str(int((self.correct / (m + 1)) * 100)) + '%'
                percent = str(int((m / self.sizeOfEpoch) * 100)) + '%'
                print("Progress: " + percent + " -- Accuracy: " + accuracy + " -- Error: " + str(greatestError), end="\r")
        self.change()
        time2 = str(round((time.time() - start_time2), 2))
        print(size + '/' + size + " -- " + time2 + "s" + " -- Accuracy: " + accuracy + " -- Error: " + str(greatestError), end="\r")
        return greatestError

    def change(self):
        """Apply the accumulated gradients (with L2 regularization on the
        weights) and reset the accumulators for the next epoch."""
        self.secondLayerBiases -= self.learningRate * self.secondLayerBiasesSummations
        self.secondLayerWeights -= self.learningRate * (
            self.secondLayerWeightsSummations + self.Lambda * self.secondLayerWeights)
        self.firstLayerBiases -= self.learningRate * self.firstLayerBiasesSummations
        self.firstLayerWeights -= self.learningRate * (
            self.firstLayerWeightsSummations + self.Lambda * self.firstLayerWeights)
        # Bug fix: reset the accumulators backProp() actually writes to.
        # The original zeroed attributes named firstLayerSummations /
        # secondLayerSummations, which nothing ever reads, so the weight
        # gradient sums grew without bound across epochs.
        self.firstLayerWeightsSummations = np.zeros([self.dimensions[1], self.dimensions[0]])
        self.secondLayerWeightsSummations = np.zeros([self.dimensions[2], self.dimensions[1]])
        self.firstLayerBiasesSummations = np.zeros(self.dimensions[1])
        self.secondLayerBiasesSummations = np.zeros(self.dimensions[2])
        self.correct = 0

    def predict(self, testImage):
        """Forward pass on a flattened image without mutating training state;
        returns the index of the most probable class."""
        secondLayerAnsNodes = np.empty([self.dimensions[1]])
        outputAns = np.empty([self.dimensions[2]])
        for i in range(self.dimensions[1]):
            secondLayerAnsNodes[i] = self.sigmoid(
                np.dot(self.firstLayerWeights[i], testImage) + self.firstLayerBiases[i])
        for i in range(self.dimensions[2]):
            outputAns[i] = self.sigmoid(
                np.dot(self.secondLayerWeights[i], secondLayerAnsNodes) + self.secondLayerBiases[i])
        return np.argmax(outputAns)
if __name__ == "__main__":
    (train_images, train_labels), (test_images, test_labels) = mnist.load_data()
    # Normalize pixel values from [0, 255] to [0, 1].
    train_images = train_images / 255.0
    test_images = test_images / 255.0

    neural_network = NeuralNetwork([784, 16, 10])

    start_time = time.time()
    for i in range(neural_network.epochs):
        print("\nEpoch", str(i + 1) + "/" + str(neural_network.epochs))
        neural_network.y[i] = neural_network.train(train_images, train_labels)
    # Bug fix: use a distinct name; the original `time = time.time() - ...`
    # shadowed the `time` module itself.
    elapsed = time.time() - start_time

    # Plot the per-epoch error curve.
    plt.plot(neural_network.x, neural_network.y, 'b')
    plt.ylabel('Error Change')
    plt.xlabel('Epochs')
    plt.show()

    print("\n\n\nTotal Time Used")
    # Bug fix: the original branch was inverted (it printed "Seconds" for
    # runs longer than an hour and "Minutes" for everything shorter).
    if elapsed < 60:
        print("Seconds: %s" % round(elapsed, 2))
    else:
        print("Minutes: %s" % round((elapsed / 60), 2))

    for i in range(neural_network.num_predictions):
        prediction = neural_network.predict(test_images[i].flatten())
        plt.grid(False)
        plt.imshow(test_images[i], cmap=plt.cm.binary)
        plt.title("Prediction: " + str(prediction) + " -- Actual: " + class_names[test_labels[i]] + "\n" + str(i+1) + "/" + str(neural_network.num_predictions))
        plt.show()
由于某种原因,此代码不适用于更复杂的问题。误差没有得到最小化,准确性保持不变。这个确切的代码适用于 xor 问题和另一个类似的问题。当我尝试给它 MNIST 数字数据集时,它不起作用。唯一不同的是每一层的节点较多,算法是一样的。
这里可能是什么问题?
这是运行 20 个 epoch 后的图表,学习率为 0.000001,lambda 为 10。它显示了每个 epoch 的误差。图中的 y 轴标签应为“误差”(Error),而不是“误差变化”(Error Change)。 https://i.stack.imgur.com/fLXzz.png
解决方案
您的实施在技术上没有任何问题。但是,有几件事需要注意,所有这些都会对您所看到的性能产生重大影响。这是一个很长的答案,但每个部分都反映了我对您的代码所做的重要更改,以使其按预期工作,因此请仔细阅读。
首先,您不应该在 (0, 1) 区间内初始化您的权重,而这正是 np.random.rand
默认设置的。具体来说,如果您要选择均匀的随机权重,则均匀分布应以零为中心。例如,选择 (-1, 1) 或 (-.1, .1) 范围内的随机数。否则,您的 MLP 会立即出现偏差;许多隐藏层神经元将通过 sigmoid 激活立即映射到接近 1。毕竟,sigmoid 激活以零为中心(沿 x 轴),因此您的默认输入也应该如此。这个问题可以很容易地阻止您的 MLP 完全收敛(事实上,在您的情况下确实如此)。有比从均匀随机分布中抽样更好的权重初始化方法,但这并不是说如果做得好,这种方法就行不通。
其次,您可能应该规范化图像数据。神经网络对 0 到 255 之间的输入效果不佳,这是默认情况下从 keras 导出图像数据的方式。您可以通过将每个输入特征除以 255 来解决此问题。原因是 sigmoid 曲线在高幅度子域的导数非常小。换句话说,当 x 非常大或非常小(非常负)时,sigmoid(x) 对 x 的导数非常接近于零。当您将某些权重乘以非常大的值(例如 255)时,您很可能会立即进入 S 型曲线的这个高数值域。这不一定会阻止你的网络收敛,但它肯定会在一开始就减慢它,因为小的导数会导致小的梯度,这反过来会导致较小的权重更新。您可以提高学习率,但这可能会导致神经网络在离开 sigmoid 曲线的低导数区域时步幅过大(并可能发散)。同样,我已经在您的特定程序中测试(并修复)了这个问题,它确实产生了显着差异(最终精度大约为 0.8,而不是 0.6)。
接下来,您计算“错误”的方式有点奇怪。它计算整个时期的最大误差并打印出来。一个时期的最大误差几乎不是有用的误差度量;即使是一个设计良好、训练有素的深度卷积神经网络,有时也会在一个时期内的至少一个数据点上表现不佳。您的准确度测量可能足以衡量您的模型收敛程度。但是,我还通过简单地调整您当前的错误计算来添加“平均错误”。由于您使用的是交叉熵损失(至少,考虑到您计算梯度的方法,这是正确的),我建议您编写一个实际上计算交叉熵损失(在您的情况下为负对数似然的总和)。请记住,在解释这样的损失时,sigmoid 上的负对数似然限制在 (0, infinity) 内,因此交叉熵损失也是如此。
当然,另一个问题可能是学习率。事实上,大多数人会认为学习率是最重要的调整超参数。我最终使用了0.00001
,尽管我没有做太多的网格搜索。
接下来,您将使用完整的批量学习。这意味着您计算每个数据点的梯度总和,然后更新一次权重。换句话说,每个 epoch 只执行一次权重更新。如果是这样的话,你将不得不做很多 epochs 才能获得不错的结果。如果你有时间和计算资源,那可能没问题。但是,如果您不这样做,您可能会考虑使用小批量。至少与在线/随机学习相比,小批量对样本顺序仍然相当稳健(尽管理论上您仍然应该对每个时期的数据进行洗牌)。它涉及将您的完整数据集划分为一些预定义大小的“批次”。对于每个批次,您计算批次中每个数据点的模型梯度总和。然后,您进行权重更新(通过调用change()
)。一旦你检查了每一批次,这就构成了一个时期。我使用了小批量和 1,000 的批量。
最后(我想说的是最重要的一点,但我提到的其他事情也阻止了收敛),你没有在所有训练数据上进行训练(5,000 / 60,000);您没有训练足够多的 epoch(5 个可能还不够,尤其是当您只训练一小部分数据时);并且您的模型可能太简单(没有足够的隐藏层节点)。然而,最重要的问题是,实现并不总是在适当的时候使用向量化操作,因此在具有足够数量的 epoch 和模型复杂性的所有训练数据上进行实际训练太慢了。
我更新了您的实现(最值得注意的是 backProp()
and change()
)以尽可能使用 numpy 的矢量化操作。这将实施速度提高了几个数量级。但是,我认为它根本不会改变代码的语义。我还实施了我在这篇文章中建议的其他更改。在仅仅 20 个 epoch 和隐藏层中只有 32 个隐藏节点之后,我平均获得了大约 85% 的训练准确度(尽管它随批次变化 +/- 6%)。我没有针对测试集运行它,所以我也没有弄乱正则化参数(我只是设置Lambda
为零)。这是更新的代码(predict()
为简洁起见,我对函数等部分进行了编辑):
import numpy as np
from tensorflow.keras.datasets import mnist
# MNIST digit class labels; index i maps to the digit string str(i).
class_names = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
class NeuralNetwork():
    """Vectorized 3-layer MLP with sigmoid activations, trained with
    mini-batch gradient descent: train() accumulates gradients over one
    batch and change() applies them once per batch.
    """

    correct = 0             # correct predictions within the current batch
    epochs = 20
    Lambda = 0              # L2 regularization strength (disabled here)
    learningRate = 0.00001

    def __init__(self, sizes, batchSize):
        """sizes: [input_dim, hidden_dim, output_dim];
        batchSize: number of examples consumed per train() call."""
        self.batchSize = batchSize
        self.dimensions = sizes
        self.secondLayerNeurons = np.empty(sizes[1])
        self.outputNeurons = np.empty(sizes[2])
        # Draw weights and biases from (-1, 1) by multiplying the (0, 1)
        # values by 2 and subtracting 1. There are better ways of doing this,
        # but this works just fine.
        self.firstLayerWeights = np.random.rand(sizes[1], sizes[0]) * 2 - 1
        self.secondLayerWeights = np.random.rand(sizes[2], sizes[1]) * 2 - 1
        self.firstLayerBiases = np.random.rand(sizes[1]) * 2 - 1
        self.secondLayerBiases = np.random.rand(sizes[2]) * 2 - 1
        # Gradient accumulators: summed in backProp(), applied/cleared in change().
        self.firstLayerWeightsSummations = np.zeros([sizes[1], sizes[0]])
        self.secondLayerWeightsSummations = np.zeros([sizes[2], sizes[1]])
        self.firstLayerBiasesSummations = np.zeros([sizes[1]])
        self.secondLayerBiasesSummations = np.zeros([sizes[2]])
        self.hiddenLayerErrors = np.empty(sizes[1])
        self.outputLayerErrors = np.empty(sizes[2])

    def sigmoid(self, x):
        """Logistic activation: maps any real x into (0, 1)."""
        return 1 / (1 + np.exp(-x))

    def sigmoidDerivative(self, x):
        """Sigmoid derivative expressed in terms of the activation value x."""
        return np.multiply(x, (1 - x))

    def forwardProp(self, inputs):
        """Compute hidden and output activations for one flattened input."""
        for i in range(self.dimensions[1]):
            self.secondLayerNeurons[i] = self.sigmoid(
                np.dot(self.firstLayerWeights[i], inputs) + self.firstLayerBiases[i])
        for i in range(self.dimensions[2]):
            self.outputNeurons[i] = self.sigmoid(
                np.dot(self.secondLayerWeights[i], self.secondLayerNeurons) + self.secondLayerBiases[i])

    def backProp(self, inputs, correct_output):
        """Accumulate gradients for one example using vectorized outer
        products instead of the original nested Python loops."""
        self.outputLayerErrors = np.subtract(self.outputNeurons, correct_output)
        self.hiddenLayerErrors = np.multiply(
            np.dot(self.secondLayerWeights.T, self.outputLayerErrors),
            self.sigmoidDerivative(self.secondLayerNeurons))
        self.secondLayerBiasesSummations += self.outputLayerErrors
        self.secondLayerWeightsSummations += np.outer(self.outputLayerErrors, self.secondLayerNeurons)
        self.firstLayerBiasesSummations += self.hiddenLayerErrors
        self.firstLayerWeightsSummations += np.outer(self.hiddenLayerErrors, inputs)

    def train(self, trainImages, trainLabels):
        """Train on one batch (the first batchSize entries of the given
        arrays), printing a running accuracy and mean sampled error."""
        size = str(self.batchSize)
        err_sum = 0.0
        err_count = 0
        avg_err = 0.0
        for m in range(self.batchSize):
            # One-hot target for the true digit.
            correct_output = np.zeros([self.dimensions[2]])
            correct_output[trainLabels[m]] = 1.0
            self.forwardProp(trainImages[m].flatten())
            self.backProp(trainImages[m].flatten(), correct_output)
            if np.argmax(self.outputNeurons) == int(trainLabels[m]):
                self.correct += 1
            if m % 150 == 0:
                # Periodically sample the max per-example error and report
                # the running mean of those samples.
                error = np.amax(np.absolute(self.outputLayerErrors))
                err_sum += error
                err_count += 1
                avg_err = err_sum / err_count
                accuracy = str(int((self.correct / (m + 1)) * 100)) + '%'
                percent = str(int((m / self.batchSize) * 100)) + '%'
                print("Progress: " + percent + " -- Accuracy: " + accuracy + " -- Error: " + str(avg_err), end="\r")
        self.change()
        print(size + '/' + size + " -- " + " -- Accuracy: " + accuracy + " -- Error: " + str(avg_err), end="\r")
        self.correct = 0

    def change(self):
        """Apply one mini-batch update, then clear the gradient accumulators."""
        self.secondLayerBiases -= self.learningRate * self.secondLayerBiasesSummations
        # Reinstated the L2 term from the original formulation; with the
        # class default Lambda = 0 this is numerically identical.
        self.secondLayerWeights -= self.learningRate * (
            self.secondLayerWeightsSummations + self.Lambda * self.secondLayerWeights)
        self.firstLayerBiases -= self.learningRate * self.firstLayerBiasesSummations
        self.firstLayerWeights -= self.learningRate * (
            self.firstLayerWeightsSummations + self.Lambda * self.firstLayerWeights)
        # Bug fix: reset the accumulators backProp() actually writes to.
        # The original zeroed misnamed attributes (firstLayerSummations /
        # secondLayerSummations), so weight gradients leaked across batches.
        self.firstLayerWeightsSummations = np.zeros([self.dimensions[1], self.dimensions[0]])
        self.secondLayerWeightsSummations = np.zeros([self.dimensions[2], self.dimensions[1]])
        self.firstLayerBiasesSummations = np.zeros(self.dimensions[1])
        self.secondLayerBiasesSummations = np.zeros(self.dimensions[2])
if __name__ == "__main__":
    (train_images, train_labels), (test_images, test_labels) = mnist.load_data()
    train_images = train_images / 255  # Normalize image data
    num_using = 60000  # Amount of data points to use. It's fast now, so we may as well use the full 60,000
    bs = 1000  # Batch size. 60,000 is full batch. Consider trying mini-batch
    neural_network = NeuralNetwork([784, 32, 10], bs)
    batches = num_using // bs  # whole batches per epoch
    for i in range(neural_network.epochs):
        print("\nEpoch", str(i + 1) + "/" + str(neural_network.epochs))
        for j in range(batches):
            # Bug fix: the batch counter previously hardcoded 60000 instead
            # of deriving the total from num_using.
            print("Batch", str(j + 1) + "/" + str(batches))
            neural_network.train(train_images[j * bs:(j + 1) * bs], train_labels[j * bs:(j + 1) * bs])
对于需要最少努力的进一步改进,我建议尝试更多的隐藏节点(甚至可能是 128 个),进一步调整学习率和正则化参数,尝试不同的批量大小,并调整 epoch 的数量。
如果您有任何问题,请告诉我。
推荐阅读
- c - 从静态 C 库的源代码中提取函数定义以避免库编译
- javascript - 如何使用 Vue JS 在子组件中保留数字和增量?
- computer-vision - YOLOv3 错误的边界框预测
- c# - 如何从报告中获取最大时间
- c++ - 为什么我的复制构造函数不起作用?(C++)
- ruby-on-rails - 如何使用闭包树对嵌套哈希树(注释)进行分页并将分页
- ios - Xcode 在添加 swift 包时提示存储库被锁定
- c# - 我的代码在 dataGridView1.Rows.Clear() 之后直到结束都不起作用
- python - 如何在 Discord.py 中添加消息并对该消息添加反应
- c++ - 如果子类具有与父类相同的参数,子类是否应该覆盖构造函数?