Google Machine Learning Crash Course MNIST example training diverges on local PC

Problem description

I need help with divergent training behavior in the MNIST code from the Google Machine Learning Crash Course.

The code below is copied from the Crash Course's MNIST example: "Programming exercise: Classifying handwritten digits with neural networks".

However, training on a local PC (Windows or Linux) diverges.

[Screenshot: training diverges on the Linux PC]

The same code runs fine in Google Colab.

[Screenshot: training converges in Google Colab]

Please suggest how to debug this.
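
(As a first debugging step, it may help to confirm that both platforms run the same library versions; a minimal sketch, printing the versions worth comparing between Colab and the local PC:)

import numpy as np
import pandas as pd
import tensorflow as tf

# Behavior differences across platforms often come down to version skew.
print("numpy:", np.__version__)
print("pandas:", pd.__version__)
print("tensorflow:", tf.__version__)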

from __future__ import print_function

import glob
import os

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
import tensorflow as tf
from tensorflow.python.data import Dataset

tf.logging.set_verbosity(tf.logging.INFO)
pd.options.display.max_rows = 10
pd.options.display.float_format = '{:.1f}'.format

mnist_dataframe = pd.read_csv(
#  "https://download.mlcc.google.com/mledu-datasets/mnist_train_small.csv",
  "mnist_train_small.csv",
  sep=",",
  header=None)

# Use just the first 10,000 records for training/validation.
mnist_dataframe = mnist_dataframe.head(10000)

# Shuffle the rows so the training/validation split is random.
mnist_dataframe = mnist_dataframe.reindex(np.random.permutation(mnist_dataframe.index))

def parse_labels_and_features(dataset):
  # Column 0 holds the digit label.
  labels = dataset[0]

  # Columns 1..784 hold the 28x28 pixel values; scale them to [0, 1].
  features = dataset.loc[:, 1:784]
  features = features / 255

  return labels, features

training_targets, training_examples = parse_labels_and_features(mnist_dataframe[:7500])

validation_targets, validation_examples = parse_labels_and_features(mnist_dataframe[7500:10000])


def construct_feature_columns():
  return set([tf.feature_column.numeric_column('pixels', shape=784)])

def create_training_input_fn(features, labels, batch_size, num_epochs=None, shuffle=True):
  def _input_fn(num_epochs=None, shuffle=True):
    idx = np.random.permutation(features.index)
    raw_features = {"pixels":features.reindex(idx)}
    raw_targets = np.array(labels[idx])

    ds = Dataset.from_tensor_slices((raw_features,raw_targets)) # warning: 2GB limit
    ds = ds.batch(batch_size).repeat(num_epochs)

    if shuffle:
      ds = ds.shuffle(10000)

    # Return the next batch of data.
    feature_batch, label_batch = ds.make_one_shot_iterator().get_next()

    return feature_batch, label_batch

  return _input_fn

def create_predict_input_fn(features, labels, batch_size):
  def _input_fn():
    raw_features = {"pixels": features.values}
    raw_targets = np.array(labels)

    ds = Dataset.from_tensor_slices((raw_features, raw_targets)) # warning: 2GB limit
    ds = ds.batch(batch_size)

    # Return the next batch of data.
    feature_batch, label_batch = ds.make_one_shot_iterator().get_next()

    return feature_batch, label_batch

  return _input_fn

def train_nn_classification_model(
    learning_rate,
    steps,
    batch_size,
    hidden_units,
    training_examples,
    training_targets,
    validation_examples,
    validation_targets):

  periods = 3
  steps_per_period = steps / periods  

  # Create the input functions.
  predict_training_input_fn = create_predict_input_fn(
    training_examples, training_targets, batch_size)
  predict_validation_input_fn = create_predict_input_fn(
    validation_examples, validation_targets, batch_size)
  training_input_fn = create_training_input_fn(
    training_examples, training_targets, batch_size)

  # Create feature columns.
  feature_columns = [tf.feature_column.numeric_column('pixels', shape=784)]

  # Create a DNNClassifier object.
  my_optimizer = tf.train.AdagradOptimizer(learning_rate=learning_rate)
  my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(my_optimizer, 5.0)
  classifier = tf.estimator.DNNClassifier(
      feature_columns=feature_columns,
      n_classes=10,
      hidden_units=hidden_units,
      optimizer=my_optimizer,
#      config=tf.contrib.learn.RunConfig(keep_checkpoint_max=1),
      model_dir='./mdd'
  )

  # Train the model, but do so inside a loop so that we can periodically assess
  # loss metrics.
  print("Training model...")
  print("LogLoss error (on validation data):")
  training_errors = []
  validation_errors = []
  for period in range (0, periods):
    # Train the model, starting from the prior state.
    classifier.train(
        input_fn=training_input_fn,
        steps=steps_per_period
    )

    # Take a break and compute probabilities.
    training_predictions = list(classifier.predict(input_fn=predict_training_input_fn))
    training_probabilities = np.array([item['probabilities'] for item in training_predictions])
    training_pred_class_id = np.array([item['class_ids'][0] for item in training_predictions])
    training_pred_one_hot = tf.keras.utils.to_categorical(training_pred_class_id,10)

    validation_predictions = list(classifier.predict(input_fn=predict_validation_input_fn))
    validation_probabilities = np.array([item['probabilities'] for item in validation_predictions])    
    validation_pred_class_id = np.array([item['class_ids'][0] for item in validation_predictions])
    validation_pred_one_hot = tf.keras.utils.to_categorical(validation_pred_class_id,10)    

    # Compute training and validation errors.
    training_log_loss = metrics.log_loss(training_targets, training_pred_one_hot)
    validation_log_loss = metrics.log_loss(validation_targets, validation_pred_one_hot)
    # Occasionally print the current loss.
    print("  period %02d : %0.2f" % (period, validation_log_loss))
    # Add the loss metrics from this period to our list.
    training_errors.append(training_log_loss)
    validation_errors.append(validation_log_loss)
  print("Model training finished.")
  # Remove event files to save disk space.
  #_ = map(os.remove, glob.glob(os.path.join(classifier.model_dir, 'events.out.tfevents*')))

  # Calculate final predictions (not probabilities, as above).
  final_predictions = classifier.predict(input_fn=predict_validation_input_fn)
  final_predictions = np.array([item['class_ids'][0] for item in final_predictions])

  accuracy = metrics.accuracy_score(validation_targets, final_predictions)
  print("Final accuracy (on validation data): %0.2f" % accuracy)

  # Output a graph of loss metrics over periods.
  plt.ylabel("LogLoss")
  plt.xlabel("Periods")
  plt.title("LogLoss vs. Periods")
  plt.plot(training_errors, label="training")
  plt.plot(validation_errors, label="validation")
  plt.legend()
  plt.show()

  # Output a plot of the confusion matrix.
  cm = metrics.confusion_matrix(validation_targets, final_predictions)
  # Normalize the confusion matrix by row (i.e by the number of samples
  # in each class).
  cm_normalized = cm.astype("float") / cm.sum(axis=1)[:, np.newaxis]
  ax = sns.heatmap(cm_normalized, cmap="bone_r")
  ax.set_aspect(1)
  plt.title("Confusion matrix")
  plt.ylabel("True label")
  plt.xlabel("Predicted label")
  plt.show()

  return classifier

classifier = train_nn_classification_model(
    learning_rate=0.05,
    steps=300,
    batch_size=30,
    hidden_units=[100, 100],
    training_examples=training_examples,
    training_targets=training_targets,
    validation_examples=validation_examples,
    validation_targets=validation_targets)

mnist_test_dataframe = pd.read_csv(
#  "https://download.mlcc.google.com/mledu-datasets/mnist_test.csv",
  "mnist_test.csv",
  sep=",",
  header=None)

test_targets, test_examples = parse_labels_and_features(mnist_test_dataframe)
test_examples.describe()

predict_test_input_fn = create_predict_input_fn(
    test_examples, test_targets, batch_size=100)

test_predictions = classifier.predict(input_fn=predict_test_input_fn)
test_predictions = np.array([item['class_ids'][0] for item in test_predictions])

accuracy = metrics.accuracy_score(test_targets, test_predictions)
print("Accuracy on test data: %0.2f" % accuracy)

[Edit] The problem seems to come from np.random.permutation. The code below demonstrates the difference between the two platforms. Is there a bug in the PC builds of NumPy/Pandas?

import pandas as pd
import numpy as np

a = pd.DataFrame([[1,2],[3,4]])

print(a)
print(list(a.index))

# Keep drawing permutations until the first element is 1, i.e. until
# the permutation actually reorders the index.
while True:
    idx = np.random.permutation(a.index)
    if idx[0] == 1:
        break

print(a)
print(list(a.index))
print(idx)

Output from Google Colab:

   0  1
0  1  2
1  3  4
[0, 1]
   0  1
0  1  2
1  3  4
[0, 1]
[1 0]

Output from the Linux PC:

   0  1
0  1  2
1  3  4
[0, 1]
   0  1
1  1  2
0  3  4
[0, 1]
[1 0]

So np.random.permutation(a.index) seems to behave differently on the two platforms: in Colab the DataFrame is left untouched, while on the Linux PC its index has been reordered in place as a side effect. Is this a bug?
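
(For reference, a minimal workaround sketch, assuming the in-place mutation shown above is the culprit: avoid passing the pandas Index object to numpy at all.)

import numpy as np
import pandas as pd

a = pd.DataFrame([[1, 2], [3, 4]])

# Permute positions (a plain integer range) rather than the Index object,
# so numpy never touches the DataFrame's internal data.
pos = np.random.permutation(len(a))
shuffled = a.iloc[pos]

# Alternatively, let pandas do the shuffling: sample(frac=1) returns a
# row-shuffled copy and leaves `a` untouched.
shuffled2 = a.sample(frac=1)

print(a)        # unchanged
print(shuffled)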

Tags: machine-learning, mnist

Solution


np.random.permutation, like every other randomness-based function, works from generated random numbers.

Random numbers produced by a computer, in any language or library, are usually pseudo-random. They vary from machine to machine: a number generated in Colab might come out as 2018 on your machine, or 1234, and so on.

Moreover, if you do not fix the random seed, the generated numbers will also change between runs on the same machine.

This behavior can cause problems for your model.

As a fix, you can try fixing the random seed and see what happens. However, there is no guarantee that the random numbers, and hence the permutation, generated in Colab will match those on your machine.

The random seed can be fixed in numpy like this:

seed = 1234
np.random.seed(seed)

If you run this each time before running your code, you can be sure of getting the same set of random numbers.
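
(Since the model in the question is a TF 1.x Estimator, TensorFlow's own randomness, such as weight initialization and dataset shuffling, can be pinned as well; a minimal sketch using the TF 1.x APIs the question already relies on:)

import numpy as np
import tensorflow as tf

SEED = 1234

np.random.seed(SEED)      # pins numpy (the pandas shuffling above)
tf.set_random_seed(SEED)  # pins TensorFlow's graph-level randomness (TF 1.x)

# An Estimator can also be pinned through its RunConfig:
run_config = tf.estimator.RunConfig(tf_random_seed=SEED)
# classifier = tf.estimator.DNNClassifier(..., config=run_config)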

For more details on randomness and on getting reproducible results, see the excellent blog post linked here.

