首页 > 解决方案 > GPU工作时colab中的TPU失败

问题描述

我正在研究 TPU fmnist 示例的一个修改版本。原始示例运行良好,但在我的版本中,我根据百分位数降低了某些类别的出现频率(同时也省略了一些类别,但这似乎不是问题的原因)。

在 TPU 上进行训练会导致关于形状不匹配的非常不具信息性的错误:

TPUExecute 参数 [0] (cond_15/Merge) 的运行时形状不匹配。预期元素类型:F32

在 GPU 上运行完全相同的代码可以工作。

我认为这是由于 TPU 之间的工作负载共享,但我对如何修复它有点迷茫。

我附上了笔记本的链接,以及它下面的代码。代码目前配置为在 TPU 上运行;若要在 GPU 上运行,请取消注释相关的 strategy 部分,同时注释掉 TPU 部分,并更改 colab 运行时类型。

协作笔记本

import tensorflow as tf
import numpy as np

# Guard against incompatible TensorFlow versions: the tf.contrib.distribute
# TPU APIs used later in this script require TF >= 1.14.
# NOTE: the submodule must be imported explicitly -- a bare `import distutils`
# is not guaranteed to make `distutils.version` available.
import distutils.version

if distutils.version.LooseVersion(tf.__version__) < '1.14':
    raise Exception('This notebook is compatible with TensorFlow 1.14 or higher, for TensorFlow 1.13 or lower please use the previous version at https://github.com/tensorflow/tpu/blob/r1.13/tools/colab/fashion_mnist.ipynb')

# Fashion-MNIST: 60k train / 10k test grayscale 28x28 images, integer labels 0-9.
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.fashion_mnist.load_data()
def make_tail_ds(x_train, y_train,x_test,y_test, tail_classes, omitted_classes, precentile):
    """Build a long-tailed variant of a classification dataset.

    Training samples whose label is in `omitted_classes` are dropped,
    samples whose label is in `tail_classes` are truncated to the first
    `precentile` fraction of their occurrences, and all other samples are
    kept unchanged.  The test split only has the omitted classes removed.

    Returns copies of (new_x_train, new_y_train, new_x_test, new_y_test).
    """
    excluded = tail_classes + omitted_classes
    keep_mask = ~np.isin(y_train, excluded)
    tail_mask = np.isin(y_train, tail_classes)

    kept_x, kept_y = x_train[keep_mask], y_train[keep_mask]
    tail_x, tail_y = x_train[tail_mask], y_train[tail_mask]

    # Retain only the leading `precentile` fraction of the tail samples.
    n_tail = int(np.floor(len(tail_y) * precentile))
    tail_x, tail_y = tail_x[:n_tail], tail_y[:n_tail]

    merged_x = np.concatenate([kept_x, tail_x])
    merged_y = np.concatenate([kept_y, tail_y])

    test_mask = ~np.isin(y_test, omitted_classes)
    return (np.copy(merged_x), np.copy(merged_y),
            np.copy(x_test[test_mask]), np.copy(y_test[test_mask]))




# Make classes 3 and 4 rare (keep the first 90% of their samples) and drop
# classes 8 and 9 entirely from both splits.
x_train, y_train,x_test,y_test = make_tail_ds(x_train,y_train,x_test,y_test, [3,4], [8,9],0.9)



# add empty color dimension
# (Conv2D expects channels-last input, so (28, 28) becomes (28, 28, 1))
x_train = np.expand_dims(x_train, -1)
x_test = np.expand_dims(x_test, -1)


def create_model(num_of_classes):
  """Build the Fashion-MNIST CNN: three Conv/BatchNorm/pool/dropout stages
  followed by a dense head ending in a softmax over `num_of_classes` outputs.

  The input shape is taken from the module-level `x_train` array.
  """
  layers = tf.keras.layers
  return tf.keras.models.Sequential([
      layers.BatchNormalization(input_shape=x_train.shape[1:]),
      layers.Conv2D(64, (5, 5), padding='same', activation='elu'),
      layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2)),
      layers.Dropout(0.25),

      layers.BatchNormalization(input_shape=x_train.shape[1:]),
      layers.Conv2D(128, (5, 5), padding='same', activation='elu'),
      layers.MaxPooling2D(pool_size=(2, 2)),
      layers.Dropout(0.25),

      layers.BatchNormalization(input_shape=x_train.shape[1:]),
      layers.Conv2D(256, (5, 5), padding='same', activation='elu'),
      layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2)),
      layers.Dropout(0.25),

      layers.Flatten(),
      layers.Dense(256),
      layers.Activation('elu'),
      layers.Dropout(0.5),
      layers.Dense(num_of_classes),
      layers.Activation('softmax'),
  ])

import os

# Connect to the Colab-provided TPU worker (its address comes from the
# COLAB_TPU_ADDR environment variable), initialize the TPU system, and build
# a TPUStrategy for distributed training (TF 1.14 contrib API).
resolver = tf.contrib.cluster_resolver.TPUClusterResolver('grpc://' + os.environ['COLAB_TPU_ADDR'])
tf.contrib.distribute.initialize_tpu_system(resolver)
strategy = tf.contrib.distribute.TPUStrategy(resolver)

# Alternative single-GPU strategy; swap with the TPU block above to run on GPU.
# strategy = tf.distribute.OneDeviceStrategy(device="/gpu:0")

# Create and compile the model inside the strategy scope so its variables are
# placed on the distributed device(s).  The number of output classes is taken
# from the labels remaining after make_tail_ds.
with strategy.scope():
  model = create_model(len(np.unique(y_train)))
  model.compile(
      optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3, ),
      loss='sparse_categorical_crossentropy',
      metrics=['sparse_categorical_accuracy'])

# Labels are cast to float32 alongside the images;
# sparse_categorical_crossentropy accepts integer-valued float labels.
model.fit(
    x_train.astype(np.float32), y_train.astype(np.float32),
    epochs=17,    
    validation_data=(x_test.astype(np.float32), y_test.astype(np.float32)), batch_size = 100

)

# Persist the trained weights (HDF5 format) to the Colab VM's local disk.
model.save_weights('./fashion_mnist.h5', overwrite=True)

标签: tensorflow, keras, gpu, google-colaboratory, tpu

解决方案


推荐阅读