Overriding TF2 optimizer class, problem with apply_gradients?

Problem Description

I am trying to implement my own optimizer (SGD + momentum) that overrides the original Keras SGD-with-momentum class. I want to pass in some pre-trained initial values and hyperparameters (velocity, momentum, and learning rate) and use them as per-layer hyperparameters for my SGD + momentum. After making some modifications to the TF2 optimizer class, I figured that overriding slot creation with my own velocity weights, and then multiplying them by the momentum constant, should do the trick; however, an error is raised when I call optim.apply_gradients(). Any input on whether I am headed in the right direction? Here is the class I edited:

import tensorflow as tf
from tensorflow.python.ops import resource_variable_ops
from tensorflow.python.training import training_ops


class ParamwiseSGD(tf.keras.optimizers.SGD):

  def __init__(self,
               learning_rate=1.0,
               lr_multipliers=None,
               velocity_multipliers=None,
               momentum_multipliers=None,
               momentum=1.0,
               nesterov=False,
               clipvalue=1.0,
               name='ParamwiseSGD',
               **kwargs):
    super(ParamwiseSGD, self).__init__(learning_rate, momentum=momentum,
                                       nesterov=nesterov, name=name,
                                       clipvalue=clipvalue, **kwargs)
    # Per-layer dictionaries, keyed by the first '/'-token of each
    # variable's name (i.e. the layer name).
    self.lr_multipliers = lr_multipliers
    self.velocity_multipliers = velocity_multipliers
    self.momentum_multipliers = momentum_multipliers

  def _get_lr(self, name):
    tokens = name.split('/')
    return self.lr_multipliers[tokens[0]]

  def _get_velocity(self, name, shape=None):
    tokens = name.split('/')
    return self.velocity_multipliers[tokens[0]]

  def _get_mom(self, name):
    tokens = name.split('/')
    return self.momentum_multipliers[tokens[0]]

  def _create_slots(self, var_list):
    # Initialize the "momentum" slot with the pre-trained velocity
    # instead of the default zeros.
    if self._momentum:
      for var in var_list:
        new_val = self._get_velocity(var.name)
        self.add_slot(var, "momentum", initializer=new_val)

  def _resource_apply_dense(self, grad, var, apply_state=None):
    lr_mult = self._get_lr(var.name)
    var_device, var_dtype = var.device, var.dtype.base_dtype
    coefficients = ((apply_state or {}).get((var_device, var_dtype))
                    or self._fallback_apply_state(var_device, var_dtype))

    if self._momentum:
      momentum_var = self.get_slot(var, "momentum")
      momentum_mult = self._get_mom(name=var.name)
      return training_ops.resource_apply_keras_momentum(
            var.handle,
            momentum_var.handle,
            coefficients["lr_t"] * lr_mult,
            grad,
            coefficients["momentum"] * momentum_mult,
            use_locking=self._use_locking,
            use_nesterov=self.nesterov)
    else:
      return training_ops.resource_apply_gradient_descent(
            var.handle, coefficients["lr_t"] * lr_mult, grad,
            use_locking=self._use_locking)

  def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
    lr_mult = self._get_lr(var.name)
    var_device, var_dtype = var.device, var.dtype.base_dtype
    coefficients = ((apply_state or {}).get((var_device, var_dtype))
                    or self._fallback_apply_state(var_device, var_dtype))

    momentum_var = self.get_slot(var, "momentum")
    momentum_mult = self._get_mom(var.name)
    return training_ops.resource_sparse_apply_keras_momentum(
        var.handle,
        momentum_var.handle,
        coefficients["lr_t"] * lr_mult,
        grad,
        indices,
        coefficients["momentum"] * momentum_mult,
        use_locking=self._use_locking,
        use_nesterov=self.nesterov)

  # TODO: how to fix this implementation
  def _resource_apply_sparse_duplicate_indices(self, grad, var, indices,
                                               **kwargs):
    lr_mult = self._get_lr(var.name)
    if self._momentum:
      return super(ParamwiseSGD, self)._resource_apply_sparse_duplicate_indices(
          grad, var, indices, **kwargs)
    else:
      var_device, var_dtype = var.device, var.dtype.base_dtype
      coefficients = (kwargs.get("apply_state", {}).get((var_device, var_dtype))
                      or self._fallback_apply_state(var_device, var_dtype))

      return resource_variable_ops.resource_scatter_add(
          var.handle, indices, -grad * coefficients["lr_t"] * lr_mult)
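All three _get_* lookups rely on the Keras naming convention '<layer>/<weight>:0', taking the first '/'-token as the dictionary key; for example (an illustrative name, not output from the question):

name = 'dense_1/kernel:0'
print(name.split('/'))   # ['dense_1', 'kernel:0'] -> key is 'dense_1'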

I then initialize it and use it with the simple TF2 training loop on the iris dataset, which raises an error:

optim_2 = ParamwiseSGD(
    momentum=.9,
    nesterov=False,
    lr_multipliers=lr,
    momentum_multipliers=mom,
    velocity_multipliers=vel)
train_loss_results = []
train_accuracy_results = []

num_epochs = 201

for epoch in range(num_epochs):
  epoch_loss_avg = tf.keras.metrics.Mean()
  epoch_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()

  # Training loop - using batches of 32
  for x, y in train_dataset:
    # Optimize the model
    loss_value, grads = grad(model, x, y)
    optim_2.apply_gradients(zip(grads, model.trainable_variables))

    # Track progress
    epoch_loss_avg.update_state(loss_value)  # Add current batch loss
    # Compare predicted label to actual label
    # training=True is needed only if there are layers with different
    # behavior during training versus inference (e.g. Dropout).
    epoch_accuracy.update_state(y, model(x, training=True))

  # End epoch
  train_loss_results.append(epoch_loss_avg.result())
  train_accuracy_results.append(epoch_accuracy.result())

  if epoch % 50 == 0:
    print("Epoch {:03d}: Loss: {:.3f}, Accuracy: {:.3%}".format(epoch,
                                                                epoch_loss_avg.result(),
                                                                epoch_accuracy.result()))
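For context, the loop above relies on a grad helper (and a batched train_dataset of iris features and labels) that the question does not show; a minimal sketch in the style of the TF2 custom-training walkthrough, assuming a from-logits sparse cross-entropy loss, would be:

loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

def loss(model, x, y, training):
  # Scalar loss for one batch.
  y_ = model(x, training=training)
  return loss_object(y_true=y, y_pred=y_)

def grad(model, inputs, targets):
  # Batch loss plus gradients w.r.t. the trainable variables,
  # in the order expected by apply_gradients above.
  with tf.GradientTape() as tape:
    loss_value = loss(model, inputs, targets, training=True)
  return loss_value, tape.gradient(loss_value, model.trainable_variables)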

The error thrown is:

/usr/local/lib/python3.7/dist-packages/six.py in raise_from(value, from_value)

InvalidArgumentError: var and accum do not have the same shape[4,10] [10] [Op:ResourceApplyKerasMomentum]

Here is the toy problem I am using to initialize the weights:

model = tf.keras.Sequential([
  tf.keras.layers.Dense(10, activation=tf.nn.relu, input_shape=(4,)),  # input shape required
  tf.keras.layers.Dense(10, activation=tf.nn.relu),
  tf.keras.layers.Dense(3)
])

s_1 = (10,)
s_2 = (10,)
s_3 = (3,)

layer_1 = {"lr": .01, "mom":.9, "vel": tf.zeros(shape = s_1)}
layer_2 = {"lr": .01, "mom":.9, "vel": tf.zeros(shape = s_2)}
layer_3 = {"lr": .01, "mom":.9, "vel": tf.zeros(shape = s_3)}

vel = {"dense": layer_1['vel'], "dense_1": layer_2['vel'], "dense_2": layer_3['vel']}
mom = {"dense": layer_1['mom'], "dense_1": layer_2['mom'], "dense_2": layer_3['mom']}
lr = {"dense": layer_1['lr'], "dense_1": layer_2['lr'], "dense_2": layer_3['lr']}
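Printing the model's trainable variables shows where the reported shapes [4,10] and [10] come from: each Dense layer owns both a kernel and a bias under the same layer-name prefix, so the single shape-(10,) velocity above also gets used to initialize the slot of the (4, 10) kernel:

for v in model.trainable_variables:
  print(v.name, v.shape)
# dense/kernel:0   (4, 10)   <- slot seeded with the (10,) 'vel' tensor: mismatch
# dense/bias:0     (10,)
# dense_1/kernel:0 (10, 10)
# dense_1/bias:0   (10,)
# dense_2/kernel:0 (10, 3)
# dense_2/bias:0   (3,)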

Tags: python, optimization, tensorflow2.0, tf.keras, stochastic-gradient

Solution
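The InvalidArgumentError comes from the shape check inside ResourceApplyKerasMomentum: the "momentum" slot (accum) must have exactly the shape of its variable, but _create_slots seeds it from velocity_multipliers, which holds one bias-shaped tensor per layer, so the (4, 10) kernel of the first Dense layer receives a (10,) accumulator. One plausible fix (a sketch, not an accepted answer) is to key the velocities by the full variable path so that every variable, kernel and bias separately, gets a correctly shaped initializer; ParamwiseSGDFixed below is a hypothetical name for that variant:

# Hypothetical variant: look velocities up by the full variable path
# ('dense/kernel') instead of only the layer prefix ('dense').
class ParamwiseSGDFixed(ParamwiseSGD):

  def _get_velocity(self, name, shape=None):
    return self.velocity_multipliers[name.split(':')[0]]

# One correctly shaped velocity tensor per variable, not per layer.
vel = {v.name.split(':')[0]: tf.zeros_like(v)
       for v in model.trainable_variables}
# e.g. {'dense/kernel': (4, 10) zeros, 'dense/bias': (10,) zeros, ...}

optim_fixed = ParamwiseSGDFixed(
    momentum=.9,
    nesterov=False,
    lr_multipliers=lr,         # per-layer keys still work here ...
    momentum_multipliers=mom,  # ... and here
    velocity_multipliers=vel)

With var and accum shapes matching, apply_gradients should get past the Op:ResourceApplyKerasMomentum check; the lr and mom dictionaries can keep their per-layer keys because _get_lr and _get_mom still resolve tokens[0].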

