python - 覆盖 TF2 优化器类,应用梯度(apply_gradients)时出问题?
问题描述
我正在尝试实现我自己的优化器(SGD + 动量),它会覆盖原始的 keras SGD + 动量类。我正在尝试传入一些预训练的初始化参数和超参数(速度、动量和学习率),并将它们用作我的 SGD + 动量的超参数。在对 TF2 优化器类做了一些研究之后,我推断:用我自己的速度权重覆盖 `_create_slots`(创建槽),然后将它们乘以动量常数应该可以解决问题。但是,当我调用 optim.apply_gradients() 时却报错了。我的方向是否正确,希望能得到一些建议。这是我修改后的类:
class ParamwiseSGD(tf.keras.optimizers.SGD):
    """SGD + momentum whose learning rate, momentum constant and initial
    velocity can be scaled / seeded per layer.

    The three ``*_multipliers`` arguments are dicts keyed by the layer
    prefix of a variable's name — the text before the first ``'/'``
    (e.g. ``"dense_1"`` for ``"dense_1/kernel:0"``) — so the kernel and
    bias of one layer share the same entries.

    Args:
        learning_rate: base learning rate; scaled per variable by
            ``lr_multipliers``.
        lr_multipliers: dict layer-name -> float learning-rate multiplier.
        velocity_multipliers: dict layer-name -> value used to seed the
            "momentum" slot (the velocity) of that layer's variables.
            The value is cast and broadcast to each variable's shape.
        momentum_multipliers: dict layer-name -> float momentum multiplier.
        momentum: base momentum constant.
        nesterov: whether to use Nesterov momentum.
        clipvalue: gradient-clipping value, forwarded to the base class.
        name: optimizer name.
        **kwargs: forwarded to ``tf.keras.optimizers.SGD``.
    """

    def __init__(self,
                 learning_rate=1.0,
                 lr_multipliers=None,
                 velocity_multipliers=None,
                 momentum_multipliers=None,
                 momentum=1.0,
                 nesterov=False,
                 clipvalue=1.0,
                 name='ParamwiseSGD',
                 **kwargs):
        # BUG FIX: clipvalue was accepted but silently dropped; forward it so
        # the base optimizer actually clips gradients as the signature implies.
        super(ParamwiseSGD, self).__init__(learning_rate, momentum=momentum,
                                           nesterov=nesterov, name=name,
                                           clipvalue=clipvalue, **kwargs)
        self.lr_multipliers = lr_multipliers
        self.velocity_multipliers = velocity_multipliers
        self.momentum_multipliers = momentum_multipliers

    def _layer_key(self, name):
        """Map a variable name like 'dense_1/kernel:0' to its dict key 'dense_1'."""
        return name.split('/')[0]

    def _get_lr(self, name):
        """Learning-rate multiplier for the variable's layer."""
        return self.lr_multipliers[self._layer_key(name)]

    def _get_velocity(self, name, shape=None):
        """Initial velocity seed for the variable's layer.

        ``shape`` is unused; kept for backward compatibility with callers.
        """
        return self.velocity_multipliers[self._layer_key(name)]

    def _get_mom(self, name):
        """Momentum multiplier for the variable's layer."""
        return self.momentum_multipliers[self._layer_key(name)]

    def _create_slots(self, var_list):
        """Create the "momentum" (velocity) slot for each variable, seeded
        from ``velocity_multipliers``.

        BUG FIX: a slot must have exactly its variable's shape.  The
        per-layer seed (e.g. shape ``(10,)``) is now cast to the variable's
        dtype and broadcast to the variable's shape (e.g. ``(4, 10)`` for a
        kernel).  Without this, apply_gradients fails with
        "var and accum do not have the same shape [4,10] [10]".
        """
        if self._momentum:
            for var in var_list:
                seed = tf.broadcast_to(
                    tf.cast(self._get_velocity(var.name), var.dtype.base_dtype),
                    var.shape)
                self.add_slot(var, "momentum", initializer=seed)

    def _resource_apply_dense(self, grad, var, apply_state=None):
        """Apply a dense gradient, scaling lr / momentum by the layer multipliers."""
        lr_mult = self._get_lr(var.name)
        var_device, var_dtype = var.device, var.dtype.base_dtype
        coefficients = ((apply_state or {}).get((var_device, var_dtype))
                        or self._fallback_apply_state(var_device, var_dtype))
        if self._momentum:
            momentum_var = self.get_slot(var, "momentum")
            momentum_mult = self._get_mom(name=var.name)
            return training_ops.resource_apply_keras_momentum(
                var.handle,
                momentum_var.handle,
                coefficients["lr_t"] * lr_mult,
                grad,
                coefficients["momentum"] * momentum_mult,
                use_locking=self._use_locking,
                use_nesterov=self.nesterov)
        else:
            # No momentum: plain gradient descent with the per-layer lr scale.
            return training_ops.resource_apply_gradient_descent(
                var.handle, coefficients["lr_t"] * lr_mult, grad,
                use_locking=self._use_locking)

    def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
        """Sparse counterpart of ``_resource_apply_dense``.

        NOTE(review): assumes momentum is enabled — the "momentum" slot only
        exists when ``self._momentum`` is true; confirm callers never reach
        this path with momentum disabled.
        """
        lr_mult = self._get_lr(var.name)
        var_device, var_dtype = var.device, var.dtype.base_dtype
        coefficients = ((apply_state or {}).get((var_device, var_dtype))
                        or self._fallback_apply_state(var_device, var_dtype))
        momentum_var = self.get_slot(var, "momentum")
        momentum_mult = self._get_mom(var.name)
        return training_ops.resource_sparse_apply_keras_momentum(
            var.handle,
            momentum_var.handle,
            coefficients["lr_t"] * lr_mult,
            grad,
            indices,
            coefficients["momentum"] * momentum_mult,
            use_locking=self._use_locking,
            use_nesterov=self.nesterov)

    def _resource_apply_sparse_duplicate_indices(self, grad, var, indices,
                                                 **kwargs):
        """Apply a sparse gradient that may contain duplicate indices."""
        if self._momentum:
            # The base implementation de-duplicates the indices and then calls
            # self._resource_apply_sparse, which applies the multipliers.
            return super(ParamwiseSGD, self)._resource_apply_sparse_duplicate_indices(
                grad, var, indices, **kwargs)
        else:
            var_device, var_dtype = var.device, var.dtype.base_dtype
            coefficients = (kwargs.get("apply_state", {}).get((var_device, var_dtype))
                            or self._fallback_apply_state(var_device, var_dtype))
            # scatter_add handles duplicate indices natively, so no dedup needed.
            lr_mult = self._get_lr(var.name)
            return resource_variable_ops.resource_scatter_add(
                var.handle, indices, -grad * coefficients["lr_t"] * lr_mult)
然后使用 iris 数据集初始化和使用来自 TF2 的简单训练循环,会引发一些错误:
# Build the custom optimizer with the per-layer multiplier dicts.
# NOTE(review): lr, mom and vel are the dicts built in the toy-problem
# setup elsewhere in the post — keyed "dense" / "dense_1" / "dense_2".
optim_2 = ParamwiseSGD(
    momentum=.9,
    nesterov=False,
    lr_multipliers=lr,
    momentum_multipliers=mom,
    velocity_multipliers=vel)

# Per-epoch history of mean loss and accuracy.
train_loss_results = []
train_accuracy_results = []

num_epochs = 201
for epoch in range(num_epochs):
    epoch_loss_avg = tf.keras.metrics.Mean()
    epoch_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()
    # Training loop - using batches of 32
    for x, y in train_dataset:
        # Optimize the model
        # NOTE(review): grad() is defined elsewhere — presumably returns
        # (loss, gradients) for the batch; confirm against the TF2 tutorial
        # this loop is taken from.
        loss_value, grads = grad(model, x, y)
        optim_2.apply_gradients(zip(grads, model.trainable_variables))
        # Track progress
        epoch_loss_avg.update_state(loss_value)  # Add current batch loss
        # Compare predicted label to actual label
        # training=True is needed only if there are layers with different
        # behavior during training versus inference (e.g. Dropout).
        epoch_accuracy.update_state(y, model(x, training=True))
    # End epoch
    train_loss_results.append(epoch_loss_avg.result())
    train_accuracy_results.append(epoch_accuracy.result())
    if epoch % 50 == 0:
        print("Epoch {:03d}: Loss: {:.3f}, Accuracy: {:.3%}".format(epoch,
                                                                    epoch_loss_avg.result(),
                                                                    epoch_accuracy.result()))
抛出此错误:
/usr/local/lib/python3.7/dist-packages/six.py in raise_from(value, from_value)
InvalidArgumentError: var and accum do not have the same shape[4,10] [10] [Op:ResourceApplyKerasMomentum]
这是我用来初始化权重的玩具问题:
# Toy problem: a 3-layer MLP for the 4-feature, 3-class iris data set.
model = tf.keras.Sequential([
    tf.keras.layers.Dense(10, activation=tf.nn.relu, input_shape=(4,)),  # input shape required
    tf.keras.layers.Dense(10, activation=tf.nn.relu),
    tf.keras.layers.Dense(3),
])  # BUG FIX: the Sequential layer list was never closed ("])" was missing)

# Per-layer initial-velocity shapes.
# NOTE(review): these match only the bias shapes; the kernels are
# (4,10), (10,10) and (10,3).  Seeding the momentum slots with these
# values is exactly what triggers
# "var and accum do not have the same shape [4,10] [10]"
# unless the optimizer broadcasts the seed to each variable's shape.
s_1 = (10,)
s_2 = (10,)
s_3 = (3,)
layer_1 = {"lr": .01, "mom": .9, "vel": tf.zeros(shape=s_1)}
layer_2 = {"lr": .01, "mom": .9, "vel": tf.zeros(shape=s_2)}
layer_3 = {"lr": .01, "mom": .9, "vel": tf.zeros(shape=s_3)}

# Multiplier dicts keyed by the layer-name prefix of each variable
# ("dense", "dense_1", "dense_2"), as ParamwiseSGD expects.
vel = {"dense": layer_1['vel'], "dense_1": layer_2['vel'], "dense_2": layer_3['vel']}
mom = {"dense": layer_1['mom'], "dense_1": layer_2['mom'], "dense_2": layer_3['mom']}
lr = {"dense": layer_1['lr'], "dense_1": layer_2['lr'], "dense_2": layer_3['lr']}
解决方案
推荐阅读
- answer-set-programming - Clingo:我可以匹配多个变量(类似于 varargs)吗?
- visibility - 从子包中隐藏记录
- r - R:基于列条件的线覆盖密度图
- amazon-web-services - AWS S3Control 创建作业无效请求
- flutter - Listview 轴水平不显示小部件
- python-3.x - 我可以在 Django 中自动向 MongoEngine 查询添加过滤器吗?
- javascript - 如果元素具有具有值的属性,我该如何应用函数
- vector - 我如何处理带有胶质(锈)的纹理加载
- git - 无法提交 GitHub 密码
- windows - 扫描计算机列表以获取 Windows 激活状态和当前名称、状态