python - DDPG(Actor-Critic)跑到最小值/最大值
问题描述
我希望对我的 DDPG 算法有所帮助。一切都运行,并且来自评论家网络的 Q 值被正确创建,但操作的结果最终与最大或最小允许值挂钩。这种行为似乎暗示我的梯度计算有问题,但我无法弄清楚。
出于测试目的,我使用了一个非常简单的输入,代理应该能够学习并且我知道正确答案:
state | action | reward
A | 100 | 100
A | 100 | 100
A | 100 | 100
A | 100 | 100
A | 200 | 200
A | 200 | 0
A | 200 | 0
A | 200 | 0
由于 200 被接受一次,它的期望值为 50 奖励,而 100 每次都被接受,这意味着它的期望值为 100 奖励。因此,鉴于状态相同,结果应该始终为 100。我应该指出,未来的奖励目前被忽略了——这只是一个会话最大化模型。
代码有点长,所以我编辑了不相关的部分:
def build_actor_net(s, scope, trainable,a_dim,act_max,act_min,features):
with tf.variable_scope(scope):
s = tf.feature_column.input_layer(features=features, feature_columns=s)
init_w = tf.random_normal_initializer(0., 0.003)
init_b = tf.constant_initializer(0.1)
regularizer = tf.contrib.layers.l2_regularizer(scale=0.01)
net = tf.layers.batch_normalization(s,fused=True)
net = tf.layers.dense(net, 400, activation=tf.nn.relu,kernel_initializer=init_w, bias_initializer=init_b, name='l1',trainable=trainable,kernel_regularizer=regularizer)
net = tf.layers.batch_normalization(net,fused=True)
net = tf.layers.dense(net, 300, activation=tf.nn.relu,kernel_initializer=init_w, bias_initializer=init_b, name='l2',trainable=trainable,kernel_regularizer=regularizer)
with tf.variable_scope('actor_action'):
actions = tf.layers.dense(net, a_dim, activation=tf.nn.sigmoid,name='actions', trainable=trainable)
scaled_a = tf.add(tf.multiply(actions,tf.subtract(act_max,act_min)), act_min, name='scaled_a')
return scaled_a
def build_critic_net(s, a, scope, trainable,s_dim,a_dim,features):
with tf.variable_scope(scope):
s = tf.feature_column.input_layer(features=features, feature_columns=s)
init_w = tf.random_normal_initializer(0., 0.003)
init_b = tf.constant_initializer(0.1)
regularizer = tf.contrib.layers.l2_regularizer(scale=0.01)
net = tf.layers.batch_normalization(s,fused=True)
net = tf.layers.dense(net,400,activation=tf.nn.relu,kernel_initializer=init_w,bias_initializer=init_b,name='l1',trainable=trainable,kernel_regularizer=regularizer)
net = tf.layers.batch_normalization(net+a,fused=True)
net = tf.layers.dense(net, 300, activation=tf.nn.relu,kernel_initializer=init_w, bias_initializer=init_b, name='l2',trainable=trainable,kernel_regularizer=regularizer)
with tf.variable_scope('q'):
q = tf.layers.dense(net, a_dim, kernel_initializer=init_w, bias_initializer=init_b, trainable=trainable,kernel_regularizer=regularizer)# Q(s,a)
return q
def model_fn(features, mode, params):
state = (There are several state columns here that dont matter)
reward = tf.feature_column.numeric_column('reward')
action = tf.feature_column.numeric_column('action')
# ---------------------- Build Actor Networks ---------------------------
with tf.variable_scope('Actor'):
act_a = build_actor_net(state, 'act_eval_net', True,params['a_dim'],params['act_max'],params['act_min'],features)
act_a_ = build_actor_net(state_, 'act_target_net', True,params['a_dim'],params['act_max'],params['act_min'],features)
# ---------------------- Build Critic Networks ---------------------------
with tf.variable_scope('Critic'):
crit_a = tf.feature_column.input_layer(features=features,feature_columns=[action]) #Interchange action and mult here
crit_q = build_critic_net(state, crit_a, 'crit_eval_net', True,params['s_dim'],params['a_dim'],features)
crit_actor_update = build_critic_net(state,act_a,'crit_update_actor_net',True,params['s_dim'],params['a_dim'],features)
crit_q_ = build_critic_net(state_, act_a_, 'crit_target_net', True,params['s_dim'],params['a_dim'],features)
# ---------------------- Set up target, loss, and gradient --------------------
with tf.variable_scope('target_q'):
r = tf.feature_column.input_layer(features=features, feature_columns=[reward])
crit_target_q = r #+ params['gamma'] * crit_q_ #(Session-max model)
with tf.variable_scope('crit_loss'):
crit_loss = tf.reduce_mean(tf.squared_difference(crit_target_q, crit_q))
with tf.variable_scope('update'):
act_e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/act_eval_net')
act_t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/act_target_net')
crit_e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/crit_eval_net')
crit_t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/crit_target_net')
crit_update_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/crit_update_actor_net')
crit_update_replace_op = [crit_update_params[i].assign(crit_e_params[i]) for i in range(len(crit_update_params))]
with tf.control_dependencies(crit_update_replace_op):
crit_target_replace_op = [crit_t_params[i].assign(tf.multiply(crit_e_params[i], params['tau']) +
tf.multiply(crit_t_params[i], 1. - params['tau'])) for i in range(len(crit_t_params))]
with tf.control_dependencies(crit_target_replace_op):
act_target_replace_op = [act_t_params[i].assign(tf.multiply(act_e_params[i], params['tau']) +
tf.multiply(act_t_params[i], 1. - params['tau'])) for i in range(len(act_t_params))]
with tf.variable_scope('C_train'):
reg = tf.losses.get_regularization_loss()
with tf.control_dependencies(act_target_replace_op):
crit_train_op = tf.train.AdamOptimizer(params['clr']).minimize(loss=crit_loss, global_step=tf.train.get_global_step())
with tf.variable_scope('a_grad'):
with tf.control_dependencies([crit_train_op]):###Not sure about this. May be necessary may not be
with tf.control_dependencies([act_a]):
a_grads = tf.gradients(crit_actor_update, act_a)[0] # tensor of gradients of each sample (None, a_dim)
with tf.variable_scope('A_train'):
act_policy_grads = tf.gradients(ys=act_a, xs=act_e_params, grad_ys=a_grads)
actor_gradients = list(map(lambda x: tf.div(x,params['bsize']),act_policy_grads))
with tf.control_dependencies([crit_train_op]):
act_train_op = tf.train.AdamOptimizer(-params['alr']).apply_gradients(zip(actor_gradients, act_e_params))
return tf.estimator.EstimatorSpec(mode=mode,loss=crit_loss,train_op=tf.group(crit_train_op,act_train_op))
此外,还有一个 main 函数,它设置一些参数(假设上面代码中调用的任何参数都已定义),然后调用 Estimator:
DDPG = tf.estimator.Estimator(model_fn=model_fn, params=params, model_dir=model_dir)
DDPG.train(input_fn=lambda: my_input_fn(path,True,args.maxe,args.batch,args.buffer,feature_names))
我也跳过了代码my_input_fn
,但假设它工作得很好。
我相当肯定我的梯度在某个地方存在问题,无论是在梯度本身的计算、梯度的反向传播还是在训练操作中。最终结果是,我最终将每个建议的操作都设为最大允许操作 (200) 或最小允许操作 (0),但绝不是正确的操作 (100)。如果有人可以帮助我阐明我的错误,我将不胜感激。
解决方案
推荐阅读
- r - MuMIn::dredge()。仅排除包含主效应的模型,仅保留具有交互作用的模型
- c++ - C++程序结束而不是重复for循环
- mongodb - 如何在 Mongodb 中按月份和年份范围进行过滤
- entity-framework - 数据库图工具理解Servicestack/Ormlite
- jmeter - 有没有办法配置 JMeter 来收听 kafka 主题?
- json - Vapor 3 内容到 JSON 到字符串
- java - jsf 方法未找到异常,尽管它在那里,javax.el.MethodNotFoundException
- php - 使用 Nginx 代理安装 Wordpress 子目录
- macos - 如何在 macOS 上使用 WKWebView 支持像 Safari 一样的缩放
- mysql - 如果每列等于相同的值,则 MySQL 更新行