python - 尝试解决 CartPole-v0 的交叉熵方法中的神经网络输出问题
问题描述
我正在尝试对经典的 CartPole-v0 环境实施基于交叉熵策略的方法。我实际上是在 MountainCarContinuous-v0 上重新格式化该算法的工作实现,但是当我尝试让代理学习时,我收到以下错误消息:
---------------------------------------------------------------------------
AssertionError Traceback (most recent call last)
in
4
5 agent = Agent(env)
----> 6 scores = agent.learn()
7
8 # plot the scores
~/cross_entropy.py in learn(self, n_iterations, max_t, gamma, print_every, pop_size, elite_frac, sigma)
83 for i_iteration in range(1, n_iterations+1): # loop over all the training iterations
84 weights_pop = [best_weight + (sigma*np.random.randn(self.get_weights_dim())) for i in range(pop_size)] # population of the weights/policies
---> 85 rewards = np.array([self.evaluate(weights, gamma, max_t) for weights in weights_pop]) # rewards from the policies resulting from all individual weights
86
87 # get the best policies
~/cross_entropy.py in (.0)
83 for i_iteration in range(1, n_iterations+1): # loop over all the training iterations
84 weights_pop = [best_weight + (sigma*np.random.randn(self.get_weights_dim())) for i in range(pop_size)] # population of the weights/policies
---> 85 rewards = np.array([self.evaluate(weights, gamma, max_t) for weights in weights_pop]) # rewards from the policies resulting from all individual weights
86
87 # get the best policies
~/cross_entropy.py in evaluate(self, weights, gamma, max_t)
56 action = self.forward(state)
57 #action = torch.argmax(action_vals).item()
---> 58 state, reward, done, _ = self.env.step(action)
59 episode_return += reward * math.pow(gamma, t)
60 if done:
/gym/wrappers/time_limit.py in step(self, action)
14 def step(self, action):
15 assert self._elapsed_steps is not None, "Cannot call env.step() before calling reset()"
---> 16 observation, reward, done, info = self.env.step(action)
17 self._elapsed_steps += 1
18 if self._elapsed_steps >= self._max_episode_steps:
/gym/envs/classic_control/cartpole.py in step(self, action)
102 def step(self, action):
103 err_msg = "%r (%s) invalid" % (action, type(action))
--> 104 assert self.action_space.contains(action), err_msg
105
106 x, x_dot, theta, theta_dot = self.state
AssertionError: tensor([ 0.3987, 0.6013]) () invalid
我发现这是因为 MountainCarContinuous-v0 环境有一个 Box(2) 类型的 action_space,而 CartPole-v0 是 Discrete(2),这意味着我只想要一个整数作为动作选择。
我尝试通过应用 softmax 激活函数来解决这个概念,然后将较高值的索引作为操作。
action_vals = self.forward(state)
action = torch.argmax(action_vals).item()
这消除了错误,但是当我训练代理时,它似乎学得非常快,这表明存在问题。这是我的完整代理课程:
class Agent(nn.Module):
def __init__(self, env, h_size=16):
super().__init__()
self.env = env
# state, hidden layer, action sizes
self.s_size = env.observation_space.shape[0]
self.h_size = h_size
self.a_size = env.action_space.n
# define layers
self.fc1 = nn.Linear(self.s_size, self.h_size)
self.fc2 = nn.Linear(self.h_size, self.a_size)
self.device = torch.device('cpu')
def set_weights(self, weights):
s_size = self.s_size
h_size = self.h_size
a_size = self.a_size
# separate the weights for each layer
fc1_end = (s_size*h_size)+h_size
fc1_W = torch.from_numpy(weights[:s_size*h_size].reshape(s_size, h_size))
fc1_b = torch.from_numpy(weights[s_size*h_size:fc1_end])
fc2_W = torch.from_numpy(weights[fc1_end:fc1_end+(h_size*a_size)].reshape(h_size, a_size))
fc2_b = torch.from_numpy(weights[fc1_end+(h_size*a_size):])
# set the weights for each layer
self.fc1.weight.data.copy_(fc1_W.view_as(self.fc1.weight.data))
self.fc1.bias.data.copy_(fc1_b.view_as(self.fc1.bias.data))
self.fc2.weight.data.copy_(fc2_W.view_as(self.fc2.weight.data))
self.fc2.bias.data.copy_(fc2_b.view_as(self.fc2.bias.data))
def get_weights_dim(self):
return (self.s_size+1)*self.h_size + (self.h_size+1)*self.a_size
def forward(self, x):
x = F.relu(self.fc1(x))
x = F.softmax(self.fc2(x))
return x
def evaluate(self, weights, gamma=1.0, max_t=5000):
self.set_weights(weights)
episode_return = 0.0
state = self.env.reset()
for t in range(max_t):
state = torch.from_numpy(state).float().to(self.device)
action_vals = self.forward(state)
action = torch.argmax(action_vals).item()
state, reward, done, _ = self.env.step(action)
episode_return += reward * math.pow(gamma, t)
if done:
break
return episode_return
def learn(self, n_iterations=500, max_t=1000, gamma=1.0, print_every=10, pop_size=50, elite_frac=0.2, sigma=0.5):
"""PyTorch implementation of the cross-entropy method.
Params
======
n_iterations (int): maximum number of training iterations
max_t (int): maximum number of timesteps per episode
gamma (float): discount rate
print_every (int): how often to print average score (over last 100 episodes)
pop_size (int): size of population at each iteration
elite_frac (float): percentage of top performers to use in update
sigma (float): standard deviation of additive noise
"""
n_elite=int(pop_size*elite_frac) # number of elite policies from the population
scores_deque = deque(maxlen=100) # list of the past 100 scores
scores = [] # list of all the scores
best_weight = sigma*np.random.randn(self.get_weights_dim()) # initialize the first best weight randomly
for i_iteration in range(1, n_iterations+1): # loop over all the training iterations
weights_pop = [best_weight + (sigma*np.random.randn(self.get_weights_dim())) for i in range(pop_size)] # population of the weights/policies
rewards = np.array([self.evaluate(weights, gamma, max_t) for weights in weights_pop]) # rewards from the policies resulting from all individual weights
# get the best policies
##
elite_idxs = rewards.argsort()[-n_elite:]
elite_weights = [weights_pop[i] for i in elite_idxs]
##
best_weight = np.array(elite_weights).mean(axis=0) # take the average of the best weights
reward = self.evaluate(best_weight, gamma=1.0) # evaluate this new policy
scores_deque.append(reward) # append the reward
scores.append(reward) # also append the reward
torch.save(self.state_dict(), 'checkpoint.pth') # save the agent
if i_iteration % print_every == 0: # print every 100 steps
print('Episode {}\tAverage Score: {:.2f}'.format(i_iteration, np.mean(scores_deque)))
if np.mean(scores_deque)>=195.0: # print if environment is solved
print('\nEnvironment solved in {:d} iterations!\tAverage Score: {:.2f}'.format(i_iteration-100, np.mean(scores_deque)))
break
return scores
如果有人对如何正确进行代理培训有任何想法,请给我任何建议。
解决方案
事实证明,我只需要向 Agent 类添加一个 act() 方法。
def act(self, state):
state = state.unsqueeze(0)
probs = self.forward(state).cpu()
m = Categorical(probs)
action = m.sample()
return action.item()
推荐阅读
- c# - 无法将类型为“System.DBNull”的对象转换为类型“System.Byte []
- python - 是否有 Python 文本挖掘脚本可以对具有多个分类的文本进行分类?
- image - 如果图像相同,CNN 能否识别大小差异?
- java - 尝试转换列表时出错
列出 - vue.js - 如何检查 Vue.js 中的复选框?
- assembly - 缓冲区溢出函数地址被转义
- azure-ad-b2c - Azure Active Directory B2C - 应用服务 API 防火墙限制
- python - 根据具有爆炸和数组的条件复制行
- angular - Angular 4 Kendo 柱形图默认显示值并在底部显示 Y 轴名称
- c# - Entity Framework Core 3.0 使用 DbSet vs.DbQuery 和 FromSqlRaw 时的不同结果