python - 用 PyTorch 神经网络求解 CartPole 问题时,得分反而越训练越低
问题描述
我试图通过在 PyTorch 中训练一个简单的两层神经网络来解决 OpenAI Gym 中的 CartPole 问题。使用的方法是 DQN,但得分最高只收敛到大约 8 或 9,并且随着训练的进行没有任何改善;相反,分数还会随着训练而降低。这段代码哪里有问题?应该如何改进?下面是使用的代码:
import gym
import random
import math
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import torch.nn.functional as F
from collections import namedtuple
import numpy as np
class network(nn.Module):
    """Two-layer fully connected Q-network.

    Maps an observation vector to one Q-value per action; the index of each
    output is the action it scores. The inputs are the components of the
    environment observation (callers feed ``state`` straight into the model),
    not a (state, action, next_state, reward) tuple.

    Args:
        n_inputs: Length of the observation vector (default 4, the size that
            was previously hard-coded).
        n_hidden: Width of the single hidden layer (default 256).
        n_actions: Number of Q-value outputs, one per action (default 2).
    """

    def __init__(self, n_inputs=4, n_hidden=256, n_actions=2):
        super().__init__()
        self.l1 = nn.Linear(n_inputs, n_hidden)
        self.l2 = nn.Linear(n_hidden, n_actions)

    def forward(self, x):
        """Return the Q-values for a batch of observations.

        Args:
            x: Float tensor whose last dimension has ``n_inputs`` entries.

        Returns:
            Tensor with ``n_actions`` values in its last dimension.
        """
        hidden = F.relu(self.l1(x))
        # No activation on the output layer: Q-values are unbounded estimates.
        return self.l2(hidden)
class replay_memory():
    """Bounded first-in-first-out store of transitions.

    Transitions are kept in a plain Python list (``self.memory``); once the
    list grows past ``capacity`` the oldest entry is discarded.
    """

    def __init__(self, capacity):
        self.memory = []
        self.capacity = capacity

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.memory)

    def save(self, transition):
        """Append ``transition`` and evict the oldest entry if over capacity."""
        stored = self.memory
        stored.append(transition)
        if len(stored) > self.capacity:
            stored.pop(0)

    def sample(self, batch_size):
        """Return ``batch_size`` distinct stored transitions chosen at random.

        ``random.sample`` raises ``ValueError`` when ``batch_size`` exceeds the
        number of stored transitions.
        """
        return random.sample(self.memory, k=batch_size)
class agent():
    """Epsilon-greedy action selector for a two-action environment.

    With probability ``epsilon`` a random action in {0, 1} is taken; otherwise
    the action with the highest model output is taken. ``epsilon`` is decayed
    once per call to :meth:`act`.
    """

    def __init__(self, env, model):
        # ``env`` and ``model`` are accepted for call-site compatibility but
        # are not stored; act() and trained_act() receive them explicitly.
        self.epsilon = 1  # current exploration rate
        self.epsilon_min = 0.001  # floor for the exploration rate
        self.epsilon_decay = 0.995  # multiplicative decay applied per act() call
        self.learning_rate = 0.001  # not read anywhere inside this class

    def act(self, state, model):
        """Choose an action for ``state`` and decay the exploration rate.

        Args:
            state: One observation (a flat sequence/array of floats).
            model: Callable mapping a ``(1, n)`` float tensor to per-action
                scores of shape ``(1, n_actions)``.

        Returns:
            ``(action, action_np)``: the action as a ``(1, 1)`` LongTensor and
            the same value as a NumPy integer scalar.
        """
        if random.uniform(0, 1) <= self.epsilon:
            # Explore: uniformly random action.
            action = torch.LongTensor([[random.randrange(2)]])
        else:
            # Exploit: pure inference, so no autograd graph is needed.
            with torch.no_grad():
                observation = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)
                action = model(observation).max(1)[1].view(1, 1)
        action_np = action.numpy()[0][0]

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
        else:
            # The last multiplication may undershoot the floor; clamp back.
            self.epsilon = self.epsilon_min
        return action, action_np

    def trained_act(self, episodes, network, env):
        """Run and render ``episodes`` episodes of at most 200 steps each.

        Prints the index of the last step of every episode and closes the
        environment afterwards. Actions come from :meth:`act`, so the current
        exploration rate still applies (and keeps decaying) here.

        Args:
            episodes: Number of episodes to play.
            network: Model passed through to :meth:`act`.
            env: Environment exposing ``reset``/``step``/``render``/``close``;
                ``step`` must return ``(next_state, reward, done, info)``.
        """
        for _episode in range(episodes):
            state = env.reset()
            for t in range(200):
                # Use this instance rather than whatever the module-level name
                # ``agent`` happens to be bound to.
                _, action_np = self.act(state, network)
                next_state, _reward, done, _info = env.step(action_np)
                env.render()
                # Advance the observation; otherwise every action would be
                # chosen from the initial state of the episode.
                state = next_state
                if done:
                    break
            print(t)
        env.close()
def learn(batch_size, gamma, memory, optimizer, model=None):
    """Run one DQN update step on a random batch from replay memory.

    Args:
        batch_size: Number of transitions to sample.
        gamma: Discount factor applied to the bootstrapped next-state value.
        memory: Object supporting ``len()`` and ``sample(batch_size)`` that
            yields ``(state, action, reward, next_state)`` tensor tuples.
        optimizer: Optimizer over the parameters of the Q-network.
        model: Q-network to train. Defaults to the module-level ``network``
            instance, which is what the original code always used.

    Returns:
        The scalar loss tensor, or ``None`` when ``memory`` holds fewer than
        ``batch_size`` transitions (no update is performed in that case).
    """
    if len(memory) < batch_size:
        return None
    if model is None:
        model = network

    # Transpose the list of transitions into four per-field batches.
    transitions = memory.sample(batch_size)
    states, actions, rewards, next_states = (
        torch.cat(field) for field in zip(*transitions)
    )

    # Q(s, a) for the action actually taken. Assumes ``actions`` is 1-D
    # (batch,) after concatenation, as the unsqueeze below requires.
    # squeeze(-1) brings the result back to (batch,) so it lines up
    # element-wise with the targets; left as (batch, 1) it would broadcast
    # against the (batch,) targets into a (batch, batch) loss matrix.
    current_q_values = model(states).gather(1, actions.unsqueeze(-1)).squeeze(-1)

    # Targets are treated as constants: no gradient through the bootstrap.
    with torch.no_grad():
        max_next_q_values = model(next_states).max(1)[0]
    # NOTE(review): transitions carry no terminal flag, so the target
    # bootstraps from next_state even when the episode ended there.
    expected_q_values = rewards + gamma * max_next_q_values

    # Huber loss between predictions (input) and targets (target).
    loss = F.smooth_l1_loss(current_q_values, expected_q_values)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss
# Environment, model and agent. The class names ``network`` and ``agent`` are
# rebound to their instances here; learn() relies on the global ``network``.
env = gym.make('CartPole-v0')
env.reset()
network = network()
agent = agent(env, network)

# Hyper-parameters.
batch_size = 50
episode = 500
T = 200  # maximum number of steps per episode
gamma = 0.95

memory = replay_memory(100)
optimizer = optim.SGD(network.parameters(), lr=0.001)

l = []  # loss returned by every learn() call (None until the buffer fills)
s = []  # index of the final step of every finished episode

for e in range(episode):
    state = env.reset()
    for t in range(T):
        action, action_np = agent.act(state, network)
        next_state, reward, done, info = env.step(action_np)
        # Penalise the step that ends the episode.
        reward = -2 if done else reward
        transition = (
            torch.FloatTensor([state]),
            torch.LongTensor([action]),
            torch.FloatTensor([reward]),
            torch.FloatTensor([next_state]),
        )
        memory.save(transition)
        state = next_state
        loss = learn(batch_size, gamma, memory, optimizer)
        l.append(loss)
        if not done:
            continue
        print('Loss = {}, Episode = {}, finsited after {} steps'.format(loss, e, t))
        s.append(t)
        break
解决方案
我会将您的训练算法重写为:
# Training loop driven by the ``done`` flag instead of a fixed step range.
# Relies on env, agent, network, memory, optimizer, learn, batch_size, gamma,
# T, l and s being defined by the surrounding script.
for e in range(episode):
    state = env.reset()
    done = False
    t = 0
    while not done:
        action, action_np = agent.act(state, network)
        next_state, reward, done, info = env.step(action_np)
        transition = (
            torch.FloatTensor([state]),
            torch.LongTensor([action]),
            torch.FloatTensor([reward]),
            torch.FloatTensor([next_state]),
        )
        memory.save(transition)
        state = next_state
        loss = learn(batch_size, gamma, memory, optimizer)
        l.append(loss)
        # Count the step, or force the episode to end once T steps are reached.
        if t >= T:
            done = True
        else:
            t += 1
        if done:
            print('Loss = {}, Episode = {}, finsited after {} steps'.format(loss, e, t))
            s.append(t)
            break
推荐阅读
- r - RStudio - 将数字转换为经度/纬度格式
- java - Spring boot mockito登录测试在使用h2 db时未创建用户
- sql - SQL如何约束检查插入只能是字母和撇号?
- python - 如何使用 Speech_recognition 修复模块导入错误?
- c++ - 'using' 的 C++ 语法理解问题
- python-3.x - 如何检查发出命令的用户在 Discord 中是否具有特定角色?
- javascript - 使用 JavaScript 从 IOS Safari 浏览器连接到 WIFI
- postgresql - TypeORM - PrimaryGeneratedColumn 增量策略与默认策略
- cordova - “ionic cordova platform add ios”返回ENOENT错误
- spark-ar-studio - 在 Spark AR 中获取纹理采样器的单一颜色值