lstm pytorch RuntimeError: Expected hidden[0] size (1, 1, 256), got (1, 611, 256)

Problem description

I am trying to use nn.LSTM.

From the documentation https://pytorch.org/docs/master/generated/torch.nn.LSTM.html I understand that h0 and c0 should have dimensions (num_layers * num_directions, batch, hidden_size).

But when I pass an input tensor with batch size > 1 together with h0 and c0 that also have batch size > 1, I get the error: "RuntimeError: Expected hidden[0] size (1, 1, 256), got (1, 611, 256)".
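For reference, here is how I understand that shape contract (a minimal standalone sketch, separate from my actual code; the sizes are just illustrative):

import torch
import torch.nn as nn

seq_len, batch, input_size, hidden_size = 5, 4, 3, 256

# batch_first=False (the default), so the input is (seq_len, batch, input_size)
lstm = nn.LSTM(input_size, hidden_size)
x = torch.randn(seq_len, batch, input_size)

# h0/c0 are (num_layers * num_directions, batch, hidden_size);
# their batch dimension has to match the batch dimension of x
h0 = torch.zeros(1, batch, hidden_size)
c0 = torch.zeros(1, batch, hidden_size)

output, (hn, cn) = lstm(x, (h0, c0))
print(output.shape)  # torch.Size([5, 4, 256])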

Here is my code. It contains a replay (memory) buffer, Actor, Critic, TD3, and ENV classes; the main training happens in TD3, which holds the actor and critic objects.

Could someone help me check what I am missing here?

import os
import time
import random
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import torch.nn.functional as F
from random import random as rndm
from torch.autograd import Variable
from collections import deque
import pandas_datareader.data as pdr
import datetime

os.chdir('C:\\Users\\granthjain\\Desktop\\startup_code')

torch.set_default_tensor_type('torch.DoubleTensor')

f = open('lstm_with_noise_batch.txt', 'w+')


class ReplayBuffer(object):

    def __init__(self, max_size=1e6):
        self.storage = []
        self.max_size = max_size
        self.ptr = 0

    def add(self, transition):
        if len(self.storage) == self.max_size:
            self.storage[int(self.ptr)] = transition
        else:
            self.storage.append(transition)
        self.ptr = (self.ptr + 1) % self.max_size

    def sample(self, batch_size):

        ind = np.random.randint(0, self.ptr, size=batch_size)
        ind = np.random.randint(self.ptr)
        (batch_states, batch_next_states, batch_actions, batch_rewards,
         batch_dones) = ([], [], [], [], [])
        for i in range(ind - batch_size, ind):
            (state, next_state, action, reward, done) = self.storage[i]

            if state is None:
                continue
            elif next_state is None:
                continue
            elif action is None:
                continue
            elif reward is None:
                continue
            elif done is None:
                continue

            batch_states.append(np.array(state, copy=False))
            batch_next_states.append(np.array(next_state, copy=False))
            batch_actions.append(np.array(action, copy=False))
            batch_rewards.append(np.array(reward, copy=False))
            batch_dones.append(np.array(done, copy=False))

        return (np.array(batch_states, dtype=object).astype(float),
                np.array(batch_next_states,
                dtype=object).astype(float), np.array(batch_actions,
                dtype=object).astype(float), np.array(batch_rewards,
                dtype=object).astype(float), np.array(batch_dones,
                dtype=object).astype(float))


class Actor(nn.Module):

    def __init__(
        self,
        state_dim,
        action_dim,
        max_action,
        ):
        super(Actor, self).__init__()
        self.lstm = nn.LSTM(state_dim, 256)
        self.layer_1 = nn.Linear(256, 400)
        self.layer_2 = nn.Linear(400, 300)
        self.layer_3 = nn.Linear(300, action_dim)
        self.max_action = max_action

    def forward(self, x, hx):
        (hx, cx) = hx
        (output, (hx, cx)) = self.lstm(x, (hx, cx))
        x = F.relu(self.layer_1(output))
        x = F.relu(self.layer_2(x))
        x = self.max_action * torch.tanh(self.layer_3(x))

    # print("inside forward type cx:",len(output))

        return (x, hx, cx)


class Critic(nn.Module):

    def __init__(self, state_dim, action_dim):
        super(Critic, self).__init__()

    # Defining the first Critic neural network

        self.lstm1 = nn.LSTM(state_dim + action_dim, 256)
        self.layer_1 = nn.Linear(256, 400)
        self.layer_2 = nn.Linear(400, 300)
        self.layer_3 = nn.Linear(300, 1)

    # Defining the second Critic neural network

        self.lstm2 = nn.LSTM(state_dim + action_dim, 256)
        self.layer_4 = nn.Linear(256, 400)
        self.layer_5 = nn.Linear(400, 300)
        self.layer_6 = nn.Linear(300, 1)

    def forward(
        self,
        x,
        u,
        hx,
        ):
        xu = torch.cat([x, u], 1)

    # Forward-Propagation on the first Critic Neural Network

        xu = torch.reshape(xu, (xu.shape[0], 1, 6))
        (hx1, cx1) = hx
        (hx2, cx2) = hx
        (output, (hx1, cx1)) = self.lstm1(xu, (hx1, hx2))
        x1 = F.relu(self.layer_1(output))
        x1 = F.relu(self.layer_2(x1))
        x1 = self.layer_3(x1)

    # Forward-Propagation on the second Critic Neural Network

        (output, (hx2, cx2)) = self.lstm2(xu, (hx2, cx2))
        x2 = F.relu(self.layer_4(output))
        x2 = F.relu(self.layer_5(x2))
        x2 = self.layer_6(x2)
        return (
            x1,
            x2,
            hx1,
            hx2,
            cx1,
            cx2,
            )

    def Q1(
        self,
        x,
        u,
        hx1,
        ):
        xu = torch.cat([x, u], 1)
        xu = torch.reshape(xu, (xu.shape[0], 1, 6))
        (hx1, cx1) = hx1
        (output, (hx1, cx1)) = self.lstm1(xu, (hx1, cx1))
        x1 = F.relu(self.layer_1(output))
        x1 = F.relu(self.layer_2(x1))
        x1 = self.layer_3(x1)
        return (x1, hx1, cx1)


class TD3(object):

    def __init__(
        self,
        state_dim,
        action_dim,
        max_action,
        ):
        self.actor = Actor(state_dim, action_dim, max_action).to(device)
        self.actor_target = Actor(state_dim, action_dim,
                                  max_action).to(device)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters())
        self.critic = Critic(state_dim, action_dim).to(device)
        self.critic_target = Critic(state_dim, action_dim).to(device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = \
            torch.optim.Adam(self.critic.parameters())
        self.max_action = max_action

    def select_action(self, state, hx1):
        (hx, cx) = hx1
        x = self.actor(state, hx1)
        return x

    def train(
        self,
        replay_buffer,
        iterations,
        batch_size=50,
        discount=0.99,
        tau=0.005,
        policy_noise=0.2,
        noise_clip=0.5,
        policy_freq=2,
        ):
        
        b_state = torch.Tensor([])
        b_next_state = torch.Tensor([])
        b_done = torch.Tensor([])
        b_reward = torch.Tensor([])
        b_action = torch.Tensor([])

        for it in range(iterations):

            # print ('it: ', it, ' iterations: ', iterations)

      # Step 4: We sample a batch of transitions (s, s’, a, r) from the memory

            (batch_states, batch_next_states, batch_actions,
             batch_rewards, batch_dones) = \
                replay_buffer.sample(batch_size)

            batch_states = batch_states.astype(float)
            batch_next_states = batch_next_states.astype(float)
            batch_actions = batch_actions.astype(float)
            batch_rewards = batch_rewards.astype(float)
            batch_dones = batch_dones.astype(float)

            state = torch.from_numpy(batch_states)
            next_state = torch.from_numpy(batch_next_states)
            action = torch.from_numpy(batch_actions)
            reward = torch.from_numpy(batch_rewards)
            done = torch.from_numpy(batch_dones)

            b_size = 1
            seq_len = state.shape[0]
            batch = b_size
            input_size = state_dim

            state = torch.reshape(state, (seq_len, 1, state_dim))
            next_state = torch.reshape(next_state, (seq_len, 1,
                    state_dim))
            done = torch.reshape(done, (seq_len, 1, 1))
            reward = torch.reshape(reward, (seq_len, 1, 1))
            action = torch.reshape(action, (seq_len, 1, action_dim))
            
            b_state = torch.cat((b_state, state),dim=1)
            b_next_state = torch.cat((b_next_state, next_state),dim=1)
            b_done = torch.cat((b_done, done),dim=1)
            b_reward = torch.cat((b_reward, reward),dim=1)
            b_action = torch.cat((b_action, action),dim=1)
            
        print("dim state:",b_state.shape)
        print("dim next_state:",b_next_state.shape)
        print("dim done:",b_done.shape)
        print("dim reward:",b_reward.shape)
        print("dim action:",b_action.shape)

      # for h and c shape (num_layers * num_directions, batch, hidden_size)

        h0 = torch.zeros(1, b_state.shape[1], 256)
        c0 = torch.zeros(1, b_state.shape[1], 256)
      # Step 5: From the next state s’, the Actor target plays the next action a’

        next_action = self.actor_target(next_state, (h0, c0))
        next_action = next_action[0]

      # Step 6: We add Gaussian noise to this next action a’ and we clamp it in a range of values supported by the environment

        noise = torch.Tensor(next_action).data.normal_(0,
                policy_noise).to(device)
        noise = noise.clamp(-noise_clip, noise_clip)
        next_action = (next_action + noise).clamp(-self.max_action,
                self.max_action)

      # Step 7: The two Critic targets take each the couple (s’, a’) as input and return two Q-values Qt1(s’,a’) and Qt2(s’,a’) as outputs

        result = self.critic_target(next_state, next_action, (h0,
                c0))
        target_Q1 = result[0]
        target_Q2 = result[1]

      # Step 8: We keep the minimum of these two Q-values: min(Qt1, Qt2)

        target_Q = torch.min(target_Q1, target_Q2).double()

      # Step 9: We get the final target of the two Critic models, which is: Qt = r + γ * min(Qt1, Qt2), where γ is the discount factor

        target_Q = reward + (1 - done) * discount * target_Q

      # Step 10: The two Critic models take each the couple (s, a) as input and return two Q-values Q1(s,a) and Q2(s,a) as outputs

        action = torch.reshape(action, next_action.shape)
        result = self.critic(state, action, (h0, c0))
        current_Q1 = result[0]
        current_Q2 = result[1]

      # Step 11: We compute the loss coming from the two Critic models: Critic Loss = MSE_Loss(Q1(s,a), Qt) + MSE_Loss(Q2(s,a), Qt)

        critic_loss = F.mse_loss(current_Q1, target_Q) \
            + F.mse_loss(current_Q2, target_Q)

      # Step 12: We backpropagate this Critic loss and update the parameters of the two Critic models with a SGD optimizer

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

      # Step 13: Once every two iterations, we update our Actor model by performing gradient ascent on the output of the first Critic model

        out = self.actor(state, (h0, c0))
        out = out[0]
        (actor_loss, hx, cx) = self.critic.Q1(state, out, (h0,
                c0))
        actor_loss = -1 * actor_loss.mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

      # Step 14: Still once every two iterations, we update the weights of the Actor target by polyak averaging

        for (param, target_param) in zip(self.actor.parameters(),
                self.actor_target.parameters()):
            target_param.data.copy_(tau * param.data + (1 - tau)
                    * target_param.data)

      # Step 15: Still once every two iterations, we update the weights of the Critic target by polyak averaging

        for (param, target_param) in zip(self.critic.parameters(),
                self.critic_target.parameters()):
            target_param.data.copy_(tau * param.data + (1 - tau)
                    * target_param.data)

  # Making a save method to save a trained model

    def save(self, filename, directory):
        torch.save(self.actor.state_dict(), '%s/%s_actor.pth'
                   % (directory, filename))
        torch.save(self.critic.state_dict(), '%s/%s_critic.pth'
                   % (directory, filename))

  # Making a load method to load a pre-trained model

    def load(self, filename, directory):
        self.actor.load_state_dict(torch.load('%s/%s_actor.pth'
                                   % (directory, filename)))
        self.critic.load_state_dict(torch.load('%s/%s_critic.pth'
                                    % (directory, filename)))


class ENV:

    def __init__(
        self,
        state_dim,
        action_dim,
        data,
        ):
        self.state_dim = state_dim
        self.state = torch.zeros(self.state_dim)
        self.state[state_dim - 1] = 100000.0
        self.next_state = torch.zeros(self.state_dim)
        self.next_state[state_dim - 1] = 100000.0
        self.action_dim = action_dim
        self.data = data
        self.idx = 0
        self._max_episode_steps = 200
        self.state[1] = self.data[self.idx]
        self.next_state[1] = self.data[self.idx]
        self.buy = 0

    def reset(self):
        self.next_state = torch.zeros(self.state_dim)
        self.next_state[state_dim - 1] = 100000.0
        self.state = torch.zeros(self.state_dim)
        self.state[state_dim - 1] = 100000.0
        self.state[1] = self.data[self.idx]
        self.next_state[1] = self.data[self.idx]

        ch = self.state[0]
        cp = self.state[1]
        cc = self.state[2]
        st = torch.tensor([ch, cp, cc])
        self.buy = 0
        return st

    def step(self, action):
        done = False
        act_t = torch.argmax(action)
        self.idx += 1
        if act_t == 0:
            cp = 1.0003 * self.state[1]
            num_s = int(self.state[2] / cp)

            self.next_state[0] += num_s
            self.next_state[2] = self.state[2] % cp

            self.next_state[1] = self.data[self.idx]
            self.buy = 1
        elif act_t == 1:
            self.next_state[1] = self.data[self.idx]
        elif act_t == 2:
            self.next_state[2] = self.state[2] + self.state[1] * (1
                    - 0.0023) * self.state[0]
            self.next_state[0] = 0
            self.next_state[1] = self.data[self.idx]

            if self.buy == 1:
                done = True
                self.buy = 0

        reward = self.next_state[2] - self.state[2] \
            + self.next_state[1] * self.next_state[0] - self.state[1] \
            * self.state[0] - 1

        self.state[0] = self.next_state[0]
        self.state[1] = self.next_state[1]
        self.state[2] = self.next_state[2]

        ch = self.state[0]
        cp = self.state[1]
        cc = self.state[2]

        st = torch.tensor([ch, cp, cc])

        return (st, reward, done)


# Selecting the device (CPU or GPU)

device = torch.device(('cuda' if torch.cuda.is_available() else 'cpu'))

# set the parameters

start_timesteps = 1e3  # Number of iterations/timesteps before which the model randomly chooses an action, and after which it starts to use the policy network
eval_freq = 5e1  # How often the evaluation step is performed (after how many timesteps)
max_timesteps = 5e3  # Total number of iterations/timesteps
save_models = True  # Boolean checker whether or not to save the pre-trained model
expl_noise = 0.1  # Exploration noise - STD value of exploration Gaussian noise
batch_size = 200  # Size of the batch
discount = 0.99  # Discount factor gamma, used in the calculation of the total discounted reward
tau = 0.005  # Target network update rate
policy_noise = 0.2  # STD of Gaussian noise added to the actions for the exploration purposes
noise_clip = 0.5  # Maximum value of the Gaussian noise added to the actions (policy)
policy_freq = 2  # Number of iterations to wait before the policy network (Actor model) is updated

state_dim = 3
action_dim = 3
max_action = 1
idx = 0

# instantiate policy

policy = TD3(state_dim, action_dim, max_action)

indices = pd.read_csv('nifty_test.csv')
indices = indices['0']

indices = pd.read_csv('EQUITY_L.csv')
indices = indices['SYMBOL']


# Create the environment for each ticker
# data = pd.read_csv('PAGEIND.csv')

for ticker in indices:
    print(ticker)

    ohlcv = pd.read_csv(ticker + '.csv')
    data = ohlcv.copy()
    data = data['Close']
    data = np.array(data).reshape(-1, 1)
    count = 0
    max_timesteps = data.shape[0]

    data = torch.DoubleTensor(data)
    env = ENV(state_dim, action_dim, data)

    replay_buffer = ReplayBuffer()

    # init training variables

    total_timesteps = 0
    timesteps_since_eval = 0
    episode_num = 0
    done = True
    t0 = time.time()
    obs = env.reset()
    hx = torch.zeros(1, 1, 256)
    cx = torch.zeros(1, 1, 256)

    # Set rewards and episode timesteps to zero

    episode_reward = 0
    episode_timesteps = 0
    episode_num = 0

    # We start the main loop over max_timesteps

    while total_timesteps < max_timesteps:

      # If the episode is done

        if done | (total_timesteps == max_timesteps - 2) \
            & (episode_timesteps > 200):
            count = count + 1
            if (count % 100 == 0) & (count >= 100) \
                | (total_timesteps == max_timesteps - 2) \
                & (episode_timesteps > 200):

            # If we are not at the very beginning, we start the training process of the model

                if total_timesteps != 0:
                    print('Total Timesteps: {} Episode Num: {} Reward: {}'.format(total_timesteps,
                            episode_num, episode_reward))
                    policy.train(
                        replay_buffer,
                        episode_timesteps,
                        batch_size,
                        discount,
                        tau,
                        policy_noise,
                        noise_clip,
                        policy_freq,
                        )

                    if total_timesteps > 0.6 * max_timesteps + 1:
                        print('model output: Total Timesteps: {} Episode Num: {} Reward: {}'.format(total_timesteps,
                                episode_num, episode_reward))
                        f.write('model output: Total Timesteps: '
                                + str(total_timesteps)
                                + ' episode_num '
                                + str(episode_num)
                                + ' episode_reward '
                                + str(episode_reward))

            # When the training step is done, we reset the state of the environment

                obs = env.reset()

            # Set the Done to False

                done = False

            # Set rewards and episode timesteps to zero

                episode_reward = 0
                episode_timesteps = 0
                episode_num += 1
                hx = torch.zeros(1, 1, 256)
                cx = torch.zeros(1, 1, 256)

      # Before 1000 timesteps, we play random actions

        if total_timesteps < 0.6 * max_timesteps:

    # random action

            actn = torch.randn(action_dim)
            action = torch.zeros(action_dim)
            action[torch.argmax(actn)] = 1
        else:

            # After 1000 timesteps, we switch to the model

    #    input of shape (seq_len, batch, input_size)

            obs1 = torch.reshape(obs, (1, 1, state_dim))
            action = policy.select_action(obs1, (hx, cx))
            
            actn = action[0]
            hx = action[1]
            cx = action[2]
            

        # If the explore_noise parameter is not 0, we add noise to the action and we clip it

            if expl_noise != 0:
                print ('policy action:', actn)
                actn = actn + torch.randn(action_dim)
                action = torch.zeros(action_dim)
                action[torch.argmax(actn)] = 1
        

      # The agent performs the action in the environment, then reaches the next state and receives the reward

        (new_obs, reward, done) = env.step(action)

      # We check if the episode is done

        done_bool = (0 if episode_timesteps + 1
                     == env._max_episode_steps else float(done))

      # We increase the total reward

        episode_reward += reward

      # We store the new transition into the Experience Replay memory (ReplayBuffer)

        replay_buffer.add((obs, new_obs, action, reward, done_bool))

      # We update the state, the episode timestep, the total timesteps, and the timesteps since the evaluation of the policy

        obs = new_obs
        episode_timesteps += 1
        total_timesteps += 1
        timesteps_since_eval += 1

f.close()

Here is the output:

20MICRONS
Total Timesteps: 611 Episode Num: 0 Reward: -53044.2697380831
dim state: torch.Size([200, 611, 3])
dim next_state: torch.Size([200, 611, 3])
dim done: torch.Size([200, 611, 1])
dim reward: torch.Size([200, 611, 1])
dim action: torch.Size([200, 611, 3])
Traceback (most recent call last):

  File "C:\Users\granthjain\Desktop\try_lstm.py", line 538, in <module>
    policy_freq,

  File "C:\Users\granthjain\Desktop\try_lstm.py", line 279, in train
    next_action = self.actor_target(next_state, (h0, c0))

  File "C:\Users\granthjain\anaconda3\lib\site-packages\torch\nn\modules\module.py", line 550, in __call__
    result = self.forward(*input, **kwargs)

  File "C:\Users\granthjain\Desktop\try_lstm.py", line 106, in forward
    (output, (hx, cx)) = self.lstm(x, (hx, cx))

  File "C:\Users\granthjain\anaconda3\lib\site-packages\torch\nn\modules\module.py", line 550, in __call__
    result = self.forward(*input, **kwargs)

  File "C:\Users\granthjain\anaconda3\lib\site-packages\torch\nn\modules\rnn.py", line 567, in forward
    self.check_forward_args(input, hx, batch_sizes)

  File "C:\Users\granthjain\anaconda3\lib\site-packages\torch\nn\modules\rnn.py", line 523, in check_forward_args
    'Expected hidden[0] size {}, got {}')

  File "C:\Users\granthjain\anaconda3\lib\site-packages\torch\nn\modules\rnn.py", line 187, in check_hidden_size
    raise RuntimeError(msg.format(expected_hidden_size, tuple(hx.size())))

RuntimeError: Expected hidden[0] size (1, 1, 256), got (1, 611, 256)

Tags: python, pytorch, artificial-intelligence, reinforcement-learning

Solution


Have you also shaped the input as nn.LSTM requires? I see that you did not set batch_first=True, so the input tensor must have the form

  • (seq_len, batch, input_size)
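That is most likely what is happening in your train() method: next_state is reshaped to (seq_len, 1, state_dim), i.e. batch size 1, while h0 and c0 are built with batch size b_state.shape[1] (611 in your output), so the LSTM expects hidden[0] of size (1, 1, 256) but receives (1, 611, 256). A minimal standalone sketch that reproduces the mismatch and shows two ways to make the shapes agree (the variable names and sizes here are only illustrative):

import torch
import torch.nn as nn

state_dim, hidden_size = 3, 256
seq_len, big_batch = 200, 611

lstm = nn.LSTM(state_dim, hidden_size)            # batch_first=False by default

next_state = torch.randn(seq_len, 1, state_dim)   # batch size 1, like the reshaped next_state
h0 = torch.zeros(1, big_batch, hidden_size)       # batch size 611, like h0/c0 in train()
c0 = torch.zeros(1, big_batch, hidden_size)

# lstm(next_state, (h0, c0))  # RuntimeError: Expected hidden[0] size (1, 1, 256), got (1, 611, 256)

# Option 1: feed the LSTM the full batch so the batch dimensions agree
b_next_state = torch.randn(seq_len, big_batch, state_dim)
out, (hn, cn) = lstm(b_next_state, (h0, c0))

# Option 2: build h0/c0 from the input's own batch dimension
h0_small = torch.zeros(1, next_state.shape[1], hidden_size)
c0_small = torch.zeros(1, next_state.shape[1], hidden_size)
out, (hn, cn) = lstm(next_state, (h0_small, c0_small))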
