Implementing A Painless Q-learning Tutorial (Deep Learning Notes, Part 2)

x0216u 2017-11-15 16:38

Example: A Painless Q-learning Tutorial (a concise tutorial on the Q-learning algorithm)

Version 1:

import numpy as np
import random

# initialize the Q matrix (6 states x 6 actions)
q = np.zeros((6, 6))

# reward matrix: -1 marks an impossible move, 0 a possible move,
# and 100 a move that reaches the goal state 5
r = np.array([[-1, -1, -1, -1,  0,  -1],
              [-1, -1, -1,  0, -1, 100],
              [-1, -1, -1,  0, -1,  -1],
              [-1,  0,  0, -1,  0,  -1],
              [ 0, -1, -1,  0, -1, 100],
              [-1,  0, -1, -1,  0, 100]])

gamma = 0.8  # discount factor

e = np.zeros((6, 6))  # snapshot of q, used to print only when q changes

for i in range(100):
    # one episode, starting from a random state
    state = random.randint(0, 5)
    while state != 5:
        # collect the valid actions (non-negative reward) from this state
        r_pos_action = [action for action in range(6) if r[state, action] >= 0]
        # move to a randomly chosen valid next state
        next_state = random.choice(r_pos_action)
        q[state, next_state] = r[state, next_state] + gamma * q[next_state].max()
        # print q whenever it has changed
        if (e != q).any():
            e = q.copy()
            print("%s----%s" % (state, next_state))
            print(q)
            print("\n")
        state = next_state
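
The assignment inside the loop is the deterministic Q-learning update from the tutorial:

Q(s, a) = R(s, a) + \gamma \, \max_{a'} Q(s', a')

where the chosen action a doubles as the next state s'. With \gamma = 0.8 and a goal reward of 100, the values along any path into state 5 converge to 100, 80, 64, 51.2, ..., which is exactly what the matrix below shows.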
        

The resulting Q matrix:

[[   0.     0.     0.     0.    80.     0. ]
 [   0.     0.     0.    64.     0.   100. ]
 [   0.     0.     0.    64.     0.     0. ]
 [   0.    80.    51.2    0.    80.     0. ]
 [  64.     0.     0.    64.     0.   100. ]
 [   0.     0.     0.     0.     0.     0. ]]
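
To turn the learned matrix into a route, start anywhere and repeatedly move to the state with the largest Q entry (this is what Version 2's test() does below). A minimal sketch, run after the script above; the walk helper is my name, not part of the original post:

def walk(q, state, goal=5):
    # follow the largest Q entry greedily until the goal is reached
    # (argmax breaks ties toward the lower index)
    path = [state]
    while state != goal:
        state = int(q[state].argmax())  # the greedy action is also the next state
        path.append(state)
    return path

print(walk(q, 2))  # with the matrix above: [2, 3, 1, 5]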

 

Version 2:

import numpy as np
import random

# set up the initial q and reward matrices
q = np.zeros((6, 6))
rewards = np.zeros((6, 6))
rewards[:, 5] = 500  # every move into the goal state 5 pays 500

# valid actions (reachable next states) for each state
actions = [[4], [3, 5], [3], [1, 2, 4], [0, 3, 5], [1, 4, 5]]

def trial():
    # run one episode from a random start state
    # (random.randint is inclusive on both ends, so the upper bound is 5, not 6)
    s = random.randint(0, 5)
    while s < 5:
        # random.choice returns a random item from a sequence
        s1 = a = random.choice(actions[s])
        q[s, a] = rewards[s, a] + 0.8 * q[s1].max()
        s = s1

for i in range(200):
    trial()

print(q)

def test(s):
    # follow the largest Q entry until the goal state is reached
    print(s)
    while s < 5:
        # argmax returns the index of the largest entry
        s = q[s].argmax()
        print("->%s" % s)

test(2)

The resulting Q matrix and the test run from state 2:

[[   0.    0.    0.    0.  400.    0.]
 [   0.    0.    0.  320.    0.  500.]
 [   0.    0.    0.  320.    0.    0.]
 [   0.  400.  256.    0.  400.    0.]
 [ 320.    0.    0.  320.    0.  500.]
 [   0.    0.    0.    0.    0.    0.]]
2
->3
->1
->5
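
Apart from the reward being 500 instead of 100, this matrix is just Version 1's scaled by a factor of 5. The original tutorial normalizes Q against its largest entry so that different reward scales become comparable; a minimal sketch of that step (my addition, not in the original post):

print(q / q.max() * 100)  # rescale so the largest entry is 100; recovers 80, 64, 51.2, ...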

 
