The full DQN implementation is split into two parts. The first part trains the agent with tabular Q-learning; the second part is the DQN algorithm proper, which combines deep learning and updates the Q values with a neural network.
Agent environment
First we need a simulated environment, and here a simple maze is used as the example. The yellow circle represents the Agent, the red squares are mine cells, and the green square is the goal.
Gym is a general-purpose reinforcement learning environment toolkit and an open-source project from OpenAI. To get familiar with the OpenAI Gym interface, this simple maze environment follows the API of Gym's games and implements the reset, step and render methods. The env.py implementation is shown below.
import time
import sys
import numpy as np

if sys.version_info.major == 2:
    import Tkinter as tk
else:
    import tkinter as tk

WIDTH = 4    # grid columns
HEIGHT = 3   # grid rows
UNIT = 40    # pixel size of one grid cell


class Maze(tk.Tk, object):
    def __init__(self):
        super(Maze, self).__init__()
        # only the length is used; step() maps 0=up, 1=down, 2=right, 3=left
        self.action_space = ['u', 'd', 'l', 'r']
        self.n_actions = len(self.action_space)
        self.title('MAZE')
        self.geometry('{0}x{1}'.format(WIDTH * UNIT, HEIGHT * UNIT))
        self._build_maze()

    def _create_object(self, center_x, center_y, size, shape='oval', color='yellow'):
        """Create one maze object (robot, bomb or treasure) on the canvas."""
        if shape.lower() == 'oval':
            obj = self.canvas.create_oval(
                center_x - size, center_y - size,
                center_x + size, center_y + size,
                fill=color
            )
        elif shape.lower() == 'rectangle':
            obj = self.canvas.create_rectangle(
                center_x - size, center_y - size,
                center_x + size, center_y + size,
                fill=color
            )
        return obj

    def _build_maze(self):
        """Draw the maze: the grid plus the robot, bombs and treasure."""
        self.canvas = tk.Canvas(self, bg='white', width=WIDTH * UNIT, height=HEIGHT * UNIT)
        for c in range(0, WIDTH * UNIT, UNIT):      # vertical grid lines
            x0, y0, x1, y1 = c, 0, c, HEIGHT * UNIT
            self.canvas.create_line(x0, y0, x1, y1)
        for r in range(0, HEIGHT * UNIT, UNIT):     # horizontal grid lines
            x0, y0, x1, y1 = 0, r, WIDTH * UNIT, r
            self.canvas.create_line(x0, y0, x1, y1)
        self.origin = np.array([20, 20])            # center of the top-left cell
        # robot: yellow circle starting in the bottom-left cell
        self.robot_center = self.origin + np.array([0, UNIT * 2])
        self.robot_size = 15
        self.robot = self._create_object(
            self.robot_center[0], self.robot_center[1], self.robot_size,
            shape='oval', color='yellow'
        )
        # bombs: red rectangles
        bomb1_center = self.origin + UNIT
        bomb_size = 15
        self.bomb1 = self._create_object(
            bomb1_center[0], bomb1_center[1], bomb_size,
            shape='rectangle', color='red'
        )
        bomb2_center = self.origin + np.array([UNIT * 3, UNIT])
        self.bomb2 = self._create_object(
            bomb2_center[0], bomb2_center[1], bomb_size,
            shape='rectangle', color='red'
        )
        # treasure: green rectangle in the top-right cell
        treasure_center = self.origin + np.array([UNIT * 3, 0])
        treasure_size = 15
        self.treasure = self._create_object(
            treasure_center[0], treasure_center[1], treasure_size,
            shape='rectangle', color='green'
        )
        self.canvas.pack()
        # self.canvas.wait_window()  # preview maze

    def reset(self):
        """Reset the game: put the robot back at its starting cell."""
        self.update()
        time.sleep(0.5)
        self.canvas.delete(self.robot)
        self.robot = self._create_object(
            self.robot_center[0], self.robot_center[1], self.robot_size,
            shape='oval', color='yellow'
        )
        return self.canvas.coords(self.robot)

    def step(self, action):
        """Move the robot and return the next coords, the reward and the done flag."""
        s = self.canvas.coords(self.robot)
        base_action = np.array([0, 0])
        if action == 0:      # up
            if s[1] > UNIT:
                base_action[1] -= UNIT
        elif action == 1:    # down
            if s[1] < (HEIGHT - 1) * UNIT:
                base_action[1] += UNIT
        elif action == 2:    # right
            if s[0] < (WIDTH - 1) * UNIT:
                base_action[0] += UNIT
        elif action == 3:    # left
            if s[0] > UNIT:
                base_action[0] -= UNIT
        self.canvas.move(self.robot, base_action[0], base_action[1])
        s = self.canvas.coords(self.robot)  # next coords
        if s == self.canvas.coords(self.treasure):
            reward = 1
            done = True
            s = 'terminal'
            print('Mission complete')
        elif s == self.canvas.coords(self.bomb1) or s == self.canvas.coords(self.bomb2):
            reward = -1
            done = True
            s = 'terminal'
            print('boom! failed!')
        else:
            reward = 0
            done = False
        return s, reward, done

    def render(self):
        time.sleep(0.1)
        self.update()
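Before adding any learning, the environment can be smoke-tested on its own, which also shows the Gym-style reset/step/render loop mentioned above. This is only a minimal sketch with a random agent, not part of the original files:

import numpy as np
from env import Maze

env = Maze()
state = env.reset()
for _ in range(20):                               # take 20 random steps
    env.render()
    action = np.random.randint(env.n_actions)     # pick a random action index
    state, reward, done = env.step(action)
    if done:                                      # hit a bomb or reached the treasure
        state = env.reset()
env.destroy()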
Q-Learning implementation
Algorithm implementation:
Pure Q-learning needs no deep neural network: it keeps updating each table entry Q(S, A) with the reward obtained at every step, and selects actions with an ε-greedy policy.
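Concretely, the update applied at every step is the standard tabular Q-learning rule, which is exactly what the learn() method below computes:

Q(S, A) ← Q(S, A) + α [ R + γ max_a' Q(S', a') - Q(S, A) ]

where α is the learning rate, γ is the discount factor and S' is the next state; when S' is terminal the bracketed target degenerates to R.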
Based on the env environment above, the Q-learning algorithm is implemented in q_learning.py as follows:
import numpy as np
import pandas as pd


class QLearning:
    def __init__(self, actions, learning_rate=0.01, discount=0.9, e_greedy=0.1):
        self.actions = actions          # list of action indices, e.g. [0, 1, 2, 3]
        self.alpha = learning_rate
        self.gamma = discount
        self.epsilon = e_greedy         # probability of taking a random action
        self.q_table = pd.DataFrame(columns=self.actions, dtype=np.float32)
        # print(self.q_table)

    def check_state_exist(self, state):
        """Lazily add a row of zeros the first time a state is seen,
        using the state string as the row's index label.
        (DataFrame.append was removed in recent pandas, so the row is added with .loc.)"""
        if state not in self.q_table.index:
            self.q_table.loc[state] = [0.0] * len(self.actions)

    def choose_action(self, state):
        self.check_state_exist(state)
        if np.random.uniform() < self.epsilon:
            # explore: pick a random action
            action = np.random.choice(self.actions)
        else:
            # exploit: pick the action with the largest Q value for this state;
            # shuffling the columns first breaks ties between equal Q values randomly,
            # e.g. state_action = [1, 2, 3, 4, 5] reindexed by [2, 3, 1, 0, 4]
            # becomes [3, 4, 2, 1, 5] and idxmax() still returns the label of the maximum
            state_action = self.q_table.loc[state, :]
            state_action = state_action.reindex(np.random.permutation(state_action.index))
            action = state_action.idxmax()
        return action

    def learn(self, s, a, r, s_):
        self.check_state_exist(s_)
        q_predict = self.q_table.loc[s, a]
        if s_ != 'terminal':
            q_target = r + self.gamma * self.q_table.loc[s_, :].max()
        else:
            q_target = r
        self.q_table.loc[s, a] += self.alpha * (q_target - q_predict)
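As a quick sanity check of the class on its own (a minimal sketch; the state string below is a made-up canvas-coordinate key, used purely for illustration):

from q_learning import QLearning

agent = QLearning(actions=list(range(4)))
s = '[5.0, 85.0, 35.0, 115.0]'      # hypothetical state key (stringified canvas coords)
a = agent.choose_action(s)           # also registers s in the Q table
agent.learn(s, a, -1, 'terminal')    # pretend this move hit a bomb
print(agent.q_table)                 # the row for s now holds a small negative value in column a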
Training the Agent
With the environment (env) and the learning algorithm (q_learning) in place, we can combine the two parts and train our Agent. The play.py code is shown below.
from env import Maze
from q_learning import QLearning


def update():
    for episode in range(100):
        state = env.reset()
        step_count = 0
        while True:
            env.render()
            # states are canvas coordinate lists, so their string form is used as the table key
            action = RL.choose_action(str(state))
            state_, reward, done = env.step(action)
            step_count += 1
            RL.learn(str(state), action, reward, str(state_))
            state = state_
            if done:
                print('Round over at episode {0}, total steps: {1}'.format(episode, step_count))
                break
    env.destroy()


if __name__ == '__main__':
    env = Maze()
    RL = QLearning(actions=list(range(env.n_actions)))
    env.after(100, update)   # schedule training once the Tk main loop starts
    env.mainloop()
    print('\nQ Table')
    print(RL.q_table)
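As an optional extra (not part of the original play.py), the greedy policy learned so far can be read straight off the table by taking the best action index for each visited state:

print(RL.q_table.idxmax(axis=1))   # best action per state: 0=up, 1=down, 2=right, 3=left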