diff --git a/examples/tutorials/homework/lesson2/q_learning_frozenlake/agent.py b/examples/tutorials/homework/lesson2/q_learning_frozenlake/agent.py
new file mode 100644
index 0000000000000000000000000000000000000000..7d72f9cae03c935431f58043fdb505cec526cb6b
--- /dev/null
+++ b/examples/tutorials/homework/lesson2/q_learning_frozenlake/agent.py
@@ -0,0 +1,75 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# -*- coding: utf-8 -*-
+
+import numpy as np
+
+
+class QLearningAgent(object):
+    def __init__(self,
+                 obs_n,
+                 act_n,
+                 learning_rate=0.01,
+                 gamma=0.9,
+                 e_greed=0.1):
+        self.act_n = act_n  # number of available actions
+        self.lr = learning_rate  # learning rate
+        self.gamma = gamma  # discount factor for rewards
+        self.epsilon = e_greed  # probability of taking a random action
+        self.Q = np.zeros((obs_n, act_n))
+
+    # Sample an action for the given observation, with exploration
+    def sample(self, obs):
+        if np.random.uniform(0, 1) < (1.0 - self.epsilon):  # act greedily according to the Q-table
+            action = self.predict(obs)
+        else:
+            action = np.random.choice(self.act_n)  # explore: pick a random action with probability epsilon
+        return action
+
+    # Predict the best action for the given observation (greedy)
+    def predict(self, obs):
+        Q_list = self.Q[obs, :]
+        maxQ = np.max(Q_list)
+        action_list = np.where(Q_list == maxQ)[0]  # maxQ may correspond to several actions
+        action = np.random.choice(action_list)
+        return action
+
+    # Learning method, i.e. how the Q-table is updated
+    def learn(self, obs, action, reward, next_obs, done):
+        """ off-policy
+            obs: observation before the interaction, s_t
+            action: action chosen in this interaction, a_t
+            reward: reward received for this action, r
+            next_obs: observation after the interaction, s_t+1
+            done: whether the episode has ended
+        """
+        predict_Q = self.Q[obs, action]
+        if done:
+            target_Q = reward  # there is no next state
+        else:
+            target_Q = reward + self.gamma * np.max(
+                self.Q[next_obs, :])  # Q-learning
+        self.Q[obs, action] += self.lr * (target_Q - predict_Q)  # update Q
+
+    # Save the Q-table to a file
+    def save(self):
+        npy_file = './q_table.npy'
+        np.save(npy_file, self.Q)
+        print(npy_file + ' saved.')
+
+    # Load the Q-table from a file
+    def restore(self, npy_file='./q_table.npy'):
+        self.Q = np.load(npy_file)
+        print(npy_file + ' loaded.')
diff --git a/examples/tutorials/homework/lesson2/q_learning_frozenlake/gridworld.py b/examples/tutorials/homework/lesson2/q_learning_frozenlake/gridworld.py
new file mode 100644
index 0000000000000000000000000000000000000000..ca8acb2da5476e96d3cb95a479b2dfdbd7ba0b48
--- /dev/null
+++ b/examples/tutorials/homework/lesson2/q_learning_frozenlake/gridworld.py
@@ -0,0 +1,195 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# -*- coding: utf-8 -*-
+
+import gym
+import turtle
+import numpy as np
+
+# turtle tutorial : https://docs.python.org/3.3/library/turtle.html
+
+
+def GridWorld(gridmap=None, is_slippery=False):
+    if gridmap is None:
+        gridmap = ['SFFF', 'FHFH', 'FFFH', 'HFFG']
+    env = gym.make("FrozenLake-v0", desc=gridmap, is_slippery=is_slippery)
+    env = FrozenLakeWapper(env)
+    return env
+
+
+class FrozenLakeWapper(gym.Wrapper):
+    def __init__(self, env):
+        gym.Wrapper.__init__(self, env)
+        self.max_y = env.desc.shape[0]
+        self.max_x = env.desc.shape[1]
+        self.t = None
+        self.unit = 50
+
+    def draw_box(self, x, y, fillcolor='', line_color='gray'):
+        self.t.up()
+        self.t.goto(x * self.unit, y * self.unit)
+        self.t.color(line_color)
+        self.t.fillcolor(fillcolor)
+        self.t.setheading(90)
+        self.t.down()
+        self.t.begin_fill()
+        for _ in range(4):
+            self.t.forward(self.unit)
+            self.t.right(90)
+        self.t.end_fill()
+
+    def move_player(self, x, y):
+        self.t.up()
+        self.t.setheading(90)
+        self.t.fillcolor('red')
+        self.t.goto((x + 0.5) * self.unit, (y + 0.5) * self.unit)
+
+    def render(self):
+        if self.t is None:
+            self.t = turtle.Turtle()
+            self.wn = turtle.Screen()
+            self.wn.setup(self.unit * self.max_x + 100,
+                          self.unit * self.max_y + 100)
+            self.wn.setworldcoordinates(0, 0, self.unit * self.max_x,
+                                        self.unit * self.max_y)
+            self.t.shape('circle')
+            self.t.width(2)
+            self.t.speed(0)
+            self.t.color('gray')
+            for i in range(self.desc.shape[0]):
+                for j in range(self.desc.shape[1]):
+                    x = j
+                    y = self.max_y - 1 - i
+                    if self.desc[i][j] == b'S':  # Start
+                        self.draw_box(x, y, 'white')
+                    elif self.desc[i][j] == b'F':  # Frozen ice
+                        self.draw_box(x, y, 'white')
+                    elif self.desc[i][j] == b'G':  # Goal
+                        self.draw_box(x, y, 'yellow')
+                    elif self.desc[i][j] == b'H':  # Hole
+                        self.draw_box(x, y, 'black')
+                    else:
+                        self.draw_box(x, y, 'white')
+            self.t.shape('turtle')
+
+        x_pos = self.s % self.max_x
+        y_pos = self.max_y - 1 - int(self.s / self.max_x)
+        self.move_player(x_pos, y_pos)
+
+
+class CliffWalkingWapper(gym.Wrapper):
+    def __init__(self, env):
+        gym.Wrapper.__init__(self, env)
+        self.t = None
+        self.unit = 50
+        self.max_x = 12
+        self.max_y = 4
+
+    def draw_x_line(self, y, x0, x1, color='gray'):
+        assert x1 > x0
+        self.t.color(color)
+        self.t.setheading(0)
+        self.t.up()
+        self.t.goto(x0, y)
+        self.t.down()
+        self.t.forward(x1 - x0)
+
+    def draw_y_line(self, x, y0, y1, color='gray'):
+        assert y1 > y0
+        self.t.color(color)
+        self.t.setheading(90)
+        self.t.up()
+        self.t.goto(x, y0)
+        self.t.down()
+        self.t.forward(y1 - y0)
+
+    def draw_box(self, x, y, fillcolor='', line_color='gray'):
+        self.t.up()
+        self.t.goto(x * self.unit, y * self.unit)
+        self.t.color(line_color)
+        self.t.fillcolor(fillcolor)
+        self.t.setheading(90)
+        self.t.down()
+        self.t.begin_fill()
+        for i in range(4):
+            self.t.forward(self.unit)
+            self.t.right(90)
+        self.t.end_fill()
+
+    def move_player(self, x, y):
+        self.t.up()
+        self.t.setheading(90)
+        self.t.fillcolor('red')
+        self.t.goto((x + 0.5) * self.unit, (y + 0.5) * self.unit)
+
+    def render(self):
+        if self.t is None:
+            self.t = turtle.Turtle()
+            self.wn = turtle.Screen()
+            self.wn.setup(self.unit * self.max_x + 100,
+                          self.unit * self.max_y + 100)
+            self.wn.setworldcoordinates(0, 0, self.unit * self.max_x,
+                                        self.unit * self.max_y)
+            self.t.shape('circle')
+            self.t.width(2)
+            self.t.speed(0)
+            self.t.color('gray')
+            for _ in range(2):
+                self.t.forward(self.max_x * self.unit)
+                self.t.left(90)
+                self.t.forward(self.max_y * self.unit)
+                self.t.left(90)
+            for i in range(1, self.max_y):
+                self.draw_x_line(
+                    y=i * self.unit, x0=0, x1=self.max_x * self.unit)
+            for i in range(1, self.max_x):
+                self.draw_y_line(
+                    x=i * self.unit, y0=0, y1=self.max_y * self.unit)
+
+            for i in range(1, self.max_x - 1):
+                self.draw_box(i, 0, 'black')
+            self.draw_box(self.max_x - 1, 0, 'yellow')
+            self.t.shape('turtle')
+
+        x_pos = self.s % self.max_x
+        y_pos = self.max_y - 1 - int(self.s / self.max_x)
+        self.move_player(x_pos, y_pos)
+
+
+if __name__ == '__main__':
+    # Env 1: FrozenLake, the ice can be configured to be slippery or not
+    # 0 left, 1 down, 2 right, 3 up
+    env = gym.make("FrozenLake-v0", is_slippery=False)
+    env = FrozenLakeWapper(env)
+
+    # Env 2: CliffWalking, the cliff environment
+    # env = gym.make("CliffWalking-v0")  # 0 up, 1 right, 2 down, 3 left
+    # env = CliffWalkingWapper(env)
+
+    # Env 3: custom grid world with a configurable map; S = Start, F = Floor, H = Hole, G = Goal
+    # gridmap = [
+    #     'SFFF',
+    #     'FHFF',
+    #     'FFFF',
+    #     'HFGF' ]
+    # env = GridWorld(gridmap)
+
+    env.reset()
+    for step in range(10):
+        action = np.random.randint(0, 4)
+        obs, reward, done, info = env.step(action)
+        print('step {}: action {}, obs {}, reward {}, done {}, info {}'.format(\
+            step, action, obs, reward, done, info))
+        # env.render()  # render one frame
diff --git a/examples/tutorials/homework/lesson2/q_learning_frozenlake/train.py b/examples/tutorials/homework/lesson2/q_learning_frozenlake/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..2b34abe905fec3b8b884266850554d0d96932ed8
--- /dev/null
+++ b/examples/tutorials/homework/lesson2/q_learning_frozenlake/train.py
@@ -0,0 +1,82 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# -*- coding: utf-8 -*-
+
+import gym
+from gridworld import CliffWalkingWapper, FrozenLakeWapper
+from agent import QLearningAgent
+import time
+
+
+def run_episode(env, agent, render=False):
+    total_steps = 0  # number of steps taken in this episode
+    total_reward = 0
+
+    obs = env.reset()  # reset the environment and start a new episode
+
+    while True:
+        action = agent.sample(obs)  # choose an action with the algorithm
+        next_obs, reward, done, _ = env.step(action)  # one interaction with the environment
+        # train with the Q-learning update
+        agent.learn(obs, action, reward, next_obs, done)
+
+        obs = next_obs  # store the observation for the next step
+        total_reward += reward
+        total_steps += 1  # count steps
+        if render:
+            env.render()  # render a new frame
+        if done:
+            break
+    return total_reward, total_steps
+
+
+def test_episode(env, agent):
+    total_reward = 0
+    obs = env.reset()
+    while True:
+        action = agent.predict(obs)  # greedy
+        next_obs, reward, done, _ = env.step(action)
+        total_reward += reward
+        obs = next_obs
+        time.sleep(0.5)
+        env.render()
+        if done:
+            print('test reward = %.1f' % (total_reward))
+            break
+
+
+def main():
+    env = gym.make(
+        "FrozenLake-v0", is_slippery=False)  # 0 left, 1 down, 2 right, 3 up
+    env = FrozenLakeWapper(env)
+
+    agent = QLearningAgent(
+        obs_n=env.observation_space.n,
+        act_n=env.action_space.n,
+        learning_rate=0.1,
+        gamma=0.9,
+        e_greed=0.1)
+
+    for episode in range(500):
+        ep_reward, ep_steps = run_episode(env, agent)
+        print('Episode %s: steps = %s , reward = %.1f' % (episode, ep_steps,
+                                                          ep_reward))
+
+    # training finished; watch how the algorithm performs
+    test_episode(env, agent)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/tutorials/homework/lesson2/sarsa_frozenlake/agent.py b/examples/tutorials/homework/lesson2/sarsa_frozenlake/agent.py
new file mode 100644
index 0000000000000000000000000000000000000000..964230c88bef164dc8f22d5a3eb5e99f242097d3
--- /dev/null
+++ b/examples/tutorials/homework/lesson2/sarsa_frozenlake/agent.py
@@ -0,0 +1,74 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# -*- coding: utf-8 -*-
+
+import numpy as np
+
+
+class SarsaAgent(object):
+    def __init__(self,
+                 obs_n,
+                 act_n,
+                 learning_rate=0.01,
+                 gamma=0.9,
+                 e_greed=0.1):
+        self.act_n = act_n  # number of available actions
+        self.lr = learning_rate  # learning rate
+        self.gamma = gamma  # discount factor for rewards
+        self.epsilon = e_greed  # probability of taking a random action
+        self.Q = np.zeros((obs_n, act_n))
+
+    # Sample an action for the given observation, with exploration
+    def sample(self, obs):
+        if np.random.uniform(0, 1) < (1.0 - self.epsilon):  # act greedily according to the Q-table
+            action = self.predict(obs)
+        else:
+            action = np.random.choice(self.act_n)  # explore: pick a random action with probability epsilon
+        return action
+
+    # Predict the best action for the given observation (greedy)
+    def predict(self, obs):
+        Q_list = self.Q[obs, :]
+        maxQ = np.max(Q_list)
+        action_list = np.where(Q_list == maxQ)[0]  # maxQ may correspond to several actions
+        action = np.random.choice(action_list)
+        return action
+
+    # Learning method, i.e. how the Q-table is updated
+    def learn(self, obs, action, reward, next_obs, next_action, done):
+        """ on-policy
+            obs: observation before the interaction, s_t
+            action: action chosen in this interaction, a_t
+            reward: reward received for this action, r
+            next_obs: observation after the interaction, s_t+1
+            next_action: the action that will be taken for next_obs according to the current Q-table, a_t+1
+            done: whether the episode has ended
+        """
+        predict_Q = self.Q[obs, action]
+        if done:
+            target_Q = reward  # there is no next state
+        else:
+            target_Q = reward + self.gamma * self.Q[next_obs,
+                                                    next_action]  # Sarsa
+        self.Q[obs, action] += self.lr * (target_Q - predict_Q)  # update Q
+
+    def save(self):
+        npy_file = './q_table.npy'
+        np.save(npy_file, self.Q)
+        print(npy_file + ' saved.')
+
+    def restore(self, npy_file='./q_table.npy'):
+        self.Q = np.load(npy_file)
+        print(npy_file + ' loaded.')
diff --git a/examples/tutorials/homework/lesson2/sarsa_frozenlake/gridworld.py b/examples/tutorials/homework/lesson2/sarsa_frozenlake/gridworld.py
new file mode 100644
index 0000000000000000000000000000000000000000..ca8acb2da5476e96d3cb95a479b2dfdbd7ba0b48
--- /dev/null
+++ b/examples/tutorials/homework/lesson2/sarsa_frozenlake/gridworld.py
@@ -0,0 +1,195 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# -*- coding: utf-8 -*-
+
+import gym
+import turtle
+import numpy as np
+
+# turtle tutorial : https://docs.python.org/3.3/library/turtle.html
+
+
+def GridWorld(gridmap=None, is_slippery=False):
+    if gridmap is None:
+        gridmap = ['SFFF', 'FHFH', 'FFFH', 'HFFG']
+    env = gym.make("FrozenLake-v0", desc=gridmap, is_slippery=is_slippery)
+    env = FrozenLakeWapper(env)
+    return env
+
+
+class FrozenLakeWapper(gym.Wrapper):
+    def __init__(self, env):
+        gym.Wrapper.__init__(self, env)
+        self.max_y = env.desc.shape[0]
+        self.max_x = env.desc.shape[1]
+        self.t = None
+        self.unit = 50
+
+    def draw_box(self, x, y, fillcolor='', line_color='gray'):
+        self.t.up()
+        self.t.goto(x * self.unit, y * self.unit)
+        self.t.color(line_color)
+        self.t.fillcolor(fillcolor)
+        self.t.setheading(90)
+        self.t.down()
+        self.t.begin_fill()
+        for _ in range(4):
+            self.t.forward(self.unit)
+            self.t.right(90)
+        self.t.end_fill()
+
+    def move_player(self, x, y):
+        self.t.up()
+        self.t.setheading(90)
+        self.t.fillcolor('red')
+        self.t.goto((x + 0.5) * self.unit, (y + 0.5) * self.unit)
+
+    def render(self):
+        if self.t is None:
+            self.t = turtle.Turtle()
+            self.wn = turtle.Screen()
+            self.wn.setup(self.unit * self.max_x + 100,
+                          self.unit * self.max_y + 100)
+            self.wn.setworldcoordinates(0, 0, self.unit * self.max_x,
+                                        self.unit * self.max_y)
+            self.t.shape('circle')
+            self.t.width(2)
+            self.t.speed(0)
+            self.t.color('gray')
+            for i in range(self.desc.shape[0]):
+                for j in range(self.desc.shape[1]):
+                    x = j
+                    y = self.max_y - 1 - i
+                    if self.desc[i][j] == b'S':  # Start
+                        self.draw_box(x, y, 'white')
+                    elif self.desc[i][j] == b'F':  # Frozen ice
+                        self.draw_box(x, y, 'white')
+                    elif self.desc[i][j] == b'G':  # Goal
+                        self.draw_box(x, y, 'yellow')
+                    elif self.desc[i][j] == b'H':  # Hole
+                        self.draw_box(x, y, 'black')
+                    else:
+                        self.draw_box(x, y, 'white')
+            self.t.shape('turtle')
+
+        x_pos = self.s % self.max_x
+        y_pos = self.max_y - 1 - int(self.s / self.max_x)
+        self.move_player(x_pos, y_pos)
+
+
+class CliffWalkingWapper(gym.Wrapper):
+    def __init__(self, env):
+        gym.Wrapper.__init__(self, env)
+        self.t = None
+        self.unit = 50
+        self.max_x = 12
+        self.max_y = 4
+
+    def draw_x_line(self, y, x0, x1, color='gray'):
+        assert x1 > x0
+        self.t.color(color)
+        self.t.setheading(0)
+        self.t.up()
+        self.t.goto(x0, y)
+        self.t.down()
+        self.t.forward(x1 - x0)
+
+    def draw_y_line(self, x, y0, y1, color='gray'):
+        assert y1 > y0
+        self.t.color(color)
+        self.t.setheading(90)
+        self.t.up()
+        self.t.goto(x, y0)
+        self.t.down()
+        self.t.forward(y1 - y0)
+
+    def draw_box(self, x, y, fillcolor='', line_color='gray'):
+        self.t.up()
+        self.t.goto(x * self.unit, y * self.unit)
+        self.t.color(line_color)
+        self.t.fillcolor(fillcolor)
+        self.t.setheading(90)
+        self.t.down()
+        self.t.begin_fill()
+        for i in range(4):
+            self.t.forward(self.unit)
+            self.t.right(90)
+        self.t.end_fill()
+
+    def move_player(self, x, y):
+        self.t.up()
+        self.t.setheading(90)
+        self.t.fillcolor('red')
+        self.t.goto((x + 0.5) * self.unit, (y + 0.5) * self.unit)
+
+    def render(self):
+        if self.t is None:
+            self.t = turtle.Turtle()
+            self.wn = turtle.Screen()
+            self.wn.setup(self.unit * self.max_x + 100,
+                          self.unit * self.max_y + 100)
+            self.wn.setworldcoordinates(0, 0, self.unit * self.max_x,
+                                        self.unit * self.max_y)
+            self.t.shape('circle')
+            self.t.width(2)
+            self.t.speed(0)
+            self.t.color('gray')
+            for _ in range(2):
+                self.t.forward(self.max_x * self.unit)
+                self.t.left(90)
+                self.t.forward(self.max_y * self.unit)
+                self.t.left(90)
+            for i in range(1, self.max_y):
+                self.draw_x_line(
+                    y=i * self.unit, x0=0, x1=self.max_x * self.unit)
+            for i in range(1, self.max_x):
+                self.draw_y_line(
+                    x=i * self.unit, y0=0, y1=self.max_y * self.unit)
+
+            for i in range(1, self.max_x - 1):
+                self.draw_box(i, 0, 'black')
+            self.draw_box(self.max_x - 1, 0, 'yellow')
+            self.t.shape('turtle')
+
+        x_pos = self.s % self.max_x
+        y_pos = self.max_y - 1 - int(self.s / self.max_x)
+        self.move_player(x_pos, y_pos)
+
+
+if __name__ == '__main__':
+    # Env 1: FrozenLake, the ice can be configured to be slippery or not
+    # 0 left, 1 down, 2 right, 3 up
+    env = gym.make("FrozenLake-v0", is_slippery=False)
+    env = FrozenLakeWapper(env)
+
+    # Env 2: CliffWalking, the cliff environment
+    # env = gym.make("CliffWalking-v0")  # 0 up, 1 right, 2 down, 3 left
+    # env = CliffWalkingWapper(env)
+
+    # Env 3: custom grid world with a configurable map; S = Start, F = Floor, H = Hole, G = Goal
+    # gridmap = [
+    #     'SFFF',
+    #     'FHFF',
+    #     'FFFF',
+    #     'HFGF' ]
+    # env = GridWorld(gridmap)
+
+    env.reset()
+    for step in range(10):
+        action = np.random.randint(0, 4)
+        obs, reward, done, info = env.step(action)
+        print('step {}: action {}, obs {}, reward {}, done {}, info {}'.format(\
+            step, action, obs, reward, done, info))
+        # env.render()  # render one frame
diff --git a/examples/tutorials/homework/lesson2/sarsa_frozenlake/train.py b/examples/tutorials/homework/lesson2/sarsa_frozenlake/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..ad1e3d199b6fec2ce6fe041f17f7b14489a266a2
--- /dev/null
+++ b/examples/tutorials/homework/lesson2/sarsa_frozenlake/train.py
@@ -0,0 +1,84 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# -*- coding: utf-8 -*-
+
+import gym
+from gridworld import CliffWalkingWapper, FrozenLakeWapper
+from agent import SarsaAgent
+import time
+
+
+def run_episode(env, agent, render=False):
+    total_steps = 0  # number of steps taken in this episode
+    total_reward = 0
+
+    obs = env.reset()  # reset the environment and start a new episode
+    action = agent.sample(obs)  # choose an action with the algorithm
+
+    while True:
+        next_obs, reward, done, _ = env.step(action)  # one interaction with the environment
+        next_action = agent.sample(next_obs)  # choose the next action with the algorithm
+        # train with the Sarsa update
+        agent.learn(obs, action, reward, next_obs, next_action, done)
+
+        action = next_action
+        obs = next_obs  # store the observation for the next step
+        total_reward += reward
+        total_steps += 1  # count steps
+        if render:
+            env.render()  # render a new frame
+        if done:
+            break
+    return total_reward, total_steps
+
+
+def test_episode(env, agent):
+    total_reward = 0
+    obs = env.reset()
+    while True:
+        action = agent.predict(obs)  # greedy
+        next_obs, reward, done, _ = env.step(action)
+        total_reward += reward
+        obs = next_obs
+        time.sleep(0.5)
+        env.render()
+        if done:
+            print('test reward = %.1f' % (total_reward))
+            break
+
+
+def main():
+    env = gym.make(
+        "FrozenLake-v0", is_slippery=False)  # 0 left, 1 down, 2 right, 3 up
+    env = FrozenLakeWapper(env)
+
+    agent = SarsaAgent(
+        obs_n=env.observation_space.n,
+        act_n=env.action_space.n,
+        learning_rate=0.1,
+        gamma=0.9,
+        e_greed=0.1)
+
+    for episode in range(500):
+        ep_reward, ep_steps = run_episode(env, agent)
+        print('Episode %s: steps = %s , reward = %.1f' % (episode, ep_steps,
+                                                          ep_reward))
+
+    # training finished; watch how the algorithm performs
+    test_episode(env, agent)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/tutorials/homework/lesson3/dqn_mountaincar/agent.py b/examples/tutorials/homework/lesson3/dqn_mountaincar/agent.py
new file mode 100644
index 0000000000000000000000000000000000000000..e14a737f16b62256ee0eb0efcfe3290222209f51
--- /dev/null
+++ b/examples/tutorials/homework/lesson3/dqn_mountaincar/agent.py
@@ -0,0 +1,97 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#-*- coding: utf-8 -*-
+
+import numpy as np
+import paddle.fluid as fluid
+import parl
+from parl import layers
+
+
+class Agent(parl.Agent):
+    def __init__(self,
+                 algorithm,
+                 obs_dim,
+                 act_dim,
+                 e_greed=0.1,
+                 e_greed_decrement=0):
+        assert isinstance(obs_dim, int)
+        assert isinstance(act_dim, int)
+        self.obs_dim = obs_dim
+        self.act_dim = act_dim
+        super(Agent, self).__init__(algorithm)
+
+        self.global_step = 0
+        self.update_target_steps = 200  # copy the model parameters to target_model every 200 training steps
+
+        self.e_greed = e_greed  # probability of taking a random action (exploration)
+        self.e_greed_decrement = e_greed_decrement  # gradually reduce exploration as training converges
+
+    def build_program(self):
+        self.pred_program = fluid.Program()
+        self.learn_program = fluid.Program()
+
+        with fluid.program_guard(self.pred_program):  # build the program that predicts actions; define its inputs and outputs
+            obs = layers.data(
+                name='obs', shape=[self.obs_dim], dtype='float32')
+            self.value = self.alg.predict(obs)
+
+        with fluid.program_guard(self.learn_program):  # build the program that updates the Q network; define its inputs and outputs
+            obs = layers.data(
+                name='obs', shape=[self.obs_dim], dtype='float32')
+            action = layers.data(name='act', shape=[1], dtype='int32')
+            reward = layers.data(name='reward', shape=[], dtype='float32')
+            next_obs = layers.data(
+                name='next_obs', shape=[self.obs_dim], dtype='float32')
+            terminal = layers.data(name='terminal', shape=[], dtype='bool')
+            self.cost = self.alg.learn(obs, action, reward, next_obs, terminal)
+
+    def sample(self, obs):
+        sample = np.random.rand()  # random float in [0, 1)
+        if sample < self.e_greed:
+            act = np.random.randint(self.act_dim)  # explore: every action has a chance to be chosen
+        else:
+            act = self.predict(obs)  # exploit: choose the best action
+        self.e_greed = max(
+            0.01, self.e_greed - self.e_greed_decrement)  # keep reducing exploration as training converges
+        return act
+
+    def predict(self, obs):  # choose the best action
+        obs = np.expand_dims(obs, axis=0)
+        pred_Q = self.fluid_executor.run(
+            self.pred_program,
+            feed={'obs': obs.astype('float32')},
+            fetch_list=[self.value])[0]
+        pred_Q = np.squeeze(pred_Q, axis=0)
+        act = np.argmax(pred_Q)  # pick the index (action) with the largest Q value
+        return act
+
+    def learn(self, obs, act, reward, next_obs, terminal):
+        # sync model and target_model parameters every 200 training steps
+        if self.global_step % self.update_target_steps == 0:
+            self.alg.sync_target()
+        self.global_step += 1
+
+        act = np.expand_dims(act, -1)
+        feed = {
+            'obs': obs.astype('float32'),
+            'act': act.astype('int32'),
+            'reward': reward,
+            'next_obs': next_obs.astype('float32'),
+            'terminal': terminal
+        }
+        cost = self.fluid_executor.run(
+            self.learn_program, feed=feed, fetch_list=[self.cost])[0]  # one training step of the network
+        return cost
diff --git a/examples/tutorials/homework/lesson3/dqn_mountaincar/model.py b/examples/tutorials/homework/lesson3/dqn_mountaincar/model.py
new file mode 100644
index 0000000000000000000000000000000000000000..17c7a8d93a532884187abf0a8cb44d3823018e56
--- /dev/null
+++ b/examples/tutorials/homework/lesson3/dqn_mountaincar/model.py
@@ -0,0 +1,34 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#-*- coding: utf-8 -*-
+
+import parl
+from parl import layers  # wraps the paddle.fluid.layers API
+
+
+class Model(parl.Model):
+    def __init__(self, act_dim):
+        hid1_size = 128
+        hid2_size = 128
+        # 3-layer fully connected network
+        self.fc1 = layers.fc(size=hid1_size, act='relu')
+        self.fc2 = layers.fc(size=hid2_size, act='relu')
+        self.fc3 = layers.fc(size=act_dim, act=None)
+
+    def value(self, obs):
+        h1 = self.fc1(obs)
+        h2 = self.fc2(h1)
+        Q = self.fc3(h2)
+        return Q
diff --git a/examples/tutorials/homework/lesson3/dqn_mountaincar/replay_memory.py b/examples/tutorials/homework/lesson3/dqn_mountaincar/replay_memory.py
new file mode 100644
index 0000000000000000000000000000000000000000..f7c83688184614a23429d7f64461877f283de9f5
--- /dev/null
+++ b/examples/tutorials/homework/lesson3/dqn_mountaincar/replay_memory.py
@@ -0,0 +1,46 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Modified from https://github.com/seungeunrho/minimalRL/blob/master/dqn.py
+
+import random
+import collections
+import numpy as np
+
+
+class ReplayMemory(object):
+    def __init__(self, max_size):
+        self.buffer = collections.deque(maxlen=max_size)
+
+    def append(self, exp):
+        self.buffer.append(exp)
+
+    def sample(self, batch_size):
+        mini_batch = random.sample(self.buffer, batch_size)
+        obs_batch, action_batch, reward_batch, next_obs_batch, done_batch = [], [], [], [], []
+
+        for experience in mini_batch:
+            s, a, r, s_p, done = experience
+            obs_batch.append(s)
+            action_batch.append(a)
+            reward_batch.append(r)
+            next_obs_batch.append(s_p)
+            done_batch.append(done)
+
+        return np.array(obs_batch).astype('float32'), \
+            np.array(action_batch).astype('float32'), np.array(reward_batch).astype('float32'),\
+            np.array(next_obs_batch).astype('float32'), np.array(done_batch).astype('float32')
+
+    def __len__(self):
+        return len(self.buffer)
diff --git a/examples/tutorials/homework/lesson3/dqn_mountaincar/train.py b/examples/tutorials/homework/lesson3/dqn_mountaincar/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..b198833e3cf9b8565d2316870775555866b03d5a
--- /dev/null
+++ b/examples/tutorials/homework/lesson3/dqn_mountaincar/train.py
@@ -0,0 +1,127 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#-*- coding: utf-8 -*-
+
+import os
+import gym
+import numpy as np
+import parl
+from parl.utils import logger  # logging utility
+
+from model import Model
+from agent import Agent
+from parl.algorithms import DQN
+
+from replay_memory import ReplayMemory
+
+LEARN_FREQ = 5  # learning frequency: no need to learn at every step; accumulate some new experience first for efficiency
+MEMORY_SIZE = 20000  # replay memory size; larger means more memory usage
+MEMORY_WARMUP_SIZE = 200  # warm up the replay memory with some experience before sampling batches for the agent to learn from
+BATCH_SIZE = 32  # number of samples per learning step, drawn randomly from the replay memory
+LEARNING_RATE = 0.001  # learning rate
+GAMMA = 0.99  # reward discount factor, usually between 0.9 and 0.999
+
+
+# run one training episode
+def run_episode(env, agent, rpm):
+    total_reward = 0
+    obs = env.reset()
+    step = 0
+    while True:
+        step += 1
+        action = agent.sample(obs)  # sample an action; every action has a chance to be tried
+        next_obs, reward, done, _ = env.step(action)
+        rpm.append((obs, action, reward, next_obs, done))
+
+        # train model
+        if (len(rpm) > MEMORY_WARMUP_SIZE) and (step % LEARN_FREQ == 0):
+            (batch_obs, batch_action, batch_reward, batch_next_obs,
+             batch_done) = rpm.sample(BATCH_SIZE)
+            train_loss = agent.learn(batch_obs, batch_action, batch_reward,
+                                     batch_next_obs,
+                                     batch_done)  # s,a,r,s',done
+
+        total_reward += reward
+        obs = next_obs
+        if done:
+            break
+    return total_reward
+
+
+# evaluate the agent: run 5 episodes and average the total reward
+def evaluate(env, agent, render=False):
+    eval_reward = []
+    for i in range(5):
+        obs = env.reset()
+        episode_reward = 0
+        while True:
+            action = agent.predict(obs)  # predict greedily; always pick the best action
+            obs, reward, done, _ = env.step(action)
+            episode_reward += reward
+            if render:
+                env.render()
+            if done:
+                break
+        eval_reward.append(episode_reward)
+    return np.mean(eval_reward)
+
+
+def main():
+    env = gym.make('MountainCar-v0')  # MountainCar-v0: expected reward > -120
+    action_dim = env.action_space.n  # MountainCar-v0: 3
+    obs_shape = env.observation_space.shape  # MountainCar-v0: (2,)
+
+    rpm = ReplayMemory(MEMORY_SIZE)  # experience replay pool for DQN
+
+    # build the agent with the PARL framework
+    model = Model(act_dim=action_dim)
+    algorithm = DQN(model, act_dim=action_dim, gamma=GAMMA, lr=LEARNING_RATE)
+    agent = Agent(
+        algorithm,
+        obs_dim=obs_shape[0],
+        act_dim=action_dim,
+        e_greed=0.1,  # probability of taking a random action (exploration)
+        e_greed_decrement=1e-6)  # gradually reduce exploration as training converges
+
+    # load a saved model
+    # save_path = './dqn_model.ckpt'
+    # agent.restore(save_path)
+
+    # pre-fill the replay memory so that early training has enough diverse samples
+    while len(rpm) < MEMORY_WARMUP_SIZE:
+        run_episode(env, agent, rpm)
+
+    max_episode = 2000
+
+    # start train
+    episode = 0
+    while episode < max_episode:  # train for max_episode episodes; test episodes are not counted
+        # train part
+        for i in range(0, 50):
+            total_reward = run_episode(env, agent, rpm)
+            episode += 1
+
+        # test part
+        eval_reward = evaluate(env, agent, render=False)  # set render=True to visualize
+        logger.info('episode:{} e_greed:{} test_reward:{}'.format(
+            episode, agent.e_greed, eval_reward))
+
+    # training finished; save the model
+    save_path = './dqn_model.ckpt'
+    agent.save(save_path)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/examples/tutorials/homework/lesson4/policy_gradient_pong/agent.py b/examples/tutorials/homework/lesson4/policy_gradient_pong/agent.py
new file mode 100644
index 0000000000000000000000000000000000000000..fad9528a1d1f4035aece21fb0aec753cf6519ae9
--- /dev/null
+++ b/examples/tutorials/homework/lesson4/policy_gradient_pong/agent.py
@@ -0,0 +1,75 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#-*- coding: utf-8 -*-
+
+import numpy as np
+import paddle.fluid as fluid
+import parl
+from parl import layers
+
+
+class Agent(parl.Agent):
+    def __init__(self, algorithm, obs_dim, act_dim):
+        self.obs_dim = obs_dim
+        self.act_dim = act_dim
+        super(Agent, self).__init__(algorithm)
+
+    def build_program(self):
+        self.pred_program = fluid.Program()
+        self.learn_program = fluid.Program()
+
+        with fluid.program_guard(self.pred_program):  # build the program that predicts actions; define its inputs and outputs
+            obs = layers.data(
+                name='obs', shape=[self.obs_dim], dtype='float32')
+            self.act_prob = self.alg.predict(obs)
+
+        with fluid.program_guard(
+                self.learn_program):  # build the program that updates the policy network; define its inputs and outputs
+            obs = layers.data(
+                name='obs', shape=[self.obs_dim], dtype='float32')
+            act = layers.data(name='act', shape=[1], dtype='int64')
+            reward = layers.data(name='reward', shape=[], dtype='float32')
+            self.cost = self.alg.learn(obs, act, reward)
+
+    def sample(self, obs):
+        obs = np.expand_dims(obs, axis=0)  # add a batch dimension
+        act_prob = self.fluid_executor.run(
+            self.pred_program,
+            feed={'obs': obs.astype('float32')},
+            fetch_list=[self.act_prob])[0]
+        act_prob = np.squeeze(act_prob, axis=0)  # remove the batch dimension
+        act = np.random.choice(range(self.act_dim), p=act_prob)  # sample an action according to the action probabilities
+        return act
+
+    def predict(self, obs):
+        obs = np.expand_dims(obs, axis=0)
+        act_prob = self.fluid_executor.run(
+            self.pred_program,
+            feed={'obs': obs.astype('float32')},
+            fetch_list=[self.act_prob])[0]
+        act_prob = np.squeeze(act_prob, axis=0)
+        act = np.argmax(act_prob)  # choose the action with the highest probability
+        return act
+
+    def learn(self, obs, act, reward):
+        act = np.expand_dims(act, axis=-1)
+        feed = {
+            'obs': obs.astype('float32'),
+            'act': act.astype('int64'),
+            'reward': reward.astype('float32')
+        }
+        cost = self.fluid_executor.run(
+            self.learn_program, feed=feed, fetch_list=[self.cost])[0]
+        return cost
diff --git a/examples/tutorials/homework/lesson4/policy_gradient_pong/model.py b/examples/tutorials/homework/lesson4/policy_gradient_pong/model.py
new file mode 100644
index 0000000000000000000000000000000000000000..f8b14abcf209f3fac1baf066bda39995c6752ed5
--- /dev/null
+++ b/examples/tutorials/homework/lesson4/policy_gradient_pong/model.py
@@ -0,0 +1,35 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#-*- coding: utf-8 -*-
+
+import parl
+from parl import layers
+
+
+class Model(parl.Model):
+    def __init__(self, act_dim):
+        act_dim = act_dim
+        hid1_size = 256
+        hid2_size = 64
+
+        self.fc1 = layers.fc(size=hid1_size, act='relu')
+        self.fc2 = layers.fc(size=hid2_size, act='relu')
+        self.fc3 = layers.fc(size=act_dim, act='softmax')
+
+    def forward(self, obs):
+        h1 = self.fc1(obs)
+        h2 = self.fc2(h1)
+        out = self.fc3(h2)
+        return out
diff --git a/examples/tutorials/homework/lesson4/policy_gradient_pong/train.py b/examples/tutorials/homework/lesson4/policy_gradient_pong/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..b78d0c2228ebf65c74204eff42c3af9b52a4985f
--- /dev/null
+++ b/examples/tutorials/homework/lesson4/policy_gradient_pong/train.py
@@ -0,0 +1,125 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#-*- coding: utf-8 -*-
+
+import os
+import gym
+import numpy as np
+import parl
+
+from agent import Agent
+from model import Model
+from parl.algorithms import PolicyGradient
+
+from parl.utils import logger
+
+LEARNING_RATE = 1e-3
+
+
+def run_episode(env, agent):
+    obs_list, action_list, reward_list = [], [], []
+    obs = env.reset()
+    while True:
+        obs = preprocess(obs)  # from shape (210, 160, 3) to (6400,)
+        obs_list.append(obs)
+        action = agent.sample(obs)
+        action_list.append(action)
+
+        obs, reward, done, info = env.step(action)
+        reward_list.append(reward)
+
+        if done:
+            break
+    return obs_list, action_list, reward_list
+
+
+# evaluate the agent: run 5 episodes and average the total reward
+def evaluate(env, agent, render=False):
+    eval_reward = []
+    for i in range(5):
+        obs = env.reset()
+        episode_reward = 0
+        while True:
+            obs = preprocess(obs)  # from shape (210, 160, 3) to (6400,)
+            action = agent.predict(obs)
+            obs, reward, isOver, _ = env.step(action)
+            episode_reward += reward
+            if render:
+                env.render()
+            if isOver:
+                break
+        eval_reward.append(episode_reward)
+    return np.mean(eval_reward)
+
+
+def preprocess(image):
+    """ Preprocess a 210x160x3 uint8 frame into a 6400-dim (80x80) 1-D float vector """
+    image = image[35:195]  # crop
+    image = image[::2, ::2, 0]  # downsample by a factor of 2
+    image[image == 144] = 0  # erase background (background type 1)
+    image[image == 109] = 0  # erase background (background type 2)
+    image[image != 0] = 1  # binarize: everything except the background becomes 1
+    return image.astype(np.float).ravel()
+
+
+def calc_reward_to_go(reward_list, gamma=0.99):
+    """calculate discounted reward"""
+    reward_arr = np.array(reward_list)
+    for i in range(len(reward_arr) - 2, -1, -1):
+        # G_t = r_t + γ·r_t+1 + ... = r_t + γ·G_t+1
+        reward_arr[i] += gamma * reward_arr[i + 1]
+    # normalize episode rewards
+    reward_arr -= np.mean(reward_arr)
+    reward_arr /= np.std(reward_arr)
+    return reward_arr
+
+
+def main():
+    env = gym.make('Pong-v0')
+    obs_dim = 80 * 80
+    act_dim = env.action_space.n
+    logger.info('obs_dim {}, act_dim {}'.format(obs_dim, act_dim))
+
+    # build the agent with the PARL framework
+    model = Model(act_dim=act_dim)
+    alg = PolicyGradient(model, lr=LEARNING_RATE)
+    agent = Agent(alg, obs_dim=obs_dim, act_dim=act_dim)
+
+    # load a saved model
+    # if os.path.exists('./model.ckpt'):
+    #     agent.restore('./model.ckpt')
+
+    for i in range(1000):
+        obs_list, action_list, reward_list = run_episode(env, agent)
+        if i % 10 == 0:
+            logger.info("Train Episode {}, Reward Sum {}.".format(
+                i, sum(reward_list)))
+
+        batch_obs = np.array(obs_list)
+        batch_action = np.array(action_list)
+        batch_reward = calc_reward_to_go(reward_list)
+
+        agent.learn(batch_obs, batch_action, batch_reward)
+        if (i + 1) % 100 == 0:
+            total_reward = evaluate(env, agent, render=False)
+            logger.info('Episode {}, Test reward: {}'.format(
+                i + 1, total_reward))
+
+    # save the parameters to ./model.ckpt
+    agent.save('./model.ckpt')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/examples/tutorials/homework/lesson5/ddpg_quadrotor/quadrotor_agent.py b/examples/tutorials/homework/lesson5/ddpg_quadrotor/quadrotor_agent.py
new file mode 100644
index 0000000000000000000000000000000000000000..726f2f18eb3ebb4afbbe335d5bcb5f69e4cc798d
--- /dev/null
+++ b/examples/tutorials/homework/lesson5/ddpg_quadrotor/quadrotor_agent.py
@@ -0,0 +1,73 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# -*- coding: utf-8 -*-
+
+import numpy as np
+import parl
+from parl import layers
+from paddle import fluid
+
+
+class QuadrotorAgent(parl.Agent):
+    def __init__(self, algorithm, obs_dim, act_dim=4):
+        assert isinstance(obs_dim, int)
+        assert isinstance(act_dim, int)
+        self.obs_dim = obs_dim
+        self.act_dim = act_dim
+        super(QuadrotorAgent, self).__init__(algorithm)
+
+        # Attention: at the beginning, fully sync the target model.
+        self.alg.sync_target(decay=0)
+
+    def build_program(self):
+        self.pred_program = fluid.Program()
+        self.learn_program = fluid.Program()
+
+        with fluid.program_guard(self.pred_program):
+            obs = layers.data(
+                name='obs', shape=[self.obs_dim], dtype='float32')
+            self.pred_act = self.alg.predict(obs)
+
+        with fluid.program_guard(self.learn_program):
+            obs = layers.data(
+                name='obs', shape=[self.obs_dim], dtype='float32')
+            act = layers.data(
+                name='act', shape=[self.act_dim], dtype='float32')
+            reward = layers.data(name='reward', shape=[], dtype='float32')
+            next_obs = layers.data(
+                name='next_obs', shape=[self.obs_dim], dtype='float32')
+            terminal = layers.data(name='terminal', shape=[], dtype='bool')
+            _, self.critic_cost = self.alg.learn(obs, act, reward, next_obs,
+                                                 terminal)
+
+    def predict(self, obs):
+        obs = np.expand_dims(obs, axis=0)
+        act = self.fluid_executor.run(
+            self.pred_program, feed={'obs': obs},
+            fetch_list=[self.pred_act])[0]
+        return act
+
+    def learn(self, obs, act, reward, next_obs, terminal):
+        feed = {
+            'obs': obs,
+            'act': act,
+            'reward': reward,
+            'next_obs': next_obs,
+            'terminal': terminal
+        }
+        critic_cost = self.fluid_executor.run(
+            self.learn_program, feed=feed, fetch_list=[self.critic_cost])[0]
+        self.alg.sync_target()
+        return critic_cost
diff --git a/examples/tutorials/homework/lesson5/ddpg_quadrotor/quadrotor_model.py b/examples/tutorials/homework/lesson5/ddpg_quadrotor/quadrotor_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..99f0ce3f47faabc68dd92b7f540bb295b7a03c06
--- /dev/null
+++ b/examples/tutorials/homework/lesson5/ddpg_quadrotor/quadrotor_model.py
@@ -0,0 +1,63 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# -*- coding: utf-8 -*-
+
+import paddle.fluid as fluid
+import parl
+from parl import layers
+
+
+class ActorModel(parl.Model):
+    def __init__(self, act_dim):
+        hidden_dim_1, hidden_dim_2 = 64, 64
+        self.fc1 = layers.fc(size=hidden_dim_1, act='tanh')
+        self.fc2 = layers.fc(size=hidden_dim_2, act='tanh')
+        self.fc3 = layers.fc(size=act_dim, act='tanh')
+
+    def policy(self, obs):
+        x = self.fc1(obs)
+        x = self.fc2(x)
+        return self.fc3(x)
+
+
+class CriticModel(parl.Model):
+    def __init__(self):
+        hidden_dim_1, hidden_dim_2 = 64, 64
+        self.fc1 = layers.fc(size=hidden_dim_1, act='tanh')
+        self.fc2 = layers.fc(size=hidden_dim_2, act='tanh')
+        self.fc3 = layers.fc(size=1, act=None)
+
+    def value(self, obs, act):
+        x = self.fc1(obs)
+        concat = layers.concat([x, act], axis=1)
+        x = self.fc2(concat)
+        Q = self.fc3(x)
+        Q = layers.squeeze(Q, axes=[1])
+        return Q
+
+
+class QuadrotorModel(parl.Model):
+    def __init__(self, act_dim):
+        self.actor_model = ActorModel(act_dim)
+        self.critic_model = CriticModel()
+
+    def policy(self, obs):
+        return self.actor_model.policy(obs)
+
+    def value(self, obs, act):
+        return self.critic_model.value(obs, act)
+
+    def get_actor_params(self):
+        return self.actor_model.parameters()
diff --git a/examples/tutorials/homework/lesson5/ddpg_quadrotor/train.py b/examples/tutorials/homework/lesson5/ddpg_quadrotor/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..c33fa331875f4f0b4ce63acefe006b975d82bd51
--- /dev/null
+++ b/examples/tutorials/homework/lesson5/ddpg_quadrotor/train.py
@@ -0,0 +1,137 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# -*- coding: utf-8 -*-
+
+import os
+import numpy as np
+
+import parl
+from parl import layers
+from paddle import fluid
+from parl.utils import logger
+from parl.utils import action_mapping  # maps the network output into the actual action range
+from parl.utils import ReplayMemory  # experience replay
+
+from rlschool import make_env  # create the quadrotor environment with RLSchool
+from quadrotor_model import QuadrotorModel
+from quadrotor_agent import QuadrotorAgent
+from parl.algorithms import DDPG
+
+GAMMA = 0.99  # reward discount factor, usually between 0.9 and 0.999
+TAU = 0.001  # soft-update coefficient for syncing target_model with model
+ACTOR_LR = 0.0002  # learning rate of the actor network
+CRITIC_LR = 0.001  # learning rate of the critic network
+MEMORY_SIZE = 1e6  # replay memory size; larger means more memory usage
+MEMORY_WARMUP_SIZE = 1e4  # warm up the replay memory with some experience before sampling batches for the agent to learn from
+REWARD_SCALE = 0.01  # reward scaling factor
+BATCH_SIZE = 256  # number of samples per learning step, drawn randomly from the replay memory
+TRAIN_TOTAL_STEPS = 1e6  # total number of training steps
+TEST_EVERY_STEPS = 1e4  # evaluate every N steps; each evaluation averages the reward over 5 episodes
+
+
+def run_episode(env, agent, rpm):
+    obs = env.reset()
+    total_reward, steps = 0, 0
+    while True:
+        steps += 1
+        batch_obs = np.expand_dims(obs, axis=0)
+        action = agent.predict(batch_obs.astype('float32'))
+        action = np.squeeze(action)
+
+        # Add exploration noise, and clip to [-1.0, 1.0]
+        action = np.clip(np.random.normal(action, 1.0), -1.0, 1.0)
+        action = action_mapping(action, env.action_space.low[0],
+                                env.action_space.high[0])
+
+        next_obs, reward, done, info = env.step(action)
+        rpm.append(obs, action, REWARD_SCALE * reward, next_obs, done)
+
+        if rpm.size() > MEMORY_WARMUP_SIZE:
+            batch_obs, batch_action, batch_reward, batch_next_obs, \
+                batch_terminal = rpm.sample_batch(BATCH_SIZE)
+            critic_cost = agent.learn(batch_obs, batch_action, batch_reward,
+                                      batch_next_obs, batch_terminal)
+
+        obs = next_obs
+        total_reward += reward
+
+        if done:
+            break
+    return total_reward, steps
+
+
+# evaluate the agent: run 5 episodes and average the total reward
+def evaluate(env, agent, render=False):
+    eval_reward = []
+    for i in range(5):
+        obs = env.reset()
+        total_reward, steps = 0, 0
+        while True:
+            batch_obs = np.expand_dims(obs, axis=0)
+            action = agent.predict(batch_obs.astype('float32'))
+            action = np.squeeze(action)
+            action = np.clip(action, -1.0, 1.0)  ## special
+            action = action_mapping(action, env.action_space.low[0],
+                                    env.action_space.high[0])
+            # action = np.clip(action, -1.0, 1.0)  ## special
+
+            next_obs, reward, done, info = env.step(action)
+
+            obs = next_obs
+            total_reward += reward
+            steps += 1
+
+            if render:
+                env.render()
+
+            if done:
+                break
+        eval_reward.append(total_reward)
+    return np.mean(eval_reward)
+
+
+# create the quadrotor environment
+env = make_env("Quadrotor", task="hovering_control")
+env.reset()
+obs_dim = env.observation_space.shape[0]
+act_dim = env.action_space.shape[0]
+
+# build the agent with PARL: QuadrotorModel, DDPG and QuadrotorAgent nested together
+model = QuadrotorModel(act_dim)
+algorithm = DDPG(
+    model, gamma=GAMMA, tau=TAU, actor_lr=ACTOR_LR, critic_lr=CRITIC_LR)
+agent = QuadrotorAgent(algorithm, obs_dim, act_dim)
+
+# PARL also ships a built-in ReplayMemory for DDPG; it can be imported directly from parl.utils
+rpm = ReplayMemory(int(MEMORY_SIZE), obs_dim, act_dim)
+
+test_flag = 0
+total_steps = 0
+while total_steps < TRAIN_TOTAL_STEPS:
+    train_reward, steps = run_episode(env, agent, rpm)
+    total_steps += steps
+    #logger.info('Steps: {} Reward: {}'.format(total_steps, train_reward))
+
+    if total_steps // TEST_EVERY_STEPS >= test_flag:
+        while total_steps // TEST_EVERY_STEPS >= test_flag:
+            test_flag += 1
+
+        evaluate_reward = evaluate(env, agent)
+        logger.info('Steps {}, Test reward: {}'.format(total_steps,
+                                                       evaluate_reward))
+
+        # save the model
+        ckpt = 'model_dir/steps_{}.ckpt'.format(total_steps)
+        agent.save(ckpt)
diff --git a/examples/tutorials/lesson4/policy_gradient/train.py b/examples/tutorials/lesson4/policy_gradient/train.py
index 306c22526f76a2ecfc1793dcca083856dc51c45b..9af79095a87cae543a33d0b144b1528960ff34c4 100644
--- a/examples/tutorials/lesson4/policy_gradient/train.py
+++ b/examples/tutorials/lesson4/policy_gradient/train.py
@@ -65,7 +65,7 @@ def evaluate(env, agent, render=False):
 
 
 def calc_reward_to_go(reward_list, gamma=1.0):
     for i in range(len(reward_list) - 2, -1, -1):
-        # G_t = r_t + γ·r_t+1 + ... = r_t + γ·G_t+1
+        # G_i = r_i + γ·G_i+1
         reward_list[i] += gamma * reward_list[i + 1]  # Gt
     return np.array(reward_list)