diff --git a/examples/tutorials/homework/lesson2/q_learning_frozenlake/agent.py b/examples/tutorials/homework/lesson2/q_learning_frozenlake/agent.py
new file mode 100644
index 0000000000000000000000000000000000000000..7d72f9cae03c935431f58043fdb505cec526cb6b
--- /dev/null
+++ b/examples/tutorials/homework/lesson2/q_learning_frozenlake/agent.py
@@ -0,0 +1,75 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# -*- coding: utf-8 -*-
+
+import numpy as np
+
+
+class QLearningAgent(object):
+    def __init__(self,
+                 obs_n,
+                 act_n,
+                 learning_rate=0.01,
+                 gamma=0.9,
+                 e_greed=0.1):
+        self.act_n = act_n  # number of available actions
+        self.lr = learning_rate  # learning rate
+        self.gamma = gamma  # discount factor for rewards
+        self.epsilon = e_greed  # probability of taking a random action
+        self.Q = np.zeros((obs_n, act_n))
+
+    # Sample an action for the given observation, with exploration
+    def sample(self, obs):
+        if np.random.uniform(0, 1) < (1.0 - self.epsilon):  # act greedily according to the Q-table
+            action = self.predict(obs)
+        else:
+            action = np.random.choice(self.act_n)  # explore: pick a random action with probability epsilon
+        return action
+
+    # Predict the best action for the given observation (greedy)
+    def predict(self, obs):
+        Q_list = self.Q[obs, :]
+        maxQ = np.max(Q_list)
+        action_list = np.where(Q_list == maxQ)[0]  # maxQ may correspond to several actions
+        action = np.random.choice(action_list)
+        return action
+
+    # Learning method, i.e. how the Q-table is updated
+    def learn(self, obs, action, reward, next_obs, done):
+        """ off-policy
+            obs: observation before the interaction, s_t
+            action: action chosen in this interaction, a_t
+            reward: reward received for this action, r
+            next_obs: observation after the interaction, s_t+1
+            done: whether the episode has ended
+        """
+        predict_Q = self.Q[obs, action]
+        if done:
+            target_Q = reward  # there is no next state
+        else:
+            target_Q = reward + self.gamma * np.max(
+                self.Q[next_obs, :])  # Q-learning
+        self.Q[obs, action] += self.lr * (target_Q - predict_Q)  # update Q
+
+    # Save the Q-table to a file
+    def save(self):
+        npy_file = './q_table.npy'
+        np.save(npy_file, self.Q)
+        print(npy_file + ' saved.')
+
+    # Load the Q-table from a file
+    def restore(self, npy_file='./q_table.npy'):
+        self.Q = np.load(npy_file)
+        print(npy_file + ' loaded.')
diff --git a/examples/tutorials/homework/lesson2/q_learning_frozenlake/gridworld.py b/examples/tutorials/homework/lesson2/q_learning_frozenlake/gridworld.py
new file mode 100644
index 0000000000000000000000000000000000000000..ca8acb2da5476e96d3cb95a479b2dfdbd7ba0b48
--- /dev/null
+++ b/examples/tutorials/homework/lesson2/q_learning_frozenlake/gridworld.py
@@ -0,0 +1,195 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# -*- coding: utf-8 -*-
+
+import gym
+import turtle
+import numpy as np
+
+# turtle tutorial : https://docs.python.org/3.3/library/turtle.html
+
+
+def GridWorld(gridmap=None, is_slippery=False):
+    if gridmap is None:
+        gridmap = ['SFFF', 'FHFH', 'FFFH', 'HFFG']
+    env = gym.make("FrozenLake-v0", desc=gridmap, is_slippery=is_slippery)
+    env = FrozenLakeWapper(env)
+    return env
+
+
+class FrozenLakeWapper(gym.Wrapper):
+    def __init__(self, env):
+        gym.Wrapper.__init__(self, env)
+        self.max_y = env.desc.shape[0]
+        self.max_x = env.desc.shape[1]
+        self.t = None
+        self.unit = 50
+
+    def draw_box(self, x, y, fillcolor='', line_color='gray'):
+        self.t.up()
+        self.t.goto(x * self.unit, y * self.unit)
+        self.t.color(line_color)
+        self.t.fillcolor(fillcolor)
+        self.t.setheading(90)
+        self.t.down()
+        self.t.begin_fill()
+        for _ in range(4):
+            self.t.forward(self.unit)
+            self.t.right(90)
+        self.t.end_fill()
+
+    def move_player(self, x, y):
+        self.t.up()
+        self.t.setheading(90)
+        self.t.fillcolor('red')
+        self.t.goto((x + 0.5) * self.unit, (y + 0.5) * self.unit)
+
+    def render(self):
+        if self.t is None:
+            self.t = turtle.Turtle()
+            self.wn = turtle.Screen()
+            self.wn.setup(self.unit * self.max_x + 100,
+                          self.unit * self.max_y + 100)
+            self.wn.setworldcoordinates(0, 0, self.unit * self.max_x,
+                                        self.unit * self.max_y)
+            self.t.shape('circle')
+            self.t.width(2)
+            self.t.speed(0)
+            self.t.color('gray')
+            for i in range(self.desc.shape[0]):
+                for j in range(self.desc.shape[1]):
+                    x = j
+                    y = self.max_y - 1 - i
+                    if self.desc[i][j] == b'S':  # Start
+                        self.draw_box(x, y, 'white')
+                    elif self.desc[i][j] == b'F':  # Frozen ice
+                        self.draw_box(x, y, 'white')
+                    elif self.desc[i][j] == b'G':  # Goal
+                        self.draw_box(x, y, 'yellow')
+                    elif self.desc[i][j] == b'H':  # Hole
+                        self.draw_box(x, y, 'black')
+                    else:
+                        self.draw_box(x, y, 'white')
+            self.t.shape('turtle')
+
+        x_pos = self.s % self.max_x
+        y_pos = self.max_y - 1 - int(self.s / self.max_x)
+        self.move_player(x_pos, y_pos)
+
+
+class CliffWalkingWapper(gym.Wrapper):
+    def __init__(self, env):
+        gym.Wrapper.__init__(self, env)
+        self.t = None
+        self.unit = 50
+        self.max_x = 12
+        self.max_y = 4
+
+    def draw_x_line(self, y, x0, x1, color='gray'):
+        assert x1 > x0
+        self.t.color(color)
+        self.t.setheading(0)
+        self.t.up()
+        self.t.goto(x0, y)
+        self.t.down()
+        self.t.forward(x1 - x0)
+
+    def draw_y_line(self, x, y0, y1, color='gray'):
+        assert y1 > y0
+        self.t.color(color)
+        self.t.setheading(90)
+        self.t.up()
+        self.t.goto(x, y0)
+        self.t.down()
+        self.t.forward(y1 - y0)
+
+    def draw_box(self, x, y, fillcolor='', line_color='gray'):
+        self.t.up()
+        self.t.goto(x * self.unit, y * self.unit)
+        self.t.color(line_color)
+        self.t.fillcolor(fillcolor)
+        self.t.setheading(90)
+        self.t.down()
+        self.t.begin_fill()
+        for i in range(4):
+            self.t.forward(self.unit)
+            self.t.right(90)
+        self.t.end_fill()
+
+    def move_player(self, x, y):
+        self.t.up()
+        self.t.setheading(90)
+        self.t.fillcolor('red')
+        self.t.goto((x + 0.5) * self.unit, (y + 0.5) * self.unit)
+
+    def render(self):
+        if self.t is None:
+            self.t = turtle.Turtle()
+            self.wn = turtle.Screen()
+            self.wn.setup(self.unit * self.max_x + 100,
+                          self.unit * self.max_y + 100)
+            self.wn.setworldcoordinates(0, 0, self.unit * self.max_x,
+                                        self.unit * self.max_y)
+            self.t.shape('circle')
+            self.t.width(2)
+            self.t.speed(0)
+            self.t.color('gray')
+            for _ in range(2):
+                self.t.forward(self.max_x * self.unit)
+                self.t.left(90)
+                self.t.forward(self.max_y * self.unit)
+                self.t.left(90)
+            for i in range(1, self.max_y):
+                self.draw_x_line(
+                    y=i * self.unit, x0=0, x1=self.max_x * self.unit)
+            for i in range(1, self.max_x):
+                self.draw_y_line(
+                    x=i * self.unit, y0=0, y1=self.max_y * self.unit)
+
+            for i in range(1, self.max_x - 1):
+                self.draw_box(i, 0, 'black')
+            self.draw_box(self.max_x - 1, 0, 'yellow')
+            self.t.shape('turtle')
+
+        x_pos = self.s % self.max_x
+        y_pos = self.max_y - 1 - int(self.s / self.max_x)
+        self.move_player(x_pos, y_pos)
+
+
+if __name__ == '__main__':
+    # Env 1: FrozenLake, the ice can be configured to be slippery or not
+    # 0 left, 1 down, 2 right, 3 up
+    env = gym.make("FrozenLake-v0", is_slippery=False)
+    env = FrozenLakeWapper(env)
+
+    # Env 2: CliffWalking, the cliff environment
+    # env = gym.make("CliffWalking-v0")  # 0 up, 1 right, 2 down, 3 left
+    # env = CliffWalkingWapper(env)
+
+    # Env 3: custom grid world with a configurable map; S = Start, F = Floor, H = Hole, G = Goal
+    # gridmap = [
+    #     'SFFF',
+    #     'FHFF',
+    #     'FFFF',
+    #     'HFGF' ]
+    # env = GridWorld(gridmap)
+
+    env.reset()
+    for step in range(10):
+        action = np.random.randint(0, 4)
+        obs, reward, done, info = env.step(action)
+        print('step {}: action {}, obs {}, reward {}, done {}, info {}'.format(\
+            step, action, obs, reward, done, info))
+        # env.render()  # render one frame
diff --git a/examples/tutorials/homework/lesson2/q_learning_frozenlake/train.py b/examples/tutorials/homework/lesson2/q_learning_frozenlake/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..2b34abe905fec3b8b884266850554d0d96932ed8
--- /dev/null
+++ b/examples/tutorials/homework/lesson2/q_learning_frozenlake/train.py
@@ -0,0 +1,82 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# -*- coding: utf-8 -*-
+
+import gym
+from gridworld import CliffWalkingWapper, FrozenLakeWapper
+from agent import QLearningAgent
+import time
+
+
+def run_episode(env, agent, render=False):
+    total_steps = 0  # number of steps taken in this episode
+    total_reward = 0
+
+    obs = env.reset()  # reset the environment and start a new episode
+
+    while True:
+        action = agent.sample(obs)  # choose an action with the algorithm
+        next_obs, reward, done, _ = env.step(action)  # one interaction with the environment
+        # train with the Q-learning update
+        agent.learn(obs, action, reward, next_obs, done)
+
+        obs = next_obs  # store the observation for the next step
+        total_reward += reward
+        total_steps += 1  # count steps
+        if render:
+            env.render()  # render a new frame
+        if done:
+            break
+    return total_reward, total_steps
+
+
+def test_episode(env, agent):
+    total_reward = 0
+    obs = env.reset()
+    while True:
+        action = agent.predict(obs)  # greedy
+        next_obs, reward, done, _ = env.step(action)
+        total_reward += reward
+        obs = next_obs
+        time.sleep(0.5)
+        env.render()
+        if done:
+            print('test reward = %.1f' % (total_reward))
+            break
+
+
+def main():
+    env = gym.make(
+        "FrozenLake-v0", is_slippery=False)  # 0 left, 1 down, 2 right, 3 up
+    env = FrozenLakeWapper(env)
+
+    agent = QLearningAgent(
+        obs_n=env.observation_space.n,
+        act_n=env.action_space.n,
+        learning_rate=0.1,
+        gamma=0.9,
+        e_greed=0.1)
+
+    for episode in range(500):
+        ep_reward, ep_steps = run_episode(env, agent)
+        print('Episode %s: steps = %s , reward = %.1f' % (episode, ep_steps,
+                                                          ep_reward))
+
+    # training finished; watch how the algorithm performs
+    test_episode(env, agent)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/tutorials/homework/lesson2/sarsa_frozenlake/agent.py b/examples/tutorials/homework/lesson2/sarsa_frozenlake/agent.py
new file mode 100644
index 0000000000000000000000000000000000000000..964230c88bef164dc8f22d5a3eb5e99f242097d3
--- /dev/null
+++ b/examples/tutorials/homework/lesson2/sarsa_frozenlake/agent.py
@@ -0,0 +1,74 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# -*- coding: utf-8 -*-
+
+import numpy as np
+
+
+class SarsaAgent(object):
+    def __init__(self,
+                 obs_n,
+                 act_n,
+                 learning_rate=0.01,
+                 gamma=0.9,
+                 e_greed=0.1):
+        self.act_n = act_n  # number of available actions
+        self.lr = learning_rate  # learning rate
+        self.gamma = gamma  # discount factor for rewards
+        self.epsilon = e_greed  # probability of taking a random action
+        self.Q = np.zeros((obs_n, act_n))
+
+    # Sample an action for the given observation, with exploration
+    def sample(self, obs):
+        if np.random.uniform(0, 1) < (1.0 - self.epsilon):  # act greedily according to the Q-table
+            action = self.predict(obs)
+        else:
+            action = np.random.choice(self.act_n)  # explore: pick a random action with probability epsilon
+        return action
+
+    # Predict the best action for the given observation (greedy)
+    def predict(self, obs):
+        Q_list = self.Q[obs, :]
+        maxQ = np.max(Q_list)
+        action_list = np.where(Q_list == maxQ)[0]  # maxQ may correspond to several actions
+        action = np.random.choice(action_list)
+        return action
+
+    # Learning method, i.e. how the Q-table is updated
+    def learn(self, obs, action, reward, next_obs, next_action, done):
+        """ on-policy
+            obs: observation before the interaction, s_t
+            action: action chosen in this interaction, a_t
+            reward: reward received for this action, r
+            next_obs: observation after the interaction, s_t+1
+            next_action: the action that will be taken for next_obs according to the current Q-table, a_t+1
+            done: whether the episode has ended
+        """
+        predict_Q = self.Q[obs, action]
+        if done:
+            target_Q = reward  # there is no next state
+        else:
+            target_Q = reward + self.gamma * self.Q[next_obs,
+                                                    next_action]  # Sarsa
+        self.Q[obs, action] += self.lr * (target_Q - predict_Q)  # update Q
+
+    def save(self):
+        npy_file = './q_table.npy'
+        np.save(npy_file, self.Q)
+        print(npy_file + ' saved.')
+
+    def restore(self, npy_file='./q_table.npy'):
+        self.Q = np.load(npy_file)
+        print(npy_file + ' loaded.')
diff --git a/examples/tutorials/homework/lesson2/sarsa_frozenlake/gridworld.py b/examples/tutorials/homework/lesson2/sarsa_frozenlake/gridworld.py
new file mode 100644
index 0000000000000000000000000000000000000000..ca8acb2da5476e96d3cb95a479b2dfdbd7ba0b48
--- /dev/null
+++ b/examples/tutorials/homework/lesson2/sarsa_frozenlake/gridworld.py
@@ -0,0 +1,195 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# -*- coding: utf-8 -*-
+
+import gym
+import turtle
+import numpy as np
+
+# turtle tutorial : https://docs.python.org/3.3/library/turtle.html
+
+
+def GridWorld(gridmap=None, is_slippery=False):
+    if gridmap is None:
+        gridmap = ['SFFF', 'FHFH', 'FFFH', 'HFFG']
+    env = gym.make("FrozenLake-v0", desc=gridmap, is_slippery=is_slippery)
+    env = FrozenLakeWapper(env)
+    return env
+
+
+class FrozenLakeWapper(gym.Wrapper):
+    def __init__(self, env):
+        gym.Wrapper.__init__(self, env)
+        self.max_y = env.desc.shape[0]
+        self.max_x = env.desc.shape[1]
+        self.t = None
+        self.unit = 50
+
+    def draw_box(self, x, y, fillcolor='', line_color='gray'):
+        self.t.up()
+        self.t.goto(x * self.unit, y * self.unit)
+        self.t.color(line_color)
+        self.t.fillcolor(fillcolor)
+        self.t.setheading(90)
+        self.t.down()
+        self.t.begin_fill()
+        for _ in range(4):
+            self.t.forward(self.unit)
+            self.t.right(90)
+        self.t.end_fill()
+
+    def move_player(self, x, y):
+        self.t.up()
+        self.t.setheading(90)
+        self.t.fillcolor('red')
+        self.t.goto((x + 0.5) * self.unit, (y + 0.5) * self.unit)
+
+    def render(self):
+        if self.t is None:
+            self.t = turtle.Turtle()
+            self.wn = turtle.Screen()
+            self.wn.setup(self.unit * self.max_x + 100,
+                          self.unit * self.max_y + 100)
+            self.wn.setworldcoordinates(0, 0, self.unit * self.max_x,
+                                        self.unit * self.max_y)
+            self.t.shape('circle')
+            self.t.width(2)
+            self.t.speed(0)
+            self.t.color('gray')
+            for i in range(self.desc.shape[0]):
+                for j in range(self.desc.shape[1]):
+                    x = j
+                    y = self.max_y - 1 - i
+                    if self.desc[i][j] == b'S':  # Start
+                        self.draw_box(x, y, 'white')
+                    elif self.desc[i][j] == b'F':  # Frozen ice
+                        self.draw_box(x, y, 'white')
+                    elif self.desc[i][j] == b'G':  # Goal
+                        self.draw_box(x, y, 'yellow')
+                    elif self.desc[i][j] == b'H':  # Hole
+                        self.draw_box(x, y, 'black')
+                    else:
+                        self.draw_box(x, y, 'white')
+            self.t.shape('turtle')
+
+        x_pos = self.s % self.max_x
+        y_pos = self.max_y - 1 - int(self.s / self.max_x)
+        self.move_player(x_pos, y_pos)
+
+
+class CliffWalkingWapper(gym.Wrapper):
+    def __init__(self, env):
+        gym.Wrapper.__init__(self, env)
+        self.t = None
+        self.unit = 50
+        self.max_x = 12
+        self.max_y = 4
+
+    def draw_x_line(self, y, x0, x1, color='gray'):
+        assert x1 > x0
+        self.t.color(color)
+        self.t.setheading(0)
+        self.t.up()
+        self.t.goto(x0, y)
+        self.t.down()
+        self.t.forward(x1 - x0)
+
+    def draw_y_line(self, x, y0, y1, color='gray'):
+        assert y1 > y0
+        self.t.color(color)
+        self.t.setheading(90)
+        self.t.up()
+        self.t.goto(x, y0)
+        self.t.down()
+        self.t.forward(y1 - y0)
+
+    def draw_box(self, x, y, fillcolor='', line_color='gray'):
+        self.t.up()
+        self.t.goto(x * self.unit, y * self.unit)
+        self.t.color(line_color)
+        self.t.fillcolor(fillcolor)
+        self.t.setheading(90)
+        self.t.down()
+        self.t.begin_fill()
+        for i in range(4):
+            self.t.forward(self.unit)
+            self.t.right(90)
+        self.t.end_fill()
+
+    def move_player(self, x, y):
+        self.t.up()
+        self.t.setheading(90)
+        self.t.fillcolor('red')
+        self.t.goto((x + 0.5) * self.unit, (y + 0.5) * self.unit)
+
+    def render(self):
+        if self.t is None:
+            self.t = turtle.Turtle()
+            self.wn = turtle.Screen()
+            self.wn.setup(self.unit * self.max_x + 100,
+                          self.unit * self.max_y + 100)
+            self.wn.setworldcoordinates(0, 0, self.unit * self.max_x,
+                                        self.unit * self.max_y)
+            self.t.shape('circle')
+            self.t.width(2)
+            self.t.speed(0)
+            self.t.color('gray')
+            for _ in range(2):
+                self.t.forward(self.max_x * self.unit)
+                self.t.left(90)
+                self.t.forward(self.max_y * self.unit)
+                self.t.left(90)
+            for i in range(1, self.max_y):
+                self.draw_x_line(
+                    y=i * self.unit, x0=0, x1=self.max_x * self.unit)
+            for i in range(1, self.max_x):
+                self.draw_y_line(
+                    x=i * self.unit, y0=0, y1=self.max_y * self.unit)
+
+            for i in range(1, self.max_x - 1):
+                self.draw_box(i, 0, 'black')
+            self.draw_box(self.max_x - 1, 0, 'yellow')
+            self.t.shape('turtle')
+
+        x_pos = self.s % self.max_x
+        y_pos = self.max_y - 1 - int(self.s / self.max_x)
+        self.move_player(x_pos, y_pos)
+
+
+if __name__ == '__main__':
+    # Env 1: FrozenLake, the ice can be configured to be slippery or not
+    # 0 left, 1 down, 2 right, 3 up
+    env = gym.make("FrozenLake-v0", is_slippery=False)
+    env = FrozenLakeWapper(env)
+
+    # Env 2: CliffWalking, the cliff environment
+    # env = gym.make("CliffWalking-v0")  # 0 up, 1 right, 2 down, 3 left
+    # env = CliffWalkingWapper(env)
+
+    # Env 3: custom grid world with a configurable map; S = Start, F = Floor, H = Hole, G = Goal
+    # gridmap = [
+    #     'SFFF',
+    #     'FHFF',
+    #     'FFFF',
+    #     'HFGF' ]
+    # env = GridWorld(gridmap)
+
+    env.reset()
+    for step in range(10):
+        action = np.random.randint(0, 4)
+        obs, reward, done, info = env.step(action)
+        print('step {}: action {}, obs {}, reward {}, done {}, info {}'.format(\
+            step, action, obs, reward, done, info))
+        # env.render()  # render one frame
diff --git a/examples/tutorials/homework/lesson2/sarsa_frozenlake/train.py b/examples/tutorials/homework/lesson2/sarsa_frozenlake/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..ad1e3d199b6fec2ce6fe041f17f7b14489a266a2
--- /dev/null
+++ b/examples/tutorials/homework/lesson2/sarsa_frozenlake/train.py
@@ -0,0 +1,84 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# -*- coding: utf-8 -*-
+
+import gym
+from gridworld import CliffWalkingWapper, FrozenLakeWapper
+from agent import SarsaAgent
+import time
+
+
+def run_episode(env, agent, render=False):
+    total_steps = 0  # number of steps taken in this episode
+    total_reward = 0
+
+    obs = env.reset()  # reset the environment and start a new episode
+    action = agent.sample(obs)  # choose an action with the algorithm
+
+    while True:
+        next_obs, reward, done, _ = env.step(action)  # one interaction with the environment
+        next_action = agent.sample(next_obs)  # choose the next action with the algorithm
+        # train with the Sarsa update
+        agent.learn(obs, action, reward, next_obs, next_action, done)
+
+        action = next_action
+        obs = next_obs  # store the observation for the next step
+        total_reward += reward
+        total_steps += 1  # count steps
+        if render:
+            env.render()  # render a new frame
+        if done:
+            break
+    return total_reward, total_steps
+
+
+def test_episode(env, agent):
+    total_reward = 0
+    obs = env.reset()
+    while True:
+        action = agent.predict(obs)  # greedy
+        next_obs, reward, done, _ = env.step(action)
+        total_reward += reward
+        obs = next_obs
+        time.sleep(0.5)
+        env.render()
+        if done:
+            print('test reward = %.1f' % (total_reward))
+            break
+
+
+def main():
+    env = gym.make(
+        "FrozenLake-v0", is_slippery=False)  # 0 left, 1 down, 2 right, 3 up
+    env = FrozenLakeWapper(env)
+
+    agent = SarsaAgent(
+        obs_n=env.observation_space.n,
+        act_n=env.action_space.n,
+        learning_rate=0.1,
+        gamma=0.9,
+        e_greed=0.1)
+
+    for episode in range(500):
+        ep_reward, ep_steps = run_episode(env, agent)
+        print('Episode %s: steps = %s , reward = %.1f' % (episode, ep_steps,
+                                                          ep_reward))
+
+    # training finished; watch how the algorithm performs
+    test_episode(env, agent)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/tutorials/homework/lesson3/dqn_mountaincar/agent.py b/examples/tutorials/homework/lesson3/dqn_mountaincar/agent.py
new file mode 100644
index 0000000000000000000000000000000000000000..e14a737f16b62256ee0eb0efcfe3290222209f51
--- /dev/null
+++ b/examples/tutorials/homework/lesson3/dqn_mountaincar/agent.py
@@ -0,0 +1,97 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#-*- coding: utf-8 -*-
+
+import numpy as np
+import paddle.fluid as fluid
+import parl
+from parl import layers
+
+
+class Agent(parl.Agent):
+    def __init__(self,
+                 algorithm,
+                 obs_dim,
+                 act_dim,
+                 e_greed=0.1,
+                 e_greed_decrement=0):
+        assert isinstance(obs_dim, int)
+        assert isinstance(act_dim, int)
+        self.obs_dim = obs_dim
+        self.act_dim = act_dim
+        super(Agent, self).__init__(algorithm)
+
+        self.global_step = 0
+        self.update_target_steps = 200  # copy the model parameters to target_model every 200 training steps
+
+        self.e_greed = e_greed  # probability of taking a random action (exploration)
+        self.e_greed_decrement = e_greed_decrement  # gradually reduce exploration as training converges
+
+    def build_program(self):
+        self.pred_program = fluid.Program()
+        self.learn_program = fluid.Program()
+
+        with fluid.program_guard(self.pred_program):  # build the program that predicts actions; define its inputs and outputs
+            obs = layers.data(
+                name='obs', shape=[self.obs_dim], dtype='float32')
+            self.value = self.alg.predict(obs)
+
+        with fluid.program_guard(self.learn_program):  # build the program that updates the Q network; define its inputs and outputs
+            obs = layers.data(
+                name='obs', shape=[self.obs_dim], dtype='float32')
+            action = layers.data(name='act', shape=[1], dtype='int32')
+            reward = layers.data(name='reward', shape=[], dtype='float32')
+            next_obs = layers.data(
+                name='next_obs', shape=[self.obs_dim], dtype='float32')
+            terminal = layers.data(name='terminal', shape=[], dtype='bool')
+            self.cost = self.alg.learn(obs, action, reward, next_obs, terminal)
+
+    def sample(self, obs):
+        sample = np.random.rand()  # random float in [0, 1)
+        if sample < self.e_greed:
+            act = np.random.randint(self.act_dim)  # explore: every action has a chance to be chosen
+        else:
+            act = self.predict(obs)  # exploit: choose the best action
+        self.e_greed = max(
+            0.01, self.e_greed - self.e_greed_decrement)  # keep reducing exploration as training converges
+        return act
+
+    def predict(self, obs):  # choose the best action
+        obs = np.expand_dims(obs, axis=0)
+        pred_Q = self.fluid_executor.run(
+            self.pred_program,
+            feed={'obs': obs.astype('float32')},
+            fetch_list=[self.value])[0]
+        pred_Q = np.squeeze(pred_Q, axis=0)
+        act = np.argmax(pred_Q)  # pick the index (action) with the largest Q value
+        return act
+
+    def learn(self, obs, act, reward, next_obs, terminal):
+        # sync model and target_model parameters every 200 training steps
+        if self.global_step % self.update_target_steps == 0:
+            self.alg.sync_target()
+        self.global_step += 1
+
+        act = np.expand_dims(act, -1)
+        feed = {
+            'obs': obs.astype('float32'),
+            'act': act.astype('int32'),
+            'reward': reward,
+            'next_obs': next_obs.astype('float32'),
+            'terminal': terminal
+        }
+        cost = self.fluid_executor.run(
+            self.learn_program, feed=feed, fetch_list=[self.cost])[0]  # one training step of the network
+        return cost
diff --git a/examples/tutorials/homework/lesson3/dqn_mountaincar/model.py b/examples/tutorials/homework/lesson3/dqn_mountaincar/model.py
new file mode 100644
index 0000000000000000000000000000000000000000..17c7a8d93a532884187abf0a8cb44d3823018e56
--- /dev/null
+++ b/examples/tutorials/homework/lesson3/dqn_mountaincar/model.py
@@ -0,0 +1,34 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#-*- coding: utf-8 -*-
+
+import parl
+from parl import layers  # wraps the paddle.fluid.layers API
+
+
+class Model(parl.Model):
+    def __init__(self, act_dim):
+        hid1_size = 128
+        hid2_size = 128
+        # 3-layer fully connected network
+        self.fc1 = layers.fc(size=hid1_size, act='relu')
+        self.fc2 = layers.fc(size=hid2_size, act='relu')
+        self.fc3 = layers.fc(size=act_dim, act=None)
+
+    def value(self, obs):
+        h1 = self.fc1(obs)
+        h2 = self.fc2(h1)
+        Q = self.fc3(h2)
+        return Q
diff --git a/examples/tutorials/homework/lesson3/dqn_mountaincar/replay_memory.py b/examples/tutorials/homework/lesson3/dqn_mountaincar/replay_memory.py
new file mode 100644
index 0000000000000000000000000000000000000000..f7c83688184614a23429d7f64461877f283de9f5
--- /dev/null
+++ b/examples/tutorials/homework/lesson3/dqn_mountaincar/replay_memory.py
@@ -0,0 +1,46 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Modified from https://github.com/seungeunrho/minimalRL/blob/master/dqn.py
+
+import random
+import collections
+import numpy as np
+
+
+class ReplayMemory(object):
+    def __init__(self, max_size):
+        self.buffer = collections.deque(maxlen=max_size)
+
+    def append(self, exp):
+        self.buffer.append(exp)
+
+    def sample(self, batch_size):
+        mini_batch = random.sample(self.buffer, batch_size)
+        obs_batch, action_batch, reward_batch, next_obs_batch, done_batch = [], [], [], [], []
+
+        for experience in mini_batch:
+            s, a, r, s_p, done = experience
+            obs_batch.append(s)
+            action_batch.append(a)
+            reward_batch.append(r)
+            next_obs_batch.append(s_p)
+            done_batch.append(done)
+
+        return np.array(obs_batch).astype('float32'), \
+            np.array(action_batch).astype('float32'), np.array(reward_batch).astype('float32'),\
+            np.array(next_obs_batch).astype('float32'), np.array(done_batch).astype('float32')
+
+    def __len__(self):
+        return len(self.buffer)
diff --git a/examples/tutorials/homework/lesson3/dqn_mountaincar/train.py b/examples/tutorials/homework/lesson3/dqn_mountaincar/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..b198833e3cf9b8565d2316870775555866b03d5a
--- /dev/null
+++ b/examples/tutorials/homework/lesson3/dqn_mountaincar/train.py
@@ -0,0 +1,127 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#-*- coding: utf-8 -*-
+
+import os
+import gym
+import numpy as np
+import parl
+from parl.utils import logger  # logging utility
+
+from model import Model
+from agent import Agent
+from parl.algorithms import DQN
+
+from replay_memory import ReplayMemory
+
+LEARN_FREQ = 5  # learning frequency: no need to learn at every step; accumulate some new experience first for efficiency
+MEMORY_SIZE = 20000  # replay memory size; larger means more memory usage
+MEMORY_WARMUP_SIZE = 200  # warm up the replay memory with some experience before sampling batches for the agent to learn from
+BATCH_SIZE = 32  # number of samples per learning step, drawn randomly from the replay memory
+LEARNING_RATE = 0.001  # learning rate
+GAMMA = 0.99  # reward discount factor, usually between 0.9 and 0.999
+
+
+# run one training episode
+def run_episode(env, agent, rpm):
+    total_reward = 0
+    obs = env.reset()
+    step = 0
+    while True:
+        step += 1
+        action = agent.sample(obs)  # sample an action; every action has a chance to be tried
+        next_obs, reward, done, _ = env.step(action)
+        rpm.append((obs, action, reward, next_obs, done))
+
+        # train model
+        if (len(rpm) > MEMORY_WARMUP_SIZE) and (step % LEARN_FREQ == 0):
+            (batch_obs, batch_action, batch_reward, batch_next_obs,
+             batch_done) = rpm.sample(BATCH_SIZE)
+            train_loss = agent.learn(batch_obs, batch_action, batch_reward,
+                                     batch_next_obs,
+                                     batch_done)  # s,a,r,s',done
+
+        total_reward += reward
+        obs = next_obs
+        if done:
+            break
+    return total_reward
+
+
+# evaluate the agent: run 5 episodes and average the total reward
+def evaluate(env, agent, render=False):
+    eval_reward = []
+    for i in range(5):
+        obs = env.reset()
+        episode_reward = 0
+        while True:
+            action = agent.predict(obs)  # predict greedily; always pick the best action
+            obs, reward, done, _ = env.step(action)
+            episode_reward += reward
+            if render:
+                env.render()
+            if done:
+                break
+        eval_reward.append(episode_reward)
+    return np.mean(eval_reward)
+
+
+def main():
+    env = gym.make('MountainCar-v0')  # MountainCar-v0: expected reward > -120
+    action_dim = env.action_space.n  # MountainCar-v0: 3
+    obs_shape = env.observation_space.shape  # MountainCar-v0: (2,)
+
+    rpm = ReplayMemory(MEMORY_SIZE)  # experience replay pool for DQN
+
+    # build the agent with the PARL framework
+    model = Model(act_dim=action_dim)
+    algorithm = DQN(model, act_dim=action_dim, gamma=GAMMA, lr=LEARNING_RATE)
+    agent = Agent(
+        algorithm,
+        obs_dim=obs_shape[0],
+        act_dim=action_dim,
+        e_greed=0.1,  # probability of taking a random action (exploration)
+        e_greed_decrement=1e-6)  # gradually reduce exploration as training converges
+
+    # load a saved model
+    # save_path = './dqn_model.ckpt'
+    # agent.restore(save_path)
+
+    # pre-fill the replay memory so that early training has enough diverse samples
+    while len(rpm) < MEMORY_WARMUP_SIZE:
+        run_episode(env, agent, rpm)
+
+    max_episode = 2000
+
+    # start train
+    episode = 0
+    while episode < max_episode:  # train for max_episode episodes; test episodes are not counted
+        # train part
+        for i in range(0, 50):
+            total_reward = run_episode(env, agent, rpm)
+            episode += 1
+
+        # test part
+        eval_reward = evaluate(env, agent, render=False)  # set render=True to visualize
+        logger.info('episode:{} e_greed:{} test_reward:{}'.format(
+            episode, agent.e_greed, eval_reward))
+
+    # training finished; save the model
+    save_path = './dqn_model.ckpt'
+    agent.save(save_path)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/examples/tutorials/homework/lesson4/policy_gradient_pong/agent.py b/examples/tutorials/homework/lesson4/policy_gradient_pong/agent.py
new file mode 100644
index 0000000000000000000000000000000000000000..fad9528a1d1f4035aece21fb0aec753cf6519ae9
--- /dev/null
+++ b/examples/tutorials/homework/lesson4/policy_gradient_pong/agent.py
@@ -0,0 +1,75 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#-*- coding: utf-8 -*-
+
+import numpy as np
+import paddle.fluid as fluid
+import parl
+from parl import layers
+
+
+class Agent(parl.Agent):
+    def __init__(self, algorithm, obs_dim, act_dim):
+        self.obs_dim = obs_dim
+        self.act_dim = act_dim
+        super(Agent, self).__init__(algorithm)
+
+    def build_program(self):
+        self.pred_program = fluid.Program()
+        self.learn_program = fluid.Program()
+
+        with fluid.program_guard(self.pred_program):  # build the program that predicts actions; define its inputs and outputs
+            obs = layers.data(
+                name='obs', shape=[self.obs_dim], dtype='float32')
+            self.act_prob = self.alg.predict(obs)
+
+        with fluid.program_guard(
+                self.learn_program):  # build the program that updates the policy network; define its inputs and outputs
+            obs = layers.data(
+                name='obs', shape=[self.obs_dim], dtype='float32')
+            act = layers.data(name='act', shape=[1], dtype='int64')
+            reward = layers.data(name='reward', shape=[], dtype='float32')
+            self.cost = self.alg.learn(obs, act, reward)
+
+    def sample(self, obs):
+        obs = np.expand_dims(obs, axis=0)  # add a batch dimension
+        act_prob = self.fluid_executor.run(
+            self.pred_program,
+            feed={'obs': obs.astype('float32')},
+            fetch_list=[self.act_prob])[0]
+        act_prob = np.squeeze(act_prob, axis=0)  # remove the batch dimension
+        act = np.random.choice(range(self.act_dim), p=act_prob)  # sample an action according to the action probabilities
+        return act
+
+    def predict(self, obs):
+        obs = np.expand_dims(obs, axis=0)
+        act_prob = self.fluid_executor.run(
+            self.pred_program,
+            feed={'obs': obs.astype('float32')},
+            fetch_list=[self.act_prob])[0]
+        act_prob = np.squeeze(act_prob, axis=0)
+        act = np.argmax(act_prob)  # choose the action with the highest probability
+        return act
+
+    def learn(self, obs, act, reward):
+        act = np.expand_dims(act, axis=-1)
+        feed = {
+            'obs': obs.astype('float32'),
+            'act': act.astype('int64'),
+            'reward': reward.astype('float32')
+        }
+        cost = self.fluid_executor.run(
+            self.learn_program, feed=feed, fetch_list=[self.cost])[0]
+        return cost
diff --git a/examples/tutorials/homework/lesson4/policy_gradient_pong/model.py b/examples/tutorials/homework/lesson4/policy_gradient_pong/model.py
new file mode 100644
index 0000000000000000000000000000000000000000..f8b14abcf209f3fac1baf066bda39995c6752ed5
--- /dev/null
+++ b/examples/tutorials/homework/lesson4/policy_gradient_pong/model.py
@@ -0,0 +1,35 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#-*- coding: utf-8 -*-
+
+import parl
+from parl import layers
+
+
+class Model(parl.Model):
+    def __init__(self, act_dim):
+        act_dim = act_dim
+        hid1_size = 256
+        hid2_size = 64
+
+        self.fc1 = layers.fc(size=hid1_size, act='relu')
+        self.fc2 = layers.fc(size=hid2_size, act='relu')
+        self.fc3 = layers.fc(size=act_dim, act='softmax')
+
+    def forward(self, obs):
+        h1 = self.fc1(obs)
+        h2 = self.fc2(h1)
+        out = self.fc3(h2)
+        return out
diff --git a/examples/tutorials/homework/lesson4/policy_gradient_pong/train.py b/examples/tutorials/homework/lesson4/policy_gradient_pong/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..b78d0c2228ebf65c74204eff42c3af9b52a4985f
--- /dev/null
+++ b/examples/tutorials/homework/lesson4/policy_gradient_pong/train.py
@@ -0,0 +1,125 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#-*- coding: utf-8 -*-
+
+import os
+import gym
+import numpy as np
+import parl
+
+from agent import Agent
+from model import Model
+from parl.algorithms import PolicyGradient
+
+from parl.utils import logger
+
+LEARNING_RATE = 1e-3
+
+
+def run_episode(env, agent):
+    obs_list, action_list, reward_list = [], [], []
+    obs = env.reset()
+    while True:
+        obs = preprocess(obs)  # from shape (210, 160, 3) to (6400,)
+        obs_list.append(obs)
+        action = agent.sample(obs)
+        action_list.append(action)
+
+        obs, reward, done, info = env.step(action)
+        reward_list.append(reward)
+
+        if done:
+            break
+    return obs_list, action_list, reward_list
+
+
+# evaluate the agent: run 5 episodes and average the total reward
+def evaluate(env, agent, render=False):
+    eval_reward = []
+    for i in range(5):
+        obs = env.reset()
+        episode_reward = 0
+        while True:
+            obs = preprocess(obs)  # from shape (210, 160, 3) to (6400,)
+            action = agent.predict(obs)
+            obs, reward, isOver, _ = env.step(action)
+            episode_reward += reward
+            if render:
+                env.render()
+            if isOver:
+                break
+        eval_reward.append(episode_reward)
+    return np.mean(eval_reward)
+
+
+def preprocess(image):
+    """ Preprocess a 210x160x3 uint8 frame into a 6400-dim (80x80) 1-D float vector """
+    image = image[35:195]  # crop
+    image = image[::2, ::2, 0]  # downsample by a factor of 2
+    image[image == 144] = 0  # erase background (background type 1)
+    image[image == 109] = 0  # erase background (background type 2)
+    image[image != 0] = 1  # binarize: everything except the background becomes 1
+    return image.astype(np.float).ravel()
+
+
+def calc_reward_to_go(reward_list, gamma=0.99):
+    """calculate discounted reward"""
+    reward_arr = np.array(reward_list)
+    for i in range(len(reward_arr) - 2, -1, -1):
+        # G_t = r_t + γ·r_t+1 + ... = r_t + γ·G_t+1
+        reward_arr[i] += gamma * reward_arr[i + 1]
+    # normalize episode rewards
+    reward_arr -= np.mean(reward_arr)
+    reward_arr /= np.std(reward_arr)
+    return reward_arr
+
+
+def main():
+    env = gym.make('Pong-v0')
+    obs_dim = 80 * 80
+    act_dim = env.action_space.n
+    logger.info('obs_dim {}, act_dim {}'.format(obs_dim, act_dim))
+
+    # build the agent with the PARL framework
+    model = Model(act_dim=act_dim)
+    alg = PolicyGradient(model, lr=LEARNING_RATE)
+    agent = Agent(alg, obs_dim=obs_dim, act_dim=act_dim)
+
+    # load a saved model
+    # if os.path.exists('./model.ckpt'):
+    #     agent.restore('./model.ckpt')
+
+    for i in range(1000):
+        obs_list, action_list, reward_list = run_episode(env, agent)
+        if i % 10 == 0:
+            logger.info("Train Episode {}, Reward Sum {}.".format(
+                i, sum(reward_list)))
+
+        batch_obs = np.array(obs_list)
+        batch_action = np.array(action_list)
+        batch_reward = calc_reward_to_go(reward_list)
+
+        agent.learn(batch_obs, batch_action, batch_reward)
+        if (i + 1) % 100 == 0:
+            total_reward = evaluate(env, agent, render=False)
+            logger.info('Episode {}, Test reward: {}'.format(
+                i + 1, total_reward))
+
+    # save the parameters to ./model.ckpt
+    agent.save('./model.ckpt')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/examples/tutorials/homework/lesson5/ddpg_quadrotor/quadrotor_agent.py b/examples/tutorials/homework/lesson5/ddpg_quadrotor/quadrotor_agent.py
new file mode 100644
index 0000000000000000000000000000000000000000..726f2f18eb3ebb4afbbe335d5bcb5f69e4cc798d
--- /dev/null
+++ b/examples/tutorials/homework/lesson5/ddpg_quadrotor/quadrotor_agent.py
@@ -0,0 +1,73 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# -*- coding: utf-8 -*-
+
+import numpy as np
+import parl
+from parl import layers
+from paddle import fluid
+
+
+class QuadrotorAgent(parl.Agent):
+    def __init__(self, algorithm, obs_dim, act_dim=4):
+        assert isinstance(obs_dim, int)
+        assert isinstance(act_dim, int)
+        self.obs_dim = obs_dim
+        self.act_dim = act_dim
+        super(QuadrotorAgent, self).__init__(algorithm)
+
+        # Attention: at the beginning, fully sync the target model.
+        self.alg.sync_target(decay=0)
+
+    def build_program(self):
+        self.pred_program = fluid.Program()
+        self.learn_program = fluid.Program()
+
+        with fluid.program_guard(self.pred_program):
+            obs = layers.data(
+                name='obs', shape=[self.obs_dim], dtype='float32')
+            self.pred_act = self.alg.predict(obs)
+
+        with fluid.program_guard(self.learn_program):
+            obs = layers.data(
+                name='obs', shape=[self.obs_dim], dtype='float32')
+            act = layers.data(
+                name='act', shape=[self.act_dim], dtype='float32')
+            reward = layers.data(name='reward', shape=[], dtype='float32')
+            next_obs = layers.data(
+                name='next_obs', shape=[self.obs_dim], dtype='float32')
+            terminal = layers.data(name='terminal', shape=[], dtype='bool')
+            _, self.critic_cost = self.alg.learn(obs, act, reward, next_obs,
+                                                 terminal)
+
+    def predict(self, obs):
+        obs = np.expand_dims(obs, axis=0)
+        act = self.fluid_executor.run(
+            self.pred_program, feed={'obs': obs},
+            fetch_list=[self.pred_act])[0]
+        return act
+
+    def learn(self, obs, act, reward, next_obs, terminal):
+        feed = {
+            'obs': obs,
+            'act': act,
+            'reward': reward,
+            'next_obs': next_obs,
+            'terminal': terminal
+        }
+        critic_cost = self.fluid_executor.run(
+            self.learn_program, feed=feed, fetch_list=[self.critic_cost])[0]
+        self.alg.sync_target()
+        return critic_cost
diff --git a/examples/tutorials/homework/lesson5/ddpg_quadrotor/quadrotor_model.py b/examples/tutorials/homework/lesson5/ddpg_quadrotor/quadrotor_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..99f0ce3f47faabc68dd92b7f540bb295b7a03c06
--- /dev/null
+++ b/examples/tutorials/homework/lesson5/ddpg_quadrotor/quadrotor_model.py
@@ -0,0 +1,63 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# -*- coding: utf-8 -*-
+
+import paddle.fluid as fluid
+import parl
+from parl import layers
+
+
+class ActorModel(parl.Model):
+    def __init__(self, act_dim):
+        hidden_dim_1, hidden_dim_2 = 64, 64
+        self.fc1 = layers.fc(size=hidden_dim_1, act='tanh')
+        self.fc2 = layers.fc(size=hidden_dim_2, act='tanh')
+        self.fc3 = layers.fc(size=act_dim, act='tanh')
+
+    def policy(self, obs):
+        x = self.fc1(obs)
+        x = self.fc2(x)
+        return self.fc3(x)
+
+
+class CriticModel(parl.Model):
+    def __init__(self):
+        hidden_dim_1, hidden_dim_2 = 64, 64
+        self.fc1 = layers.fc(size=hidden_dim_1, act='tanh')
+        self.fc2 = layers.fc(size=hidden_dim_2, act='tanh')
+        self.fc3 = layers.fc(size=1, act=None)
+
+    def value(self, obs, act):
+        x = self.fc1(obs)
+        concat = layers.concat([x, act], axis=1)
+        x = self.fc2(concat)
+        Q = self.fc3(x)
+        Q = layers.squeeze(Q, axes=[1])
+        return Q
+
+
+class QuadrotorModel(parl.Model):
+    def __init__(self, act_dim):
+        self.actor_model = ActorModel(act_dim)
+        self.critic_model = CriticModel()
+
+    def policy(self, obs):
+        return self.actor_model.policy(obs)
+
+    def value(self, obs, act):
+        return self.critic_model.value(obs, act)
+
+    def get_actor_params(self):
+        return self.actor_model.parameters()
diff --git a/examples/tutorials/homework/lesson5/ddpg_quadrotor/train.py b/examples/tutorials/homework/lesson5/ddpg_quadrotor/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..c33fa331875f4f0b4ce63acefe006b975d82bd51
--- /dev/null
+++ b/examples/tutorials/homework/lesson5/ddpg_quadrotor/train.py
@@ -0,0 +1,137 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# -*- coding: utf-8 -*-
+
+import os
+import numpy as np
+
+import parl
+from parl import layers
+from paddle import fluid
+from parl.utils import logger
+from parl.utils import action_mapping  # maps the network output into the actual action range
+from parl.utils import ReplayMemory  # experience replay
+
+from rlschool import make_env  # create the quadrotor environment with RLSchool
+from quadrotor_model import QuadrotorModel
+from quadrotor_agent import QuadrotorAgent
+from parl.algorithms import DDPG
+
+GAMMA = 0.99  # reward discount factor, usually between 0.9 and 0.999
+TAU = 0.001  # soft-update coefficient for syncing target_model with model
+ACTOR_LR = 0.0002  # learning rate of the actor network
+CRITIC_LR = 0.001  # learning rate of the critic network
+MEMORY_SIZE = 1e6  # replay memory size; larger means more memory usage
+MEMORY_WARMUP_SIZE = 1e4  # warm up the replay memory with some experience before sampling batches for the agent to learn from
+REWARD_SCALE = 0.01  # reward scaling factor
+BATCH_SIZE = 256  # number of samples per learning step, drawn randomly from the replay memory
+TRAIN_TOTAL_STEPS = 1e6  # total number of training steps
+TEST_EVERY_STEPS = 1e4  # evaluate every N steps; each evaluation averages the reward over 5 episodes
+
+
+def run_episode(env, agent, rpm):
+    obs = env.reset()
+    total_reward, steps = 0, 0
+    while True:
+        steps += 1
+        batch_obs = np.expand_dims(obs, axis=0)
+        action = agent.predict(batch_obs.astype('float32'))
+        action = np.squeeze(action)
+
+        # Add exploration noise, and clip to [-1.0, 1.0]
+        action = np.clip(np.random.normal(action, 1.0), -1.0, 1.0)
+        action = action_mapping(action, env.action_space.low[0],
+                                env.action_space.high[0])
+
+        next_obs, reward, done, info = env.step(action)
+        rpm.append(obs, action, REWARD_SCALE * reward, next_obs, done)
+
+        if rpm.size() > MEMORY_WARMUP_SIZE:
+            batch_obs, batch_action, batch_reward, batch_next_obs, \
+                batch_terminal = rpm.sample_batch(BATCH_SIZE)
+            critic_cost = agent.learn(batch_obs, batch_action, batch_reward,
+                                      batch_next_obs, batch_terminal)
+
+        obs = next_obs
+        total_reward += reward
+
+        if done:
+            break
+    return total_reward, steps
+
+
+# evaluate the agent: run 5 episodes and average the total reward
+def evaluate(env, agent, render=False):
+    eval_reward = []
+    for i in range(5):
+        obs = env.reset()
+        total_reward, steps = 0, 0
+        while True:
+            batch_obs = np.expand_dims(obs, axis=0)
+            action = agent.predict(batch_obs.astype('float32'))
+            action = np.squeeze(action)
+            action = np.clip(action, -1.0, 1.0)  ## special
+            action = action_mapping(action, env.action_space.low[0],
+                                    env.action_space.high[0])
+            # action = np.clip(action, -1.0, 1.0)  ## special
+
+            next_obs, reward, done, info = env.step(action)
+
+            obs = next_obs
+            total_reward += reward
+            steps += 1
+
+            if render:
+                env.render()
+
+            if done:
+                break
+        eval_reward.append(total_reward)
+    return np.mean(eval_reward)
+
+
+# create the quadrotor environment
+env = make_env("Quadrotor", task="hovering_control")
+env.reset()
+obs_dim = env.observation_space.shape[0]
+act_dim = env.action_space.shape[0]
+
+# build the agent with PARL: QuadrotorModel, DDPG and QuadrotorAgent nested together
+model = QuadrotorModel(act_dim)
+algorithm = DDPG(
+    model, gamma=GAMMA, tau=TAU, actor_lr=ACTOR_LR, critic_lr=CRITIC_LR)
+agent = QuadrotorAgent(algorithm, obs_dim, act_dim)
+
+# PARL also ships a built-in ReplayMemory for DDPG; it can be imported directly from parl.utils
+rpm = ReplayMemory(int(MEMORY_SIZE), obs_dim, act_dim)
+
+test_flag = 0
+total_steps = 0
+while total_steps < TRAIN_TOTAL_STEPS:
+    train_reward, steps = run_episode(env, agent, rpm)
+    total_steps += steps
+    #logger.info('Steps: {} Reward: {}'.format(total_steps, train_reward))
+
+    if total_steps // TEST_EVERY_STEPS >= test_flag:
+        while total_steps // TEST_EVERY_STEPS >= test_flag:
+            test_flag += 1
+
+        evaluate_reward = evaluate(env, agent)
+        logger.info('Steps {}, Test reward: {}'.format(total_steps,
+                                                       evaluate_reward))
+
+        # save the model
+        ckpt = 'model_dir/steps_{}.ckpt'.format(total_steps)
+        agent.save(ckpt)
diff --git a/examples/tutorials/lesson4/policy_gradient/train.py b/examples/tutorials/lesson4/policy_gradient/train.py
index 306c22526f76a2ecfc1793dcca083856dc51c45b..9af79095a87cae543a33d0b144b1528960ff34c4 100644
--- a/examples/tutorials/lesson4/policy_gradient/train.py
+++ b/examples/tutorials/lesson4/policy_gradient/train.py
@@ -65,7 +65,7 @@ def evaluate(env, agent, render=False):
 
 
 def calc_reward_to_go(reward_list, gamma=1.0):
     for i in range(len(reward_list) - 2, -1, -1):
-        # G_t = r_t + γ·r_t+1 + ... = r_t + γ·G_t+1
+        # G_i = r_i + γ·G_i+1
         reward_list[i] += gamma * reward_list[i + 1]  # Gt
     return np.array(reward_list)