Commit cdd4622a authored by Hongsheng Zeng, committed by Bo Zhou

Add QuickStart example (#35)

* add QuickStart example, refine DQN example

* add examples link

* refine the naming, and add quick start training result
Parent 4a4366a5
......@@ -73,8 +73,8 @@ pip install --upgrade git+https://github.com/PaddlePaddle/PARL.git
```
# Examples
- DQN
- [QuickStart](examples/QuickStart/)
- [DQN](examples/DQN/)
- DDPG
- PPO
- Winning Solution for NIPS2018: AI for Prosthetics Challenge
- [Winning Solution for NIPS2018: AI for Prosthetics Challenge](examples/NeurIPS2018-AI-for-Prosthetics-Challenge/)
......@@ -21,7 +21,7 @@ Please see [here](https://gym.openai.com/envs/#atari) to know more about Atari g
### Start Training:
```
# To train a model for Pong game with CUDA
python train.py --rom ./rom_files/pong.bin --use_cuda
# To train a model for Pong game
python train.py --rom ./rom_files/pong.bin
```
> To train on other games, you can install additional rom files from [here](https://github.com/openai/atari-py/tree/master/atari_py/atari_roms) and point `--rom` at the new file, as in the example below.
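For example, assuming you have downloaded `breakout.bin` into `rom_files/` (the file name and path here are only illustrative), training another game only changes the `--rom` argument:
```
python train.py --rom ./rom_files/breakout.bin
```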
......@@ -16,7 +16,6 @@ import numpy as np
import paddle.fluid as fluid
import parl.layers as layers
from parl.framework.agent_base import Agent
from parl.utils import logger
IMAGE_SIZE = (84, 84)
CONTEXT_LEN = 4
......@@ -34,6 +33,7 @@ class AtariAgent(Agent):
def build_program(self):
self.pred_program = fluid.Program()
self.train_program = fluid.Program()
with fluid.program_guard(self.pred_program):
obs = layers.data(
name='obs',
......
......@@ -15,7 +15,6 @@
import paddle.fluid as fluid
import parl.layers as layers
from parl.framework.model_base import Model
from parl.utils import logger
class AtariModel(Model):
......
......@@ -25,7 +25,7 @@ from atari_wrapper import FrameStack, MapState, FireResetEnv, LimitLength
from collections import deque
from datetime import datetime
from expreplay import ReplayMemory, Experience
from parl.algorithms import DQNAlgorithm
from parl.algorithms import DQN
from parl.utils import logger
from tqdm import tqdm
......@@ -36,7 +36,7 @@ CONTEXT_LEN = 4
ACTION_REPEAT = 4 # aka FRAME_SKIP
UPDATE_FREQ = 4
GAMMA = 0.99
LEARNING_RATE = 1e-3
LEARNING_RATE = 1e-3 * 0.5
def run_train_episode(agent, env, exp):
......@@ -116,7 +116,7 @@ def train_agent():
'gamma': GAMMA
}
model = AtariModel(IMAGE_SIZE[0], IMAGE_SIZE[1], action_dim)
algorithm = DQNAlgorithm(model, hyperparas)
algorithm = DQN(model, hyperparas)
agent = AtariAgent(algorithm, action_dim)
with tqdm(total=MEMORY_WARMUP_SIZE) as pbar:
......@@ -151,8 +151,6 @@ def train_agent():
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--rom', help='atari rom', required=True)
parser.add_argument(
'--use_cuda', action='store_true', help='if set, use cuda')
parser.add_argument(
'--batch_size', type=int, default=64, help='batch size for training')
parser.add_argument(
......
## Quick Start Example
Based on PARL, train an agent to play the CartPole game with the policy gradient algorithm in a few minutes.
## How to use
### Dependencies:
+ python2.7 or python3.5+
+ [PARL](https://github.com/PaddlePaddle/PARL)
+ [paddlepaddle>=1.0.0](https://github.com/PaddlePaddle/Paddle)
+ gym
### Start Training:
```
# Install dependencies
pip install paddlepaddle
# Or use CUDA: pip install paddlepaddle-gpu
pip install gym
git clone https://github.com/PaddlePaddle/PARL.git
cd PARL
pip install .
# Train model
cd examples/QuickStart/
python train.py
# Or visualize when evaluating: python train.py --eval_vis
```

### Result
After training, you will see the agent reach the maximum score (200 points).
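For the curious, before each policy gradient update the training script converts an episode's rewards into discounted, normalized returns. Below is a minimal stand-alone NumPy sketch of that computation (`GAMMA = 0.99` as in `train.py`; the function name is illustrative):
```
import numpy as np

GAMMA = 0.99  # discount factor, same value as in train.py

def discounted_normalized_returns(rewards):
    # Accumulate discounted rewards from the end of the episode backwards,
    # then normalize to zero mean and unit standard deviation.
    returns = np.zeros(len(rewards), dtype='float32')
    running = 0.0
    for i in reversed(range(len(rewards))):
        running = GAMMA * running + rewards[i]
        returns[i] = running
    return (returns - returns.mean()) / returns.std()

# Example: an episode with five steps of reward 1.0
print(discounted_normalized_returns([1.0, 1.0, 1.0, 1.0, 1.0]))
```
This mirrors `calc_discount_norm_reward` in the training script; normalizing the returns keeps the gradient scale stable across episodes.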
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import paddle.fluid as fluid
import parl.layers as layers
from parl.framework.agent_base import Agent
class CartpoleAgent(Agent):
def __init__(self, algorithm, obs_dim, act_dim):
self.obs_dim = obs_dim
self.act_dim = act_dim
super(CartpoleAgent, self).__init__(algorithm)
def build_program(self):
self.pred_program = fluid.Program()
self.train_program = fluid.Program()
with fluid.program_guard(self.pred_program):
obs = layers.data(
name='obs', shape=[self.obs_dim], dtype='float32')
self.act_prob = self.alg.define_predict(obs)
with fluid.program_guard(self.train_program):
obs = layers.data(
name='obs', shape=[self.obs_dim], dtype='float32')
act = layers.data(name='act', shape=[1], dtype='int64')
reward = layers.data(name='reward', shape=[], dtype='float32')
self.cost = self.alg.define_learn(obs, act, reward)
def sample(self, obs):
obs = np.expand_dims(obs, axis=0)
act_prob = self.fluid_executor.run(
self.pred_program,
feed={'obs': obs.astype('float32')},
fetch_list=[self.act_prob])[0]
act_prob = np.squeeze(act_prob, axis=0)
act = np.random.choice(range(self.act_dim), p=act_prob)
return act
def predict(self, obs):
obs = np.expand_dims(obs, axis=0)
act_prob = self.fluid_executor.run(
self.pred_program,
feed={'obs': obs.astype('float32')},
fetch_list=[self.act_prob])[0]
act_prob = np.squeeze(act_prob, axis=0)
act = np.argmax(act_prob)
return act
def learn(self, obs, act, reward):
act = np.expand_dims(act, axis=-1)
feed = {
'obs': obs.astype('float32'),
'act': act.astype('int64'),
'reward': reward.astype('float32')
}
cost = self.fluid_executor.run(
self.train_program, feed=feed, fetch_list=[self.cost])[0]
return cost
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid as fluid
import parl.layers as layers
from parl.framework.model_base import Model
class CartpoleModel(Model):
def __init__(self, act_dim):
act_dim = act_dim
hid1_size = act_dim * 10
self.fc1 = layers.fc(size=hid1_size, act='tanh')
self.fc2 = layers.fc(size=act_dim, act='softmax')
def policy(self, obs):
out = self.fc1(obs)
out = self.fc2(out)
return out
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import gym
import numpy as np
from cartpole_agent import CartpoleAgent
from cartpole_model import CartpoleModel
from parl.algorithms import PolicyGradient
from parl.utils import logger
OBS_DIM = 4
ACT_DIM = 2
GAMMA = 0.99
LEARNING_RATE = 1e-3
def run_train_episode(env, agent):
obs_list, action_list, reward_list = [], [], []
obs = env.reset()
while True:
obs_list.append(obs)
action = agent.sample(obs)
action_list.append(action)
obs, reward, done, info = env.step(action)
reward_list.append(reward)
if done:
break
return obs_list, action_list, reward_list
def run_evaluate_episode(env, agent):
obs = env.reset()
all_reward = 0
while True:
if args.eval_vis:
env.render()
action = agent.predict(obs)
obs, reward, done, info = env.step(action)
all_reward += reward
if done:
break
return all_reward
def calc_discount_norm_reward(reward_list):
discount_norm_reward = np.zeros_like(reward_list)
discount_cumulative_reward = 0
for i in reversed(range(0, len(reward_list))):
discount_cumulative_reward = (
GAMMA * discount_cumulative_reward + reward_list[i])
discount_norm_reward[i] = discount_cumulative_reward
discount_norm_reward = discount_norm_reward - np.mean(discount_norm_reward)
discount_norm_reward = discount_norm_reward / np.std(discount_norm_reward)
return discount_norm_reward
def main():
env = gym.make("CartPole-v0")
model = CartpoleModel(act_dim=ACT_DIM)
alg = PolicyGradient(model, hyperparas={'lr': LEARNING_RATE})
agent = CartpoleAgent(alg, obs_dim=OBS_DIM, act_dim=ACT_DIM)
for i in range(500):
obs_list, action_list, reward_list = run_train_episode(env, agent)
logger.info("Episode {}, Reward Sum {}.".format(i, sum(reward_list)))
batch_obs = np.array(obs_list)
batch_action = np.array(action_list)
batch_reward = calc_discount_norm_reward(reward_list)
agent.learn(batch_obs, batch_action, batch_reward)
if i % 100 == 0:
all_reward = run_evaluate_episode(env, agent)
logger.info('Test reward: {}'.format(all_reward))
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument(
'--eval_vis',
action='store_true',
help='if set, will visualize the game when evaluating')
args = parser.parse_args()
main()
......@@ -11,4 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from parl.algorithms.dqn_algorithm import *
from parl.algorithms.dqn import *
from parl.algorithms.policy_gradient import *
......@@ -17,8 +17,10 @@ from parl.framework.algorithm_base import Algorithm
import parl.layers as layers
import copy
__all__ = ['DQN']
class DQNAlgorithm(Algorithm):
class DQN(Algorithm):
def __init__(self, model, hyperparas):
Algorithm.__init__(self, model, hyperparas)
self.model = model
......@@ -29,30 +31,32 @@ class DQNAlgorithm(Algorithm):
self.lr = hyperparas['lr']
def define_predict(self, obs):
""" use value model self.model to predict the action value
"""
return self.model.value(obs)
def define_learn(self, obs, action, reward, next_obs, terminal):
""" update value model self.model with DQN algorithm
"""
pred_value = self.model.value(obs)
#fluid.layers.Print(pred_value, summarize=10, message='pred_value')
next_pred_value = self.target_model.value(next_obs)
#fluid.layers.Print(next_pred_value, summarize=10, message='next_pred_value')
best_v = layers.reduce_max(next_pred_value, dim=1)
best_v.stop_gradient = True
#fluid.layers.Print(best_v, summarize=10, message='best_v')
target = reward + (
1.0 - layers.cast(terminal, dtype='float32')) * self.gamma * best_v
#fluid.layers.Print(target, summarize=10, message='target')
action_onehot = layers.one_hot(action, self.action_dim)
action_onehot = layers.cast(action_onehot, dtype='float32')
pred_action_value = layers.reduce_sum(
layers.elementwise_mul(action_onehot, pred_value), dim=1)
#fluid.layers.Print(pred_action_value, summarize=10, message='pred_action_value')
cost = layers.square_error_cost(pred_action_value, target)
cost = layers.reduce_mean(cost)
optimizer = fluid.optimizer.Adam(self.lr * 0.5, epsilon=1e-3)
optimizer = fluid.optimizer.Adam(self.lr, epsilon=1e-3)
optimizer.minimize(cost)
return cost
def sync_target(self, gpu_id):
""" sync parameters of self.target_model with self.model
"""
self.model.sync_params_to(self.target_model, gpu_id=gpu_id)
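As a reading aid for `define_learn` above: the value it regresses the Q-network towards is the standard one-step Bellman target. A rough NumPy sketch with toy inputs (the function and variable names are illustrative, not part of PARL):
```
import numpy as np

def dqn_target(reward, terminal, next_pred_value, gamma=0.99):
    # best_v = max over actions of the target network's value for the next state,
    # masked to zero on terminal transitions.
    best_v = next_pred_value.max(axis=1)
    return reward + (1.0 - terminal.astype('float32')) * gamma * best_v

# Toy batch of two transitions with three actions
next_pred_value = np.array([[1.0, 2.0, 0.5], [0.3, 0.1, 0.2]])
reward = np.array([0.0, 1.0])
terminal = np.array([False, True])
print(dqn_target(reward, terminal, next_pred_value))  # -> [1.98, 1.0]
```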
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid as fluid
from parl.framework.algorithm_base import Algorithm
import parl.layers as layers
__all__ = ['PolicyGradient']
class PolicyGradient(Algorithm):
def __init__(self, model, hyperparas):
Algorithm.__init__(self, model, hyperparas)
self.model = model
self.lr = hyperparas['lr']
def define_predict(self, obs):
""" use policy model self.model to predict the action probability
"""
return self.model.policy(obs)
def define_learn(self, obs, action, reward):
""" update policy model self.model with policy gradient algorithm
"""
act_prob = self.model.policy(obs)
log_prob = layers.cross_entropy(act_prob, action)
cost = log_prob * reward
cost = layers.reduce_mean(cost)
optimizer = fluid.optimizer.Adam(self.lr)
optimizer.minimize(cost)
return cost
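Similarly, the cost built by `define_learn` in `PolicyGradient` is the REINFORCE objective: the cross-entropy term equals the negative log-probability of the chosen action, weighted by the (normalized) return. A rough NumPy analogue with made-up batch values:
```
import numpy as np

def policy_gradient_cost(act_prob, action, reward):
    # cross_entropy(act_prob, action) == -log pi(action | obs)
    neg_log_prob = -np.log(act_prob[np.arange(len(action)), action])
    return np.mean(neg_log_prob * reward)

# Toy batch: two observations, two possible actions, normalized returns as "reward"
act_prob = np.array([[0.7, 0.3], [0.4, 0.6]])
action = np.array([0, 1])
reward = np.array([1.2, -0.8])
print(policy_gradient_cost(act_prob, action, reward))
```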