Unverified commit 0e174277 authored by Bo Zhou, committed by GitHub

remove previous algorithms; use the forward function in parl.Model (#96)

* remove previous algorithms; use the forward function in parl.Model

* remove redundant lines

* yapf
Parent 6efa7871
......@@ -16,7 +16,6 @@ Mean episode reward in training process after 10 million sample steps.
## How to use
### Dependencies
+ python2.7 or python3.5+
+ [paddlepaddle>=1.3.0](https://github.com/PaddlePaddle/Paddle)
+ [parl](https://github.com/PaddlePaddle/PARL)
+ gym
......
......@@ -16,7 +16,6 @@ Results with one learner (in a P40 GPU) and 24 simulators (in 12 CPU) in 10 mill
## How to use
### Dependencies
+ python2.7 or python3.5+
+ [paddlepaddle>=1.3.0](https://github.com/PaddlePaddle/Paddle)
+ [parl](https://github.com/PaddlePaddle/PARL)
+ gym
......
......@@ -20,7 +20,6 @@ Result with one learner (in a P40 GPU) and 32 actors (in 32 CPUs).
## How to use
### Dependencies
+ python2.7 or python3.5+
+ [paddlepaddle>=1.3.0](https://github.com/PaddlePaddle/Paddle)
+ [parl](https://github.com/PaddlePaddle/PARL)
+ gym
......
......@@ -4,7 +4,6 @@ Train an agent with PARL to solve the CartPole problem, a classical benchmark in
## How to use
### Dependencies:
+ python2.7 or python3.5+
+ [paddlepaddle>=1.0.0](https://github.com/PaddlePaddle/Paddle)
+ [parl](https://github.com/PaddlePaddle/PARL)
+ gym
......
......@@ -18,7 +18,7 @@ import parl
from parl import layers
class CartpoleAgent(Agent):
class CartpoleAgent(parl.Agent):
def __init__(self, algorithm, obs_dim, act_dim):
self.obs_dim = obs_dim
self.act_dim = act_dim
......
......@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid as fluid
import parl
from parl import layers
......@@ -25,7 +24,7 @@ class CartpoleModel(parl.Model):
self.fc1 = layers.fc(size=hid1_size, act='tanh')
self.fc2 = layers.fc(size=act_dim, act='softmax')
def policy(self, obs):
def forward(self, obs):
out = self.fc1(obs)
out = self.fc2(out)
return out
......@@ -53,7 +53,9 @@ def main():
for i in range(1000):
obs_list, action_list, reward_list = run_episode(env, agent)
logger.info("Episode {}, Reward Sum {}.".format(i, sum(reward_list)))
if i % 10 == 0:
logger.info("Episode {}, Reward Sum {}.".format(
i, sum(reward_list)))
batch_obs = np.array(obs_list)
batch_action = np.array(action_list)
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid as fluid
import parl.layers as layers
from parl.framework.algorithm_base import Algorithm
from parl.framework.policy_distribution import CategoricalDistribution
__all__ = ['A3C']
class A3C(Algorithm):
def __init__(self, model, hyperparas):
super(A3C, self).__init__(model, hyperparas)
def learn(self, obs, actions, advantages, target_values, learning_rate,
entropy_coeff):
"""
Args:
obs: A float32 tensor of shape ([B] + observation_space).
E.g. [B, C, H, W] in atari.
actions: An int64 tensor of shape [B].
advantages: A float32 tensor of shape [B].
target_values: A float32 tensor of shape [B].
learning_rate: float scalar of learning rate.
entropy_coeff: float scalar of entropy coefficient.
"""
logits = self.model.policy(obs)
policy_distribution = CategoricalDistribution(logits)
actions_log_probs = policy_distribution.logp(actions)
# The policy gradient loss
pi_loss = -1.0 * layers.reduce_sum(actions_log_probs * advantages)
# The value function loss
values = self.model.value(obs)
delta = values - target_values
vf_loss = 0.5 * layers.reduce_sum(layers.square(delta))
# The entropy loss (we want to maximize entropy, so entropy_coeff < 0)
policy_entropy = policy_distribution.entropy()
entropy = layers.reduce_sum(policy_entropy)
total_loss = (pi_loss + vf_loss * self.hp['vf_loss_coeff'] +
entropy * entropy_coeff)
fluid.clip.set_gradient_clip(
clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=40.0))
optimizer = fluid.optimizer.AdamOptimizer(learning_rate)
optimizer.minimize(total_loss)
return total_loss, pi_loss, vf_loss, entropy
def sample(self, obs):
"""
Args:
obs: A float32 tensor of shape ([B] + observation_space).
E.g. [B, C, H, W] in atari.
"""
logits, values = self.model.policy_and_value(obs)
policy_dist = CategoricalDistribution(logits)
sample_actions = policy_dist.sample()
return sample_actions, values
def predict(self, obs):
"""
Args:
obs: A float32 tensor of shape ([B] + observation_space).
E.g. [B, C, H, W] in atari.
"""
logits = self.model.policy(obs)
probs = layers.softmax(logits)
predict_actions = layers.argmax(probs, 1)
return predict_actions
def value(self, obs):
"""
Args:
obs: A float32 tensor of shape ([B] + observation_space).
E.g. [B, C, H, W] in atari.
"""
values = self.model.value(obs)
return values
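For reference, the total loss assembled in `learn` above combines three terms. The NumPy sketch below is illustrative only, not the fluid graph; the default coefficient values are assumptions (`vf_loss_coeff` is normally read from the hyperparameters).

```python
import numpy as np

def a3c_total_loss(log_probs, advantages, values, target_values, policy_entropy,
                   vf_loss_coeff=0.5, entropy_coeff=-0.01):
    pi_loss = -1.0 * np.sum(log_probs * advantages)        # policy gradient term
    vf_loss = 0.5 * np.sum((values - target_values) ** 2)  # value function term
    entropy = np.sum(policy_entropy)                       # entropy_coeff < 0, so entropy is maximized
    return pi_loss + vf_loss * vf_loss_coeff + entropy * entropy_coeff
```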
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import parl.layers as layers
from copy import deepcopy
from paddle import fluid
from parl.framework.algorithm_base import Algorithm
__all__ = ['DDPG']
class DDPG(Algorithm):
def __init__(self, model, hyperparas):
""" model: should implement the function get_actor_params()
"""
Algorithm.__init__(self, model, hyperparas)
self.model = model
self.target_model = deepcopy(model)
# fetch hyper parameters
self.gamma = hyperparas['gamma']
self.tau = hyperparas['tau']
self.actor_lr = hyperparas['actor_lr']
self.critic_lr = hyperparas['critic_lr']
def define_predict(self, obs):
""" use actor model of self.model to predict the action
"""
return self.model.policy(obs)
def define_learn(self, obs, action, reward, next_obs, terminal):
""" update actor and critic model with DDPG algorithm
"""
actor_cost = self._actor_learn(obs)
critic_cost = self._critic_learn(obs, action, reward, next_obs,
terminal)
return actor_cost, critic_cost
def _actor_learn(self, obs):
action = self.model.policy(obs)
Q = self.model.value(obs, action)
cost = layers.reduce_mean(-1.0 * Q)
optimizer = fluid.optimizer.AdamOptimizer(self.actor_lr)
optimizer.minimize(cost, parameter_list=self.model.get_actor_params())
return cost
def _critic_learn(self, obs, action, reward, next_obs, terminal):
next_action = self.target_model.policy(next_obs)
next_Q = self.target_model.value(next_obs, next_action)
terminal = layers.cast(terminal, dtype='float32')
target_Q = reward + (1.0 - terminal) * self.gamma * next_Q
target_Q.stop_gradient = True
Q = self.model.value(obs, action)
cost = layers.square_error_cost(Q, target_Q)
cost = layers.reduce_mean(cost)
optimizer = fluid.optimizer.AdamOptimizer(self.critic_lr)
optimizer.minimize(cost)
return cost
def sync_target(self, gpu_id, decay=None):
if decay is None:
decay = 1.0 - self.tau
self.model.sync_params_to(
self.target_model, gpu_id=gpu_id, decay=decay)
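The critic target built in `_critic_learn` is the standard one-step bootstrapped return using the target networks. A NumPy sketch, illustrative only (the `gamma` value is an assumption):

```python
import numpy as np

def ddpg_critic_target(reward, terminal, next_q, gamma=0.99):
    # target_Q = r + (1 - done) * gamma * Q_target(s', pi_target(s')); gradients are stopped in the graph
    return reward + (1.0 - terminal.astype(np.float32)) * gamma * next_q
```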
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid as fluid
from parl.framework.algorithm_base import Algorithm
import parl.layers as layers
import copy
__all__ = ['DQN']
class DQN(Algorithm):
def __init__(self, model, hyperparas):
Algorithm.__init__(self, model, hyperparas)
self.model = model
self.target_model = copy.deepcopy(model)
# fetch hyper parameters
self.action_dim = hyperparas['action_dim']
self.gamma = hyperparas['gamma']
self.lr = hyperparas['lr']
def define_predict(self, obs):
""" use value model self.model to predict the action value
"""
return self.model.value(obs)
def define_learn(self, obs, action, reward, next_obs, terminal):
""" update value model self.model with DQN algorithm
"""
pred_value = self.model.value(obs)
next_pred_value = self.target_model.value(next_obs)
best_v = layers.reduce_max(next_pred_value, dim=1)
best_v.stop_gradient = True
target = reward + (
1.0 - layers.cast(terminal, dtype='float32')) * self.gamma * best_v
action_onehot = layers.one_hot(action, self.action_dim)
action_onehot = layers.cast(action_onehot, dtype='float32')
pred_action_value = layers.reduce_sum(
layers.elementwise_mul(action_onehot, pred_value), dim=1)
cost = layers.square_error_cost(pred_action_value, target)
cost = layers.reduce_mean(cost)
optimizer = fluid.optimizer.Adam(self.lr, epsilon=1e-3)
optimizer.minimize(cost)
return cost
def sync_target(self, gpu_id):
""" sync parameters of self.target_model with self.model
"""
self.model.sync_params_to(self.target_model, gpu_id=gpu_id)
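Likewise, the target and the gather of Q(s, a) in `define_learn` reduce to the following NumPy sketch (illustrative only; `gamma` is an example value):

```python
import numpy as np

def dqn_target(reward, terminal, next_q_values, gamma=0.99):
    best_v = next_q_values.max(axis=1)                      # max_a' Q_target(s', a')
    return reward + (1.0 - terminal.astype(np.float32)) * gamma * best_v

def q_of_taken_action(q_values, action):
    onehot = np.eye(q_values.shape[1])[action]              # one_hot(action, action_dim)
    return (q_values * onehot).sum(axis=1)                  # reduce_sum(onehot * Q, dim=1)
```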
......@@ -54,7 +54,7 @@ class PolicyGradient(Algorithm):
def predict(self, obs):
""" use policy model self.model to predict the action probability
"""
return self.model.policy(obs)
return self.model(obs)
@deprecated(
deprecated_in='1.2', removed_in='1.3', replace_function='learn')
......@@ -66,7 +66,7 @@ class PolicyGradient(Algorithm):
def learn(self, obs, action, reward):
""" update policy model self.model with policy gradient algorithm
"""
act_prob = self.model.policy(obs)
act_prob = self.model(obs)
log_prob = layers.cross_entropy(act_prob, action)
cost = log_prob * reward
cost = layers.reduce_mean(cost)
......
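The hunk above shows the point of this commit: the algorithm calls the model directly (`self.model(obs)`), which dispatches to `parl.Model.forward`, instead of a dedicated `policy` method. The loss itself is plain REINFORCE; a NumPy sketch, illustrative only:

```python
import numpy as np

def reinforce_loss(act_prob, action, reward):
    # cross_entropy(act_prob, action) = -log(probability of the taken action)
    log_prob = -np.log(act_prob[np.arange(len(action)), action])
    return np.mean(log_prob * reward)
```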
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from parl.algorithms.impala.impala import *
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid as fluid
import parl.layers as layers
from parl.algorithms.impala import vtrace
from parl.framework.algorithm_base import Algorithm
from parl.framework.policy_distribution import CategoricalDistribution
from parl.plutils import inverse
__all__ = ['IMPALA']
class VTraceLoss(object):
def __init__(self,
behaviour_actions_log_probs,
target_actions_log_probs,
policy_entropy,
dones,
discount,
rewards,
values,
bootstrap_value,
entropy_coeff=-0.01,
vf_loss_coeff=0.5,
clip_rho_threshold=1.0,
clip_pg_rho_threshold=1.0):
"""Policy gradient loss with vtrace importance weighting.
VTraceLoss takes tensors of shape [T, B, ...], where `B` is the
batch_size. The reason we need to know `B` is for V-trace to properly
handle episode cut boundaries.
Args:
behaviour_actions_log_probs: A float32 tensor of shape [T, B].
target_actions_log_probs: A float32 tensor of shape [T, B].
policy_entropy: A float32 tensor of shape [T, B].
dones: A float32 tensor of shape [T, B].
discount: A float32 scalar.
rewards: A float32 tensor of shape [T, B].
values: A float32 tensor of shape [T, B].
bootstrap_value: A float32 tensor of shape [B].
"""
self.vtrace_returns = vtrace.from_importance_weights(
behaviour_actions_log_probs=behaviour_actions_log_probs,
target_actions_log_probs=target_actions_log_probs,
discounts=inverse(dones) * discount,
rewards=rewards,
values=values,
bootstrap_value=bootstrap_value,
clip_rho_threshold=clip_rho_threshold,
clip_pg_rho_threshold=clip_pg_rho_threshold)
# The policy gradient loss
self.pi_loss = -1.0 * layers.reduce_sum(
target_actions_log_probs * self.vtrace_returns.pg_advantages)
# The baseline loss
delta = values - self.vtrace_returns.vs
self.vf_loss = 0.5 * layers.reduce_sum(layers.square(delta))
# The entropy loss (we want to maximize entropy, so entropy_coeff < 0)
self.entropy = layers.reduce_sum(policy_entropy)
# The summed weighted loss
self.total_loss = (self.pi_loss + self.vf_loss * vf_loss_coeff +
self.entropy * entropy_coeff)
class IMPALA(Algorithm):
def __init__(self, model, hyperparas):
super(IMPALA, self).__init__(model, hyperparas)
def learn(self, obs, actions, behaviour_logits, rewards, dones,
learning_rate, entropy_coeff):
"""
Args:
obs: A float32 tensor of shape ([B] + observation_space).
E.g. [B, C, H, W] in atari.
actions: An int64 tensor of shape [B].
behaviour_logits: A float32 tensor of shape [B, NUM_ACTIONS].
rewards: A float32 tensor of shape [B].
dones: A float32 tensor of shape [B].
learning_rate: float scalar of learning rate.
entropy_coeff: float scalar of entropy coefficient.
"""
values = self.model.value(obs)
target_logits = self.model.policy(obs)
target_policy_distribution = CategoricalDistribution(target_logits)
behaviour_policy_distribution = CategoricalDistribution(
behaviour_logits)
policy_entropy = target_policy_distribution.entropy()
target_actions_log_probs = target_policy_distribution.logp(actions)
behaviour_actions_log_probs = behaviour_policy_distribution.logp(
actions)
# Calculating kl for debug
kl = target_policy_distribution.kl(behaviour_policy_distribution)
kl = layers.reduce_mean(kl)
"""
Split the tensor into batches at known episode cut boundaries.
[B * T] -> [T, B]
"""
T = self.hp["sample_batch_steps"]
def split_batches(tensor):
B = tensor.shape[0] // T
split_tensor = layers.reshape(tensor,
[B, T] + list(tensor.shape[1:]))
# transpose B and T
return layers.transpose(
split_tensor, [1, 0] + list(range(2, 1 + len(tensor.shape))))
behaviour_actions_log_probs = split_batches(
behaviour_actions_log_probs)
target_actions_log_probs = split_batches(target_actions_log_probs)
policy_entropy = split_batches(policy_entropy)
dones = split_batches(dones)
rewards = split_batches(rewards)
values = split_batches(values)
# [T, B] -> [T - 1, B] for V-trace calc.
behaviour_actions_log_probs = layers.slice(
behaviour_actions_log_probs, axes=[0], starts=[0], ends=[-1])
target_actions_log_probs = layers.slice(
target_actions_log_probs, axes=[0], starts=[0], ends=[-1])
policy_entropy = layers.slice(
policy_entropy, axes=[0], starts=[0], ends=[-1])
dones = layers.slice(dones, axes=[0], starts=[0], ends=[-1])
rewards = layers.slice(rewards, axes=[0], starts=[0], ends=[-1])
bootstrap_value = layers.slice(
values, axes=[0], starts=[T - 1], ends=[T])
values = layers.slice(values, axes=[0], starts=[0], ends=[-1])
bootstrap_value = layers.squeeze(bootstrap_value, axes=[0])
vtrace_loss = VTraceLoss(
behaviour_actions_log_probs=behaviour_actions_log_probs,
target_actions_log_probs=target_actions_log_probs,
policy_entropy=policy_entropy,
dones=dones,
discount=self.hp['gamma'],
rewards=rewards,
values=values,
bootstrap_value=bootstrap_value,
entropy_coeff=entropy_coeff,
vf_loss_coeff=self.hp['vf_loss_coeff'],
clip_rho_threshold=self.hp['clip_rho_threshold'],
clip_pg_rho_threshold=self.hp['clip_pg_rho_threshold'])
fluid.clip.set_gradient_clip(
clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=40.0))
optimizer = fluid.optimizer.AdamOptimizer(learning_rate)
optimizer.minimize(vtrace_loss.total_loss)
return vtrace_loss, kl
def sample(self, obs):
"""
Args:
obs: A float32 tensor of shape ([B] + observation_space).
E.g. [B, C, H, W] in atari.
"""
logits = self.model.policy(obs)
policy_dist = CategoricalDistribution(logits)
sample_actions = policy_dist.sample()
return sample_actions, logits
def predict(self, obs):
"""
Args:
obs: A float32 tensor of shape ([B] + observation_space).
E.g. [B, C, H, W] in atari.
"""
logits = self.model.policy(obs)
probs = layers.softmax(logits)
predict_actions = layers.argmax(probs, 1)
return predict_actions
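The `split_batches` helper inside `learn` turns flat rollouts into time-major tensors before slicing for V-trace. An equivalent NumPy sketch (illustrative only):

```python
import numpy as np

def split_batches_np(x, T):
    # [B * T, ...] -> [B, T, ...] -> [T, B, ...]
    B = x.shape[0] // T
    x = x.reshape((B, T) + x.shape[1:])
    return x.transpose((1, 0) + tuple(range(2, x.ndim)))
```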
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for V-trace.
The following code is mainly referenced and copied from:
https://github.com/deepmind/scalable_agent/blob/master/vtrace_test.py
"""
import copy
import numpy as np
import unittest
import parl.layers as layers
from paddle import fluid
from parameterized import parameterized
from parl.algorithms.impala import vtrace
from parl.utils import get_gpu_count
def _shaped_arange(*shape):
"""Runs np.arange, converts to float and reshapes."""
return np.arange(np.prod(shape), dtype=np.float32).reshape(*shape)
def _ground_truth_calculation(behaviour_actions_log_probs,
target_actions_log_probs, discounts, rewards,
values, bootstrap_value, clip_rho_threshold,
clip_pg_rho_threshold):
"""Calculates the ground truth for V-trace in Python/Numpy."""
log_rhos = target_actions_log_probs - behaviour_actions_log_probs
vs = []
seq_len = len(discounts)
rhos = np.exp(log_rhos)
cs = np.minimum(rhos, 1.0)
clipped_rhos = rhos
if clip_rho_threshold:
clipped_rhos = np.minimum(rhos, clip_rho_threshold)
clipped_pg_rhos = rhos
if clip_pg_rho_threshold:
clipped_pg_rhos = np.minimum(rhos, clip_pg_rho_threshold)
# This is a very inefficient way to calculate the V-trace ground truth.
# We calculate it this way because it is close to the mathematical notation of
# V-trace.
# v_s = V(x_s)
# + \sum^{T-1}_{t=s} \gamma^{t-s}
# * \prod_{i=s}^{t-1} c_i
# * \rho_t (r_t + \gamma V(x_{t+1}) - V(x_t))
# Note that when we take the product over c_i, we write `s:t`, since the paper's
# notation is inclusive of `t-1`, whereas Python slicing is exclusive.
# Also note that np.prod([]) == 1.
values_t_plus_1 = np.concatenate([values, bootstrap_value[None, :]],
axis=0)
for s in range(seq_len):
v_s = np.copy(values[s]) # Very important copy.
for t in range(s, seq_len):
v_s += (np.prod(discounts[s:t], axis=0) * np.prod(cs[s:t], axis=0)
* clipped_rhos[t] * (rewards[t] + discounts[t] *
values_t_plus_1[t + 1] - values[t]))
vs.append(v_s)
vs = np.stack(vs, axis=0)
pg_advantages = (clipped_pg_rhos * (rewards + discounts * np.concatenate(
[vs[1:], bootstrap_value[None, :]], axis=0) - values))
return vtrace.VTraceReturns(vs=vs, pg_advantages=pg_advantages)
class VtraceTest(unittest.TestCase):
def setUp(self):
gpu_count = get_gpu_count()
if gpu_count > 0:
place = fluid.CUDAPlace(0)
self.gpu_id = 0
else:
place = fluid.CPUPlace()
self.gpu_id = -1
self.executor = fluid.Executor(place)
@parameterized.expand([('Batch1', 1), ('Batch4', 4)])
def test_from_importance_weights(self, name, batch_size):
"""Tests V-trace against ground truth data calculated in python."""
seq_len = 5
# Create log_rhos such that rho will span from near-zero to above the
# clipping thresholds. In particular, calculate log_rhos in [-2.5, 2.5),
# so that rho is in approx [0.08, 12.2).
log_rhos = _shaped_arange(seq_len, batch_size) / (batch_size * seq_len)
log_rhos = 5 * (log_rhos - 0.5) # [0.0, 1.0) -> [-2.5, 2.5).
# Fake behaviour_actions_log_probs, target_actions_log_probs
target_actions_log_probs = log_rhos + 1.0
behaviour_actions_log_probs = np.ones(
shape=log_rhos.shape, dtype='float32')
values = {
'behaviour_actions_log_probs':
behaviour_actions_log_probs,
'target_actions_log_probs':
target_actions_log_probs,
# T, B where B_i: [0.9 / (i+1)] * T
'discounts':
np.array([[0.9 / (b + 1) for b in range(batch_size)]
for _ in range(seq_len)],
dtype=np.float32),
'rewards':
_shaped_arange(seq_len, batch_size),
'values':
_shaped_arange(seq_len, batch_size) / batch_size,
'bootstrap_value':
_shaped_arange(batch_size) + 1.0,
'clip_rho_threshold':
3.7,
'clip_pg_rho_threshold':
2.2,
}
# Calculated by numpy/python
ground_truth_v = _ground_truth_calculation(**values)
# Calculated by Fluid
test_program = fluid.Program()
with fluid.program_guard(test_program):
behaviour_actions_log_probs_input = layers.data(
name='behaviour_actions_log_probs',
shape=[seq_len, batch_size],
dtype='float32',
append_batch_size=False)
target_actions_log_probs_input = layers.data(
name='target_actions_log_probs',
shape=[seq_len, batch_size],
dtype='float32',
append_batch_size=False)
discounts_input = layers.data(
name='discounts',
shape=[seq_len, batch_size],
dtype='float32',
append_batch_size=False)
rewards_input = layers.data(
name='rewards',
shape=[seq_len, batch_size],
dtype='float32',
append_batch_size=False)
values_input = layers.data(
name='values',
shape=[seq_len, batch_size],
dtype='float32',
append_batch_size=False)
bootstrap_value_input = layers.data(
name='bootstrap_value',
shape=[batch_size],
dtype='float32',
append_batch_size=False)
fluid_inputs = {
'behaviour_actions_log_probs':
behaviour_actions_log_probs_input,
'target_actions_log_probs': target_actions_log_probs_input,
'discounts': discounts_input,
'rewards': rewards_input,
'values': values_input,
'bootstrap_value': bootstrap_value_input,
'clip_rho_threshold': 3.7,
'clip_pg_rho_threshold': 2.2,
}
output = vtrace.from_importance_weights(**fluid_inputs)
self.executor.run(fluid.default_startup_program())
feed = copy.copy(values)
del feed['clip_rho_threshold']
del feed['clip_pg_rho_threshold']
[output_vs, output_pg_advantage] = self.executor.run(
test_program,
feed=feed,
fetch_list=[output.vs, output.pg_advantages])
np.testing.assert_almost_equal(ground_truth_v.vs, output_vs, 5)
np.testing.assert_almost_equal(ground_truth_v.pg_advantages,
output_pg_advantage, 5)
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Functions to compute V-trace off-policy actor critic targets,
which used in IMAPLA algorithm.
The following code is mainly referenced and copied from:
https://github.com/deepmind/scalable_agent/blob/master/vtrace.py
For details and theory see:
"Espeholt L, Soyer H, Munos R, et al. Impala: Scalable distributed
deep-rl with importance weighted actor-learner
architectures[J]. arXiv preprint arXiv:1802.01561, 2018."
"""
import collections
import paddle.fluid as fluid
import parl.layers as layers
from parl.utils import MAX_INT32
VTraceReturns = collections.namedtuple('VTraceReturns',
['vs', 'pg_advantages'])
def from_importance_weights(behaviour_actions_log_probs,
target_actions_log_probs,
discounts,
rewards,
values,
bootstrap_value,
clip_rho_threshold=1.0,
clip_pg_rho_threshold=1.0,
name='vtrace_from_logits'):
r"""V-trace for softmax policies.
Calculates V-trace actor-critic targets for softmax policies as described in
"IMPALA: Scalable Distributed Deep-RL with
Importance Weighted Actor-Learner Architectures"
by Espeholt, Soyer, Munos et al.
Target policy refers to the policy we are interested in improving and
behaviour policy refers to the policy that generated the given
rewards and actions.
In the notation used throughout documentation and comments, T refers to the
time dimension ranging from 0 to T-1. B refers to the batch size and
NUM_ACTIONS refers to the number of actions.
Args:
behaviour_actions_log_probs: A float32 tensor of shape [T, B] of
log-probabilities of actions in behaviour policy.
target_actions_log_probs: A float32 tensor of shape [T, B] of
log-probabilities of actions in the target policy.
discounts: A float32 tensor of shape [T, B] with the discount encountered
when following the behaviour policy.
rewards: A float32 tensor of shape [T, B] with the rewards generated by
following the behaviour policy.
values: A float32 tensor of shape [T, B] with the value function estimates
wrt. the target policy.
bootstrap_value: A float32 tensor of shape [B] with the value function estimate
at time T.
clip_rho_threshold: A scalar float32 tensor with the clipping threshold for
importance weights (rho) when calculating the baseline targets (vs).
rho^bar in the paper.
clip_pg_rho_threshold: A scalar float32 tensor with the clipping threshold
on rho_s in \rho_s \delta log \pi(a|x) (r + \gamma v_{s+1} - V(x_s)).
name: The name scope that all V-trace operations will be created in.
Returns:
A VTraceReturns namedtuple (vs, pg_advantages) where:
vs: A float32 tensor of shape [T, B]. Can be used as target to
train a baseline (V(x_t) - vs_t)^2.
pg_advantages: A float32 tensor of shape [T, B]. Can be used as the
advantage in the calculation of policy gradients.
"""
rank = len(behaviour_actions_log_probs.shape) # Usually 2.
assert len(target_actions_log_probs.shape) == rank
assert len(values.shape) == rank
assert len(bootstrap_value.shape) == (rank - 1)
assert len(discounts.shape) == rank
assert len(rewards.shape) == rank
# log importance sampling weights.
# V-trace performs operations on rhos in log-space for numerical stability.
log_rhos = target_actions_log_probs - behaviour_actions_log_probs
if clip_rho_threshold is not None:
clip_rho_threshold = layers.fill_constant([1], 'float32',
clip_rho_threshold)
if clip_pg_rho_threshold is not None:
clip_pg_rho_threshold = layers.fill_constant([1], 'float32',
clip_pg_rho_threshold)
rhos = layers.exp(log_rhos)
if clip_rho_threshold is not None:
clipped_rhos = layers.elementwise_min(rhos, clip_rho_threshold)
else:
clipped_rhos = rhos
constant_one = layers.fill_constant([1], 'float32', 1.0)
cs = layers.elementwise_min(rhos, constant_one)
# Append bootstrapped value to get [v1, ..., v_t+1]
values_1_t = layers.slice(values, axes=[0], starts=[1], ends=[MAX_INT32])
values_t_plus_1 = layers.concat(
[values_1_t, layers.unsqueeze(bootstrap_value, [0])], axis=0)
# \delta_s * V
deltas = clipped_rhos * (rewards + discounts * values_t_plus_1 - values)
vs_minus_v_xs = recursively_scan(discounts, cs, deltas)
# Add V(x_s) to get v_s.
vs = layers.elementwise_add(vs_minus_v_xs, values)
# Advantage for policy gradient.
vs_1_t = layers.slice(vs, axes=[0], starts=[1], ends=[MAX_INT32])
vs_t_plus_1 = layers.concat(
[vs_1_t, layers.unsqueeze(bootstrap_value, [0])], axis=0)
if clip_pg_rho_threshold is not None:
clipped_pg_rhos = layers.elementwise_min(rhos, clip_pg_rho_threshold)
else:
clipped_pg_rhos = rhos
pg_advantages = (
clipped_pg_rhos * (rewards + discounts * vs_t_plus_1 - values))
# Make sure no gradients backpropagated through the returned values.
vs.stop_gradient = True
pg_advantages.stop_gradient = True
return VTraceReturns(vs=vs, pg_advantages=pg_advantages)
def recursively_scan(discounts, cs, deltas):
""" Recursively calculate vs_minus_v_xs according to following equation:
vs_minus_v_xs(t) = deltas(t) + discounts(t) * cs(t) * vs_minus_v_xs(t + 1)
Args:
discounts: A float32 tensor of shape [T, B] with discounts encountered when
following the behaviour policy.
cs: A float32 tensor of shape [T, B], which corresponds to $c_s$ in the
original paper.
deltas: A float32 tensor of shape [T, B], which corresponds to
$\delta_s * V$ in the original paper.
Returns:
vs_minus_v_xs: A float32 tensor of shape [T, B], which corresponds to
$v_s - V(x_s)$ in the original paper.
"""
# All sequences are reversed, computation starts from the back.
reverse_discounts = layers.reverse(x=discounts, axis=[0])
reverse_cs = layers.reverse(x=cs, axis=[0])
reverse_deltas = layers.reverse(x=deltas, axis=[0])
static_while = layers.StaticRNN()
# init: shape [B]
init = layers.fill_constant_batch_size_like(
discounts, shape=[1], dtype='float32', value=0.0, input_dim_idx=1)
with static_while.step():
discount_t = static_while.step_input(reverse_discounts)
c_t = static_while.step_input(reverse_cs)
delta_t = static_while.step_input(reverse_deltas)
vs_minus_v_xs_t_plus_1 = static_while.memory(init=init)
vs_minus_v_xs_t = delta_t + discount_t * c_t * vs_minus_v_xs_t_plus_1
static_while.update_memory(vs_minus_v_xs_t_plus_1, vs_minus_v_xs_t)
static_while.step_output(vs_minus_v_xs_t)
vs_minus_v_xs = static_while()
# Reverse the results back to original order.
vs_minus_v_xs = layers.reverse(vs_minus_v_xs, [0])
return vs_minus_v_xs
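For intuition, the backward recursion implemented with `StaticRNN` above is equivalent to this NumPy reference loop (illustrative only):

```python
import numpy as np

def recursively_scan_np(discounts, cs, deltas):
    # vs_minus_v_xs[t] = deltas[t] + discounts[t] * cs[t] * vs_minus_v_xs[t + 1]
    acc = np.zeros_like(deltas[0])
    out = np.zeros_like(deltas)
    for t in reversed(range(deltas.shape[0])):
        acc = deltas[t] + discounts[t] * cs[t] * acc
        out[t] = acc
    return out
```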
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid as fluid
from parl.framework.algorithm_base import Algorithm
import parl.layers as layers
__all__ = ['PolicyGradient']
class PolicyGradient(Algorithm):
def __init__(self, model, hyperparas):
Algorithm.__init__(self, model, hyperparas)
self.model = model
self.lr = hyperparas['lr']
def define_predict(self, obs):
""" use policy model self.model to predict the action probability
"""
return self.model.policy(obs)
def define_learn(self, obs, action, reward):
""" update policy model self.model with policy gradient algorithm
"""
act_prob = self.model.policy(obs)
log_prob = layers.cross_entropy(act_prob, action)
cost = log_prob * reward
cost = layers.reduce_mean(cost)
optimizer = fluid.optimizer.Adam(self.lr)
optimizer.minimize(cost)
return cost
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import parl.layers as layers
from copy import deepcopy
from paddle import fluid
from parl.framework.algorithm_base import Algorithm
__all__ = ['PPO']
class PPO(Algorithm):
def __init__(self, model, hyperparas):
Algorithm.__init__(self, model, hyperparas)
# Used to calculate probability of action in old policy
self.old_policy_model = deepcopy(model.policy_model)
# fetch hyper parameters
self.act_dim = hyperparas['act_dim']
self.policy_lr = hyperparas['policy_lr']
self.value_lr = hyperparas['value_lr']
if 'epsilon' in hyperparas:
self.epsilon = hyperparas['epsilon']
else:
self.epsilon = 0.2 # default
def _calc_logprob(self, actions, means, logvars):
""" Calculate log probabilities of actions, when given means and logvars
of normal distribution.
The constant sqrt(2 * pi) is omitted, which will be eliminated in later.
Args:
actions: shape (batch_size, act_dim)
means: shape (batch_size, act_dim)
logvars: shape (act_dim)
Returns:
logprob: shape (batch_size)
"""
exp_item = layers.elementwise_div(
layers.square(actions - means), layers.exp(logvars), axis=1)
exp_item = -0.5 * layers.reduce_sum(exp_item, dim=1)
vars_item = -0.5 * layers.reduce_sum(logvars)
logprob = exp_item + vars_item
return logprob
def _calc_kl(self, means, logvars, old_means, old_logvars):
""" Calculate KL divergence between old and new distributions
See: https://en.wikipedia.org/wiki/Multivariate_normal_distribution#Kullback.E2.80.93Leibler_divergence
Args:
means: shape (batch_size, act_dim)
logvars: shape (act_dim)
old_means: shape (batch_size, act_dim)
old_logvars: shape (act_dim)
Returns:
kl: shape (batch_size)
"""
log_det_cov_old = layers.reduce_sum(old_logvars)
log_det_cov_new = layers.reduce_sum(logvars)
tr_old_new = layers.reduce_sum(layers.exp(old_logvars - logvars))
kl = 0.5 * (layers.reduce_sum(
layers.square(means - old_means) / layers.exp(logvars), dim=1) + (
log_det_cov_new - log_det_cov_old) + tr_old_new - self.act_dim)
return kl
def define_predict(self, obs):
""" Use policy model of self.model to predict means and logvars of actions
"""
means, logvars = self.model.policy(obs)
return means
def define_sample(self, obs):
""" Use policy model of self.model to sample actions
"""
sampled_act = self.model.policy_sample(obs)
return sampled_act
def define_policy_learn(self, obs, actions, advantages, beta=None):
""" Learn policy model with:
1. CLIP loss: Clipped Surrogate Objective
2. KLPEN loss: Adaptive KL Penalty Objective
See: https://arxiv.org/pdf/1707.02286.pdf
Args:
obs: Tensor, (batch_size, obs_dim)
actions: Tensor, (batch_size, act_dim)
advantages: Tensor (batch_size, )
beta: Tensor (1) or None
if None, use CLIP Loss; else, use KLPEN loss.
"""
old_means, old_logvars = self.old_policy_model.policy(obs)
old_means.stop_gradient = True
old_logvars.stop_gradient = True
old_logprob = self._calc_logprob(actions, old_means, old_logvars)
means, logvars = self.model.policy(obs)
logprob = self._calc_logprob(actions, means, logvars)
kl = self._calc_kl(means, logvars, old_means, old_logvars)
kl = layers.reduce_mean(kl)
if beta is None: # Clipped Surrogate Objective
pg_ratio = layers.exp(logprob - old_logprob)
clipped_pg_ratio = layers.clip(pg_ratio, 1 - self.epsilon,
1 + self.epsilon)
surrogate_loss = layers.elementwise_min(
advantages * pg_ratio, advantages * clipped_pg_ratio)
loss = 0 - layers.reduce_mean(surrogate_loss)
else: # Adaptive KL Penalty Objective
# policy gradient loss
loss1 = 0 - layers.reduce_mean(
advantages * layers.exp(logprob - old_logprob))
# adaptive kl loss
loss2 = kl * beta
loss = loss1 + loss2
optimizer = fluid.optimizer.AdamOptimizer(self.policy_lr)
optimizer.minimize(loss)
return loss, kl
def define_value_predict(self, obs):
""" Use value model of self.model to predict value of obs
"""
return self.model.value(obs)
def define_value_learn(self, obs, val):
""" Learn value model with square error cost
"""
predict_val = self.model.value(obs)
loss = layers.square_error_cost(predict_val, val)
loss = layers.reduce_mean(loss)
optimizer = fluid.optimizer.AdamOptimizer(self.value_lr)
optimizer.minimize(loss)
return loss
def sync_old_policy(self, gpu_id):
""" Synchronize parameters of self.model.policy_model to self.old_policy_model
"""
self.model.policy_model.sync_params_to(
self.old_policy_model, gpu_id=gpu_id)
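The CLIP branch of `define_policy_learn` is the standard clipped surrogate objective. A NumPy sketch, illustrative only (`epsilon` defaults to 0.2 as above):

```python
import numpy as np

def clipped_surrogate_loss(logprob, old_logprob, advantages, epsilon=0.2):
    pg_ratio = np.exp(logprob - old_logprob)
    clipped_pg_ratio = np.clip(pg_ratio, 1.0 - epsilon, 1.0 + epsilon)
    surrogate = np.minimum(advantages * pg_ratio, advantages * clipped_pg_ratio)
    return -np.mean(surrogate)
```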
......@@ -92,4 +92,4 @@ class ModelBase(object):
def __call__(self, *args, **kwargs):
"""Call forward function.
"""
self.forward(*args, **kwargs)
return self.forward(*args, **kwargs)
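This one-line change is what allows algorithms to write `self.model(obs)` instead of `self.model.policy(obs)`. A self-contained sketch of the dispatch, simplified from the class above (the `DoubleModel` subclass is purely hypothetical):

```python
class ModelBase(object):
    def forward(self, *args, **kwargs):
        raise NotImplementedError

    def __call__(self, *args, **kwargs):
        # Previously the result of forward() was dropped; now it is returned.
        return self.forward(*args, **kwargs)


class DoubleModel(ModelBase):
    def forward(self, obs):
        return obs * 2  # stand-in for a policy network


assert DoubleModel()(3) == 6
```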