preliminary implementations of the ComputationTask, Algorithm, and Model classes (#9)

* preliminary implementations of ComputationTask, Algorithm and Model classes * remove "model_func" from the args of an algorithm * a clean clone() function for Algorithm and Model * add use_next_value as an input to learn() * further re-structure * added Feedforward and RLAlgorithm classes * maxid -> argmax * discrete_distribution -> category_distribution * category -> categorical * revisions

preliminary implementations of the ComputationTask, Algorithm, and Model classes (#9)
* preliminary implementations of ComputationTask, Algorithm and Model classes * remove "model_func" from the args of an algorithm * a clean clone() function for Algorithm and Model * add use_next_value as an input to learn() * further re-structure * added Feedforward and RLAlgorithm classes * maxid -> argmax * discrete_distribution -> category_distribution * category -> categorical * revisions
4b4b5824 · Haonan · emailweixu · 2ce57115 · 4b4b5824 · 4b4b5824
14 changed files
--- a/parl/algorithm_zoo/__init__.py
+++ b/parl/algorithm_zoo/__init__.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
--- a/parl/algorithm_zoo/simple_algorithms.py
+++ b/parl/algorithm_zoo/simple_algorithms.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from parl.framework.algorithm import RLAlgorithm
+import parl.layers as layers
+import parl.framework.policy_distribution as pd
+from parl.layers import common_functions as comf
+import paddle.fluid as fluid
+from copy import deepcopy
+
+
+class SimpleAC(RLAlgorithm):
+    """
+    A simple Actor-Critic that has a feedforward policy network and
+    a single discrete action.
+
+    learn() requires keywords: "action", "reward", "v_value"
+    """
+
+    def __init__(self,
+                 model,
+                 hyperparas=None,
+                 gpu_id=-1,
+                 discount_factor=0.99):
+        """
+        Input: model(Model): the model that provides policy() and value()
+               hyperparas(dict): hyperparameters; learn() reads "lr" from it.
+                                 Defaults to dict(lr=1e-4) when omitted.
+               gpu_id(int): forwarded to the base class
+               discount_factor(float): weight of the next value in the
+                                       critic target
+        """
+        ## build the default per instance so that instances never share
+        ## (and accidentally mutate) the same hyperparameter dict
+        if hyperparas is None:
+            hyperparas = dict(lr=1e-4)
+
+        super(SimpleAC, self).__init__(model, hyperparas, gpu_id)
+        self.discount_factor = discount_factor
+
+    def learn(self, inputs, next_inputs, states, next_states, episode_end,
+              actions, rewards):
+        """
+        Build the actor-critic cost and attach an SGD optimizer to it.
+
+        The cost is the mean of the squared TD error plus the negative
+        log-likelihood of the taken action weighted by the TD error.
+        Output: dict(cost=avg_cost)
+        """
+
+        action = actions["action"]
+        reward = rewards["reward"]
+
+        values = self.model.value(inputs, states)
+        next_values = self.model.value(next_inputs, next_states)
+        value = values["v_value"]
+        ## the next value is scaled by the "episode_end" input
+        ## (presumably a mask that zeroes it at episode end -- confirm
+        ## against the data feeder)
+        next_value = next_values["v_value"] * episode_end["episode_end"]
+        ## the bootstrapped target is not differentiated through
+        next_value.stop_gradient = True
+        assert value.shape[1] == next_value.shape[1]
+
+        critic_value = reward + self.discount_factor * next_value
+        td_error = critic_value - value
+        value_cost = layers.square(td_error)
+
+        dist, _ = self.model.policy(inputs, states)
+        dist = dist["action"]
+        assert isinstance(dist, pd.CategoricalDistribution)
+
+        pg_cost = 0 - dist.loglikelihood(action)
+        avg_cost = layers.mean(x=value_cost + pg_cost * td_error)
+        optimizer = fluid.optimizer.SGD(learning_rate=self.hp["lr"])
+        optimizer.minimize(avg_cost)
+        return dict(cost=avg_cost)
+
+
+class SimpleQ(RLAlgorithm):
+    """
+    A simple Q-learning that has a feedforward policy network and a single discrete action.
+
+    learn() requires keywords: "action", "reward", "q_value"
+    """
+
+    def __init__(self,
+                 model,
+                 hyperparas=None,
+                 gpu_id=-1,
+                 discount_factor=0.99,
+                 update_ref_interval=100):
+        """
+        Input: model(Model): the training model; a deep copy of it is kept
+                             as the reference model
+               hyperparas(dict): hyperparameters; learn() reads "lr" from it.
+                                 Defaults to dict(lr=1e-4) when omitted.
+               gpu_id(int): forwarded to the base class, which stores it
+               discount_factor(float): weight of the next value in the
+                                       critic target
+               update_ref_interval(int): number of batches between two
+                                         syncs of the reference model; > 0
+        """
+        ## build the default per instance so that instances never share
+        ## (and accidentally mutate) the same hyperparameter dict
+        if hyperparas is None:
+            hyperparas = dict(lr=1e-4)
+
+        ## the base class already stores model, hyperparas and gpu_id
+        super(SimpleQ, self).__init__(model, hyperparas, gpu_id)
+        self.discount_factor = discount_factor
+        assert update_ref_interval > 0
+        self.update_ref_interval = update_ref_interval
+        self.total_batches = 0
+        ## create a reference model
+        self.ref_model = deepcopy(model)
+
+    def before_every_batch(self):
+        """
+        Sync the training model's parameters to the reference model once
+        every update_ref_interval batches (including the very first batch),
+        then count the batch.
+        """
+        if self.total_batches % self.update_ref_interval == 0:
+            self.model.sync_paras_to(self.ref_model, self.gpu_id)
+        self.total_batches += 1
+
+    def learn(self, inputs, next_inputs, states, next_states, episode_end,
+              actions, rewards):
+        """
+        Build the Q-learning cost and attach an SGD optimizer to it.
+
+        The cost is the mean squared TD error between the Q value of the
+        taken action and reward + discount_factor * max next Q value, where
+        the next Q values come from the reference model.
+        Output: dict(cost=avg_cost)
+        """
+
+        action = actions["action"]
+        reward = rewards["reward"]
+
+        values = self.model.value(inputs, states)
+        next_values = self.ref_model.value(next_inputs, next_states)
+        q_value = values["q_value"]
+        ## the next Q values are scaled by the "episode_end" input
+        ## (presumably a mask that zeroes them at episode end -- confirm
+        ## against the data feeder)
+        next_q_value = next_values["q_value"] * episode_end["episode_end"]
+        ## the bootstrapped target is not differentiated through
+        next_q_value.stop_gradient = True
+        next_value = layers.reduce_max(next_q_value, dim=-1)
+        assert q_value.shape[1] == next_q_value.shape[1]
+
+        ## pick the Q value of the action that was actually taken
+        value = comf.idx_select(input=q_value, idx=action)
+        critic_value = reward + self.discount_factor * next_value
+        td_error = critic_value - value
+
+        avg_cost = layers.mean(x=layers.square(td_error))
+        optimizer = fluid.optimizer.SGD(learning_rate=self.hp["lr"])
+        optimizer.minimize(avg_cost)
+        return dict(cost=avg_cost)
--- a/parl/framework/__init__.py
+++ b/parl/framework/__init__.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
--- a/parl/framework/algorithm.py
+++ b/parl/framework/algorithm.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.fluid as fluid
+import parl.layers as layers
+from parl.layers import Network
+import parl.framework.policy_distribution as pd
+from abc import ABCMeta, abstractmethod
+
+
+def check_duplicate_spec_names(model):
+    """
+    Check if there are two specs that have the same name.
+
+    The input, action, state and reward specs of the model are checked
+    together, so a name must be unique across all four groups.
+    Input: model: an object providing get_input_specs(),
+                  get_action_specs(), get_state_specs() and
+                  get_reward_specs(), each returning a list of
+                  (name, args) tuples
+    Raises: AssertionError listing the duplicated names in sorted order
+    """
+    specs = model.get_input_specs() \
+            + model.get_action_specs() \
+            + model.get_state_specs() \
+            + model.get_reward_specs()
+    ## single pass: a name seen for the second time is a duplicate
+    seen = set()
+    duplicates = set()
+    for name, _ in specs:
+        if name in seen:
+            duplicates.add(name)
+        else:
+            seen.add(name)
+    ## sort so that the error message is deterministic
+    assert not duplicates, \
+        "duplicate names with different specs: " + " ".join(sorted(duplicates))
+
+
+class Model(Network):
+    """
+    The network side of a problem. A Model is owned by an Algorithm and
+    implements all the network model of a specific problem.
+
+    Subclasses must describe their inputs and actions; states, rewards,
+    policy() and value() are optional.
+    """
+    __metaclass__ = ABCMeta
+
+    def __init__(self):
+        super(Model, self).__init__()
+
+    @abstractmethod
+    def get_input_specs(self):
+        """
+        Describe the inputs of the model.
+        Output: list of tuples
+        """
+
+    def get_state_specs(self):
+        """
+        Describe the states of the model. States are optional to a Model,
+        so the default is no state at all.
+        Output: list of tuples
+        """
+        return list()
+
+    @abstractmethod
+    def get_action_specs(self):
+        """
+        Describe the actions of the model.
+        Output: list of tuples
+        """
+
+    def get_reward_specs(self):
+        """
+        Describe the rewards of the model.
+        By default, a scalar reward named "reward".
+        User can specify a vector of rewards for some problems.
+        Output: list of tuples
+        """
+        scalar_reward = ("reward", {"shape": [1]})
+        return [scalar_reward]
+
+    def policy(self, inputs, states):
+        """
+        Return: action_dists: a dict of action distribution objects
+                states
+                An action distribution object can be created with
+                PolicyDistribution().
+        Optional: a model might not always have to implement policy();
+                  the default raises NotImplementedError.
+        """
+        raise NotImplementedError
+
+    def value(self, inputs, states):
+        """
+        Return: values: a dict of estimated values for the current
+                        observations and states.
+                        For example, "q_value" and "v_value"
+        Optional: a model might not always have to implement value();
+                  the default raises NotImplementedError.
+        """
+        raise NotImplementedError
+
+
+class Algorithm(object):
+    """
+    An Algorithm implements two functions:
+    1. predict() computes forward
+    2. learn() computes a cost for optimization
+
+    An algorithm should be only part of a network. The user only needs to
+    implement the rest of the network in the Model class.
+    """
+
+    def __init__(self, model, hyperparas, gpu_id):
+        """
+        Input: model(Model): must be a Model whose spec names are all
+                             distinct
+               hyperparas: stored as self.hp
+               gpu_id: stored as self.gpu_id
+        """
+        assert isinstance(model, Model)
+        check_duplicate_spec_names(model)
+        self.model, self.hp, self.gpu_id = model, hyperparas, gpu_id
+
+    def get_input_specs(self):
+        """
+        Forward the input specs of the model.
+        """
+        specs = self.model.get_input_specs()
+        return specs
+
+    def get_state_specs(self):
+        """
+        Forward the state specs of the model.
+        """
+        specs = self.model.get_state_specs()
+        return specs
+
+    def get_action_specs(self):
+        """
+        Forward the action specs of the model.
+        For non-RL algorithms, this can return []
+        """
+        specs = self.model.get_action_specs()
+        return specs
+
+    def get_reward_specs(self):
+        """
+        Forward the reward specs of the model.
+        For non-RL algorithms, this can return []
+        """
+        specs = self.model.get_reward_specs()
+        return specs
+
+    def before_every_batch(self):
+        """
+        A callback function inserted before every batch of training.
+        See ComputationTask.learn()
+        The default does nothing.
+        """
+        return None
+
+    def after_every_batch(self):
+        """
+        A callback function inserted after every batch of training.
+        See ComputationTask.learn()
+        The default does nothing.
+        """
+        return None
+
+    def predict(self, inputs, states):
+        """
+        Given the inputs and states, this function does forward prediction and updates states.
+        Optional: an algorithm might not implement predict();
+                  the default does nothing and returns None.
+        """
+        return None
+
+    def learn(self, inputs, next_inputs, states, next_states, episode_end,
+              actions, rewards):
+        """
+        This function computes a learning cost to be optimized.
+        The return should be the cost.
+        Output: cost(dict)
+
+        Optional: an algorithm might not implement learn();
+                  the default does nothing and returns None.
+        """
+        return None
+
+
+class RLAlgorithm(Algorithm):
+    """
+    A derived Algorithm class specially for RL problems.
+    """
+
+    def __init__(self, model, hyperparas, gpu_id):
+        super(RLAlgorithm, self).__init__(model, hyperparas, gpu_id)
+
+    def get_behavior_model(self):
+        """
+        Return the behavior model to compute actions. The behavior model could be different
+        from the training model, which is common in off-policy RL algorithms.
+
+        The default behavior model is set to the training model. The user can override this
+        function to specify another different model.
+        """
+        return self.model
+
+    def predict(self, inputs, states):
+        """
+        Implementation of Algorithm.predict()
+
+        Given the inputs and states, this function predicts actions and updates states.
+        Each action is obtained by calling the corresponding distribution
+        returned by the behavior model's policy().
+        Input: inputs(dict), states(dict)
+        Output: actions(dict), states(dict)
+        """
+        behavior_model = self.get_behavior_model()
+        distributions, states = behavior_model.policy(inputs, states)
+        actions = {}
+        ## items() instead of iteritems() so that this also runs on Python 3
+        for key, dist in distributions.items():
+            assert isinstance(
+                dist, pd.PolicyDistribution
+            ), "behavior_model.policy must return PolicyDist!"
+            actions[key] = dist()
+        return actions, states
--- a/parl/framework/computation_task.py
+++ b/parl/framework/computation_task.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.fluid as fluid
+import parl.layers as layers
+from parl.framework.algorithm import Model, Algorithm
+
+
+def split_list(l, sizes):
+    """
+    Cut l into consecutive pieces, the i-th piece holding sizes[i] items.
+    Pieces are taken from the front; items beyond sum(sizes) are dropped.
+    Output: list of slices of l, one per entry in sizes
+    """
+    pieces = []
+    start = 0
+    for count in sizes:
+        stop = start + count
+        pieces.append(l[start:stop])
+        start = stop
+    return pieces
+
+
+class ComputationTask(object):
+    """
+    A ComputationTask is responsible for the general data flow
+    outside the algorithm
+
+    A ComputationTask is created in a bottom-up way:
+    a. create a Model
+    b. create an Algorithm with the model as an input
+    c. define a ComputationTask with the algorithm
+    """
+
+    def __init__(self, algorithm):
+        """
+        Define the predict and learn programs of the algorithm, then create
+        a Fluid executor (on CPU when the algorithm's gpu_id is negative,
+        otherwise on that GPU) and run the default startup program.
+        """
+        assert isinstance(algorithm, Algorithm)
+        self.alg = algorithm
+        ## create an Fluid executor
+        self._define_program()
+        place = fluid.CPUPlace() if self.alg.gpu_id < 0 \
+                else fluid.CUDAPlace(self.alg.gpu_id)
+        self.fluid_executor = fluid.Executor(place)
+        self.fluid_executor.run(fluid.default_startup_program())
+
+    def _create_data_layers(self, specs):
+        """
+        Create one data layer per (name, args) spec.
+        Output: dict mapping each spec name to its data layer
+        """
+        data_layers = {}
+        for name, args in specs:
+            data_layers[name] = layers.data(name, **args)
+        return data_layers
+
+    def _define_program(self):
+        """
+        Build self.predict_program and self.learn_program.
+
+        The predict program is fed the inputs and states and calls the
+        algorithm's predict(). The learn program is additionally fed the
+        "next_"-prefixed inputs and states, the actions, the rewards and
+        "episode_end", and calls the algorithm's learn().
+        The sorted feed names of both programs are recorded so that
+        predict() and learn() can validate the data they are given.
+        """
+        self.learn_program = fluid.Program()
+        self.predict_program = fluid.Program()
+
+        def _get_next_specs(specs):
+            ## same specs, with every name prefixed by "next_"
+            return [("next_" + spec[0], spec[1]) for spec in specs]
+
+        def _select_data(data_layer_dict, specs):
+            ## the sub-dict of data layers named by specs
+            return {name: data_layer_dict[name] for name, _ in specs}
+
+        input_specs = self.alg.get_input_specs()
+        state_specs = self.alg.get_state_specs()
+        next_input_specs = _get_next_specs(input_specs)
+        next_state_specs = _get_next_specs(state_specs)
+        action_specs = self.alg.get_action_specs()
+        reward_specs = self.alg.get_reward_specs()
+        episode_end_specs = [("episode_end", dict(shape=[1]))]
+
+        self.action_names = sorted([name for name, _ in action_specs])
+        self.state_names = sorted([name for name, _ in state_specs])
+
+        with fluid.program_guard(self.predict_program):
+            data_layer_dict = self._create_data_layers(input_specs)
+            data_layer_dict.update(self._create_data_layers(state_specs))
+            self.predict_feed_names = sorted(data_layer_dict.keys())
+
+            inputs = _select_data(data_layer_dict, input_specs)
+            states = _select_data(data_layer_dict, state_specs)
+
+            ### call alg predict()
+            pred_actions, pred_states = self.alg.predict(inputs, states)
+            self.predict_fetch = [pred_actions, pred_states]
+
+        with fluid.program_guard(self.learn_program):
+            data_layer_dict = self._create_data_layers(input_specs)
+            data_layer_dict.update(self._create_data_layers(state_specs))
+            data_layer_dict.update(self._create_data_layers(next_input_specs))
+            data_layer_dict.update(self._create_data_layers(next_state_specs))
+            data_layer_dict.update(self._create_data_layers(action_specs))
+            data_layer_dict.update(self._create_data_layers(reward_specs))
+            data_layer_dict.update(self._create_data_layers(episode_end_specs))
+            self.learn_feed_names = sorted(data_layer_dict.keys())
+
+            inputs = _select_data(data_layer_dict, input_specs)
+            states = _select_data(data_layer_dict, state_specs)
+            next_inputs = _select_data(data_layer_dict, next_input_specs)
+            next_states = _select_data(data_layer_dict, next_state_specs)
+            actions = _select_data(data_layer_dict, action_specs)
+            rewards = _select_data(data_layer_dict, reward_specs)
+            episode_end = _select_data(data_layer_dict, episode_end_specs)
+
+            ## call alg learn()
+            ### TODO: implement a recurrent layer to strip the sequence information
+            self.cost = self.alg.learn(inputs, next_inputs, states,
+                                       next_states, episode_end, actions,
+                                       rewards)
+
+    def predict(self, inputs, states=dict()):
+        """
+        ComputationTask predict API
+        This function is responsible to convert Python data to Fluid tensors, and
+        then convert the computational results in the reverse way.
+
+        Input: inputs(dict), states(dict): together they must provide
+               exactly the feed names of the predict program
+        Output: actions(dict), states(dict): the fetched results, keyed by
+                action name and state name respectively
+        """
+        data = {}
+        data.update(inputs)
+        data.update(states)
+        assert sorted(data.keys()) == self.predict_feed_names, \
+            "field names mismatch: %s %s" % (data.keys(), self.predict_feed_names)
+        feed = {n: data[n] for n in self.predict_feed_names}
+
+        ### run the predict_program and fetch the computational results
+        action_tensors, state_tensors = self.predict_fetch
+        ## items() instead of iteritems() so that this also runs on Python 3
+        action_tensors = list(action_tensors.items())
+        state_tensors = list(state_tensors.items())
+        result = self.fluid_executor.run(
+            self.predict_program,
+            feed=feed,
+            fetch_list=[t for _, t in action_tensors + state_tensors])
+
+        ## actions and states are numpy arrays
+        actions, states = split_list(
+            result, [len(action_tensors), len(state_tensors)])
+
+        ## wrap the results into dictionaries for better access
+        actions = dict(zip([name for name, _ in action_tensors], actions))
+        states = dict(zip([name for name, _ in state_tensors], states))
+        assert sorted(actions.keys()) == self.action_names
+        assert sorted(states.keys()) == self.state_names
+        return actions, states
+
+    def learn(self,
+              inputs,
+              next_inputs,
+              episode_end,
+              actions,
+              rewards,
+              states=dict(),
+              next_states=dict()):
+        """
+        ComputationTask learn API
+        This function is responsible to convert Python data to Fluid tensors, and
+        then convert the computational results in the reverse way.
+
+        Input: all arguments are dicts; together they must provide exactly
+               the feed names of the learn program
+        Output: dict(cost=...): the fetched value of the algorithm's "cost"
+        The algorithm's before_every_batch() and after_every_batch()
+        callbacks are invoked around the run of the learn program.
+        """
+        data = {}
+        data.update(inputs)
+        data.update(next_inputs)
+        data.update(states)
+        data.update(next_states)
+        data.update(episode_end)
+        data.update(actions)
+        data.update(rewards)
+        ## the format string needs its two arguments; an empty tuple would
+        ## raise a TypeError instead of reporting the mismatch
+        assert sorted(data.keys()) == self.learn_feed_names, \
+            "field names mismatch: %s %s" % (sorted(data.keys()),
+                                             self.learn_feed_names)
+        feed = {n: data[n] for n in self.learn_feed_names}
+
+        self.alg.before_every_batch()
+        ## run the learn program and fetch the sole cost output
+        result = self.fluid_executor.run(self.learn_program,
+                                         feed=feed,
+                                         fetch_list=[self.cost["cost"]])
+        self.alg.after_every_batch()
+        return dict(cost=result[0])
--- a/parl/framework/policy_distribution.py
+++ b/parl/framework/policy_distribution.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import parl.layers as layers
+from paddle.fluid.framework import Variable
+from parl.layers import common_functions as comf
+from paddle.fluid.framework import convert_np_dtype_to_dtype_
+from abc import ABCMeta, abstractmethod
+
+
+class PolicyDistribution(object):
+    """
+    Base class of the action distributions returned by a model's policy.
+
+    It wraps a rank-2 `dist` object and exposes it, together with the size
+    of its second dimension, through the read-only attributes `dist` and
+    `dim`. Subclasses implement __call__ to sample an instance.
+    """
+    __metaclass__ = ABCMeta
+
+    def __init__(self, dist):
+        """
+        Input: dist: an object whose `shape` has exactly two entries
+        """
+        assert len(dist.shape) == 2
+        ## stored privately: assigning to self.dim / self.dist here would
+        ## shadow the accessors of the same name defined below
+        self._dim = dist.shape[1]
+        self._dist = dist
+
+    @abstractmethod
+    def __call__(self):
+        """
+        Implement __call__ to sample an instance.
+        """
+        pass
+
+    @property
+    def dim(self):
+        """
+        For discrete policies, this is the number of actions.
+        For continuous policies, this is the action vector length.
+        For sequential policies (e.g., sentences), this is the number
+        of choices at each step.
+        """
+        return self._dim
+
+    @property
+    def dist(self):
+        """
+        The `dist` object given to the constructor.
+        """
+        return self._dist
+
+    def loglikelihood(self, action):
+        """
+        Given an action, this function returns the log likelihood of this action under
+        the current distribution.
+        The base class raises NotImplementedError.
+        """
+        raise NotImplementedError()
+
+
+class CategoricalDistribution(PolicyDistribution):
+    """
+    A PolicyDistribution over a single discrete action.
+    """
+
+    def __init__(self, dist):
+        super(CategoricalDistribution, self).__init__(dist)
+
+    def __call__(self):
+        """
+        Sample an action by delegating to comf.categorical_random().
+        """
+        sampled = comf.categorical_random(self.dist)
+        return sampled
+
+    def loglikelihood(self, action):
+        """
+        Return the negated cross entropy between the distribution and
+        the given action label.
+        """
+        xent = layers.cross_entropy(input=self.dist, label=action)
+        return 0 - xent
+
+
+class Deterministic(PolicyDistribution):
+    """
+    A degenerate PolicyDistribution whose sample is `dist` itself.
+    """
+
+    def __init__(self, dist):
+        super(Deterministic, self).__init__(dist)
+        ## For deterministic action, we only support continuous ones
+        assert dist.dtype == convert_np_dtype_to_dtype_("float32") \
+            or dist.dtype == convert_np_dtype_to_dtype_("float64")
+
+    def __call__(self):
+        """
+        Return the wrapped `dist` unchanged.
+        """
+        return self.dist
+
+    def loglikelihood(self, action):
+        """
+        Not defined for a deterministic action; always raises
+        AssertionError.
+        """
+        ## raised explicitly: a bare `assert False` is stripped under
+        ## `python -O`, which would make this silently return None
+        raise AssertionError(
+            "You cannot compute likelihood for a deterministic action!")
+
+
+def q_categorical_distribution(q_value, exploration_rate=0.0):
+    """
+    Turn a Q value into a CategoricalDistribution.
+    The argmax of the Q value is expanded into a one-hot float32
+    distribution; unless exploration_rate is the float 0, that one-hot is
+    then mixed with a uniform distribution:
+        exploration_rate / num_actions + (1 - exploration_rate) * one_hot
+    """
+    assert len(q_value.shape) == 2, "[batch_size, num_actions]"
+    num_actions = q_value.shape[-1]
+
+    greedy_id = comf.argmax_layer(q_value)
+    one_hot = layers.one_hot(input=greedy_id, depth=num_actions)
+    prob = layers.cast(x=one_hot, dtype="float32")
+
+    ### exploration_rate could be a Variable, hence the type check before
+    ### comparing it with 0
+    is_zero_rate = isinstance(exploration_rate,
+                              float) and exploration_rate == 0
+    if is_zero_rate:
+        return CategoricalDistribution(prob)
+
+    uniform_part = exploration_rate / float(num_actions)
+    prob = uniform_part + (1 - exploration_rate) * prob
+    return CategoricalDistribution(prob)
--- a/parl/framework/tests/test_algorithm.py
+++ b/parl/framework/tests/test_algorithm.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.fluid as fluid
+import parl.layers as layers
+from parl.framework.algorithm import Model, RLAlgorithm
+from parl.layers import common_functions as comf
+from parl.model_zoo.simple_models import SimpleModelDeterministic
+import numpy as np
+from copy import deepcopy
+import unittest
+
+
+class TestAlgorithm(RLAlgorithm):
+    """
+    A bare RLAlgorithm for the tests: empty hyperparameters and gpu_id=-1.
+    """
+
+    def __init__(self, model):
+        no_hyperparas = dict()
+        super(TestAlgorithm, self).__init__(
+            model, hyperparas=no_hyperparas, gpu_id=-1)
+
+
+class TestAlgorithmParas(unittest.TestCase):
+    """
+    Tests for copying parameters from one model to its deep copy with
+    sync_paras_to().
+    """
+
+    def test_sync_paras_in_one_program(self):
+        """
+        Test case for copying parameters
+        """
+
+        alg1 = TestAlgorithm(model=SimpleModelDeterministic(
+            dims=10, mlp_layer_confs=[dict(size=10)]))
+        alg2 = deepcopy(alg1)
+
+        batch_size = 10
+        sensor = np.random.uniform(
+            0, 1, [batch_size, alg1.model.dims]).astype("float32")
+
+        program = fluid.Program()
+        startup_program = fluid.Program()
+        with fluid.program_guard(program, startup_program):
+            x = layers.data(name='x', shape=[alg1.model.dims], dtype="float32")
+            ## too early to sync before the layers are created: this must
+            ## fail. (A bare try/except around an assertTrue(False) would
+            ## swallow the test failure itself and could never fail.)
+            with self.assertRaises(Exception):
+                alg1.model.sync_paras_to(alg2.model, alg2.gpu_id)
+            ## first let the program generates the actual variables by using the
+            ## layer functions (before this step the layers haven't been instantiated yet!)
+            ## the call of predict() function already covers all the layers
+            y0, _ = alg1.predict(inputs=dict(sensor=x), states=dict())
+            y1, _ = alg2.predict(inputs=dict(sensor=x), states=dict())
+
+        ######################
+        exe = fluid.Executor(fluid.CPUPlace())
+        exe.run(startup_program)
+
+        outputs = exe.run(
+            program,
+            feed={'x': sensor},
+            ## y0 and y1 are two dictionaries
+            fetch_list=list(y0.values()) + list(y1.values()))
+
+        ## before the copy the two models are expected to differ
+        self.assertNotEqual(
+            np.sum(outputs[0].flatten()), np.sum(outputs[1].flatten()))
+
+        ## do the copying
+        alg1.model.sync_paras_to(alg2.model, alg2.gpu_id)
+
+        outputs = exe.run(
+            program,
+            feed={'x': sensor},
+            ## y0 and y1 are two dictionaries
+            fetch_list=list(y0.values()) + list(y1.values()))
+
+        ## after the copy the two models must produce the same output
+        self.assertEqual(
+            np.sum(outputs[0].flatten()), np.sum(outputs[1].flatten()))
+
+    def test_sync_paras_between_programs(self):
+        """
+        Test case for copying parameters between two different programs
+        """
+        alg1 = TestAlgorithm(model=SimpleModelDeterministic(
+            dims=10, mlp_layer_confs=[dict(size=10)]))
+        alg2 = deepcopy(alg1)
+
+        batch_size = 10
+        sensor = np.random.uniform(
+            0, 1, [batch_size, alg1.model.dims]).astype("float32")
+
+        startup_program = fluid.Program()
+        program1 = fluid.Program()
+        program2 = fluid.Program()
+
+        with fluid.program_guard(program1, startup_program):
+            x1 = layers.data(
+                name='x', shape=[alg1.model.dims], dtype="float32")
+            y1, _ = alg1.predict(inputs=dict(sensor=x1), states=dict())
+
+        with fluid.program_guard(program2, startup_program):
+            x2 = layers.data(
+                name='x', shape=[alg1.model.dims], dtype="float32")
+            y2, _ = alg2.predict(inputs=dict(sensor=x2), states=dict())
+
+        exe = fluid.Executor(fluid.CPUPlace())
+        exe.run(startup_program)
+
+        alg1.model.sync_paras_to(alg2.model, alg2.gpu_id)
+
+        ## y1 and y2 are two dictionaries
+        outputs1 = exe.run(program1,
+                           feed={'x': sensor},
+                           fetch_list=list(y1.values()))
+        outputs2 = exe.run(program2,
+                           feed={'x': sensor},
+                           fetch_list=list(y2.values()))
+        self.assertEqual(
+            np.sum(outputs1[0].flatten()), np.sum(outputs2[0].flatten()))
+
+
+## run the test cases of this file when it is executed as a script
+if __name__ == "__main__":
+    unittest.main()
--- a/parl/framework/tests/test_computation_task.py
+++ b/parl/framework/tests/test_computation_task.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.fluid as fluid
+import parl.layers as layers
+from parl.framework.algorithm import Model
+from parl.framework.computation_task import ComputationTask
+import parl.framework.policy_distribution as pd
+from parl.layers import common_functions as comf
+from parl.algorithm_zoo.simple_algorithms import SimpleAC, SimpleQ
+from parl.model_zoo.simple_models import SimpleModelDeterministic, SimpleModelAC, SimpleModelQ
+from test_algorithm import TestAlgorithm
+import numpy as np
+from copy import deepcopy
+import unittest
+import math
+
+
+class TestModelCNN(Model):
+    """
+    A test model with a single image input and a single discrete action:
+    one conv2d layer followed by an MLP that ends in a softmax over the
+    actions.
+    """
+
+    def __init__(self, width, height, num_actions):
+        super(TestModelCNN, self).__init__()
+        self.conv = layers.conv2d(
+            num_filters=1, filter_size=3, bias_attr=False)
+        self.mlp = comf.MLP([
+            dict(size=32, act="relu", bias_attr=False),
+            dict(size=16, act="relu", bias_attr=False),
+            dict(size=num_actions, act="softmax", bias_attr=False),
+        ])
+        self.height = height
+        self.width = width
+
+    def get_input_specs(self):
+        ## image format CHW
+        return [("image", dict(shape=[1, self.height, self.width]))]
+
+    def get_action_specs(self):
+        return [("action", dict(shape=[1], dtype="int64"))]
+
+    def policy(self, inputs, states):
+        """
+        Return a categorical distribution for "action" computed from the
+        first input, and the states unchanged.
+        """
+        ## list(...) because dict.values() is not indexable on Python 3
+        image = list(inputs.values())[0]
+        conv = self.conv(input=image)
+        dist = pd.CategoricalDistribution(self.mlp(conv))
+        return dict(action=dist), states
+
+    def value(self, inputs, states):
+        """
+        Return a constant zero "v_value" with one row per row of the
+        first input.
+        """
+        ## list(...) because dict.values() is not indexable on Python 3
+        image = list(inputs.values())[0]
+        v_value = layers.fill_constant(
+            shape=[image.shape[0], 1], dtype="float32", value=0)
+        return dict(v_value=v_value)
+
+
+class TestComputationTask(unittest.TestCase):
+    def test_predict(self):
+        """
+        Check the frequencies of the actions predicted by AC-learning and
+        Q-learning computation tasks
+        """
+        num_actions = 4
+
+        def check_action_freq(feed, task, first_only):
+            counts = [0] * num_actions
+            for _ in range(2000):
+                actions, states = task.predict(inputs=feed)
+                assert not states, "states should be empty"
+                ## actions["action"] is a batch of actions
+                for a in actions["action"]:
+                    counts[a[0]] += 1
+
+            if first_only:
+                ### the first action will always be chosen
+                expected = [1.0] + [0.0] * (num_actions - 1)
+            else:
+                ### the actions should be uniform
+                expected = [1.0 / num_actions] * num_actions
+
+            num_samples = float(sum(counts))
+            for count, expected_prob in zip(counts, expected):
+                self.assertAlmostEqual(
+                    count / num_samples, expected_prob, places=1)
+
+        dims = 100
+
+        ac_model = SimpleModelAC(
+            dims=dims,
+            num_actions=num_actions,
+            mlp_layer_confs=[
+                dict(size=32, act="relu", bias_attr=False),
+                dict(size=16, act="relu", bias_attr=False),
+                dict(size=num_actions, act="softmax", bias_attr=False),
+            ])
+        ac = SimpleAC(model=ac_model)
+
+        cnn_model = TestModelCNN(width=84, height=84, num_actions=num_actions)
+        ac_cnn = SimpleAC(model=cnn_model)
+
+        q_model = SimpleModelQ(
+            dims=dims,
+            num_actions=num_actions,
+            mlp_layer_confs=[
+                dict(size=32, act="relu", bias_attr=False),
+                dict(size=16, act="relu", bias_attr=False),
+                dict(size=num_actions, bias_attr=False),
+            ])
+        q = SimpleQ(model=q_model)
+
+        batch_size = 10
+        height, width = 84, 84
+        ## all-zero inputs
+        sensor = np.zeros([batch_size, dims]).astype("float32")
+        image = np.zeros([batch_size, 1, height, width]).astype("float32")
+
+        ac_task = ComputationTask(algorithm=ac)
+        q_task = ComputationTask(algorithm=q)
+        cnn_task = ComputationTask(algorithm=ac_cnn)
+
+        check_action_freq(dict(sensor=sensor), ac_task, first_only=False)
+        check_action_freq(dict(sensor=sensor), q_task, first_only=True)
+        check_action_freq(dict(image=image), cnn_task, first_only=False)
+
+    def test_ct_para_sharing(self):
+        """
+        Two CTs created from the same algorithm object should give the same
+        predictions for the same input
+        """
+        model = SimpleModelDeterministic(
+            dims=10, mlp_layer_confs=[dict(size=10)])
+        alg = TestAlgorithm(model=model)
+        first_ct = ComputationTask(algorithm=alg)
+        second_ct = ComputationTask(algorithm=alg)
+
+        batch_size = 10
+        sensor = np.random.uniform(
+            0, 1, [batch_size, alg.model.dims]).astype("float32")
+        feed = dict(sensor=sensor)
+
+        first_out, _ = first_ct.predict(inputs=feed)
+        second_out, _ = second_ct.predict(inputs=feed)
+        first_sum = np.sum(first_out["continuous_action"].flatten())
+        second_sum = np.sum(second_out["continuous_action"].flatten())
+        self.assertEqual(first_sum, second_sum)
+
+    def test_ct_para_sync(self):
+        """
+        A CT built on a deep copy of the algorithm should predict differently
+        until the paras are synced, and identically afterwards
+        """
+        model = SimpleModelDeterministic(
+            dims=10, mlp_layer_confs=[dict(size=10)])
+        alg = TestAlgorithm(model=model)
+
+        ct0 = ComputationTask(algorithm=alg)
+        ct1 = ComputationTask(algorithm=deepcopy(alg))
+
+        batch_size = 10
+        sensor = np.random.uniform(
+            0, 1, [batch_size, ct0.alg.model.dims]).astype("float32")
+
+        def predicted_sums():
+            ## run both predictions first, then reduce each to a single sum
+            results = []
+            for ct in (ct0, ct1):
+                outputs, _ = ct.predict(inputs=dict(sensor=sensor))
+                results.append(outputs)
+            return [
+                np.sum(out["continuous_action"].flatten()) for out in results
+            ]
+
+        sum0, sum1 = predicted_sums()
+        self.assertNotEqual(sum0, sum1)
+
+        ct0.alg.model.sync_paras_to(ct1.alg.model, ct1.alg.gpu_id)
+
+        sum0, sum1 = predicted_sums()
+        self.assertEqual(sum0, sum1)
+
+    def test_ct_learning(self):
+        """
+        Train a SimpleAC task (on-policy) and a SimpleQ task (off-policy)
+        with a reward of 1 for action 0 and 0 for action 1, then check that
+        only action 0 is predicted
+        """
+        num_actions = 2
+        dims = 100
+        batch_size = 8
+        ones = np.ones([batch_size, dims]).astype("float32")
+        sensor = ones / dims  # normalize
+        next_sensor = np.zeros([batch_size, dims]).astype("float32")
+
+        for on_policy in [True, False]:
+            ## both models share the same hidden layers
+            hidden_confs = [
+                dict(size=64, act="relu", bias_attr=False),
+                dict(size=32, act="relu", bias_attr=False),
+            ]
+            if on_policy:
+                model = SimpleModelAC(
+                    dims=dims,
+                    num_actions=num_actions,
+                    mlp_layer_confs=hidden_confs +
+                    [dict(size=num_actions, act="softmax")])
+                alg = SimpleAC(model=model, hyperparas=dict(lr=1e-1))
+            else:
+                model = SimpleModelQ(
+                    dims=dims,
+                    num_actions=num_actions,
+                    mlp_layer_confs=hidden_confs + [dict(size=num_actions)])
+                alg = SimpleQ(
+                    model=model,
+                    update_ref_interval=100,
+                    hyperparas=dict(lr=1e-1))
+            ct = ComputationTask(algorithm=alg)
+
+            for _ in range(1000):
+                if on_policy:
+                    ## learn from the actions the task itself predicts
+                    predicted, _ = ct.predict(inputs=dict(sensor=sensor))
+                    actions = predicted["action"]
+                else:
+                    ## randomly assemble a batch
+                    actions = np.random.choice(
+                        [0, 1], size=(batch_size, 1),
+                        p=[0.5, 0.5]).astype("int")
+                rewards = (1 - actions).astype("float32")
+                episode_end = np.ones((batch_size, 1)).astype("float32")
+                cost = ct.learn(
+                    inputs=dict(sensor=sensor),
+                    next_inputs=dict(next_sensor=next_sensor),
+                    episode_end=dict(episode_end=episode_end),
+                    actions=dict(action=actions),
+                    rewards=dict(reward=rewards))
+
+            print("final cost: %f" % cost["cost"])
+
+            ### the policy should bias towards the first action
+            final, _ = ct.predict(inputs=dict(sensor=sensor))
+            for a in final["action"]:
+                self.assertEqual(a[0], 0)
+
+
+if __name__ == "__main__":
+    unittest.main()
--- a/parl/layers/common_functions.py
+++ b/parl/layers/common_functions.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import parl.layers as layers
+from paddle.fluid.framework import Variable
+
+
+class Feedforward(layers.Network):
+    """
+     A feedforward network can contain a sequence of components,
+     where each component can be either a LayerFunc or a Feedforward.
+     The purpose of this class is to create a collection of LayerFuncs that can
+     be easily copied from one Network to another.
+     Examples of feedforward networks can be MLP and CNN.
+     """
+
+    def __init__(self, components):
+        ## each component becomes the attribute "ff<index>"; the index is
+        ## zero-padded so that sorting the names restores the component order
+        for i, component in enumerate(components):
+            setattr(self, "ff%06d" % i, component)
+
+    def __call__(self, input):
+        """
+        Pass input through the components in order and return the output of
+        the last one (input itself if there are no components).
+        """
+        ## Only the attributes named by __init__ ("ff" followed by digits)
+        ## are components. A substring test such as '"ff" in attr' would also
+        ## pick up unrelated attributes like "offset" or "buffer".
+        names = sorted(
+            attr for attr in dir(self)
+            if attr.startswith("ff") and attr[2:].isdigit())
+        for name in names:
+            input = getattr(self, name)(input)
+        return input
+
+
+class MLP(Feedforward):
+    """
+    A Feedforward made of fc layers; each entry of multi_fc_layers is a dict
+    of keyword arguments for layers.fc
+    """
+
+    def __init__(self, multi_fc_layers):
+        fc_layers = [layers.fc(**conf) for conf in multi_fc_layers]
+        super(MLP, self).__init__(fc_layers)
+
+
+class CNN(Feedforward):
+    """
+    Image CNN: a Feedforward made of conv2d layers; each entry of
+    multi_conv_layers is a dict of keyword arguments for layers.conv2d
+    """
+
+    def __init__(self, multi_conv_layers):
+        conv_layers = [layers.conv2d(**conf) for conf in multi_conv_layers]
+        super(CNN, self).__init__(conv_layers)
+
+
+def categorical_random(prob):
+    """
+    Sample an id based on categorical distribution prob
+    """
+    cum_prob = layers.cumsum(x=prob)
+    threshold = layers.uniform_random_batch_size_like(
+        input=prob, min=0., max=1., shape=[-1])
+    ## the sampled id is the number of cumulative sums that are below the
+    ## uniform random threshold
+    below = layers.cast(cum_prob < threshold, 'int')
+    sampled = layers.reduce_sum(below, dim=-1)
+    ## append a trailing dimension of size 1 to the ids
+    return layers.reshape(sampled, sampled.shape + (1, ))
+
+
+def argmax_layer(input):
+    """
+    Return the index of the largest value of input, taken from a top-1 query
+    """
+    ## topk returns (values, indices); only the indices are needed
+    return layers.topk(input, 1)[1]
+
+
+def inner_prod(x, y):
+    """
+    Inner product of x and y: the elementwise product summed over the last dim
+    """
+    product = layers.elementwise_mul(x, y)
+    return layers.reduce_sum(product, dim=-1)
+
+
+def sum_to_one_norm_layer(input):
+    """
+    Divide input by the sum, over its last dim, of (input + eps)
+    """
+    eps = 1e-9  # avoid dividing 0
+    total = layers.reduce_sum(input + eps, dim=-1)
+    normalized = layers.elementwise_div(x=input, y=total, axis=0)
+    return normalized
+
+
+def idx_select(input, idx):
+    """
+    Select one entry from each row of input (a Variable with a shape of
+    length 2) according to idx, which is either an int, used for every row,
+    or a Variable. The selection is the inner product of input with the
+    one-hot encoding of idx.
+    """
+    assert isinstance(input, Variable)
+    assert len(input.shape) == 2
+    batch_size, num_entries = input.shape
+
+    if not isinstance(idx, int):
+        assert isinstance(idx, Variable)
+    else:
+        ## a constant int is expanded to a variable holding that id per row
+        idx = layers.fill_constant(
+            shape=[batch_size, 1], dtype="int64", value=idx)
+
+    one_hot = layers.one_hot(input=idx, depth=num_entries)
+    mask = layers.cast(x=one_hot, dtype="float32")
+    return inner_prod(mask, input)
--- a/parl/layers/layer_wrappers.py
+++ b/parl/layers/layer_wrappers.py
@@ -15,34 +15,16 @@
 Wrappers for fluid.layers so that the layers can share parameters conveniently.
 """

+from paddle.fluid.executor import fetch_var
+import paddle.fluid as fluid
 from paddle.fluid.layers import *
 from paddle.fluid.param_attr import ParamAttr
 import paddle.fluid.layers as layers
 import paddle.fluid.unique_name as unique_name
-import warnings
+from copy import deepcopy
 import inspect


-class LayerFunc(object):
-    def __init__(self, param_attr=False, bias_attr=False):
-        self.param_attr = param_attr
-        self.bias_attr = bias_attr
-
-    @property
-    def param_name(self):
-        if self.param_attr:
-            return self.param_attr.name
-        else:
-            return None
-
-    @property
-    def bias_name(self):
-        if self.bias_attr:
-            return self.bias_attr.name
-        else:
-            return None
-
-
 def update_attr_name(name, default_name, attr, is_bias):
    """
    Update the name in an attribute
@@ -73,13 +55,131 @@ def update_attr_name(name, default_name, attr, is_bias):
    return check_or_replace_name(new_name, attr)


+class LayerFunc(object):
+    """Base class of the layer wrappers; keeps the param and bias attrs."""
+
+    def __init__(self, param_attr=False, bias_attr=False):
+        self.param_attr = param_attr
+        self.bias_attr = bias_attr
+
+    def sync_paras_to(self, target_layer, gpu_id):
+        """
+        Copy the paras from self to a target layer.
+        A negative gpu_id selects CPUPlace, otherwise CUDAPlace(gpu_id).
+        """
+        ## isinstance can handle subclass
+        assert isinstance(target_layer, LayerFunc)
+        place = fluid.CPUPlace() if gpu_id < 0 else fluid.CUDAPlace(gpu_id)
+        attr_pairs = [(self.param_attr, target_layer.param_attr),
+                      (self.bias_attr, target_layer.bias_attr)]
+
+        for src_attr, target_attr in attr_pairs:
+            ## both layers must agree on whether this para exists
+            assert bool(src_attr) == bool(target_attr)
+            if not src_attr:
+                continue
+            src_var = fetch_var(src_attr.name)
+            target_var = fetch_var(target_attr.name, return_numpy=False)
+            target_var.set(src_var, place)
+
+    def __deepcopy__(self, memo):
+        """
+        Deep-copy the layer and give the copy newly generated para names.
+        """
+        cls = self.__class__
+        ## __new__ won't init the class, we need to do that ourselves
+        copied = cls.__new__(cls)
+        ## record in the memo that self has been copied to avoid recursive copying
+        memo[id(self)] = copied
+
+        ## first copy all content; items() exists on both Python 2 and
+        ## Python 3 dicts, whereas iteritems() is Python 2 only
+        for k, v in self.__dict__.items():
+            setattr(copied, k, deepcopy(v, memo))
+
+        ## then we need to create new para names for self.param_attr and self.bias_attr
+        def create_new_para_name(attr):
+            if attr:
+                assert attr.name, "attr should have a name already!"
+                ## remove the last number id but keep the name key
+                name_key = "_".join(attr.name.split("_")[:-1])
+                attr.name = unique_name.generate(name_key)
+
+        create_new_para_name(copied.param_attr)
+        create_new_para_name(copied.bias_attr)
+        ## We require the user to sync the parameter values later, because
+        ## this deepcopy is supposed to be called only before the startup
+        ## program. This function will cause the computation graph change, so
+        ## it cannot be called during the execution.
+        return copied
+
+    @property
+    def param_name(self):
+        """Name of the param attr, or None when the layer has no param."""
+        return self.param_attr.name if self.param_attr else None
+
+    @property
+    def bias_name(self):
+        """Name of the bias attr, or None when the layer has no bias."""
+        return self.bias_attr.name if self.bias_attr else None
+
+
+class Network(object):
+    """
+    A Network is an unordered set of LayerFuncs or Networks.
+    """
+
+    def sync_paras_to(self, target_net, gpu_id):
+        """
+        Copy the paras of every Network/LayerFunc attribute of self (also when
+        nested in tuples, lists, sets or dicts) to its namesake in target_net.
+        """
+        assert not target_net is self, "cannot copy between identical networks"
+        assert isinstance(target_net, Network)
+        assert self.__class__.__name__ == target_net.__class__.__name__, \
+            "must be the same class for para syncing!"
+
+        def sync(val, target_val):
+            if isinstance(val, (Network, LayerFunc)):
+                val.sync_paras_to(target_val, gpu_id)
+            elif isinstance(val, (tuple, list, set)):
+                ## NOTE(review): sets are paired in iteration order as before
+                for v, tv in zip(val, target_val):
+                    sync(v, tv)
+            elif isinstance(val, dict):
+                for k in val:
+                    assert k in target_val
+                    sync(val[k], target_val[k])
+            ## any other type (ints, strings, ...) is not copied
+        for attr in self.__dict__:
+            if attr in target_net.__dict__:
+                val, target_val = getattr(self, attr), getattr(target_net, attr)
+                assert type(val) == type(target_val)
+                sync(val, target_val)
+
+
+def check_caller_name():
+    """
+    Assert that the call stack contains the __init__() of a Network (or of a
+    subclass), i.e. that the layer is being created inside Network.__init__().
+    """
+    ## Frames that have no "self" local (plain functions, module level code)
+    ## yield None below and are simply skipped, so no exception has to be
+    ## caught and silently swallowed. any() also stops at the first frame
+    ## that matches instead of always walking the whole stack.
+    called_by_init = any(
+        isinstance(s[0].f_locals.get("self"), Network) and
+        s[0].f_code.co_name == "__init__" for s in inspect.stack())
+
+    assert called_by_init, "parl.layers can only be called in Network.__init__()!"
+
+
 def fc(size,
       num_flatten_dims=1,
       param_attr=None,
       bias_attr=None,
       use_mkldnn=False,
       act=None,
-       is_test=False,
       name=None):
    """
    Return a function that creates a paddle.fluid.layers.fc.
@@ -87,17 +187,18 @@ def fc(size,
    default_name = "fc"
    param_attr = update_attr_name(name, default_name, param_attr, False)
    bias_attr = update_attr_name(name, default_name, bias_attr, True)
+    check_caller_name()

    class FC_(LayerFunc):
        def __init__(self):
            super(FC_, self).__init__(param_attr, bias_attr)

-        def __call__(self, input):
+        def __call__(self, input, is_test=False):
            return layers.fc(input=input,
                             size=size,
                             num_flatten_dims=num_flatten_dims,
-                             param_attr=param_attr,
-                             bias_attr=bias_attr,
+                             param_attr=self.param_attr,
+                             bias_attr=self.bias_attr,
                             use_mkldnn=use_mkldnn,
                             act=act,
                             is_test=is_test)
@@ -116,6 +217,7 @@ def embedding(size,
    Return a function that creates a paddle.fluid.layers.embedding.
    """
    param_attr = update_attr_name(name, "embedding", param_attr, False)
+    check_caller_name()

    class Embedding_(LayerFunc):
        def __init__(self):
@@ -128,7 +230,7 @@ def embedding(size,
                is_sparse=is_sparse,
                is_distributed=is_distributed,
                padding_idx=padding_idx,
-                param_attr=param_attr,
+                param_attr=self.param_attr,
                dtype=dtype)

    return Embedding_()
@@ -150,6 +252,7 @@ def dynamic_lstm(size,
    default_name = "dynamic_lstm"
    param_attr = update_attr_name(name, default_name, param_attr, False)
    bias_attr = update_attr_name(name, default_name, bias_attr, True)
+    check_caller_name()

    class DynamicLstm_(LayerFunc):
        def __init__(self):
@@ -159,8 +262,8 @@ def dynamic_lstm(size,
            return layers.dynamic_lstm(
                input=input,
                size=size,
-                param_attr=param_attr,
-                bias_attr=bias_attr,
+                param_attr=self.param_attr,
+                bias_attr=self.bias_attr,
                use_peepholes=use_peepholes,
                is_reverse=is_reverse,
                gate_activation=gate_activation,
@@ -189,6 +292,7 @@ def dynamic_lstmp(size,
    default_name = "dynamic_lstmp"
    param_attr = update_attr_name(name, default_name, param_attr, False)
    bias_attr = update_attr_name(name, default_name, bias_attr, True)
+    check_caller_name()

    class DynamicLstmp_(LayerFunc):
        def __init__(self):
@@ -199,8 +303,8 @@ def dynamic_lstmp(size,
                input=input,
                size=size,
                proj_size=proj_size,
-                param_attr=param_attr,
-                bias_attr=bias_attr,
+                param_attr=self.param_attr,
+                bias_attr=self.bias_attr,
                use_peepholes=use_peepholes,
                is_reverse=is_reverse,
                gate_activation=gate_activation,
@@ -226,6 +330,7 @@ def dynamic_gru(size,
    default_name = "dynamic_gru"
    param_attr = update_attr_name(name, default_name, param_attr, False)
    bias_attr = update_attr_name(name, default_name, bias_attr, True)
+    check_caller_name()

    class DynamicGru_(LayerFunc):
        def __init__(self):
@@ -235,8 +340,8 @@ def dynamic_gru(size,
            return layers.dynamic_gru(
                input=input,
                size=size,
-                param_attr=param_attr,
-                bias_attr=bias_attr,
+                param_attr=self.param_attr,
+                bias_attr=self.bias_attr,
                is_reverse=is_reverse,
                gate_activation=gate_activation,
                candidate_activation=candidate_activation,
@@ -274,6 +379,7 @@ def sequence_conv(num_filters,
    default_name = "sequence_conv"
    param_attr = update_attr_name(name, default_name, param_attr, False)
    bias_attr = update_attr_name(name, default_name, bias_attr, True)
+    check_caller_name()

    class SequenceConv_(LayerFunc):
        def __init__(self):
@@ -286,8 +392,8 @@ def sequence_conv(num_filters,
                filter_size=filter_size,
                filter_stride=filter_stride,
                padding=padding,
-                bias_attr=bias_attr,
-                param_attr=param_attr,
+                bias_attr=self.bias_attr,
+                param_attr=self.param_attr,
                act=act)

    return SequenceConv_()
@@ -311,6 +417,7 @@ def conv2d(num_filters,
    default_name = "conv2d"
    param_attr = update_attr_name(name, default_name, param_attr, False)
    bias_attr = update_attr_name(name, default_name, bias_attr, True)
+    check_caller_name()

    class Conv2D_(LayerFunc):
        def __init__(self):
@@ -325,8 +432,8 @@ def conv2d(num_filters,
                padding=padding,
                dilation=dilation,
                groups=groups,
-                param_attr=param_attr,
-                bias_attr=bias_attr,
+                param_attr=self.param_attr,
+                bias_attr=self.bias_attr,
                use_cudnn=use_cudnn,
                use_mkldnn=use_mkldnn,
                act=act)
@@ -351,6 +458,7 @@ def conv2d_transpose(num_filters,
    default_name = "conv2d_transpose"
    param_attr = update_attr_name(name, default_name, param_attr, False)
    bias_attr = update_attr_name(name, default_name, bias_attr, True)
+    check_caller_name()

    class Conv2DTranspose_(LayerFunc):
        def __init__(self):
@@ -365,8 +473,8 @@ def conv2d_transpose(num_filters,
                padding=padding,
                stride=stride,
                dilation=dilation,
-                param_attr=param_attr,
-                bias_attr=bias_attr,
+                param_attr=self.param_attr,
+                bias_attr=self.bias_attr,
                use_cudnn=use_cudnn,
                act=act)

@@ -380,6 +488,7 @@ def lstm_unit(forget_bias=0.0, param_attr=None, bias_attr=None, name=None):
    default_name = "lstm_unit"
    param_attr = update_attr_name(name, default_name, param_attr, False)
    bias_attr = update_attr_name(name, default_name, bias_attr, True)
+    check_caller_name()

    class LstmUnit_(LayerFunc):
        def __init__(self):
@@ -391,8 +500,8 @@ def lstm_unit(forget_bias=0.0, param_attr=None, bias_attr=None, name=None):
                hidden_t_prev=hidden_t_prev,
                cell_t_prev=cell_t_prev,
                forget_bias=forget_bias,
-                param_attr=param_attr,
-                bias_attr=bias_attr)
+                param_attr=self.param_attr,
+                bias_attr=self.bias_attr)

    return LstmUnit_()

@@ -406,6 +515,7 @@ def row_conv(future_context_size, param_attr=None, act=None, name=None):
    Return a function that creates a paddle.fluid.layers.row_conv.
    """
    param_attr = update_attr_name(name, "row_conv", param_attr, False)
+    check_caller_name()

    class RowConv_(LayerFunc):
        def __init__(self):
@@ -415,7 +525,7 @@ def row_conv(future_context_size, param_attr=None, act=None, name=None):
            return layers.row_conv(
                input=input,
                future_context_size=future_context_size,
-                param_attr=param_attr,
+                param_attr=self.param_attr,
                act=act)

    return RowConv_()

--- a/parl/layers/tests/test_param_name.py
+++ b/parl/layers/tests/test_param_name.py
@@ -14,10 +14,11 @@

 import unittest
 import parl.layers as layers
+from parl.layers import Network


-class TestParamName(unittest.TestCase):
-    def test_name_number(self):
+class MyNetWork(Network):
+    def __init__(self):
        self.fc1 = layers.fc(100)
        self.fc2 = layers.fc(100)
        self.fc3 = layers.fc(100, bias_attr=False)
@@ -33,37 +34,36 @@ class TestParamName(unittest.TestCase):
            filter_size=3,
            param_attr=self.embedding.param_attr,
            name="my_conv2d")
-        self.dynamic_grus = []
-        for i in range(5):
-            self.dynamic_grus.append(layers.dynamic_gru(50))
+
+
+class TestParamName(unittest.TestCase):
+    def test_name_number(self):
+        net = MyNetWork()

        ## fc1 and fc2 have different parameters
-        self.assertEqual(self.fc1.param_name, "fc.w_0")
-        self.assertEqual(self.fc2.param_name, "fc.w_1")
+        self.assertEqual(net.fc1.param_name, "fc.w_0")
+        self.assertEqual(net.fc2.param_name, "fc.w_1")

        ## fc3 has no bias and fc4 has no param; so the names are None
-        self.assertEqual(self.fc3.bias_name, None)
-        self.assertEqual(self.fc4.param_name, None)
-        self.assertEqual(self.fc4.bias_name, "fc.b_3")
+        self.assertEqual(net.fc3.bias_name, None)
+        self.assertEqual(net.fc4.param_name, None)
+        self.assertEqual(net.fc4.bias_name, "fc.b_3")

        ## fc5 has a custom name without a bias
-        self.assertEqual(self.fc5.param_name, "fc.w_4")
-        self.assertEqual(self.fc5.bias_name, None)
+        self.assertEqual(net.fc5.param_name, "fc.w_4")
+        self.assertEqual(net.fc5.bias_name, None)

        ## embedding layer has no bias
-        self.assertEqual(self.embedding.param_name, "embedding.w_0")
-        self.assertEqual(self.embedding.bias_name, None)
+        self.assertEqual(net.embedding.param_name, "embedding.w_0")
+        self.assertEqual(net.embedding.bias_name, None)

        ## embedding layer with a custom name
-        self.assertEqual(self.embedding_custom.param_name,
+        self.assertEqual(net.embedding_custom.param_name,
                         "embedding_custom.w_0")

        ## conv2d shares param with embedding; has a custom bias name
-        self.assertEqual(self.conv2d.param_name, "embedding.w_0")
-        self.assertEqual(self.conv2d.bias_name, "my_conv2d.b_0")
-
-        for i, gru in enumerate(self.dynamic_grus):
-            self.assertEqual(gru.param_name, "dynamic_gru.w_%d" % i)
+        self.assertEqual(net.conv2d.param_name, "embedding.w_0")
+        self.assertEqual(net.conv2d.bias_name, "my_conv2d.b_0")


 if __name__ == '__main__':

--- a/parl/layers/tests/test_param_sharing.py
+++ b/parl/layers/tests/test_param_sharing.py
@@ -14,45 +14,67 @@

 import unittest
 import parl.layers as layers
+from parl.layers import Network
 import paddle.fluid as fluid
 import numpy as np


-class TestParamSharing(unittest.TestCase):
-    def __init__(self, *args, **kwargs):
-        super(TestParamSharing, self).__init__(*args, **kwargs)
+class MyNetWork(Network):
+    def __init__(self):
        self.fc1 = layers.fc(64, bias_attr=False)
        self.fc2 = layers.fc(64, bias_attr=False)
        self.fc3 = layers.fc(64, name="fc")
        self.fc4 = layers.fc(64, name="fc")
-        ## we bind the paras of self.embedding to those of self.fc1
        self.embedding = layers.embedding(
            (100, 64), param_attr=self.fc1.param_attr)

+
+class TestParamSharing(unittest.TestCase):
    def test_param_sharing(self):
        """
        Test case for parameter sharing between layers of the same type
        """
-        main_program = fluid.Program()
-        startup_program = fluid.Program()
+        net = MyNetWork()
+        ## we bind the paras of embedding to those of fc1
+        batch_size = 10
+        dict_size = 100
+        input_cx = np.random.uniform(0, 1, [batch_size, 100]).astype("float32")
+        input_x = np.random.randint(
+            dict_size, size=(batch_size, 1)).astype("int")
+        #################################

-        with fluid.program_guard(main_program, startup_program):
+        main_program1 = fluid.Program()
+        with fluid.program_guard(main_program1):
            x = layers.data(name='x', shape=[100], dtype="float32")
-            y1 = self.fc1(input=x)
-            y11 = self.fc1(input=x)
-            y2 = self.fc2(input=x)
-            y3 = self.fc3(input=x)
-            y4 = self.fc4(input=x)
+            y1 = net.fc1(input=x)
+            y11 = net.fc1(input=x)
+            y2 = net.fc2(input=x)
+            y3 = net.fc3(input=x)
+            y4 = net.fc4(input=x)

+        main_program2 = fluid.Program()
+        with fluid.program_guard(main_program2):
+            x_ = layers.data(name='x', shape=[1], dtype="int")
+            cx_ = layers.cast(
+                x=layers.one_hot(
+                    input=x_, depth=dict_size), dtype="float32")
+            y1_ = net.fc1(input=cx_)
+            y2_ = net.embedding(input=x_)
+
+            x1_ = layers.data(name='x1', shape=[100], dtype="float32")
+            y3_ = net.fc1(input=x1_)
+
+        #### we run the startup program only once to make sure
+        #### only one para init across the two programs
        place = fluid.CPUPlace()
        exe = fluid.Executor(place)
-        exe.run(startup_program)
-        batch_size = 10
-        input_x = np.random.uniform(0, 1, [batch_size, 100]).astype("float32")
-        outputs = exe.run(main_program,
-                          feed={"x": input_x},
-                          fetch_list=[y1, y11, y2, y3, y4])
+        exe.run(fluid.default_startup_program())
+        ######################################################

+        outputs = exe.run(main_program1,
+                          feed={"x": input_cx},
+                          fetch_list=[y1, y11, y2, y3, y4])
+        old_y1 = outputs[0]
        self.assertEqual(
            np.sum(outputs[0].flatten()), np.sum(outputs[1].flatten()))
        self.assertNotEqual(
@@ -60,35 +82,17 @@ class TestParamSharing(unittest.TestCase):
        self.assertNotEqual(
            np.sum(outputs[3].flatten()), np.sum(outputs[4].flatten()))

-    def test_manual_param_sharing(self):
-        """
-        Test case for parameter sharing between layers of different types
-        """
-        batch_size = 10
-        dict_size = 100
-
-        main_program = fluid.Program()
-        startup_program = fluid.Program()
-        with fluid.program_guard(main_program, startup_program):
-            x = layers.data(name='x', shape=[1], dtype="int")
-            cx = layers.cast(
-                x=layers.one_hot(
-                    input=x, depth=dict_size), dtype="float32")
-            ## remove bias because embedding layer does not have one
-            y1 = self.fc1(input=cx)
-            y2 = self.embedding(input=x)
-
-        place = fluid.CPUPlace()
-        exe = fluid.Executor(place)
-        exe.run(startup_program)
-        input_x = np.random.randint(
-            dict_size, size=(batch_size, 1)).astype("int")
-        outputs = exe.run(main_program,
-                          feed={'x': input_x},
-                          fetch_list=[y1, y2])
+        outputs = exe.run(main_program2,
+                          feed={'x': input_x,
+                                'x1': input_cx},
+                          fetch_list=[y1_, y2_, y3_])

+        ### test two different layers sharing the same para matrix
        self.assertEqual(
            np.sum(outputs[0].flatten()), np.sum(outputs[1].flatten()))
+        ### test if the same layer can have the same parameters across two different programs
+        self.assertEqual(
+            np.sum(outputs[2].flatten()), np.sum(old_y1.flatten()))


 if __name__ == "__main__":

--- a/parl/model_zoo/__init__.py
+++ b/parl/model_zoo/__init__.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
--- a/parl/model_zoo/simple_models.py
+++ b/parl/model_zoo/simple_models.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import parl.layers as layers
+from parl.framework.algorithm import Model
+import parl.framework.policy_distribution as pd
+from parl.layers import common_functions as comf
+
+
+class SimpleModelDeterministic(Model):
+    """
+    A model whose policy is a deterministic function of its first input.
+
+    The first value found in ``inputs`` is passed through an MLP built from
+    ``mlp_layer_confs`` and the result is wrapped in ``pd.Deterministic``.
+    """
+
+    def __init__(self, dims, mlp_layer_confs):
+        """
+        Args:
+            dims: size used for both the "sensor" input spec and the
+                "continuous_action" action spec.
+            mlp_layer_confs: layer configurations forwarded to ``comf.MLP``.
+        """
+        super(SimpleModelDeterministic, self).__init__()
+        self.dims = dims
+        # NOTE(review): the action spec below reuses `dims`, so the last MLP
+        # layer presumably has to output `dims` units -- confirm with callers.
+        self.mlp = comf.MLP(mlp_layer_confs)
+
+    def get_input_specs(self):
+        """
+        Return a list with a single ("sensor", spec) pair of shape [dims].
+        """
+        return [("sensor", dict(shape=[self.dims]))]
+
+    def get_action_specs(self):
+        """
+        Return a list with a single ("continuous_action", spec) pair of
+        shape [dims].
+        """
+        return [("continuous_action", dict(shape=[self.dims]))]
+
+    def policy(self, inputs, states):
+        """
+        Build the policy from the first value of ``inputs``.
+
+        Args:
+            inputs: a dict-like object; only its first value is used.
+            states: returned unchanged.
+
+        Returns:
+            A tuple ``(dict(continuous_action=<pd.Deterministic>), states)``.
+
+        Raises:
+            IndexError: if ``inputs`` has no values.
+        """
+        # dict.values() is a non-subscriptable view on Python 3, so it is
+        # materialized into a list before taking the first element.
+        hidden = self.mlp(list(inputs.values())[0])
+        return dict(continuous_action=pd.Deterministic(hidden)), states
+
+
+class SimpleModelAC(Model):
+    """
+    A simple actor-critic model with a categorical policy and a scalar value.
+
+    All but the last entry of ``mlp_layer_confs`` form an MLP used by both
+    the policy and the value; the last entry (which must use the "softmax"
+    activation) forms the policy head, and a separate fc layer of size 1
+    forms the value head.
+    """
+
+    def __init__(self, dims, num_actions, mlp_layer_confs):
+        """
+        Args:
+            dims: size of the "sensor" input.
+            num_actions: number of discrete actions; stored on the instance
+                but not otherwise used by this class.
+            mlp_layer_confs: layer configurations; the last one must have
+                ``"act" == "softmax"``.
+        """
+        super(SimpleModelAC, self).__init__()
+        self.dims = dims
+        # Kept on the instance for consistency with SimpleModelQ.
+        self.num_actions = num_actions
+        assert mlp_layer_confs[-1]["act"] == "softmax", \
+            "last layer should use softmax act"
+        self.mlp = comf.MLP(mlp_layer_confs[:-1])
+        self.policy_mlp = comf.MLP(mlp_layer_confs[-1:])
+        self.value_layer = layers.fc(size=1)
+
+    def get_input_specs(self):
+        """
+        Return a list with a single ("sensor", spec) pair of shape [dims].
+        """
+        return [("sensor", dict(shape=[self.dims]))]
+
+    def get_action_specs(self):
+        """
+        Return a list with a single ("action", spec) pair of shape [1] and
+        dtype "int64".
+        """
+        return [("action", dict(shape=[1], dtype="int64"))]
+
+    def _perceive(self, inputs, states):
+        """
+        Run the MLP on the first value of ``inputs``; ``states`` is
+        ignored.
+
+        Raises:
+            IndexError: if ``inputs`` has no values.
+        """
+        # dict.values() is a non-subscriptable view on Python 3, so it is
+        # materialized into a list before taking the first element.
+        return self.mlp(list(inputs.values())[0])
+
+    def policy(self, inputs, states):
+        """
+        Returns:
+            A tuple ``(dict(action=<pd.CategoricalDistribution>), states)``
+            where the distribution is built from the policy head's output
+            and ``states`` is passed through unchanged.
+        """
+        dist = pd.CategoricalDistribution(
+            self.policy_mlp(self._perceive(inputs, states)))
+        return dict(action=dist), states
+
+    def value(self, inputs, states):
+        """
+        Returns:
+            ``dict(v_value=...)`` holding the output of the value layer
+            applied to the perceived features.
+        """
+        return dict(v_value=self.value_layer(self._perceive(inputs, states)))
+
+
+class SimpleModelQ(Model):
+    """
+    A simple Q-value model.
+
+    An MLP built from ``mlp_layer_confs`` maps the first value of ``inputs``
+    to Q values; the policy is derived from those Q values through
+    ``pd.q_categorical_distribution``.
+    """
+
+    def __init__(self,
+                 dims,
+                 num_actions,
+                 mlp_layer_confs,
+                 estimated_total_num_batches=0):
+        """
+        Args:
+            dims: size of the "sensor" input.
+            num_actions: number of discrete actions; stored on the instance.
+            mlp_layer_confs: layer configurations forwarded to ``comf.MLP``;
+                the last one must not specify an "act" key.
+            estimated_total_num_batches: stored on the instance; not used
+                by the methods defined here.
+        """
+        super(SimpleModelQ, self).__init__()
+        self.dims = dims
+        self.num_actions = num_actions
+        assert "act" not in mlp_layer_confs[-1], "should be linear act"
+        self.mlp = comf.MLP(mlp_layer_confs)
+        self.estimated_total_num_batches = estimated_total_num_batches
+
+    def get_input_specs(self):
+        """
+        Return a list with a single ("sensor", spec) pair of shape [dims].
+        """
+        return [("sensor", dict(shape=[self.dims]))]
+
+    def get_action_specs(self):
+        """
+        Return a list with a single ("action", spec) pair of shape [1] and
+        dtype "int64".
+        """
+        return [("action", dict(shape=[1], dtype="int64"))]
+
+    def policy(self, inputs, states):
+        """
+        Returns:
+            A tuple ``(dict(action=<distribution>), states)`` where the
+            distribution comes from ``pd.q_categorical_distribution``
+            applied to the "q_value" entry of ``self.value(...)`` and
+            ``states`` is passed through unchanged.
+        """
+        values = self.value(inputs, states)
+        q_value = values["q_value"]
+        return dict(action=pd.q_categorical_distribution(q_value)), states
+
+    def value(self, inputs, states):
+        """
+        Returns:
+            ``dict(q_value=...)`` holding the MLP output for the first
+            value of ``inputs``; ``states`` is ignored.
+
+        Raises:
+            IndexError: if ``inputs`` has no values.
+        """
+        # dict.values() is a non-subscriptable view on Python 3, so it is
+        # materialized into a list before taking the first element.
+        return dict(q_value=self.mlp(list(inputs.values())[0]))