diff --git a/benchmark/torch/coma/.benchmark/3m_result.png b/benchmark/torch/coma/.benchmark/3m_result.png
new file mode 100644
index 0000000000000000000000000000000000000000..3d5343bc1920a0e96ded07ff45f05b79ecd979a3
Binary files /dev/null and b/benchmark/torch/coma/.benchmark/3m_result.png differ
diff --git a/benchmark/torch/coma/README.md b/benchmark/torch/coma/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4b34f4ad773a128b8192cdc617a7bf26f9893562
--- /dev/null
+++ b/benchmark/torch/coma/README.md
@@ -0,0 +1,59 @@
+## Reproduce COMA with PARL
+
+This is a PARL + PyTorch implementation of the multi-agent reinforcement learning algorithm COMA.
+
+### Paper
+- [Counterfactual Multi-Agent Policy Gradients](https://arxiv.org/abs/1705.08926)
+
+### Benchmark Result
+Mean win_rate (evaluated over 5 episodes) for 1000 epochs of training (1 epoch = 5 episodes).
+
+coma-3m
+
+
+
+
+## StarCraft II Installation
+The environment is based on the full game of StarCraft II (version >= 3.16.1). To install the game, follow the commands below, or see more details in [SMAC](https://github.com/oxwhirl/smac#installing-starcraft-ii). MacOS/Windows users need to run this example in Docker, as the StarCraft II environment does not support these two systems.
+
+### Linux
+```shell
+$ cd starcraft2
+$ SC2PATH=~ bash install_sc2.sh
+```
+### MacOS (use Docker)
+```shell
+$ cd starcraft2
+$ bash build_docker.sh # build the Docker image
+$ bash install_sc2.sh # download StarCraft II and the maps
+```
+### Windows (use Docker)
+- Step 1: Build the Docker image: `cd starcraft2 && bash build_docker.sh`
+- Step 2: Download a [StarCraft II package](https://github.com/Blizzard/s2client-proto#linux-packages) and unzip it to the folder `starcraft2/StarCraftII` (password: `iagreetotheeula`)
+- Step 3: Download the [SMAC maps](https://github.com/oxwhirl/smac/releases/download/v0.1-beta1/SMAC_Maps.zip) and unzip them to the folder `starcraft2/StarCraftII/Maps/SMAC_Maps`
+
+
+## How to use
+### Dependencies
+- python3.5+
+- parl
+- torch
+- [SMAC](https://github.com/oxwhirl/smac)
+
+### Start Training
+#### Linux
+```shell
+$ python3 train.py
+```
+#### MacOS/Windows (use Docker)
+```shell
+$ cd coma
+$ NV_GPU=$your_gpu_id docker run --name $your_container_name --user $(id -u):$(id -g) -v `pwd`:/parl -t parl-starcraft2:1.0 python3 train.py
+```
+*You can also run Docker interactively with `docker run --name $your_container_name -it -v $your_host_path:/parl -t parl-starcraft2:1.0 /bin/bash`*
+
+
+
+### Reference
+- [StarCraft](https://github.com/starry-sky6688/StarCraft)
+- [pymarl](https://github.com/oxwhirl/pymarl)
diff --git a/benchmark/torch/coma/coma_config.py b/benchmark/torch/coma/coma_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..25d6971efe4695f7816cf47148835a1e9d421bc6
--- /dev/null
+++ b/benchmark/torch/coma/coma_config.py
@@ -0,0 +1,47 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
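Before launching a full training run, it can help to confirm that StarCraft II and the SMAC maps from the installation steps in the README above are visible to Python. A minimal sanity-check sketch (not part of this PR's files; it assumes SMAC is installed and `SC2PATH` is set as in `install_sc2.sh`):

```python
# Quick check that StarCraft II and the SMAC 3m map are installed correctly.
from smac.env import StarCraft2Env

env = StarCraft2Env(map_name='3m')           # same map as coma_config.py
env_info = env.get_env_info()
print('n_agents :', env_info['n_agents'])    # 3 on the 3m map
print('n_actions:', env_info['n_actions'])   # 9 on the 3m map
print('obs_shape:', env_info['obs_shape'])   # 30 on the 3m map
env.close()
```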
+
+# Arguments of COMA
+config = {
+    # ========== Environment ==========
+    'difficulty': '3',  # The difficulty of the game
+    'map': '3m',  # The map of the game
+    'env_seed': None,  # Environment random seed
+    'replay_dir': '',  # Directory to save replays (not available on Ubuntu)
+
+    # ========== Learn ==========
+    'gamma': 0.99,
+    'grad_norm_clip': 10,  # Gradient clipping, prevents gradient explosion
+    'td_lambda': 0.8,  # Lambda of the td-lambda return
+    'actor_lr': 1e-4,
+    'critic_lr': 1e-3,
+    'target_update_cycle': 200,  # How often to update the target network
+
+    # ========== Epsilon-greedy ==========
+    'epsilon': 0.5,
+    'anneal_epsilon': 0.00064,
+    'min_epsilon': 0.02,
+    # 'epsilon_anneal_scale' : 'epoch',
+
+    # ========== Other ==========
+    'n_epoch': 5000,  # The number of epochs to train the agent
+    'n_episodes': 5,  # The number of episodes in one epoch
+    'test_episode_n': 20,  # The number of episodes used to evaluate the agent
+    'threshold': 19,  # The reward threshold used to judge whether an episode is won
+    'test_cycle': 5,  # How often to evaluate (every 'test_cycle' epochs)
+    'save_cycle': 1000,  # How often to save the model
+    'model_dir': './model',  # The directory to save/restore the model
+    'test': False,  # Evaluate the model and quit (no training)
+    'restore': False  # Restore the model or not
+}
diff --git a/benchmark/torch/coma/sc2_agent.py b/benchmark/torch/coma/sc2_agent.py
new file mode 100644
index 0000000000000000000000000000000000000000..1eb9bcdf802175ff0d579c095b351e101d42351d
--- /dev/null
+++ b/benchmark/torch/coma/sc2_agent.py
@@ -0,0 +1,237 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
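For reference, `train.py` (further below) decays `epsilon` by `anneal_epsilon` once per epoch while it is above `min_epsilon`, so the config values above anneal exploration over roughly 750 epochs; a small sketch of that arithmetic:

```python
# Epsilon schedule implied by the config above (decayed once per epoch in train.py).
epsilon, anneal_epsilon, min_epsilon = 0.5, 0.00064, 0.02

epochs_to_floor = (epsilon - min_epsilon) / anneal_epsilon
print(epochs_to_floor)  # ~750 epochs, i.e. ~3750 episodes with n_episodes = 5
```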
+ +import numpy as np +import parl +import torch +from torch.distributions import Categorical + +device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + + +class Agents(parl.Agent): + def __init__(self, algorithm, config): + self.n_actions = config['n_actions'] + self.n_agents = config['n_agents'] + self.state_shape = config['state_shape'] + self.obs_shape = config['obs_shape'] + + self.config = config + self.train_steps = 0 + self.rnn_h = None + super(Agents, self).__init__(algorithm) + print('Init all agents') + + def init_hidden(self): + """ function: init a hidden tensor for every agent at the begging of every episode + self.rnn_h: rnn hidden state, shape (n_agents, hidden_size) + """ + self.rnn_h = self.alg.init_hidden(1)[0] + + def predict(self, obs, rnn_h_in): + """input: + obs: obs + last_action + agent_id, shape: (1, obs_shape + n_actions + n_agents) + rnn_h_in: rnn's hidden input + output: + prob: output of actor, shape: (1, n_actions) + rnn_h_out: rnn's hidden output + """ + obs = np.expand_dims(obs, 0) + obs = torch.tensor(obs, dtype=torch.float32).to(device) + prob, rnn_h_out = self.alg.predict(obs, rnn_h_in) + return prob, rnn_h_out + + def sample(self, + obs, + last_action, + agent_id, + avail_actions, + epsilon, + test=False): + """input: + obs (array): agent i's obs + last_action (int): agent i's last action + agent_id (int): agent index + avail_actions (one_hot): available actions + epsilon (float): e_greed discount + test (bool): train or test + output: + action: int + prob: probability of every action, float, 0 ~ 1 + """ + obs = obs.copy() + # make obs: obs + agent's last action(one_hot) + agent's id(one_hot) + last_act_one_hot = np.zeros(self.n_actions) + last_act_one_hot[last_action] = 1. + id_one_hot = np.zeros(self.n_agents) + id_one_hot[agent_id] = 1. + obs = np.hstack((obs, last_act_one_hot)) + obs = np.hstack((obs, id_one_hot)) + + # predict action prob + prob, self.rnn_h[agent_id] = self.predict(obs, self.rnn_h[agent_id]) + + # add noise + avail_actions = torch.tensor( + avail_actions, dtype=torch.float32).unsqueeze(0).to( + device) # shape: (1, n_actions) + action_num = avail_actions.sum() # how many actions are available + prob = ((1 - epsilon) * prob + + torch.ones_like(prob) * epsilon / action_num) + prob[avail_actions == 0] = 0.0 # set avail action + + # choose action + if epsilon == 0 or test: + action = torch.argmax(prob) + else: + action = Categorical(prob).sample().long() + return action.cpu() + + def _get_actor_inputs(self, batch): + """ o(t), u(t-1)_a, agent_id + """ + obs = batch['o'] + u_onehot = batch['u_onehot'] + u_onehot_last = np.zeros_like(u_onehot) + u_onehot_last[:, 1:] = u_onehot[:, :-1] + ep_num = batch['o'].shape[0] + tr_num = batch['o'].shape[1] + + actor_inputs = [] + for agent_id in range(self.n_agents): + obs_a = obs[:, :, agent_id] + u_a_onehot_last = u_onehot_last[:, :, agent_id] + id_onehot = np.zeros((ep_num, tr_num, self.n_agents)) + id_onehot[:, :, agent_id] = 1. 
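The `sample()` method above mixes the actor's distribution with a uniform distribution over the currently available actions and masks out unavailable ones before sampling. A standalone toy sketch of that rule (the probabilities and availability mask below are made-up numbers, not real agent output):

```python
import torch
from torch.distributions import Categorical

# Toy illustration of the epsilon-mixing and masking rule used in sample().
prob = torch.tensor([[0.1, 0.6, 0.2, 0.1]])       # actor output (made-up)
avail_actions = torch.tensor([[1., 1., 0., 1.]])  # action 2 is unavailable
epsilon = 0.5

action_num = avail_actions.sum()                  # 3 available actions
prob = (1 - epsilon) * prob + torch.ones_like(prob) * epsilon / action_num
prob[avail_actions == 0] = 0.0                    # never pick unavailable actions
action = Categorical(prob).sample()               # the greedy branch uses argmax instead
print(prob, action)
```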
+ # actor inputs: obs + agent's last action(one_hot) + agent's id(one_hot) + a_inputs = np.concatenate((obs_a, u_a_onehot_last, id_onehot), + axis=2) + # a_inpits shape (ep_num, tr_num, actor_input_dim) + actor_inputs.append(a_inputs) + + actor_inputs = np.stack( + actor_inputs, + axis=2) # shape (ep_num, tr_num, n_agents, actor_input_dim) + return actor_inputs + + def _get_critic_inputs(self, batch): + """ o(t)_a, s(t), u(t)_-a, u(t-1), agent_id + """ + ep_num = batch['o'].shape[0] + tr_num = batch['o'].shape[1] + + # o, o_next, state, state_next + o = batch['o'] # shape (ep_num, tr_num, n_agents, obs_shape) + o_next = np.zeros_like(o) + o_next[:, :-1] = o[:, 1:] + s = batch['s'] # shape (ep_num, tr_num, state_shape) + s_next = np.zeros_like(s) + s_next[:, :-1] = s[:, 1:] + # u_onehot, u_onehot_last shape (ep_num, tr_num, n_agents, n_actions) + u_onehot = batch['u_onehot'] + u_onehot_next = np.zeros_like(u_onehot) + u_onehot_next[:, :-1] = u_onehot[:, 1:] + u_onehot_last = np.zeros_like(u_onehot) + u_onehot_last[:, 1:] = u_onehot[:, :-1] + + critic_inputs = [] + critic_inputs_next = [] + for agent_id in range(self.n_agents): + # get o(t)_a, s(t) + o_a = o[:, :, agent_id] # shape (ep_num, tr_num, obs_shape) + o_a_next = o_next[:, :, agent_id] + s_a = s # shape (ep_num, tr_num, state_shape) + s_a_next = s_next + # get u(t-1) shape (ep_num, tr_num, n_agents * n_actions) + u_all_onehot = u_onehot.reshape((ep_num, tr_num, + self.n_agents * self.n_actions)) + u_all_onehot_next = u_onehot_next.reshape( + (ep_num, tr_num, self.n_agents * self.n_actions)) + u_all_onehot_last = u_onehot_last.reshape( + (ep_num, tr_num, self.n_agents * self.n_actions)) + # get u(t)_-a, set 0 to mask action, shape (ep_num, tr_num, n_agents * n_actions) + u_not_a_onehot = u_all_onehot.copy() + u_not_a_onehot_next = u_all_onehot_next.copy() + m_s = agent_id * self.n_actions # mask start flag + m_e = (agent_id + 1) * self.n_actions # mask end flag + u_not_a_onehot[:, :, m_s:m_e] = 0 + u_not_a_onehot_next[:, :, m_s:m_e] = 0 + # get id onehot, shape (ep_num, tr_num, n_agents) + id_onehot = np.zeros((ep_num, tr_num, self.n_agents)) + id_onehot[:, :, agent_id] = 1. 
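A shape walk-through of `_get_actor_inputs()` above with toy 3m-sized arrays (the 2 episodes and 4 transitions are arbitrary): per agent the input is obs (30) + last-action one-hot (9) + agent-id one-hot (3) = 42 features.

```python
import numpy as np

# Toy-sized check of the actor-input layout built in _get_actor_inputs().
ep_num, tr_num, n_agents, n_actions, obs_shape = 2, 4, 3, 9, 30
obs = np.zeros((ep_num, tr_num, n_agents, obs_shape))
u_onehot = np.zeros((ep_num, tr_num, n_agents, n_actions))

u_onehot_last = np.zeros_like(u_onehot)
u_onehot_last[:, 1:] = u_onehot[:, :-1]           # previous-step actions

actor_inputs = []
for agent_id in range(n_agents):
    id_onehot = np.zeros((ep_num, tr_num, n_agents))
    id_onehot[:, :, agent_id] = 1.
    actor_inputs.append(np.concatenate(
        (obs[:, :, agent_id], u_onehot_last[:, :, agent_id], id_onehot), axis=2))

actor_inputs = np.stack(actor_inputs, axis=2)
print(actor_inputs.shape)  # (2, 4, 3, 42): (ep_num, tr_num, n_agents, actor_input_dim)
```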
+ + # input: o, s, u_-a, u_last, agent_id + # input_next: o_next, s_next, u_-a_next, u, agent_id + # shape (ep_num, tr_num, critic_input_dim) + c_inputs = np.concatenate( + (o_a, s_a, u_not_a_onehot, u_all_onehot_last, id_onehot), + axis=2) + c_inputs_next = np.concatenate( + (o_a_next, s_a_next, u_not_a_onehot_next, u_all_onehot, + id_onehot), + axis=2) + critic_inputs.append(c_inputs) + critic_inputs_next.append(c_inputs_next) + critic_inputs = np.stack(critic_inputs, axis=2) + critic_inputs_next = np.stack(critic_inputs_next, axis=2) + # shape (ep_num, tr_num, n_agents, critic_input_dim) + return critic_inputs, critic_inputs_next + + def _get_avail_transitions_num(self, isover_batch): + """ input: + isover_batch: shape (ep_num, tr_num, 1) + output: + max_tr_num: max avail transitions number in all episodes + """ + ep_num = isover_batch.shape[0] + max_tr_num = 0 + for ep_id in range(ep_num): + for tr_id in range(self.config['episode_limit']): + if isover_batch[ep_id, tr_id, 0] == 1: + if tr_id + 1 >= max_tr_num: + max_tr_num = tr_id + 1 + break + return max_tr_num + + def learn(self, batch, epsilon=None): + """ input: + batch: dict(o, s, u, r, u_onehot, avail_u, padded, isover) + epsilon: e-greedy discount + """ + # different episode has different avail transition length + tr_num = self._get_avail_transitions_num(batch['isover']) + for key in batch.keys(): + # cut batch data's episode length + batch[key] = batch[key][:, :tr_num] + + # get actor input and critic input + batch['actor_inputs'] = self._get_actor_inputs(batch) + batch['critic_inputs'], batch[ + 'critic_inputs_next'] = self._get_critic_inputs(batch) + + # change batch data to torch tensor + for key in batch.keys(): + if key == 'u': + batch[key] = torch.tensor( + batch[key], dtype=torch.long).to(device) + else: + batch[key] = torch.tensor( + batch[key], dtype=torch.float32).to(device) + + self.alg.learn(batch, epsilon) + + if self.train_steps > 0 and self.train_steps % self.config[ + 'target_update_cycle'] == 0: + self.alg.sync_target() + self.train_steps += 1 diff --git a/benchmark/torch/coma/sc2_model.py b/benchmark/torch/coma/sc2_model.py new file mode 100644 index 0000000000000000000000000000000000000000..789436342050bb1fa32e1c2dcf8fa5a008b0894f --- /dev/null +++ b/benchmark/torch/coma/sc2_model.py @@ -0,0 +1,102 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
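For the 3m map sizes used in this benchmark, the per-agent critic input assembled in `_get_critic_inputs()` above works out to 135 features; a small dimension check, with sizes taken from the comments in `sc2_model.py` below and terms listed in the same order as the concatenation in the code:

```python
# Dimension check for the critic inputs built in _get_critic_inputs() (3m map sizes).
state_shape, obs_shape, n_agents, n_actions = 48, 30, 3, 9

critic_input_dim = (obs_shape                 # agent's own observation o(t)_a
                    + state_shape             # global state s(t)
                    + n_agents * n_actions    # other agents' actions u(t)_-a (own slice zeroed)
                    + n_agents * n_actions    # all agents' previous actions u(t-1)
                    + n_agents)               # agent-id one-hot
print(critic_input_dim)  # 30 + 48 + 27 + 27 + 3 = 135
```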
+ +import torch.nn as nn +import torch.nn.functional as F +import numpy as np +import parl + + +class ComaModel(parl.Model): + def __init__(self, config): + super(ComaModel, self).__init__() + self.n_actions = config['n_actions'] + self.n_agents = config['n_agents'] + self.state_shape = config['state_shape'] + self.obs_shape = config['obs_shape'] + + actor_input_dim = self._get_actor_input_dim() + critic_input_dim = self._get_critic_input_dim() + + self.actor_model = ActorModel(actor_input_dim, self.n_actions) + self.critic_model = CriticModel(critic_input_dim, self.n_actions) + + def policy(self, obs, hidden_state): + return self.actor_model.policy(obs, hidden_state) + + def value(self, inputs): + return self.critic_model.value(inputs) + + def get_actor_params(self): + return self.actor_model.parameters() + + def get_critic_params(self): + return self.critic_model.parameters() + + def _get_actor_input_dim(self): + input_shape = self.obs_shape # obs: 30 in 3m map + input_shape += self.n_actions # agent's last action (one_hot): 9 in 3m map + input_shape += self.n_agents # agent's one_hot id: 3 in 3m map + return input_shape # 30 + 9 + 3 = 42 + + def _get_critic_input_dim(self): + input_shape = self.state_shape # state: 48 in 3m map + input_shape += self.obs_shape # obs: 30 in 3m map + input_shape += self.n_agents # agent_id: 3 in 3m map + input_shape += self.n_actions * self.n_agents * 2 # all agents' action and last_action (one-hot): 54 in 3m map + return input_shape # 48 + 30+ 3 = 135 + + +# all agents share one actor network +class ActorModel(parl.Model): + def __init__(self, input_shape, act_dim): + """ input : obs, include the agent's id and last action, shape: (batch, obs_shape + n_action + n_agents) + output: one agent's q(obs, act) + """ + super(ActorModel, self).__init__() + self.hid_size = 64 + + self.fc1 = nn.Linear(input_shape, self.hid_size) + self.rnn = nn.GRUCell(self.hid_size, self.hid_size) + self.fc2 = nn.Linear(self.hid_size, act_dim) + + def init_hidden(self): + # new hidden states + return self.fc1.weight.new(1, self.hid_size).zero_() + + def policy(self, obs, h0): + x = F.relu(self.fc1(obs)) + h1 = h0.reshape(-1, self.hid_size) + h2 = self.rnn(x, h1) + policy = self.fc2(h2) + return policy, h2 + + +class CriticModel(parl.Model): + def __init__(self, input_shape, act_dim): + """ inputs: [ s(t), o(t)_a, u(t)_a, agent_a, u(t-1) ], shape: (Batch, input_shape) + output: Q, shape: (Batch, n_actions) + Batch = ep_num * n_agents + """ + super(CriticModel, self).__init__() + hid_size = 128 + self.fc1 = nn.Linear(input_shape, hid_size) + self.fc2 = nn.Linear(hid_size, hid_size) + self.fc3 = nn.Linear(hid_size, act_dim) + + def value(self, inputs): + hid1 = F.relu(self.fc1(inputs)) + hid2 = F.relu(self.fc2(hid1)) + Q = self.fc3(hid2) + return Q diff --git a/benchmark/torch/coma/starcraft2/Dockerfile b/benchmark/torch/coma/starcraft2/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..176e89448d6689e08c9556ee2fd3bd9259658769 --- /dev/null +++ b/benchmark/torch/coma/starcraft2/Dockerfile @@ -0,0 +1,38 @@ +FROM nvidia/cuda:9.2-cudnn7-devel-ubuntu16.04 +MAINTAINER Tabish Rashid + +# CUDA includes +ENV CUDA_PATH /usr/local/cuda +ENV CUDA_INCLUDE_PATH /usr/local/cuda/include +ENV CUDA_LIBRARY_PATH /usr/local/cuda/lib64 + +# Ubuntu Packages +RUN apt-get update -y && apt-get install software-properties-common -y && \ + add-apt-repository -y multiverse && apt-get update -y && apt-get upgrade -y && \ + apt-get install -y apt-utils nano vim git man 
build-essential wget sudo && \ + rm -rf /var/lib/apt/lists/* + +# Install python3 pip3 +RUN apt-get update +RUN apt-get -y install python3 +RUN apt-get -y install python3-pip +RUN pip3 install --upgrade pip + +#### ------------------------------------------------------------------- +#### install parl +#### ------------------------------------------------------------------- +RUN pip3 install parl + +#### ------------------------------------------------------------------- +#### install SMAC +#### ------------------------------------------------------------------- +RUN pip3 install git+https://github.com/oxwhirl/smac.git + +#### ------------------------------------------------------------------- +#### install pytorch +#### ------------------------------------------------------------------- +RUN pip3 install torch + + +ENV SC2PATH /parl/starcraft2/StarCraftII +WORKDIR /parl diff --git a/benchmark/torch/coma/starcraft2/build_docker.sh b/benchmark/torch/coma/starcraft2/build_docker.sh new file mode 100644 index 0000000000000000000000000000000000000000..ad7a742e1aa0aa347e7dd9755cf8c20597798292 --- /dev/null +++ b/benchmark/torch/coma/starcraft2/build_docker.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +#### ------------------------------------------------------------------- +#### build docker image +#### ------------------------------------------------------------------- +echo 'Building Dockerfile with image name parl-starcraft2:1.0' +docker build -t parl-starcraft2:1.0 . diff --git a/benchmark/torch/coma/starcraft2/install_sc2.sh b/benchmark/torch/coma/starcraft2/install_sc2.sh new file mode 100644 index 0000000000000000000000000000000000000000..75e1599224e36c686b65ddc208cd157616e6d4df --- /dev/null +++ b/benchmark/torch/coma/starcraft2/install_sc2.sh @@ -0,0 +1,54 @@ +#!/bin/bash + +#### ------------------------------------------------------------------- +#### Install StarCraft II +#### ------------------------------------------------------------------- + +if [ -z "$SC2PATH" ]; then + SC2PATH=`pwd`'/StarCraftII' +else + SC2PATH=$SC2PATH'/StarCraftII' +fi + +export SC2PATH=$SC2PATH +echo 'SC2PATH is set to '$SC2PATH + +if [ ! -d $SC2PATH ]; then + echo 'StarCraftII is not installed. Installing now ...' + wget http://blzdistsc2-a.akamaihd.net/Linux/SC2.4.6.2.69232.zip + unzip -P iagreetotheeula SC2.4.6.2.69232.zip + rm -f SC2.4.6.2.69232.zip + echo 'Finished installing StarCraftII' +else + echo 'StarCraftII is already installed.' +fi + +if [ -f $SC2PATH/Libs/libstdc++.so* ]; then + echo 'Successfully installing StarCraft II' +else + echo 'Fail to install StarCraft II !' + exit 1 +fi + + +#### ------------------------------------------------------------------- +#### Add the custom maps +#### ------------------------------------------------------------------- + +echo 'Adding SMAC maps.' +MAP_DIR="$SC2PATH/Maps" +echo 'MAP_DIR is set to '$MAP_DIR +mkdir -p $MAP_DIR + +wget https://github.com/oxwhirl/smac/releases/download/v0.1-beta1/SMAC_Maps.zip +unzip SMAC_Maps.zip +mv SMAC_Maps $MAP_DIR +rm -f SMAC_Maps.zip +cp $MAP_DIR/SMAC_Maps/3m.SC2Map ./ + +if [ -f $MAP_DIR/SMAC_Maps/3m.SC2Map ]; then + echo 'Successfully adding custom maps' +else + echo 'Fail to add maps !' + exit 1 +fi diff --git a/benchmark/torch/coma/train.py b/benchmark/torch/coma/train.py new file mode 100644 index 0000000000000000000000000000000000000000..59f4f71205870431dcf4372843b5182023c3a927 --- /dev/null +++ b/benchmark/torch/coma/train.py @@ -0,0 +1,225 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from smac.env import StarCraft2Env +import numpy as np +import os +from sc2_model import ComaModel +from sc2_agent import Agents +from parl.algorithms import COMA +from parl.utils import tensorboard + + +def run_episode(env, agents, config, test=False): + o, u, r, s, avail_u, u_onehot, isover, padded = [], [], [], [], [], [], [], [] + env.reset() + done = False + step = 0 + ep_reward = 0 + last_act = [0 for _ in range(config['n_agents'])] + agents.init_hidden() # init rnn h0 for all agents + + while not done: + obs = env.get_obs() + state = env.get_state() + acts, avail_acts, acts_onehot = [], [], [] + + for agent_id in range(config['n_agents']): + avail_act = env.get_avail_agent_actions(agent_id) + + # action + epsilon = 0 if test else config['epsilon'] + act = agents.sample(obs[agent_id], last_act[agent_id], agent_id, + avail_act, epsilon, test) + last_act[agent_id] = act + + # action one-hot + act_onehot = np.zeros(config['n_actions']) + act_onehot[act] = 1 + acts.append(act) + acts_onehot.append(act_onehot) + avail_acts.append(avail_act) + + # step + reward, done, _ = env.step(acts) + + if step == config['episode_limit'] - 1: + done = 1 + + o.append(obs) + s.append(state) + u.append(np.reshape(acts, [config['n_agents'], 1])) + u_onehot.append(acts_onehot) + avail_u.append(avail_acts) + r.append([reward]) + isover.append([done]) + padded.append([0.]) # 0: no padded, 1: padded + + ep_reward += reward + step += 1 + + # fill trainsition len to episode_limit + for _ in range(step, config['episode_limit']): + # shape: (config['episode_limit'], n_agents, shape) + o.append(np.zeros((config['n_agents'], config['obs_shape']))) + s.append(np.zeros(config['state_shape'])) + u.append(np.zeros([config['n_agents'], 1])) + u_onehot.append(np.zeros((config['n_agents'], config['n_actions']))) + avail_u.append(np.zeros((config['n_agents'], config['n_actions']))) + # shape: (config['episode_limit'], 1) + r.append([0.]) + padded.append([1.]) + isover.append([1.]) + + ep_data = dict( + o=o.copy(), + s=s.copy(), + u=u.copy(), + r=r.copy(), + avail_u=avail_u.copy(), + u_onehot=u_onehot.copy(), + padded=padded.copy(), + isover=isover.copy()) + + # add an additional dimension at axis 0 for each item + for key in ep_data.keys(): + # each items shape: (1, trainsition_num, n_agents, own_shape) + ep_data[key] = np.array([ep_data[key]]) + + return ep_data, ep_reward + + +def run(env, agents, config): + win_rates = [] + episode_rewards = [] + train_steps = 0 + for epoch in range(config['n_epoch']): + print('train epoch {}'.format(epoch)) + # decay epsilon at the begging of each epoch + if config['epsilon'] > config['min_epsilon']: + config['epsilon'] -= config['anneal_epsilon'] + + # run n episode(s) + ep_data_list = [] + for _ in range(config['n_episodes']): + ep_data, _ = run_episode(env, agents, config, test=False) + ep_data_list.append(ep_data) + # each item in ep_batch shape: (episode_num, trainsition_num, n_agents, item_shape) + ep_batch = ep_data_list[0] + 
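+ # merge the remaining episodes into ep_batch along axis 0 (the episode axis);
+ # agents.learn() later cuts this batch to the longest finished episode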
ep_data_list.pop(0) + for ep_data in ep_data_list: + for key in ep_batch.keys(): + ep_batch[key] = np.concatenate((ep_batch[key], ep_data[key]), + axis=0) + + # learn + agents.learn(ep_batch, config['epsilon']) + train_steps += 1 + + # save model + if train_steps > 0 and train_steps % config['save_cycle'] == 0: + model_path = config['model_dir'] + '/coma_' + str( + train_steps) + '.ckpt' + agents.save(save_path=model_path) + print('save model: ', model_path) + + # test + if epoch % config['test_cycle'] == 0: + win_rate, ep_mean_reward = test(env, agents, config) + # print('win_rate is ', win_rate) + win_rates.append(win_rate) + episode_rewards.append(ep_mean_reward) + tensorboard.add_scalar('win_rate', win_rates[-1], len(win_rates)) + tensorboard.add_scalar('episode_rewards', episode_rewards[-1], + len(episode_rewards)) + print('win_rate', win_rates, len(win_rates)) + print('episode_rewards', episode_rewards, len(episode_rewards)) + + +def test(env, agents, config): + win_number = 0 + episode_rewards = 0 + for ep_id in range(config['test_episode_n']): + _, ep_reward = run_episode(env, agents, config, test=True) + episode_rewards += ep_reward + if ep_reward > config['threshold']: + win_number += 1 + return win_number / config['test_episode_n'], episode_rewards / config[ + 'test_episode_n'] + + +def test_by_sparse_reward(agents, config): + env = StarCraft2Env( + map_name=config['map'], + difficulty=config['difficulty'], + seed=config['env_seed'], + replay_dir=config['replay_dir'], + reward_sparse=True, # Receive 1/-1 reward for winning/loosing an episode + reward_scale=False) + win_number = 0 + for ep_id in range(config['test_episode_n']): + _, ep_reward = run_episode(env, agents, config, test=True) + result = 'win' if ep_reward > 0 else 'defeat' + print('Episode {}: {}'.format(ep_id, result)) + if ep_reward > 0: + win_number += 1 + env.close() + win_rate = win_number / config['test_episode_n'] + print('The win rate of coma is {}'.format(win_rate)) + return win_rate + + +def main(config): + env = StarCraft2Env( + map_name=config['map'], + seed=config['env_seed'], + difficulty=config['difficulty'], + replay_dir=config['replay_dir']) + env_info = env.get_env_info() + + config['n_actions'] = env_info['n_actions'] + config['n_agents'] = env_info['n_agents'] + config['state_shape'] = env_info['state_shape'] + config['obs_shape'] = env_info['obs_shape'] + config['episode_limit'] = env_info['episode_limit'] + + model = ComaModel(config=config) + algorithm = COMA( + model, + n_actions=config['n_actions'], + n_agents=config['n_agents'], + grad_norm_clip=config['grad_norm_clip'], + actor_lr=config['actor_lr'], + critic_lr=config['critic_lr'], + gamma=config['gamma'], + td_lambda=config['td_lambda']) + agents = Agents(algorithm, config) + + # restore model here + model_file = config['model_dir'] + '/coma.ckpt' + if config['restore'] and os.path.isfile(model_file): + agents.restore(model_file) + print('model loaded: ', model_file) + + if config['test']: + test_by_sparse_reward(agents, config) + else: + run(env, agents, config) + + env.close() + + +if __name__ == '__main__': + from coma_config import config + main(config) diff --git a/parl/algorithms/__init__.py b/parl/algorithms/__init__.py index 8565455c374db1e16e878b3977f3c8f2f7c5557d..20c3d3d467cf50b4fed42a7d1b37671b107f1e4b 100644 --- a/parl/algorithms/__init__.py +++ b/parl/algorithms/__init__.py @@ -13,13 +13,8 @@ # limitations under the License. 
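To evaluate a saved model with the sparse-reward test above instead of training, the flags in `coma_config.py` can be flipped; a usage sketch (it assumes a checkpoint has been copied to `./model/coma.ckpt`, since `run()` saves checkpoints as `coma_<train_steps>.ckpt`):

```python
# Sketch: run the sparse-reward evaluation path in main() on a saved model.
from coma_config import config
from train import main

config['restore'] = True   # main() loads ./model/coma.ckpt if it exists
config['test'] = True      # run test_by_sparse_reward() instead of training
main(config)
```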
from parl.utils.utils import _HAS_FLUID, _HAS_TORCH -from parl.utils import logger if _HAS_FLUID: from parl.algorithms.fluid import * elif _HAS_TORCH: from parl.algorithms.torch import * -else: - logger.warning( - "No deep learning framework was found, but it's ok for parallel computation." - ) diff --git a/parl/algorithms/torch/__init__.py b/parl/algorithms/torch/__init__.py index 9de7afbdd57305b1280b024556e0b1730bcbc494..86d0386b0de6c9c0f0eb766ea9c15b34ac8084cd 100644 --- a/parl/algorithms/torch/__init__.py +++ b/parl/algorithms/torch/__init__.py @@ -16,5 +16,6 @@ from parl.algorithms.torch.ddqn import * from parl.algorithms.torch.dqn import * from parl.algorithms.torch.a2c import * from parl.algorithms.torch.td3 import * +from parl.algorithms.torch.coma import * from parl.algorithms.torch.ppo import * from parl.algorithms.torch.policy_gradient import * diff --git a/parl/algorithms/torch/coma.py b/parl/algorithms/torch/coma.py new file mode 100644 index 0000000000000000000000000000000000000000..f8cc6bffa076ec3c29a5eea6cfa6857c93c8a1a3 --- /dev/null +++ b/parl/algorithms/torch/coma.py @@ -0,0 +1,290 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import os +from copy import deepcopy +import parl +import numpy as np + +device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + +__all__ = ['COMA'] + + +class COMA(parl.Algorithm): + def __init__(self, + model, + n_actions, + n_agents, + grad_norm_clip=None, + actor_lr=None, + critic_lr=None, + gamma=None, + td_lambda=None): + """ COMA algorithm + + Args: + model (parl.Model): forward network of actor and critic. 
+ n_actions (int): action dim for each agent + n_agents (int): agents number + grad_norm_clip (int or float): gradient clip, prevent gradient explosion + actor_lr (float): actor network learning rate + critic_lr (float): critic network learning rate + gamma (float): discounted factor for reward computation + td_lambda (float): lambda of td-lambda return + """ + assert isinstance(n_actions, int) + assert isinstance(n_agents, int) + assert isinstance(grad_norm_clip, int) or isinstance( + grad_norm_clip, float) + assert isinstance(actor_lr, float) + assert isinstance(critic_lr, float) + assert isinstance(gamma, float) + assert isinstance(td_lambda, float) + + self.n_actions = n_actions + self.n_agents = n_agents + self.grad_norm_clip = grad_norm_clip + self.actor_lr = actor_lr + self.critic_lr = critic_lr + self.gamma = gamma + self.td_lambda = td_lambda + + self.model = model.to(device) + self.target_model = deepcopy(model).to(device) + + self.sync_target() + + self.actor_parameters = list(self.model.get_actor_params()) + self.critic_parameters = list(self.model.get_critic_params()) + + self.critic_optimizer = torch.optim.RMSprop( + self.critic_parameters, lr=self.critic_lr) + self.actor_optimizer = torch.optim.RMSprop( + self.actor_parameters, lr=self.actor_lr) + + self.train_rnn_h = None + + def init_hidden(self, ep_num): + """ function: init a hidden tensor for every agent + input: + ep_num: How many episodes are included in a batch of data + output: + rnn_h: rnn hidden state, shape (ep_num, n_agents, hidden_size) + """ + assert hasattr(self.model.actor_model, 'init_hidden'), \ + "actor must have rnn structure and has method 'init_hidden' to make hidden states" + rnn_h = self.model.actor_model.init_hidden().unsqueeze(0).expand( + ep_num, self.n_agents, -1) + return rnn_h + + def predict(self, obs, rnn_h_in): + """input: + obs: obs + last_action + agent_id, shape: (1, obs_shape + n_actions + n_agents) + rnn_h_in: rnn's hidden input + output: + prob: output of actor, shape: (1, n_actions) + rnn_h_out: rnn's hidden output + """ + with torch.no_grad(): + policy_logits, rnn_h_out = self.model.policy( + obs, rnn_h_in) # input obs shape [1, 42] + prob = torch.nn.functional.softmax( + policy_logits, dim=-1) # shape [1, 9] + return prob, rnn_h_out + + def _get_critic_output(self, batch): + """ input: + batch: dict(o, s, u, r, u_onehot, avail_u, padded, isover, actor_inputs, critic_inputs) + output: + q_evals and q_targets: shape (ep_num, tr_num, n_agents, n_actions) + """ + ep_num = batch['r'].shape[0] + tr_num = batch['r'].shape[1] + critic_inputs = batch['critic_inputs'] + critic_inputs_next = batch['critic_inputs_next'] + + critic_inputs = critic_inputs.reshape((ep_num * tr_num * self.n_agents, + -1)) + critic_inputs_next = critic_inputs.reshape( + (ep_num * tr_num * self.n_agents, -1)) + + q_evals = self.model.value(critic_inputs) + q_targets = self.model.value(critic_inputs_next) + + q_evals = q_evals.reshape((ep_num, tr_num, self.n_agents, -1)) + q_targets = q_targets.reshape((ep_num, tr_num, self.n_agents, -1)) + return q_evals, q_targets + + def _get_actor_output(self, batch, epsilon): + """ input: + batch: dict(o, s, u, r, u_onehot, avail_u, padded, isover, actor_inputs, critic_inputs) + epsilon: noise discount factor + output: + action_prob: probability of actions, shape (ep_num, tr_num, n_agents, n_actions) + """ + ep_num = batch['r'].shape[0] + tr_num = batch['r'].shape[1] + avail_actions = batch['avail_u'] + actor_inputs = batch['actor_inputs'] + action_prob = [] + for tr_id in 
range(tr_num): + inputs = actor_inputs[:, + tr_id] # shape (ep_num, n_agents, actor_input_dim) + inputs = inputs.reshape( + (-1, inputs.shape[-1])) # shape (-1, actor_input_dim) + policy_logits, self.train_rnn_h = self.model.policy( + inputs, self.train_rnn_h) + # policy_logits shape from (-1, n_actions) to (ep_num, n_agents, n_actions) + policy_logits = policy_logits.view(ep_num, self.n_agents, -1) + prob = torch.nn.functional.softmax(policy_logits, dim=-1) + action_prob.append(prob) + + action_prob = torch.stack( + action_prob, + dim=1).to(device) # shape: (ep_num, tr_num, n_agents, n_actions) + action_num = avail_actions.sum() # how many actions are available + action_prob = ((1 - epsilon) * action_prob + + torch.ones_like(action_prob) * epsilon / action_num) + action_prob[avail_actions == 0] = 0.0 # set avail action + + action_prob = action_prob / action_prob.sum( + dim=-1, keepdim=True) # in case action_prob.sum != 1 + action_prob[avail_actions == 0] = 0.0 + action_prob = action_prob.to(device) + return action_prob + + def _cal_td_target(self, batch, q_targets): # compute TD(lambda) + """ input: + batch: dict(o, s, u, r, u_onehot, avail_u, padded, isover, actor_inputs, critic_inputs) + q_targets: Q value of target critic network, shape (ep_num, tr_num, n_agents) + output: + lambda_return: TD lambda return, shape (ep_num, tr_num, n_agents) + """ + ep_num = batch['r'].shape[0] + tr_num = batch['r'].shape[1] + mask = (1 - batch['padded'].float()).repeat(1, 1, + self.n_agents).to(device) + isover = (1 - batch['isover'].float()).repeat(1, 1, self.n_agents).to( + device) # used for setting last transition's q_target to 0 + # reshape reward: from (ep_num, tr_num, 1) to (ep_num, tr_num, n_agents) + r = batch['r'].repeat((1, 1, self.n_agents)).to(device) + # compute n_step_return + n_step_return = torch.zeros((ep_num, tr_num, self.n_agents, + tr_num)).to(device) + for tr_id in range(tr_num - 1, -1, -1): + n_step_return[:, tr_id, :, 0] = ( + r[:, tr_id] + self.gamma * q_targets[:, tr_id] * + isover[:, tr_id]) * mask[:, tr_id] + for n in range(1, tr_num - tr_id): + n_step_return[:, tr_id, :, n] = ( + r[:, tr_id] + self.gamma * + n_step_return[:, tr_id + 1, :, n - 1]) * mask[:, tr_id] + + lambda_return = torch.zeros((ep_num, tr_num, self.n_agents)).to(device) + for tr_id in range(tr_num): + returns = torch.zeros((ep_num, self.n_agents)).to(device) + for n in range(1, tr_num - tr_id): + returns += pow(self.td_lambda, + n - 1) * n_step_return[:, tr_id, :, n - 1] + lambda_return[:, tr_id] = (1 - self.td_lambda) * returns + \ + pow(self.td_lambda, tr_num - tr_id - 1) * \ + n_step_return[:, tr_id, :, tr_num - tr_id - 1] + return lambda_return + + def _critic_learn(self, batch): + """ input: + batch: dict(o, s, u, r, u_onehot, avail_u, padded, isover, actor_inputs, critic_inputs) + output: + q_values: Q value of eval critic network, shape (ep_num, tr_num, n_agents, n_actions) + """ + u = batch['u'] # shape (ep_num, tr_num, agent, n_actions) + u_next = torch.zeros_like(u, dtype=torch.long) + u_next[:, :-1] = u[:, 1:] + mask = (1 - batch['padded'].float()).repeat(1, 1, + self.n_agents).to(device) + + # get q value for every agent and every action, shape (ep_num, tr_num, n_agents, n_actions) + q_evals, q_next_target = self._get_critic_output(batch) + q_values = q_evals.clone() # used for function return + + # get q valur for every agent + q_evals = torch.gather(q_evals, dim=3, index=u).squeeze(3) + q_next_target = torch.gather( + q_next_target, dim=3, index=u_next).squeeze(3) + + targets = 
self._cal_td_target(batch, q_next_target) + + td_error = targets.detach() - q_evals + masked_td_error = mask * td_error # mask padded data + + loss = (masked_td_error** + 2).sum() / mask.sum() # mask.sum: avail transition num + + self.critic_optimizer.zero_grad() + loss.backward() + torch.nn.utils.clip_grad_norm_(self.critic_parameters, + self.grad_norm_clip) + self.critic_optimizer.step() + return q_values + + def _actor_learn(self, batch, epsilon, q_values): + """ input: + batch: dict(o, s, u, r, u_onehot, avail_u, padded, isover, actor_inputs, critic_inputs) + epsilon (float): e-greedy discount + q_values: Q value of eval critic network, shape (ep_num, tr_num, n_agents, n_actions) + """ + action_prob = self._get_actor_output(batch, epsilon) # prob of u + + # mask: used to compute TD-error, filling data should not affect learning + u = batch['u'] + mask = (1 - batch['padded'].float()).repeat(1, 1, self.n_agents).to( + device) # shape (ep_num, tr_num, 3) + + q_taken = torch.gather(q_values, dim=3, index=u).squeeze(3) # Q(u_a) + pi_taken = torch.gather( + action_prob, dim=3, + index=u).squeeze(3) # prob of act that agent a choosen + pi_taken[mask == 0] = 1.0 # prevent log overflow + log_pi_taken = torch.log(pi_taken) + + # advantage + baseline = (q_values * action_prob).sum( + dim=3, keepdim=True).squeeze(3).detach() + advantage = (q_taken - baseline).detach() + loss = -((advantage * log_pi_taken) * mask).sum() / mask.sum() + self.actor_optimizer.zero_grad() + loss.backward() + torch.nn.utils.clip_grad_norm_(self.actor_parameters, + self.grad_norm_clip) + self.actor_optimizer.step() + + def learn(self, batch, epsilon): + """ input: + batch: dict(o, s, u, r, u_onehot, avail_u, padded, isover, actor_inputs, critic_inputs) + epsilon (float): e-greedy discount + """ + ep_num = batch['r'].shape[0] + self.train_rnn_h = self.init_hidden(ep_num) + self.train_rnn_h = self.train_rnn_h.to(device) + + q_values = self._critic_learn(batch) + self._actor_learn(batch, epsilon, q_values) + + def sync_target(self, decay=0): + for param, target_param in zip(self.model.parameters(), + self.target_model.parameters()): + target_param.data.copy_((1 - decay) * param.data + + decay * target_param.data)
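Finally, a toy illustration of the counterfactual baseline computed in `_actor_learn()` above: the critic's Q values are marginalised over the agent's own current policy, so the advantage measures how much better the chosen action is than the policy's average action while the other agents' actions stay fixed (the numbers below are made up, for a single agent and timestep):

```python
import torch

# Toy counterfactual-advantage computation for one agent with 3 actions.
q_values = torch.tensor([[2.0, 0.0, 1.0]])       # critic Q(s, u_-a, .) per action (made-up)
action_prob = torch.tensor([[0.5, 0.25, 0.25]])  # actor probabilities (made-up)
u = torch.tensor([[0]])                          # the action that was actually taken

q_taken = torch.gather(q_values, dim=1, index=u).squeeze(1)  # 2.0
baseline = (q_values * action_prob).sum(dim=1)               # 2*0.5 + 0*0.25 + 1*0.25 = 1.25
advantage = q_taken - baseline                               # 0.75
print(q_taken.item(), baseline.item(), advantage.item())
```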