diff --git a/benchmark/torch/coma/.benchmark/3m_result.png b/benchmark/torch/coma/.benchmark/3m_result.png
new file mode 100644
index 0000000000000000000000000000000000000000..3d5343bc1920a0e96ded07ff45f05b79ecd979a3
Binary files /dev/null and b/benchmark/torch/coma/.benchmark/3m_result.png differ
diff --git a/benchmark/torch/coma/README.md b/benchmark/torch/coma/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4b34f4ad773a128b8192cdc617a7bf26f9893562
--- /dev/null
+++ b/benchmark/torch/coma/README.md
@@ -0,0 +1,59 @@
+## Reproduce COMA with PARL
+
+This is a PARL + PyTorch implementation of the multi-agent reinforcement learning algorithm COMA.
+
+### Paper
+- [Counterfactual Multi-Agent Policy Gradients](https://arxiv.org/abs/1705.08926)
+
+### Benchmark Result
+Mean win_rate (evaluated over 5 episodes) for 1000 epochs of training (1 epoch = 5 episodes).
+
+coma-3m
+
+
+
+
+## StarCraft II Installation
+The environment is based on the full game of StarCraft II (version >= 3.16.1). To install the game, follow the commands below, or see more details in [SMAC](https://github.com/oxwhirl/smac#installing-starcraft-ii). MacOS/Windows users need to run this example in Docker, as the StarCraft II environment does not support these two systems.
+
+### Linux
+```shell
+$ cd starcraft2
+$ SC2PATH=~ bash install_sc2.sh
+```
+### MacOS (use Docker)
+```shell
+$ cd starcraft2
+$ bash build_docker.sh # build the Docker image
+$ bash install_sc2.sh # download StarCraft II and the maps
+```
+### Windows (use Docker)
+- Step 1: Build the Docker image: `cd starcraft2 && bash build_docker.sh`
+- Step 2: Download a [StarCraft II package](https://github.com/Blizzard/s2client-proto#linux-packages) and unzip it to the folder `starcraft2/StarCraftII` (password: `iagreetotheeula`)
+- Step 3: Download the [SMAC maps](https://github.com/oxwhirl/smac/releases/download/v0.1-beta1/SMAC_Maps.zip) and unzip them to the folder `starcraft2/StarCraftII/Maps/SMAC_Maps`
+
+
+## How to use
+### Dependencies
+- python3.5+
+- parl
+- torch
+- [SMAC](https://github.com/oxwhirl/smac)
+
+### Start Training
+#### Linux
+```shell
+$ python3 train.py
+```
+#### MacOS/Windows (use Docker)
+```shell
+$ cd coma
+$ NV_GPU=$your_gpu_id docker run --name $your_container_name --user $(id -u):$(id -g) -v `pwd`:/parl -t parl-starcraft2:1.0 python3 train.py
+```
+*You can also run Docker interactively with `docker run --name $your_container_name -it -v $your_host_path:/parl -t parl-starcraft2:1.0 /bin/bash`*
+
+
+
+### Reference
+- [StarCraft](https://github.com/starry-sky6688/StarCraft)
+- [pymarl](https://github.com/oxwhirl/pymarl)
diff --git a/benchmark/torch/coma/coma_config.py b/benchmark/torch/coma/coma_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..25d6971efe4695f7816cf47148835a1e9d421bc6
--- /dev/null
+++ b/benchmark/torch/coma/coma_config.py
@@ -0,0 +1,47 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
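Before launching a full training run, it can help to confirm that StarCraft II and the SMAC maps from the installation steps in the README above are visible to Python. A minimal sanity-check sketch (not part of this PR's files; it assumes SMAC is installed and `SC2PATH` is set as in `install_sc2.sh`):

```python
# Quick check that StarCraft II and the SMAC 3m map are installed correctly.
from smac.env import StarCraft2Env

env = StarCraft2Env(map_name='3m')           # same map as coma_config.py
env_info = env.get_env_info()
print('n_agents :', env_info['n_agents'])    # 3 on the 3m map
print('n_actions:', env_info['n_actions'])   # 9 on the 3m map
print('obs_shape:', env_info['obs_shape'])   # 30 on the 3m map
env.close()
```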
+
+# Arguments of COMA
+config = {
+    # ========== Environment ==========
+    'difficulty': '3',  # The difficulty of the game
+    'map': '3m',  # The map of the game
+    'env_seed': None,  # Environment random seed
+    'replay_dir': '',  # Directory to save replays (not available on Ubuntu)
+
+    # ========== Learn ==========
+    'gamma': 0.99,
+    'grad_norm_clip': 10,  # Gradient clipping, prevents gradient explosion
+    'td_lambda': 0.8,  # Lambda of the td-lambda return
+    'actor_lr': 1e-4,
+    'critic_lr': 1e-3,
+    'target_update_cycle': 200,  # How often to update the target network
+
+    # ========== Epsilon-greedy ==========
+    'epsilon': 0.5,
+    'anneal_epsilon': 0.00064,
+    'min_epsilon': 0.02,
+    # 'epsilon_anneal_scale' : 'epoch',
+
+    # ========== Other ==========
+    'n_epoch': 5000,  # The number of epochs to train the agent
+    'n_episodes': 5,  # The number of episodes in one epoch
+    'test_episode_n': 20,  # The number of episodes used to evaluate the agent
+    'threshold': 19,  # The reward threshold used to judge whether an episode is won
+    'test_cycle': 5,  # How often to evaluate (every 'test_cycle' epochs)
+    'save_cycle': 1000,  # How often to save the model
+    'model_dir': './model',  # The directory to save/restore the model
+    'test': False,  # Evaluate the model and quit (no training)
+    'restore': False  # Restore the model or not
+}
diff --git a/benchmark/torch/coma/sc2_agent.py b/benchmark/torch/coma/sc2_agent.py
new file mode 100644
index 0000000000000000000000000000000000000000..1eb9bcdf802175ff0d579c095b351e101d42351d
--- /dev/null
+++ b/benchmark/torch/coma/sc2_agent.py
@@ -0,0 +1,237 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
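For reference, `train.py` (further below) decays `epsilon` by `anneal_epsilon` once per epoch while it is above `min_epsilon`, so the config values above anneal exploration over roughly 750 epochs; a small sketch of that arithmetic:

```python
# Epsilon schedule implied by the config above (decayed once per epoch in train.py).
epsilon, anneal_epsilon, min_epsilon = 0.5, 0.00064, 0.02

epochs_to_floor = (epsilon - min_epsilon) / anneal_epsilon
print(epochs_to_floor)  # ~750 epochs, i.e. ~3750 episodes with n_episodes = 5
```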
+ +import numpy as np +import parl +import torch +from torch.distributions import Categorical + +device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + + +class Agents(parl.Agent): + def __init__(self, algorithm, config): + self.n_actions = config['n_actions'] + self.n_agents = config['n_agents'] + self.state_shape = config['state_shape'] + self.obs_shape = config['obs_shape'] + + self.config = config + self.train_steps = 0 + self.rnn_h = None + super(Agents, self).__init__(algorithm) + print('Init all agents') + + def init_hidden(self): + """ function: init a hidden tensor for every agent at the begging of every episode + self.rnn_h: rnn hidden state, shape (n_agents, hidden_size) + """ + self.rnn_h = self.alg.init_hidden(1)[0] + + def predict(self, obs, rnn_h_in): + """input: + obs: obs + last_action + agent_id, shape: (1, obs_shape + n_actions + n_agents) + rnn_h_in: rnn's hidden input + output: + prob: output of actor, shape: (1, n_actions) + rnn_h_out: rnn's hidden output + """ + obs = np.expand_dims(obs, 0) + obs = torch.tensor(obs, dtype=torch.float32).to(device) + prob, rnn_h_out = self.alg.predict(obs, rnn_h_in) + return prob, rnn_h_out + + def sample(self, + obs, + last_action, + agent_id, + avail_actions, + epsilon, + test=False): + """input: + obs (array): agent i's obs + last_action (int): agent i's last action + agent_id (int): agent index + avail_actions (one_hot): available actions + epsilon (float): e_greed discount + test (bool): train or test + output: + action: int + prob: probability of every action, float, 0 ~ 1 + """ + obs = obs.copy() + # make obs: obs + agent's last action(one_hot) + agent's id(one_hot) + last_act_one_hot = np.zeros(self.n_actions) + last_act_one_hot[last_action] = 1. + id_one_hot = np.zeros(self.n_agents) + id_one_hot[agent_id] = 1. + obs = np.hstack((obs, last_act_one_hot)) + obs = np.hstack((obs, id_one_hot)) + + # predict action prob + prob, self.rnn_h[agent_id] = self.predict(obs, self.rnn_h[agent_id]) + + # add noise + avail_actions = torch.tensor( + avail_actions, dtype=torch.float32).unsqueeze(0).to( + device) # shape: (1, n_actions) + action_num = avail_actions.sum() # how many actions are available + prob = ((1 - epsilon) * prob + + torch.ones_like(prob) * epsilon / action_num) + prob[avail_actions == 0] = 0.0 # set avail action + + # choose action + if epsilon == 0 or test: + action = torch.argmax(prob) + else: + action = Categorical(prob).sample().long() + return action.cpu() + + def _get_actor_inputs(self, batch): + """ o(t), u(t-1)_a, agent_id + """ + obs = batch['o'] + u_onehot = batch['u_onehot'] + u_onehot_last = np.zeros_like(u_onehot) + u_onehot_last[:, 1:] = u_onehot[:, :-1] + ep_num = batch['o'].shape[0] + tr_num = batch['o'].shape[1] + + actor_inputs = [] + for agent_id in range(self.n_agents): + obs_a = obs[:, :, agent_id] + u_a_onehot_last = u_onehot_last[:, :, agent_id] + id_onehot = np.zeros((ep_num, tr_num, self.n_agents)) + id_onehot[:, :, agent_id] = 1. 
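The `sample()` method above mixes the actor's distribution with a uniform distribution over the currently available actions and masks out unavailable ones before sampling. A standalone toy sketch of that rule (the probabilities and availability mask below are made-up numbers, not real agent output):

```python
import torch
from torch.distributions import Categorical

# Toy illustration of the epsilon-mixing and masking rule used in sample().
prob = torch.tensor([[0.1, 0.6, 0.2, 0.1]])       # actor output (made-up)
avail_actions = torch.tensor([[1., 1., 0., 1.]])  # action 2 is unavailable
epsilon = 0.5

action_num = avail_actions.sum()                  # 3 available actions
prob = (1 - epsilon) * prob + torch.ones_like(prob) * epsilon / action_num
prob[avail_actions == 0] = 0.0                    # never pick unavailable actions
action = Categorical(prob).sample()               # the greedy branch uses argmax instead
print(prob, action)
```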
+ # actor inputs: obs + agent's last action(one_hot) + agent's id(one_hot) + a_inputs = np.concatenate((obs_a, u_a_onehot_last, id_onehot), + axis=2) + # a_inpits shape (ep_num, tr_num, actor_input_dim) + actor_inputs.append(a_inputs) + + actor_inputs = np.stack( + actor_inputs, + axis=2) # shape (ep_num, tr_num, n_agents, actor_input_dim) + return actor_inputs + + def _get_critic_inputs(self, batch): + """ o(t)_a, s(t), u(t)_-a, u(t-1), agent_id + """ + ep_num = batch['o'].shape[0] + tr_num = batch['o'].shape[1] + + # o, o_next, state, state_next + o = batch['o'] # shape (ep_num, tr_num, n_agents, obs_shape) + o_next = np.zeros_like(o) + o_next[:, :-1] = o[:, 1:] + s = batch['s'] # shape (ep_num, tr_num, state_shape) + s_next = np.zeros_like(s) + s_next[:, :-1] = s[:, 1:] + # u_onehot, u_onehot_last shape (ep_num, tr_num, n_agents, n_actions) + u_onehot = batch['u_onehot'] + u_onehot_next = np.zeros_like(u_onehot) + u_onehot_next[:, :-1] = u_onehot[:, 1:] + u_onehot_last = np.zeros_like(u_onehot) + u_onehot_last[:, 1:] = u_onehot[:, :-1] + + critic_inputs = [] + critic_inputs_next = [] + for agent_id in range(self.n_agents): + # get o(t)_a, s(t) + o_a = o[:, :, agent_id] # shape (ep_num, tr_num, obs_shape) + o_a_next = o_next[:, :, agent_id] + s_a = s # shape (ep_num, tr_num, state_shape) + s_a_next = s_next + # get u(t-1) shape (ep_num, tr_num, n_agents * n_actions) + u_all_onehot = u_onehot.reshape((ep_num, tr_num, + self.n_agents * self.n_actions)) + u_all_onehot_next = u_onehot_next.reshape( + (ep_num, tr_num, self.n_agents * self.n_actions)) + u_all_onehot_last = u_onehot_last.reshape( + (ep_num, tr_num, self.n_agents * self.n_actions)) + # get u(t)_-a, set 0 to mask action, shape (ep_num, tr_num, n_agents * n_actions) + u_not_a_onehot = u_all_onehot.copy() + u_not_a_onehot_next = u_all_onehot_next.copy() + m_s = agent_id * self.n_actions # mask start flag + m_e = (agent_id + 1) * self.n_actions # mask end flag + u_not_a_onehot[:, :, m_s:m_e] = 0 + u_not_a_onehot_next[:, :, m_s:m_e] = 0 + # get id onehot, shape (ep_num, tr_num, n_agents) + id_onehot = np.zeros((ep_num, tr_num, self.n_agents)) + id_onehot[:, :, agent_id] = 1. 
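A shape walk-through of `_get_actor_inputs()` above with toy 3m-sized arrays (the 2 episodes and 4 transitions are arbitrary): per agent the input is obs (30) + last-action one-hot (9) + agent-id one-hot (3) = 42 features.

```python
import numpy as np

# Toy-sized check of the actor-input layout built in _get_actor_inputs().
ep_num, tr_num, n_agents, n_actions, obs_shape = 2, 4, 3, 9, 30
obs = np.zeros((ep_num, tr_num, n_agents, obs_shape))
u_onehot = np.zeros((ep_num, tr_num, n_agents, n_actions))

u_onehot_last = np.zeros_like(u_onehot)
u_onehot_last[:, 1:] = u_onehot[:, :-1]           # previous-step actions

actor_inputs = []
for agent_id in range(n_agents):
    id_onehot = np.zeros((ep_num, tr_num, n_agents))
    id_onehot[:, :, agent_id] = 1.
    actor_inputs.append(np.concatenate(
        (obs[:, :, agent_id], u_onehot_last[:, :, agent_id], id_onehot), axis=2))

actor_inputs = np.stack(actor_inputs, axis=2)
print(actor_inputs.shape)  # (2, 4, 3, 42): (ep_num, tr_num, n_agents, actor_input_dim)
```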
+ + # input: o, s, u_-a, u_last, agent_id + # input_next: o_next, s_next, u_-a_next, u, agent_id + # shape (ep_num, tr_num, critic_input_dim) + c_inputs = np.concatenate( + (o_a, s_a, u_not_a_onehot, u_all_onehot_last, id_onehot), + axis=2) + c_inputs_next = np.concatenate( + (o_a_next, s_a_next, u_not_a_onehot_next, u_all_onehot, + id_onehot), + axis=2) + critic_inputs.append(c_inputs) + critic_inputs_next.append(c_inputs_next) + critic_inputs = np.stack(critic_inputs, axis=2) + critic_inputs_next = np.stack(critic_inputs_next, axis=2) + # shape (ep_num, tr_num, n_agents, critic_input_dim) + return critic_inputs, critic_inputs_next + + def _get_avail_transitions_num(self, isover_batch): + """ input: + isover_batch: shape (ep_num, tr_num, 1) + output: + max_tr_num: max avail transitions number in all episodes + """ + ep_num = isover_batch.shape[0] + max_tr_num = 0 + for ep_id in range(ep_num): + for tr_id in range(self.config['episode_limit']): + if isover_batch[ep_id, tr_id, 0] == 1: + if tr_id + 1 >= max_tr_num: + max_tr_num = tr_id + 1 + break + return max_tr_num + + def learn(self, batch, epsilon=None): + """ input: + batch: dict(o, s, u, r, u_onehot, avail_u, padded, isover) + epsilon: e-greedy discount + """ + # different episode has different avail transition length + tr_num = self._get_avail_transitions_num(batch['isover']) + for key in batch.keys(): + # cut batch data's episode length + batch[key] = batch[key][:, :tr_num] + + # get actor input and critic input + batch['actor_inputs'] = self._get_actor_inputs(batch) + batch['critic_inputs'], batch[ + 'critic_inputs_next'] = self._get_critic_inputs(batch) + + # change batch data to torch tensor + for key in batch.keys(): + if key == 'u': + batch[key] = torch.tensor( + batch[key], dtype=torch.long).to(device) + else: + batch[key] = torch.tensor( + batch[key], dtype=torch.float32).to(device) + + self.alg.learn(batch, epsilon) + + if self.train_steps > 0 and self.train_steps % self.config[ + 'target_update_cycle'] == 0: + self.alg.sync_target() + self.train_steps += 1 diff --git a/benchmark/torch/coma/sc2_model.py b/benchmark/torch/coma/sc2_model.py new file mode 100644 index 0000000000000000000000000000000000000000..789436342050bb1fa32e1c2dcf8fa5a008b0894f --- /dev/null +++ b/benchmark/torch/coma/sc2_model.py @@ -0,0 +1,102 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
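For the 3m map sizes used in this benchmark, the per-agent critic input assembled in `_get_critic_inputs()` above works out to 135 features; a small dimension check, with sizes taken from the comments in `sc2_model.py` below and terms listed in the same order as the concatenation in the code:

```python
# Dimension check for the critic inputs built in _get_critic_inputs() (3m map sizes).
state_shape, obs_shape, n_agents, n_actions = 48, 30, 3, 9

critic_input_dim = (obs_shape                 # agent's own observation o(t)_a
                    + state_shape             # global state s(t)
                    + n_agents * n_actions    # other agents' actions u(t)_-a (own slice zeroed)
                    + n_agents * n_actions    # all agents' previous actions u(t-1)
                    + n_agents)               # agent-id one-hot
print(critic_input_dim)  # 30 + 48 + 27 + 27 + 3 = 135
```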
+ +import torch.nn as nn +import torch.nn.functional as F +import numpy as np +import parl + + +class ComaModel(parl.Model): + def __init__(self, config): + super(ComaModel, self).__init__() + self.n_actions = config['n_actions'] + self.n_agents = config['n_agents'] + self.state_shape = config['state_shape'] + self.obs_shape = config['obs_shape'] + + actor_input_dim = self._get_actor_input_dim() + critic_input_dim = self._get_critic_input_dim() + + self.actor_model = ActorModel(actor_input_dim, self.n_actions) + self.critic_model = CriticModel(critic_input_dim, self.n_actions) + + def policy(self, obs, hidden_state): + return self.actor_model.policy(obs, hidden_state) + + def value(self, inputs): + return self.critic_model.value(inputs) + + def get_actor_params(self): + return self.actor_model.parameters() + + def get_critic_params(self): + return self.critic_model.parameters() + + def _get_actor_input_dim(self): + input_shape = self.obs_shape # obs: 30 in 3m map + input_shape += self.n_actions # agent's last action (one_hot): 9 in 3m map + input_shape += self.n_agents # agent's one_hot id: 3 in 3m map + return input_shape # 30 + 9 + 3 = 42 + + def _get_critic_input_dim(self): + input_shape = self.state_shape # state: 48 in 3m map + input_shape += self.obs_shape # obs: 30 in 3m map + input_shape += self.n_agents # agent_id: 3 in 3m map + input_shape += self.n_actions * self.n_agents * 2 # all agents' action and last_action (one-hot): 54 in 3m map + return input_shape # 48 + 30+ 3 = 135 + + +# all agents share one actor network +class ActorModel(parl.Model): + def __init__(self, input_shape, act_dim): + """ input : obs, include the agent's id and last action, shape: (batch, obs_shape + n_action + n_agents) + output: one agent's q(obs, act) + """ + super(ActorModel, self).__init__() + self.hid_size = 64 + + self.fc1 = nn.Linear(input_shape, self.hid_size) + self.rnn = nn.GRUCell(self.hid_size, self.hid_size) + self.fc2 = nn.Linear(self.hid_size, act_dim) + + def init_hidden(self): + # new hidden states + return self.fc1.weight.new(1, self.hid_size).zero_() + + def policy(self, obs, h0): + x = F.relu(self.fc1(obs)) + h1 = h0.reshape(-1, self.hid_size) + h2 = self.rnn(x, h1) + policy = self.fc2(h2) + return policy, h2 + + +class CriticModel(parl.Model): + def __init__(self, input_shape, act_dim): + """ inputs: [ s(t), o(t)_a, u(t)_a, agent_a, u(t-1) ], shape: (Batch, input_shape) + output: Q, shape: (Batch, n_actions) + Batch = ep_num * n_agents + """ + super(CriticModel, self).__init__() + hid_size = 128 + self.fc1 = nn.Linear(input_shape, hid_size) + self.fc2 = nn.Linear(hid_size, hid_size) + self.fc3 = nn.Linear(hid_size, act_dim) + + def value(self, inputs): + hid1 = F.relu(self.fc1(inputs)) + hid2 = F.relu(self.fc2(hid1)) + Q = self.fc3(hid2) + return Q diff --git a/benchmark/torch/coma/starcraft2/Dockerfile b/benchmark/torch/coma/starcraft2/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..176e89448d6689e08c9556ee2fd3bd9259658769 --- /dev/null +++ b/benchmark/torch/coma/starcraft2/Dockerfile @@ -0,0 +1,38 @@ +FROM nvidia/cuda:9.2-cudnn7-devel-ubuntu16.04 +MAINTAINER Tabish Rashid + +# CUDA includes +ENV CUDA_PATH /usr/local/cuda +ENV CUDA_INCLUDE_PATH /usr/local/cuda/include +ENV CUDA_LIBRARY_PATH /usr/local/cuda/lib64 + +# Ubuntu Packages +RUN apt-get update -y && apt-get install software-properties-common -y && \ + add-apt-repository -y multiverse && apt-get update -y && apt-get upgrade -y && \ + apt-get install -y apt-utils nano vim git man 
build-essential wget sudo && \ + rm -rf /var/lib/apt/lists/* + +# Install python3 pip3 +RUN apt-get update +RUN apt-get -y install python3 +RUN apt-get -y install python3-pip +RUN pip3 install --upgrade pip + +#### ------------------------------------------------------------------- +#### install parl +#### ------------------------------------------------------------------- +RUN pip3 install parl + +#### ------------------------------------------------------------------- +#### install SMAC +#### ------------------------------------------------------------------- +RUN pip3 install git+https://github.com/oxwhirl/smac.git + +#### ------------------------------------------------------------------- +#### install pytorch +#### ------------------------------------------------------------------- +RUN pip3 install torch + + +ENV SC2PATH /parl/starcraft2/StarCraftII +WORKDIR /parl diff --git a/benchmark/torch/coma/starcraft2/build_docker.sh b/benchmark/torch/coma/starcraft2/build_docker.sh new file mode 100644 index 0000000000000000000000000000000000000000..ad7a742e1aa0aa347e7dd9755cf8c20597798292 --- /dev/null +++ b/benchmark/torch/coma/starcraft2/build_docker.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +#### ------------------------------------------------------------------- +#### build docker image +#### ------------------------------------------------------------------- +echo 'Building Dockerfile with image name parl-starcraft2:1.0' +docker build -t parl-starcraft2:1.0 . diff --git a/benchmark/torch/coma/starcraft2/install_sc2.sh b/benchmark/torch/coma/starcraft2/install_sc2.sh new file mode 100644 index 0000000000000000000000000000000000000000..75e1599224e36c686b65ddc208cd157616e6d4df --- /dev/null +++ b/benchmark/torch/coma/starcraft2/install_sc2.sh @@ -0,0 +1,54 @@ +#!/bin/bash + +#### ------------------------------------------------------------------- +#### Install StarCraft II +#### ------------------------------------------------------------------- + +if [ -z "$SC2PATH" ]; then + SC2PATH=`pwd`'/StarCraftII' +else + SC2PATH=$SC2PATH'/StarCraftII' +fi + +export SC2PATH=$SC2PATH +echo 'SC2PATH is set to '$SC2PATH + +if [ ! -d $SC2PATH ]; then + echo 'StarCraftII is not installed. Installing now ...' + wget http://blzdistsc2-a.akamaihd.net/Linux/SC2.4.6.2.69232.zip + unzip -P iagreetotheeula SC2.4.6.2.69232.zip + rm -f SC2.4.6.2.69232.zip + echo 'Finished installing StarCraftII' +else + echo 'StarCraftII is already installed.' +fi + +if [ -f $SC2PATH/Libs/libstdc++.so* ]; then + echo 'Successfully installing StarCraft II' +else + echo 'Fail to install StarCraft II !' + exit 1 +fi + + +#### ------------------------------------------------------------------- +#### Add the custom maps +#### ------------------------------------------------------------------- + +echo 'Adding SMAC maps.' +MAP_DIR="$SC2PATH/Maps" +echo 'MAP_DIR is set to '$MAP_DIR +mkdir -p $MAP_DIR + +wget https://github.com/oxwhirl/smac/releases/download/v0.1-beta1/SMAC_Maps.zip +unzip SMAC_Maps.zip +mv SMAC_Maps $MAP_DIR +rm -f SMAC_Maps.zip +cp $MAP_DIR/SMAC_Maps/3m.SC2Map ./ + +if [ -f $MAP_DIR/SMAC_Maps/3m.SC2Map ]; then + echo 'Successfully adding custom maps' +else + echo 'Fail to add maps !' + exit 1 +fi diff --git a/benchmark/torch/coma/train.py b/benchmark/torch/coma/train.py new file mode 100644 index 0000000000000000000000000000000000000000..59f4f71205870431dcf4372843b5182023c3a927 --- /dev/null +++ b/benchmark/torch/coma/train.py @@ -0,0 +1,225 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from smac.env import StarCraft2Env +import numpy as np +import os +from sc2_model import ComaModel +from sc2_agent import Agents +from parl.algorithms import COMA +from parl.utils import tensorboard + + +def run_episode(env, agents, config, test=False): + o, u, r, s, avail_u, u_onehot, isover, padded = [], [], [], [], [], [], [], [] + env.reset() + done = False + step = 0 + ep_reward = 0 + last_act = [0 for _ in range(config['n_agents'])] + agents.init_hidden() # init rnn h0 for all agents + + while not done: + obs = env.get_obs() + state = env.get_state() + acts, avail_acts, acts_onehot = [], [], [] + + for agent_id in range(config['n_agents']): + avail_act = env.get_avail_agent_actions(agent_id) + + # action + epsilon = 0 if test else config['epsilon'] + act = agents.sample(obs[agent_id], last_act[agent_id], agent_id, + avail_act, epsilon, test) + last_act[agent_id] = act + + # action one-hot + act_onehot = np.zeros(config['n_actions']) + act_onehot[act] = 1 + acts.append(act) + acts_onehot.append(act_onehot) + avail_acts.append(avail_act) + + # step + reward, done, _ = env.step(acts) + + if step == config['episode_limit'] - 1: + done = 1 + + o.append(obs) + s.append(state) + u.append(np.reshape(acts, [config['n_agents'], 1])) + u_onehot.append(acts_onehot) + avail_u.append(avail_acts) + r.append([reward]) + isover.append([done]) + padded.append([0.]) # 0: no padded, 1: padded + + ep_reward += reward + step += 1 + + # fill trainsition len to episode_limit + for _ in range(step, config['episode_limit']): + # shape: (config['episode_limit'], n_agents, shape) + o.append(np.zeros((config['n_agents'], config['obs_shape']))) + s.append(np.zeros(config['state_shape'])) + u.append(np.zeros([config['n_agents'], 1])) + u_onehot.append(np.zeros((config['n_agents'], config['n_actions']))) + avail_u.append(np.zeros((config['n_agents'], config['n_actions']))) + # shape: (config['episode_limit'], 1) + r.append([0.]) + padded.append([1.]) + isover.append([1.]) + + ep_data = dict( + o=o.copy(), + s=s.copy(), + u=u.copy(), + r=r.copy(), + avail_u=avail_u.copy(), + u_onehot=u_onehot.copy(), + padded=padded.copy(), + isover=isover.copy()) + + # add an additional dimension at axis 0 for each item + for key in ep_data.keys(): + # each items shape: (1, trainsition_num, n_agents, own_shape) + ep_data[key] = np.array([ep_data[key]]) + + return ep_data, ep_reward + + +def run(env, agents, config): + win_rates = [] + episode_rewards = [] + train_steps = 0 + for epoch in range(config['n_epoch']): + print('train epoch {}'.format(epoch)) + # decay epsilon at the begging of each epoch + if config['epsilon'] > config['min_epsilon']: + config['epsilon'] -= config['anneal_epsilon'] + + # run n episode(s) + ep_data_list = [] + for _ in range(config['n_episodes']): + ep_data, _ = run_episode(env, agents, config, test=False) + ep_data_list.append(ep_data) + # each item in ep_batch shape: (episode_num, trainsition_num, n_agents, item_shape) + ep_batch = ep_data_list[0] + 
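+ # merge the remaining episodes into ep_batch along axis 0 (the episode axis);
+ # agents.learn() later cuts this batch to the longest finished episode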
ep_data_list.pop(0) + for ep_data in ep_data_list: + for key in ep_batch.keys(): + ep_batch[key] = np.concatenate((ep_batch[key], ep_data[key]), + axis=0) + + # learn + agents.learn(ep_batch, config['epsilon']) + train_steps += 1 + + # save model + if train_steps > 0 and train_steps % config['save_cycle'] == 0: + model_path = config['model_dir'] + '/coma_' + str( + train_steps) + '.ckpt' + agents.save(save_path=model_path) + print('save model: ', model_path) + + # test + if epoch % config['test_cycle'] == 0: + win_rate, ep_mean_reward = test(env, agents, config) + # print('win_rate is ', win_rate) + win_rates.append(win_rate) + episode_rewards.append(ep_mean_reward) + tensorboard.add_scalar('win_rate', win_rates[-1], len(win_rates)) + tensorboard.add_scalar('episode_rewards', episode_rewards[-1], + len(episode_rewards)) + print('win_rate', win_rates, len(win_rates)) + print('episode_rewards', episode_rewards, len(episode_rewards)) + + +def test(env, agents, config): + win_number = 0 + episode_rewards = 0 + for ep_id in range(config['test_episode_n']): + _, ep_reward = run_episode(env, agents, config, test=True) + episode_rewards += ep_reward + if ep_reward > config['threshold']: + win_number += 1 + return win_number / config['test_episode_n'], episode_rewards / config[ + 'test_episode_n'] + + +def test_by_sparse_reward(agents, config): + env = StarCraft2Env( + map_name=config['map'], + difficulty=config['difficulty'], + seed=config['env_seed'], + replay_dir=config['replay_dir'], + reward_sparse=True, # Receive 1/-1 reward for winning/loosing an episode + reward_scale=False) + win_number = 0 + for ep_id in range(config['test_episode_n']): + _, ep_reward = run_episode(env, agents, config, test=True) + result = 'win' if ep_reward > 0 else 'defeat' + print('Episode {}: {}'.format(ep_id, result)) + if ep_reward > 0: + win_number += 1 + env.close() + win_rate = win_number / config['test_episode_n'] + print('The win rate of coma is {}'.format(win_rate)) + return win_rate + + +def main(config): + env = StarCraft2Env( + map_name=config['map'], + seed=config['env_seed'], + difficulty=config['difficulty'], + replay_dir=config['replay_dir']) + env_info = env.get_env_info() + + config['n_actions'] = env_info['n_actions'] + config['n_agents'] = env_info['n_agents'] + config['state_shape'] = env_info['state_shape'] + config['obs_shape'] = env_info['obs_shape'] + config['episode_limit'] = env_info['episode_limit'] + + model = ComaModel(config=config) + algorithm = COMA( + model, + n_actions=config['n_actions'], + n_agents=config['n_agents'], + grad_norm_clip=config['grad_norm_clip'], + actor_lr=config['actor_lr'], + critic_lr=config['critic_lr'], + gamma=config['gamma'], + td_lambda=config['td_lambda']) + agents = Agents(algorithm, config) + + # restore model here + model_file = config['model_dir'] + '/coma.ckpt' + if config['restore'] and os.path.isfile(model_file): + agents.restore(model_file) + print('model loaded: ', model_file) + + if config['test']: + test_by_sparse_reward(agents, config) + else: + run(env, agents, config) + + env.close() + + +if __name__ == '__main__': + from coma_config import config + main(config) diff --git a/parl/algorithms/__init__.py b/parl/algorithms/__init__.py index 8565455c374db1e16e878b3977f3c8f2f7c5557d..20c3d3d467cf50b4fed42a7d1b37671b107f1e4b 100644 --- a/parl/algorithms/__init__.py +++ b/parl/algorithms/__init__.py @@ -13,13 +13,8 @@ # limitations under the License. 
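To evaluate a saved model with the sparse-reward test above instead of training, the flags in `coma_config.py` can be flipped; a usage sketch (it assumes a checkpoint has been copied to `./model/coma.ckpt`, since `run()` saves checkpoints as `coma_<train_steps>.ckpt`):

```python
# Sketch: run the sparse-reward evaluation path in main() on a saved model.
from coma_config import config
from train import main

config['restore'] = True   # main() loads ./model/coma.ckpt if it exists
config['test'] = True      # run test_by_sparse_reward() instead of training
main(config)
```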
from parl.utils.utils import _HAS_FLUID, _HAS_TORCH -from parl.utils import logger if _HAS_FLUID: from parl.algorithms.fluid import * elif _HAS_TORCH: from parl.algorithms.torch import * -else: - logger.warning( - "No deep learning framework was found, but it's ok for parallel computation." - ) diff --git a/parl/algorithms/torch/__init__.py b/parl/algorithms/torch/__init__.py index 9de7afbdd57305b1280b024556e0b1730bcbc494..86d0386b0de6c9c0f0eb766ea9c15b34ac8084cd 100644 --- a/parl/algorithms/torch/__init__.py +++ b/parl/algorithms/torch/__init__.py @@ -16,5 +16,6 @@ from parl.algorithms.torch.ddqn import * from parl.algorithms.torch.dqn import * from parl.algorithms.torch.a2c import * from parl.algorithms.torch.td3 import * +from parl.algorithms.torch.coma import * from parl.algorithms.torch.ppo import * from parl.algorithms.torch.policy_gradient import * diff --git a/parl/algorithms/torch/coma.py b/parl/algorithms/torch/coma.py new file mode 100644 index 0000000000000000000000000000000000000000..f8cc6bffa076ec3c29a5eea6cfa6857c93c8a1a3 --- /dev/null +++ b/parl/algorithms/torch/coma.py @@ -0,0 +1,290 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import os +from copy import deepcopy +import parl +import numpy as np + +device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + +__all__ = ['COMA'] + + +class COMA(parl.Algorithm): + def __init__(self, + model, + n_actions, + n_agents, + grad_norm_clip=None, + actor_lr=None, + critic_lr=None, + gamma=None, + td_lambda=None): + """ COMA algorithm + + Args: + model (parl.Model): forward network of actor and critic. 
+ n_actions (int): action dim for each agent + n_agents (int): agents number + grad_norm_clip (int or float): gradient clip, prevent gradient explosion + actor_lr (float): actor network learning rate + critic_lr (float): critic network learning rate + gamma (float): discounted factor for reward computation + td_lambda (float): lambda of td-lambda return + """ + assert isinstance(n_actions, int) + assert isinstance(n_agents, int) + assert isinstance(grad_norm_clip, int) or isinstance( + grad_norm_clip, float) + assert isinstance(actor_lr, float) + assert isinstance(critic_lr, float) + assert isinstance(gamma, float) + assert isinstance(td_lambda, float) + + self.n_actions = n_actions + self.n_agents = n_agents + self.grad_norm_clip = grad_norm_clip + self.actor_lr = actor_lr + self.critic_lr = critic_lr + self.gamma = gamma + self.td_lambda = td_lambda + + self.model = model.to(device) + self.target_model = deepcopy(model).to(device) + + self.sync_target() + + self.actor_parameters = list(self.model.get_actor_params()) + self.critic_parameters = list(self.model.get_critic_params()) + + self.critic_optimizer = torch.optim.RMSprop( + self.critic_parameters, lr=self.critic_lr) + self.actor_optimizer = torch.optim.RMSprop( + self.actor_parameters, lr=self.actor_lr) + + self.train_rnn_h = None + + def init_hidden(self, ep_num): + """ function: init a hidden tensor for every agent + input: + ep_num: How many episodes are included in a batch of data + output: + rnn_h: rnn hidden state, shape (ep_num, n_agents, hidden_size) + """ + assert hasattr(self.model.actor_model, 'init_hidden'), \ + "actor must have rnn structure and has method 'init_hidden' to make hidden states" + rnn_h = self.model.actor_model.init_hidden().unsqueeze(0).expand( + ep_num, self.n_agents, -1) + return rnn_h + + def predict(self, obs, rnn_h_in): + """input: + obs: obs + last_action + agent_id, shape: (1, obs_shape + n_actions + n_agents) + rnn_h_in: rnn's hidden input + output: + prob: output of actor, shape: (1, n_actions) + rnn_h_out: rnn's hidden output + """ + with torch.no_grad(): + policy_logits, rnn_h_out = self.model.policy( + obs, rnn_h_in) # input obs shape [1, 42] + prob = torch.nn.functional.softmax( + policy_logits, dim=-1) # shape [1, 9] + return prob, rnn_h_out + + def _get_critic_output(self, batch): + """ input: + batch: dict(o, s, u, r, u_onehot, avail_u, padded, isover, actor_inputs, critic_inputs) + output: + q_evals and q_targets: shape (ep_num, tr_num, n_agents, n_actions) + """ + ep_num = batch['r'].shape[0] + tr_num = batch['r'].shape[1] + critic_inputs = batch['critic_inputs'] + critic_inputs_next = batch['critic_inputs_next'] + + critic_inputs = critic_inputs.reshape((ep_num * tr_num * self.n_agents, + -1)) + critic_inputs_next = critic_inputs.reshape( + (ep_num * tr_num * self.n_agents, -1)) + + q_evals = self.model.value(critic_inputs) + q_targets = self.model.value(critic_inputs_next) + + q_evals = q_evals.reshape((ep_num, tr_num, self.n_agents, -1)) + q_targets = q_targets.reshape((ep_num, tr_num, self.n_agents, -1)) + return q_evals, q_targets + + def _get_actor_output(self, batch, epsilon): + """ input: + batch: dict(o, s, u, r, u_onehot, avail_u, padded, isover, actor_inputs, critic_inputs) + epsilon: noise discount factor + output: + action_prob: probability of actions, shape (ep_num, tr_num, n_agents, n_actions) + """ + ep_num = batch['r'].shape[0] + tr_num = batch['r'].shape[1] + avail_actions = batch['avail_u'] + actor_inputs = batch['actor_inputs'] + action_prob = [] + for tr_id in 
range(tr_num): + inputs = actor_inputs[:, + tr_id] # shape (ep_num, n_agents, actor_input_dim) + inputs = inputs.reshape( + (-1, inputs.shape[-1])) # shape (-1, actor_input_dim) + policy_logits, self.train_rnn_h = self.model.policy( + inputs, self.train_rnn_h) + # policy_logits shape from (-1, n_actions) to (ep_num, n_agents, n_actions) + policy_logits = policy_logits.view(ep_num, self.n_agents, -1) + prob = torch.nn.functional.softmax(policy_logits, dim=-1) + action_prob.append(prob) + + action_prob = torch.stack( + action_prob, + dim=1).to(device) # shape: (ep_num, tr_num, n_agents, n_actions) + action_num = avail_actions.sum() # how many actions are available + action_prob = ((1 - epsilon) * action_prob + + torch.ones_like(action_prob) * epsilon / action_num) + action_prob[avail_actions == 0] = 0.0 # set avail action + + action_prob = action_prob / action_prob.sum( + dim=-1, keepdim=True) # in case action_prob.sum != 1 + action_prob[avail_actions == 0] = 0.0 + action_prob = action_prob.to(device) + return action_prob + + def _cal_td_target(self, batch, q_targets): # compute TD(lambda) + """ input: + batch: dict(o, s, u, r, u_onehot, avail_u, padded, isover, actor_inputs, critic_inputs) + q_targets: Q value of target critic network, shape (ep_num, tr_num, n_agents) + output: + lambda_return: TD lambda return, shape (ep_num, tr_num, n_agents) + """ + ep_num = batch['r'].shape[0] + tr_num = batch['r'].shape[1] + mask = (1 - batch['padded'].float()).repeat(1, 1, + self.n_agents).to(device) + isover = (1 - batch['isover'].float()).repeat(1, 1, self.n_agents).to( + device) # used for setting last transition's q_target to 0 + # reshape reward: from (ep_num, tr_num, 1) to (ep_num, tr_num, n_agents) + r = batch['r'].repeat((1, 1, self.n_agents)).to(device) + # compute n_step_return + n_step_return = torch.zeros((ep_num, tr_num, self.n_agents, + tr_num)).to(device) + for tr_id in range(tr_num - 1, -1, -1): + n_step_return[:, tr_id, :, 0] = ( + r[:, tr_id] + self.gamma * q_targets[:, tr_id] * + isover[:, tr_id]) * mask[:, tr_id] + for n in range(1, tr_num - tr_id): + n_step_return[:, tr_id, :, n] = ( + r[:, tr_id] + self.gamma * + n_step_return[:, tr_id + 1, :, n - 1]) * mask[:, tr_id] + + lambda_return = torch.zeros((ep_num, tr_num, self.n_agents)).to(device) + for tr_id in range(tr_num): + returns = torch.zeros((ep_num, self.n_agents)).to(device) + for n in range(1, tr_num - tr_id): + returns += pow(self.td_lambda, + n - 1) * n_step_return[:, tr_id, :, n - 1] + lambda_return[:, tr_id] = (1 - self.td_lambda) * returns + \ + pow(self.td_lambda, tr_num - tr_id - 1) * \ + n_step_return[:, tr_id, :, tr_num - tr_id - 1] + return lambda_return + + def _critic_learn(self, batch): + """ input: + batch: dict(o, s, u, r, u_onehot, avail_u, padded, isover, actor_inputs, critic_inputs) + output: + q_values: Q value of eval critic network, shape (ep_num, tr_num, n_agents, n_actions) + """ + u = batch['u'] # shape (ep_num, tr_num, agent, n_actions) + u_next = torch.zeros_like(u, dtype=torch.long) + u_next[:, :-1] = u[:, 1:] + mask = (1 - batch['padded'].float()).repeat(1, 1, + self.n_agents).to(device) + + # get q value for every agent and every action, shape (ep_num, tr_num, n_agents, n_actions) + q_evals, q_next_target = self._get_critic_output(batch) + q_values = q_evals.clone() # used for function return + + # get q valur for every agent + q_evals = torch.gather(q_evals, dim=3, index=u).squeeze(3) + q_next_target = torch.gather( + q_next_target, dim=3, index=u_next).squeeze(3) + + targets = 
self._cal_td_target(batch, q_next_target) + + td_error = targets.detach() - q_evals + masked_td_error = mask * td_error # mask padded data + + loss = (masked_td_error** + 2).sum() / mask.sum() # mask.sum: avail transition num + + self.critic_optimizer.zero_grad() + loss.backward() + torch.nn.utils.clip_grad_norm_(self.critic_parameters, + self.grad_norm_clip) + self.critic_optimizer.step() + return q_values + + def _actor_learn(self, batch, epsilon, q_values): + """ input: + batch: dict(o, s, u, r, u_onehot, avail_u, padded, isover, actor_inputs, critic_inputs) + epsilon (float): e-greedy discount + q_values: Q value of eval critic network, shape (ep_num, tr_num, n_agents, n_actions) + """ + action_prob = self._get_actor_output(batch, epsilon) # prob of u + + # mask: used to compute TD-error, filling data should not affect learning + u = batch['u'] + mask = (1 - batch['padded'].float()).repeat(1, 1, self.n_agents).to( + device) # shape (ep_num, tr_num, 3) + + q_taken = torch.gather(q_values, dim=3, index=u).squeeze(3) # Q(u_a) + pi_taken = torch.gather( + action_prob, dim=3, + index=u).squeeze(3) # prob of act that agent a choosen + pi_taken[mask == 0] = 1.0 # prevent log overflow + log_pi_taken = torch.log(pi_taken) + + # advantage + baseline = (q_values * action_prob).sum( + dim=3, keepdim=True).squeeze(3).detach() + advantage = (q_taken - baseline).detach() + loss = -((advantage * log_pi_taken) * mask).sum() / mask.sum() + self.actor_optimizer.zero_grad() + loss.backward() + torch.nn.utils.clip_grad_norm_(self.actor_parameters, + self.grad_norm_clip) + self.actor_optimizer.step() + + def learn(self, batch, epsilon): + """ input: + batch: dict(o, s, u, r, u_onehot, avail_u, padded, isover, actor_inputs, critic_inputs) + epsilon (float): e-greedy discount + """ + ep_num = batch['r'].shape[0] + self.train_rnn_h = self.init_hidden(ep_num) + self.train_rnn_h = self.train_rnn_h.to(device) + + q_values = self._critic_learn(batch) + self._actor_learn(batch, epsilon, q_values) + + def sync_target(self, decay=0): + for param, target_param in zip(self.model.parameters(), + self.target_model.parameters()): + target_param.data.copy_((1 - decay) * param.data + + decay * target_param.data)
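Finally, a toy illustration of the counterfactual baseline computed in `_actor_learn()` above: the critic's Q values are marginalised over the agent's own current policy, so the advantage measures how much better the chosen action is than the policy's average action while the other agents' actions stay fixed (the numbers below are made up, for a single agent and timestep):

```python
import torch

# Toy counterfactual-advantage computation for one agent with 3 actions.
q_values = torch.tensor([[2.0, 0.0, 1.0]])       # critic Q(s, u_-a, .) per action (made-up)
action_prob = torch.tensor([[0.5, 0.25, 0.25]])  # actor probabilities (made-up)
u = torch.tensor([[0]])                          # the action that was actually taken

q_taken = torch.gather(q_values, dim=1, index=u).squeeze(1)  # 2.0
baseline = (q_values * action_prob).sum(dim=1)               # 2*0.5 + 0*0.25 + 1*0.25 = 1.25
advantage = q_taken - baseline                               # 0.75
print(q_taken.item(), baseline.item(), advantage.item())
```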