Unverified commit 0e174277 authored by Bo Zhou, committed by GitHub

remove previous algorithms; use the forward function in parl.Model (#96)

* remove previous algorithms; use the forward function in parl.Model

* remove redundant lines

* yapf
Parent 6efa7871
......@@ -16,7 +16,6 @@ Mean episode reward in training process after 10 million sample steps.
## How to use
### Dependencies
+ python2.7 or python3.5+
+ [paddlepaddle>=1.3.0](https://github.com/PaddlePaddle/Paddle)
+ [parl](https://github.com/PaddlePaddle/PARL)
+ gym
......
......@@ -16,7 +16,6 @@ Results with one learner (in a P40 GPU) and 24 simulators (in 12 CPU) in 10 mill
## How to use
### Dependencies
+ python2.7 or python3.5+
+ [paddlepaddle>=1.3.0](https://github.com/PaddlePaddle/Paddle)
+ [parl](https://github.com/PaddlePaddle/PARL)
+ gym
......
......@@ -20,7 +20,6 @@ Result with one learner (in a P40 GPU) and 32 actors (in 32 CPUs).
## How to use
### Dependencies
+ python2.7 or python3.5+
+ [paddlepaddle>=1.3.0](https://github.com/PaddlePaddle/Paddle)
+ [parl](https://github.com/PaddlePaddle/PARL)
+ gym
......
......@@ -4,7 +4,6 @@ Train an agent with PARL to solve the CartPole problem, a classical benchmark in
## How to use
### Dependencies:
+ python2.7 or python3.5+
+ [paddlepaddle>=1.0.0](https://github.com/PaddlePaddle/Paddle)
+ [parl](https://github.com/PaddlePaddle/PARL)
+ gym
......
......@@ -18,7 +18,7 @@ import parl
from parl import layers
class CartpoleAgent(Agent):
class CartpoleAgent(parl.Agent):
def __init__(self, algorithm, obs_dim, act_dim):
self.obs_dim = obs_dim
self.act_dim = act_dim
......
......@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid as fluid
import parl
from parl import layers
......@@ -25,7 +24,7 @@ class CartpoleModel(parl.Model):
self.fc1 = layers.fc(size=hid1_size, act='tanh')
self.fc2 = layers.fc(size=act_dim, act='softmax')
def policy(self, obs):
def forward(self, obs):
out = self.fc1(obs)
out = self.fc2(out)
return out
......@@ -53,7 +53,9 @@ def main():
for i in range(1000):
obs_list, action_list, reward_list = run_episode(env, agent)
logger.info("Episode {}, Reward Sum {}.".format(i, sum(reward_list)))
if i % 10 == 0:
logger.info("Episode {}, Reward Sum {}.".format(
i, sum(reward_list)))
batch_obs = np.array(obs_list)
batch_action = np.array(action_list)
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid as fluid
import parl.layers as layers
from parl.framework.algorithm_base import Algorithm
from parl.framework.policy_distribution import CategoricalDistribution
__all__ = ['A3C']
class A3C(Algorithm):
def __init__(self, model, hyperparas):
super(A3C, self).__init__(model, hyperparas)
def learn(self, obs, actions, advantages, target_values, learning_rate,
entropy_coeff):
"""
Args:
obs: A float32 tensor of shape ([B] + observation_space).
E.g. [B, C, H, W] in atari.
actions: An int64 tensor of shape [B].
advantages: A float32 tensor of shape [B].
target_values: A float32 tensor of shape [B].
learning_rate: float scalar of learning rate.
entropy_coeff: float scalar of entropy coefficient.
"""
logits = self.model.policy(obs)
policy_distribution = CategoricalDistribution(logits)
actions_log_probs = policy_distribution.logp(actions)
# The policy gradient loss
pi_loss = -1.0 * layers.reduce_sum(actions_log_probs * advantages)
# The value function loss
values = self.model.value(obs)
delta = values - target_values
vf_loss = 0.5 * layers.reduce_sum(layers.square(delta))
# The entropy loss (we want to maximize entropy, so entropy_coeff < 0)
policy_entropy = policy_distribution.entropy()
entropy = layers.reduce_sum(policy_entropy)
total_loss = (pi_loss + vf_loss * self.hp['vf_loss_coeff'] +
entropy * entropy_coeff)
fluid.clip.set_gradient_clip(
clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=40.0))
optimizer = fluid.optimizer.AdamOptimizer(learning_rate)
optimizer.minimize(total_loss)
return total_loss, pi_loss, vf_loss, entropy
def sample(self, obs):
"""
Args:
obs: A float32 tensor of shape ([B] + observation_space).
E.g. [B, C, H, W] in atari.
"""
logits, values = self.model.policy_and_value(obs)
policy_dist = CategoricalDistribution(logits)
sample_actions = policy_dist.sample()
return sample_actions, values
def predict(self, obs):
"""
Args:
obs: A float32 tensor of shape ([B] + observation_space).
E.g. [B, C, H, W] in atari.
"""
logits = self.model.policy(obs)
probs = layers.softmax(logits)
predict_actions = layers.argmax(probs, 1)
return predict_actions
def value(self, obs):
"""
Args:
obs: A float32 tensor of shape ([B] + observation_space).
E.g. [B, C, H, W] in atari.
"""
values = self.model.value(obs)
return values
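For reference, the total loss assembled in `learn` above combines three terms. The NumPy sketch below is illustrative only, not the fluid graph; the default coefficient values are assumptions (`vf_loss_coeff` is normally read from the hyperparameters).

```python
import numpy as np

def a3c_total_loss(log_probs, advantages, values, target_values, policy_entropy,
                   vf_loss_coeff=0.5, entropy_coeff=-0.01):
    pi_loss = -1.0 * np.sum(log_probs * advantages)        # policy gradient term
    vf_loss = 0.5 * np.sum((values - target_values) ** 2)  # value function term
    entropy = np.sum(policy_entropy)                       # entropy_coeff < 0, so entropy is maximized
    return pi_loss + vf_loss * vf_loss_coeff + entropy * entropy_coeff
```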
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import parl.layers as layers
from copy import deepcopy
from paddle import fluid
from parl.framework.algorithm_base import Algorithm
__all__ = ['DDPG']
class DDPG(Algorithm):
def __init__(self, model, hyperparas):
""" model: should implement the function get_actor_params()
"""
Algorithm.__init__(self, model, hyperparas)
self.model = model
self.target_model = deepcopy(model)
# fetch hyper parameters
self.gamma = hyperparas['gamma']
self.tau = hyperparas['tau']
self.actor_lr = hyperparas['actor_lr']
self.critic_lr = hyperparas['critic_lr']
def define_predict(self, obs):
""" use actor model of self.model to predict the action
"""
return self.model.policy(obs)
def define_learn(self, obs, action, reward, next_obs, terminal):
""" update actor and critic model with DDPG algorithm
"""
actor_cost = self._actor_learn(obs)
critic_cost = self._critic_learn(obs, action, reward, next_obs,
terminal)
return actor_cost, critic_cost
def _actor_learn(self, obs):
action = self.model.policy(obs)
Q = self.model.value(obs, action)
cost = layers.reduce_mean(-1.0 * Q)
optimizer = fluid.optimizer.AdamOptimizer(self.actor_lr)
optimizer.minimize(cost, parameter_list=self.model.get_actor_params())
return cost
def _critic_learn(self, obs, action, reward, next_obs, terminal):
next_action = self.target_model.policy(next_obs)
next_Q = self.target_model.value(next_obs, next_action)
terminal = layers.cast(terminal, dtype='float32')
target_Q = reward + (1.0 - terminal) * self.gamma * next_Q
target_Q.stop_gradient = True
Q = self.model.value(obs, action)
cost = layers.square_error_cost(Q, target_Q)
cost = layers.reduce_mean(cost)
optimizer = fluid.optimizer.AdamOptimizer(self.critic_lr)
optimizer.minimize(cost)
return cost
def sync_target(self, gpu_id, decay=None):
if decay is None:
decay = 1.0 - self.tau
self.model.sync_params_to(
self.target_model, gpu_id=gpu_id, decay=decay)
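The critic target built in `_critic_learn` is the standard one-step bootstrapped return using the target networks. A NumPy sketch, illustrative only (the `gamma` value is an assumption):

```python
import numpy as np

def ddpg_critic_target(reward, terminal, next_q, gamma=0.99):
    # target_Q = r + (1 - done) * gamma * Q_target(s', pi_target(s')); gradients are stopped in the graph
    return reward + (1.0 - terminal.astype(np.float32)) * gamma * next_q
```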
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid as fluid
from parl.framework.algorithm_base import Algorithm
import parl.layers as layers
import copy
__all__ = ['DQN']
class DQN(Algorithm):
def __init__(self, model, hyperparas):
Algorithm.__init__(self, model, hyperparas)
self.model = model
self.target_model = copy.deepcopy(model)
# fetch hyper parameters
self.action_dim = hyperparas['action_dim']
self.gamma = hyperparas['gamma']
self.lr = hyperparas['lr']
def define_predict(self, obs):
""" use value model self.model to predict the action value
"""
return self.model.value(obs)
def define_learn(self, obs, action, reward, next_obs, terminal):
""" update value model self.model with DQN algorithm
"""
pred_value = self.model.value(obs)
next_pred_value = self.target_model.value(next_obs)
best_v = layers.reduce_max(next_pred_value, dim=1)
best_v.stop_gradient = True
target = reward + (
1.0 - layers.cast(terminal, dtype='float32')) * self.gamma * best_v
action_onehot = layers.one_hot(action, self.action_dim)
action_onehot = layers.cast(action_onehot, dtype='float32')
pred_action_value = layers.reduce_sum(
layers.elementwise_mul(action_onehot, pred_value), dim=1)
cost = layers.square_error_cost(pred_action_value, target)
cost = layers.reduce_mean(cost)
optimizer = fluid.optimizer.Adam(self.lr, epsilon=1e-3)
optimizer.minimize(cost)
return cost
def sync_target(self, gpu_id):
""" sync parameters of self.target_model with self.model
"""
self.model.sync_params_to(self.target_model, gpu_id=gpu_id)
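Likewise, the target and the gather of Q(s, a) in `define_learn` reduce to the following NumPy sketch (illustrative only; `gamma` is an example value):

```python
import numpy as np

def dqn_target(reward, terminal, next_q_values, gamma=0.99):
    best_v = next_q_values.max(axis=1)                      # max_a' Q_target(s', a')
    return reward + (1.0 - terminal.astype(np.float32)) * gamma * best_v

def q_of_taken_action(q_values, action):
    onehot = np.eye(q_values.shape[1])[action]              # one_hot(action, action_dim)
    return (q_values * onehot).sum(axis=1)                  # reduce_sum(onehot * Q, dim=1)
```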
......@@ -54,7 +54,7 @@ class PolicyGradient(Algorithm):
def predict(self, obs):
""" use policy model self.model to predict the action probability
"""
return self.model.policy(obs)
return self.model(obs)
@deprecated(
deprecated_in='1.2', removed_in='1.3', replace_function='learn')
......@@ -66,7 +66,7 @@ class PolicyGradient(Algorithm):
def learn(self, obs, action, reward):
""" update policy model self.model with policy gradient algorithm
"""
act_prob = self.model.policy(obs)
act_prob = self.model(obs)
log_prob = layers.cross_entropy(act_prob, action)
cost = log_prob * reward
cost = layers.reduce_mean(cost)
......
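The hunk above shows the point of this commit: the algorithm calls the model directly (`self.model(obs)`), which dispatches to `parl.Model.forward`, instead of a dedicated `policy` method. The loss itself is plain REINFORCE; a NumPy sketch, illustrative only:

```python
import numpy as np

def reinforce_loss(act_prob, action, reward):
    # cross_entropy(act_prob, action) = -log(probability of the taken action)
    log_prob = -np.log(act_prob[np.arange(len(action)), action])
    return np.mean(log_prob * reward)
```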
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from parl.algorithms.impala.impala import *
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid as fluid
import parl.layers as layers
from parl.algorithms.impala import vtrace
from parl.framework.algorithm_base import Algorithm
from parl.framework.policy_distribution import CategoricalDistribution
from parl.plutils import inverse
__all__ = ['IMPALA']
class VTraceLoss(object):
def __init__(self,
behaviour_actions_log_probs,
target_actions_log_probs,
policy_entropy,
dones,
discount,
rewards,
values,
bootstrap_value,
entropy_coeff=-0.01,
vf_loss_coeff=0.5,
clip_rho_threshold=1.0,
clip_pg_rho_threshold=1.0):
"""Policy gradient loss with vtrace importance weighting.
VTraceLoss takes tensors of shape [T, B, ...], where `B` is the
batch_size. The reason we need to know `B` is for V-trace to properly
handle episode cut boundaries.
Args:
behaviour_actions_log_probs: A float32 tensor of shape [T, B].
target_actions_log_probs: A float32 tensor of shape [T, B].
policy_entropy: A float32 tensor of shape [T, B].
dones: A float32 tensor of shape [T, B].
discount: A float32 scalar.
rewards: A float32 tensor of shape [T, B].
values: A float32 tensor of shape [T, B].
bootstrap_value: A float32 tensor of shape [B].
"""
self.vtrace_returns = vtrace.from_importance_weights(
behaviour_actions_log_probs=behaviour_actions_log_probs,
target_actions_log_probs=target_actions_log_probs,
discounts=inverse(dones) * discount,
rewards=rewards,
values=values,
bootstrap_value=bootstrap_value,
clip_rho_threshold=clip_rho_threshold,
clip_pg_rho_threshold=clip_pg_rho_threshold)
# The policy gradient loss
self.pi_loss = -1.0 * layers.reduce_sum(
target_actions_log_probs * self.vtrace_returns.pg_advantages)
# The baseline loss
delta = values - self.vtrace_returns.vs
self.vf_loss = 0.5 * layers.reduce_sum(layers.square(delta))
# The entropy loss (we want to maximize entropy, so entropy_coeff < 0)
self.entropy = layers.reduce_sum(policy_entropy)
# The summed weighted loss
self.total_loss = (self.pi_loss + self.vf_loss * vf_loss_coeff +
self.entropy * entropy_coeff)
class IMPALA(Algorithm):
def __init__(self, model, hyperparas):
super(IMPALA, self).__init__(model, hyperparas)
def learn(self, obs, actions, behaviour_logits, rewards, dones,
learning_rate, entropy_coeff):
"""
Args:
obs: A float32 tensor of shape ([B] + observation_space).
E.g. [B, C, H, W] in atari.
actions: An int64 tensor of shape [B].
behaviour_logits: A float32 tensor of shape [B, NUM_ACTIONS].
rewards: A float32 tensor of shape [B].
dones: A float32 tensor of shape [B].
learning_rate: float scalar of learning rate.
entropy_coeff: float scalar of entropy coefficient.
"""
values = self.model.value(obs)
target_logits = self.model.policy(obs)
target_policy_distribution = CategoricalDistribution(target_logits)
behaviour_policy_distribution = CategoricalDistribution(
behaviour_logits)
policy_entropy = target_policy_distribution.entropy()
target_actions_log_probs = target_policy_distribution.logp(actions)
behaviour_actions_log_probs = behaviour_policy_distribution.logp(
actions)
# Calculating kl for debug
kl = target_policy_distribution.kl(behaviour_policy_distribution)
kl = layers.reduce_mean(kl)
"""
Split the tensor into batches at known episode cut boundaries.
[B * T] -> [T, B]
"""
T = self.hp["sample_batch_steps"]
def split_batches(tensor):
B = tensor.shape[0] // T
split_tensor = layers.reshape(tensor,
[B, T] + list(tensor.shape[1:]))
# transpose B and T
return layers.transpose(
split_tensor, [1, 0] + list(range(2, 1 + len(tensor.shape))))
behaviour_actions_log_probs = split_batches(
behaviour_actions_log_probs)
target_actions_log_probs = split_batches(target_actions_log_probs)
policy_entropy = split_batches(policy_entropy)
dones = split_batches(dones)
rewards = split_batches(rewards)
values = split_batches(values)
# [T, B] -> [T - 1, B] for V-trace calc.
behaviour_actions_log_probs = layers.slice(
behaviour_actions_log_probs, axes=[0], starts=[0], ends=[-1])
target_actions_log_probs = layers.slice(
target_actions_log_probs, axes=[0], starts=[0], ends=[-1])
policy_entropy = layers.slice(
policy_entropy, axes=[0], starts=[0], ends=[-1])
dones = layers.slice(dones, axes=[0], starts=[0], ends=[-1])
rewards = layers.slice(rewards, axes=[0], starts=[0], ends=[-1])
bootstrap_value = layers.slice(
values, axes=[0], starts=[T - 1], ends=[T])
values = layers.slice(values, axes=[0], starts=[0], ends=[-1])
bootstrap_value = layers.squeeze(bootstrap_value, axes=[0])
vtrace_loss = VTraceLoss(
behaviour_actions_log_probs=behaviour_actions_log_probs,
target_actions_log_probs=target_actions_log_probs,
policy_entropy=policy_entropy,
dones=dones,
discount=self.hp['gamma'],
rewards=rewards,
values=values,
bootstrap_value=bootstrap_value,
entropy_coeff=entropy_coeff,
vf_loss_coeff=self.hp['vf_loss_coeff'],
clip_rho_threshold=self.hp['clip_rho_threshold'],
clip_pg_rho_threshold=self.hp['clip_pg_rho_threshold'])
fluid.clip.set_gradient_clip(
clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=40.0))
optimizer = fluid.optimizer.AdamOptimizer(learning_rate)
optimizer.minimize(vtrace_loss.total_loss)
return vtrace_loss, kl
def sample(self, obs):
"""
Args:
obs: A float32 tensor of shape ([B] + observation_space).
E.g. [B, C, H, W] in atari.
"""
logits = self.model.policy(obs)
policy_dist = CategoricalDistribution(logits)
sample_actions = policy_dist.sample()
return sample_actions, logits
def predict(self, obs):
"""
Args:
obs: A float32 tensor of shape ([B] + observation_space).
E.g. [B, C, H, W] in atari.
"""
logits = self.model.policy(obs)
probs = layers.softmax(logits)
predict_actions = layers.argmax(probs, 1)
return predict_actions
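The `split_batches` helper inside `learn` turns flat rollouts into time-major tensors before slicing for V-trace. An equivalent NumPy sketch (illustrative only):

```python
import numpy as np

def split_batches_np(x, T):
    # [B * T, ...] -> [B, T, ...] -> [T, B, ...]
    B = x.shape[0] // T
    x = x.reshape((B, T) + x.shape[1:])
    return x.transpose((1, 0) + tuple(range(2, x.ndim)))
```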
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for V-trace.
The following code is mainly referenced and copied from:
https://github.com/deepmind/scalable_agent/blob/master/vtrace_test.py
"""
import copy
import numpy as np
import unittest
import parl.layers as layers
from paddle import fluid
from parameterized import parameterized
from parl.algorithms.impala import vtrace
from parl.utils import get_gpu_count
def _shaped_arange(*shape):
"""Runs np.arange, converts to float and reshapes."""
return np.arange(np.prod(shape), dtype=np.float32).reshape(*shape)
def _ground_truth_calculation(behaviour_actions_log_probs,
target_actions_log_probs, discounts, rewards,
values, bootstrap_value, clip_rho_threshold,
clip_pg_rho_threshold):
"""Calculates the ground truth for V-trace in Python/Numpy."""
log_rhos = target_actions_log_probs - behaviour_actions_log_probs
vs = []
seq_len = len(discounts)
rhos = np.exp(log_rhos)
cs = np.minimum(rhos, 1.0)
clipped_rhos = rhos
if clip_rho_threshold:
clipped_rhos = np.minimum(rhos, clip_rho_threshold)
clipped_pg_rhos = rhos
if clip_pg_rho_threshold:
clipped_pg_rhos = np.minimum(rhos, clip_pg_rho_threshold)
# This is a very inefficient way to calculate the V-trace ground truth.
# We calculate it this way because it is close to the mathematical notation of
# V-trace.
# v_s = V(x_s)
# + \sum^{T-1}_{t=s} \gamma^{t-s}
# * \prod_{i=s}^{t-1} c_i
# * \rho_t (r_t + \gamma V(x_{t+1}) - V(x_t))
# Note that when we take the product over c_i, we write `s:t`, since the paper's
# notation is inclusive of `t-1`, whereas Python slicing is exclusive.
# Also note that np.prod([]) == 1.
values_t_plus_1 = np.concatenate([values, bootstrap_value[None, :]],
axis=0)
for s in range(seq_len):
v_s = np.copy(values[s]) # Very important copy.
for t in range(s, seq_len):
v_s += (np.prod(discounts[s:t], axis=0) * np.prod(cs[s:t], axis=0)
* clipped_rhos[t] * (rewards[t] + discounts[t] *
values_t_plus_1[t + 1] - values[t]))
vs.append(v_s)
vs = np.stack(vs, axis=0)
pg_advantages = (clipped_pg_rhos * (rewards + discounts * np.concatenate(
[vs[1:], bootstrap_value[None, :]], axis=0) - values))
return vtrace.VTraceReturns(vs=vs, pg_advantages=pg_advantages)
class VtraceTest(unittest.TestCase):
def setUp(self):
gpu_count = get_gpu_count()
if gpu_count > 0:
place = fluid.CUDAPlace(0)
self.gpu_id = 0
else:
place = fluid.CPUPlace()
self.gpu_id = -1
self.executor = fluid.Executor(place)
@parameterized.expand([('Batch1', 1), ('Batch4', 4)])
def test_from_importance_weights(self, name, batch_size):
"""Tests V-trace against ground truth data calculated in python."""
seq_len = 5
# Create log_rhos such that rho will span from near-zero to above the
# clipping thresholds. In particular, calculate log_rhos in [-2.5, 2.5),
# so that rho is in approx [0.08, 12.2).
log_rhos = _shaped_arange(seq_len, batch_size) / (batch_size * seq_len)
log_rhos = 5 * (log_rhos - 0.5) # [0.0, 1.0) -> [-2.5, 2.5).
# Fake behaviour_actions_log_probs, target_actions_log_probs
target_actions_log_probs = log_rhos + 1.0
behaviour_actions_log_probs = np.ones(
shape=log_rhos.shape, dtype='float32')
values = {
'behaviour_actions_log_probs':
behaviour_actions_log_probs,
'target_actions_log_probs':
target_actions_log_probs,
# T, B where B_i: [0.9 / (i+1)] * T
'discounts':
np.array([[0.9 / (b + 1) for b in range(batch_size)]
for _ in range(seq_len)],
dtype=np.float32),
'rewards':
_shaped_arange(seq_len, batch_size),
'values':
_shaped_arange(seq_len, batch_size) / batch_size,
'bootstrap_value':
_shaped_arange(batch_size) + 1.0,
'clip_rho_threshold':
3.7,
'clip_pg_rho_threshold':
2.2,
}
# Calculated by numpy/python
ground_truth_v = _ground_truth_calculation(**values)
# Calculated by Fluid
test_program = fluid.Program()
with fluid.program_guard(test_program):
behaviour_actions_log_probs_input = layers.data(
name='behaviour_actions_log_probs',
shape=[seq_len, batch_size],
dtype='float32',
append_batch_size=False)
target_actions_log_probs_input = layers.data(
name='target_actions_log_probs',
shape=[seq_len, batch_size],
dtype='float32',
append_batch_size=False)
discounts_input = layers.data(
name='discounts',
shape=[seq_len, batch_size],
dtype='float32',
append_batch_size=False)
rewards_input = layers.data(
name='rewards',
shape=[seq_len, batch_size],
dtype='float32',
append_batch_size=False)
values_input = layers.data(
name='values',
shape=[seq_len, batch_size],
dtype='float32',
append_batch_size=False)
bootstrap_value_input = layers.data(
name='bootstrap_value',
shape=[batch_size],
dtype='float32',
append_batch_size=False)
fluid_inputs = {
'behaviour_actions_log_probs':
behaviour_actions_log_probs_input,
'target_actions_log_probs': target_actions_log_probs_input,
'discounts': discounts_input,
'rewards': rewards_input,
'values': values_input,
'bootstrap_value': bootstrap_value_input,
'clip_rho_threshold': 3.7,
'clip_pg_rho_threshold': 2.2,
}
output = vtrace.from_importance_weights(**fluid_inputs)
self.executor.run(fluid.default_startup_program())
feed = copy.copy(values)
del feed['clip_rho_threshold']
del feed['clip_pg_rho_threshold']
[output_vs, output_pg_advantage] = self.executor.run(
test_program,
feed=feed,
fetch_list=[output.vs, output.pg_advantages])
np.testing.assert_almost_equal(ground_truth_v.vs, output_vs, 5)
np.testing.assert_almost_equal(ground_truth_v.pg_advantages,
output_pg_advantage, 5)
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Functions to compute V-trace off-policy actor critic targets,
which used in IMAPLA algorithm.
The following code is mainly referenced and copied from:
https://github.com/deepmind/scalable_agent/blob/master/vtrace.py
For details and theory see:
"Espeholt L, Soyer H, Munos R, et al. Impala: Scalable distributed
deep-rl with importance weighted actor-learner
architectures[J]. arXiv preprint arXiv:1802.01561, 2018."
"""
import collections
import paddle.fluid as fluid
import parl.layers as layers
from parl.utils import MAX_INT32
VTraceReturns = collections.namedtuple('VTraceReturns',
['vs', 'pg_advantages'])
def from_importance_weights(behaviour_actions_log_probs,
target_actions_log_probs,
discounts,
rewards,
values,
bootstrap_value,
clip_rho_threshold=1.0,
clip_pg_rho_threshold=1.0,
name='vtrace_from_logits'):
r"""V-trace for softmax policies.
Calculates V-trace actor-critic targets for softmax policies as described in
"IMPALA: Scalable Distributed Deep-RL with
Importance Weighted Actor-Learner Architectures"
by Espeholt, Soyer, Munos et al.
Target policy refers to the policy we are interested in improving and
behaviour policy refers to the policy that generated the given
rewards and actions.
In the notation used throughout documentation and comments, T refers to the
time dimension ranging from 0 to T-1. B refers to the batch size and
NUM_ACTIONS refers to the number of actions.
Args:
behaviour_actions_log_probs: A float32 tensor of shape [T, B] of
log-probabilities of actions in behaviour policy.
target_actions_log_probs: A float32 tensor of shape [T, B] of
log-probabilities of actions in the target policy.
discounts: A float32 tensor of shape [T, B] with the discount encountered
when following the behaviour policy.
rewards: A float32 tensor of shape [T, B] with the rewards generated by
following the behaviour policy.
values: A float32 tensor of shape [T, B] with the value function estimates
wrt. the target policy.
bootstrap_value: A float32 tensor of shape [B] with the value function estimate
at time T.
clip_rho_threshold: A scalar float32 tensor with the clipping threshold for
importance weights (rho) when calculating the baseline targets (vs).
rho^bar in the paper.
clip_pg_rho_threshold: A scalar float32 tensor with the clipping threshold
on rho_s in \rho_s \delta log \pi(a|x) (r + \gamma v_{s+1} - V(x_s)).
name: The name scope that all V-trace operations will be created in.
Returns:
A VTraceReturns namedtuple (vs, pg_advantages) where:
vs: A float32 tensor of shape [T, B]. Can be used as target to
train a baseline (V(x_t) - vs_t)^2.
pg_advantages: A float32 tensor of shape [T, B]. Can be used as the
advantage in the calculation of policy gradients.
"""
rank = len(behaviour_actions_log_probs.shape) # Usually 2.
assert len(target_actions_log_probs.shape) == rank
assert len(values.shape) == rank
assert len(bootstrap_value.shape) == (rank - 1)
assert len(discounts.shape) == rank
assert len(rewards.shape) == rank
# log importance sampling weights.
# V-trace performs operations on rhos in log-space for numerical stability.
log_rhos = target_actions_log_probs - behaviour_actions_log_probs
if clip_rho_threshold is not None:
clip_rho_threshold = layers.fill_constant([1], 'float32',
clip_rho_threshold)
if clip_pg_rho_threshold is not None:
clip_pg_rho_threshold = layers.fill_constant([1], 'float32',
clip_pg_rho_threshold)
rhos = layers.exp(log_rhos)
if clip_rho_threshold is not None:
clipped_rhos = layers.elementwise_min(rhos, clip_rho_threshold)
else:
clipped_rhos = rhos
constant_one = layers.fill_constant([1], 'float32', 1.0)
cs = layers.elementwise_min(rhos, constant_one)
# Append bootstrapped value to get [v1, ..., v_t+1]
values_1_t = layers.slice(values, axes=[0], starts=[1], ends=[MAX_INT32])
values_t_plus_1 = layers.concat(
[values_1_t, layers.unsqueeze(bootstrap_value, [0])], axis=0)
# \delta_s * V
deltas = clipped_rhos * (rewards + discounts * values_t_plus_1 - values)
vs_minus_v_xs = recursively_scan(discounts, cs, deltas)
# Add V(x_s) to get v_s.
vs = layers.elementwise_add(vs_minus_v_xs, values)
# Advantage for policy gradient.
vs_1_t = layers.slice(vs, axes=[0], starts=[1], ends=[MAX_INT32])
vs_t_plus_1 = layers.concat(
[vs_1_t, layers.unsqueeze(bootstrap_value, [0])], axis=0)
if clip_pg_rho_threshold is not None:
clipped_pg_rhos = layers.elementwise_min(rhos, clip_pg_rho_threshold)
else:
clipped_pg_rhos = rhos
pg_advantages = (
clipped_pg_rhos * (rewards + discounts * vs_t_plus_1 - values))
# Make sure no gradients backpropagated through the returned values.
vs.stop_gradient = True
pg_advantages.stop_gradient = True
return VTraceReturns(vs=vs, pg_advantages=pg_advantages)
def recursively_scan(discounts, cs, deltas):
""" Recursively calculate vs_minus_v_xs according to following equation:
vs_minus_v_xs(t) = deltas(t) + discounts(t) * cs(t) * vs_minus_v_xs(t + 1)
Args:
discounts: A float32 tensor of shape [T, B] with discounts encountered when
following the behaviour policy.
cs: A float32 tensor of shape [T, B], which corresponds to $c_s$ in the
original paper.
deltas: A float32 tensor of shape [T, B], which corresponds to
$\delta_s * V$ in the original paper.
Returns:
vs_minus_v_xs: A float32 tensor of shape [T, B], which corresponds to
$v_s - V(x_s)$ in the original paper.
"""
# All sequences are reversed, computation starts from the back.
reverse_discounts = layers.reverse(x=discounts, axis=[0])
reverse_cs = layers.reverse(x=cs, axis=[0])
reverse_deltas = layers.reverse(x=deltas, axis=[0])
static_while = layers.StaticRNN()
# init: shape [B]
init = layers.fill_constant_batch_size_like(
discounts, shape=[1], dtype='float32', value=0.0, input_dim_idx=1)
with static_while.step():
discount_t = static_while.step_input(reverse_discounts)
c_t = static_while.step_input(reverse_cs)
delta_t = static_while.step_input(reverse_deltas)
vs_minus_v_xs_t_plus_1 = static_while.memory(init=init)
vs_minus_v_xs_t = delta_t + discount_t * c_t * vs_minus_v_xs_t_plus_1
static_while.update_memory(vs_minus_v_xs_t_plus_1, vs_minus_v_xs_t)
static_while.step_output(vs_minus_v_xs_t)
vs_minus_v_xs = static_while()
# Reverse the results back to original order.
vs_minus_v_xs = layers.reverse(vs_minus_v_xs, [0])
return vs_minus_v_xs
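For intuition, the backward recursion implemented with `StaticRNN` above is equivalent to this NumPy reference loop (illustrative only):

```python
import numpy as np

def recursively_scan_np(discounts, cs, deltas):
    # vs_minus_v_xs[t] = deltas[t] + discounts[t] * cs[t] * vs_minus_v_xs[t + 1]
    acc = np.zeros_like(deltas[0])
    out = np.zeros_like(deltas)
    for t in reversed(range(deltas.shape[0])):
        acc = deltas[t] + discounts[t] * cs[t] * acc
        out[t] = acc
    return out
```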
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid as fluid
from parl.framework.algorithm_base import Algorithm
import parl.layers as layers
__all__ = ['PolicyGradient']
class PolicyGradient(Algorithm):
def __init__(self, model, hyperparas):
Algorithm.__init__(self, model, hyperparas)
self.model = model
self.lr = hyperparas['lr']
def define_predict(self, obs):
""" use policy model self.model to predict the action probability
"""
return self.model.policy(obs)
def define_learn(self, obs, action, reward):
""" update policy model self.model with policy gradient algorithm
"""
act_prob = self.model.policy(obs)
log_prob = layers.cross_entropy(act_prob, action)
cost = log_prob * reward
cost = layers.reduce_mean(cost)
optimizer = fluid.optimizer.Adam(self.lr)
optimizer.minimize(cost)
return cost
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import parl.layers as layers
from copy import deepcopy
from paddle import fluid
from parl.framework.algorithm_base import Algorithm
__all__ = ['PPO']
class PPO(Algorithm):
def __init__(self, model, hyperparas):
Algorithm.__init__(self, model, hyperparas)
# Used to calculate probability of action in old policy
self.old_policy_model = deepcopy(model.policy_model)
# fetch hyper parameters
self.act_dim = hyperparas['act_dim']
self.policy_lr = hyperparas['policy_lr']
self.value_lr = hyperparas['value_lr']
if 'epsilon' in hyperparas:
self.epsilon = hyperparas['epsilon']
else:
self.epsilon = 0.2 # default
def _calc_logprob(self, actions, means, logvars):
""" Calculate log probabilities of actions, when given means and logvars
of normal distribution.
The constant sqrt(2 * pi) is omitted, which will be eliminated in later.
Args:
actions: shape (batch_size, act_dim)
means: shape (batch_size, act_dim)
logvars: shape (act_dim)
Returns:
logprob: shape (batch_size)
"""
exp_item = layers.elementwise_div(
layers.square(actions - means), layers.exp(logvars), axis=1)
exp_item = -0.5 * layers.reduce_sum(exp_item, dim=1)
vars_item = -0.5 * layers.reduce_sum(logvars)
logprob = exp_item + vars_item
return logprob
def _calc_kl(self, means, logvars, old_means, old_logvars):
""" Calculate KL divergence between old and new distributions
See: https://en.wikipedia.org/wiki/Multivariate_normal_distribution#Kullback.E2.80.93Leibler_divergence
Args:
means: shape (batch_size, act_dim)
logvars: shape (act_dim)
old_means: shape (batch_size, act_dim)
old_logvars: shape (act_dim)
Returns:
kl: shape (batch_size)
"""
log_det_cov_old = layers.reduce_sum(old_logvars)
log_det_cov_new = layers.reduce_sum(logvars)
tr_old_new = layers.reduce_sum(layers.exp(old_logvars - logvars))
kl = 0.5 * (layers.reduce_sum(
layers.square(means - old_means) / layers.exp(logvars), dim=1) + (
log_det_cov_new - log_det_cov_old) + tr_old_new - self.act_dim)
return kl
def define_predict(self, obs):
""" Use policy model of self.model to predict means and logvars of actions
"""
means, logvars = self.model.policy(obs)
return means
def define_sample(self, obs):
""" Use policy model of self.model to sample actions
"""
sampled_act = self.model.policy_sample(obs)
return sampled_act
def define_policy_learn(self, obs, actions, advantages, beta=None):
""" Learn policy model with:
1. CLIP loss: Clipped Surrogate Objective
2. KLPEN loss: Adaptive KL Penalty Objective
See: https://arxiv.org/pdf/1707.02286.pdf
Args:
obs: Tensor, (batch_size, obs_dim)
actions: Tensor, (batch_size, act_dim)
advantages: Tensor (batch_size, )
beta: Tensor (1) or None
if None, use CLIP Loss; else, use KLPEN loss.
"""
old_means, old_logvars = self.old_policy_model.policy(obs)
old_means.stop_gradient = True
old_logvars.stop_gradient = True
old_logprob = self._calc_logprob(actions, old_means, old_logvars)
means, logvars = self.model.policy(obs)
logprob = self._calc_logprob(actions, means, logvars)
kl = self._calc_kl(means, logvars, old_means, old_logvars)
kl = layers.reduce_mean(kl)
if beta is None: # Clipped Surrogate Objective
pg_ratio = layers.exp(logprob - old_logprob)
clipped_pg_ratio = layers.clip(pg_ratio, 1 - self.epsilon,
1 + self.epsilon)
surrogate_loss = layers.elementwise_min(
advantages * pg_ratio, advantages * clipped_pg_ratio)
loss = 0 - layers.reduce_mean(surrogate_loss)
else: # Adaptive KL Penalty Objective
# policy gradient loss
loss1 = 0 - layers.reduce_mean(
advantages * layers.exp(logprob - old_logprob))
# adaptive kl loss
loss2 = kl * beta
loss = loss1 + loss2
optimizer = fluid.optimizer.AdamOptimizer(self.policy_lr)
optimizer.minimize(loss)
return loss, kl
def define_value_predict(self, obs):
""" Use value model of self.model to predict value of obs
"""
return self.model.value(obs)
def define_value_learn(self, obs, val):
""" Learn value model with square error cost
"""
predict_val = self.model.value(obs)
loss = layers.square_error_cost(predict_val, val)
loss = layers.reduce_mean(loss)
optimizer = fluid.optimizer.AdamOptimizer(self.value_lr)
optimizer.minimize(loss)
return loss
def sync_old_policy(self, gpu_id):
""" Synchronize parameters of self.model.policy_model to self.old_policy_model
"""
self.model.policy_model.sync_params_to(
self.old_policy_model, gpu_id=gpu_id)
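The CLIP branch of `define_policy_learn` is the standard clipped surrogate objective. A NumPy sketch, illustrative only (`epsilon` defaults to 0.2 as above):

```python
import numpy as np

def clipped_surrogate_loss(logprob, old_logprob, advantages, epsilon=0.2):
    pg_ratio = np.exp(logprob - old_logprob)
    clipped_pg_ratio = np.clip(pg_ratio, 1.0 - epsilon, 1.0 + epsilon)
    surrogate = np.minimum(advantages * pg_ratio, advantages * clipped_pg_ratio)
    return -np.mean(surrogate)
```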
......@@ -92,4 +92,4 @@ class ModelBase(object):
def __call__(self, *args, **kwargs):
"""Call forward function.
"""
self.forward(*args, **kwargs)
return self.forward(*args, **kwargs)
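This one-line change is what allows algorithms to write `self.model(obs)` instead of `self.model.policy(obs)`. A self-contained sketch of the dispatch, simplified from the class above (the `DoubleModel` subclass is purely hypothetical):

```python
class ModelBase(object):
    def forward(self, *args, **kwargs):
        raise NotImplementedError

    def __call__(self, *args, **kwargs):
        # Previously the result of forward() was dropped; now it is returned.
        return self.forward(*args, **kwargs)


class DoubleModel(ModelBase):
    def forward(self, obs):
        return obs * 2  # stand-in for a policy network


assert DoubleModel()(3) == 6
```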