Unverified commit 0b71fc4e authored by Swain, committed by GitHub

feature(nyz): add H-PPO hybrid action space algorithm (#140)

* feature(nyz): add hybrid PPO, unify the action_space field, and use dict-type mu/sigma

* polish(nyz): polish the PPO config continuous field, move it into the action_space field

* fix(nyz): fix ppo action_space field compatibility bug

* fix(nyz): fix ppg/sac/cql action_space field compatibility bug

* demo(nyz): update gym hybrid hppo config

* polish(pu): polish H-PPO hyper-parameters: use tanh bound and a fixed sigma of 0.3 in actor_action_args, and clamp acceleration_value to [0, 1] and rotation_value to [-1, 1] after sampling from the pi distribution in the collect phase

* polish(pu): polish according to review comments

* polish(pu): polish hppo config

* polish(pu): entropy weight=0.03 performs best empirically

* fix(nyz): fix unittest compatibility bugs

* polish(nyz): remove unused print in atari env (ci skip)
Co-authored-by: puyuan1996 <2402552459@qq.com>
Parent eb6c60cc
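The bullets above describe the unified `action_space` field ('discrete' / 'continuous' / 'hybrid') and the fixed-sigma, tanh-bounded continuous head used for the hybrid setting. As a quick orientation, here is a minimal policy-config sketch condensed from the gym_hybrid H-PPO config added later in this commit (values are illustrative, not a complete config):

from easydict import EasyDict

hppo_policy_cfg = EasyDict(dict(
    action_space='hybrid',  # one of ['discrete', 'continuous', 'hybrid']
    model=dict(
        obs_shape=10,
        action_shape=dict(action_type_shape=3, action_args_shape=2),  # discrete + continuous parts
        action_space='hybrid',
        sigma_type='fixed',
        fixed_sigma_value=0.3,  # fixed sigma for the continuous action_args head
        bound_type='tanh',
    ),
))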
......@@ -28,6 +28,7 @@ class MAVAC(nn.Module):
actor_head_layer_num: int = 2,
critic_head_hidden_size: int = 64,
critic_head_layer_num: int = 1,
action_space: str = 'discrete',
activation: Optional[nn.Module] = nn.ReLU(),
norm_type: Optional[str] = None,
) -> None:
......
......@@ -14,8 +14,8 @@ class PPG(nn.Module):
self,
obs_shape: Union[int, SequenceType],
action_shape: Union[int, SequenceType],
action_space: str = 'discrete',
share_encoder: bool = True,
continuous: bool = False,
encoder_hidden_size_list: SequenceType = [128, 128, 64],
actor_head_hidden_size: int = 64,
actor_head_layer_num: int = 2,
......@@ -26,7 +26,7 @@ class PPG(nn.Module):
) -> None:
super(PPG, self).__init__()
self.actor_critic = VAC(
obs_shape, action_shape, share_encoder, continuous, encoder_hidden_size_list, actor_head_hidden_size,
obs_shape, action_shape, action_space, share_encoder, encoder_hidden_size_list, actor_head_hidden_size,
actor_head_layer_num, critic_head_hidden_size, critic_head_layer_num, activation, norm_type
)
self.aux_critic = copy.deepcopy(self.actor_critic.critic)
......
......@@ -23,7 +23,7 @@ class QAC(nn.Module):
self,
obs_shape: Union[int, SequenceType],
action_shape: Union[int, SequenceType, EasyDict],
actor_head_type: str,
action_space: str,
twin_critic: bool = False,
actor_head_hidden_size: int = 64,
actor_head_layer_num: int = 1,
......@@ -39,7 +39,7 @@ class QAC(nn.Module):
- obs_shape (:obj:`Union[int, SequenceType]`): Observation's space.
- action_shape (:obj:`Union[int, SequenceType, EasyDict]`): Action's space, such as 4, (3, ), \
EasyDict({'action_type_shape': 3, 'action_args_shape': 4}).
- actor_head_type (:obj:`str`): Whether choose ``regression`` or ``reparameterization`` or ``hybrid`` .
- action_space (:obj:`str`): The type of actor head to use: ``regression``, ``reparameterization`` or ``hybrid``.
- twin_critic (:obj:`bool`): Whether include twin critic.
- actor_head_hidden_size (:obj:`Optional[int]`): The ``hidden_size`` to pass to actor-nn's ``Head``.
- actor_head_layer_num (:obj:`int`): The num of layers used in the network to compute Q value output \
......@@ -56,9 +56,9 @@ class QAC(nn.Module):
obs_shape: int = squeeze(obs_shape)
action_shape = squeeze(action_shape)
self.action_shape = action_shape
self.actor_head_type = actor_head_type
assert self.actor_head_type in ['regression', 'reparameterization', 'hybrid']
if self.actor_head_type == 'regression': # DDPG, TD3
self.action_space = action_space
assert self.action_space in ['regression', 'reparameterization', 'hybrid']
if self.action_space == 'regression': # DDPG, TD3
self.actor = nn.Sequential(
nn.Linear(obs_shape, actor_head_hidden_size), activation,
RegressionHead(
......@@ -70,7 +70,7 @@ class QAC(nn.Module):
norm_type=norm_type
)
)
elif self.actor_head_type == 'reparameterization': # SAC
elif self.action_space == 'reparameterization': # SAC
self.actor = nn.Sequential(
nn.Linear(obs_shape, actor_head_hidden_size), activation,
ReparameterizationHead(
......@@ -82,7 +82,7 @@ class QAC(nn.Module):
norm_type=norm_type
)
)
elif self.actor_head_type == 'hybrid': # PADDPG
elif self.action_space == 'hybrid': # PADDPG
# hybrid action space: action_type(discrete) + action_args(continuous),
# such as {'action_type_shape': torch.LongTensor([0]), 'action_args_shape': torch.FloatTensor([0.1, -0.27])}
action_shape.action_args_shape = squeeze(action_shape.action_args_shape)
......@@ -110,7 +110,7 @@ class QAC(nn.Module):
)
self.actor = nn.ModuleList([actor_action_type, actor_action_args])
self.twin_critic = twin_critic
if self.actor_head_type == 'hybrid':
if self.action_space == 'hybrid':
critic_input_size = obs_shape + action_shape.action_type_shape + action_shape.action_args_shape
else:
critic_input_size = obs_shape + action_shape
......@@ -194,7 +194,7 @@ class QAC(nn.Module):
Critic Examples:
>>> inputs = {'obs': torch.randn(4,N), 'action': torch.randn(4,1)}
>>> model = QAC(obs_shape=(N, ),action_shape=1,actor_head_type='regression')
>>> model = QAC(obs_shape=(N, ),action_shape=1,action_space='regression')
>>> model(inputs, mode='compute_critic')['q_value'] # q value
tensor([0.0773, 0.1639, 0.0917, 0.0370], grad_fn=<SqueezeBackward1>)
......@@ -245,13 +245,13 @@ class QAC(nn.Module):
>>> actor_outputs['logit'][1].shape # sigma
>>> torch.Size([4, 64])
"""
if self.actor_head_type == 'regression':
if self.action_space == 'regression':
x = self.actor(inputs)
return {'action': x['pred']}
elif self.actor_head_type == 'reparameterization':
elif self.action_space == 'reparameterization':
x = self.actor(inputs)
return {'logit': [x['mu'], x['sigma']]}
elif self.actor_head_type == 'hybrid':
elif self.action_space == 'hybrid':
logit = self.actor[0](inputs)
action_args = self.actor[1](inputs)
return {'logit': logit['logit'], 'action_args': action_args['pred']}
......@@ -284,14 +284,14 @@ class QAC(nn.Module):
Examples:
>>> inputs = {'obs': torch.randn(4, N), 'action': torch.randn(4, 1)}
>>> model = QAC(obs_shape=(N, ),action_shape=1,actor_head_type='regression')
>>> model = QAC(obs_shape=(N, ),action_shape=1,action_space='regression')
>>> model(inputs, mode='compute_critic')['q_value'] # q value
>>> tensor([0.0773, 0.1639, 0.0917, 0.0370], grad_fn=<SqueezeBackward1>)
"""
obs, action = inputs['obs'], inputs['action']
assert len(obs.shape) == 2
if self.actor_head_type == 'hybrid':
if self.action_space == 'hybrid':
action_type_logit = inputs['logit']
action_type_logit = torch.softmax(action_type_logit, dim=-1)
action_args = action['action_args']
......
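A minimal sketch of building and querying the hybrid QAC actor with the renamed `action_space` argument, following the unit test further below (batch/action sizes and the `ding.model` import path are assumptions):

import torch
from easydict import EasyDict
from ding.model import QAC  # import path is an assumption

B, N = 4, 32
action_shape = EasyDict({'action_type_shape': 3, 'action_args_shape': 6})
model = QAC(obs_shape=(N, ), action_shape=action_shape, action_space='hybrid', twin_critic=True)
out = model(torch.randn(B, N), mode='compute_actor')
# out['logit']: (B, 3) discrete action_type logits
# out['action_args']: (B, 6) continuous action arguments (the regression head's 'pred')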
......@@ -20,7 +20,7 @@ class QACDIST(nn.Module):
self,
obs_shape: Union[int, SequenceType],
action_shape: Union[int, SequenceType],
actor_head_type: str = "regression",
action_space: str = "regression",
critic_head_type: str = "categorical",
actor_head_hidden_size: int = 64,
actor_head_layer_num: int = 1,
......@@ -38,7 +38,7 @@ class QACDIST(nn.Module):
Arguments:
- obs_shape (:obj:`Union[int, SequenceType]`): Observation's space.
- action_shape (:obj:`Union[int, SequenceType]`): Action's space.
- actor_head_type (:obj:`str`): Whether choose ``regression`` or ``reparameterization``.
- action_space (:obj:`str`): The type of actor head to use: ``regression`` or ``reparameterization``.
- critic_head_type (:obj:`str`): Only ``categorical``.
- actor_head_hidden_size (:obj:`Optional[int]`): The ``hidden_size`` to pass to actor-nn's ``Head``.
- actor_head_layer_num (:obj:`int`):
......@@ -58,9 +58,9 @@ class QACDIST(nn.Module):
super(QACDIST, self).__init__()
obs_shape: int = squeeze(obs_shape)
action_shape: int = squeeze(action_shape)
self.actor_head_type = actor_head_type
assert self.actor_head_type in ['regression', 'reparameterization']
if self.actor_head_type == 'regression':
self.action_space = action_space
assert self.action_space in ['regression', 'reparameterization']
if self.action_space == 'regression':
self.actor = nn.Sequential(
nn.Linear(obs_shape, actor_head_hidden_size), activation,
RegressionHead(
......@@ -72,7 +72,7 @@ class QACDIST(nn.Module):
norm_type=norm_type
)
)
elif self.actor_head_type == 'reparameterization':
elif self.action_space == 'reparameterization':
self.actor = nn.Sequential(
nn.Linear(obs_shape, actor_head_hidden_size), activation,
ReparameterizationHead(
......@@ -156,7 +156,7 @@ class QACDIST(nn.Module):
Critic Examples:
>>> # Categorical mode
>>> inputs = {'obs': torch.randn(4,N), 'action': torch.randn(4,1)}
>>> model = QACDIST(obs_shape=(N, ),action_shape=1,actor_head_type='regression', \
>>> model = QACDIST(obs_shape=(N, ),action_shape=1,action_space='regression', \
... critic_head_type='categorical', n_atoms=51)
>>> q_value = model(inputs, mode='compute_critic') # q value
>>> assert q_value['q_value'].shape == torch.Size([4, 1])
......@@ -204,9 +204,9 @@ class QACDIST(nn.Module):
>>> torch.Size([4, 64])
"""
x = self.actor(inputs)
if self.actor_head_type == 'regression':
if self.action_space == 'regression':
return {'action': x['pred']}
elif self.actor_head_type == 'reparameterization':
elif self.action_space == 'reparameterization':
return {'logit': [x['mu'], x['sigma']]}
def compute_critic(self, inputs: Dict) -> Dict:
......@@ -232,7 +232,7 @@ class QACDIST(nn.Module):
Examples:
>>> # Categorical mode
>>> inputs = {'obs': torch.randn(4,N), 'action': torch.randn(4,1)}
>>> model = QACDIST(obs_shape=(N, ),action_shape=1,actor_head_type='regression', \
>>> model = QACDIST(obs_shape=(N, ),action_shape=1,action_space='regression', \
... critic_head_type='categorical', n_atoms=51)
>>> q_value = model(inputs, mode='compute_critic') # q value
>>> assert q_value['q_value'].shape == torch.Size([4, 1])
......
......@@ -16,7 +16,7 @@ hybrid_args = {
'action_args_shape': (6, )
}),
'twin': True,
'actor_head_type': 'hybrid'
'action_space': 'hybrid'
}
......@@ -27,10 +27,10 @@ class TestHybridQAC:
self,
action_shape=hybrid_args['action_shape'],
twin=hybrid_args['twin'],
actor_head_type=hybrid_args['actor_head_type']
action_space=hybrid_args['action_space']
):
N = 32
assert actor_head_type == 'hybrid'
assert action_space == 'hybrid'
inputs = {
'obs': torch.randn(B, N),
'action': {
......@@ -42,7 +42,7 @@ class TestHybridQAC:
model = QAC(
obs_shape=(N, ),
action_shape=action_shape,
actor_head_type=actor_head_type,
action_space=action_space,
critic_head_hidden_size=embedding_size,
actor_head_hidden_size=embedding_size,
twin_critic=twin,
......
......@@ -17,16 +17,16 @@ args = list(product(*[action_shape_args, [True, False], ['regression', 'reparame
@pytest.mark.unittest
@pytest.mark.parametrize('action_shape, twin, actor_head_type', args)
@pytest.mark.parametrize('action_shape, twin, action_space', args)
class TestQAC:
def test_fcqac(self, action_shape, twin, actor_head_type):
def test_fcqac(self, action_shape, twin, action_space):
N = 32
inputs = {'obs': torch.randn(B, N), 'action': torch.randn(B, squeeze(action_shape))}
model = QAC(
obs_shape=(N, ),
action_shape=action_shape,
actor_head_type=actor_head_type,
action_space=action_space,
critic_head_hidden_size=embedding_size,
actor_head_hidden_size=embedding_size,
twin_critic=twin,
......@@ -41,7 +41,7 @@ class TestQAC:
# compute_action
print(model)
if actor_head_type == 'regression':
if action_space == 'regression':
action = model(inputs['obs'], mode='compute_actor')['action']
if squeeze(action_shape) == 1:
assert action.shape == (B, )
......@@ -49,7 +49,7 @@ class TestQAC:
assert action.shape == (B, squeeze(action_shape))
assert action.eq(action.clamp(-1, 1)).all()
is_differentiable(action.sum(), model.actor)
elif actor_head_type == 'reparameterization':
elif action_space == 'reparameterization':
(mu, sigma) = model(inputs['obs'], mode='compute_actor')['logit']
assert mu.shape == (B, *action_shape)
assert sigma.shape == (B, *action_shape)
......
......@@ -17,16 +17,16 @@ args = list(product(*[action_shape_args, ['regression', 'reparameterization']]))
@pytest.mark.unittest
@pytest.mark.parametrize('action_shape, actor_head_type', args)
@pytest.mark.parametrize('action_shape, action_space', args)
class TestQACDIST:
def test_fcqac_dist(self, action_shape, actor_head_type):
def test_fcqac_dist(self, action_shape, action_space):
N = 32
inputs = {'obs': torch.randn(B, N), 'action': torch.randn(B, squeeze(action_shape))}
model = QACDIST(
obs_shape=(N, ),
action_shape=action_shape,
actor_head_type=actor_head_type,
action_space=action_space,
critic_head_hidden_size=embedding_size,
actor_head_hidden_size=embedding_size,
)
......@@ -43,7 +43,7 @@ class TestQACDIST:
# compute_action
print(model)
if actor_head_type == 'regression':
if action_space == 'regression':
action = model(inputs['obs'], mode='compute_actor')['action']
if squeeze(action_shape) == 1:
assert action.shape == (B, )
......@@ -51,7 +51,7 @@ class TestQACDIST:
assert action.shape == (B, squeeze(action_shape))
assert action.eq(action.clamp(-1, 1)).all()
is_differentiable(action.sum(), model.actor)
elif actor_head_type == 'reparameterization':
elif action_space == 'reparameterization':
(mu, sigma) = model(inputs['obs'], mode='compute_actor')['logit']
assert mu.shape == (B, *action_shape)
assert sigma.shape == (B, *action_shape)
......
......@@ -8,8 +8,8 @@ from ding.torch_utils import is_differentiable
B, C, H, W = 4, 3, 128, 128
obs_shape = [4, (8, ), (4, 64, 64)]
act_args = [[6, False], [(3, ), True], [[2, 3, 6], False]]
#act_args = [[(3, ), True]]
act_args = [[6, 'discrete'], [(3, ), 'continuous'], [[2, 3, 6], 'discrete']]
# act_args = [[(3, ), True]]
args = list(product(*[obs_shape, act_args, [False, True]]))
......@@ -29,12 +29,12 @@ class TestVAC:
inputs = torch.randn(B, obs_shape)
else:
inputs = torch.randn(B, *obs_shape)
model = VAC(obs_shape, action_shape=act_args[0], continuous=act_args[1], share_encoder=share_encoder)
model = VAC(obs_shape, action_shape=act_args[0], action_space=act_args[1], share_encoder=share_encoder)
outputs = model(inputs, mode='compute_actor_critic')
value, logit = outputs['value'], outputs['logit']
if model.continuous:
outputs = value.sum() + logit[0].sum() + logit[1].sum()
if model.action_space == 'continuous':
outputs = value.sum() + logit['mu'].sum() + logit['sigma'].sum()
else:
if model.multi_head:
outputs = value.sum() + sum([t.sum() for t in logit])
......@@ -45,8 +45,8 @@ class TestVAC:
for p in model.parameters():
p.grad = None
logit = model(inputs, mode='compute_actor')['logit']
if model.continuous:
logit = logit[0].sum() + logit[1].sum()
if model.action_space == 'continuous':
logit = logit['mu'].sum() + logit['sigma'].sum()
self.output_check(model.actor, logit, model.action_shape)
for p in model.parameters():
......
from typing import Union, Dict, Optional
from easydict import EasyDict
import torch
import torch.nn as nn
......@@ -20,9 +21,9 @@ class VAC(nn.Module):
def __init__(
self,
obs_shape: Union[int, SequenceType],
action_shape: Union[int, SequenceType],
action_shape: Union[int, SequenceType, EasyDict],
action_space: str = 'discrete',
share_encoder: bool = True,
continuous: bool = False,
encoder_hidden_size_list: SequenceType = [128, 128, 64],
actor_head_hidden_size: int = 64,
actor_head_layer_num: int = 1,
......@@ -31,6 +32,7 @@ class VAC(nn.Module):
activation: Optional[nn.Module] = nn.ReLU(),
norm_type: Optional[str] = None,
sigma_type: Optional[str] = 'independent',
fixed_sigma_value: Optional[float] = 0.3,
bound_type: Optional[str] = None,
) -> None:
r"""
......@@ -39,8 +41,8 @@ class VAC(nn.Module):
Arguments:
- obs_shape (:obj:`Union[int, SequenceType]`): Observation's space.
- action_shape (:obj:`Union[int, SequenceType]`): Action's space.
- action_space (:obj:`str`): Choose action head in ['discrete', 'continuous', 'hybrid']
- share_encoder (:obj:`bool`): Whether share encoder.
- continuous (:obj:`bool`): Whether collect continuously.
- encoder_hidden_size_list (:obj:`SequenceType`): Collection of ``hidden_size`` to pass to ``Encoder``
- actor_head_hidden_size (:obj:`Optional[int]`): The ``hidden_size`` to pass to actor-nn's ``Head``.
- actor_head_layer_num (:obj:`int`):
......@@ -56,7 +58,7 @@ class VAC(nn.Module):
"""
super(VAC, self).__init__()
obs_shape: int = squeeze(obs_shape)
action_shape: int = squeeze(action_shape)
action_shape = squeeze(action_shape)
self.obs_shape, self.action_shape = obs_shape, action_shape
# Encoder Type
if isinstance(obs_shape, int) or len(obs_shape) == 1:
......@@ -81,8 +83,9 @@ class VAC(nn.Module):
self.critic_head = RegressionHead(
critic_head_hidden_size, 1, critic_head_layer_num, activation=activation, norm_type=norm_type
)
self.continuous = continuous
if self.continuous:
self.action_space = action_space
assert self.action_space in ['discrete', 'continuous', 'hybrid'], self.action_space
if self.action_space == 'continuous':
self.multi_head = False
self.actor_head = ReparameterizationHead(
actor_head_hidden_size,
......@@ -93,7 +96,7 @@ class VAC(nn.Module):
norm_type=norm_type,
bound_type=bound_type
)
else:
elif self.action_space == 'discrete':
actor_head_cls = DiscreteHead
multi_head = not isinstance(action_shape, int)
self.multi_head = multi_head
......@@ -114,6 +117,30 @@ class VAC(nn.Module):
activation=activation,
norm_type=norm_type
)
elif self.action_space == 'hybrid': # HPPO
# hybrid action space: action_type(discrete) + action_args(continuous),
# such as {'action_type_shape': torch.LongTensor([0]), 'action_args_shape': torch.FloatTensor([0.1, -0.27])}
action_shape.action_args_shape = squeeze(action_shape.action_args_shape)
action_shape.action_type_shape = squeeze(action_shape.action_type_shape)
actor_action_args = ReparameterizationHead(
actor_head_hidden_size,
action_shape.action_args_shape,
actor_head_layer_num,
sigma_type=sigma_type,
fixed_sigma_value=fixed_sigma_value,
activation=activation,
norm_type=norm_type,
bound_type=bound_type,
)
actor_action_type = DiscreteHead(
actor_head_hidden_size,
action_shape.action_type_shape,
actor_head_layer_num,
activation=activation,
norm_type=norm_type,
)
self.actor_head = nn.ModuleList([actor_action_type, actor_action_args])
# must use list, not nn.ModuleList
if self.share_encoder:
self.actor = [self.encoder, self.actor_head]
......@@ -203,10 +230,16 @@ class VAC(nn.Module):
x = self.encoder(x)
else:
x = self.actor_encoder(x)
x = self.actor_head(x)
if self.continuous:
x = {'logit': [x['mu'], x['sigma']]}
return x
if self.action_space == 'discrete':
return self.actor_head(x)
elif self.action_space == 'continuous':
x = self.actor_head(x) # mu, sigma
return {'logit': x}
elif self.action_space == 'hybrid':
action_type = self.actor_head[0](x)
action_args = self.actor_head[1](x)
return {'logit': {'action_type': action_type['logit'], 'action_args': action_args}}
def compute_critic(self, x: torch.Tensor) -> Dict:
r"""
......@@ -278,10 +311,16 @@ class VAC(nn.Module):
else:
actor_embedding = self.actor_encoder(x)
critic_embedding = self.critic_encoder(x)
value = self.critic_head(critic_embedding)
actor_output = self.actor_head(actor_embedding)
if self.continuous:
logit = [actor_output['mu'], actor_output['sigma']]
else:
logit = actor_output['logit']
return {'logit': logit, 'value': value['pred']}
value = self.critic_head(critic_embedding)['pred']
if self.action_space == 'discrete':
logit = self.actor_head(actor_embedding)['logit']
return {'logit': logit, 'value': value}
elif self.action_space == 'continuous':
x = self.actor_head(actor_embedding)
return {'logit': x, 'value': value}
elif self.action_space == 'hybrid':
action_type = self.actor_head[0](actor_embedding)
action_args = self.actor_head[1](actor_embedding)
return {'logit': {'action_type': action_type['logit'], 'action_args': action_args}, 'value': value}
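A minimal sketch of the nested output produced by the hybrid VAC branches above (shapes and the `ding.model` import path are assumptions):

import torch
from easydict import EasyDict
from ding.model import VAC  # import path is an assumption

B = 4
action_shape = EasyDict({'action_type_shape': 3, 'action_args_shape': 2})
model = VAC(obs_shape=10, action_shape=action_shape, action_space='hybrid')
out = model(torch.randn(B, 10), mode='compute_actor_critic')
# out['value']: state value from the critic head
# out['logit']['action_type']: (B, 3) discrete logits
# out['logit']['action_args']: {'mu': (B, 2), 'sigma': (B, 2)} Gaussian parameters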
......@@ -5,6 +5,7 @@ import numpy as np
import torch
from ding.torch_utils import get_tensor_data
from ding.rl_utils import create_noise_generator
from torch.distributions import Categorical, Independent, Normal
class IModelWrapper(ABC):
......@@ -408,6 +409,91 @@ class HybridEpsGreedyMultinomialSampleWrapper(IModelWrapper):
return output
class HybridReparamMultinomialSampleWrapper(IModelWrapper):
"""
Overview:
Reparameterization sampler coupled with multinomial sample used in collector_model
to help balance exploration and exploitation.
In hybrid action space, i.e. {'action_type': discrete, 'action_args': continuous}.
Interfaces:
forward
"""
def forward(self, *args, **kwargs):
output = self._model.forward(*args, **kwargs)
assert isinstance(output, dict), "model output must be dict, but find {}".format(type(output))
logit = output['logit'] # logit: {'action_type': action_type_logit, 'action_args': action_args_logit}
# discrete part
action_type_logit = logit['action_type']
prob = torch.softmax(action_type_logit, dim=-1)
pi_action = Categorical(prob)
action_type = pi_action.sample()
# continuous part
mu, sigma = logit['action_args']['mu'], logit['action_args']['sigma']
dist = Independent(Normal(mu, sigma), 1)
action_args = dist.sample()
action = {'action_type': action_type, 'action_args': action_args}
output['action'] = action
return output
class HybridDeterministicArgmaxSampleWrapper(IModelWrapper):
"""
Overview:
Deterministic sampler coupled with argmax sample used in eval_model.
In hybrid action space, i.e. {'action_type': discrete, 'action_args': continuous}.
Interfaces:
forward
"""
def forward(self, *args, **kwargs):
output = self._model.forward(*args, **kwargs)
assert isinstance(output, dict), "model output must be dict, but find {}".format(type(output))
logit = output['logit'] # logit: {'action_type': action_type_logit, 'action_args': action_args_logit}
# discrete part
action_type_logit = logit['action_type']
action_type = action_type_logit.argmax(dim=-1)
# continuous part
mu = logit['action_args']['mu']
action_args = mu
action = {'action_type': action_type, 'action_args': action_args}
output['action'] = action
return output
class DeterministicSample(IModelWrapper):
"""
Overview:
Deterministic sampler (just use mu directly) used in eval_model.
Interfaces:
forward
"""
def forward(self, *args, **kwargs):
output = self._model.forward(*args, **kwargs)
assert isinstance(output, dict), "model output must be dict, but find {}".format(type(output))
output['action'] = output['logit']['mu']
return output
class ReparamSample(IModelWrapper):
"""
Overview:
Reparameterization gaussian sampler used in collector_model.
Interfaces:
forward
"""
def forward(self, *args, **kwargs):
output = self._model.forward(*args, **kwargs)
assert isinstance(output, dict), "model output must be dict, but find {}".format(type(output))
mu, sigma = output['logit']['mu'], output['logit']['sigma']
dist = Independent(Normal(mu, sigma), 1)
output['action'] = dist.sample()
return output
class EpsGreedySampleNGUWrapper(IModelWrapper):
r"""
Overview:
......@@ -592,8 +678,12 @@ wrapper_name_map = {
'eps_greedy_sample': EpsGreedySampleWrapper,
'eps_greedy_sample_ngu': EpsGreedySampleNGUWrapper,
'eps_greedy_multinomial_sample': EpsGreedyMultinomialSampleWrapper,
'deterministic_sample': DeterministicSample,
'reparam_sample': ReparamSample,
'hybrid_eps_greedy_sample': HybridEpsGreedySampleWrapper,
'hybrid_eps_greedy_multinomial_sample': HybridEpsGreedyMultinomialSampleWrapper,
'hybrid_reparam_multinomial_sample': HybridReparamMultinomialSampleWrapper,
'hybrid_deterministic_argmax_sample': HybridDeterministicArgmaxSampleWrapper,
'multinomial_sample': MultinomialSampleWrapper,
'action_noise': ActionNoiseWrapper,
# model wrapper
......@@ -607,6 +697,8 @@ def model_wrap(model, wrapper_name: str = None, **kwargs):
if not isinstance(model, IModelWrapper):
model = wrapper_name_map['base'](model)
model = wrapper_name_map[wrapper_name](model, **kwargs)
else:
raise TypeError("not support model_wrapper type: {}".format(wrapper_name))
return model
......
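A sketch of attaching the newly registered hybrid wrappers, mirroring PPOPolicy._init_collect / _init_eval later in this diff (the model construction and import paths are assumptions):

import torch
from easydict import EasyDict
from ding.model import VAC, model_wrap  # import paths are assumptions

model = VAC(obs_shape=10, action_shape=EasyDict({'action_type_shape': 3, 'action_args_shape': 2}), action_space='hybrid')
collect_model = model_wrap(model, wrapper_name='hybrid_reparam_multinomial_sample')
collect_model.reset()
out = collect_model.forward(torch.randn(4, 10), mode='compute_actor_critic')
# out['action']: {'action_type': (4, ) sampled indices, 'action_args': (4, 2) sampled args}
eval_model = model_wrap(model, wrapper_name='hybrid_deterministic_argmax_sample')
eval_model.reset()
# in eval, action_type comes from argmax and action_args from mu (no sampling)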
......@@ -99,7 +99,9 @@ class CQLPolicy(SACPolicy):
# and learning_rate_policy in `cfg.policy.learn`.
# Default to False.
# value_network=False,
actor_head_type='reparameterization',
# (str type) action_space: use the reparameterization trick for continuous action
action_space='reparameterization',
),
learn=dict(
# (bool) Whether to use multi gpu
......
......@@ -8,10 +8,11 @@ from ding.model import model_wrap
from ding.utils import POLICY_REGISTRY
from ding.utils.data import default_collate, default_decollate
from .base_policy import Policy
try:
from dizoo.gfootball.model.bots import FootballKaggle5thPlaceModel
except ImportError:
FootballKaggle5thPlaceModel = None
# try:
# from dizoo.gfootball.model.bots import FootballKaggle5thPlaceModel
# except ImportError:
# FootballKaggle5thPlaceModel = None
FootballKaggle5thPlaceModel = None
@POLICY_REGISTRY.register('IL')
......
......@@ -86,7 +86,7 @@ class PPGPolicy(Policy):
# (bool) Whether to use cuda for network.
cuda=False,
# (bool) Whether the RL algorithm is on-policy or off-policy. (Note: in practice PPO can be off-policy used)
on_policy=True,
on_policy=False,
priority=False,
# (bool) Whether use Importance Sampling Weight to correct biased update. If True, priority must be True.
priority_IS_weight=False,
......
......@@ -32,11 +32,16 @@ class PPOPolicy(Policy):
on_policy=True,
# (bool) Whether to use priority(priority sample, IS weight, update priority)
priority=False,
# (bool) Whether use Importance Sampling Weight to correct biased update. If True, priority must be True.
# (bool) Whether to use Importance Sampling Weight to correct biased update due to priority.
# If True, priority must be True.
priority_IS_weight=False,
# (bool) Whether to recompute advantages in each iteration of on-policy PPO
recompute_adv=True,
continuous=True,
# (str) Which kind of action space used in PPOPolicy, ['discrete', 'continuous', 'hybrid']
action_space='discrete',
# (bool) Whether to use nstep return to calculate value target, otherwise, use return = adv + value
nstep_return=False,
# (bool) Whether to enable multi-agent training, i.e.: MAPPO
multi_agent=False,
# (bool) Whether to need policy data in process transition
transition_with_policy_data=True,
......@@ -89,16 +94,22 @@ class PPOPolicy(Policy):
self._priority_IS_weight = self._cfg.priority_IS_weight
assert not self._priority and not self._priority_IS_weight, "Priority is not implemented in PPO"
self._continuous = self._cfg.continuous
self._action_space = self._cfg.action_space
if self._cfg.learn.ppo_param_init:
for n, m in self._model.named_modules():
if isinstance(m, torch.nn.Linear):
torch.nn.init.orthogonal_(m.weight)
torch.nn.init.zeros_(m.bias)
if self._continuous:
if self._action_space in ['continuous', 'hybrid']:
# init log sigma
if hasattr(self._model.actor_head, 'log_sigma_param'):
torch.nn.init.constant_(self._model.actor_head.log_sigma_param, -0.5)
if self._action_space == 'continuous':
if hasattr(self._model.actor_head, 'log_sigma_param'):
torch.nn.init.constant_(self._model.actor_head.log_sigma_param, -0.5)
elif self._action_space == 'hybrid': # actor_head[1]: ReparameterizationHead, for action_args
if hasattr(self._model.actor_head[1], 'log_sigma_param'):
torch.nn.init.constant_(self._model.actor_head[1].log_sigma_param, -0.5)
print('init ok')
for m in list(self._model.critic.modules()) + list(self._model.actor.modules()):
if isinstance(m, torch.nn.Linear):
# orthogonal initialization
......@@ -194,18 +205,42 @@ class PPOPolicy(Policy):
adv = (adv - adv.mean()) / (adv.std() + 1e-8)
# Calculate ppo error
if self._continuous:
if self._action_space == 'continuous':
ppo_batch = ppo_data(
output['logit'], batch['logit'], batch['action'], output['value'], batch['value'], adv,
batch['return'], batch['weight']
)
ppo_loss, ppo_info = ppo_error_continuous(ppo_batch, self._clip_ratio)
else:
elif self._action_space == 'discrete':
ppo_batch = ppo_data(
output['logit'], batch['logit'], batch['action'], output['value'], batch['value'], adv,
batch['return'], batch['weight']
)
ppo_loss, ppo_info = ppo_error(ppo_batch, self._clip_ratio)
elif self._action_space == 'hybrid':
# discrete part (discrete policy loss and entropy loss)
ppo_discrete_batch = ppo_policy_data(
output['logit']['action_type'], batch['logit']['action_type'], batch['action']['action_type'],
adv, batch['weight']
)
ppo_discrete_loss, ppo_discrete_info = ppo_policy_error(ppo_discrete_batch, self._clip_ratio)
# continuous part (continuous policy loss and entropy loss, value loss)
ppo_continuous_batch = ppo_data(
output['logit']['action_args'], batch['logit']['action_args'], batch['action']['action_args'],
output['value'], batch['value'], adv, batch['return'], batch['weight']
)
ppo_continuous_loss, ppo_continuous_info = ppo_error_continuous(
ppo_continuous_batch, self._clip_ratio
)
# sum discrete and continuous loss
ppo_loss = type(ppo_continuous_loss)(
ppo_continuous_loss.policy_loss + ppo_discrete_loss.policy_loss, ppo_continuous_loss.value_loss,
ppo_continuous_loss.entropy_loss + ppo_discrete_loss.entropy_loss
)
ppo_info = type(ppo_continuous_info)(
max(ppo_continuous_info.approx_kl, ppo_discrete_info.approx_kl),
max(ppo_continuous_info.clipfrac, ppo_discrete_info.clipfrac)
)
wv, we = self._value_weight, self._entropy_weight
total_loss = ppo_loss.policy_loss + wv * ppo_loss.value_loss - we * ppo_loss.entropy_loss
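The hybrid branch above sums the discrete and continuous policy/entropy losses and keeps the value loss from the continuous (shared critic) part. A standalone sketch of that combination pattern with stand-in namedtuples (the field names follow the code above; the numbers are arbitrary):

from collections import namedtuple
import torch

ppo_loss_t = namedtuple('ppo_loss', ['policy_loss', 'value_loss', 'entropy_loss'])
ppo_policy_loss_t = namedtuple('ppo_policy_loss', ['policy_loss', 'entropy_loss'])

cont_loss = ppo_loss_t(torch.tensor(0.40), torch.tensor(0.20), torch.tensor(1.10))
disc_loss = ppo_policy_loss_t(torch.tensor(0.30), torch.tensor(0.70))
total = ppo_loss_t(
    cont_loss.policy_loss + disc_loss.policy_loss,    # policy losses are summed
    cont_loss.value_loss,                             # value loss comes only from the shared critic
    cont_loss.entropy_loss + disc_loss.entropy_loss,  # entropy losses are summed
)
wv, we = 0.5, 0.03  # value_weight, entropy_weight (0.03 per the commit message)
total_loss = total.policy_loss + wv * total.value_loss - we * total.entropy_loss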
......@@ -225,13 +260,13 @@ class PPOPolicy(Policy):
'value_max': output['value'].max().item(),
'approx_kl': ppo_info.approx_kl,
'clipfrac': ppo_info.clipfrac,
'act': batch['action'].float().mean().item(),
}
if self._continuous:
if self._action_space == 'continuous':
return_info.update(
{
'mu_mean': output['logit'][0].mean().item(),
'sigma_mean': output['logit'][1].mean().item(),
'act': batch['action'].float().mean().item(),
'mu_mean': output['logit']['mu'].mean().item(),
'sigma_mean': output['logit']['sigma'].mean().item(),
}
)
return_infos.append(return_info)
......@@ -254,11 +289,13 @@ class PPOPolicy(Policy):
Init traj and unroll length, collect model.
"""
self._unroll_len = self._cfg.collect.unroll_len
self._continuous = self._cfg.continuous
if self._continuous:
self._collect_model = model_wrap(self._model, wrapper_name='base')
else:
self._action_space = self._cfg.action_space
if self._action_space == 'continuous':
self._collect_model = model_wrap(self._model, wrapper_name='reparam_sample')
elif self._action_space == 'discrete':
self._collect_model = model_wrap(self._model, wrapper_name='multinomial_sample')
elif self._action_space == 'hybrid':
self._collect_model = model_wrap(self._model, wrapper_name='hybrid_reparam_multinomial_sample')
self._collect_model.reset()
self._gamma = self._cfg.collect.discount_factor
self._gae_lambda = self._cfg.collect.gae_lambda
......@@ -283,10 +320,6 @@ class PPOPolicy(Policy):
self._collect_model.eval()
with torch.no_grad():
output = self._collect_model.forward(data, mode='compute_actor_critic')
if self._continuous:
(mu, sigma), value = output['logit'], output['value']
dist = Independent(Normal(mu, sigma), 1)
output['action'] = dist.sample()
if self._cuda:
output = to_device(output, 'cpu')
output = default_decollate(output)
......@@ -378,11 +411,13 @@ class PPOPolicy(Policy):
Evaluate mode init method. Called by ``self.__init__``.
Init eval model with argmax strategy.
"""
self._continuous = self._cfg.continuous
if self._continuous:
self._eval_model = model_wrap(self._model, wrapper_name='base')
else:
self._action_space = self._cfg.action_space
if self._action_space == 'continuous':
self._eval_model = model_wrap(self._model, wrapper_name='deterministic_sample')
elif self._action_space == 'discrete':
self._eval_model = model_wrap(self._model, wrapper_name='argmax_sample')
elif self._action_space == 'hybrid':
self._eval_model = model_wrap(self._model, wrapper_name='hybrid_deterministic_argmax_sample')
self._eval_model.reset()
def _forward_eval(self, data: dict) -> dict:
......@@ -404,9 +439,6 @@ class PPOPolicy(Policy):
self._eval_model.eval()
with torch.no_grad():
output = self._eval_model.forward(data, mode='compute_actor')
if self._continuous:
(mu, sigma) = output['logit']
output.update({'action': mu})
if self._cuda:
output = to_device(output, 'cpu')
output = default_decollate(output)
......@@ -430,7 +462,7 @@ class PPOPolicy(Policy):
'value_max',
'value_mean',
]
if self._continuous:
if self._action_space == 'continuous':
variables += ['mu_mean', 'sigma_mean', 'sigma_grad', 'act']
return variables
......
......@@ -599,7 +599,9 @@ class SACPolicy(Policy):
# and learning_rate_policy in `cfg.policy.learn`.
# Default to False.
# value_network=False,
actor_head_type='reparameterization',
# (str type) action_space: use the reparameterization trick for continuous action
action_space='reparameterization',
),
learn=dict(
# (bool) Whether to use multi gpu
......
......@@ -108,7 +108,6 @@ def ppo_policy_error(data: namedtuple,
# only use dual_clip when adv < 0
policy_loss = -(torch.where(adv < 0, clip2, clip1) * weight).mean()
else:
#policy_loss = (-torch.min(surr1, surr2) * weight).mean()
policy_loss = (-torch.min(surr1, surr2) * weight).mean()
with torch.no_grad():
approx_kl = (logp_old - logp_new).mean().item()
......@@ -179,11 +178,11 @@ def ppo_error_continuous(
if weight is None:
weight = torch.ones_like(adv)
dist_new = Independent(Normal(mu_sigma_new[0], mu_sigma_new[1]), 1)
if len(mu_sigma_old[0].shape) == 1:
dist_old = Independent(Normal(mu_sigma_old[0].unsqueeze(-1), mu_sigma_old[1].unsqueeze(-1)), 1)
dist_new = Independent(Normal(mu_sigma_new['mu'], mu_sigma_new['sigma']), 1)
if len(mu_sigma_old['mu'].shape) == 1:
dist_old = Independent(Normal(mu_sigma_old['mu'].unsqueeze(-1), mu_sigma_old['sigma'].unsqueeze(-1)), 1)
else:
dist_old = Independent(Normal(mu_sigma_old[0], mu_sigma_old[1]), 1)
dist_old = Independent(Normal(mu_sigma_old['mu'], mu_sigma_old['sigma']), 1)
logp_new = dist_new.log_prob(action)
logp_old = dist_old.log_prob(action)
entropy_loss = (dist_new.entropy() * weight).mean()
......
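The change above switches `mu_sigma_new` / `mu_sigma_old` from a `[mu, sigma]` list to a `{'mu': ..., 'sigma': ...}` dict. A minimal sketch of the distribution and log-prob computation involved, using only torch.distributions (shapes are illustrative):

import torch
from torch.distributions import Independent, Normal

B, N = 4, 6
mu_sigma_new = {'mu': torch.rand(B, N), 'sigma': torch.rand(B, N) + 0.1}
dist_new = Independent(Normal(mu_sigma_new['mu'], mu_sigma_new['sigma']), 1)
action = torch.rand(B, N)
logp_new = dist_new.log_prob(action)  # shape (B, ): one joint log-prob per sample
entropy = dist_new.entropy()          # shape (B, )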
......@@ -70,11 +70,11 @@ def test_mappo():
@pytest.mark.parametrize('use_value_clip, dual_clip, weight', args)
def test_ppo_error_continous(use_value_clip, dual_clip, weight):
B, N = 4, 6
mu_sigma_new = [torch.rand(B, N).requires_grad_(True), torch.rand(B, N).requires_grad_(True)]
mu_sigma_old = [
mu_sigma_new[0] + torch.rand_like(mu_sigma_new[0]) * 0.1,
mu_sigma_new[1] + torch.rand_like(mu_sigma_new[1]) * 0.1
]
mu_sigma_new = {'mu': torch.rand(B, N).requires_grad_(True), 'sigma': torch.rand(B, N).requires_grad_(True)}
mu_sigma_old = {
'mu': mu_sigma_new['mu'] + torch.rand_like(mu_sigma_new['mu']) * 0.1,
'sigma': mu_sigma_new['sigma'] + torch.rand_like(mu_sigma_new['sigma']) * 0.1
}
action = torch.rand(B, N)
value_new = torch.randn(B).requires_grad_(True)
value_old = value_new + torch.rand_like(value_new) * 0.1
......@@ -84,9 +84,9 @@ def test_ppo_error_continous(use_value_clip, dual_clip, weight):
loss, info = ppo_error_continuous(data, use_value_clip=use_value_clip, dual_clip=dual_clip)
assert all([l.shape == tuple() for l in loss])
assert all([np.isscalar(i) for i in info])
assert mu_sigma_new[0].grad is None
assert mu_sigma_new['mu'].grad is None
assert value_new.grad is None
total_loss = sum(loss)
total_loss.backward()
assert isinstance(mu_sigma_new[0].grad, torch.Tensor)
assert isinstance(mu_sigma_new['mu'].grad, torch.Tensor)
assert isinstance(value_new.grad, torch.Tensor)
......@@ -410,6 +410,17 @@ def one_time_warning(warning_msg: str) -> None:
logging.warning(warning_msg)
def split_fn(data, indices, start, end):
if data is None:
return None
elif isinstance(data, list):
return [split_fn(d, indices, start, end) for d in data]
elif isinstance(data, dict):
return {k1: split_fn(v1, indices, start, end) for k1, v1 in data.items()}
else:
return data[indices[start:end]]
def split_data_generator(data: dict, split_size: int, shuffle: bool = True) -> dict:
assert isinstance(data, dict), type(data)
length = []
......@@ -436,31 +447,7 @@ def split_data_generator(data: dict, split_size: int, shuffle: bool = True) -> d
for i in range(0, length, split_size):
if i + split_size > length:
i = length - split_size
batch = {}
for k in data.keys():
if data[k] is None:
batch[k] = None
elif k.startswith('prev_state'):
batch[k] = [data[k][t] for t in indices[i:i + split_size]]
elif isinstance(data[k], list) or isinstance(data[k], tuple):
if isinstance(data[k][0], list) and k == 'logit':
# for continuous action
# transform to mu_sigma (:obj:`list`): :math:`[(B, N), (B, N)]`,
# where B is batch size and N is action dim
batch[k] = [
torch.stack(
[
data[k][transition_index][mu_sigma_index]
for transition_index in indices[i:i + split_size]
]
) for mu_sigma_index in range(2)
]
else: # for discrete action
batch[k] = [t[indices[i:i + split_size]] for t in data[k]]
elif isinstance(data[k], dict):
batch[k] = {k1: v1[indices[i:i + split_size]] for k1, v1 in data[k].items()}
else:
batch[k] = data[k][indices[i:i + split_size]]
batch = split_fn(data, indices, i, i + split_size)
yield batch
......
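A small self-contained illustration of what the recursive `split_fn` above does on nested batch data (the function body is replicated so the demo runs on its own; tensor contents are toy values):

import torch

def split_fn(data, indices, start, end):  # replica of the helper above, for a standalone demo
    if data is None:
        return None
    elif isinstance(data, list):
        return [split_fn(d, indices, start, end) for d in data]
    elif isinstance(data, dict):
        return {k: split_fn(v, indices, start, end) for k, v in data.items()}
    else:
        return data[indices[start:end]]

data = {
    'obs': torch.arange(8).view(8, 1),
    'logit': {'mu': torch.randn(8, 2), 'sigma': torch.rand(8, 2)},
    'weight': None,
}
indices = torch.randperm(8)
batch = split_fn(data, indices, 0, 4)
# batch['obs'].shape == (4, 1); batch['logit']['mu'].shape == (4, 2); batch['weight'] is None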
......@@ -15,22 +15,19 @@ bipedalwalker_ppo_config = dict(
),
policy=dict(
cuda=False,
continuous=True,
action_space='continuous',
model=dict(
continuous=True,
action_space='continuous',
obs_shape=24,
action_shape=4,
),
learn=dict(
epoch_per_collect=10,
# update_per_collect=4, # offpolicy
batch_size=64,
learning_rate=0.001,
value_weight=0.5,
entropy_weight=0.01,
clip_ratio=0.2,
nstep=1,
nstep_return=False,
adv_norm=True,
),
collect=dict(
......@@ -50,7 +47,6 @@ bipedalwalker_ppo_create_config = dict(
),
env_manager=dict(type='base'),
# policy=dict(type='ppo_offpolicy'), # TODO
policy=dict(type='ppo'),
)
bipedalwalker_ppo_create_config = EasyDict(bipedalwalker_ppo_create_config)
......
......@@ -18,11 +18,12 @@ lunarlander_ppo_config = dict(
),
policy=dict(
cuda=False,
continuous=False,
action_space='discrete',
recompute_adv=True,
model=dict(
obs_shape=8,
action_shape=4,
action_space='discrete',
),
learn=dict(
update_per_collect=8,
......
......@@ -44,8 +44,6 @@ lunarlander_ngu_config = dict(
type='episodic',
),
policy=dict(
continuous=False,
on_policy=False,
cuda=True,
priority=True,
priority_IS_weight=True,
......
from easydict import EasyDict
from ding.entry import serial_pipeline_reward_model
collector_env_num=8
collector_env_num = 8
lunarlander_ppo_rnd_config = dict(
env=dict(
collector_env_num=collector_env_num,
......@@ -15,20 +15,19 @@ lunarlander_ppo_rnd_config = dict(
# batch_size=32,
# update_per_collect=10,
batch_size=320,
update_per_collect=4, # TODO(pu):2
update_per_collect=4,
),
policy=dict(
recompute_adv=True,
cuda=True,
continuous=False,
on_policy=True,
action_space='discrete',
model=dict(
obs_shape=8,
action_shape=4,
action_space='discrete',
),
learn=dict(
# update_per_collect=4,
epoch_per_collect=10, # TODO(pu)
epoch_per_collect=10,
update_per_collect=1, # 4
batch_size=64,
learning_rate=3e-4,
......
......@@ -17,11 +17,12 @@ cartpole_gcl_ppo_onpolicy_config = dict(
),
policy=dict(
cuda=False,
continuous=False,
recompute_adv=True,
action_space='discrete',
model=dict(
obs_shape=4,
action_shape=2,
action_space='discrete',
encoder_hidden_size_list=[64, 64, 128],
critic_head_hidden_size=128,
actor_head_hidden_size=128,
......
......@@ -10,10 +10,11 @@ cartpole_ppo_config = dict(
),
policy=dict(
cuda=False,
continuous=False,
action_space='discrete',
model=dict(
obs_shape=4,
action_shape=2,
action_space='discrete',
encoder_hidden_size_list=[64, 64, 128],
critic_head_hidden_size=128,
actor_head_hidden_size=128,
......
......@@ -20,10 +20,11 @@ cartpole_ppo_icm_config = dict(
),
policy=dict(
cuda=False,
continuous=False,
action_space='discrete',
model=dict(
obs_shape=4,
action_shape=2,
action_space='discrete',
encoder_hidden_size_list=[64, 64, 128],
critic_head_hidden_size=128,
actor_head_hidden_size=128,
......@@ -60,4 +61,4 @@ cartpole_ppo_icm_create_config = EasyDict(cartpole_ppo_icm_create_config)
create_config = cartpole_ppo_icm_create_config
if __name__ == '__main__':
serial_pipeline_reward_model([main_config, create_config], seed=0)
\ No newline at end of file
serial_pipeline_reward_model([main_config, create_config], seed=0)
......@@ -16,6 +16,7 @@ cartpole_ppo_offpolicy_config = dict(
encoder_hidden_size_list=[64, 64, 128],
critic_head_hidden_size=128,
actor_head_hidden_size=128,
action_space='discrete',
),
learn=dict(
update_per_collect=6,
......
......@@ -15,7 +15,7 @@ pendulum_cql_default_config = dict(
obs_shape=3,
action_shape=1,
twin_critic=True,
actor_head_type='reparameterization',
action_space='reparameterization',
actor_head_hidden_size=128,
critic_head_hidden_size=128,
),
......
......@@ -17,7 +17,7 @@ pendulum_d4pg_config = dict(
model=dict(
obs_shape=3,
action_shape=1,
actor_head_type='regression',
action_space='regression',
v_min=-100,
v_max=100,
n_atom=51,
......
......@@ -17,7 +17,7 @@ pendulum_ddpg_config = dict(
obs_shape=3,
action_shape=1,
twin_critic=False,
actor_head_type='regression',
action_space='regression',
),
learn=dict(
update_per_collect=2,
......
......@@ -10,13 +10,13 @@ pendulum_ppo_config = dict(
),
policy=dict(
cuda=False,
continuous=True,
action_space='continuous',
recompute_adv=True,
model=dict(
obs_shape=3,
action_shape=1,
encoder_hidden_size_list=[64, 64],
continuous=True,
action_space='continuous',
actor_head_layer_num=0,
critic_head_layer_num=0,
sigma_type='conditioned',
......
......@@ -18,7 +18,7 @@ pendulum_sac_config = dict(
obs_shape=3,
action_shape=1,
twin_critic=True,
actor_head_type='reparameterization',
action_space='reparameterization',
actor_head_hidden_size=128,
critic_head_hidden_size=128,
),
......
......@@ -17,7 +17,7 @@ pendulum_sac_data_genearation_default_config = dict(
obs_shape=3,
action_shape=1,
twin_critic=True,
actor_head_type='reparameterization',
action_space='reparameterization',
actor_head_hidden_size=128,
critic_head_hidden_size=128,
),
......
......@@ -18,7 +18,7 @@ pendulum_td3_bc_config = dict(
obs_shape=3,
action_shape=1,
twin_critic=True,
actor_head_type='regression',
action_space='regression',
actor_head_hidden_size=128,
critic_head_hidden_size=128,
),
......@@ -44,7 +44,7 @@ pendulum_td3_bc_config = dict(
noise_sigma=0.1,
collector=dict(collect_print_freq=1000, ),
data_type='hdf5',
data_path = './td3/expert_demos.hdf5',
data_path='./td3/expert_demos.hdf5',
normalize_states=True,
),
eval=dict(evaluator=dict(eval_freq=100, ), ),
......
......@@ -18,7 +18,7 @@ pendulum_td3_config = dict(
obs_shape=3,
action_shape=1,
twin_critic=True,
actor_head_type='regression',
action_space='regression',
),
learn=dict(
update_per_collect=2,
......
......@@ -18,7 +18,7 @@ pendulum_td3_generation_config = dict(
obs_shape=3,
action_shape=1,
twin_critic=True,
actor_head_type='regression',
action_space='regression',
),
learn=dict(
update_per_collect=2,
......@@ -33,7 +33,7 @@ pendulum_td3_generation_config = dict(
min=-0.5,
max=0.5,
),
learner = dict(
learner=dict(
load_path='./td3/ckpt/ckpt_best.pth.tar',
hook=dict(
load_ckpt_before_run='./td3/ckpt/ckpt_best.pth.tar',
......@@ -46,7 +46,7 @@ pendulum_td3_generation_config = dict(
noise_sigma=0.1,
collector=dict(collect_print_freq=1000, ),
save_path='./td3/expert.pkl',
data_type = 'hdf5',
data_type='hdf5',
),
eval=dict(evaluator=dict(eval_freq=100, ), ),
other=dict(replay_buffer=dict(
......
......@@ -24,10 +24,9 @@ gym_hybrid_ddpg_config = dict(
action_args_shape=2,
),
twin_critic=False,
actor_head_type='hybrid',
action_space='hybrid',
),
learn=dict(
action_space='hybrid',
update_per_collect=10, # [5, 10]
batch_size=32,
discount_factor=0.99,
......
from easydict import EasyDict
from ding.entry import serial_pipeline_onpolicy
gym_hybrid_hppo_config = dict(
exp_name='gym_hybrid_hppo_actsacle_fsv0.3_ew0.03_seed0',
env=dict(
collector_env_num=8,
evaluator_env_num=5,
# (bool) Scale output action into legal range, usually [-1, 1].
act_scale=True,
env_id='Moving-v0', # ['Sliding-v0', 'Moving-v0']
n_evaluator_episode=5,
stop_value=1.8,
),
policy=dict(
cuda=True,
priority=False,
action_space='hybrid',
recompute_adv=True,
model=dict(
obs_shape=10,
action_shape=dict(
action_type_shape=3,
action_args_shape=2,
),
action_space='hybrid',
encoder_hidden_size_list=[256, 128, 64, 64],
sigma_type='fixed',
fixed_sigma_value=0.3, # TODO(pu)
bound_type='tanh',
),
learn=dict(
epoch_per_collect=10,
batch_size=320,
learning_rate=3e-4,
value_weight=0.5,
entropy_weight=0.03, # TODO(pu)
clip_ratio=0.2,
adv_norm=True,
value_norm=True,
),
collect=dict(
n_sample=int(3200),
discount_factor=0.99,
gae_lambda=0.95,
collector=dict(collect_print_freq=1000, ),
),
eval=dict(evaluator=dict(eval_freq=200, ), ),
),
)
gym_hybrid_hppo_config = EasyDict(gym_hybrid_hppo_config)
main_config = gym_hybrid_hppo_config
gym_hybrid_hppo_create_config = dict(
env=dict(
type='gym_hybrid',
import_names=['dizoo.gym_hybrid.envs.gym_hybrid_env'],
),
env_manager=dict(type='base'),
policy=dict(type='ppo'),
)
gym_hybrid_hppo_create_config = EasyDict(gym_hybrid_hppo_create_config)
create_config = gym_hybrid_hppo_create_config
if __name__ == "__main__":
serial_pipeline_onpolicy([main_config, create_config], seed=0)
......@@ -3,7 +3,6 @@ from ding.entry import serial_pipeline
gym_hybrid_pdqn_config = dict(
exp_name='gym_hybrid_pdqn_seed1',
# exp_name='gym_hybrid_pdqn_dataaction_1encoder_lrd1e-5_lrc1e-3_upc100_seed0',
env=dict(
collector_env_num=8,
......
......@@ -54,8 +54,11 @@ class GymHybridEnv(BaseEnv):
def step(self, action: Dict) -> BaseEnvTimestep:
if self._act_scale:
# acceleration_value
# acceleration_value.
action['action_args'][0] = affine_transform(action['action_args'][0], min_val=0, max_val=1)
# rotation_value. The following line can be omitted, because the affine_transform
# function already applies the clip(-1, 1) operation.
action['action_args'][1] = affine_transform(action['action_args'][1], min_val=-1, max_val=1)
action = [action['action_type'], action['action_args']]
obs, rew, done, info = self._env.step(action)
self._final_eval_reward += rew
......
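A standalone sketch of what the act_scale step above computes: the raw continuous outputs are clipped to [-1, 1] and then mapped affinely into the env's legal ranges ([0, 1] for acceleration_value, [-1, 1] for rotation_value). The helper below is a stand-in written out to make the mapping explicit, not ding's actual affine_transform:

import numpy as np

def affine_transform_sketch(x, min_val, max_val):
    # clip to [-1, 1] first (per the comment above, the real affine_transform also clips)
    x = np.clip(x, -1.0, 1.0)
    # then map [-1, 1] -> [min_val, max_val]
    return (x + 1.0) / 2.0 * (max_val - min_val) + min_val

acceleration_value = affine_transform_sketch(0.4, min_val=0, max_val=1)  # -> 0.7
rotation_value = affine_transform_sketch(-1.3, min_val=-1, max_val=1)    # -> -1.0 (clipped)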
......@@ -13,10 +13,11 @@ league_demo_ppo_config = dict(
),
policy=dict(
cuda=False,
continuous=False,
action_space='discrete',
model=dict(
obs_shape=2,
action_shape=2,
action_space='discrete',
encoder_hidden_size_list=[32, 32],
critic_head_hidden_size=32,
actor_head_hidden_size=32,
......
......@@ -4,12 +4,10 @@ from easydict import EasyDict
from ding.entry import serial_pipeline_reward_model_ngu
print(torch.cuda.is_available(), torch.__version__)
collector_env_num = 32 #TODO
collector_env_num = 32
evaluator_env_num = 5
nstep = 5
minigrid_ppo_rnd_config = dict(
# exp_name='debug_minigrid_empty8_ngu_n5_bs2_ul98_erbm1',
# exp_name='debug_minigrid_fourrooms_ngu_er01_rbs5e4_n32',
minigrid_ppo_ngu_config = dict(
exp_name='debug_minigrid_doorkey_ngu_ul298_er01_rbs3e4_n32',
env=dict(
collector_env_num=collector_env_num,
......@@ -47,8 +45,6 @@ minigrid_ppo_rnd_config = dict(
type='episodic',
),
policy=dict(
continuous=False,
on_policy=False,
cuda=True,
priority=True,
priority_IS_weight=True,
......@@ -83,7 +79,7 @@ minigrid_ppo_rnd_config = dict(
),
replay_buffer=dict(
replay_buffer_size=30000,
# (Float type) How much prioritization is used: 0 means no prioritization while 1 means full prioritization
# (Float type) How much prioritization is used: 0 means no prioritization while 1 means full
alpha=0.6,
# (Float type) How much correction is used: 0 means no correction while 1 means full correction
beta=0.4,
......@@ -91,9 +87,9 @@ minigrid_ppo_rnd_config = dict(
),
),
)
minigrid_ppo_rnd_config = EasyDict(minigrid_ppo_rnd_config)
main_config = minigrid_ppo_rnd_config
minigrid_ppo_rnd_create_config = dict(
minigrid_ppo_ngu_config = EasyDict(minigrid_ppo_ngu_config)
main_config = minigrid_ppo_ngu_config
minigrid_ppo_ngu_create_config = dict(
env=dict(
type='minigrid',
import_names=['dizoo.minigrid.envs.minigrid_env'],
......@@ -105,8 +101,8 @@ minigrid_ppo_rnd_create_config = dict(
episodic_reward_model=dict(type='episodic'),
collector=dict(type='sample_ngu', )
)
minigrid_ppo_rnd_create_config = EasyDict(minigrid_ppo_rnd_create_config)
create_config = minigrid_ppo_rnd_create_config
minigrid_ppo_ngu_create_config = EasyDict(minigrid_ppo_ngu_create_config)
create_config = minigrid_ppo_ngu_create_config
if __name__ == "__main__":
serial_pipeline_reward_model_ngu([main_config, create_config], seed=0)
......@@ -2,10 +2,7 @@ from easydict import EasyDict
from ding.entry import serial_pipeline_onpolicy
collector_env_num = 8
minigrid_ppo_config = dict(
# exp_name="minigrid_empty8_onppo",
exp_name="minigrid_fourrooms_onppo",
# exp_name="minigrid_doorkey88_onppo",
# exp_name="minigrid_doorkey_onppo",
env=dict(
collector_env_num=8,
evaluator_env_num=5,
......@@ -19,11 +16,11 @@ minigrid_ppo_config = dict(
policy=dict(
cuda=True,
recompute_adv=True,
continuous=False,
on_policy=True,
action_space='discrete',
model=dict(
obs_shape=2739,
action_shape=7,
action_space='discrete',
encoder_hidden_size_list=[256, 128, 64, 64],
),
learn=dict(
......
......@@ -4,12 +4,7 @@ import torch
print(torch.__version__, torch.cuda.is_available())
collector_env_num = 8
minigrid_ppo_rnd_config = dict(
# exp_name='minigrid_empty8_rnd_onppo_b01_weight1000_maxlen100',
# exp_name='minigrid_fourrooms_rnd_onppo_b01_weight1000_maxlen100',
exp_name='minigrid_doorkey88_rnd_onppo_b01_weight1000_maxlen300',
# exp_name='minigrid_doorkey_rnd_onppo_b01_weight1000_maxlen300',
# exp_name='minigrid_kcs3r3_rnd_onppo_b01',
# exp_name='minigrid_om2dlh_rnd_onppo_b01',
env=dict(
collector_env_num=collector_env_num,
evaluator_env_num=5,
......@@ -38,17 +33,17 @@ minigrid_ppo_rnd_config = dict(
policy=dict(
recompute_adv=True,
cuda=True,
continuous=False,
on_policy=True,
action_space='discrete',
model=dict(
obs_shape=2739,
action_shape=7,
action_space='discrete',
encoder_hidden_size_list=[256, 128, 64, 64],
critic_head_hidden_size=64, # default=64
actor_head_hidden_size=64,
),
learn=dict(
epoch_per_collect=10, # TODO(pu)
epoch_per_collect=10,
update_per_collect=1, # 4
batch_size=320, # 64,
learning_rate=3e-4,
......@@ -60,7 +55,7 @@ minigrid_ppo_rnd_config = dict(
),
collect=dict(
collector_env_num=collector_env_num,
n_sample=int(3200),
n_sample=3200,
# here self.traj_length = 3200//8 = 400, because in minigrid env the max_length is 300.
# in ding/worker/collector/sample_serial_collector.py
# self._traj_len = max(
......
......@@ -4,7 +4,8 @@ from ding.entry import serial_pipeline_onpolicy
collector_env_num = 1
evaluator_env_num = 1
ant_ppo_default_config = dict(
exp_name="result_mujoco/ant_onppo_noig",
exp_name="result_mujoco_para2/ant_onppo_noig_para2_seed0",
# exp_name="result_mujoco_para2/ant_onppo_ig_para2",
env=dict(
env_id='Ant-v3',
norm_obs=dict(use_norm=False, ),
......@@ -18,33 +19,37 @@ ant_ppo_default_config = dict(
policy=dict(
cuda=True,
recompute_adv=True,
continuous=True,
on_policy=True,
action_space='continuous',
model=dict(
continuous=True,
action_space='continuous',
obs_shape=111,
action_shape=8,
),
learn=dict(
epoch_per_collect=10,
update_per_collect=1,
batch_size=64,
batch_size=320,
learning_rate=3e-4,
value_weight=0.25,
entropy_weight=0,
value_weight=0.5,
entropy_weight=0.001,
clip_ratio=0.2,
adv_norm=True,
value_norm=True,
# for onppo, when we recompute adv, we need the key done in data to split traj, so we must
# use ignore_done=False here,
# but when we add key traj_flag in data as the backup for key done, we could choose to use ignore_done=True
# for halfcheetah, the length=1000
# ignore_done=True,
ignore_done=False,
grad_clip_type='clip_norm',
grad_clip_value=0.5,
),
collect=dict(
collector_env_num=collector_env_num,
n_sample=2048,
n_sample=3200,
unroll_len=1,
discount_factor=0.99,
gae_lambda=0.97,
gae_lambda=0.95,
),
eval=dict(evaluator=dict(eval_freq=5000, )),
),
......
from easydict import EasyDict
from ding.entry import serial_pipeline_onpolicy
collector_env_num = 1
evaluator_env_num = 1
ant_ppo_default_config = dict(
exp_name="result_mujoco_para2/ant_onppo_noig_para2_seed0",
# exp_name="result_mujoco_para2/ant_onppo_ig_para2",
env=dict(
env_id='Ant-v3',
norm_obs=dict(use_norm=False, ),
norm_reward=dict(use_norm=False, ),
collector_env_num=collector_env_num,
evaluator_env_num=evaluator_env_num,
use_act_scale=True,
n_evaluator_episode=10,
stop_value=6000,
),
policy=dict(
cuda=True,
recompute_adv=True,
continuous=True,
on_policy=True,
model=dict(
continuous=True,
obs_shape=111,
action_shape=8,
),
learn=dict(
epoch_per_collect=10,
update_per_collect=1,
batch_size=320,
learning_rate=3e-4,
value_weight=0.5,
entropy_weight=0.001,
clip_ratio=0.2,
adv_norm=True,
value_norm=True,
# for onppo, when we recompute adv, we need the key done in data to split traj, so we must use ignore_done=False here,
# but when we add key traj_flag in data as the backup for key done, we could choose to use ignore_done=True
# for halfcheetah, the length=1000
# ignore_done=True,
ignore_done=False,
grad_clip_type='clip_norm',
grad_clip_value=0.5,
),
collect=dict(
collector_env_num=collector_env_num,
n_sample=3200,
unroll_len=1,
discount_factor=0.99,
gae_lambda=0.95,
),
eval=dict(evaluator=dict(eval_freq=5000, )),
),
)
ant_ppo_default_config = EasyDict(ant_ppo_default_config)
main_config = ant_ppo_default_config
ant_ppo_create_default_config = dict(
env=dict(
type='mujoco',
import_names=['dizoo.mujoco.envs.mujoco_env'],
),
# env_manager=dict(type='subprocess'),
env_manager=dict(type='base'),
policy=dict(type='ppo', ),
)
ant_ppo_create_default_config = EasyDict(ant_ppo_create_default_config)
create_config = ant_ppo_create_default_config
if __name__ == "__main__":
serial_pipeline_onpolicy([main_config, create_config], seed=0)
......@@ -4,9 +4,7 @@ from ding.entry import serial_pipeline_onpolicy
collector_env_num = 1
evaluator_env_num = 1
halfcheetah_ppo_default_config = dict(
exp_name="Halfcheetah_onppo",
# exp_name="debug/debug_halfcheetah_onppo_ig",
exp_name="halfcheetah_onppo",
env=dict(
env_id='HalfCheetah-v3',
norm_obs=dict(use_norm=False, ),
......@@ -15,44 +13,41 @@ halfcheetah_ppo_default_config = dict(
evaluator_env_num=evaluator_env_num,
use_act_scale=True,
n_evaluator_episode=10,
# n_evaluator_episode=1,
stop_value=12000,
),
policy=dict(
cuda=True,
recompute_adv=True,
continuous=True,
on_policy=True,
action_space='continuous',
model=dict(
continuous=True,
action_space='continuous',
obs_shape=17,
action_shape=6,
),
learn=dict(
epoch_per_collect=10,#10,
epoch_per_collect=10,
update_per_collect=1,
batch_size=64,#320,
batch_size=320,
learning_rate=3e-4,
value_weight=0.25,#0.5,
entropy_weight=0,#0.001,
value_weight=0.5,
entropy_weight=0.001,
clip_ratio=0.2,
adv_norm=True,
value_norm=True,
# for onppo, when we recompute adv, we need the key done in data to split traj, so we must use ignore_done=False here,
# for onppo, when we recompute adv, we need the key done in data to split traj, so we must
# use ignore_done=False here,
# but when we add key traj_flag in data as the backup for key done, we could choose to use ignore_done=True
# for halfcheetah, the length=1000
# ignore_done=True,
ignore_done=False,
ignore_done=True,
grad_clip_type='clip_norm',
grad_clip_value=0.5,
),
collect=dict(
collector_env_num=collector_env_num,
n_sample=2048,#3200,
n_sample=3200,
unroll_len=1,
discount_factor=0.99,
gae_lambda=0.97,#0.95,
gae_lambda=0.95,
),
eval=dict(evaluator=dict(eval_freq=5000, )),
),
......@@ -65,12 +60,11 @@ halfcheetah_ppo_create_default_config = dict(
type='mujoco',
import_names=['dizoo.mujoco.envs.mujoco_env'],
),
# env_manager=dict(type='subprocess'),
env_manager=dict(type='base'),
env_manager=dict(type='subprocess'),
policy=dict(type='ppo', ),
)
halfcheetah_ppo_create_default_config = EasyDict(halfcheetah_ppo_create_default_config)
create_config = halfcheetah_ppo_create_default_config
if __name__ == "__main__":
serial_pipeline_onpolicy([main_config, create_config], seed=0)
serial_pipeline_onpolicy([main_config, create_config], seed=1)
from easydict import EasyDict
from ding.entry import serial_pipeline_onpolicy
collector_env_num = 1
evaluator_env_num = 1
halfcheetah_ppo_default_config = dict(
# exp_name="result_mujoco_para2/halfcheetah_onppo_noig_para2",
exp_name="result_mujoco_para2/halfcheetah_onppo_ig_para2_seed1",
env=dict(
env_id='HalfCheetah-v3',
norm_obs=dict(use_norm=False, ),
norm_reward=dict(use_norm=False, ),
collector_env_num=collector_env_num,
evaluator_env_num=evaluator_env_num,
use_act_scale=True,
n_evaluator_episode=10,
# n_evaluator_episode=1,
stop_value=12000,
),
policy=dict(
cuda=True,
recompute_adv=True,
continuous=True,
on_policy=True,
model=dict(
continuous=True,
obs_shape=17,
action_shape=6,
),
learn=dict(
epoch_per_collect=10,
update_per_collect=1,
batch_size=320,
learning_rate=3e-4,
value_weight=0.5,
entropy_weight=0.001,
clip_ratio=0.2,
adv_norm=True,
value_norm=True,
# for onppo, when we recompute adv, we need the key done in data to split traj, so we must use ignore_done=False here,
# but when we add key traj_flag in data as the backup for key done, we could choose to use ignore_done=True
# for halfcheetah, the length=1000
ignore_done=True,
# ignore_done=False,
grad_clip_type='clip_norm',
grad_clip_value=0.5,
),
collect=dict(
collector_env_num=collector_env_num,
n_sample=3200,
unroll_len=1,
discount_factor=0.99,
gae_lambda=0.95,
),
eval=dict(evaluator=dict(eval_freq=5000, )),
),
)
halfcheetah_ppo_default_config = EasyDict(halfcheetah_ppo_default_config)
main_config = halfcheetah_ppo_default_config
halfcheetah_ppo_create_default_config = dict(
env=dict(
type='mujoco',
import_names=['dizoo.mujoco.envs.mujoco_env'],
),
env_manager=dict(type='subprocess'),
# env_manager=dict(type='base'),
policy=dict(type='ppo', ),
)
halfcheetah_ppo_create_default_config = EasyDict(halfcheetah_ppo_create_default_config)
create_config = halfcheetah_ppo_create_default_config
if __name__ == "__main__":
serial_pipeline_onpolicy([main_config, create_config], seed=1)
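For reference, the learn fields used throughout these PPO configs (clip_ratio, value_weight, entropy_weight, adv_norm) combine into a single loss in the standard way. The snippet below is a hedged sketch of that combination over plain PyTorch tensors, not the ding policy code; the tensor names are assumptions.

import torch
import torch.nn.functional as F

def ppo_total_loss(logp_new, logp_old, adv, value_pred, value_target, entropy,
                   clip_ratio=0.2, value_weight=0.5, entropy_weight=0.001):
    # Advantage normalization, matching adv_norm=True in the configs above.
    adv = (adv - adv.mean()) / (adv.std() + 1e-8)
    ratio = torch.exp(logp_new - logp_old)
    surr1 = ratio * adv
    surr2 = torch.clamp(ratio, 1.0 - clip_ratio, 1.0 + clip_ratio) * adv
    policy_loss = -torch.min(surr1, surr2).mean()
    value_loss = F.mse_loss(value_pred, value_target)
    # The entropy bonus is subtracted so that higher entropy lowers the total loss.
    return policy_loss + value_weight * value_loss - entropy_weight * entropy.mean()

With n_sample=3200, batch_size=320 and epoch_per_collect=10 as configured here, each collected batch yields 3200 / 320 = 10 minibatches per epoch, i.e. about 100 gradient steps per collection.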
......@@ -25,12 +25,12 @@ hopper_gcl_default_config = dict(
policy=dict(
cuda=False,
recompute_adv=True,
action_space='continuous',
model=dict(
obs_shape=11,
action_shape=3,
continuous=True,
action_space='continuous',
),
continuous=True,
learn=dict(
update_per_collect=10,
batch_size=64,
......@@ -59,10 +59,7 @@ hopper_gcl_create_default_config = dict(
import_names=['dizoo.mujoco.envs.mujoco_env'],
),
env_manager=dict(type='base'),
policy=dict(
type='ppo',
import_names=['ding.policy.ppo'],
),
policy=dict(type='ppo', ),
reward_model=dict(type='guided_cost'),
)
hopper_gcl_create_default_config = EasyDict(hopper_gcl_create_default_config)
......
......@@ -16,12 +16,12 @@ hopper_ppo_default_config = dict(
policy=dict(
cuda=True,
recompute_adv=True,
action_space='continuous',
model=dict(
obs_shape=11,
action_shape=3,
continuous=True,
action_space='continuous',
),
continuous=True,
learn=dict(
epoch_per_collect=10,
update_per_collect=1,
......@@ -57,4 +57,4 @@ hopper_ppo_create_default_config = EasyDict(hopper_ppo_create_default_config)
create_config = hopper_ppo_create_default_config
if __name__ == "__main__":
serial_pipeline_onpolicy([main_config, create_config], seed=0)
\ No newline at end of file
serial_pipeline_onpolicy([main_config, create_config], seed=0)
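The recurring change in these diffs replaces the old boolean `continuous` flag with a single `action_space` string ('discrete', 'continuous', or 'hybrid' for H-PPO). A small sketch of how downstream code can branch on that field is given below; the head modules and the hybrid `action_shape` layout are placeholders chosen for illustration, not the exact ding classes.

import torch.nn as nn
from easydict import EasyDict

def build_actor_head(model_cfg: EasyDict, hidden_size: int = 64) -> nn.Module:
    # Hypothetical builder that only illustrates branching on the new action_space field.
    action_space = model_cfg.action_space
    assert action_space in ['discrete', 'continuous', 'hybrid'], action_space
    if action_space == 'discrete':
        return nn.Linear(hidden_size, model_cfg.action_shape)  # logits head
    elif action_space == 'continuous':
        return nn.Linear(hidden_size, model_cfg.action_shape)  # mu head; sigma handled separately
    else:
        # 'hybrid': one discrete action_type head plus one continuous action_args head;
        # here action_shape is assumed to be a dict-like object with the two sub-shapes.
        return nn.ModuleDict({
            'action_type': nn.Linear(hidden_size, model_cfg.action_shape.action_type_shape),
            'action_args': nn.Linear(hidden_size, model_cfg.action_shape.action_args_shape),
        })

head = build_actor_head(EasyDict(dict(action_space='continuous', action_shape=3)))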
......@@ -21,7 +21,6 @@ walker2d_ddpg_gail_default_config = dict(
update_per_collect=100,
expert_data_path='walker2d_ddpg/expert_data_train.pkl',
load_path='walker2d_ddpg_gail/reward_model/ckpt/ckpt_best.pth.tar', # state_dict of the reward model
collect_count=100000,
),
policy=dict(
......
......@@ -24,12 +24,12 @@ walker_gcl_default_config = dict(
policy=dict(
cuda=False,
recompute_adv=True,
action_space='continuous',
model=dict(
obs_shape=17,
action_shape=6,
continuous=True,
action_space='continuous',
),
continuous=True,
learn=dict(
update_per_collect=10,
batch_size=64,
......@@ -58,10 +58,7 @@ walker_gcl_create_default_config = dict(
import_names=['dizoo.mujoco.envs.mujoco_env'],
),
env_manager=dict(type='base'),
policy=dict(
type='ppo',
import_names=['ding.policy.ppo'],
),
policy=dict(type='ppo', ),
replay_buffer=dict(type='naive', ),
reward_model=dict(type='guided_cost'),
)
......
......@@ -4,7 +4,8 @@ from ding.entry import serial_pipeline_onpolicy
collector_env_num = 1
evaluator_env_num = 1
walker2d_ppo_default_config = dict(
exp_name="result_mujoco/wlker2d_onppo_noig",
# exp_name="result_mujoco_para2/wlker2d_onppo_noig_para2_seed1",
# exp_name="result_mujoco_para2/wlker2d_onppo_ig_para2_seed1",
env=dict(
env_id='Walker2d-v3',
norm_obs=dict(use_norm=False, ),
......@@ -18,24 +19,24 @@ walker2d_ppo_default_config = dict(
policy=dict(
cuda=True,
recompute_adv=True,
continuous=True,
on_policy=True,
action_space='continuous',
model=dict(
continuous=True,
action_space='continuous',
obs_shape=17,
action_shape=6,
),
learn=dict(
epoch_per_collect=10,
epoch_per_collect=10,
update_per_collect=1,
batch_size=64,
batch_size=320,
learning_rate=3e-4,
value_weight=0.25,
entropy_weight=0,
value_weight=0.5,
entropy_weight=0.001,
clip_ratio=0.2,
adv_norm=True,
value_norm=True,
# for onppo, when we recompute adv, we need the key done in data to split traj, so we must use ignore_done=False here,
# For on-policy PPO with recompute_adv=True, the `done` key in the data is needed to split
# trajectories, so ignore_done=False must be used here by default.
# But if the `traj_flag` key is added to the data as a backup for `done`, ignore_done=True can be chosen.
# For HalfCheetah, the episode length is always 1000.
# ignore_done=True,
......@@ -45,10 +46,10 @@ walker2d_ppo_default_config = dict(
),
collect=dict(
collector_env_num=collector_env_num,
n_sample=2048,
n_sample=3200,
unroll_len=1,
discount_factor=0.99,
gae_lambda=0.97,
gae_lambda=0.95,
),
eval=dict(evaluator=dict(eval_freq=5000, )),
),
......@@ -69,4 +70,4 @@ walker2d_ppo_create_default_config = EasyDict(walker2d_ppo_create_default_config
create_config = walker2d_ppo_create_default_config
if __name__ == "__main__":
serial_pipeline_onpolicy([main_config, create_config], seed=0)
serial_pipeline_onpolicy([main_config, create_config], seed=1)
from easydict import EasyDict
from ding.entry import serial_pipeline_onpolicy
collector_env_num = 1
evaluator_env_num = 1
walker2d_ppo_default_config = dict(
# exp_name="result_mujoco_para2/wlker2d_onppo_noig_para2_seed1",
# exp_name="result_mujoco_para2/wlker2d_onppo_ig_para2_seed1",
env=dict(
env_id='Walker2d-v3',
norm_obs=dict(use_norm=False, ),
norm_reward=dict(use_norm=False, ),
collector_env_num=collector_env_num,
evaluator_env_num=evaluator_env_num,
use_act_scale=True,
n_evaluator_episode=10,
stop_value=6000,
),
policy=dict(
cuda=True,
recompute_adv=True,
continuous=True,
on_policy=True,
model=dict(
continuous=True,
obs_shape=17,
action_shape=6,
),
learn=dict(
epoch_per_collect=10,
update_per_collect=1,
batch_size=320,
learning_rate=3e-4,
value_weight=0.5,
entropy_weight=0.001,
clip_ratio=0.2,
adv_norm=True,
value_norm=True,
# For on-policy PPO with recompute_adv=True, the `done` key in the data is needed to split
# trajectories, so ignore_done=False must be used here by default; but if the `traj_flag` key is
# added to the data as a backup for `done`, ignore_done=True can be chosen. For HalfCheetah, the episode length is always 1000.
# ignore_done=True,
ignore_done=False,
grad_clip_type='clip_norm',
grad_clip_value=0.5,
),
collect=dict(
collector_env_num=collector_env_num,
n_sample=3200,
unroll_len=1,
discount_factor=0.99,
gae_lambda=0.95,
),
eval=dict(evaluator=dict(eval_freq=5000, )),
),
)
walker2d_ppo_default_config = EasyDict(walker2d_ppo_default_config)
main_config = walker2d_ppo_default_config
walker2d_ppo_create_default_config = dict(
env=dict(
type='mujoco',
import_names=['dizoo.mujoco.envs.mujoco_env'],
),
# env_manager=dict(type='subprocess'),
env_manager=dict(type='base'),
policy=dict(type='ppo', ),
)
walker2d_ppo_create_default_config = EasyDict(walker2d_ppo_create_default_config)
create_config = walker2d_ppo_create_default_config
if __name__ == "__main__":
serial_pipeline_onpolicy([main_config, create_config], seed=1)
......@@ -20,8 +20,9 @@ main_config = dict(
policy=dict(
cuda=False,
multi_agent=True,
continuous=False,
action_space='discrete',
model=dict(
action_space='discrete',
agent_num=n_agent,
agent_obs_shape=2 + 2 + (n_agent - 1) * 2 + num_landmarks * 2,
global_obs_shape=n_agent * 2 + num_landmarks * 2 + n_agent * 2,
......
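The agent_obs_shape and global_obs_shape in the multi-agent particle config above are computed from n_agent and num_landmarks rather than hard-coded. A worked example is below, assuming n_agent = num_landmarks = 5; the actual values are defined earlier in that config file and may differ, and the per-term interpretation in the comments is a reading of the formula, not a statement about the env source.

# Assumed values for illustration only; the real n_agent / num_landmarks live in the config file.
n_agent, num_landmarks = 5, 5
# Reading of the formula: own velocity (2) + own position (2)
# + relative positions of the other agents + relative positions of the landmarks.
agent_obs_shape = 2 + 2 + (n_agent - 1) * 2 + num_landmarks * 2   # 2 + 2 + 8 + 10 = 22
# Reading of the formula: all agent positions + all landmark positions + all agent velocities.
global_obs_shape = n_agent * 2 + num_landmarks * 2 + n_agent * 2  # 10 + 10 + 10 = 30
print(agent_obs_shape, global_obs_shape)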
......@@ -11,22 +11,21 @@ overcooked_league_demo_ppo_config = dict(
),
policy=dict(
cuda=False,
continuous=False,
recompute_adv=True,
action_space='discrete',
model=dict(
obs_shape=[5, 4, 26],
action_shape=6,
share_encoder=False,
action_space='discrete',
),
learn=dict(
update_per_collect=4,
epoch_per_collect=4,
batch_size=128,
learning_rate=0.001,
value_weight=0.5,
entropy_weight=0.01,
clip_ratio=0.2,
nstep=1,
nstep_return=False,
adv_norm=True,
value_norm=True,
),
......
......@@ -10,8 +10,10 @@ coinrun_ppo_default_config = dict(
),
policy=dict(
cuda=False,
action_space='discrete',
model=dict(
obs_shape=[3, 64, 64],
action_space='discrete',
action_shape=15,
encoder_hidden_size_list=[32, 32, 64],
),
......@@ -34,7 +36,6 @@ coinrun_ppo_default_config = dict(
),
replay_buffer=dict(replay_buffer_size=100000, ),
),
cuda=True,
),
)
coinrun_ppo_default_config = EasyDict(coinrun_ppo_default_config)
......
......@@ -34,7 +34,6 @@ maze_dqn_default_config = dict(
),
replay_buffer=dict(replay_buffer_size=100000, ),
),
cuda=True,
),
)
maze_dqn_default_config = EasyDict(maze_dqn_default_config)
......
......@@ -11,9 +11,11 @@ maze_ppo_default_config = dict(
),
policy=dict(
cuda=False,
action_space='discrete',
model=dict(
obs_shape=[3, 64, 64],
action_shape=15,
action_space='discrete',
encoder_hidden_size_list=[32, 32, 64],
),
learn=dict(
......
......@@ -14,12 +14,12 @@ hopper_ppo_default_config = dict(
policy=dict(
cuda=True,
recompute_adv=True,
action_space='continuous',
model=dict(
obs_shape=11,
action_shape=3,
continuous=True,
action_space='continuous',
),
continuous=True,
learn=dict(
epoch_per_collect=10,
batch_size=64,
......
from easydict import EasyDict
slime_volley_league_ppo_config = dict(
exp_name="slime_volley_league_ppo",
env=dict(
collector_env_num=8,
evaluator_env_num=10,
n_evaluator_episode=100,
stop_value=0,
# Single-agent env for evaluator; Double-agent env for collector.
# Should be assigned True or False in code.
is_evaluator=None,
manager=dict(shared_memory=False, ),
env_id="SlimeVolley-v0",
),
policy=dict(
cuda=False,
continuous=False,
model=dict(
obs_shape=12,
action_shape=6,
encoder_hidden_size_list=[32, 32],
critic_head_hidden_size=32,
actor_head_hidden_size=32,
share_encoder=False,
),
learn=dict(
update_per_collect=3,
batch_size=32,
learning_rate=0.00001,
value_weight=0.5,
entropy_weight=0.0,
clip_ratio=0.2,
),
collect=dict(
n_episode=128, unroll_len=1, discount_factor=1.0, gae_lambda=1.0, collector=dict(get_train_sample=True, )
),
other=dict(
league=dict(
player_category=['default'],
path_policy="slime_volley_league_ppo/policy",
active_players=dict(
main_player=1,
main_exploiter=1,
league_exploiter=1,
),
main_player=dict(
one_phase_step=200,
branch_probs=dict(
pfsp=0.5,
sp=1.0,
),
strong_win_rate=0.7,
),
main_exploiter=dict(
one_phase_step=200,
branch_probs=dict(main_players=1.0, ),
strong_win_rate=0.7,
min_valid_win_rate=0.3,
),
league_exploiter=dict(
one_phase_step=200,
branch_probs=dict(pfsp=1.0, ),
strong_win_rate=0.7,
mutate_prob=0.0,
),
use_pretrain=False,
use_pretrain_init_historical=False,
payoff=dict(
type='battle',
decay=0.99,
min_win_rate_games=8,
)
),
),
),
)
slime_volley_league_ppo_config = EasyDict(slime_volley_league_ppo_config)
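The league section of the config above drives self-play through three player roles whose branch_probs decide how each new game's opponent branch is chosen (pfsp = prioritized fictitious self-play, sp = plain self-play). The sketch below samples a branch from those values, treating them as unnormalized weights since main_player's entries (0.5 and 1.0) do not sum to 1; that interpretation is an assumption, not a description of the ding league module.

import random

def sample_branch(branch_probs: dict) -> str:
    # Treat configured values as unnormalized weights (an assumption; see note above).
    branches = list(branch_probs.keys())
    weights = list(branch_probs.values())
    return random.choices(branches, weights=weights, k=1)[0]

# main_player from the config above picks the pfsp branch roughly one time in three.
print(sample_branch({'pfsp': 0.5, 'sp': 1.0}))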
......@@ -13,10 +13,11 @@ slime_volley_ppo_config = dict(
),
policy=dict(
cuda=True,
continuous=False,
action_space='discrete',
model=dict(
obs_shape=12,
action_shape=6,
action_space='discrete',
encoder_hidden_size_list=[64, 64],
critic_head_hidden_size=64,
actor_head_hidden_size=64,
......
......@@ -31,7 +31,7 @@ main_config = dict(
policy=dict(
cuda=True,
multi_agent=True,
continuous=False,
action_space='discrete',
model=dict(
# (int) agent_num: The number of the agent.
# For SMAC 3s5z, agent_num=8; for 2c_vs_64zg, agent_num=2.
......@@ -49,6 +49,7 @@ main_config = dict(
action_shape=14,
# (List[int]) The size of hidden layer
# hidden_size_list=[64],
action_space='discrete'
),
# used in state_num of hidden_state
learn=dict(
......
......@@ -30,7 +30,7 @@ main_config = dict(
policy=dict(
cuda=True,
multi_agent=True,
continuous=False,
action_space='discrete',
model=dict(
# (int) agent_num: The number of the agent.
# For SMAC 3s5z, agent_num=8; for 2c_vs_64zg, agent_num=2.
......@@ -48,6 +48,7 @@ main_config = dict(
action_shape=12,
# (List[int]) The size of hidden layer
# hidden_size_list=[64],
action_space='discrete',
),
# used in state_num of hidden_state
learn=dict(
......
......@@ -30,7 +30,7 @@ main_config = dict(
policy=dict(
cuda=True,
multi_agent=True,
continuous=False,
action_space='discrete',
model=dict(
# (int) agent_num: The number of the agent.
# For SMAC 3s5z, agent_num=8; for 2c_vs_64zg, agent_num=2.
......@@ -47,6 +47,7 @@ main_config = dict(
action_shape=18,
# (List[int]) The size of hidden layer
# hidden_size_list=[64],
action_space='discrete',
),
# used in state_num of hidden_state
learn=dict(
......
......@@ -30,7 +30,7 @@ main_config = dict(
policy=dict(
cuda=True,
multi_agent=True,
continuous=False,
action_space='discrete',
model=dict(
# (int) agent_num: The number of the agent.
# For SMAC 3s5z, agent_num=8; for 2c_vs_64zg, agent_num=2.
......@@ -48,6 +48,7 @@ main_config = dict(
action_shape=16,
# (List[int]) The size of hidden layer
# hidden_size_list=[64],
action_space='discrete',
),
# used in state_num of hidden_state
learn=dict(
......