diff --git a/ding/model/template/mavac.py b/ding/model/template/mavac.py index d6b415b6c2c00f70eb7d67a2f60ab1ab67d77c7e..6b76eadf58a1b2b063059dfcb5324225de526147 100644 --- a/ding/model/template/mavac.py +++ b/ding/model/template/mavac.py @@ -28,6 +28,7 @@ class MAVAC(nn.Module): actor_head_layer_num: int = 2, critic_head_hidden_size: int = 64, critic_head_layer_num: int = 1, + action_space: str = 'discrete', activation: Optional[nn.Module] = nn.ReLU(), norm_type: Optional[str] = None, ) -> None: diff --git a/ding/model/template/ppg.py b/ding/model/template/ppg.py index 79c5ba0f6988ee3ca2643badf4dfe0e1db48f2f3..dba98d8d4f713c4c776a6d0c0ad4979ada2f76d6 100644 --- a/ding/model/template/ppg.py +++ b/ding/model/template/ppg.py @@ -14,8 +14,8 @@ class PPG(nn.Module): self, obs_shape: Union[int, SequenceType], action_shape: Union[int, SequenceType], + action_space: str = 'discrete', share_encoder: bool = True, - continuous: bool = False, encoder_hidden_size_list: SequenceType = [128, 128, 64], actor_head_hidden_size: int = 64, actor_head_layer_num: int = 2, @@ -26,7 +26,7 @@ class PPG(nn.Module): ) -> None: super(PPG, self).__init__() self.actor_critic = VAC( - obs_shape, action_shape, share_encoder, continuous, encoder_hidden_size_list, actor_head_hidden_size, + obs_shape, action_shape, action_space, share_encoder, encoder_hidden_size_list, actor_head_hidden_size, actor_head_layer_num, critic_head_hidden_size, critic_head_layer_num, activation, norm_type ) self.aux_critic = copy.deepcopy(self.actor_critic.critic) diff --git a/ding/model/template/qac.py b/ding/model/template/qac.py index cce967c506f967706fc45003835cca094396b47f..20746541538ed56fa244dd21171e1edb8a89a858 100644 --- a/ding/model/template/qac.py +++ b/ding/model/template/qac.py @@ -23,7 +23,7 @@ class QAC(nn.Module): self, obs_shape: Union[int, SequenceType], action_shape: Union[int, SequenceType, EasyDict], - actor_head_type: str, + action_space: str, twin_critic: bool = False, actor_head_hidden_size: int = 64, actor_head_layer_num: int = 1, @@ -39,7 +39,7 @@ class QAC(nn.Module): - obs_shape (:obj:`Union[int, SequenceType]`): Observation's space. - action_shape (:obj:`Union[int, SequenceType, EasyDict]`): Action's space, such as 4, (3, ), \ EasyDict({'action_type_shape': 3, 'action_args_shape': 4}). - - actor_head_type (:obj:`str`): Whether choose ``regression`` or ``reparameterization`` or ``hybrid`` . + - action_space (:obj:`str`): Whether choose ``regression`` or ``reparameterization`` or ``hybrid`` . - twin_critic (:obj:`bool`): Whether include twin critic. - actor_head_hidden_size (:obj:`Optional[int]`): The ``hidden_size`` to pass to actor-nn's ``Head``. 
- actor_head_layer_num (:obj:`int`): The num of layers used in the network to compute Q value output \ @@ -56,9 +56,9 @@ class QAC(nn.Module): obs_shape: int = squeeze(obs_shape) action_shape = squeeze(action_shape) self.action_shape = action_shape - self.actor_head_type = actor_head_type - assert self.actor_head_type in ['regression', 'reparameterization', 'hybrid'] - if self.actor_head_type == 'regression': # DDPG, TD3 + self.action_space = action_space + assert self.action_space in ['regression', 'reparameterization', 'hybrid'] + if self.action_space == 'regression': # DDPG, TD3 self.actor = nn.Sequential( nn.Linear(obs_shape, actor_head_hidden_size), activation, RegressionHead( @@ -70,7 +70,7 @@ class QAC(nn.Module): norm_type=norm_type ) ) - elif self.actor_head_type == 'reparameterization': # SAC + elif self.action_space == 'reparameterization': # SAC self.actor = nn.Sequential( nn.Linear(obs_shape, actor_head_hidden_size), activation, ReparameterizationHead( @@ -82,7 +82,7 @@ class QAC(nn.Module): norm_type=norm_type ) ) - elif self.actor_head_type == 'hybrid': # PADDPG + elif self.action_space == 'hybrid': # PADDPG # hybrid action space: action_type(discrete) + action_args(continuous), # such as {'action_type_shape': torch.LongTensor([0]), 'action_args_shape': torch.FloatTensor([0.1, -0.27])} action_shape.action_args_shape = squeeze(action_shape.action_args_shape) @@ -110,7 +110,7 @@ class QAC(nn.Module): ) self.actor = nn.ModuleList([actor_action_type, actor_action_args]) self.twin_critic = twin_critic - if self.actor_head_type == 'hybrid': + if self.action_space == 'hybrid': critic_input_size = obs_shape + action_shape.action_type_shape + action_shape.action_args_shape else: critic_input_size = obs_shape + action_shape @@ -194,7 +194,7 @@ class QAC(nn.Module): Critic Examples: >>> inputs = {'obs': torch.randn(4,N), 'action': torch.randn(4,1)} - >>> model = QAC(obs_shape=(N, ),action_shape=1,actor_head_type='regression') + >>> model = QAC(obs_shape=(N, ),action_shape=1,action_space='regression') >>> model(inputs, mode='compute_critic')['q_value'] # q value tensor([0.0773, 0.1639, 0.0917, 0.0370], grad_fn=) @@ -245,13 +245,13 @@ class QAC(nn.Module): >>> actor_outputs['logit'][1].shape # sigma >>> torch.Size([4, 64]) """ - if self.actor_head_type == 'regression': + if self.action_space == 'regression': x = self.actor(inputs) return {'action': x['pred']} - elif self.actor_head_type == 'reparameterization': + elif self.action_space == 'reparameterization': x = self.actor(inputs) return {'logit': [x['mu'], x['sigma']]} - elif self.actor_head_type == 'hybrid': + elif self.action_space == 'hybrid': logit = self.actor[0](inputs) action_args = self.actor[1](inputs) return {'logit': logit['logit'], 'action_args': action_args['pred']} @@ -284,14 +284,14 @@ class QAC(nn.Module): Examples: >>> inputs = {'obs': torch.randn(4, N), 'action': torch.randn(4, 1)} - >>> model = QAC(obs_shape=(N, ),action_shape=1,actor_head_type='regression') + >>> model = QAC(obs_shape=(N, ),action_shape=1,action_space='regression') >>> model(inputs, mode='compute_critic')['q_value'] # q value >>> tensor([0.0773, 0.1639, 0.0917, 0.0370], grad_fn=) """ obs, action = inputs['obs'], inputs['action'] assert len(obs.shape) == 2 - if self.actor_head_type == 'hybrid': + if self.action_space == 'hybrid': action_type_logit = inputs['logit'] action_type_logit = torch.softmax(action_type_logit, dim=-1) action_args = action['action_args'] diff --git a/ding/model/template/qac_dist.py b/ding/model/template/qac_dist.py index 
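For reference, a minimal sketch of constructing QAC under the renamed argument; the observation and action sizes below are illustrative placeholders, not values from this patch.

import torch
from easydict import EasyDict
from ding.model.template.qac import QAC

obs_dim = 8  # placeholder size
# DDPG/TD3 style: deterministic regression actor
ddpg_model = QAC(obs_shape=(obs_dim, ), action_shape=3, action_space='regression')
# SAC style: gaussian reparameterization actor
sac_model = QAC(obs_shape=(obs_dim, ), action_shape=3, action_space='reparameterization')
# PADDPG style: hybrid action space with discrete action_type and continuous action_args
hybrid_model = QAC(
    obs_shape=(obs_dim, ),
    action_shape=EasyDict({'action_type_shape': 4, 'action_args_shape': 6}),
    action_space='hybrid',
)

obs = torch.randn(2, obs_dim)
print(ddpg_model(obs, mode='compute_actor')['action'].shape)   # torch.Size([2, 3])
mu, sigma = sac_model(obs, mode='compute_actor')['logit']      # each of shape (2, 3)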
a18c035342d38e3e735024b1e7300c202a00259e..e2ce65f34c76a11a156977e9ab731005d8c25650 100644 --- a/ding/model/template/qac_dist.py +++ b/ding/model/template/qac_dist.py @@ -20,7 +20,7 @@ class QACDIST(nn.Module): self, obs_shape: Union[int, SequenceType], action_shape: Union[int, SequenceType], - actor_head_type: str = "regression", + action_space: str = "regression", critic_head_type: str = "categorical", actor_head_hidden_size: int = 64, actor_head_layer_num: int = 1, @@ -38,7 +38,7 @@ class QACDIST(nn.Module): Arguments: - obs_shape (:obj:`Union[int, SequenceType]`): Observation's space. - action_shape (:obj:`Union[int, SequenceType]`): Action's space. - - actor_head_type (:obj:`str`): Whether choose ``regression`` or ``reparameterization``. + - action_space (:obj:`str`): Whether choose ``regression`` or ``reparameterization``. - critic_head_type (:obj:`str`): Only ``categorical``. - actor_head_hidden_size (:obj:`Optional[int]`): The ``hidden_size`` to pass to actor-nn's ``Head``. - actor_head_layer_num (:obj:`int`): @@ -58,9 +58,9 @@ class QACDIST(nn.Module): super(QACDIST, self).__init__() obs_shape: int = squeeze(obs_shape) action_shape: int = squeeze(action_shape) - self.actor_head_type = actor_head_type - assert self.actor_head_type in ['regression', 'reparameterization'] - if self.actor_head_type == 'regression': + self.action_space = action_space + assert self.action_space in ['regression', 'reparameterization'] + if self.action_space == 'regression': self.actor = nn.Sequential( nn.Linear(obs_shape, actor_head_hidden_size), activation, RegressionHead( @@ -72,7 +72,7 @@ class QACDIST(nn.Module): norm_type=norm_type ) ) - elif self.actor_head_type == 'reparameterization': + elif self.action_space == 'reparameterization': self.actor = nn.Sequential( nn.Linear(obs_shape, actor_head_hidden_size), activation, ReparameterizationHead( @@ -156,7 +156,7 @@ class QACDIST(nn.Module): Critic Examples: >>> # Categorical mode >>> inputs = {'obs': torch.randn(4,N), 'action': torch.randn(4,1)} - >>> model = QACDIST(obs_shape=(N, ),action_shape=1,actor_head_type='regression', \ + >>> model = QACDIST(obs_shape=(N, ),action_shape=1,action_space='regression', \ ... critic_head_type='categorical', n_atoms=51) >>> q_value = model(inputs, mode='compute_critic') # q value >>> assert q_value['q_value'].shape == torch.Size([4, 1]) @@ -204,9 +204,9 @@ class QACDIST(nn.Module): >>> torch.Size([4, 64]) """ x = self.actor(inputs) - if self.actor_head_type == 'regression': + if self.action_space == 'regression': return {'action': x['pred']} - elif self.actor_head_type == 'reparameterization': + elif self.action_space == 'reparameterization': return {'logit': [x['mu'], x['sigma']]} def compute_critic(self, inputs: Dict) -> Dict: @@ -232,7 +232,7 @@ class QACDIST(nn.Module): Examples: >>> # Categorical mode >>> inputs = {'obs': torch.randn(4,N), 'action': torch.randn(4,1)} - >>> model = QACDIST(obs_shape=(N, ),action_shape=1,actor_head_type='regression', \ + >>> model = QACDIST(obs_shape=(N, ),action_shape=1,action_space='regression', \ ... 
critic_head_type='categorical', n_atoms=51) >>> q_value = model(inputs, mode='compute_critic') # q value >>> assert q_value['q_value'].shape == torch.Size([4, 1]) diff --git a/ding/model/template/tests/test_hybrid_qac.py b/ding/model/template/tests/test_hybrid_qac.py index d4f9f279a1bb16ceb8fe5bfec3c7c92ee97c0d0e..018c3f2d36054577ea1fbec3a00b288c12d12404 100644 --- a/ding/model/template/tests/test_hybrid_qac.py +++ b/ding/model/template/tests/test_hybrid_qac.py @@ -16,7 +16,7 @@ hybrid_args = { 'action_args_shape': (6, ) }), 'twin': True, - 'actor_head_type': 'hybrid' + 'action_space': 'hybrid' } @@ -27,10 +27,10 @@ class TestHybridQAC: self, action_shape=hybrid_args['action_shape'], twin=hybrid_args['twin'], - actor_head_type=hybrid_args['actor_head_type'] + action_space=hybrid_args['action_space'] ): N = 32 - assert actor_head_type == 'hybrid' + assert action_space == 'hybrid' inputs = { 'obs': torch.randn(B, N), 'action': { @@ -42,7 +42,7 @@ class TestHybridQAC: model = QAC( obs_shape=(N, ), action_shape=action_shape, - actor_head_type=actor_head_type, + action_space=action_space, critic_head_hidden_size=embedding_size, actor_head_hidden_size=embedding_size, twin_critic=twin, diff --git a/ding/model/template/tests/test_qac.py b/ding/model/template/tests/test_qac.py index d3fca991cf356d552d45c9eb943d1cf66dc0e131..ea0e9348d9b3c6c49a7470cd4c3472b26314b83a 100644 --- a/ding/model/template/tests/test_qac.py +++ b/ding/model/template/tests/test_qac.py @@ -17,16 +17,16 @@ args = list(product(*[action_shape_args, [True, False], ['regression', 'reparame @pytest.mark.unittest -@pytest.mark.parametrize('action_shape, twin, actor_head_type', args) +@pytest.mark.parametrize('action_shape, twin, action_space', args) class TestQAC: - def test_fcqac(self, action_shape, twin, actor_head_type): + def test_fcqac(self, action_shape, twin, action_space): N = 32 inputs = {'obs': torch.randn(B, N), 'action': torch.randn(B, squeeze(action_shape))} model = QAC( obs_shape=(N, ), action_shape=action_shape, - actor_head_type=actor_head_type, + action_space=action_space, critic_head_hidden_size=embedding_size, actor_head_hidden_size=embedding_size, twin_critic=twin, @@ -41,7 +41,7 @@ class TestQAC: # compute_action print(model) - if actor_head_type == 'regression': + if action_space == 'regression': action = model(inputs['obs'], mode='compute_actor')['action'] if squeeze(action_shape) == 1: assert action.shape == (B, ) @@ -49,7 +49,7 @@ class TestQAC: assert action.shape == (B, squeeze(action_shape)) assert action.eq(action.clamp(-1, 1)).all() is_differentiable(action.sum(), model.actor) - elif actor_head_type == 'reparameterization': + elif action_space == 'reparameterization': (mu, sigma) = model(inputs['obs'], mode='compute_actor')['logit'] assert mu.shape == (B, *action_shape) assert sigma.shape == (B, *action_shape) diff --git a/ding/model/template/tests/test_qac_dist.py b/ding/model/template/tests/test_qac_dist.py index 21e8871153c24a7e9616aa0ba7b8fec9684255f1..2e6f8548092e32b21171fc31f7dc31b24e4865d6 100644 --- a/ding/model/template/tests/test_qac_dist.py +++ b/ding/model/template/tests/test_qac_dist.py @@ -17,16 +17,16 @@ args = list(product(*[action_shape_args, ['regression', 'reparameterization']])) @pytest.mark.unittest -@pytest.mark.parametrize('action_shape, actor_head_type', args) +@pytest.mark.parametrize('action_shape, action_space', args) class TestQACDIST: - def test_fcqac_dist(self, action_shape, actor_head_type): + def test_fcqac_dist(self, action_shape, action_space): N = 32 inputs = {'obs': 
torch.randn(B, N), 'action': torch.randn(B, squeeze(action_shape))} model = QACDIST( obs_shape=(N, ), action_shape=action_shape, - actor_head_type=actor_head_type, + action_space=action_space, critic_head_hidden_size=embedding_size, actor_head_hidden_size=embedding_size, ) @@ -43,7 +43,7 @@ class TestQACDIST: # compute_action print(model) - if actor_head_type == 'regression': + if action_space == 'regression': action = model(inputs['obs'], mode='compute_actor')['action'] if squeeze(action_shape) == 1: assert action.shape == (B, ) @@ -51,7 +51,7 @@ class TestQACDIST: assert action.shape == (B, squeeze(action_shape)) assert action.eq(action.clamp(-1, 1)).all() is_differentiable(action.sum(), model.actor) - elif actor_head_type == 'reparameterization': + elif action_space == 'reparameterization': (mu, sigma) = model(inputs['obs'], mode='compute_actor')['logit'] assert mu.shape == (B, *action_shape) assert sigma.shape == (B, *action_shape) diff --git a/ding/model/template/tests/test_vac.py b/ding/model/template/tests/test_vac.py index 698d1a8e83a33f37674a23d2330b256f26877e9f..48eb64e16aa798ca15a87d6adbd9bf1fc4f0ac95 100644 --- a/ding/model/template/tests/test_vac.py +++ b/ding/model/template/tests/test_vac.py @@ -8,8 +8,8 @@ from ding.torch_utils import is_differentiable B, C, H, W = 4, 3, 128, 128 obs_shape = [4, (8, ), (4, 64, 64)] -act_args = [[6, False], [(3, ), True], [[2, 3, 6], False]] -#act_args = [[(3, ), True]] +act_args = [[6, 'discrete'], [(3, ), 'continuous'], [[2, 3, 6], 'discrete']] +# act_args = [[(3, ), True]] args = list(product(*[obs_shape, act_args, [False, True]])) @@ -29,12 +29,12 @@ class TestVAC: inputs = torch.randn(B, obs_shape) else: inputs = torch.randn(B, *obs_shape) - model = VAC(obs_shape, action_shape=act_args[0], continuous=act_args[1], share_encoder=share_encoder) + model = VAC(obs_shape, action_shape=act_args[0], action_space=act_args[1], share_encoder=share_encoder) outputs = model(inputs, mode='compute_actor_critic') value, logit = outputs['value'], outputs['logit'] - if model.continuous: - outputs = value.sum() + logit[0].sum() + logit[1].sum() + if model.action_space == 'continuous': + outputs = value.sum() + logit['mu'].sum() + logit['sigma'].sum() else: if model.multi_head: outputs = value.sum() + sum([t.sum() for t in logit]) @@ -45,8 +45,8 @@ class TestVAC: for p in model.parameters(): p.grad = None logit = model(inputs, mode='compute_actor')['logit'] - if model.continuous: - logit = logit[0].sum() + logit[1].sum() + if model.action_space == 'continuous': + logit = logit['mu'].sum() + logit['sigma'].sum() self.output_check(model.actor, logit, model.action_shape) for p in model.parameters(): diff --git a/ding/model/template/vac.py b/ding/model/template/vac.py index de142a2f439a109fc5f38be2c845941fb99c5c8a..aa970fb4a260b92be218288132f9267aae1344e3 100644 --- a/ding/model/template/vac.py +++ b/ding/model/template/vac.py @@ -1,4 +1,5 @@ from typing import Union, Dict, Optional +from easydict import EasyDict import torch import torch.nn as nn @@ -20,9 +21,9 @@ class VAC(nn.Module): def __init__( self, obs_shape: Union[int, SequenceType], - action_shape: Union[int, SequenceType], + action_shape: Union[int, SequenceType, EasyDict], + action_space: str = 'discrete', share_encoder: bool = True, - continuous: bool = False, encoder_hidden_size_list: SequenceType = [128, 128, 64], actor_head_hidden_size: int = 64, actor_head_layer_num: int = 1, @@ -31,6 +32,7 @@ class VAC(nn.Module): activation: Optional[nn.Module] = nn.ReLU(), norm_type: Optional[str] = None, 
sigma_type: Optional[str] = 'independent', + fixed_sigma_value: Optional[int] = 0.3, bound_type: Optional[str] = None, ) -> None: r""" @@ -39,8 +41,8 @@ class VAC(nn.Module): Arguments: - obs_shape (:obj:`Union[int, SequenceType]`): Observation's space. - action_shape (:obj:`Union[int, SequenceType]`): Action's space. + - action_space (:obj:`str`): Choose action head in ['discrete', 'continuous', 'hybrid'] - share_encoder (:obj:`bool`): Whether share encoder. - - continuous (:obj:`bool`): Whether collect continuously. - encoder_hidden_size_list (:obj:`SequenceType`): Collection of ``hidden_size`` to pass to ``Encoder`` - actor_head_hidden_size (:obj:`Optional[int]`): The ``hidden_size`` to pass to actor-nn's ``Head``. - actor_head_layer_num (:obj:`int`): @@ -56,7 +58,7 @@ class VAC(nn.Module): """ super(VAC, self).__init__() obs_shape: int = squeeze(obs_shape) - action_shape: int = squeeze(action_shape) + action_shape = squeeze(action_shape) self.obs_shape, self.action_shape = obs_shape, action_shape # Encoder Type if isinstance(obs_shape, int) or len(obs_shape) == 1: @@ -81,8 +83,9 @@ class VAC(nn.Module): self.critic_head = RegressionHead( critic_head_hidden_size, 1, critic_head_layer_num, activation=activation, norm_type=norm_type ) - self.continuous = continuous - if self.continuous: + self.action_space = action_space + assert self.action_space in ['discrete', 'continuous', 'hybrid'], self.action_space + if self.action_space == 'continuous': self.multi_head = False self.actor_head = ReparameterizationHead( actor_head_hidden_size, @@ -93,7 +96,7 @@ class VAC(nn.Module): norm_type=norm_type, bound_type=bound_type ) - else: + elif self.action_space == 'discrete': actor_head_cls = DiscreteHead multi_head = not isinstance(action_shape, int) self.multi_head = multi_head @@ -114,6 +117,30 @@ class VAC(nn.Module): activation=activation, norm_type=norm_type ) + elif self.action_space == 'hybrid': # HPPO + # hybrid action space: action_type(discrete) + action_args(continuous), + # such as {'action_type_shape': torch.LongTensor([0]), 'action_args_shape': torch.FloatTensor([0.1, -0.27])} + action_shape.action_args_shape = squeeze(action_shape.action_args_shape) + action_shape.action_type_shape = squeeze(action_shape.action_type_shape) + actor_action_args = ReparameterizationHead( + actor_head_hidden_size, + action_shape.action_args_shape, + actor_head_layer_num, + sigma_type=sigma_type, + fixed_sigma_value=fixed_sigma_value, + activation=activation, + norm_type=norm_type, + bound_type=bound_type, + ) + actor_action_type = DiscreteHead( + actor_head_hidden_size, + action_shape.action_type_shape, + actor_head_layer_num, + activation=activation, + norm_type=norm_type, + ) + self.actor_head = nn.ModuleList([actor_action_type, actor_action_args]) + # must use list, not nn.ModuleList if self.share_encoder: self.actor = [self.encoder, self.actor_head] @@ -203,10 +230,16 @@ class VAC(nn.Module): x = self.encoder(x) else: x = self.actor_encoder(x) - x = self.actor_head(x) - if self.continuous: - x = {'logit': [x['mu'], x['sigma']]} - return x + + if self.action_space == 'discrete': + return self.actor_head(x) + elif self.action_space == 'continuous': + x = self.actor_head(x) # mu, sigma + return {'logit': x} + elif self.action_space == 'hybrid': + action_type = self.actor_head[0](x) + action_args = self.actor_head[1](x) + return {'logit': {'action_type': action_type['logit'], 'action_args': action_args}} def compute_critic(self, x: torch.Tensor) -> Dict: r""" @@ -278,10 +311,16 @@ class VAC(nn.Module): 
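To make the new hybrid branch of VAC concrete, a minimal sketch of building a hybrid-action model and reading its actor output; the shapes are illustrative placeholders, not values from this patch.

import torch
from easydict import EasyDict
from ding.model.template.vac import VAC

batch, obs_dim = 4, 10  # placeholder sizes
model = VAC(
    obs_shape=obs_dim,
    action_shape=EasyDict({'action_type_shape': 3, 'action_args_shape': 2}),
    action_space='hybrid',
    sigma_type='fixed',
    fixed_sigma_value=0.3,
    bound_type='tanh',
)
out = model(torch.randn(batch, obs_dim), mode='compute_actor')
# out['logit']['action_type']: (batch, 3) discrete logits
# out['logit']['action_args']['mu'] and ['sigma']: (batch, 2) gaussian parameters
out = model(torch.randn(batch, obs_dim), mode='compute_actor_critic')
# additionally returns out['value'], the state value predicted by the critic head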
else: actor_embedding = self.actor_encoder(x) critic_embedding = self.critic_encoder(x) - value = self.critic_head(critic_embedding) - actor_output = self.actor_head(actor_embedding) - if self.continuous: - logit = [actor_output['mu'], actor_output['sigma']] - else: - logit = actor_output['logit'] - return {'logit': logit, 'value': value['pred']} + + value = self.critic_head(critic_embedding)['pred'] + + if self.action_space == 'discrete': + logit = self.actor_head(actor_embedding)['logit'] + return {'logit': logit, 'value': value} + elif self.action_space == 'continuous': + x = self.actor_head(actor_embedding) + return {'logit': x, 'value': value} + elif self.action_space == 'hybrid': + action_type = self.actor_head[0](actor_embedding) + action_args = self.actor_head[1](actor_embedding) + return {'logit': {'action_type': action_type['logit'], 'action_args': action_args}, 'value': value} diff --git a/ding/model/wrapper/model_wrappers.py b/ding/model/wrapper/model_wrappers.py index 8c9c04b792af12ee4c1415b61ff8dd02c001a9e0..374892932ff823ef5eb1f00cbd0ec1c8b8da5526 100644 --- a/ding/model/wrapper/model_wrappers.py +++ b/ding/model/wrapper/model_wrappers.py @@ -5,6 +5,7 @@ import numpy as np import torch from ding.torch_utils import get_tensor_data from ding.rl_utils import create_noise_generator +from torch.distributions import Categorical, Independent, Normal class IModelWrapper(ABC): @@ -408,6 +409,91 @@ class HybridEpsGreedyMultinomialSampleWrapper(IModelWrapper): return output +class HybridReparamMultinomialSampleWrapper(IModelWrapper): + """ + Overview: + Reparameterization sampler coupled with multinomial sample used in collector_model + to help balance exploration and exploitation. + In hybrid action space, i.e.{'action_type': discrete, 'action_args', continuous} + Interfaces: + forward + """ + + def forward(self, *args, **kwargs): + output = self._model.forward(*args, **kwargs) + assert isinstance(output, dict), "model output must be dict, but find {}".format(type(output)) + + logit = output['logit'] # logit: {'action_type': action_type_logit, 'action_args': action_args_logit} + # discrete part + action_type_logit = logit['action_type'] + prob = torch.softmax(action_type_logit, dim=-1) + pi_action = Categorical(prob) + action_type = pi_action.sample() + # continuous part + mu, sigma = logit['action_args']['mu'], logit['action_args']['sigma'] + dist = Independent(Normal(mu, sigma), 1) + action_args = dist.sample() + action = {'action_type': action_type, 'action_args': action_args} + output['action'] = action + return output + + +class HybridDeterministicArgmaxSampleWrapper(IModelWrapper): + """ + Overview: + Deterministic sampler coupled with argmax sample used in eval_model. 
+ In hybrid action space, i.e.{'action_type': discrete, 'action_args', continuous} + Interfaces: + forward + """ + + def forward(self, *args, **kwargs): + output = self._model.forward(*args, **kwargs) + assert isinstance(output, dict), "model output must be dict, but find {}".format(type(output)) + logit = output['logit'] # logit: {'action_type': action_type_logit, 'action_args': action_args_logit} + # discrete part + action_type_logit = logit['action_type'] + action_type = action_type_logit.argmax(dim=-1) + # continuous part + mu = logit['action_args']['mu'] + action_args = mu + action = {'action_type': action_type, 'action_args': action_args} + output['action'] = action + return output + + +class DeterministicSample(IModelWrapper): + """ + Overview: + Deterministic sampler (just use mu directly) used in eval_model. + Interfaces: + forward + """ + + def forward(self, *args, **kwargs): + output = self._model.forward(*args, **kwargs) + assert isinstance(output, dict), "model output must be dict, but find {}".format(type(output)) + output['action'] = output['logit']['mu'] + return output + + +class ReparamSample(IModelWrapper): + """ + Overview: + Reparameterization gaussian sampler used in collector_model. + Interfaces: + forward + """ + + def forward(self, *args, **kwargs): + output = self._model.forward(*args, **kwargs) + assert isinstance(output, dict), "model output must be dict, but find {}".format(type(output)) + mu, sigma = output['logit']['mu'], output['logit']['sigma'] + dist = Independent(Normal(mu, sigma), 1) + output['action'] = dist.sample() + return output + + class EpsGreedySampleNGUWrapper(IModelWrapper): r""" Overview: @@ -592,8 +678,12 @@ wrapper_name_map = { 'eps_greedy_sample': EpsGreedySampleWrapper, 'eps_greedy_sample_ngu': EpsGreedySampleNGUWrapper, 'eps_greedy_multinomial_sample': EpsGreedyMultinomialSampleWrapper, + 'deterministic_sample': DeterministicSample, + 'reparam_sample': ReparamSample, 'hybrid_eps_greedy_sample': HybridEpsGreedySampleWrapper, 'hybrid_eps_greedy_multinomial_sample': HybridEpsGreedyMultinomialSampleWrapper, + 'hybrid_reparam_multinomial_sample': HybridReparamMultinomialSampleWrapper, + 'hybrid_deterministic_argmax_sample': HybridDeterministicArgmaxSampleWrapper, 'multinomial_sample': MultinomialSampleWrapper, 'action_noise': ActionNoiseWrapper, # model wrapper @@ -607,6 +697,8 @@ def model_wrap(model, wrapper_name: str = None, **kwargs): if not isinstance(model, IModelWrapper): model = wrapper_name_map['base'](model) model = wrapper_name_map[wrapper_name](model, **kwargs) + else: + raise TypeError("not support model_wrapper type: {}".format(wrapper_name)) return model diff --git a/ding/policy/cql.py b/ding/policy/cql.py index a0a448b36473e3b59b2cf8fa39bbd06ce1f78351..ea1a3ef128cb69934c8631e347243ebe449a77eb 100644 --- a/ding/policy/cql.py +++ b/ding/policy/cql.py @@ -99,7 +99,9 @@ class CQLPolicy(SACPolicy): # and learning_rate_policy in `cfg.policy.learn`. # Default to False. 
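For context on how the new wrappers are wired up (mirroring the PPO policy changes further below), a rough usage sketch; the models here are placeholders built with the VAC template from this patch.

import torch
from easydict import EasyDict
from ding.model import model_wrap
from ding.model.template.vac import VAC

cont_model = VAC(obs_shape=10, action_shape=2, action_space='continuous')
hybrid_model = VAC(
    obs_shape=10,
    action_shape=EasyDict({'action_type_shape': 3, 'action_args_shape': 2}),
    action_space='hybrid',
    sigma_type='fixed',
    fixed_sigma_value=0.3,
)

# continuous: gaussian sample for collection, mu for evaluation
collect_model = model_wrap(cont_model, wrapper_name='reparam_sample')
eval_model = model_wrap(cont_model, wrapper_name='deterministic_sample')
# hybrid: multinomial sample over action_type plus gaussian sample for action_args
hybrid_collect = model_wrap(hybrid_model, wrapper_name='hybrid_reparam_multinomial_sample')
hybrid_eval = model_wrap(hybrid_model, wrapper_name='hybrid_deterministic_argmax_sample')

with torch.no_grad():
    out = hybrid_collect.forward(torch.randn(4, 10), mode='compute_actor')
# out['action'] is {'action_type': LongTensor of shape (4, ), 'action_args': FloatTensor of shape (4, 2)}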
# value_network=False, - actor_head_type='reparameterization', + + # (str type) action_space: Use reparameterization trick for continous action + action_space='reparameterization', ), learn=dict( # (bool) Whether to use multi gpu diff --git a/ding/policy/il.py b/ding/policy/il.py index 8169419bfc5d6ce315e4291c75702bf6063543c1..6ddb0eb17ad27a94d7667d6fb51e1013c8b9684f 100644 --- a/ding/policy/il.py +++ b/ding/policy/il.py @@ -8,10 +8,11 @@ from ding.model import model_wrap from ding.utils import POLICY_REGISTRY from ding.utils.data import default_collate, default_decollate from .base_policy import Policy -try: - from dizoo.gfootball.model.bots import FootballKaggle5thPlaceModel -except ImportError: - FootballKaggle5thPlaceModel = None +# try: +# from dizoo.gfootball.model.bots import FootballKaggle5thPlaceModel +# except ImportError: +# FootballKaggle5thPlaceModel = None +FootballKaggle5thPlaceModel = None @POLICY_REGISTRY.register('IL') diff --git a/ding/policy/ppg.py b/ding/policy/ppg.py index 59af611d7ea3ae930acf3a9ba4766784c538cc8d..00cd225e946e4d34ea43b75641c1358f4c6c20c1 100644 --- a/ding/policy/ppg.py +++ b/ding/policy/ppg.py @@ -86,7 +86,7 @@ class PPGPolicy(Policy): # (bool) Whether to use cuda for network. cuda=False, # (bool) Whether the RL algorithm is on-policy or off-policy. (Note: in practice PPO can be off-policy used) - on_policy=True, + on_policy=False, priority=False, # (bool) Whether use Importance Sampling Weight to correct biased update. If True, priority must be True. priority_IS_weight=False, diff --git a/ding/policy/ppo.py b/ding/policy/ppo.py index f0ae45f54b9af4168f7bc8d4f1f2cf087bac1500..fb961723240fc65e22a4602c09f3f187081ff966 100644 --- a/ding/policy/ppo.py +++ b/ding/policy/ppo.py @@ -32,11 +32,16 @@ class PPOPolicy(Policy): on_policy=True, # (bool) Whether to use priority(priority sample, IS weight, update priority) priority=False, - # (bool) Whether use Importance Sampling Weight to correct biased update. If True, priority must be True. + # (bool) Whether to use Importance Sampling Weight to correct biased update due to priority. + # If True, priority must be True. 
priority_IS_weight=False, + # (bool) Whether to recompurete advantages in each iteration of on-policy PPO recompute_adv=True, - continuous=True, + # (str) Which kind of action space used in PPOPolicy, ['discrete', 'continuous', 'hybrid'] + action_space='discrete', + # (bool) Whether to use nstep return to calculate value target, otherwise, use return = adv + value nstep_return=False, + # (bool) Whether to enable multi-agent training, i.e.: MAPPO multi_agent=False, # (bool) Whether to need policy data in process transition transition_with_policy_data=True, @@ -89,16 +94,22 @@ class PPOPolicy(Policy): self._priority_IS_weight = self._cfg.priority_IS_weight assert not self._priority and not self._priority_IS_weight, "Priority is not implemented in PPO" - self._continuous = self._cfg.continuous + self._action_space = self._cfg.action_space if self._cfg.learn.ppo_param_init: for n, m in self._model.named_modules(): if isinstance(m, torch.nn.Linear): torch.nn.init.orthogonal_(m.weight) torch.nn.init.zeros_(m.bias) - if self._continuous: + if self._action_space in ['continuous', 'hybrid']: # init log sigma - if hasattr(self._model.actor_head, 'log_sigma_param'): - torch.nn.init.constant_(self._model.actor_head.log_sigma_param, -0.5) + if self._action_space == 'continuous': + if hasattr(self._model.actor_head, 'log_sigma_param'): + torch.nn.init.constant_(self._model.actor_head.log_sigma_param, -0.5) + elif self._action_space == 'hybrid': # actor_head[1]: ReparameterizationHead, for action_args + if hasattr(self._model.actor_head[1], 'log_sigma_param'): + torch.nn.init.constant_(self._model.actor_head[1].log_sigma_param, -0.5) + print('init ok') + for m in list(self._model.critic.modules()) + list(self._model.actor.modules()): if isinstance(m, torch.nn.Linear): # orthogonal initialization @@ -194,18 +205,42 @@ class PPOPolicy(Policy): adv = (adv - adv.mean()) / (adv.std() + 1e-8) # Calculate ppo error - if self._continuous: + if self._action_space == 'continuous': ppo_batch = ppo_data( output['logit'], batch['logit'], batch['action'], output['value'], batch['value'], adv, batch['return'], batch['weight'] ) ppo_loss, ppo_info = ppo_error_continuous(ppo_batch, self._clip_ratio) - else: + elif self._action_space == 'discrete': ppo_batch = ppo_data( output['logit'], batch['logit'], batch['action'], output['value'], batch['value'], adv, batch['return'], batch['weight'] ) ppo_loss, ppo_info = ppo_error(ppo_batch, self._clip_ratio) + elif self._action_space == 'hybrid': + # discrete part (discrete policy loss and entropy loss) + ppo_discrete_batch = ppo_policy_data( + output['logit']['action_type'], batch['logit']['action_type'], batch['action']['action_type'], + adv, batch['weight'] + ) + ppo_discrete_loss, ppo_discrete_info = ppo_policy_error(ppo_discrete_batch, self._clip_ratio) + # continuous part (continuous policy loss and entropy loss, value loss) + ppo_continuous_batch = ppo_data( + output['logit']['action_args'], batch['logit']['action_args'], batch['action']['action_args'], + output['value'], batch['value'], adv, batch['return'], batch['weight'] + ) + ppo_continuous_loss, ppo_continuous_info = ppo_error_continuous( + ppo_continuous_batch, self._clip_ratio + ) + # sum discrete and continuous loss + ppo_loss = type(ppo_continuous_loss)( + ppo_continuous_loss.policy_loss + ppo_discrete_loss.policy_loss, ppo_continuous_loss.value_loss, + ppo_continuous_loss.entropy_loss + ppo_discrete_loss.entropy_loss + ) + ppo_info = type(ppo_continuous_info)( + max(ppo_continuous_info.approx_kl, 
ppo_discrete_info.approx_kl), + max(ppo_continuous_info.clipfrac, ppo_discrete_info.clipfrac) + ) wv, we = self._value_weight, self._entropy_weight total_loss = ppo_loss.policy_loss + wv * ppo_loss.value_loss - we * ppo_loss.entropy_loss @@ -225,13 +260,13 @@ class PPOPolicy(Policy): 'value_max': output['value'].max().item(), 'approx_kl': ppo_info.approx_kl, 'clipfrac': ppo_info.clipfrac, - 'act': batch['action'].float().mean().item(), } - if self._continuous: + if self._action_space == 'continuous': return_info.update( { - 'mu_mean': output['logit'][0].mean().item(), - 'sigma_mean': output['logit'][1].mean().item(), + 'act': batch['action'].float().mean().item(), + 'mu_mean': output['logit']['mu'].mean().item(), + 'sigma_mean': output['logit']['sigma'].mean().item(), } ) return_infos.append(return_info) @@ -254,11 +289,13 @@ class PPOPolicy(Policy): Init traj and unroll length, collect model. """ self._unroll_len = self._cfg.collect.unroll_len - self._continuous = self._cfg.continuous - if self._continuous: - self._collect_model = model_wrap(self._model, wrapper_name='base') - else: + self._action_space = self._cfg.action_space + if self._action_space == 'continuous': + self._collect_model = model_wrap(self._model, wrapper_name='reparam_sample') + elif self._action_space == 'discrete': self._collect_model = model_wrap(self._model, wrapper_name='multinomial_sample') + elif self._action_space == 'hybrid': + self._collect_model = model_wrap(self._model, wrapper_name='hybrid_reparam_multinomial_sample') self._collect_model.reset() self._gamma = self._cfg.collect.discount_factor self._gae_lambda = self._cfg.collect.gae_lambda @@ -283,10 +320,6 @@ class PPOPolicy(Policy): self._collect_model.eval() with torch.no_grad(): output = self._collect_model.forward(data, mode='compute_actor_critic') - if self._continuous: - (mu, sigma), value = output['logit'], output['value'] - dist = Independent(Normal(mu, sigma), 1) - output['action'] = dist.sample() if self._cuda: output = to_device(output, 'cpu') output = default_decollate(output) @@ -378,11 +411,13 @@ class PPOPolicy(Policy): Evaluate mode init method. Called by ``self.__init__``. Init eval model with argmax strategy. 
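A compact, self-contained illustration of the hybrid loss composition above, using random tensors in place of real batch data; the namedtuple field order follows the calls in this patch, and the 0.5/0.01 weights are placeholder values.

import torch
from ding.rl_utils import ppo_data, ppo_policy_data, ppo_error_continuous, ppo_policy_error

B, n_type, n_args = 4, 3, 2  # placeholder sizes
adv, ret, weight = torch.randn(B), torch.randn(B), torch.ones(B)
value_new, value_old = torch.randn(B, requires_grad=True), torch.randn(B)

# discrete part: policy and entropy terms only
disc_batch = ppo_policy_data(
    torch.randn(B, n_type, requires_grad=True), torch.randn(B, n_type),
    torch.randint(n_type, (B, )), adv, weight
)
disc_loss, disc_info = ppo_policy_error(disc_batch, 0.2)

# continuous part: policy, entropy and the (shared) value term; mu/sigma are now passed as dicts
mu_sigma_new = {'mu': torch.randn(B, n_args, requires_grad=True), 'sigma': torch.rand(B, n_args) + 0.1}
mu_sigma_old = {'mu': torch.randn(B, n_args), 'sigma': torch.rand(B, n_args) + 0.1}
cont_batch = ppo_data(mu_sigma_new, mu_sigma_old, torch.randn(B, n_args), value_new, value_old, adv, ret, weight)
cont_loss, cont_info = ppo_error_continuous(cont_batch, 0.2)

# combine the two parts as the hybrid branch does
policy_loss = disc_loss.policy_loss + cont_loss.policy_loss
entropy_loss = disc_loss.entropy_loss + cont_loss.entropy_loss
value_loss = cont_loss.value_loss
total_loss = policy_loss + 0.5 * value_loss - 0.01 * entropy_loss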
""" - self._continuous = self._cfg.continuous - if self._continuous: - self._eval_model = model_wrap(self._model, wrapper_name='base') - else: + self._action_space = self._cfg.action_space + if self._action_space == 'continuous': + self._eval_model = model_wrap(self._model, wrapper_name='deterministic_sample') + elif self._action_space == 'discrete': self._eval_model = model_wrap(self._model, wrapper_name='argmax_sample') + elif self._action_space == 'hybrid': + self._eval_model = model_wrap(self._model, wrapper_name='hybrid_deterministic_argmax_sample') self._eval_model.reset() def _forward_eval(self, data: dict) -> dict: @@ -404,9 +439,6 @@ class PPOPolicy(Policy): self._eval_model.eval() with torch.no_grad(): output = self._eval_model.forward(data, mode='compute_actor') - if self._continuous: - (mu, sigma) = output['logit'] - output.update({'action': mu}) if self._cuda: output = to_device(output, 'cpu') output = default_decollate(output) @@ -430,7 +462,7 @@ class PPOPolicy(Policy): 'value_max', 'value_mean', ] - if self._continuous: + if self._action_space == 'continuous': variables += ['mu_mean', 'sigma_mean', 'sigma_grad', 'act'] return variables diff --git a/ding/policy/sac.py b/ding/policy/sac.py index 78f154d83141def97214df16f955b7776949702c..48d0d4d2bfd0c14cfd3cb926fd795f8492506ed9 100644 --- a/ding/policy/sac.py +++ b/ding/policy/sac.py @@ -599,7 +599,9 @@ class SACPolicy(Policy): # and learning_rate_policy in `cfg.policy.learn`. # Default to False. # value_network=False, - actor_head_type='reparameterization', + + # (str type) action_space: Use reparameterization trick for continous action + action_space='reparameterization', ), learn=dict( # (bool) Whether to use multi gpu diff --git a/ding/rl_utils/ppo.py b/ding/rl_utils/ppo.py index e51851ab85c468faff6b2fdf5d2f6d079a6ae1f6..256b297b68a50f3679ec9885d9bdb98da26a6e34 100644 --- a/ding/rl_utils/ppo.py +++ b/ding/rl_utils/ppo.py @@ -108,7 +108,6 @@ def ppo_policy_error(data: namedtuple, # only use dual_clip when adv < 0 policy_loss = -(torch.where(adv < 0, clip2, clip1) * weight).mean() else: - #policy_loss = (-torch.min(surr1, surr2) * weight).mean() policy_loss = (-torch.min(surr1, surr2) * weight).mean() with torch.no_grad(): approx_kl = (logp_old - logp_new).mean().item() @@ -179,11 +178,11 @@ def ppo_error_continuous( if weight is None: weight = torch.ones_like(adv) - dist_new = Independent(Normal(mu_sigma_new[0], mu_sigma_new[1]), 1) - if len(mu_sigma_old[0].shape) == 1: - dist_old = Independent(Normal(mu_sigma_old[0].unsqueeze(-1), mu_sigma_old[1].unsqueeze(-1)), 1) + dist_new = Independent(Normal(mu_sigma_new['mu'], mu_sigma_new['sigma']), 1) + if len(mu_sigma_old['mu'].shape) == 1: + dist_old = Independent(Normal(mu_sigma_old['mu'].unsqueeze(-1), mu_sigma_old['sigma'].unsqueeze(-1)), 1) else: - dist_old = Independent(Normal(mu_sigma_old[0], mu_sigma_old[1]), 1) + dist_old = Independent(Normal(mu_sigma_old['mu'], mu_sigma_old['sigma']), 1) logp_new = dist_new.log_prob(action) logp_old = dist_old.log_prob(action) entropy_loss = (dist_new.entropy() * weight).mean() diff --git a/ding/rl_utils/tests/test_ppo.py b/ding/rl_utils/tests/test_ppo.py index bab78c148548965dd5021cc4e19a1df8764b3751..a72d0e3b1674dc2df290ee4f502187acab9c4db3 100644 --- a/ding/rl_utils/tests/test_ppo.py +++ b/ding/rl_utils/tests/test_ppo.py @@ -70,11 +70,11 @@ def test_mappo(): @pytest.mark.parametrize('use_value_clip, dual_clip, weight', args) def test_ppo_error_continous(use_value_clip, dual_clip, weight): B, N = 4, 6 - mu_sigma_new = [torch.rand(B, 
N).requires_grad_(True), torch.rand(B, N).requires_grad_(True)] - mu_sigma_old = [ - mu_sigma_new[0] + torch.rand_like(mu_sigma_new[0]) * 0.1, - mu_sigma_new[1] + torch.rand_like(mu_sigma_new[1]) * 0.1 - ] + mu_sigma_new = {'mu': torch.rand(B, N).requires_grad_(True), 'sigma': torch.rand(B, N).requires_grad_(True)} + mu_sigma_old = { + 'mu': mu_sigma_new['mu'] + torch.rand_like(mu_sigma_new['mu']) * 0.1, + 'sigma': mu_sigma_new['sigma'] + torch.rand_like(mu_sigma_new['sigma']) * 0.1 + } action = torch.rand(B, N) value_new = torch.randn(B).requires_grad_(True) value_old = value_new + torch.rand_like(value_new) * 0.1 @@ -84,9 +84,9 @@ def test_ppo_error_continous(use_value_clip, dual_clip, weight): loss, info = ppo_error_continuous(data, use_value_clip=use_value_clip, dual_clip=dual_clip) assert all([l.shape == tuple() for l in loss]) assert all([np.isscalar(i) for i in info]) - assert mu_sigma_new[0].grad is None + assert mu_sigma_new['mu'].grad is None assert value_new.grad is None total_loss = sum(loss) total_loss.backward() - assert isinstance(mu_sigma_new[0].grad, torch.Tensor) + assert isinstance(mu_sigma_new['mu'].grad, torch.Tensor) assert isinstance(value_new.grad, torch.Tensor) diff --git a/ding/utils/default_helper.py b/ding/utils/default_helper.py index 6f0ebb144e309a1211d8563efe1edef4b0476b20..ed6180d96034f60556f0ad7c8fac5c0679303d0c 100644 --- a/ding/utils/default_helper.py +++ b/ding/utils/default_helper.py @@ -410,6 +410,17 @@ def one_time_warning(warning_msg: str) -> None: logging.warning(warning_msg) +def split_fn(data, indices, start, end): + if data is None: + return None + elif isinstance(data, list): + return [split_fn(d, indices, start, end) for d in data] + elif isinstance(data, dict): + return {k1: split_fn(v1, indices, start, end) for k1, v1 in data.items()} + else: + return data[indices[start:end]] + + def split_data_generator(data: dict, split_size: int, shuffle: bool = True) -> dict: assert isinstance(data, dict), type(data) length = [] @@ -436,31 +447,7 @@ def split_data_generator(data: dict, split_size: int, shuffle: bool = True) -> d for i in range(0, length, split_size): if i + split_size > length: i = length - split_size - batch = {} - for k in data.keys(): - if data[k] is None: - batch[k] = None - elif k.startswith('prev_state'): - batch[k] = [data[k][t] for t in indices[i:i + split_size]] - elif isinstance(data[k], list) or isinstance(data[k], tuple): - if isinstance(data[k][0], list) and k == 'logit': - # for continuous action - # transform to mu_sigma (:obj:`list`): :math:`[(B, N), (B, N)]`, - # where B is batch size and N is action dim - batch[k] = [ - torch.stack( - [ - data[k][transition_index][mu_sigma_index] - for transition_index in indices[i:i + split_size] - ] - ) for mu_sigma_index in range(2) - ] - else: # for discrete action - batch[k] = [t[indices[i:i + split_size]] for t in data[k]] - elif isinstance(data[k], dict): - batch[k] = {k1: v1[indices[i:i + split_size]] for k1, v1 in data[k].items()} - else: - batch[k] = data[k][indices[i:i + split_size]] + batch = split_fn(data, indices, i, i + split_size) yield batch diff --git a/dizoo/box2d/bipedalwalker/config/bipedalwalker_ppo_config.py b/dizoo/box2d/bipedalwalker/config/bipedalwalker_ppo_config.py index e58586722c70d32719ba98d418a1726e8c8bb099..91c377f7a8cd585007bf64612676d6efe357767f 100644 --- a/dizoo/box2d/bipedalwalker/config/bipedalwalker_ppo_config.py +++ b/dizoo/box2d/bipedalwalker/config/bipedalwalker_ppo_config.py @@ -15,22 +15,19 @@ bipedalwalker_ppo_config = dict( ), policy=dict( 
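The recursive split_fn added above replaces the per-key special casing in split_data_generator; a small illustration of how it slices nested batch structures, with made-up values.

import torch
from ding.utils.default_helper import split_fn

indices = torch.randperm(8)
batch = {
    'obs': torch.randn(8, 10),
    'logit': {'mu': torch.randn(8, 2), 'sigma': torch.rand(8, 2)},  # continuous-PPO logit is now a dict
    'weight': None,
}
mini_batch = split_fn(batch, indices, 0, 4)
# mini_batch['obs']: (4, 10); mini_batch['logit']['mu']: (4, 2); mini_batch['weight'] stays None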
cuda=False, - continuous=True, + action_space='continuous', model=dict( - continuous=True, + action_space='continuous', obs_shape=24, action_shape=4, ), learn=dict( epoch_per_collect=10, - # update_per_collect=4, # offpolicy batch_size=64, learning_rate=0.001, value_weight=0.5, entropy_weight=0.01, clip_ratio=0.2, - nstep=1, - nstep_return=False, adv_norm=True, ), collect=dict( @@ -50,7 +47,6 @@ bipedalwalker_ppo_create_config = dict( ), env_manager=dict(type='base'), - # policy=dict(type='ppo_offpolicy'), # TODO policy=dict(type='ppo'), ) bipedalwalker_ppo_create_config = EasyDict(bipedalwalker_ppo_create_config) diff --git a/dizoo/box2d/lunarlander/config/lunarlander_gcl_config.py b/dizoo/box2d/lunarlander/config/lunarlander_gcl_config.py index 5f7394786c9e9bc9b2263a4b528cf2c29fc79b07..cc79da66362441ddc776f1a6e74a628f7b0adf93 100644 --- a/dizoo/box2d/lunarlander/config/lunarlander_gcl_config.py +++ b/dizoo/box2d/lunarlander/config/lunarlander_gcl_config.py @@ -18,11 +18,12 @@ lunarlander_ppo_config = dict( ), policy=dict( cuda=False, - continuous=False, + action_space='discrete', recompute_adv=True, model=dict( obs_shape=8, action_shape=4, + action_space='discrete', ), learn=dict( update_per_collect=8, diff --git a/dizoo/box2d/lunarlander/config/lunarlander_ngu_config.py b/dizoo/box2d/lunarlander/config/lunarlander_ngu_config.py index 57c9fe6ec9ad455689f2f77d717e5d7c41161bd0..7a8ed191c4ee77e86d759f6d899ec4bd2d04042c 100644 --- a/dizoo/box2d/lunarlander/config/lunarlander_ngu_config.py +++ b/dizoo/box2d/lunarlander/config/lunarlander_ngu_config.py @@ -44,8 +44,6 @@ lunarlander_ngu_config = dict( type='episodic', ), policy=dict( - continuous=False, - on_policy=False, cuda=True, priority=True, priority_IS_weight=True, diff --git a/dizoo/box2d/lunarlander/config/lunarlander_rnd_onppo_config.py b/dizoo/box2d/lunarlander/config/lunarlander_rnd_onppo_config.py index 839bec95bafa04381961e1d114d71506fe425817..825c097a23f00e843bb1154e9d85b4b8a2b8ae28 100644 --- a/dizoo/box2d/lunarlander/config/lunarlander_rnd_onppo_config.py +++ b/dizoo/box2d/lunarlander/config/lunarlander_rnd_onppo_config.py @@ -1,6 +1,6 @@ from easydict import EasyDict from ding.entry import serial_pipeline_reward_model -collector_env_num=8 +collector_env_num = 8 lunarlander_ppo_rnd_config = dict( env=dict( collector_env_num=collector_env_num, @@ -15,20 +15,19 @@ lunarlander_ppo_rnd_config = dict( # batch_size=32, # update_per_collect=10, batch_size=320, - update_per_collect=4, # TODO(pu):2 + update_per_collect=4, ), policy=dict( recompute_adv=True, cuda=True, - continuous=False, - on_policy=True, + action_space='discrete', model=dict( obs_shape=8, action_shape=4, + action_space='discrete', ), learn=dict( - # update_per_collect=4, - epoch_per_collect=10, # TODO(pu) + epoch_per_collect=10, update_per_collect=1, # 4 batch_size=64, learning_rate=3e-4, diff --git a/dizoo/classic_control/cartpole/config/cartpole_gcl_config.py b/dizoo/classic_control/cartpole/config/cartpole_gcl_config.py index 71972d94a577a668476fbe2e4d246cb1fc00b4cf..fa4ea6935db4600c6da905c7a9afaa2c09a1efa9 100644 --- a/dizoo/classic_control/cartpole/config/cartpole_gcl_config.py +++ b/dizoo/classic_control/cartpole/config/cartpole_gcl_config.py @@ -17,11 +17,12 @@ cartpole_gcl_ppo_onpolicy_config = dict( ), policy=dict( cuda=False, - continuous=False, recompute_adv=True, + action_space='discrete', model=dict( obs_shape=4, action_shape=2, + action_space='discrete', encoder_hidden_size_list=[64, 64, 128], critic_head_hidden_size=128, actor_head_hidden_size=128, diff 
--git a/dizoo/classic_control/cartpole/config/cartpole_ppo_config.py b/dizoo/classic_control/cartpole/config/cartpole_ppo_config.py index f63ba82b011feea4ac5e04c04c8caf0c747cc718..ffda4cf2367968f0dc4b49040f82ecbab6a89e8e 100644 --- a/dizoo/classic_control/cartpole/config/cartpole_ppo_config.py +++ b/dizoo/classic_control/cartpole/config/cartpole_ppo_config.py @@ -10,10 +10,11 @@ cartpole_ppo_config = dict( ), policy=dict( cuda=False, - continuous=False, + action_space='discrete', model=dict( obs_shape=4, action_shape=2, + action_space='discrete', encoder_hidden_size_list=[64, 64, 128], critic_head_hidden_size=128, actor_head_hidden_size=128, diff --git a/dizoo/classic_control/cartpole/config/cartpole_ppo_icm_config.py b/dizoo/classic_control/cartpole/config/cartpole_ppo_icm_config.py index 94818ebc1c70217ff262ec4b925d99db54f57542..e2ca8b126ab9fc1995954c4714ca70cbaa42197d 100644 --- a/dizoo/classic_control/cartpole/config/cartpole_ppo_icm_config.py +++ b/dizoo/classic_control/cartpole/config/cartpole_ppo_icm_config.py @@ -20,10 +20,11 @@ cartpole_ppo_icm_config = dict( ), policy=dict( cuda=False, - continuous=False, + action_space='discrete', model=dict( obs_shape=4, action_shape=2, + action_space='discrete', encoder_hidden_size_list=[64, 64, 128], critic_head_hidden_size=128, actor_head_hidden_size=128, @@ -60,4 +61,4 @@ cartpole_ppo_icm_create_config = EasyDict(cartpole_ppo_icm_create_config) create_config = cartpole_ppo_icm_create_config if __name__ == '__main__': - serial_pipeline_reward_model([main_config, create_config], seed=0) \ No newline at end of file + serial_pipeline_reward_model([main_config, create_config], seed=0) diff --git a/dizoo/classic_control/cartpole/config/cartpole_ppo_offpolicy_config.py b/dizoo/classic_control/cartpole/config/cartpole_ppo_offpolicy_config.py index 59ba6824af72c81450ef123f1d59cbf7c0c84c0e..2c7499fc5fe2001223e4973473cc6c267d319a05 100644 --- a/dizoo/classic_control/cartpole/config/cartpole_ppo_offpolicy_config.py +++ b/dizoo/classic_control/cartpole/config/cartpole_ppo_offpolicy_config.py @@ -16,6 +16,7 @@ cartpole_ppo_offpolicy_config = dict( encoder_hidden_size_list=[64, 64, 128], critic_head_hidden_size=128, actor_head_hidden_size=128, + action_space='discrete', ), learn=dict( update_per_collect=6, diff --git a/dizoo/classic_control/pendulum/config/pendulum_cql_config.py b/dizoo/classic_control/pendulum/config/pendulum_cql_config.py index 8d7e677a318a75f64050e30e7cfdad03e8f7e2ac..7e91497b52afba1184b3ba4bcc43771ff2b3f05c 100644 --- a/dizoo/classic_control/pendulum/config/pendulum_cql_config.py +++ b/dizoo/classic_control/pendulum/config/pendulum_cql_config.py @@ -15,7 +15,7 @@ pendulum_cql_default_config = dict( obs_shape=3, action_shape=1, twin_critic=True, - actor_head_type='reparameterization', + action_space='reparameterization', actor_head_hidden_size=128, critic_head_hidden_size=128, ), diff --git a/dizoo/classic_control/pendulum/config/pendulum_d4pg_config.py b/dizoo/classic_control/pendulum/config/pendulum_d4pg_config.py index c8d9fde2498697070f3be7ff21449bd3078fb1b4..27c02fc2c444bc305068b94cd18191ba08ce6f7e 100644 --- a/dizoo/classic_control/pendulum/config/pendulum_d4pg_config.py +++ b/dizoo/classic_control/pendulum/config/pendulum_d4pg_config.py @@ -17,7 +17,7 @@ pendulum_d4pg_config = dict( model=dict( obs_shape=3, action_shape=1, - actor_head_type='regression', + action_space='regression', v_min=-100, v_max=100, n_atom=51, diff --git a/dizoo/classic_control/pendulum/config/pendulum_ddpg_config.py 
b/dizoo/classic_control/pendulum/config/pendulum_ddpg_config.py index ce6da56e30fef846430eb3939ca846ba8cc9709a..dd41f26d47a0eb41e54296495568e450e21f807b 100644 --- a/dizoo/classic_control/pendulum/config/pendulum_ddpg_config.py +++ b/dizoo/classic_control/pendulum/config/pendulum_ddpg_config.py @@ -17,7 +17,7 @@ pendulum_ddpg_config = dict( obs_shape=3, action_shape=1, twin_critic=False, - actor_head_type='regression', + action_space='regression', ), learn=dict( update_per_collect=2, diff --git a/dizoo/classic_control/pendulum/config/pendulum_ppo_config.py b/dizoo/classic_control/pendulum/config/pendulum_ppo_config.py index aeff665523a80fd922ed4e8275dd420c31f205fe..545a9e25f42a45ffb93a5886660931256ef46682 100644 --- a/dizoo/classic_control/pendulum/config/pendulum_ppo_config.py +++ b/dizoo/classic_control/pendulum/config/pendulum_ppo_config.py @@ -10,13 +10,13 @@ pendulum_ppo_config = dict( ), policy=dict( cuda=False, - continuous=True, + action_space='continuous', recompute_adv=True, model=dict( obs_shape=3, action_shape=1, encoder_hidden_size_list=[64, 64], - continuous=True, + action_space='continuous', actor_head_layer_num=0, critic_head_layer_num=0, sigma_type='conditioned', diff --git a/dizoo/classic_control/pendulum/config/pendulum_sac_config.py b/dizoo/classic_control/pendulum/config/pendulum_sac_config.py index f3370c0646290919ad457d2cec2f2656bbdae93e..bac739e8d547ea07d71501ed9d74e2f9fe9bedc2 100644 --- a/dizoo/classic_control/pendulum/config/pendulum_sac_config.py +++ b/dizoo/classic_control/pendulum/config/pendulum_sac_config.py @@ -18,7 +18,7 @@ pendulum_sac_config = dict( obs_shape=3, action_shape=1, twin_critic=True, - actor_head_type='reparameterization', + action_space='reparameterization', actor_head_hidden_size=128, critic_head_hidden_size=128, ), diff --git a/dizoo/classic_control/pendulum/config/pendulum_sac_data_generation_default_config.py b/dizoo/classic_control/pendulum/config/pendulum_sac_data_generation_default_config.py index ecba243d68e8a1ea706de3afe9ee556a9b2d1618..a673e8bde36e335a7a76b845790fc26d77204a0e 100644 --- a/dizoo/classic_control/pendulum/config/pendulum_sac_data_generation_default_config.py +++ b/dizoo/classic_control/pendulum/config/pendulum_sac_data_generation_default_config.py @@ -17,7 +17,7 @@ pendulum_sac_data_genearation_default_config = dict( obs_shape=3, action_shape=1, twin_critic=True, - actor_head_type='reparameterization', + action_space='reparameterization', actor_head_hidden_size=128, critic_head_hidden_size=128, ), diff --git a/dizoo/classic_control/pendulum/config/pendulum_td3_bc_config.py b/dizoo/classic_control/pendulum/config/pendulum_td3_bc_config.py index d531358b469ee77a89265736b98b1f707060fc3f..200aa632ab994570262b8102e81a77d8b70a9158 100644 --- a/dizoo/classic_control/pendulum/config/pendulum_td3_bc_config.py +++ b/dizoo/classic_control/pendulum/config/pendulum_td3_bc_config.py @@ -18,7 +18,7 @@ pendulum_td3_bc_config = dict( obs_shape=3, action_shape=1, twin_critic=True, - actor_head_type='regression', + action_space='regression', actor_head_hidden_size=128, critic_head_hidden_size=128, ), @@ -44,7 +44,7 @@ pendulum_td3_bc_config = dict( noise_sigma=0.1, collector=dict(collect_print_freq=1000, ), data_type='hdf5', - data_path = './td3/expert_demos.hdf5', + data_path='./td3/expert_demos.hdf5', normalize_states=True, ), eval=dict(evaluator=dict(eval_freq=100, ), ), diff --git a/dizoo/classic_control/pendulum/config/pendulum_td3_config.py b/dizoo/classic_control/pendulum/config/pendulum_td3_config.py index 
a23944e6dc0aeb6bd0a468b15056cdea81eeaa72..810f0bd34d2f669fe28ecfdf3b017d9c244e6e92 100644 --- a/dizoo/classic_control/pendulum/config/pendulum_td3_config.py +++ b/dizoo/classic_control/pendulum/config/pendulum_td3_config.py @@ -18,7 +18,7 @@ pendulum_td3_config = dict( obs_shape=3, action_shape=1, twin_critic=True, - actor_head_type='regression', + action_space='regression', ), learn=dict( update_per_collect=2, diff --git a/dizoo/classic_control/pendulum/config/pendulum_td3_data_generation_config.py b/dizoo/classic_control/pendulum/config/pendulum_td3_data_generation_config.py index c1f59e8b8b854c62bd7ae7569b2ba8469370b3b2..357443486de70dd63b4e79a95e5e49395bf5bc57 100644 --- a/dizoo/classic_control/pendulum/config/pendulum_td3_data_generation_config.py +++ b/dizoo/classic_control/pendulum/config/pendulum_td3_data_generation_config.py @@ -18,7 +18,7 @@ pendulum_td3_generation_config = dict( obs_shape=3, action_shape=1, twin_critic=True, - actor_head_type='regression', + action_space='regression', ), learn=dict( update_per_collect=2, @@ -33,7 +33,7 @@ pendulum_td3_generation_config = dict( min=-0.5, max=0.5, ), - learner = dict( + learner=dict( load_path='./td3/ckpt/ckpt_best.pth.tar', hook=dict( load_ckpt_before_run='./td3/ckpt/ckpt_best.pth.tar', @@ -46,7 +46,7 @@ pendulum_td3_generation_config = dict( noise_sigma=0.1, collector=dict(collect_print_freq=1000, ), save_path='./td3/expert.pkl', - data_type = 'hdf5', + data_type='hdf5', ), eval=dict(evaluator=dict(eval_freq=100, ), ), other=dict(replay_buffer=dict( diff --git a/dizoo/gym_hybrid/config/gym_hybrid_ddpg_config.py b/dizoo/gym_hybrid/config/gym_hybrid_ddpg_config.py index a3c1e7d2821428004e526bda53749657ec4510aa..9dda3bd90f7f3189dcfff806c54b0729ebfea4d2 100644 --- a/dizoo/gym_hybrid/config/gym_hybrid_ddpg_config.py +++ b/dizoo/gym_hybrid/config/gym_hybrid_ddpg_config.py @@ -24,10 +24,9 @@ gym_hybrid_ddpg_config = dict( action_args_shape=2, ), twin_critic=False, - actor_head_type='hybrid', + action_space='hybrid', ), learn=dict( - action_space='hybrid', update_per_collect=10, # [5, 10] batch_size=32, discount_factor=0.99, diff --git a/dizoo/gym_hybrid/config/gym_hybrid_hppo_config.py b/dizoo/gym_hybrid/config/gym_hybrid_hppo_config.py new file mode 100644 index 0000000000000000000000000000000000000000..f6822b45eaefbb92bf420a588eb9327be92bb205 --- /dev/null +++ b/dizoo/gym_hybrid/config/gym_hybrid_hppo_config.py @@ -0,0 +1,66 @@ +from easydict import EasyDict +from ding.entry import serial_pipeline_onpolicy + +gym_hybrid_hppo_config = dict( + exp_name='gym_hybrid_hppo_actsacle_fsv0.3_ew0.03_seed0', + env=dict( + collector_env_num=8, + evaluator_env_num=5, + # (bool) Scale output action into legal range, usually [-1, 1]. 
+ act_scale=True, + env_id='Moving-v0', # ['Sliding-v0', 'Moving-v0'] + n_evaluator_episode=5, + stop_value=1.8, + ), + policy=dict( + cuda=True, + priority=False, + action_space='hybrid', + recompute_adv=True, + model=dict( + obs_shape=10, + action_shape=dict( + action_type_shape=3, + action_args_shape=2, + ), + action_space='hybrid', + encoder_hidden_size_list=[256, 128, 64, 64], + sigma_type='fixed', + fixed_sigma_value=0.3, # TODO(pu) + bound_type='tanh', + ), + learn=dict( + epoch_per_collect=10, + batch_size=320, + learning_rate=3e-4, + value_weight=0.5, + entropy_weight=0.03, # TODO(pu) + clip_ratio=0.2, + adv_norm=True, + value_norm=True, + ), + collect=dict( + n_sample=int(3200), + discount_factor=0.99, + gae_lambda=0.95, + collector=dict(collect_print_freq=1000, ), + ), + eval=dict(evaluator=dict(eval_freq=200, ), ), + ), +) +gym_hybrid_hppo_config = EasyDict(gym_hybrid_hppo_config) +main_config = gym_hybrid_hppo_config + +gym_hybrid_hppo_create_config = dict( + env=dict( + type='gym_hybrid', + import_names=['dizoo.gym_hybrid.envs.gym_hybrid_env'], + ), + env_manager=dict(type='base'), + policy=dict(type='ppo'), +) +gym_hybrid_hppo_create_config = EasyDict(gym_hybrid_hppo_create_config) +create_config = gym_hybrid_hppo_create_config + +if __name__ == "__main__": + serial_pipeline_onpolicy([main_config, create_config], seed=0) diff --git a/dizoo/gym_hybrid/config/gym_hybrid_pdqn_config.py b/dizoo/gym_hybrid/config/gym_hybrid_pdqn_config.py index 138980eac411fc7936c4a17cf7a1b8dbb845de0c..2067a9cdbaf14540278f9ea7fcaf83cfd4ecc90e 100644 --- a/dizoo/gym_hybrid/config/gym_hybrid_pdqn_config.py +++ b/dizoo/gym_hybrid/config/gym_hybrid_pdqn_config.py @@ -3,7 +3,6 @@ from ding.entry import serial_pipeline gym_hybrid_pdqn_config = dict( exp_name='gym_hybrid_pdqn_seed1', - # exp_name='gym_hybrid_pdqn_dataaction_1encoder_lrd1e-5_lrc1e-3_upc100_seed0', env=dict( collector_env_num=8, diff --git a/dizoo/gym_hybrid/envs/gym_hybrid_env.py b/dizoo/gym_hybrid/envs/gym_hybrid_env.py index e48680b0770973eb011211d845367513789effc3..86ec5e84b972be20186a29b5f37b700982dd3b07 100644 --- a/dizoo/gym_hybrid/envs/gym_hybrid_env.py +++ b/dizoo/gym_hybrid/envs/gym_hybrid_env.py @@ -54,8 +54,11 @@ class GymHybridEnv(BaseEnv): def step(self, action: Dict) -> BaseEnvTimestep: if self._act_scale: - # acceleration_value + # acceleration_value. action['action_args'][0] = affine_transform(action['action_args'][0], min_val=0, max_val=1) + # rotation_value. 
Following line can be omitted, because in the affine_transform function, + # we have already done the clip(-1,1) operation + action['action_args'][1] = affine_transform(action['action_args'][1], min_val=-1, max_val=1) action = [action['action_type'], action['action_args']] obs, rew, done, info = self._env.step(action) self._final_eval_reward += rew diff --git a/dizoo/league_demo/league_demo_ppo_config.py b/dizoo/league_demo/league_demo_ppo_config.py index dcb95f3b5611d7d55013b3e325d5d22db452b504..7ded7cca10818998a5d3e9c72d304aeeb0e5f7de 100644 --- a/dizoo/league_demo/league_demo_ppo_config.py +++ b/dizoo/league_demo/league_demo_ppo_config.py @@ -13,10 +13,11 @@ league_demo_ppo_config = dict( ), policy=dict( cuda=False, - continuous=False, + action_space='discrete', model=dict( obs_shape=2, action_shape=2, + action_space='discrete', encoder_hidden_size_list=[32, 32], critic_head_hidden_size=32, actor_head_hidden_size=32, diff --git a/dizoo/minigrid/config/minigrid_ngu_config.py b/dizoo/minigrid/config/minigrid_ngu_config.py index 5f1ccf4d2ce5b88efc01c4119a5e5ed3b1f79e7e..a1601a7d82c9310edc22b509fd7641433833d08e 100644 --- a/dizoo/minigrid/config/minigrid_ngu_config.py +++ b/dizoo/minigrid/config/minigrid_ngu_config.py @@ -4,12 +4,10 @@ from easydict import EasyDict from ding.entry import serial_pipeline_reward_model_ngu print(torch.cuda.is_available(), torch.__version__) -collector_env_num = 32 #TODO +collector_env_num = 32 evaluator_env_num = 5 nstep = 5 -minigrid_ppo_rnd_config = dict( - # exp_name='debug_minigrid_empty8_ngu_n5_bs2_ul98_erbm1', - # exp_name='debug_minigrid_fourrooms_ngu_er01_rbs5e4_n32', +minigrid_ppo_ngu_config = dict( exp_name='debug_minigrid_doorkey_ngu_ul298_er01_rbs3e4_n32', env=dict( collector_env_num=collector_env_num, @@ -47,8 +45,6 @@ minigrid_ppo_rnd_config = dict( type='episodic', ), policy=dict( - continuous=False, - on_policy=False, cuda=True, priority=True, priority_IS_weight=True, @@ -83,7 +79,7 @@ minigrid_ppo_rnd_config = dict( ), replay_buffer=dict( replay_buffer_size=30000, - # (Float type) How much prioritization is used: 0 means no prioritization while 1 means full prioritization + # (Float type) How much prioritization is used: 0 means no prioritization while 1 means full alpha=0.6, # (Float type) How much correction is used: 0 means no correction while 1 means full correction beta=0.4, @@ -91,9 +87,9 @@ minigrid_ppo_rnd_config = dict( ), ), ) -minigrid_ppo_rnd_config = EasyDict(minigrid_ppo_rnd_config) -main_config = minigrid_ppo_rnd_config -minigrid_ppo_rnd_create_config = dict( +minigrid_ppo_ngu_config = EasyDict(minigrid_ppo_ngu_config) +main_config = minigrid_ppo_ngu_config +minigrid_ppo_ngu_create_config = dict( env=dict( type='minigrid', import_names=['dizoo.minigrid.envs.minigrid_env'], @@ -105,8 +101,8 @@ minigrid_ppo_rnd_create_config = dict( episodic_reward_model=dict(type='episodic'), collector=dict(type='sample_ngu', ) ) -minigrid_ppo_rnd_create_config = EasyDict(minigrid_ppo_rnd_create_config) -create_config = minigrid_ppo_rnd_create_config +minigrid_ppo_ngu_create_config = EasyDict(minigrid_ppo_ngu_create_config) +create_config = minigrid_ppo_ngu_create_config if __name__ == "__main__": serial_pipeline_reward_model_ngu([main_config, create_config], seed=0) diff --git a/dizoo/minigrid/config/minigrid_onppo_config.py b/dizoo/minigrid/config/minigrid_onppo_config.py index 19e0073bf2ce9edcfe963e4e1264b0b3a3b5a3e7..73f270e0767429cd54957e2c83c821ce1b0c9ec1 100644 --- a/dizoo/minigrid/config/minigrid_onppo_config.py +++ 
b/dizoo/minigrid/config/minigrid_onppo_config.py @@ -2,10 +2,7 @@ from easydict import EasyDict from ding.entry import serial_pipeline_onpolicy collector_env_num = 8 minigrid_ppo_config = dict( - # exp_name="minigrid_empty8_onppo", exp_name="minigrid_fourrooms_onppo", - # exp_name="minigrid_doorkey88_onppo", - # exp_name="minigrid_doorkey_onppo", env=dict( collector_env_num=8, evaluator_env_num=5, @@ -19,11 +16,11 @@ minigrid_ppo_config = dict( policy=dict( cuda=True, recompute_adv=True, - continuous=False, - on_policy=True, + action_space='discrete', model=dict( obs_shape=2739, action_shape=7, + action_space='discrete', encoder_hidden_size_list=[256, 128, 64, 64], ), learn=dict( diff --git a/dizoo/minigrid/config/minigrid_rnd_onppo_config.py b/dizoo/minigrid/config/minigrid_rnd_onppo_config.py index d96b8f51004d7527e333ad3b636f62c89a5db4a2..8b24f3484e815b0f79e7fdf5bec56b7e2bc07359 100644 --- a/dizoo/minigrid/config/minigrid_rnd_onppo_config.py +++ b/dizoo/minigrid/config/minigrid_rnd_onppo_config.py @@ -4,12 +4,7 @@ import torch print(torch.__version__, torch.cuda.is_available()) collector_env_num = 8 minigrid_ppo_rnd_config = dict( - # exp_name='minigrid_empty8_rnd_onppo_b01_weight1000_maxlen100', - # exp_name='minigrid_fourrooms_rnd_onppo_b01_weight1000_maxlen100', exp_name='minigrid_doorkey88_rnd_onppo_b01_weight1000_maxlen300', - # exp_name='minigrid_doorkey_rnd_onppo_b01_weight1000_maxlen300', - # exp_name='minigrid_kcs3r3_rnd_onppo_b01', - # exp_name='minigrid_om2dlh_rnd_onppo_b01', env=dict( collector_env_num=collector_env_num, evaluator_env_num=5, @@ -38,17 +33,17 @@ minigrid_ppo_rnd_config = dict( policy=dict( recompute_adv=True, cuda=True, - continuous=False, - on_policy=True, + action_space='discrete', model=dict( obs_shape=2739, action_shape=7, + action_space='discrete', encoder_hidden_size_list=[256, 128, 64, 64], critic_head_hidden_size=64, # default=64 actor_head_hidden_size=64, ), learn=dict( - epoch_per_collect=10, # TODO(pu) + epoch_per_collect=10, update_per_collect=1, # 4 batch_size=320, # 64, learning_rate=3e-4, @@ -60,7 +55,7 @@ minigrid_ppo_rnd_config = dict( ), collect=dict( collector_env_num=collector_env_num, - n_sample=int(3200), + n_sample=3200, # here self.traj_length = 3200//8 = 400, because in minigrid env the max_length is 300. 
# in ding/worker/collector/sample_serial_collector.py # self._traj_len = max( diff --git a/dizoo/mujoco/config/ant_onppo_default_config.py b/dizoo/mujoco/config/ant_onppo_default_config.py index a1d9e4cbbd1f03af34ec612cc2d9a33dc0894d01..2cc72b6f07fecf1fa982dedf0be40c2281534db7 100644 --- a/dizoo/mujoco/config/ant_onppo_default_config.py +++ b/dizoo/mujoco/config/ant_onppo_default_config.py @@ -4,7 +4,8 @@ from ding.entry import serial_pipeline_onpolicy collector_env_num = 1 evaluator_env_num = 1 ant_ppo_default_config = dict( - exp_name="result_mujoco/ant_onppo_noig", + exp_name="result_mujoco_para2/ant_onppo_noig_para2_seed0", + # exp_name="result_mujoco_para2/ant_onppo_ig_para2", env=dict( env_id='Ant-v3', norm_obs=dict(use_norm=False, ), @@ -18,33 +19,37 @@ ant_ppo_default_config = dict( policy=dict( cuda=True, recompute_adv=True, - continuous=True, - on_policy=True, + action_space='continuous', model=dict( - continuous=True, + action_space='continuous', obs_shape=111, action_shape=8, ), learn=dict( epoch_per_collect=10, update_per_collect=1, - batch_size=64, + batch_size=320, learning_rate=3e-4, - value_weight=0.25, - entropy_weight=0, + value_weight=0.5, + entropy_weight=0.001, clip_ratio=0.2, adv_norm=True, value_norm=True, + # for onppo, when we recompute adv, we need the key done in data to split traj, so we must + # use ignore_done=False here, + # but when we add key traj_flag in data as the backup for key done, we could choose to use ignore_done=True + # for halfcheetah, the length=1000 + # ignore_done=True, ignore_done=False, grad_clip_type='clip_norm', grad_clip_value=0.5, ), collect=dict( collector_env_num=collector_env_num, - n_sample=2048, + n_sample=3200, unroll_len=1, discount_factor=0.99, - gae_lambda=0.97, + gae_lambda=0.95, ), eval=dict(evaluator=dict(eval_freq=5000, )), ), diff --git a/dizoo/mujoco/config/ant_onppo_default_config_para2.py b/dizoo/mujoco/config/ant_onppo_default_config_para2.py deleted file mode 100644 index 2798c12ac9ef09ebc1552eb32a3cbbda7bd03e43..0000000000000000000000000000000000000000 --- a/dizoo/mujoco/config/ant_onppo_default_config_para2.py +++ /dev/null @@ -1,74 +0,0 @@ -from easydict import EasyDict -from ding.entry import serial_pipeline_onpolicy - -collector_env_num = 1 -evaluator_env_num = 1 -ant_ppo_default_config = dict( - exp_name="result_mujoco_para2/ant_onppo_noig_para2_seed0", - # exp_name="result_mujoco_para2/ant_onppo_ig_para2", - env=dict( - env_id='Ant-v3', - norm_obs=dict(use_norm=False, ), - norm_reward=dict(use_norm=False, ), - collector_env_num=collector_env_num, - evaluator_env_num=evaluator_env_num, - use_act_scale=True, - n_evaluator_episode=10, - stop_value=6000, - ), - policy=dict( - cuda=True, - recompute_adv=True, - continuous=True, - on_policy=True, - model=dict( - continuous=True, - obs_shape=111, - action_shape=8, - ), - learn=dict( - epoch_per_collect=10, - update_per_collect=1, - batch_size=320, - learning_rate=3e-4, - value_weight=0.5, - entropy_weight=0.001, - clip_ratio=0.2, - adv_norm=True, - value_norm=True, - # for onppo, when we recompute adv, we need the key done in data to split traj, so we must use ignore_done=False here, - # but when we add key traj_flag in data as the backup for key done, we could choose to use ignore_done=True - # for halfcheetah, the length=1000 - # ignore_done=True, - ignore_done=False, - grad_clip_type='clip_norm', - grad_clip_value=0.5, - - ), - collect=dict( - collector_env_num=collector_env_num, - n_sample=3200, - unroll_len=1, - discount_factor=0.99, - gae_lambda=0.95, - ), - 
eval=dict(evaluator=dict(eval_freq=5000, )), - ), -) -ant_ppo_default_config = EasyDict(ant_ppo_default_config) -main_config = ant_ppo_default_config - -ant_ppo_create_default_config = dict( - env=dict( - type='mujoco', - import_names=['dizoo.mujoco.envs.mujoco_env'], - ), - # env_manager=dict(type='subprocess'), - env_manager=dict(type='base'), - policy=dict(type='ppo', ), -) -ant_ppo_create_default_config = EasyDict(ant_ppo_create_default_config) -create_config = ant_ppo_create_default_config - -if __name__ == "__main__": - serial_pipeline_onpolicy([main_config, create_config], seed=0) diff --git a/dizoo/mujoco/config/halfcheetah_onppo_default_config.py b/dizoo/mujoco/config/halfcheetah_onppo_default_config.py index e95d955143441cd16f5af09c6326b56a72be47e2..43a9a84fe8e9312da2871296532d36db3bdf9ed6 100644 --- a/dizoo/mujoco/config/halfcheetah_onppo_default_config.py +++ b/dizoo/mujoco/config/halfcheetah_onppo_default_config.py @@ -4,9 +4,7 @@ from ding.entry import serial_pipeline_onpolicy collector_env_num = 1 evaluator_env_num = 1 halfcheetah_ppo_default_config = dict( - exp_name="Halfcheetah_onppo", - # exp_name="debug/debug_halfcheetah_onppo_ig", - + exp_name="halfcheetah_onppo", env=dict( env_id='HalfCheetah-v3', norm_obs=dict(use_norm=False, ), @@ -15,44 +13,41 @@ halfcheetah_ppo_default_config = dict( evaluator_env_num=evaluator_env_num, use_act_scale=True, n_evaluator_episode=10, - # n_evaluator_episode=1, stop_value=12000, ), policy=dict( cuda=True, recompute_adv=True, - continuous=True, - on_policy=True, + action_space='continuous', model=dict( - continuous=True, + action_space='continuous', obs_shape=17, action_shape=6, ), learn=dict( - epoch_per_collect=10,#10, + epoch_per_collect=10, update_per_collect=1, - batch_size=64,#320, + batch_size=320, learning_rate=3e-4, - value_weight=0.25,#0.5, - entropy_weight=0,#0.001, + value_weight=0.5, + entropy_weight=0.001, clip_ratio=0.2, adv_norm=True, value_norm=True, - # for onppo, when we recompute adv, we need the key done in data to split traj, so we must use ignore_done=False here, + # for onppo, when we recompute adv, we need the key done in data to split traj, so we must + # use ignore_done=False here, # but when we add key traj_flag in data as the backup for key done, we could choose to use ignore_done=True # for halfcheetah, the length=1000 - # ignore_done=True, - ignore_done=False, + ignore_done=True, grad_clip_type='clip_norm', grad_clip_value=0.5, - ), collect=dict( collector_env_num=collector_env_num, - n_sample=2048,#3200, + n_sample=3200, unroll_len=1, discount_factor=0.99, - gae_lambda=0.97,#0.95, + gae_lambda=0.95, ), eval=dict(evaluator=dict(eval_freq=5000, )), ), @@ -65,12 +60,11 @@ halfcheetah_ppo_create_default_config = dict( type='mujoco', import_names=['dizoo.mujoco.envs.mujoco_env'], ), - # env_manager=dict(type='subprocess'), - env_manager=dict(type='base'), + env_manager=dict(type='subprocess'), policy=dict(type='ppo', ), ) halfcheetah_ppo_create_default_config = EasyDict(halfcheetah_ppo_create_default_config) create_config = halfcheetah_ppo_create_default_config if __name__ == "__main__": - serial_pipeline_onpolicy([main_config, create_config], seed=0) + serial_pipeline_onpolicy([main_config, create_config], seed=1) diff --git a/dizoo/mujoco/config/halfcheetah_onppo_default_config_para2.py b/dizoo/mujoco/config/halfcheetah_onppo_default_config_para2.py deleted file mode 100644 index 735dc5a0d1761006deb1b0d43d0af57218e249e4..0000000000000000000000000000000000000000 --- 
a/dizoo/mujoco/config/halfcheetah_onppo_default_config_para2.py +++ /dev/null @@ -1,75 +0,0 @@ -from easydict import EasyDict -from ding.entry import serial_pipeline_onpolicy - -collector_env_num = 1 -evaluator_env_num = 1 -halfcheetah_ppo_default_config = dict( - # exp_name="result_mujoco_para2/halfcheetah_onppo_noig_para2", - exp_name="result_mujoco_para2/halfcheetah_onppo_ig_para2_seed1", - env=dict( - env_id='HalfCheetah-v3', - norm_obs=dict(use_norm=False, ), - norm_reward=dict(use_norm=False, ), - collector_env_num=collector_env_num, - evaluator_env_num=evaluator_env_num, - use_act_scale=True, - n_evaluator_episode=10, - # n_evaluator_episode=1, - stop_value=12000, - ), - policy=dict( - cuda=True, - recompute_adv=True, - continuous=True, - on_policy=True, - model=dict( - continuous=True, - obs_shape=17, - action_shape=6, - ), - learn=dict( - epoch_per_collect=10, - update_per_collect=1, - batch_size=320, - learning_rate=3e-4, - value_weight=0.5, - entropy_weight=0.001, - clip_ratio=0.2, - adv_norm=True, - value_norm=True, - # for onppo, when we recompute adv, we need the key done in data to split traj, so we must use ignore_done=False here, - # but when we add key traj_flag in data as the backup for key done, we could choose to use ignore_done=True - # for halfcheetah, the length=1000 - ignore_done=True, - # ignore_done=False, - grad_clip_type='clip_norm', - grad_clip_value=0.5, - - ), - collect=dict( - collector_env_num=collector_env_num, - n_sample=3200, - unroll_len=1, - discount_factor=0.99, - gae_lambda=0.95, - ), - eval=dict(evaluator=dict(eval_freq=5000, )), - ), -) -halfcheetah_ppo_default_config = EasyDict(halfcheetah_ppo_default_config) -main_config = halfcheetah_ppo_default_config - -halfcheetah_ppo_create_default_config = dict( - env=dict( - type='mujoco', - import_names=['dizoo.mujoco.envs.mujoco_env'], - ), - env_manager=dict(type='subprocess'), - # env_manager=dict(type='base'), - policy=dict(type='ppo', ), -) -halfcheetah_ppo_create_default_config = EasyDict(halfcheetah_ppo_create_default_config) -create_config = halfcheetah_ppo_create_default_config - -if __name__ == "__main__": - serial_pipeline_onpolicy([main_config, create_config], seed=1) diff --git a/dizoo/mujoco/config/hopper_gcl_config.py b/dizoo/mujoco/config/hopper_gcl_config.py index a0553731d544884619f58dbd1773a30e76298d94..555443e484712916cfa0097de92f921a1c48b6a8 100644 --- a/dizoo/mujoco/config/hopper_gcl_config.py +++ b/dizoo/mujoco/config/hopper_gcl_config.py @@ -25,12 +25,12 @@ hopper_gcl_default_config = dict( policy=dict( cuda=False, recompute_adv=True, + action_space='continuous', model=dict( obs_shape=11, action_shape=3, - continuous=True, + action_space='continuous', ), - continuous=True, learn=dict( update_per_collect=10, batch_size=64, @@ -59,10 +59,7 @@ hopper_gcl_create_default_config = dict( import_names=['dizoo.mujoco.envs.mujoco_env'], ), env_manager=dict(type='base'), - policy=dict( - type='ppo', - import_names=['ding.policy.ppo'], - ), + policy=dict(type='ppo', ), reward_model=dict(type='guided_cost'), ) hopper_gcl_create_default_config = EasyDict(hopper_gcl_create_default_config) diff --git a/dizoo/mujoco/config/hopper_onppo_default_config.py b/dizoo/mujoco/config/hopper_onppo_default_config.py index 5dc646c52939517af95ee1071802c0b63b8e8ae8..828d4e7b0cacfad2d65e6a618c51751340d26a15 100644 --- a/dizoo/mujoco/config/hopper_onppo_default_config.py +++ b/dizoo/mujoco/config/hopper_onppo_default_config.py @@ -16,12 +16,12 @@ hopper_ppo_default_config = dict( policy=dict( cuda=True, 
recompute_adv=True, + action_space='continuous', model=dict( obs_shape=11, action_shape=3, - continuous=True, + action_space='continuous', ), - continuous=True, learn=dict( epoch_per_collect=10, update_per_collect=1, @@ -57,4 +57,4 @@ hopper_ppo_create_default_config = EasyDict(hopper_ppo_create_default_config) create_config = hopper_ppo_create_default_config if __name__ == "__main__": - serial_pipeline_onpolicy([main_config, create_config], seed=0) \ No newline at end of file + serial_pipeline_onpolicy([main_config, create_config], seed=0) diff --git a/dizoo/mujoco/config/walker2d_ddpg_gail_config.py b/dizoo/mujoco/config/walker2d_ddpg_gail_config.py index 2e70219aea6776466f4230b0b98db1a5d4c2e4f6..2c08820b7d3c45372c9a86dba2e6cce7bd220f40 100644 --- a/dizoo/mujoco/config/walker2d_ddpg_gail_config.py +++ b/dizoo/mujoco/config/walker2d_ddpg_gail_config.py @@ -21,7 +21,6 @@ walker2d_ddpg_gail_default_config = dict( update_per_collect=100, expert_data_path='walker2d_ddpg/expert_data_train.pkl', load_path='walker2d_ddpg_gail/reward_model/ckpt/ckpt_best.pth.tar', # state_dict of the reward model - collect_count=100000, ), policy=dict( diff --git a/dizoo/mujoco/config/walker2d_gcl_config.py b/dizoo/mujoco/config/walker2d_gcl_config.py index 537e98055d42d17ce228ac9a2321591739f71cb0..fb68b45dfe66a721bb284a2563334c5f4825a998 100644 --- a/dizoo/mujoco/config/walker2d_gcl_config.py +++ b/dizoo/mujoco/config/walker2d_gcl_config.py @@ -24,12 +24,12 @@ walker_gcl_default_config = dict( policy=dict( cuda=False, recompute_adv=True, + action_space='continuous', model=dict( obs_shape=17, action_shape=6, - continuous=True, + action_space='continuous', ), - continuous=True, learn=dict( update_per_collect=10, batch_size=64, @@ -58,10 +58,7 @@ walker_gcl_create_default_config = dict( import_names=['dizoo.mujoco.envs.mujoco_env'], ), env_manager=dict(type='base'), - policy=dict( - type='ppo', - import_names=['ding.policy.ppo'], - ), + policy=dict(type='ppo', ), replay_buffer=dict(type='naive', ), reward_model=dict(type='guided_cost'), ) diff --git a/dizoo/mujoco/config/walker2d_onppo_default_config.py b/dizoo/mujoco/config/walker2d_onppo_default_config.py index adca526178c77b7bed1001b34e2a0fcf90e19868..102ac1ea6e05e7e7f056d211943442ec80df27a0 100644 --- a/dizoo/mujoco/config/walker2d_onppo_default_config.py +++ b/dizoo/mujoco/config/walker2d_onppo_default_config.py @@ -4,7 +4,8 @@ from ding.entry import serial_pipeline_onpolicy collector_env_num = 1 evaluator_env_num = 1 walker2d_ppo_default_config = dict( - exp_name="result_mujoco/wlker2d_onppo_noig", + # exp_name="result_mujoco_para2/wlker2d_onppo_noig_para2_seed1", + # exp_name="result_mujoco_para2/wlker2d_onppo_ig_para2_seed1", env=dict( env_id='Walker2d-v3', norm_obs=dict(use_norm=False, ), @@ -18,24 +19,24 @@ walker2d_ppo_default_config = dict( policy=dict( cuda=True, recompute_adv=True, - continuous=True, - on_policy=True, + action_space='continuous', model=dict( - continuous=True, + action_space='continuous', obs_shape=17, action_shape=6, ), learn=dict( - epoch_per_collect=10, + epoch_per_collect=10, update_per_collect=1, - batch_size=64, + batch_size=320, learning_rate=3e-4, - value_weight=0.25, - entropy_weight=0, + value_weight=0.5, + entropy_weight=0.001, clip_ratio=0.2, adv_norm=True, value_norm=True, - # for onppo, when we recompute adv, we need the key done in data to split traj, so we must use ignore_done=False here, + # for onppo, when we recompute adv, we need the key done in data to split traj, so we must + # use ignore_done=False here, # but 
when we add key traj_flag in data as the backup for key done, we could choose to use ignore_done=True # for halfcheetah, the length=1000 # ignore_done=True, @@ -45,10 +46,10 @@ walker2d_ppo_default_config = dict( ), collect=dict( collector_env_num=collector_env_num, - n_sample=2048, + n_sample=3200, unroll_len=1, discount_factor=0.99, - gae_lambda=0.97, + gae_lambda=0.95, ), eval=dict(evaluator=dict(eval_freq=5000, )), ), @@ -69,4 +70,4 @@ walker2d_ppo_create_default_config = EasyDict(walker2d_ppo_create_default_config create_config = walker2d_ppo_create_default_config if __name__ == "__main__": - serial_pipeline_onpolicy([main_config, create_config], seed=0) + serial_pipeline_onpolicy([main_config, create_config], seed=1) diff --git a/dizoo/mujoco/config/walker2d_onppo_default_config_para2.py b/dizoo/mujoco/config/walker2d_onppo_default_config_para2.py deleted file mode 100644 index d5cf1ac311d603bf1ab9707f24ad892a11d513d8..0000000000000000000000000000000000000000 --- a/dizoo/mujoco/config/walker2d_onppo_default_config_para2.py +++ /dev/null @@ -1,73 +0,0 @@ -from easydict import EasyDict -from ding.entry import serial_pipeline_onpolicy - -collector_env_num = 1 -evaluator_env_num = 1 -walker2d_ppo_default_config = dict( - # exp_name="result_mujoco_para2/wlker2d_onppo_noig_para2_seed1", - # exp_name="result_mujoco_para2/wlker2d_onppo_ig_para2_seed1", - env=dict( - env_id='Walker2d-v3', - norm_obs=dict(use_norm=False, ), - norm_reward=dict(use_norm=False, ), - collector_env_num=collector_env_num, - evaluator_env_num=evaluator_env_num, - use_act_scale=True, - n_evaluator_episode=10, - stop_value=6000, - ), - policy=dict( - cuda=True, - recompute_adv=True, - continuous=True, - on_policy=True, - model=dict( - continuous=True, - obs_shape=17, - action_shape=6, - ), - learn=dict( - epoch_per_collect=10, - update_per_collect=1, - batch_size=320, - learning_rate=3e-4, - value_weight=0.5, - entropy_weight=0.001, - clip_ratio=0.2, - adv_norm=True, - value_norm=True, - # for onppo, when we recompute adv, we need the key done in data to split traj, so we must use ignore_done=False here, - # but when we add key traj_flag in data as the backup for key done, we could choose to use ignore_done=True - # for halfcheetah, the length=1000 - # ignore_done=True, - ignore_done=False, - grad_clip_type='clip_norm', - grad_clip_value=0.5, - ), - collect=dict( - collector_env_num=collector_env_num, - n_sample=3200, - unroll_len=1, - discount_factor=0.99, - gae_lambda=0.95, - ), - eval=dict(evaluator=dict(eval_freq=5000, )), - ), -) -walker2d_ppo_default_config = EasyDict(walker2d_ppo_default_config) -main_config = walker2d_ppo_default_config - -walker2d_ppo_create_default_config = dict( - env=dict( - type='mujoco', - import_names=['dizoo.mujoco.envs.mujoco_env'], - ), - # env_manager=dict(type='subprocess'), - env_manager=dict(type='base'), - policy=dict(type='ppo', ), -) -walker2d_ppo_create_default_config = EasyDict(walker2d_ppo_create_default_config) -create_config = walker2d_ppo_create_default_config - -if __name__ == "__main__": - serial_pipeline_onpolicy([main_config, create_config], seed=1) diff --git a/dizoo/multiagent_particle/config/cooperative_navigation_mappo_config.py b/dizoo/multiagent_particle/config/cooperative_navigation_mappo_config.py index 9a69fe8aa30619c60794832ced06ebe88ea01f16..81fc4aadf52e2a88764b81ab2d23b7231619bbca 100644 --- a/dizoo/multiagent_particle/config/cooperative_navigation_mappo_config.py +++ b/dizoo/multiagent_particle/config/cooperative_navigation_mappo_config.py @@ -20,8 +20,9 
@@ main_config = dict( policy=dict( cuda=False, multi_agent=True, - continuous=False, + action_space='discrete', model=dict( + action_space='discrete', agent_num=n_agent, agent_obs_shape=2 + 2 + (n_agent - 1) * 2 + num_landmarks * 2, global_obs_shape=n_agent * 2 + num_landmarks * 2 + n_agent * 2, diff --git a/dizoo/overcooked/config/overcooked_demo_ppo_config.py b/dizoo/overcooked/config/overcooked_demo_ppo_config.py index 9a175917cd2784b4be019758d9e74b2e5eef53d3..207e870ea66013b1da324a106139c5846ca0e607 100644 --- a/dizoo/overcooked/config/overcooked_demo_ppo_config.py +++ b/dizoo/overcooked/config/overcooked_demo_ppo_config.py @@ -11,22 +11,21 @@ overcooked_league_demo_ppo_config = dict( ), policy=dict( cuda=False, - continuous=False, recompute_adv=True, + action_space='discrete', model=dict( obs_shape=[5, 4, 26], action_shape=6, share_encoder=False, + action_space='discrete', ), learn=dict( - update_per_collect=4, + epoch_per_collect=4, batch_size=128, learning_rate=0.001, value_weight=0.5, entropy_weight=0.01, clip_ratio=0.2, - nstep=1, - nstep_return=False, adv_norm=True, value_norm=True, ), diff --git a/dizoo/procgen/coinrun/entry/coinrun_ppo_config.py b/dizoo/procgen/coinrun/entry/coinrun_ppo_config.py index b2f10c72595c9750205e3ff96bcbb27250edafdc..83052ec67c83a9f0a6ed095d30ad15d0213fe667 100644 --- a/dizoo/procgen/coinrun/entry/coinrun_ppo_config.py +++ b/dizoo/procgen/coinrun/entry/coinrun_ppo_config.py @@ -10,8 +10,10 @@ coinrun_ppo_default_config = dict( ), policy=dict( cuda=False, + action_space='discrete', model=dict( obs_shape=[3, 64, 64], + action_space='discrete', action_shape=15, encoder_hidden_size_list=[32, 32, 64], ), @@ -34,7 +36,6 @@ coinrun_ppo_default_config = dict( ), replay_buffer=dict(replay_buffer_size=100000, ), ), - cuda=True, ), ) coinrun_ppo_default_config = EasyDict(coinrun_ppo_default_config) diff --git a/dizoo/procgen/maze/entry/maze_dqn_config.py b/dizoo/procgen/maze/entry/maze_dqn_config.py index e677e591319b2c4e87c8cd8ba587a5956de1fb2e..432adcdce442c30d61620a0dd27a776f843c707f 100644 --- a/dizoo/procgen/maze/entry/maze_dqn_config.py +++ b/dizoo/procgen/maze/entry/maze_dqn_config.py @@ -34,7 +34,6 @@ maze_dqn_default_config = dict( ), replay_buffer=dict(replay_buffer_size=100000, ), ), - cuda=True, ), ) maze_dqn_default_config = EasyDict(maze_dqn_default_config) diff --git a/dizoo/procgen/maze/entry/maze_ppo_config.py b/dizoo/procgen/maze/entry/maze_ppo_config.py index 6e0ad658be46a1a52e7e1fa61a1500d4887490dc..c6b3cc1d7d580d2ebb2150171b6a2264714cc486 100644 --- a/dizoo/procgen/maze/entry/maze_ppo_config.py +++ b/dizoo/procgen/maze/entry/maze_ppo_config.py @@ -11,9 +11,11 @@ maze_ppo_default_config = dict( ), policy=dict( cuda=False, + action_space='discrete', model=dict( obs_shape=[3, 64, 64], action_shape=15, + action_space='discrete', encoder_hidden_size_list=[32, 32, 64], ), learn=dict( diff --git a/dizoo/pybullet/config/hopper_ppo_default_config.py b/dizoo/pybullet/config/hopper_ppo_default_config.py index e60416b9d3b3451ce5e8963e0e28a1a39a73422f..5d42840784acfa587cf8efc7ffae38520ac15e71 100644 --- a/dizoo/pybullet/config/hopper_ppo_default_config.py +++ b/dizoo/pybullet/config/hopper_ppo_default_config.py @@ -14,12 +14,12 @@ hopper_ppo_default_config = dict( policy=dict( cuda=True, recompute_adv=True, + action_space='continuous', model=dict( obs_shape=11, action_shape=3, - continuous=True, + action_space='continuous', ), - continuous=True, learn=dict( epoch_per_collect=10, batch_size=64, diff --git 
a/dizoo/slime_volley/config/slime_volley_league_ppo_config.py b/dizoo/slime_volley/config/slime_volley_league_ppo_config.py deleted file mode 100644 index d48ca085d04d870b2d529229f4f1f65aa854a509..0000000000000000000000000000000000000000 --- a/dizoo/slime_volley/config/slime_volley_league_ppo_config.py +++ /dev/null @@ -1,78 +0,0 @@ -from easydict import EasyDict - -slime_volley_league_ppo_config = dict( - exp_name="slime_volley_league_ppo", - env=dict( - collector_env_num=8, - evaluator_env_num=10, - n_evaluator_episode=100, - stop_value=0, - # Single-agent env for evaluator; Double-agent env for collector. - # Should be assigned True or False in code. - is_evaluator=None, - manager=dict(shared_memory=False, ), - env_id="SlimeVolley-v0", - ), - policy=dict( - cuda=False, - continuous=False, - model=dict( - obs_shape=12, - action_shape=6, - encoder_hidden_size_list=[32, 32], - critic_head_hidden_size=32, - actor_head_hidden_size=32, - share_encoder=False, - ), - learn=dict( - update_per_collect=3, - batch_size=32, - learning_rate=0.00001, - value_weight=0.5, - entropy_weight=0.0, - clip_ratio=0.2, - ), - collect=dict( - n_episode=128, unroll_len=1, discount_factor=1.0, gae_lambda=1.0, collector=dict(get_train_sample=True, ) - ), - other=dict( - league=dict( - player_category=['default'], - path_policy="slime_volley_league_ppo/policy", - active_players=dict( - main_player=1, - main_exploiter=1, - league_exploiter=1, - ), - main_player=dict( - one_phase_step=200, - branch_probs=dict( - pfsp=0.5, - sp=1.0, - ), - strong_win_rate=0.7, - ), - main_exploiter=dict( - one_phase_step=200, - branch_probs=dict(main_players=1.0, ), - strong_win_rate=0.7, - min_valid_win_rate=0.3, - ), - league_exploiter=dict( - one_phase_step=200, - branch_probs=dict(pfsp=1.0, ), - strong_win_rate=0.7, - mutate_prob=0.0, - ), - use_pretrain=False, - use_pretrain_init_historical=False, - payoff=dict( - type='battle', - decay=0.99, - min_win_rate_games=8, - ) - ), - ), - ), -) -slime_volley_league_ppo_config = EasyDict(slime_volley_league_ppo_config) diff --git a/dizoo/slime_volley/config/slime_volley_ppo_config.py b/dizoo/slime_volley/config/slime_volley_ppo_config.py index fbe854181f889ff3dfe24b48f6a120f1b15bbbca..38b90318c9fd9d24b0686f5a6b68d2cc75246281 100644 --- a/dizoo/slime_volley/config/slime_volley_ppo_config.py +++ b/dizoo/slime_volley/config/slime_volley_ppo_config.py @@ -13,10 +13,11 @@ slime_volley_ppo_config = dict( ), policy=dict( cuda=True, - continuous=False, + action_space='discrete', model=dict( obs_shape=12, action_shape=6, + action_space='discrete', encoder_hidden_size_list=[64, 64], critic_head_hidden_size=64, actor_head_hidden_size=64, diff --git a/dizoo/smac/config/smac_3s5z_mappo_config.py b/dizoo/smac/config/smac_3s5z_mappo_config.py index f5f57d940ee7f60020259e9097628171f6597da4..609a99223ac17045da6d6177fdea2c4497e65607 100644 --- a/dizoo/smac/config/smac_3s5z_mappo_config.py +++ b/dizoo/smac/config/smac_3s5z_mappo_config.py @@ -31,7 +31,7 @@ main_config = dict( policy=dict( cuda=True, multi_agent=True, - continuous=False, + action_space='discrete', model=dict( # (int) agent_num: The number of the agent. # For SMAC 3s5z, agent_num=8; for 2c_vs_64zg, agent_num=2. 
@@ -49,6 +49,7 @@ main_config = dict( action_shape=14, # (List[int]) The size of hidden layer # hidden_size_list=[64], + action_space='discrete' ), # used in state_num of hidden_state learn=dict( diff --git a/dizoo/smac/config/smac_5m6m_mappo_config.py b/dizoo/smac/config/smac_5m6m_mappo_config.py index 5d542a14a2a47a875a1fce39ec9d5013f03e276f..e7a3600087209e7bf3c51538ebfb021590ce3705 100644 --- a/dizoo/smac/config/smac_5m6m_mappo_config.py +++ b/dizoo/smac/config/smac_5m6m_mappo_config.py @@ -30,7 +30,7 @@ main_config = dict( policy=dict( cuda=True, multi_agent=True, - continuous=False, + action_space='discrete', model=dict( # (int) agent_num: The number of the agent. # For SMAC 3s5z, agent_num=8; for 2c_vs_64zg, agent_num=2. @@ -48,6 +48,7 @@ main_config = dict( action_shape=12, # (List[int]) The size of hidden layer # hidden_size_list=[64], + action_space='discrete', ), # used in state_num of hidden_state learn=dict( diff --git a/dizoo/smac/config/smac_MMM2_mappo_config.py b/dizoo/smac/config/smac_MMM2_mappo_config.py index b8d19aa1a4b1649df68d2291ee7c7f89f2d32e4e..e8c63fb3ff7a5c4b90002b2f18e9bf91fd76b934 100644 --- a/dizoo/smac/config/smac_MMM2_mappo_config.py +++ b/dizoo/smac/config/smac_MMM2_mappo_config.py @@ -30,7 +30,7 @@ main_config = dict( policy=dict( cuda=True, multi_agent=True, - continuous=False, + action_space='discrete', model=dict( # (int) agent_num: The number of the agent. # For SMAC 3s5z, agent_num=8; for 2c_vs_64zg, agent_num=2. @@ -47,6 +47,7 @@ main_config = dict( action_shape=18, # (List[int]) The size of hidden layer # hidden_size_list=[64], + action_space='discrete', ), # used in state_num of hidden_state learn=dict( diff --git a/dizoo/smac/config/smac_MMM_mappo_config.py b/dizoo/smac/config/smac_MMM_mappo_config.py index 13e9492f67f95c813d26d3c18067f7083f6bb467..9adc93d96ec0879e273a5585f3f77f38f07df80d 100644 --- a/dizoo/smac/config/smac_MMM_mappo_config.py +++ b/dizoo/smac/config/smac_MMM_mappo_config.py @@ -30,7 +30,7 @@ main_config = dict( policy=dict( cuda=True, multi_agent=True, - continuous=False, + action_space='discrete', model=dict( # (int) agent_num: The number of the agent. # For SMAC 3s5z, agent_num=8; for 2c_vs_64zg, agent_num=2. @@ -48,6 +48,7 @@ main_config = dict( action_shape=16, # (List[int]) The size of hidden layer # hidden_size_list=[64], + action_space='discrete', ), # used in state_num of hidden_state learn=dict(