diff --git a/ding/model/template/mavac.py b/ding/model/template/mavac.py index d6b415b6c2c00f70eb7d67a2f60ab1ab67d77c7e..6b76eadf58a1b2b063059dfcb5324225de526147 100644 --- a/ding/model/template/mavac.py +++ b/ding/model/template/mavac.py @@ -28,6 +28,7 @@ class MAVAC(nn.Module): actor_head_layer_num: int = 2, critic_head_hidden_size: int = 64, critic_head_layer_num: int = 1, + action_space: str = 'discrete', activation: Optional[nn.Module] = nn.ReLU(), norm_type: Optional[str] = None, ) -> None: diff --git a/ding/model/template/ppg.py b/ding/model/template/ppg.py index 79c5ba0f6988ee3ca2643badf4dfe0e1db48f2f3..dba98d8d4f713c4c776a6d0c0ad4979ada2f76d6 100644 --- a/ding/model/template/ppg.py +++ b/ding/model/template/ppg.py @@ -14,8 +14,8 @@ class PPG(nn.Module): self, obs_shape: Union[int, SequenceType], action_shape: Union[int, SequenceType], + action_space: str = 'discrete', share_encoder: bool = True, - continuous: bool = False, encoder_hidden_size_list: SequenceType = [128, 128, 64], actor_head_hidden_size: int = 64, actor_head_layer_num: int = 2, @@ -26,7 +26,7 @@ class PPG(nn.Module): ) -> None: super(PPG, self).__init__() self.actor_critic = VAC( - obs_shape, action_shape, share_encoder, continuous, encoder_hidden_size_list, actor_head_hidden_size, + obs_shape, action_shape, action_space, share_encoder, encoder_hidden_size_list, actor_head_hidden_size, actor_head_layer_num, critic_head_hidden_size, critic_head_layer_num, activation, norm_type ) self.aux_critic = copy.deepcopy(self.actor_critic.critic) diff --git a/ding/model/template/qac.py b/ding/model/template/qac.py index cce967c506f967706fc45003835cca094396b47f..20746541538ed56fa244dd21171e1edb8a89a858 100644 --- a/ding/model/template/qac.py +++ b/ding/model/template/qac.py @@ -23,7 +23,7 @@ class QAC(nn.Module): self, obs_shape: Union[int, SequenceType], action_shape: Union[int, SequenceType, EasyDict], - actor_head_type: str, + action_space: str, twin_critic: bool = False, actor_head_hidden_size: int = 64, actor_head_layer_num: int = 1, @@ -39,7 +39,7 @@ class QAC(nn.Module): - obs_shape (:obj:`Union[int, SequenceType]`): Observation's space. - action_shape (:obj:`Union[int, SequenceType, EasyDict]`): Action's space, such as 4, (3, ), \ EasyDict({'action_type_shape': 3, 'action_args_shape': 4}). - - actor_head_type (:obj:`str`): Whether choose ``regression`` or ``reparameterization`` or ``hybrid`` . + - action_space (:obj:`str`): Whether choose ``regression`` or ``reparameterization`` or ``hybrid`` . - twin_critic (:obj:`bool`): Whether include twin critic. - actor_head_hidden_size (:obj:`Optional[int]`): The ``hidden_size`` to pass to actor-nn's ``Head``. 
- actor_head_layer_num (:obj:`int`): The num of layers used in the network to compute Q value output \ @@ -56,9 +56,9 @@ class QAC(nn.Module): obs_shape: int = squeeze(obs_shape) action_shape = squeeze(action_shape) self.action_shape = action_shape - self.actor_head_type = actor_head_type - assert self.actor_head_type in ['regression', 'reparameterization', 'hybrid'] - if self.actor_head_type == 'regression': # DDPG, TD3 + self.action_space = action_space + assert self.action_space in ['regression', 'reparameterization', 'hybrid'] + if self.action_space == 'regression': # DDPG, TD3 self.actor = nn.Sequential( nn.Linear(obs_shape, actor_head_hidden_size), activation, RegressionHead( @@ -70,7 +70,7 @@ class QAC(nn.Module): norm_type=norm_type ) ) - elif self.actor_head_type == 'reparameterization': # SAC + elif self.action_space == 'reparameterization': # SAC self.actor = nn.Sequential( nn.Linear(obs_shape, actor_head_hidden_size), activation, ReparameterizationHead( @@ -82,7 +82,7 @@ class QAC(nn.Module): norm_type=norm_type ) ) - elif self.actor_head_type == 'hybrid': # PADDPG + elif self.action_space == 'hybrid': # PADDPG # hybrid action space: action_type(discrete) + action_args(continuous), # such as {'action_type_shape': torch.LongTensor([0]), 'action_args_shape': torch.FloatTensor([0.1, -0.27])} action_shape.action_args_shape = squeeze(action_shape.action_args_shape) @@ -110,7 +110,7 @@ class QAC(nn.Module): ) self.actor = nn.ModuleList([actor_action_type, actor_action_args]) self.twin_critic = twin_critic - if self.actor_head_type == 'hybrid': + if self.action_space == 'hybrid': critic_input_size = obs_shape + action_shape.action_type_shape + action_shape.action_args_shape else: critic_input_size = obs_shape + action_shape @@ -194,7 +194,7 @@ class QAC(nn.Module): Critic Examples: >>> inputs = {'obs': torch.randn(4,N), 'action': torch.randn(4,1)} - >>> model = QAC(obs_shape=(N, ),action_shape=1,actor_head_type='regression') + >>> model = QAC(obs_shape=(N, ),action_shape=1,action_space='regression') >>> model(inputs, mode='compute_critic')['q_value'] # q value tensor([0.0773, 0.1639, 0.0917, 0.0370], grad_fn=) @@ -245,13 +245,13 @@ class QAC(nn.Module): >>> actor_outputs['logit'][1].shape # sigma >>> torch.Size([4, 64]) """ - if self.actor_head_type == 'regression': + if self.action_space == 'regression': x = self.actor(inputs) return {'action': x['pred']} - elif self.actor_head_type == 'reparameterization': + elif self.action_space == 'reparameterization': x = self.actor(inputs) return {'logit': [x['mu'], x['sigma']]} - elif self.actor_head_type == 'hybrid': + elif self.action_space == 'hybrid': logit = self.actor[0](inputs) action_args = self.actor[1](inputs) return {'logit': logit['logit'], 'action_args': action_args['pred']} @@ -284,14 +284,14 @@ class QAC(nn.Module): Examples: >>> inputs = {'obs': torch.randn(4, N), 'action': torch.randn(4, 1)} - >>> model = QAC(obs_shape=(N, ),action_shape=1,actor_head_type='regression') + >>> model = QAC(obs_shape=(N, ),action_shape=1,action_space='regression') >>> model(inputs, mode='compute_critic')['q_value'] # q value >>> tensor([0.0773, 0.1639, 0.0917, 0.0370], grad_fn=) """ obs, action = inputs['obs'], inputs['action'] assert len(obs.shape) == 2 - if self.actor_head_type == 'hybrid': + if self.action_space == 'hybrid': action_type_logit = inputs['logit'] action_type_logit = torch.softmax(action_type_logit, dim=-1) action_args = action['action_args'] diff --git a/ding/model/template/qac_dist.py b/ding/model/template/qac_dist.py index 
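For reference, a minimal sketch of constructing QAC under the renamed argument; the observation and action sizes below are illustrative placeholders, not values from this patch.

import torch
from easydict import EasyDict
from ding.model.template.qac import QAC

obs_dim = 8  # placeholder size
# DDPG/TD3 style: deterministic regression actor
ddpg_model = QAC(obs_shape=(obs_dim, ), action_shape=3, action_space='regression')
# SAC style: gaussian reparameterization actor
sac_model = QAC(obs_shape=(obs_dim, ), action_shape=3, action_space='reparameterization')
# PADDPG style: hybrid action space with discrete action_type and continuous action_args
hybrid_model = QAC(
    obs_shape=(obs_dim, ),
    action_shape=EasyDict({'action_type_shape': 4, 'action_args_shape': 6}),
    action_space='hybrid',
)

obs = torch.randn(2, obs_dim)
print(ddpg_model(obs, mode='compute_actor')['action'].shape)   # torch.Size([2, 3])
mu, sigma = sac_model(obs, mode='compute_actor')['logit']      # each of shape (2, 3)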
a18c035342d38e3e735024b1e7300c202a00259e..e2ce65f34c76a11a156977e9ab731005d8c25650 100644 --- a/ding/model/template/qac_dist.py +++ b/ding/model/template/qac_dist.py @@ -20,7 +20,7 @@ class QACDIST(nn.Module): self, obs_shape: Union[int, SequenceType], action_shape: Union[int, SequenceType], - actor_head_type: str = "regression", + action_space: str = "regression", critic_head_type: str = "categorical", actor_head_hidden_size: int = 64, actor_head_layer_num: int = 1, @@ -38,7 +38,7 @@ class QACDIST(nn.Module): Arguments: - obs_shape (:obj:`Union[int, SequenceType]`): Observation's space. - action_shape (:obj:`Union[int, SequenceType]`): Action's space. - - actor_head_type (:obj:`str`): Whether choose ``regression`` or ``reparameterization``. + - action_space (:obj:`str`): Whether choose ``regression`` or ``reparameterization``. - critic_head_type (:obj:`str`): Only ``categorical``. - actor_head_hidden_size (:obj:`Optional[int]`): The ``hidden_size`` to pass to actor-nn's ``Head``. - actor_head_layer_num (:obj:`int`): @@ -58,9 +58,9 @@ class QACDIST(nn.Module): super(QACDIST, self).__init__() obs_shape: int = squeeze(obs_shape) action_shape: int = squeeze(action_shape) - self.actor_head_type = actor_head_type - assert self.actor_head_type in ['regression', 'reparameterization'] - if self.actor_head_type == 'regression': + self.action_space = action_space + assert self.action_space in ['regression', 'reparameterization'] + if self.action_space == 'regression': self.actor = nn.Sequential( nn.Linear(obs_shape, actor_head_hidden_size), activation, RegressionHead( @@ -72,7 +72,7 @@ class QACDIST(nn.Module): norm_type=norm_type ) ) - elif self.actor_head_type == 'reparameterization': + elif self.action_space == 'reparameterization': self.actor = nn.Sequential( nn.Linear(obs_shape, actor_head_hidden_size), activation, ReparameterizationHead( @@ -156,7 +156,7 @@ class QACDIST(nn.Module): Critic Examples: >>> # Categorical mode >>> inputs = {'obs': torch.randn(4,N), 'action': torch.randn(4,1)} - >>> model = QACDIST(obs_shape=(N, ),action_shape=1,actor_head_type='regression', \ + >>> model = QACDIST(obs_shape=(N, ),action_shape=1,action_space='regression', \ ... critic_head_type='categorical', n_atoms=51) >>> q_value = model(inputs, mode='compute_critic') # q value >>> assert q_value['q_value'].shape == torch.Size([4, 1]) @@ -204,9 +204,9 @@ class QACDIST(nn.Module): >>> torch.Size([4, 64]) """ x = self.actor(inputs) - if self.actor_head_type == 'regression': + if self.action_space == 'regression': return {'action': x['pred']} - elif self.actor_head_type == 'reparameterization': + elif self.action_space == 'reparameterization': return {'logit': [x['mu'], x['sigma']]} def compute_critic(self, inputs: Dict) -> Dict: @@ -232,7 +232,7 @@ class QACDIST(nn.Module): Examples: >>> # Categorical mode >>> inputs = {'obs': torch.randn(4,N), 'action': torch.randn(4,1)} - >>> model = QACDIST(obs_shape=(N, ),action_shape=1,actor_head_type='regression', \ + >>> model = QACDIST(obs_shape=(N, ),action_shape=1,action_space='regression', \ ... 
critic_head_type='categorical', n_atoms=51) >>> q_value = model(inputs, mode='compute_critic') # q value >>> assert q_value['q_value'].shape == torch.Size([4, 1]) diff --git a/ding/model/template/tests/test_hybrid_qac.py b/ding/model/template/tests/test_hybrid_qac.py index d4f9f279a1bb16ceb8fe5bfec3c7c92ee97c0d0e..018c3f2d36054577ea1fbec3a00b288c12d12404 100644 --- a/ding/model/template/tests/test_hybrid_qac.py +++ b/ding/model/template/tests/test_hybrid_qac.py @@ -16,7 +16,7 @@ hybrid_args = { 'action_args_shape': (6, ) }), 'twin': True, - 'actor_head_type': 'hybrid' + 'action_space': 'hybrid' } @@ -27,10 +27,10 @@ class TestHybridQAC: self, action_shape=hybrid_args['action_shape'], twin=hybrid_args['twin'], - actor_head_type=hybrid_args['actor_head_type'] + action_space=hybrid_args['action_space'] ): N = 32 - assert actor_head_type == 'hybrid' + assert action_space == 'hybrid' inputs = { 'obs': torch.randn(B, N), 'action': { @@ -42,7 +42,7 @@ class TestHybridQAC: model = QAC( obs_shape=(N, ), action_shape=action_shape, - actor_head_type=actor_head_type, + action_space=action_space, critic_head_hidden_size=embedding_size, actor_head_hidden_size=embedding_size, twin_critic=twin, diff --git a/ding/model/template/tests/test_qac.py b/ding/model/template/tests/test_qac.py index d3fca991cf356d552d45c9eb943d1cf66dc0e131..ea0e9348d9b3c6c49a7470cd4c3472b26314b83a 100644 --- a/ding/model/template/tests/test_qac.py +++ b/ding/model/template/tests/test_qac.py @@ -17,16 +17,16 @@ args = list(product(*[action_shape_args, [True, False], ['regression', 'reparame @pytest.mark.unittest -@pytest.mark.parametrize('action_shape, twin, actor_head_type', args) +@pytest.mark.parametrize('action_shape, twin, action_space', args) class TestQAC: - def test_fcqac(self, action_shape, twin, actor_head_type): + def test_fcqac(self, action_shape, twin, action_space): N = 32 inputs = {'obs': torch.randn(B, N), 'action': torch.randn(B, squeeze(action_shape))} model = QAC( obs_shape=(N, ), action_shape=action_shape, - actor_head_type=actor_head_type, + action_space=action_space, critic_head_hidden_size=embedding_size, actor_head_hidden_size=embedding_size, twin_critic=twin, @@ -41,7 +41,7 @@ class TestQAC: # compute_action print(model) - if actor_head_type == 'regression': + if action_space == 'regression': action = model(inputs['obs'], mode='compute_actor')['action'] if squeeze(action_shape) == 1: assert action.shape == (B, ) @@ -49,7 +49,7 @@ class TestQAC: assert action.shape == (B, squeeze(action_shape)) assert action.eq(action.clamp(-1, 1)).all() is_differentiable(action.sum(), model.actor) - elif actor_head_type == 'reparameterization': + elif action_space == 'reparameterization': (mu, sigma) = model(inputs['obs'], mode='compute_actor')['logit'] assert mu.shape == (B, *action_shape) assert sigma.shape == (B, *action_shape) diff --git a/ding/model/template/tests/test_qac_dist.py b/ding/model/template/tests/test_qac_dist.py index 21e8871153c24a7e9616aa0ba7b8fec9684255f1..2e6f8548092e32b21171fc31f7dc31b24e4865d6 100644 --- a/ding/model/template/tests/test_qac_dist.py +++ b/ding/model/template/tests/test_qac_dist.py @@ -17,16 +17,16 @@ args = list(product(*[action_shape_args, ['regression', 'reparameterization']])) @pytest.mark.unittest -@pytest.mark.parametrize('action_shape, actor_head_type', args) +@pytest.mark.parametrize('action_shape, action_space', args) class TestQACDIST: - def test_fcqac_dist(self, action_shape, actor_head_type): + def test_fcqac_dist(self, action_shape, action_space): N = 32 inputs = {'obs': 
torch.randn(B, N), 'action': torch.randn(B, squeeze(action_shape))} model = QACDIST( obs_shape=(N, ), action_shape=action_shape, - actor_head_type=actor_head_type, + action_space=action_space, critic_head_hidden_size=embedding_size, actor_head_hidden_size=embedding_size, ) @@ -43,7 +43,7 @@ class TestQACDIST: # compute_action print(model) - if actor_head_type == 'regression': + if action_space == 'regression': action = model(inputs['obs'], mode='compute_actor')['action'] if squeeze(action_shape) == 1: assert action.shape == (B, ) @@ -51,7 +51,7 @@ class TestQACDIST: assert action.shape == (B, squeeze(action_shape)) assert action.eq(action.clamp(-1, 1)).all() is_differentiable(action.sum(), model.actor) - elif actor_head_type == 'reparameterization': + elif action_space == 'reparameterization': (mu, sigma) = model(inputs['obs'], mode='compute_actor')['logit'] assert mu.shape == (B, *action_shape) assert sigma.shape == (B, *action_shape) diff --git a/ding/model/template/tests/test_vac.py b/ding/model/template/tests/test_vac.py index 698d1a8e83a33f37674a23d2330b256f26877e9f..48eb64e16aa798ca15a87d6adbd9bf1fc4f0ac95 100644 --- a/ding/model/template/tests/test_vac.py +++ b/ding/model/template/tests/test_vac.py @@ -8,8 +8,8 @@ from ding.torch_utils import is_differentiable B, C, H, W = 4, 3, 128, 128 obs_shape = [4, (8, ), (4, 64, 64)] -act_args = [[6, False], [(3, ), True], [[2, 3, 6], False]] -#act_args = [[(3, ), True]] +act_args = [[6, 'discrete'], [(3, ), 'continuous'], [[2, 3, 6], 'discrete']] +# act_args = [[(3, ), True]] args = list(product(*[obs_shape, act_args, [False, True]])) @@ -29,12 +29,12 @@ class TestVAC: inputs = torch.randn(B, obs_shape) else: inputs = torch.randn(B, *obs_shape) - model = VAC(obs_shape, action_shape=act_args[0], continuous=act_args[1], share_encoder=share_encoder) + model = VAC(obs_shape, action_shape=act_args[0], action_space=act_args[1], share_encoder=share_encoder) outputs = model(inputs, mode='compute_actor_critic') value, logit = outputs['value'], outputs['logit'] - if model.continuous: - outputs = value.sum() + logit[0].sum() + logit[1].sum() + if model.action_space == 'continuous': + outputs = value.sum() + logit['mu'].sum() + logit['sigma'].sum() else: if model.multi_head: outputs = value.sum() + sum([t.sum() for t in logit]) @@ -45,8 +45,8 @@ class TestVAC: for p in model.parameters(): p.grad = None logit = model(inputs, mode='compute_actor')['logit'] - if model.continuous: - logit = logit[0].sum() + logit[1].sum() + if model.action_space == 'continuous': + logit = logit['mu'].sum() + logit['sigma'].sum() self.output_check(model.actor, logit, model.action_shape) for p in model.parameters(): diff --git a/ding/model/template/vac.py b/ding/model/template/vac.py index de142a2f439a109fc5f38be2c845941fb99c5c8a..aa970fb4a260b92be218288132f9267aae1344e3 100644 --- a/ding/model/template/vac.py +++ b/ding/model/template/vac.py @@ -1,4 +1,5 @@ from typing import Union, Dict, Optional +from easydict import EasyDict import torch import torch.nn as nn @@ -20,9 +21,9 @@ class VAC(nn.Module): def __init__( self, obs_shape: Union[int, SequenceType], - action_shape: Union[int, SequenceType], + action_shape: Union[int, SequenceType, EasyDict], + action_space: str = 'discrete', share_encoder: bool = True, - continuous: bool = False, encoder_hidden_size_list: SequenceType = [128, 128, 64], actor_head_hidden_size: int = 64, actor_head_layer_num: int = 1, @@ -31,6 +32,7 @@ class VAC(nn.Module): activation: Optional[nn.Module] = nn.ReLU(), norm_type: Optional[str] = None, 
sigma_type: Optional[str] = 'independent', + fixed_sigma_value: Optional[int] = 0.3, bound_type: Optional[str] = None, ) -> None: r""" @@ -39,8 +41,8 @@ class VAC(nn.Module): Arguments: - obs_shape (:obj:`Union[int, SequenceType]`): Observation's space. - action_shape (:obj:`Union[int, SequenceType]`): Action's space. + - action_space (:obj:`str`): Choose action head in ['discrete', 'continuous', 'hybrid'] - share_encoder (:obj:`bool`): Whether share encoder. - - continuous (:obj:`bool`): Whether collect continuously. - encoder_hidden_size_list (:obj:`SequenceType`): Collection of ``hidden_size`` to pass to ``Encoder`` - actor_head_hidden_size (:obj:`Optional[int]`): The ``hidden_size`` to pass to actor-nn's ``Head``. - actor_head_layer_num (:obj:`int`): @@ -56,7 +58,7 @@ class VAC(nn.Module): """ super(VAC, self).__init__() obs_shape: int = squeeze(obs_shape) - action_shape: int = squeeze(action_shape) + action_shape = squeeze(action_shape) self.obs_shape, self.action_shape = obs_shape, action_shape # Encoder Type if isinstance(obs_shape, int) or len(obs_shape) == 1: @@ -81,8 +83,9 @@ class VAC(nn.Module): self.critic_head = RegressionHead( critic_head_hidden_size, 1, critic_head_layer_num, activation=activation, norm_type=norm_type ) - self.continuous = continuous - if self.continuous: + self.action_space = action_space + assert self.action_space in ['discrete', 'continuous', 'hybrid'], self.action_space + if self.action_space == 'continuous': self.multi_head = False self.actor_head = ReparameterizationHead( actor_head_hidden_size, @@ -93,7 +96,7 @@ class VAC(nn.Module): norm_type=norm_type, bound_type=bound_type ) - else: + elif self.action_space == 'discrete': actor_head_cls = DiscreteHead multi_head = not isinstance(action_shape, int) self.multi_head = multi_head @@ -114,6 +117,30 @@ class VAC(nn.Module): activation=activation, norm_type=norm_type ) + elif self.action_space == 'hybrid': # HPPO + # hybrid action space: action_type(discrete) + action_args(continuous), + # such as {'action_type_shape': torch.LongTensor([0]), 'action_args_shape': torch.FloatTensor([0.1, -0.27])} + action_shape.action_args_shape = squeeze(action_shape.action_args_shape) + action_shape.action_type_shape = squeeze(action_shape.action_type_shape) + actor_action_args = ReparameterizationHead( + actor_head_hidden_size, + action_shape.action_args_shape, + actor_head_layer_num, + sigma_type=sigma_type, + fixed_sigma_value=fixed_sigma_value, + activation=activation, + norm_type=norm_type, + bound_type=bound_type, + ) + actor_action_type = DiscreteHead( + actor_head_hidden_size, + action_shape.action_type_shape, + actor_head_layer_num, + activation=activation, + norm_type=norm_type, + ) + self.actor_head = nn.ModuleList([actor_action_type, actor_action_args]) + # must use list, not nn.ModuleList if self.share_encoder: self.actor = [self.encoder, self.actor_head] @@ -203,10 +230,16 @@ class VAC(nn.Module): x = self.encoder(x) else: x = self.actor_encoder(x) - x = self.actor_head(x) - if self.continuous: - x = {'logit': [x['mu'], x['sigma']]} - return x + + if self.action_space == 'discrete': + return self.actor_head(x) + elif self.action_space == 'continuous': + x = self.actor_head(x) # mu, sigma + return {'logit': x} + elif self.action_space == 'hybrid': + action_type = self.actor_head[0](x) + action_args = self.actor_head[1](x) + return {'logit': {'action_type': action_type['logit'], 'action_args': action_args}} def compute_critic(self, x: torch.Tensor) -> Dict: r""" @@ -278,10 +311,16 @@ class VAC(nn.Module): 
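To make the new hybrid branch of VAC concrete, a minimal sketch of building a hybrid-action model and reading its actor output; the shapes are illustrative placeholders, not values from this patch.

import torch
from easydict import EasyDict
from ding.model.template.vac import VAC

batch, obs_dim = 4, 10  # placeholder sizes
model = VAC(
    obs_shape=obs_dim,
    action_shape=EasyDict({'action_type_shape': 3, 'action_args_shape': 2}),
    action_space='hybrid',
    sigma_type='fixed',
    fixed_sigma_value=0.3,
    bound_type='tanh',
)
out = model(torch.randn(batch, obs_dim), mode='compute_actor')
# out['logit']['action_type']: (batch, 3) discrete logits
# out['logit']['action_args']['mu'] and ['sigma']: (batch, 2) gaussian parameters
out = model(torch.randn(batch, obs_dim), mode='compute_actor_critic')
# additionally returns out['value'], the state value predicted by the critic head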
else: actor_embedding = self.actor_encoder(x) critic_embedding = self.critic_encoder(x) - value = self.critic_head(critic_embedding) - actor_output = self.actor_head(actor_embedding) - if self.continuous: - logit = [actor_output['mu'], actor_output['sigma']] - else: - logit = actor_output['logit'] - return {'logit': logit, 'value': value['pred']} + + value = self.critic_head(critic_embedding)['pred'] + + if self.action_space == 'discrete': + logit = self.actor_head(actor_embedding)['logit'] + return {'logit': logit, 'value': value} + elif self.action_space == 'continuous': + x = self.actor_head(actor_embedding) + return {'logit': x, 'value': value} + elif self.action_space == 'hybrid': + action_type = self.actor_head[0](actor_embedding) + action_args = self.actor_head[1](actor_embedding) + return {'logit': {'action_type': action_type['logit'], 'action_args': action_args}, 'value': value} diff --git a/ding/model/wrapper/model_wrappers.py b/ding/model/wrapper/model_wrappers.py index 8c9c04b792af12ee4c1415b61ff8dd02c001a9e0..374892932ff823ef5eb1f00cbd0ec1c8b8da5526 100644 --- a/ding/model/wrapper/model_wrappers.py +++ b/ding/model/wrapper/model_wrappers.py @@ -5,6 +5,7 @@ import numpy as np import torch from ding.torch_utils import get_tensor_data from ding.rl_utils import create_noise_generator +from torch.distributions import Categorical, Independent, Normal class IModelWrapper(ABC): @@ -408,6 +409,91 @@ class HybridEpsGreedyMultinomialSampleWrapper(IModelWrapper): return output +class HybridReparamMultinomialSampleWrapper(IModelWrapper): + """ + Overview: + Reparameterization sampler coupled with multinomial sample used in collector_model + to help balance exploration and exploitation. + In hybrid action space, i.e.{'action_type': discrete, 'action_args', continuous} + Interfaces: + forward + """ + + def forward(self, *args, **kwargs): + output = self._model.forward(*args, **kwargs) + assert isinstance(output, dict), "model output must be dict, but find {}".format(type(output)) + + logit = output['logit'] # logit: {'action_type': action_type_logit, 'action_args': action_args_logit} + # discrete part + action_type_logit = logit['action_type'] + prob = torch.softmax(action_type_logit, dim=-1) + pi_action = Categorical(prob) + action_type = pi_action.sample() + # continuous part + mu, sigma = logit['action_args']['mu'], logit['action_args']['sigma'] + dist = Independent(Normal(mu, sigma), 1) + action_args = dist.sample() + action = {'action_type': action_type, 'action_args': action_args} + output['action'] = action + return output + + +class HybridDeterministicArgmaxSampleWrapper(IModelWrapper): + """ + Overview: + Deterministic sampler coupled with argmax sample used in eval_model. 
+ In hybrid action space, i.e.{'action_type': discrete, 'action_args', continuous} + Interfaces: + forward + """ + + def forward(self, *args, **kwargs): + output = self._model.forward(*args, **kwargs) + assert isinstance(output, dict), "model output must be dict, but find {}".format(type(output)) + logit = output['logit'] # logit: {'action_type': action_type_logit, 'action_args': action_args_logit} + # discrete part + action_type_logit = logit['action_type'] + action_type = action_type_logit.argmax(dim=-1) + # continuous part + mu = logit['action_args']['mu'] + action_args = mu + action = {'action_type': action_type, 'action_args': action_args} + output['action'] = action + return output + + +class DeterministicSample(IModelWrapper): + """ + Overview: + Deterministic sampler (just use mu directly) used in eval_model. + Interfaces: + forward + """ + + def forward(self, *args, **kwargs): + output = self._model.forward(*args, **kwargs) + assert isinstance(output, dict), "model output must be dict, but find {}".format(type(output)) + output['action'] = output['logit']['mu'] + return output + + +class ReparamSample(IModelWrapper): + """ + Overview: + Reparameterization gaussian sampler used in collector_model. + Interfaces: + forward + """ + + def forward(self, *args, **kwargs): + output = self._model.forward(*args, **kwargs) + assert isinstance(output, dict), "model output must be dict, but find {}".format(type(output)) + mu, sigma = output['logit']['mu'], output['logit']['sigma'] + dist = Independent(Normal(mu, sigma), 1) + output['action'] = dist.sample() + return output + + class EpsGreedySampleNGUWrapper(IModelWrapper): r""" Overview: @@ -592,8 +678,12 @@ wrapper_name_map = { 'eps_greedy_sample': EpsGreedySampleWrapper, 'eps_greedy_sample_ngu': EpsGreedySampleNGUWrapper, 'eps_greedy_multinomial_sample': EpsGreedyMultinomialSampleWrapper, + 'deterministic_sample': DeterministicSample, + 'reparam_sample': ReparamSample, 'hybrid_eps_greedy_sample': HybridEpsGreedySampleWrapper, 'hybrid_eps_greedy_multinomial_sample': HybridEpsGreedyMultinomialSampleWrapper, + 'hybrid_reparam_multinomial_sample': HybridReparamMultinomialSampleWrapper, + 'hybrid_deterministic_argmax_sample': HybridDeterministicArgmaxSampleWrapper, 'multinomial_sample': MultinomialSampleWrapper, 'action_noise': ActionNoiseWrapper, # model wrapper @@ -607,6 +697,8 @@ def model_wrap(model, wrapper_name: str = None, **kwargs): if not isinstance(model, IModelWrapper): model = wrapper_name_map['base'](model) model = wrapper_name_map[wrapper_name](model, **kwargs) + else: + raise TypeError("not support model_wrapper type: {}".format(wrapper_name)) return model diff --git a/ding/policy/cql.py b/ding/policy/cql.py index a0a448b36473e3b59b2cf8fa39bbd06ce1f78351..ea1a3ef128cb69934c8631e347243ebe449a77eb 100644 --- a/ding/policy/cql.py +++ b/ding/policy/cql.py @@ -99,7 +99,9 @@ class CQLPolicy(SACPolicy): # and learning_rate_policy in `cfg.policy.learn`. # Default to False. 
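For context on how the new wrappers are wired up (mirroring the PPO policy changes further below), a rough usage sketch; the models here are placeholders built with the VAC template from this patch.

import torch
from easydict import EasyDict
from ding.model import model_wrap
from ding.model.template.vac import VAC

cont_model = VAC(obs_shape=10, action_shape=2, action_space='continuous')
hybrid_model = VAC(
    obs_shape=10,
    action_shape=EasyDict({'action_type_shape': 3, 'action_args_shape': 2}),
    action_space='hybrid',
    sigma_type='fixed',
    fixed_sigma_value=0.3,
)

# continuous: gaussian sample for collection, mu for evaluation
collect_model = model_wrap(cont_model, wrapper_name='reparam_sample')
eval_model = model_wrap(cont_model, wrapper_name='deterministic_sample')
# hybrid: multinomial sample over action_type plus gaussian sample for action_args
hybrid_collect = model_wrap(hybrid_model, wrapper_name='hybrid_reparam_multinomial_sample')
hybrid_eval = model_wrap(hybrid_model, wrapper_name='hybrid_deterministic_argmax_sample')

with torch.no_grad():
    out = hybrid_collect.forward(torch.randn(4, 10), mode='compute_actor')
# out['action'] is {'action_type': LongTensor of shape (4, ), 'action_args': FloatTensor of shape (4, 2)}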
# value_network=False, - actor_head_type='reparameterization', + + # (str type) action_space: Use reparameterization trick for continous action + action_space='reparameterization', ), learn=dict( # (bool) Whether to use multi gpu diff --git a/ding/policy/il.py b/ding/policy/il.py index 8169419bfc5d6ce315e4291c75702bf6063543c1..6ddb0eb17ad27a94d7667d6fb51e1013c8b9684f 100644 --- a/ding/policy/il.py +++ b/ding/policy/il.py @@ -8,10 +8,11 @@ from ding.model import model_wrap from ding.utils import POLICY_REGISTRY from ding.utils.data import default_collate, default_decollate from .base_policy import Policy -try: - from dizoo.gfootball.model.bots import FootballKaggle5thPlaceModel -except ImportError: - FootballKaggle5thPlaceModel = None +# try: +# from dizoo.gfootball.model.bots import FootballKaggle5thPlaceModel +# except ImportError: +# FootballKaggle5thPlaceModel = None +FootballKaggle5thPlaceModel = None @POLICY_REGISTRY.register('IL') diff --git a/ding/policy/ppg.py b/ding/policy/ppg.py index 59af611d7ea3ae930acf3a9ba4766784c538cc8d..00cd225e946e4d34ea43b75641c1358f4c6c20c1 100644 --- a/ding/policy/ppg.py +++ b/ding/policy/ppg.py @@ -86,7 +86,7 @@ class PPGPolicy(Policy): # (bool) Whether to use cuda for network. cuda=False, # (bool) Whether the RL algorithm is on-policy or off-policy. (Note: in practice PPO can be off-policy used) - on_policy=True, + on_policy=False, priority=False, # (bool) Whether use Importance Sampling Weight to correct biased update. If True, priority must be True. priority_IS_weight=False, diff --git a/ding/policy/ppo.py b/ding/policy/ppo.py index f0ae45f54b9af4168f7bc8d4f1f2cf087bac1500..fb961723240fc65e22a4602c09f3f187081ff966 100644 --- a/ding/policy/ppo.py +++ b/ding/policy/ppo.py @@ -32,11 +32,16 @@ class PPOPolicy(Policy): on_policy=True, # (bool) Whether to use priority(priority sample, IS weight, update priority) priority=False, - # (bool) Whether use Importance Sampling Weight to correct biased update. If True, priority must be True. + # (bool) Whether to use Importance Sampling Weight to correct biased update due to priority. + # If True, priority must be True. 
priority_IS_weight=False, + # (bool) Whether to recompurete advantages in each iteration of on-policy PPO recompute_adv=True, - continuous=True, + # (str) Which kind of action space used in PPOPolicy, ['discrete', 'continuous', 'hybrid'] + action_space='discrete', + # (bool) Whether to use nstep return to calculate value target, otherwise, use return = adv + value nstep_return=False, + # (bool) Whether to enable multi-agent training, i.e.: MAPPO multi_agent=False, # (bool) Whether to need policy data in process transition transition_with_policy_data=True, @@ -89,16 +94,22 @@ class PPOPolicy(Policy): self._priority_IS_weight = self._cfg.priority_IS_weight assert not self._priority and not self._priority_IS_weight, "Priority is not implemented in PPO" - self._continuous = self._cfg.continuous + self._action_space = self._cfg.action_space if self._cfg.learn.ppo_param_init: for n, m in self._model.named_modules(): if isinstance(m, torch.nn.Linear): torch.nn.init.orthogonal_(m.weight) torch.nn.init.zeros_(m.bias) - if self._continuous: + if self._action_space in ['continuous', 'hybrid']: # init log sigma - if hasattr(self._model.actor_head, 'log_sigma_param'): - torch.nn.init.constant_(self._model.actor_head.log_sigma_param, -0.5) + if self._action_space == 'continuous': + if hasattr(self._model.actor_head, 'log_sigma_param'): + torch.nn.init.constant_(self._model.actor_head.log_sigma_param, -0.5) + elif self._action_space == 'hybrid': # actor_head[1]: ReparameterizationHead, for action_args + if hasattr(self._model.actor_head[1], 'log_sigma_param'): + torch.nn.init.constant_(self._model.actor_head[1].log_sigma_param, -0.5) + print('init ok') + for m in list(self._model.critic.modules()) + list(self._model.actor.modules()): if isinstance(m, torch.nn.Linear): # orthogonal initialization @@ -194,18 +205,42 @@ class PPOPolicy(Policy): adv = (adv - adv.mean()) / (adv.std() + 1e-8) # Calculate ppo error - if self._continuous: + if self._action_space == 'continuous': ppo_batch = ppo_data( output['logit'], batch['logit'], batch['action'], output['value'], batch['value'], adv, batch['return'], batch['weight'] ) ppo_loss, ppo_info = ppo_error_continuous(ppo_batch, self._clip_ratio) - else: + elif self._action_space == 'discrete': ppo_batch = ppo_data( output['logit'], batch['logit'], batch['action'], output['value'], batch['value'], adv, batch['return'], batch['weight'] ) ppo_loss, ppo_info = ppo_error(ppo_batch, self._clip_ratio) + elif self._action_space == 'hybrid': + # discrete part (discrete policy loss and entropy loss) + ppo_discrete_batch = ppo_policy_data( + output['logit']['action_type'], batch['logit']['action_type'], batch['action']['action_type'], + adv, batch['weight'] + ) + ppo_discrete_loss, ppo_discrete_info = ppo_policy_error(ppo_discrete_batch, self._clip_ratio) + # continuous part (continuous policy loss and entropy loss, value loss) + ppo_continuous_batch = ppo_data( + output['logit']['action_args'], batch['logit']['action_args'], batch['action']['action_args'], + output['value'], batch['value'], adv, batch['return'], batch['weight'] + ) + ppo_continuous_loss, ppo_continuous_info = ppo_error_continuous( + ppo_continuous_batch, self._clip_ratio + ) + # sum discrete and continuous loss + ppo_loss = type(ppo_continuous_loss)( + ppo_continuous_loss.policy_loss + ppo_discrete_loss.policy_loss, ppo_continuous_loss.value_loss, + ppo_continuous_loss.entropy_loss + ppo_discrete_loss.entropy_loss + ) + ppo_info = type(ppo_continuous_info)( + max(ppo_continuous_info.approx_kl, 
ppo_discrete_info.approx_kl), + max(ppo_continuous_info.clipfrac, ppo_discrete_info.clipfrac) + ) wv, we = self._value_weight, self._entropy_weight total_loss = ppo_loss.policy_loss + wv * ppo_loss.value_loss - we * ppo_loss.entropy_loss @@ -225,13 +260,13 @@ class PPOPolicy(Policy): 'value_max': output['value'].max().item(), 'approx_kl': ppo_info.approx_kl, 'clipfrac': ppo_info.clipfrac, - 'act': batch['action'].float().mean().item(), } - if self._continuous: + if self._action_space == 'continuous': return_info.update( { - 'mu_mean': output['logit'][0].mean().item(), - 'sigma_mean': output['logit'][1].mean().item(), + 'act': batch['action'].float().mean().item(), + 'mu_mean': output['logit']['mu'].mean().item(), + 'sigma_mean': output['logit']['sigma'].mean().item(), } ) return_infos.append(return_info) @@ -254,11 +289,13 @@ class PPOPolicy(Policy): Init traj and unroll length, collect model. """ self._unroll_len = self._cfg.collect.unroll_len - self._continuous = self._cfg.continuous - if self._continuous: - self._collect_model = model_wrap(self._model, wrapper_name='base') - else: + self._action_space = self._cfg.action_space + if self._action_space == 'continuous': + self._collect_model = model_wrap(self._model, wrapper_name='reparam_sample') + elif self._action_space == 'discrete': self._collect_model = model_wrap(self._model, wrapper_name='multinomial_sample') + elif self._action_space == 'hybrid': + self._collect_model = model_wrap(self._model, wrapper_name='hybrid_reparam_multinomial_sample') self._collect_model.reset() self._gamma = self._cfg.collect.discount_factor self._gae_lambda = self._cfg.collect.gae_lambda @@ -283,10 +320,6 @@ class PPOPolicy(Policy): self._collect_model.eval() with torch.no_grad(): output = self._collect_model.forward(data, mode='compute_actor_critic') - if self._continuous: - (mu, sigma), value = output['logit'], output['value'] - dist = Independent(Normal(mu, sigma), 1) - output['action'] = dist.sample() if self._cuda: output = to_device(output, 'cpu') output = default_decollate(output) @@ -378,11 +411,13 @@ class PPOPolicy(Policy): Evaluate mode init method. Called by ``self.__init__``. Init eval model with argmax strategy. 
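A compact, self-contained illustration of the hybrid loss composition above, using random tensors in place of real batch data; the namedtuple field order follows the calls in this patch, and the 0.5/0.01 weights are placeholder values.

import torch
from ding.rl_utils import ppo_data, ppo_policy_data, ppo_error_continuous, ppo_policy_error

B, n_type, n_args = 4, 3, 2  # placeholder sizes
adv, ret, weight = torch.randn(B), torch.randn(B), torch.ones(B)
value_new, value_old = torch.randn(B, requires_grad=True), torch.randn(B)

# discrete part: policy and entropy terms only
disc_batch = ppo_policy_data(
    torch.randn(B, n_type, requires_grad=True), torch.randn(B, n_type),
    torch.randint(n_type, (B, )), adv, weight
)
disc_loss, disc_info = ppo_policy_error(disc_batch, 0.2)

# continuous part: policy, entropy and the (shared) value term; mu/sigma are now passed as dicts
mu_sigma_new = {'mu': torch.randn(B, n_args, requires_grad=True), 'sigma': torch.rand(B, n_args) + 0.1}
mu_sigma_old = {'mu': torch.randn(B, n_args), 'sigma': torch.rand(B, n_args) + 0.1}
cont_batch = ppo_data(mu_sigma_new, mu_sigma_old, torch.randn(B, n_args), value_new, value_old, adv, ret, weight)
cont_loss, cont_info = ppo_error_continuous(cont_batch, 0.2)

# combine the two parts as the hybrid branch does
policy_loss = disc_loss.policy_loss + cont_loss.policy_loss
entropy_loss = disc_loss.entropy_loss + cont_loss.entropy_loss
value_loss = cont_loss.value_loss
total_loss = policy_loss + 0.5 * value_loss - 0.01 * entropy_loss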
""" - self._continuous = self._cfg.continuous - if self._continuous: - self._eval_model = model_wrap(self._model, wrapper_name='base') - else: + self._action_space = self._cfg.action_space + if self._action_space == 'continuous': + self._eval_model = model_wrap(self._model, wrapper_name='deterministic_sample') + elif self._action_space == 'discrete': self._eval_model = model_wrap(self._model, wrapper_name='argmax_sample') + elif self._action_space == 'hybrid': + self._eval_model = model_wrap(self._model, wrapper_name='hybrid_deterministic_argmax_sample') self._eval_model.reset() def _forward_eval(self, data: dict) -> dict: @@ -404,9 +439,6 @@ class PPOPolicy(Policy): self._eval_model.eval() with torch.no_grad(): output = self._eval_model.forward(data, mode='compute_actor') - if self._continuous: - (mu, sigma) = output['logit'] - output.update({'action': mu}) if self._cuda: output = to_device(output, 'cpu') output = default_decollate(output) @@ -430,7 +462,7 @@ class PPOPolicy(Policy): 'value_max', 'value_mean', ] - if self._continuous: + if self._action_space == 'continuous': variables += ['mu_mean', 'sigma_mean', 'sigma_grad', 'act'] return variables diff --git a/ding/policy/sac.py b/ding/policy/sac.py index 78f154d83141def97214df16f955b7776949702c..48d0d4d2bfd0c14cfd3cb926fd795f8492506ed9 100644 --- a/ding/policy/sac.py +++ b/ding/policy/sac.py @@ -599,7 +599,9 @@ class SACPolicy(Policy): # and learning_rate_policy in `cfg.policy.learn`. # Default to False. # value_network=False, - actor_head_type='reparameterization', + + # (str type) action_space: Use reparameterization trick for continous action + action_space='reparameterization', ), learn=dict( # (bool) Whether to use multi gpu diff --git a/ding/rl_utils/ppo.py b/ding/rl_utils/ppo.py index e51851ab85c468faff6b2fdf5d2f6d079a6ae1f6..256b297b68a50f3679ec9885d9bdb98da26a6e34 100644 --- a/ding/rl_utils/ppo.py +++ b/ding/rl_utils/ppo.py @@ -108,7 +108,6 @@ def ppo_policy_error(data: namedtuple, # only use dual_clip when adv < 0 policy_loss = -(torch.where(adv < 0, clip2, clip1) * weight).mean() else: - #policy_loss = (-torch.min(surr1, surr2) * weight).mean() policy_loss = (-torch.min(surr1, surr2) * weight).mean() with torch.no_grad(): approx_kl = (logp_old - logp_new).mean().item() @@ -179,11 +178,11 @@ def ppo_error_continuous( if weight is None: weight = torch.ones_like(adv) - dist_new = Independent(Normal(mu_sigma_new[0], mu_sigma_new[1]), 1) - if len(mu_sigma_old[0].shape) == 1: - dist_old = Independent(Normal(mu_sigma_old[0].unsqueeze(-1), mu_sigma_old[1].unsqueeze(-1)), 1) + dist_new = Independent(Normal(mu_sigma_new['mu'], mu_sigma_new['sigma']), 1) + if len(mu_sigma_old['mu'].shape) == 1: + dist_old = Independent(Normal(mu_sigma_old['mu'].unsqueeze(-1), mu_sigma_old['sigma'].unsqueeze(-1)), 1) else: - dist_old = Independent(Normal(mu_sigma_old[0], mu_sigma_old[1]), 1) + dist_old = Independent(Normal(mu_sigma_old['mu'], mu_sigma_old['sigma']), 1) logp_new = dist_new.log_prob(action) logp_old = dist_old.log_prob(action) entropy_loss = (dist_new.entropy() * weight).mean() diff --git a/ding/rl_utils/tests/test_ppo.py b/ding/rl_utils/tests/test_ppo.py index bab78c148548965dd5021cc4e19a1df8764b3751..a72d0e3b1674dc2df290ee4f502187acab9c4db3 100644 --- a/ding/rl_utils/tests/test_ppo.py +++ b/ding/rl_utils/tests/test_ppo.py @@ -70,11 +70,11 @@ def test_mappo(): @pytest.mark.parametrize('use_value_clip, dual_clip, weight', args) def test_ppo_error_continous(use_value_clip, dual_clip, weight): B, N = 4, 6 - mu_sigma_new = [torch.rand(B, 
N).requires_grad_(True), torch.rand(B, N).requires_grad_(True)] - mu_sigma_old = [ - mu_sigma_new[0] + torch.rand_like(mu_sigma_new[0]) * 0.1, - mu_sigma_new[1] + torch.rand_like(mu_sigma_new[1]) * 0.1 - ] + mu_sigma_new = {'mu': torch.rand(B, N).requires_grad_(True), 'sigma': torch.rand(B, N).requires_grad_(True)} + mu_sigma_old = { + 'mu': mu_sigma_new['mu'] + torch.rand_like(mu_sigma_new['mu']) * 0.1, + 'sigma': mu_sigma_new['sigma'] + torch.rand_like(mu_sigma_new['sigma']) * 0.1 + } action = torch.rand(B, N) value_new = torch.randn(B).requires_grad_(True) value_old = value_new + torch.rand_like(value_new) * 0.1 @@ -84,9 +84,9 @@ def test_ppo_error_continous(use_value_clip, dual_clip, weight): loss, info = ppo_error_continuous(data, use_value_clip=use_value_clip, dual_clip=dual_clip) assert all([l.shape == tuple() for l in loss]) assert all([np.isscalar(i) for i in info]) - assert mu_sigma_new[0].grad is None + assert mu_sigma_new['mu'].grad is None assert value_new.grad is None total_loss = sum(loss) total_loss.backward() - assert isinstance(mu_sigma_new[0].grad, torch.Tensor) + assert isinstance(mu_sigma_new['mu'].grad, torch.Tensor) assert isinstance(value_new.grad, torch.Tensor) diff --git a/ding/utils/default_helper.py b/ding/utils/default_helper.py index 6f0ebb144e309a1211d8563efe1edef4b0476b20..ed6180d96034f60556f0ad7c8fac5c0679303d0c 100644 --- a/ding/utils/default_helper.py +++ b/ding/utils/default_helper.py @@ -410,6 +410,17 @@ def one_time_warning(warning_msg: str) -> None: logging.warning(warning_msg) +def split_fn(data, indices, start, end): + if data is None: + return None + elif isinstance(data, list): + return [split_fn(d, indices, start, end) for d in data] + elif isinstance(data, dict): + return {k1: split_fn(v1, indices, start, end) for k1, v1 in data.items()} + else: + return data[indices[start:end]] + + def split_data_generator(data: dict, split_size: int, shuffle: bool = True) -> dict: assert isinstance(data, dict), type(data) length = [] @@ -436,31 +447,7 @@ def split_data_generator(data: dict, split_size: int, shuffle: bool = True) -> d for i in range(0, length, split_size): if i + split_size > length: i = length - split_size - batch = {} - for k in data.keys(): - if data[k] is None: - batch[k] = None - elif k.startswith('prev_state'): - batch[k] = [data[k][t] for t in indices[i:i + split_size]] - elif isinstance(data[k], list) or isinstance(data[k], tuple): - if isinstance(data[k][0], list) and k == 'logit': - # for continuous action - # transform to mu_sigma (:obj:`list`): :math:`[(B, N), (B, N)]`, - # where B is batch size and N is action dim - batch[k] = [ - torch.stack( - [ - data[k][transition_index][mu_sigma_index] - for transition_index in indices[i:i + split_size] - ] - ) for mu_sigma_index in range(2) - ] - else: # for discrete action - batch[k] = [t[indices[i:i + split_size]] for t in data[k]] - elif isinstance(data[k], dict): - batch[k] = {k1: v1[indices[i:i + split_size]] for k1, v1 in data[k].items()} - else: - batch[k] = data[k][indices[i:i + split_size]] + batch = split_fn(data, indices, i, i + split_size) yield batch diff --git a/dizoo/box2d/bipedalwalker/config/bipedalwalker_ppo_config.py b/dizoo/box2d/bipedalwalker/config/bipedalwalker_ppo_config.py index e58586722c70d32719ba98d418a1726e8c8bb099..91c377f7a8cd585007bf64612676d6efe357767f 100644 --- a/dizoo/box2d/bipedalwalker/config/bipedalwalker_ppo_config.py +++ b/dizoo/box2d/bipedalwalker/config/bipedalwalker_ppo_config.py @@ -15,22 +15,19 @@ bipedalwalker_ppo_config = dict( ), policy=dict( 
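The recursive split_fn added above replaces the per-key special casing in split_data_generator; a small illustration of how it slices nested batch structures, with made-up values.

import torch
from ding.utils.default_helper import split_fn

indices = torch.randperm(8)
batch = {
    'obs': torch.randn(8, 10),
    'logit': {'mu': torch.randn(8, 2), 'sigma': torch.rand(8, 2)},  # continuous-PPO logit is now a dict
    'weight': None,
}
mini_batch = split_fn(batch, indices, 0, 4)
# mini_batch['obs']: (4, 10); mini_batch['logit']['mu']: (4, 2); mini_batch['weight'] stays None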
cuda=False, - continuous=True, + action_space='continuous', model=dict( - continuous=True, + action_space='continuous', obs_shape=24, action_shape=4, ), learn=dict( epoch_per_collect=10, - # update_per_collect=4, # offpolicy batch_size=64, learning_rate=0.001, value_weight=0.5, entropy_weight=0.01, clip_ratio=0.2, - nstep=1, - nstep_return=False, adv_norm=True, ), collect=dict( @@ -50,7 +47,6 @@ bipedalwalker_ppo_create_config = dict( ), env_manager=dict(type='base'), - # policy=dict(type='ppo_offpolicy'), # TODO policy=dict(type='ppo'), ) bipedalwalker_ppo_create_config = EasyDict(bipedalwalker_ppo_create_config) diff --git a/dizoo/box2d/lunarlander/config/lunarlander_gcl_config.py b/dizoo/box2d/lunarlander/config/lunarlander_gcl_config.py index 5f7394786c9e9bc9b2263a4b528cf2c29fc79b07..cc79da66362441ddc776f1a6e74a628f7b0adf93 100644 --- a/dizoo/box2d/lunarlander/config/lunarlander_gcl_config.py +++ b/dizoo/box2d/lunarlander/config/lunarlander_gcl_config.py @@ -18,11 +18,12 @@ lunarlander_ppo_config = dict( ), policy=dict( cuda=False, - continuous=False, + action_space='discrete', recompute_adv=True, model=dict( obs_shape=8, action_shape=4, + action_space='discrete', ), learn=dict( update_per_collect=8, diff --git a/dizoo/box2d/lunarlander/config/lunarlander_ngu_config.py b/dizoo/box2d/lunarlander/config/lunarlander_ngu_config.py index 57c9fe6ec9ad455689f2f77d717e5d7c41161bd0..7a8ed191c4ee77e86d759f6d899ec4bd2d04042c 100644 --- a/dizoo/box2d/lunarlander/config/lunarlander_ngu_config.py +++ b/dizoo/box2d/lunarlander/config/lunarlander_ngu_config.py @@ -44,8 +44,6 @@ lunarlander_ngu_config = dict( type='episodic', ), policy=dict( - continuous=False, - on_policy=False, cuda=True, priority=True, priority_IS_weight=True, diff --git a/dizoo/box2d/lunarlander/config/lunarlander_rnd_onppo_config.py b/dizoo/box2d/lunarlander/config/lunarlander_rnd_onppo_config.py index 839bec95bafa04381961e1d114d71506fe425817..825c097a23f00e843bb1154e9d85b4b8a2b8ae28 100644 --- a/dizoo/box2d/lunarlander/config/lunarlander_rnd_onppo_config.py +++ b/dizoo/box2d/lunarlander/config/lunarlander_rnd_onppo_config.py @@ -1,6 +1,6 @@ from easydict import EasyDict from ding.entry import serial_pipeline_reward_model -collector_env_num=8 +collector_env_num = 8 lunarlander_ppo_rnd_config = dict( env=dict( collector_env_num=collector_env_num, @@ -15,20 +15,19 @@ lunarlander_ppo_rnd_config = dict( # batch_size=32, # update_per_collect=10, batch_size=320, - update_per_collect=4, # TODO(pu):2 + update_per_collect=4, ), policy=dict( recompute_adv=True, cuda=True, - continuous=False, - on_policy=True, + action_space='discrete', model=dict( obs_shape=8, action_shape=4, + action_space='discrete', ), learn=dict( - # update_per_collect=4, - epoch_per_collect=10, # TODO(pu) + epoch_per_collect=10, update_per_collect=1, # 4 batch_size=64, learning_rate=3e-4, diff --git a/dizoo/classic_control/cartpole/config/cartpole_gcl_config.py b/dizoo/classic_control/cartpole/config/cartpole_gcl_config.py index 71972d94a577a668476fbe2e4d246cb1fc00b4cf..fa4ea6935db4600c6da905c7a9afaa2c09a1efa9 100644 --- a/dizoo/classic_control/cartpole/config/cartpole_gcl_config.py +++ b/dizoo/classic_control/cartpole/config/cartpole_gcl_config.py @@ -17,11 +17,12 @@ cartpole_gcl_ppo_onpolicy_config = dict( ), policy=dict( cuda=False, - continuous=False, recompute_adv=True, + action_space='discrete', model=dict( obs_shape=4, action_shape=2, + action_space='discrete', encoder_hidden_size_list=[64, 64, 128], critic_head_hidden_size=128, actor_head_hidden_size=128, diff 
--git a/dizoo/classic_control/cartpole/config/cartpole_ppo_config.py b/dizoo/classic_control/cartpole/config/cartpole_ppo_config.py index f63ba82b011feea4ac5e04c04c8caf0c747cc718..ffda4cf2367968f0dc4b49040f82ecbab6a89e8e 100644 --- a/dizoo/classic_control/cartpole/config/cartpole_ppo_config.py +++ b/dizoo/classic_control/cartpole/config/cartpole_ppo_config.py @@ -10,10 +10,11 @@ cartpole_ppo_config = dict( ), policy=dict( cuda=False, - continuous=False, + action_space='discrete', model=dict( obs_shape=4, action_shape=2, + action_space='discrete', encoder_hidden_size_list=[64, 64, 128], critic_head_hidden_size=128, actor_head_hidden_size=128, diff --git a/dizoo/classic_control/cartpole/config/cartpole_ppo_icm_config.py b/dizoo/classic_control/cartpole/config/cartpole_ppo_icm_config.py index 94818ebc1c70217ff262ec4b925d99db54f57542..e2ca8b126ab9fc1995954c4714ca70cbaa42197d 100644 --- a/dizoo/classic_control/cartpole/config/cartpole_ppo_icm_config.py +++ b/dizoo/classic_control/cartpole/config/cartpole_ppo_icm_config.py @@ -20,10 +20,11 @@ cartpole_ppo_icm_config = dict( ), policy=dict( cuda=False, - continuous=False, + action_space='discrete', model=dict( obs_shape=4, action_shape=2, + action_space='discrete', encoder_hidden_size_list=[64, 64, 128], critic_head_hidden_size=128, actor_head_hidden_size=128, @@ -60,4 +61,4 @@ cartpole_ppo_icm_create_config = EasyDict(cartpole_ppo_icm_create_config) create_config = cartpole_ppo_icm_create_config if __name__ == '__main__': - serial_pipeline_reward_model([main_config, create_config], seed=0) \ No newline at end of file + serial_pipeline_reward_model([main_config, create_config], seed=0) diff --git a/dizoo/classic_control/cartpole/config/cartpole_ppo_offpolicy_config.py b/dizoo/classic_control/cartpole/config/cartpole_ppo_offpolicy_config.py index 59ba6824af72c81450ef123f1d59cbf7c0c84c0e..2c7499fc5fe2001223e4973473cc6c267d319a05 100644 --- a/dizoo/classic_control/cartpole/config/cartpole_ppo_offpolicy_config.py +++ b/dizoo/classic_control/cartpole/config/cartpole_ppo_offpolicy_config.py @@ -16,6 +16,7 @@ cartpole_ppo_offpolicy_config = dict( encoder_hidden_size_list=[64, 64, 128], critic_head_hidden_size=128, actor_head_hidden_size=128, + action_space='discrete', ), learn=dict( update_per_collect=6, diff --git a/dizoo/classic_control/pendulum/config/pendulum_cql_config.py b/dizoo/classic_control/pendulum/config/pendulum_cql_config.py index 8d7e677a318a75f64050e30e7cfdad03e8f7e2ac..7e91497b52afba1184b3ba4bcc43771ff2b3f05c 100644 --- a/dizoo/classic_control/pendulum/config/pendulum_cql_config.py +++ b/dizoo/classic_control/pendulum/config/pendulum_cql_config.py @@ -15,7 +15,7 @@ pendulum_cql_default_config = dict( obs_shape=3, action_shape=1, twin_critic=True, - actor_head_type='reparameterization', + action_space='reparameterization', actor_head_hidden_size=128, critic_head_hidden_size=128, ), diff --git a/dizoo/classic_control/pendulum/config/pendulum_d4pg_config.py b/dizoo/classic_control/pendulum/config/pendulum_d4pg_config.py index c8d9fde2498697070f3be7ff21449bd3078fb1b4..27c02fc2c444bc305068b94cd18191ba08ce6f7e 100644 --- a/dizoo/classic_control/pendulum/config/pendulum_d4pg_config.py +++ b/dizoo/classic_control/pendulum/config/pendulum_d4pg_config.py @@ -17,7 +17,7 @@ pendulum_d4pg_config = dict( model=dict( obs_shape=3, action_shape=1, - actor_head_type='regression', + action_space='regression', v_min=-100, v_max=100, n_atom=51, diff --git a/dizoo/classic_control/pendulum/config/pendulum_ddpg_config.py 
b/dizoo/classic_control/pendulum/config/pendulum_ddpg_config.py index ce6da56e30fef846430eb3939ca846ba8cc9709a..dd41f26d47a0eb41e54296495568e450e21f807b 100644 --- a/dizoo/classic_control/pendulum/config/pendulum_ddpg_config.py +++ b/dizoo/classic_control/pendulum/config/pendulum_ddpg_config.py @@ -17,7 +17,7 @@ pendulum_ddpg_config = dict( obs_shape=3, action_shape=1, twin_critic=False, - actor_head_type='regression', + action_space='regression', ), learn=dict( update_per_collect=2, diff --git a/dizoo/classic_control/pendulum/config/pendulum_ppo_config.py b/dizoo/classic_control/pendulum/config/pendulum_ppo_config.py index aeff665523a80fd922ed4e8275dd420c31f205fe..545a9e25f42a45ffb93a5886660931256ef46682 100644 --- a/dizoo/classic_control/pendulum/config/pendulum_ppo_config.py +++ b/dizoo/classic_control/pendulum/config/pendulum_ppo_config.py @@ -10,13 +10,13 @@ pendulum_ppo_config = dict( ), policy=dict( cuda=False, - continuous=True, + action_space='continuous', recompute_adv=True, model=dict( obs_shape=3, action_shape=1, encoder_hidden_size_list=[64, 64], - continuous=True, + action_space='continuous', actor_head_layer_num=0, critic_head_layer_num=0, sigma_type='conditioned', diff --git a/dizoo/classic_control/pendulum/config/pendulum_sac_config.py b/dizoo/classic_control/pendulum/config/pendulum_sac_config.py index f3370c0646290919ad457d2cec2f2656bbdae93e..bac739e8d547ea07d71501ed9d74e2f9fe9bedc2 100644 --- a/dizoo/classic_control/pendulum/config/pendulum_sac_config.py +++ b/dizoo/classic_control/pendulum/config/pendulum_sac_config.py @@ -18,7 +18,7 @@ pendulum_sac_config = dict( obs_shape=3, action_shape=1, twin_critic=True, - actor_head_type='reparameterization', + action_space='reparameterization', actor_head_hidden_size=128, critic_head_hidden_size=128, ), diff --git a/dizoo/classic_control/pendulum/config/pendulum_sac_data_generation_default_config.py b/dizoo/classic_control/pendulum/config/pendulum_sac_data_generation_default_config.py index ecba243d68e8a1ea706de3afe9ee556a9b2d1618..a673e8bde36e335a7a76b845790fc26d77204a0e 100644 --- a/dizoo/classic_control/pendulum/config/pendulum_sac_data_generation_default_config.py +++ b/dizoo/classic_control/pendulum/config/pendulum_sac_data_generation_default_config.py @@ -17,7 +17,7 @@ pendulum_sac_data_genearation_default_config = dict( obs_shape=3, action_shape=1, twin_critic=True, - actor_head_type='reparameterization', + action_space='reparameterization', actor_head_hidden_size=128, critic_head_hidden_size=128, ), diff --git a/dizoo/classic_control/pendulum/config/pendulum_td3_bc_config.py b/dizoo/classic_control/pendulum/config/pendulum_td3_bc_config.py index d531358b469ee77a89265736b98b1f707060fc3f..200aa632ab994570262b8102e81a77d8b70a9158 100644 --- a/dizoo/classic_control/pendulum/config/pendulum_td3_bc_config.py +++ b/dizoo/classic_control/pendulum/config/pendulum_td3_bc_config.py @@ -18,7 +18,7 @@ pendulum_td3_bc_config = dict( obs_shape=3, action_shape=1, twin_critic=True, - actor_head_type='regression', + action_space='regression', actor_head_hidden_size=128, critic_head_hidden_size=128, ), @@ -44,7 +44,7 @@ pendulum_td3_bc_config = dict( noise_sigma=0.1, collector=dict(collect_print_freq=1000, ), data_type='hdf5', - data_path = './td3/expert_demos.hdf5', + data_path='./td3/expert_demos.hdf5', normalize_states=True, ), eval=dict(evaluator=dict(eval_freq=100, ), ), diff --git a/dizoo/classic_control/pendulum/config/pendulum_td3_config.py b/dizoo/classic_control/pendulum/config/pendulum_td3_config.py index 
a23944e6dc0aeb6bd0a468b15056cdea81eeaa72..810f0bd34d2f669fe28ecfdf3b017d9c244e6e92 100644 --- a/dizoo/classic_control/pendulum/config/pendulum_td3_config.py +++ b/dizoo/classic_control/pendulum/config/pendulum_td3_config.py @@ -18,7 +18,7 @@ pendulum_td3_config = dict( obs_shape=3, action_shape=1, twin_critic=True, - actor_head_type='regression', + action_space='regression', ), learn=dict( update_per_collect=2, diff --git a/dizoo/classic_control/pendulum/config/pendulum_td3_data_generation_config.py b/dizoo/classic_control/pendulum/config/pendulum_td3_data_generation_config.py index c1f59e8b8b854c62bd7ae7569b2ba8469370b3b2..357443486de70dd63b4e79a95e5e49395bf5bc57 100644 --- a/dizoo/classic_control/pendulum/config/pendulum_td3_data_generation_config.py +++ b/dizoo/classic_control/pendulum/config/pendulum_td3_data_generation_config.py @@ -18,7 +18,7 @@ pendulum_td3_generation_config = dict( obs_shape=3, action_shape=1, twin_critic=True, - actor_head_type='regression', + action_space='regression', ), learn=dict( update_per_collect=2, @@ -33,7 +33,7 @@ pendulum_td3_generation_config = dict( min=-0.5, max=0.5, ), - learner = dict( + learner=dict( load_path='./td3/ckpt/ckpt_best.pth.tar', hook=dict( load_ckpt_before_run='./td3/ckpt/ckpt_best.pth.tar', @@ -46,7 +46,7 @@ pendulum_td3_generation_config = dict( noise_sigma=0.1, collector=dict(collect_print_freq=1000, ), save_path='./td3/expert.pkl', - data_type = 'hdf5', + data_type='hdf5', ), eval=dict(evaluator=dict(eval_freq=100, ), ), other=dict(replay_buffer=dict( diff --git a/dizoo/gym_hybrid/config/gym_hybrid_ddpg_config.py b/dizoo/gym_hybrid/config/gym_hybrid_ddpg_config.py index a3c1e7d2821428004e526bda53749657ec4510aa..9dda3bd90f7f3189dcfff806c54b0729ebfea4d2 100644 --- a/dizoo/gym_hybrid/config/gym_hybrid_ddpg_config.py +++ b/dizoo/gym_hybrid/config/gym_hybrid_ddpg_config.py @@ -24,10 +24,9 @@ gym_hybrid_ddpg_config = dict( action_args_shape=2, ), twin_critic=False, - actor_head_type='hybrid', + action_space='hybrid', ), learn=dict( - action_space='hybrid', update_per_collect=10, # [5, 10] batch_size=32, discount_factor=0.99, diff --git a/dizoo/gym_hybrid/config/gym_hybrid_hppo_config.py b/dizoo/gym_hybrid/config/gym_hybrid_hppo_config.py new file mode 100644 index 0000000000000000000000000000000000000000..f6822b45eaefbb92bf420a588eb9327be92bb205 --- /dev/null +++ b/dizoo/gym_hybrid/config/gym_hybrid_hppo_config.py @@ -0,0 +1,66 @@ +from easydict import EasyDict +from ding.entry import serial_pipeline_onpolicy + +gym_hybrid_hppo_config = dict( + exp_name='gym_hybrid_hppo_actsacle_fsv0.3_ew0.03_seed0', + env=dict( + collector_env_num=8, + evaluator_env_num=5, + # (bool) Scale output action into legal range, usually [-1, 1]. 
+ act_scale=True, + env_id='Moving-v0', # ['Sliding-v0', 'Moving-v0'] + n_evaluator_episode=5, + stop_value=1.8, + ), + policy=dict( + cuda=True, + priority=False, + action_space='hybrid', + recompute_adv=True, + model=dict( + obs_shape=10, + action_shape=dict( + action_type_shape=3, + action_args_shape=2, + ), + action_space='hybrid', + encoder_hidden_size_list=[256, 128, 64, 64], + sigma_type='fixed', + fixed_sigma_value=0.3, # TODO(pu) + bound_type='tanh', + ), + learn=dict( + epoch_per_collect=10, + batch_size=320, + learning_rate=3e-4, + value_weight=0.5, + entropy_weight=0.03, # TODO(pu) + clip_ratio=0.2, + adv_norm=True, + value_norm=True, + ), + collect=dict( + n_sample=int(3200), + discount_factor=0.99, + gae_lambda=0.95, + collector=dict(collect_print_freq=1000, ), + ), + eval=dict(evaluator=dict(eval_freq=200, ), ), + ), +) +gym_hybrid_hppo_config = EasyDict(gym_hybrid_hppo_config) +main_config = gym_hybrid_hppo_config + +gym_hybrid_hppo_create_config = dict( + env=dict( + type='gym_hybrid', + import_names=['dizoo.gym_hybrid.envs.gym_hybrid_env'], + ), + env_manager=dict(type='base'), + policy=dict(type='ppo'), +) +gym_hybrid_hppo_create_config = EasyDict(gym_hybrid_hppo_create_config) +create_config = gym_hybrid_hppo_create_config + +if __name__ == "__main__": + serial_pipeline_onpolicy([main_config, create_config], seed=0) diff --git a/dizoo/gym_hybrid/config/gym_hybrid_pdqn_config.py b/dizoo/gym_hybrid/config/gym_hybrid_pdqn_config.py index 138980eac411fc7936c4a17cf7a1b8dbb845de0c..2067a9cdbaf14540278f9ea7fcaf83cfd4ecc90e 100644 --- a/dizoo/gym_hybrid/config/gym_hybrid_pdqn_config.py +++ b/dizoo/gym_hybrid/config/gym_hybrid_pdqn_config.py @@ -3,7 +3,6 @@ from ding.entry import serial_pipeline gym_hybrid_pdqn_config = dict( exp_name='gym_hybrid_pdqn_seed1', - # exp_name='gym_hybrid_pdqn_dataaction_1encoder_lrd1e-5_lrc1e-3_upc100_seed0', env=dict( collector_env_num=8, diff --git a/dizoo/gym_hybrid/envs/gym_hybrid_env.py b/dizoo/gym_hybrid/envs/gym_hybrid_env.py index e48680b0770973eb011211d845367513789effc3..86ec5e84b972be20186a29b5f37b700982dd3b07 100644 --- a/dizoo/gym_hybrid/envs/gym_hybrid_env.py +++ b/dizoo/gym_hybrid/envs/gym_hybrid_env.py @@ -54,8 +54,11 @@ class GymHybridEnv(BaseEnv): def step(self, action: Dict) -> BaseEnvTimestep: if self._act_scale: - # acceleration_value + # acceleration_value. action['action_args'][0] = affine_transform(action['action_args'][0], min_val=0, max_val=1) + # rotation_value. 
Following line can be omitted, because in the affine_transform function, + # we have already done the clip(-1,1) operation + action['action_args'][1] = affine_transform(action['action_args'][1], min_val=-1, max_val=1) action = [action['action_type'], action['action_args']] obs, rew, done, info = self._env.step(action) self._final_eval_reward += rew diff --git a/dizoo/league_demo/league_demo_ppo_config.py b/dizoo/league_demo/league_demo_ppo_config.py index dcb95f3b5611d7d55013b3e325d5d22db452b504..7ded7cca10818998a5d3e9c72d304aeeb0e5f7de 100644 --- a/dizoo/league_demo/league_demo_ppo_config.py +++ b/dizoo/league_demo/league_demo_ppo_config.py @@ -13,10 +13,11 @@ league_demo_ppo_config = dict( ), policy=dict( cuda=False, - continuous=False, + action_space='discrete', model=dict( obs_shape=2, action_shape=2, + action_space='discrete', encoder_hidden_size_list=[32, 32], critic_head_hidden_size=32, actor_head_hidden_size=32, diff --git a/dizoo/minigrid/config/minigrid_ngu_config.py b/dizoo/minigrid/config/minigrid_ngu_config.py index 5f1ccf4d2ce5b88efc01c4119a5e5ed3b1f79e7e..a1601a7d82c9310edc22b509fd7641433833d08e 100644 --- a/dizoo/minigrid/config/minigrid_ngu_config.py +++ b/dizoo/minigrid/config/minigrid_ngu_config.py @@ -4,12 +4,10 @@ from easydict import EasyDict from ding.entry import serial_pipeline_reward_model_ngu print(torch.cuda.is_available(), torch.__version__) -collector_env_num = 32 #TODO +collector_env_num = 32 evaluator_env_num = 5 nstep = 5 -minigrid_ppo_rnd_config = dict( - # exp_name='debug_minigrid_empty8_ngu_n5_bs2_ul98_erbm1', - # exp_name='debug_minigrid_fourrooms_ngu_er01_rbs5e4_n32', +minigrid_ppo_ngu_config = dict( exp_name='debug_minigrid_doorkey_ngu_ul298_er01_rbs3e4_n32', env=dict( collector_env_num=collector_env_num, @@ -47,8 +45,6 @@ minigrid_ppo_rnd_config = dict( type='episodic', ), policy=dict( - continuous=False, - on_policy=False, cuda=True, priority=True, priority_IS_weight=True, @@ -83,7 +79,7 @@ minigrid_ppo_rnd_config = dict( ), replay_buffer=dict( replay_buffer_size=30000, - # (Float type) How much prioritization is used: 0 means no prioritization while 1 means full prioritization + # (Float type) How much prioritization is used: 0 means no prioritization while 1 means full alpha=0.6, # (Float type) How much correction is used: 0 means no correction while 1 means full correction beta=0.4, @@ -91,9 +87,9 @@ minigrid_ppo_rnd_config = dict( ), ), ) -minigrid_ppo_rnd_config = EasyDict(minigrid_ppo_rnd_config) -main_config = minigrid_ppo_rnd_config -minigrid_ppo_rnd_create_config = dict( +minigrid_ppo_ngu_config = EasyDict(minigrid_ppo_ngu_config) +main_config = minigrid_ppo_ngu_config +minigrid_ppo_ngu_create_config = dict( env=dict( type='minigrid', import_names=['dizoo.minigrid.envs.minigrid_env'], @@ -105,8 +101,8 @@ minigrid_ppo_rnd_create_config = dict( episodic_reward_model=dict(type='episodic'), collector=dict(type='sample_ngu', ) ) -minigrid_ppo_rnd_create_config = EasyDict(minigrid_ppo_rnd_create_config) -create_config = minigrid_ppo_rnd_create_config +minigrid_ppo_ngu_create_config = EasyDict(minigrid_ppo_ngu_create_config) +create_config = minigrid_ppo_ngu_create_config if __name__ == "__main__": serial_pipeline_reward_model_ngu([main_config, create_config], seed=0) diff --git a/dizoo/minigrid/config/minigrid_onppo_config.py b/dizoo/minigrid/config/minigrid_onppo_config.py index 19e0073bf2ce9edcfe963e4e1264b0b3a3b5a3e7..73f270e0767429cd54957e2c83c821ce1b0c9ec1 100644 --- a/dizoo/minigrid/config/minigrid_onppo_config.py +++ 
b/dizoo/minigrid/config/minigrid_onppo_config.py @@ -2,10 +2,7 @@ from easydict import EasyDict from ding.entry import serial_pipeline_onpolicy collector_env_num = 8 minigrid_ppo_config = dict( - # exp_name="minigrid_empty8_onppo", exp_name="minigrid_fourrooms_onppo", - # exp_name="minigrid_doorkey88_onppo", - # exp_name="minigrid_doorkey_onppo", env=dict( collector_env_num=8, evaluator_env_num=5, @@ -19,11 +16,11 @@ minigrid_ppo_config = dict( policy=dict( cuda=True, recompute_adv=True, - continuous=False, - on_policy=True, + action_space='discrete', model=dict( obs_shape=2739, action_shape=7, + action_space='discrete', encoder_hidden_size_list=[256, 128, 64, 64], ), learn=dict( diff --git a/dizoo/minigrid/config/minigrid_rnd_onppo_config.py b/dizoo/minigrid/config/minigrid_rnd_onppo_config.py index d96b8f51004d7527e333ad3b636f62c89a5db4a2..8b24f3484e815b0f79e7fdf5bec56b7e2bc07359 100644 --- a/dizoo/minigrid/config/minigrid_rnd_onppo_config.py +++ b/dizoo/minigrid/config/minigrid_rnd_onppo_config.py @@ -4,12 +4,7 @@ import torch print(torch.__version__, torch.cuda.is_available()) collector_env_num = 8 minigrid_ppo_rnd_config = dict( - # exp_name='minigrid_empty8_rnd_onppo_b01_weight1000_maxlen100', - # exp_name='minigrid_fourrooms_rnd_onppo_b01_weight1000_maxlen100', exp_name='minigrid_doorkey88_rnd_onppo_b01_weight1000_maxlen300', - # exp_name='minigrid_doorkey_rnd_onppo_b01_weight1000_maxlen300', - # exp_name='minigrid_kcs3r3_rnd_onppo_b01', - # exp_name='minigrid_om2dlh_rnd_onppo_b01', env=dict( collector_env_num=collector_env_num, evaluator_env_num=5, @@ -38,17 +33,17 @@ minigrid_ppo_rnd_config = dict( policy=dict( recompute_adv=True, cuda=True, - continuous=False, - on_policy=True, + action_space='discrete', model=dict( obs_shape=2739, action_shape=7, + action_space='discrete', encoder_hidden_size_list=[256, 128, 64, 64], critic_head_hidden_size=64, # default=64 actor_head_hidden_size=64, ), learn=dict( - epoch_per_collect=10, # TODO(pu) + epoch_per_collect=10, update_per_collect=1, # 4 batch_size=320, # 64, learning_rate=3e-4, @@ -60,7 +55,7 @@ minigrid_ppo_rnd_config = dict( ), collect=dict( collector_env_num=collector_env_num, - n_sample=int(3200), + n_sample=3200, # here self.traj_length = 3200//8 = 400, because in minigrid env the max_length is 300. 
# in ding/worker/collector/sample_serial_collector.py # self._traj_len = max( diff --git a/dizoo/mujoco/config/ant_onppo_default_config.py b/dizoo/mujoco/config/ant_onppo_default_config.py index a1d9e4cbbd1f03af34ec612cc2d9a33dc0894d01..2cc72b6f07fecf1fa982dedf0be40c2281534db7 100644 --- a/dizoo/mujoco/config/ant_onppo_default_config.py +++ b/dizoo/mujoco/config/ant_onppo_default_config.py @@ -4,7 +4,8 @@ from ding.entry import serial_pipeline_onpolicy collector_env_num = 1 evaluator_env_num = 1 ant_ppo_default_config = dict( - exp_name="result_mujoco/ant_onppo_noig", + exp_name="result_mujoco_para2/ant_onppo_noig_para2_seed0", + # exp_name="result_mujoco_para2/ant_onppo_ig_para2", env=dict( env_id='Ant-v3', norm_obs=dict(use_norm=False, ), @@ -18,33 +19,37 @@ ant_ppo_default_config = dict( policy=dict( cuda=True, recompute_adv=True, - continuous=True, - on_policy=True, + action_space='continuous', model=dict( - continuous=True, + action_space='continuous', obs_shape=111, action_shape=8, ), learn=dict( epoch_per_collect=10, update_per_collect=1, - batch_size=64, + batch_size=320, learning_rate=3e-4, - value_weight=0.25, - entropy_weight=0, + value_weight=0.5, + entropy_weight=0.001, clip_ratio=0.2, adv_norm=True, value_norm=True, + # for onppo, when we recompute adv, we need the key done in data to split traj, so we must + # use ignore_done=False here, + # but when we add key traj_flag in data as the backup for key done, we could choose to use ignore_done=True + # for halfcheetah, the length=1000 + # ignore_done=True, ignore_done=False, grad_clip_type='clip_norm', grad_clip_value=0.5, ), collect=dict( collector_env_num=collector_env_num, - n_sample=2048, + n_sample=3200, unroll_len=1, discount_factor=0.99, - gae_lambda=0.97, + gae_lambda=0.95, ), eval=dict(evaluator=dict(eval_freq=5000, )), ), diff --git a/dizoo/mujoco/config/ant_onppo_default_config_para2.py b/dizoo/mujoco/config/ant_onppo_default_config_para2.py deleted file mode 100644 index 2798c12ac9ef09ebc1552eb32a3cbbda7bd03e43..0000000000000000000000000000000000000000 --- a/dizoo/mujoco/config/ant_onppo_default_config_para2.py +++ /dev/null @@ -1,74 +0,0 @@ -from easydict import EasyDict -from ding.entry import serial_pipeline_onpolicy - -collector_env_num = 1 -evaluator_env_num = 1 -ant_ppo_default_config = dict( - exp_name="result_mujoco_para2/ant_onppo_noig_para2_seed0", - # exp_name="result_mujoco_para2/ant_onppo_ig_para2", - env=dict( - env_id='Ant-v3', - norm_obs=dict(use_norm=False, ), - norm_reward=dict(use_norm=False, ), - collector_env_num=collector_env_num, - evaluator_env_num=evaluator_env_num, - use_act_scale=True, - n_evaluator_episode=10, - stop_value=6000, - ), - policy=dict( - cuda=True, - recompute_adv=True, - continuous=True, - on_policy=True, - model=dict( - continuous=True, - obs_shape=111, - action_shape=8, - ), - learn=dict( - epoch_per_collect=10, - update_per_collect=1, - batch_size=320, - learning_rate=3e-4, - value_weight=0.5, - entropy_weight=0.001, - clip_ratio=0.2, - adv_norm=True, - value_norm=True, - # for onppo, when we recompute adv, we need the key done in data to split traj, so we must use ignore_done=False here, - # but when we add key traj_flag in data as the backup for key done, we could choose to use ignore_done=True - # for halfcheetah, the length=1000 - # ignore_done=True, - ignore_done=False, - grad_clip_type='clip_norm', - grad_clip_value=0.5, - - ), - collect=dict( - collector_env_num=collector_env_num, - n_sample=3200, - unroll_len=1, - discount_factor=0.99, - gae_lambda=0.95, - ), - 
eval=dict(evaluator=dict(eval_freq=5000, )), - ), -) -ant_ppo_default_config = EasyDict(ant_ppo_default_config) -main_config = ant_ppo_default_config - -ant_ppo_create_default_config = dict( - env=dict( - type='mujoco', - import_names=['dizoo.mujoco.envs.mujoco_env'], - ), - # env_manager=dict(type='subprocess'), - env_manager=dict(type='base'), - policy=dict(type='ppo', ), -) -ant_ppo_create_default_config = EasyDict(ant_ppo_create_default_config) -create_config = ant_ppo_create_default_config - -if __name__ == "__main__": - serial_pipeline_onpolicy([main_config, create_config], seed=0) diff --git a/dizoo/mujoco/config/halfcheetah_onppo_default_config.py b/dizoo/mujoco/config/halfcheetah_onppo_default_config.py index e95d955143441cd16f5af09c6326b56a72be47e2..43a9a84fe8e9312da2871296532d36db3bdf9ed6 100644 --- a/dizoo/mujoco/config/halfcheetah_onppo_default_config.py +++ b/dizoo/mujoco/config/halfcheetah_onppo_default_config.py @@ -4,9 +4,7 @@ from ding.entry import serial_pipeline_onpolicy collector_env_num = 1 evaluator_env_num = 1 halfcheetah_ppo_default_config = dict( - exp_name="Halfcheetah_onppo", - # exp_name="debug/debug_halfcheetah_onppo_ig", - + exp_name="halfcheetah_onppo", env=dict( env_id='HalfCheetah-v3', norm_obs=dict(use_norm=False, ), @@ -15,44 +13,41 @@ halfcheetah_ppo_default_config = dict( evaluator_env_num=evaluator_env_num, use_act_scale=True, n_evaluator_episode=10, - # n_evaluator_episode=1, stop_value=12000, ), policy=dict( cuda=True, recompute_adv=True, - continuous=True, - on_policy=True, + action_space='continuous', model=dict( - continuous=True, + action_space='continuous', obs_shape=17, action_shape=6, ), learn=dict( - epoch_per_collect=10,#10, + epoch_per_collect=10, update_per_collect=1, - batch_size=64,#320, + batch_size=320, learning_rate=3e-4, - value_weight=0.25,#0.5, - entropy_weight=0,#0.001, + value_weight=0.5, + entropy_weight=0.001, clip_ratio=0.2, adv_norm=True, value_norm=True, - # for onppo, when we recompute adv, we need the key done in data to split traj, so we must use ignore_done=False here, + # for onppo, when we recompute adv, we need the key done in data to split traj, so we must + # use ignore_done=False here, # but when we add key traj_flag in data as the backup for key done, we could choose to use ignore_done=True # for halfcheetah, the length=1000 - # ignore_done=True, - ignore_done=False, + ignore_done=True, grad_clip_type='clip_norm', grad_clip_value=0.5, - ), collect=dict( collector_env_num=collector_env_num, - n_sample=2048,#3200, + n_sample=3200, unroll_len=1, discount_factor=0.99, - gae_lambda=0.97,#0.95, + gae_lambda=0.95, ), eval=dict(evaluator=dict(eval_freq=5000, )), ), @@ -65,12 +60,11 @@ halfcheetah_ppo_create_default_config = dict( type='mujoco', import_names=['dizoo.mujoco.envs.mujoco_env'], ), - # env_manager=dict(type='subprocess'), - env_manager=dict(type='base'), + env_manager=dict(type='subprocess'), policy=dict(type='ppo', ), ) halfcheetah_ppo_create_default_config = EasyDict(halfcheetah_ppo_create_default_config) create_config = halfcheetah_ppo_create_default_config if __name__ == "__main__": - serial_pipeline_onpolicy([main_config, create_config], seed=0) + serial_pipeline_onpolicy([main_config, create_config], seed=1) diff --git a/dizoo/mujoco/config/halfcheetah_onppo_default_config_para2.py b/dizoo/mujoco/config/halfcheetah_onppo_default_config_para2.py deleted file mode 100644 index 735dc5a0d1761006deb1b0d43d0af57218e249e4..0000000000000000000000000000000000000000 --- 
a/dizoo/mujoco/config/halfcheetah_onppo_default_config_para2.py +++ /dev/null @@ -1,75 +0,0 @@ -from easydict import EasyDict -from ding.entry import serial_pipeline_onpolicy - -collector_env_num = 1 -evaluator_env_num = 1 -halfcheetah_ppo_default_config = dict( - # exp_name="result_mujoco_para2/halfcheetah_onppo_noig_para2", - exp_name="result_mujoco_para2/halfcheetah_onppo_ig_para2_seed1", - env=dict( - env_id='HalfCheetah-v3', - norm_obs=dict(use_norm=False, ), - norm_reward=dict(use_norm=False, ), - collector_env_num=collector_env_num, - evaluator_env_num=evaluator_env_num, - use_act_scale=True, - n_evaluator_episode=10, - # n_evaluator_episode=1, - stop_value=12000, - ), - policy=dict( - cuda=True, - recompute_adv=True, - continuous=True, - on_policy=True, - model=dict( - continuous=True, - obs_shape=17, - action_shape=6, - ), - learn=dict( - epoch_per_collect=10, - update_per_collect=1, - batch_size=320, - learning_rate=3e-4, - value_weight=0.5, - entropy_weight=0.001, - clip_ratio=0.2, - adv_norm=True, - value_norm=True, - # for onppo, when we recompute adv, we need the key done in data to split traj, so we must use ignore_done=False here, - # but when we add key traj_flag in data as the backup for key done, we could choose to use ignore_done=True - # for halfcheetah, the length=1000 - ignore_done=True, - # ignore_done=False, - grad_clip_type='clip_norm', - grad_clip_value=0.5, - - ), - collect=dict( - collector_env_num=collector_env_num, - n_sample=3200, - unroll_len=1, - discount_factor=0.99, - gae_lambda=0.95, - ), - eval=dict(evaluator=dict(eval_freq=5000, )), - ), -) -halfcheetah_ppo_default_config = EasyDict(halfcheetah_ppo_default_config) -main_config = halfcheetah_ppo_default_config - -halfcheetah_ppo_create_default_config = dict( - env=dict( - type='mujoco', - import_names=['dizoo.mujoco.envs.mujoco_env'], - ), - env_manager=dict(type='subprocess'), - # env_manager=dict(type='base'), - policy=dict(type='ppo', ), -) -halfcheetah_ppo_create_default_config = EasyDict(halfcheetah_ppo_create_default_config) -create_config = halfcheetah_ppo_create_default_config - -if __name__ == "__main__": - serial_pipeline_onpolicy([main_config, create_config], seed=1) diff --git a/dizoo/mujoco/config/hopper_gcl_config.py b/dizoo/mujoco/config/hopper_gcl_config.py index a0553731d544884619f58dbd1773a30e76298d94..555443e484712916cfa0097de92f921a1c48b6a8 100644 --- a/dizoo/mujoco/config/hopper_gcl_config.py +++ b/dizoo/mujoco/config/hopper_gcl_config.py @@ -25,12 +25,12 @@ hopper_gcl_default_config = dict( policy=dict( cuda=False, recompute_adv=True, + action_space='continuous', model=dict( obs_shape=11, action_shape=3, - continuous=True, + action_space='continuous', ), - continuous=True, learn=dict( update_per_collect=10, batch_size=64, @@ -59,10 +59,7 @@ hopper_gcl_create_default_config = dict( import_names=['dizoo.mujoco.envs.mujoco_env'], ), env_manager=dict(type='base'), - policy=dict( - type='ppo', - import_names=['ding.policy.ppo'], - ), + policy=dict(type='ppo', ), reward_model=dict(type='guided_cost'), ) hopper_gcl_create_default_config = EasyDict(hopper_gcl_create_default_config) diff --git a/dizoo/mujoco/config/hopper_onppo_default_config.py b/dizoo/mujoco/config/hopper_onppo_default_config.py index 5dc646c52939517af95ee1071802c0b63b8e8ae8..828d4e7b0cacfad2d65e6a618c51751340d26a15 100644 --- a/dizoo/mujoco/config/hopper_onppo_default_config.py +++ b/dizoo/mujoco/config/hopper_onppo_default_config.py @@ -16,12 +16,12 @@ hopper_ppo_default_config = dict( policy=dict( cuda=True, 
recompute_adv=True, + action_space='continuous', model=dict( obs_shape=11, action_shape=3, - continuous=True, + action_space='continuous', ), - continuous=True, learn=dict( epoch_per_collect=10, update_per_collect=1, @@ -57,4 +57,4 @@ hopper_ppo_create_default_config = EasyDict(hopper_ppo_create_default_config) create_config = hopper_ppo_create_default_config if __name__ == "__main__": - serial_pipeline_onpolicy([main_config, create_config], seed=0) \ No newline at end of file + serial_pipeline_onpolicy([main_config, create_config], seed=0) diff --git a/dizoo/mujoco/config/walker2d_ddpg_gail_config.py b/dizoo/mujoco/config/walker2d_ddpg_gail_config.py index 2e70219aea6776466f4230b0b98db1a5d4c2e4f6..2c08820b7d3c45372c9a86dba2e6cce7bd220f40 100644 --- a/dizoo/mujoco/config/walker2d_ddpg_gail_config.py +++ b/dizoo/mujoco/config/walker2d_ddpg_gail_config.py @@ -21,7 +21,6 @@ walker2d_ddpg_gail_default_config = dict( update_per_collect=100, expert_data_path='walker2d_ddpg/expert_data_train.pkl', load_path='walker2d_ddpg_gail/reward_model/ckpt/ckpt_best.pth.tar', # state_dict of the reward model - collect_count=100000, ), policy=dict( diff --git a/dizoo/mujoco/config/walker2d_gcl_config.py b/dizoo/mujoco/config/walker2d_gcl_config.py index 537e98055d42d17ce228ac9a2321591739f71cb0..fb68b45dfe66a721bb284a2563334c5f4825a998 100644 --- a/dizoo/mujoco/config/walker2d_gcl_config.py +++ b/dizoo/mujoco/config/walker2d_gcl_config.py @@ -24,12 +24,12 @@ walker_gcl_default_config = dict( policy=dict( cuda=False, recompute_adv=True, + action_space='continuous', model=dict( obs_shape=17, action_shape=6, - continuous=True, + action_space='continuous', ), - continuous=True, learn=dict( update_per_collect=10, batch_size=64, @@ -58,10 +58,7 @@ walker_gcl_create_default_config = dict( import_names=['dizoo.mujoco.envs.mujoco_env'], ), env_manager=dict(type='base'), - policy=dict( - type='ppo', - import_names=['ding.policy.ppo'], - ), + policy=dict(type='ppo', ), replay_buffer=dict(type='naive', ), reward_model=dict(type='guided_cost'), ) diff --git a/dizoo/mujoco/config/walker2d_onppo_default_config.py b/dizoo/mujoco/config/walker2d_onppo_default_config.py index adca526178c77b7bed1001b34e2a0fcf90e19868..102ac1ea6e05e7e7f056d211943442ec80df27a0 100644 --- a/dizoo/mujoco/config/walker2d_onppo_default_config.py +++ b/dizoo/mujoco/config/walker2d_onppo_default_config.py @@ -4,7 +4,8 @@ from ding.entry import serial_pipeline_onpolicy collector_env_num = 1 evaluator_env_num = 1 walker2d_ppo_default_config = dict( - exp_name="result_mujoco/wlker2d_onppo_noig", + # exp_name="result_mujoco_para2/wlker2d_onppo_noig_para2_seed1", + # exp_name="result_mujoco_para2/wlker2d_onppo_ig_para2_seed1", env=dict( env_id='Walker2d-v3', norm_obs=dict(use_norm=False, ), @@ -18,24 +19,24 @@ walker2d_ppo_default_config = dict( policy=dict( cuda=True, recompute_adv=True, - continuous=True, - on_policy=True, + action_space='continuous', model=dict( - continuous=True, + action_space='continuous', obs_shape=17, action_shape=6, ), learn=dict( - epoch_per_collect=10, + epoch_per_collect=10, update_per_collect=1, - batch_size=64, + batch_size=320, learning_rate=3e-4, - value_weight=0.25, - entropy_weight=0, + value_weight=0.5, + entropy_weight=0.001, clip_ratio=0.2, adv_norm=True, value_norm=True, - # for onppo, when we recompute adv, we need the key done in data to split traj, so we must use ignore_done=False here, + # for onppo, when we recompute adv, we need the key done in data to split traj, so we must + # use ignore_done=False here, # but 
when we add key traj_flag in data as the backup for key done, we could choose to use ignore_done=True # for halfcheetah, the length=1000 # ignore_done=True, @@ -45,10 +46,10 @@ walker2d_ppo_default_config = dict( ), collect=dict( collector_env_num=collector_env_num, - n_sample=2048, + n_sample=3200, unroll_len=1, discount_factor=0.99, - gae_lambda=0.97, + gae_lambda=0.95, ), eval=dict(evaluator=dict(eval_freq=5000, )), ), @@ -69,4 +70,4 @@ walker2d_ppo_create_default_config = EasyDict(walker2d_ppo_create_default_config create_config = walker2d_ppo_create_default_config if __name__ == "__main__": - serial_pipeline_onpolicy([main_config, create_config], seed=0) + serial_pipeline_onpolicy([main_config, create_config], seed=1) diff --git a/dizoo/mujoco/config/walker2d_onppo_default_config_para2.py b/dizoo/mujoco/config/walker2d_onppo_default_config_para2.py deleted file mode 100644 index d5cf1ac311d603bf1ab9707f24ad892a11d513d8..0000000000000000000000000000000000000000 --- a/dizoo/mujoco/config/walker2d_onppo_default_config_para2.py +++ /dev/null @@ -1,73 +0,0 @@ -from easydict import EasyDict -from ding.entry import serial_pipeline_onpolicy - -collector_env_num = 1 -evaluator_env_num = 1 -walker2d_ppo_default_config = dict( - # exp_name="result_mujoco_para2/wlker2d_onppo_noig_para2_seed1", - # exp_name="result_mujoco_para2/wlker2d_onppo_ig_para2_seed1", - env=dict( - env_id='Walker2d-v3', - norm_obs=dict(use_norm=False, ), - norm_reward=dict(use_norm=False, ), - collector_env_num=collector_env_num, - evaluator_env_num=evaluator_env_num, - use_act_scale=True, - n_evaluator_episode=10, - stop_value=6000, - ), - policy=dict( - cuda=True, - recompute_adv=True, - continuous=True, - on_policy=True, - model=dict( - continuous=True, - obs_shape=17, - action_shape=6, - ), - learn=dict( - epoch_per_collect=10, - update_per_collect=1, - batch_size=320, - learning_rate=3e-4, - value_weight=0.5, - entropy_weight=0.001, - clip_ratio=0.2, - adv_norm=True, - value_norm=True, - # for onppo, when we recompute adv, we need the key done in data to split traj, so we must use ignore_done=False here, - # but when we add key traj_flag in data as the backup for key done, we could choose to use ignore_done=True - # for halfcheetah, the length=1000 - # ignore_done=True, - ignore_done=False, - grad_clip_type='clip_norm', - grad_clip_value=0.5, - ), - collect=dict( - collector_env_num=collector_env_num, - n_sample=3200, - unroll_len=1, - discount_factor=0.99, - gae_lambda=0.95, - ), - eval=dict(evaluator=dict(eval_freq=5000, )), - ), -) -walker2d_ppo_default_config = EasyDict(walker2d_ppo_default_config) -main_config = walker2d_ppo_default_config - -walker2d_ppo_create_default_config = dict( - env=dict( - type='mujoco', - import_names=['dizoo.mujoco.envs.mujoco_env'], - ), - # env_manager=dict(type='subprocess'), - env_manager=dict(type='base'), - policy=dict(type='ppo', ), -) -walker2d_ppo_create_default_config = EasyDict(walker2d_ppo_create_default_config) -create_config = walker2d_ppo_create_default_config - -if __name__ == "__main__": - serial_pipeline_onpolicy([main_config, create_config], seed=1) diff --git a/dizoo/multiagent_particle/config/cooperative_navigation_mappo_config.py b/dizoo/multiagent_particle/config/cooperative_navigation_mappo_config.py index 9a69fe8aa30619c60794832ced06ebe88ea01f16..81fc4aadf52e2a88764b81ab2d23b7231619bbca 100644 --- a/dizoo/multiagent_particle/config/cooperative_navigation_mappo_config.py +++ b/dizoo/multiagent_particle/config/cooperative_navigation_mappo_config.py @@ -20,8 +20,9 
@@ main_config = dict( policy=dict( cuda=False, multi_agent=True, - continuous=False, + action_space='discrete', model=dict( + action_space='discrete', agent_num=n_agent, agent_obs_shape=2 + 2 + (n_agent - 1) * 2 + num_landmarks * 2, global_obs_shape=n_agent * 2 + num_landmarks * 2 + n_agent * 2, diff --git a/dizoo/overcooked/config/overcooked_demo_ppo_config.py b/dizoo/overcooked/config/overcooked_demo_ppo_config.py index 9a175917cd2784b4be019758d9e74b2e5eef53d3..207e870ea66013b1da324a106139c5846ca0e607 100644 --- a/dizoo/overcooked/config/overcooked_demo_ppo_config.py +++ b/dizoo/overcooked/config/overcooked_demo_ppo_config.py @@ -11,22 +11,21 @@ overcooked_league_demo_ppo_config = dict( ), policy=dict( cuda=False, - continuous=False, recompute_adv=True, + action_space='discrete', model=dict( obs_shape=[5, 4, 26], action_shape=6, share_encoder=False, + action_space='discrete', ), learn=dict( - update_per_collect=4, + epoch_per_collect=4, batch_size=128, learning_rate=0.001, value_weight=0.5, entropy_weight=0.01, clip_ratio=0.2, - nstep=1, - nstep_return=False, adv_norm=True, value_norm=True, ), diff --git a/dizoo/procgen/coinrun/entry/coinrun_ppo_config.py b/dizoo/procgen/coinrun/entry/coinrun_ppo_config.py index b2f10c72595c9750205e3ff96bcbb27250edafdc..83052ec67c83a9f0a6ed095d30ad15d0213fe667 100644 --- a/dizoo/procgen/coinrun/entry/coinrun_ppo_config.py +++ b/dizoo/procgen/coinrun/entry/coinrun_ppo_config.py @@ -10,8 +10,10 @@ coinrun_ppo_default_config = dict( ), policy=dict( cuda=False, + action_space='discrete', model=dict( obs_shape=[3, 64, 64], + action_space='discrete', action_shape=15, encoder_hidden_size_list=[32, 32, 64], ), @@ -34,7 +36,6 @@ coinrun_ppo_default_config = dict( ), replay_buffer=dict(replay_buffer_size=100000, ), ), - cuda=True, ), ) coinrun_ppo_default_config = EasyDict(coinrun_ppo_default_config) diff --git a/dizoo/procgen/maze/entry/maze_dqn_config.py b/dizoo/procgen/maze/entry/maze_dqn_config.py index e677e591319b2c4e87c8cd8ba587a5956de1fb2e..432adcdce442c30d61620a0dd27a776f843c707f 100644 --- a/dizoo/procgen/maze/entry/maze_dqn_config.py +++ b/dizoo/procgen/maze/entry/maze_dqn_config.py @@ -34,7 +34,6 @@ maze_dqn_default_config = dict( ), replay_buffer=dict(replay_buffer_size=100000, ), ), - cuda=True, ), ) maze_dqn_default_config = EasyDict(maze_dqn_default_config) diff --git a/dizoo/procgen/maze/entry/maze_ppo_config.py b/dizoo/procgen/maze/entry/maze_ppo_config.py index 6e0ad658be46a1a52e7e1fa61a1500d4887490dc..c6b3cc1d7d580d2ebb2150171b6a2264714cc486 100644 --- a/dizoo/procgen/maze/entry/maze_ppo_config.py +++ b/dizoo/procgen/maze/entry/maze_ppo_config.py @@ -11,9 +11,11 @@ maze_ppo_default_config = dict( ), policy=dict( cuda=False, + action_space='discrete', model=dict( obs_shape=[3, 64, 64], action_shape=15, + action_space='discrete', encoder_hidden_size_list=[32, 32, 64], ), learn=dict( diff --git a/dizoo/pybullet/config/hopper_ppo_default_config.py b/dizoo/pybullet/config/hopper_ppo_default_config.py index e60416b9d3b3451ce5e8963e0e28a1a39a73422f..5d42840784acfa587cf8efc7ffae38520ac15e71 100644 --- a/dizoo/pybullet/config/hopper_ppo_default_config.py +++ b/dizoo/pybullet/config/hopper_ppo_default_config.py @@ -14,12 +14,12 @@ hopper_ppo_default_config = dict( policy=dict( cuda=True, recompute_adv=True, + action_space='continuous', model=dict( obs_shape=11, action_shape=3, - continuous=True, + action_space='continuous', ), - continuous=True, learn=dict( epoch_per_collect=10, batch_size=64, diff --git 
a/dizoo/slime_volley/config/slime_volley_league_ppo_config.py b/dizoo/slime_volley/config/slime_volley_league_ppo_config.py deleted file mode 100644 index d48ca085d04d870b2d529229f4f1f65aa854a509..0000000000000000000000000000000000000000 --- a/dizoo/slime_volley/config/slime_volley_league_ppo_config.py +++ /dev/null @@ -1,78 +0,0 @@ -from easydict import EasyDict - -slime_volley_league_ppo_config = dict( - exp_name="slime_volley_league_ppo", - env=dict( - collector_env_num=8, - evaluator_env_num=10, - n_evaluator_episode=100, - stop_value=0, - # Single-agent env for evaluator; Double-agent env for collector. - # Should be assigned True or False in code. - is_evaluator=None, - manager=dict(shared_memory=False, ), - env_id="SlimeVolley-v0", - ), - policy=dict( - cuda=False, - continuous=False, - model=dict( - obs_shape=12, - action_shape=6, - encoder_hidden_size_list=[32, 32], - critic_head_hidden_size=32, - actor_head_hidden_size=32, - share_encoder=False, - ), - learn=dict( - update_per_collect=3, - batch_size=32, - learning_rate=0.00001, - value_weight=0.5, - entropy_weight=0.0, - clip_ratio=0.2, - ), - collect=dict( - n_episode=128, unroll_len=1, discount_factor=1.0, gae_lambda=1.0, collector=dict(get_train_sample=True, ) - ), - other=dict( - league=dict( - player_category=['default'], - path_policy="slime_volley_league_ppo/policy", - active_players=dict( - main_player=1, - main_exploiter=1, - league_exploiter=1, - ), - main_player=dict( - one_phase_step=200, - branch_probs=dict( - pfsp=0.5, - sp=1.0, - ), - strong_win_rate=0.7, - ), - main_exploiter=dict( - one_phase_step=200, - branch_probs=dict(main_players=1.0, ), - strong_win_rate=0.7, - min_valid_win_rate=0.3, - ), - league_exploiter=dict( - one_phase_step=200, - branch_probs=dict(pfsp=1.0, ), - strong_win_rate=0.7, - mutate_prob=0.0, - ), - use_pretrain=False, - use_pretrain_init_historical=False, - payoff=dict( - type='battle', - decay=0.99, - min_win_rate_games=8, - ) - ), - ), - ), -) -slime_volley_league_ppo_config = EasyDict(slime_volley_league_ppo_config) diff --git a/dizoo/slime_volley/config/slime_volley_ppo_config.py b/dizoo/slime_volley/config/slime_volley_ppo_config.py index fbe854181f889ff3dfe24b48f6a120f1b15bbbca..38b90318c9fd9d24b0686f5a6b68d2cc75246281 100644 --- a/dizoo/slime_volley/config/slime_volley_ppo_config.py +++ b/dizoo/slime_volley/config/slime_volley_ppo_config.py @@ -13,10 +13,11 @@ slime_volley_ppo_config = dict( ), policy=dict( cuda=True, - continuous=False, + action_space='discrete', model=dict( obs_shape=12, action_shape=6, + action_space='discrete', encoder_hidden_size_list=[64, 64], critic_head_hidden_size=64, actor_head_hidden_size=64, diff --git a/dizoo/smac/config/smac_3s5z_mappo_config.py b/dizoo/smac/config/smac_3s5z_mappo_config.py index f5f57d940ee7f60020259e9097628171f6597da4..609a99223ac17045da6d6177fdea2c4497e65607 100644 --- a/dizoo/smac/config/smac_3s5z_mappo_config.py +++ b/dizoo/smac/config/smac_3s5z_mappo_config.py @@ -31,7 +31,7 @@ main_config = dict( policy=dict( cuda=True, multi_agent=True, - continuous=False, + action_space='discrete', model=dict( # (int) agent_num: The number of the agent. # For SMAC 3s5z, agent_num=8; for 2c_vs_64zg, agent_num=2. 
@@ -49,6 +49,7 @@ main_config = dict( action_shape=14, # (List[int]) The size of hidden layer # hidden_size_list=[64], + action_space='discrete' ), # used in state_num of hidden_state learn=dict( diff --git a/dizoo/smac/config/smac_5m6m_mappo_config.py b/dizoo/smac/config/smac_5m6m_mappo_config.py index 5d542a14a2a47a875a1fce39ec9d5013f03e276f..e7a3600087209e7bf3c51538ebfb021590ce3705 100644 --- a/dizoo/smac/config/smac_5m6m_mappo_config.py +++ b/dizoo/smac/config/smac_5m6m_mappo_config.py @@ -30,7 +30,7 @@ main_config = dict( policy=dict( cuda=True, multi_agent=True, - continuous=False, + action_space='discrete', model=dict( # (int) agent_num: The number of the agent. # For SMAC 3s5z, agent_num=8; for 2c_vs_64zg, agent_num=2. @@ -48,6 +48,7 @@ main_config = dict( action_shape=12, # (List[int]) The size of hidden layer # hidden_size_list=[64], + action_space='discrete', ), # used in state_num of hidden_state learn=dict( diff --git a/dizoo/smac/config/smac_MMM2_mappo_config.py b/dizoo/smac/config/smac_MMM2_mappo_config.py index b8d19aa1a4b1649df68d2291ee7c7f89f2d32e4e..e8c63fb3ff7a5c4b90002b2f18e9bf91fd76b934 100644 --- a/dizoo/smac/config/smac_MMM2_mappo_config.py +++ b/dizoo/smac/config/smac_MMM2_mappo_config.py @@ -30,7 +30,7 @@ main_config = dict( policy=dict( cuda=True, multi_agent=True, - continuous=False, + action_space='discrete', model=dict( # (int) agent_num: The number of the agent. # For SMAC 3s5z, agent_num=8; for 2c_vs_64zg, agent_num=2. @@ -47,6 +47,7 @@ main_config = dict( action_shape=18, # (List[int]) The size of hidden layer # hidden_size_list=[64], + action_space='discrete', ), # used in state_num of hidden_state learn=dict( diff --git a/dizoo/smac/config/smac_MMM_mappo_config.py b/dizoo/smac/config/smac_MMM_mappo_config.py index 13e9492f67f95c813d26d3c18067f7083f6bb467..9adc93d96ec0879e273a5585f3f77f38f07df80d 100644 --- a/dizoo/smac/config/smac_MMM_mappo_config.py +++ b/dizoo/smac/config/smac_MMM_mappo_config.py @@ -30,7 +30,7 @@ main_config = dict( policy=dict( cuda=True, multi_agent=True, - continuous=False, + action_space='discrete', model=dict( # (int) agent_num: The number of the agent. # For SMAC 3s5z, agent_num=8; for 2c_vs_64zg, agent_num=2. @@ -48,6 +48,7 @@ main_config = dict( action_shape=16, # (List[int]) The size of hidden layer # hidden_size_list=[64], + action_space='discrete', ), # used in state_num of hidden_state learn=dict(