Commit 65ad2a4e by Hongsheng Zeng (committed by Bo Zhou)

fix PPO bug; add more benchmark result (#47)

* fix PPO bug; add more benchmark result

* refine code

* update benchmark of PPO, after fix bug

* refine code
Parent: 6fdf4448
@@ -53,7 +53,7 @@ class AtariModel(parl.Model):
         return Q
 """
 three steps to build an agent
-1. define a forward model which is critic_model is this example
+1. define a forward model which is critic_model in this example
 2. a. to build a DQN algorithm, just pass the critic_model to `DQN`
    b. to build a DDQN algorithm, just replace DQN in following line with DDQN
 3. define the I/O part in AtariAgent so that it could update the algorithm based on the interactive data
...
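The three steps in the docstring above map onto a short construction sequence. The sketch below is illustrative only: `AtariModel` and `AtariAgent` are the classes defined in this example (the module names in the imports are assumptions), and the `DQN` constructor arguments shown (`act_dim`, `gamma`, `lr`) follow the PARL 1.x style and may differ in other releases.

```python
from parl.algorithms import DQN  # replace DQN with DDQN to build a Double DQN agent

# Assumed to live in this example's directory; file names are illustrative.
from atari_model import AtariModel
from atari_agent import AtariAgent

ACT_DIM = 4              # e.g. number of discrete actions of the chosen Atari game
GAMMA, LR = 0.99, 3e-4   # placeholder hyperparameters

# 1. the forward model (the critic_model of the docstring)
model = AtariModel(act_dim=ACT_DIM)
# 2. wrap the model in the DQN (or DDQN) algorithm
algorithm = DQN(model, act_dim=ACT_DIM, gamma=GAMMA, lr=LR)
# 3. the agent owns the I/O part, feeding interactive data into the algorithm
agent = AtariAgent(algorithm, act_dim=ACT_DIM)
```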
@@ -9,7 +9,8 @@ Please see [here](https://github.com/openai/mujoco-py) to know more about Mujoco
 ### Benchmark result
-<img src=".benchmark/DDPG_HalfCheetah-v2.png" width = "400" height ="300" alt="DDPG_HalfCheetah-v2"/>
+<img src=".benchmark/DDPG_HalfCheetah-v2.png" width = "400" height ="300" alt="DDPG_HalfCheetah-v2"/> <img src=".benchmark/DDPG_Humanoid-v2.png" width = "400" height ="300" alt="DDPG_Humanoid-v2"/>
+<img src=".benchmark/DDPG_Hopper-v2.png" width = "400" height ="300" alt="DDPG_Hopper-v2"/>
 ## How to use
 ### Dependencies:
...
@@ -11,7 +11,7 @@ Please see [here](https://gym.openai.com/envs/#atari) to know more about Atari game
 <img src=".benchmark/DQN_Pong.png" width = "400" height ="300" alt="DQN_Pong" /> <img src=".benchmark/DQN_Breakout.png" width = "400" height ="300" alt="DQN_Breakout"/>
 <br>
-<img src=".benchmark/DQN_BeamRider.png" width = "400" height ="300" alt="DQN_BeamRider"/>
+<img src=".benchmark/DQN_BeamRider.png" width = "400" height ="300" alt="DQN_BeamRider"/> <img src=".benchmark/DQN_SpaceInvaders.png" width = "400" height ="300" alt="DQN_SpaceInvaders"/>
 ## How to use
 ### Dependencies:
...
@@ -12,7 +12,7 @@ Please see [here](https://github.com/openai/mujoco-py) to know more about Mujoco
 ### Benchmark result
-<img src=".benchmark/PPO_HalfCheetah-v2.png" width = "400" height ="300" alt="PPO_HalfCheetah-v2" />
+<img src=".benchmark/PPO_HalfCheetah-v2.png" width = "400" height ="300" alt="PPO_HalfCheetah-v2" /> <img src=".benchmark/PPO_Hopper-v2.png" width = "400" height ="300" alt="PPO_Hopper-v2" />
 ## How to use
 ### Dependencies:
...
@@ -84,18 +84,50 @@ def run_evaluate_episode(env, agent, scaler):
 
 def collect_trajectories(env, agent, scaler, episodes):
-    all_obs, all_actions, all_rewards, all_unscaled_obs = [], [], [], []
+    trajectories, all_unscaled_obs = [], []
     for e in range(episodes):
         obs, actions, rewards, unscaled_obs = run_train_episode(
             env, agent, scaler)
-        all_obs.append(obs)
-        all_actions.append(actions)
-        all_rewards.append(rewards)
+        trajectories.append({
+            'obs': obs,
+            'actions': actions,
+            'rewards': rewards,
+        })
         all_unscaled_obs.append(unscaled_obs)
-    scaler.update(np.concatenate(all_unscaled_obs)
-                  )  # update running statistics for scaling observations
-    return np.concatenate(all_obs), np.concatenate(
-        all_actions), np.concatenate(all_rewards)
+    # update running statistics for scaling observations
+    scaler.update(np.concatenate(all_unscaled_obs))
+    return trajectories
+
+
+def build_train_data(trajectories, agent):
+    train_obs, train_actions, train_advantages, train_discount_sum_rewards = [], [], [], []
+    for trajectory in trajectories:
+        pred_values = agent.value_predict(trajectory['obs'])
+
+        # scale rewards
+        scale_rewards = trajectory['rewards'] * (1 - args.gamma)
+
+        discount_sum_rewards = calc_discount_sum_rewards(
+            scale_rewards, args.gamma).astype('float32')
+
+        advantages = calc_gae(scale_rewards, pred_values, args.gamma, args.lam)
+        # normalize advantages
+        advantages = (advantages - advantages.mean()) / (
+            advantages.std() + 1e-6)
+        advantages = advantages.astype('float32')
+
+        train_obs.append(trajectory['obs'])
+        train_actions.append(trajectory['actions'])
+        train_advantages.append(advantages)
+        train_discount_sum_rewards.append(discount_sum_rewards)
+
+    train_obs = np.concatenate(train_obs)
+    train_actions = np.concatenate(train_actions)
+    train_advantages = np.concatenate(train_advantages)
+    train_discount_sum_rewards = np.concatenate(train_discount_sum_rewards)
+
+    return train_obs, train_actions, train_advantages, train_discount_sum_rewards
 
 
 def main():
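`build_train_data` relies on two helpers, `calc_discount_sum_rewards` and `calc_gae`, which are not part of this diff. The sketch below is illustrative only: it shows the standard recursions such helpers are generally expected to implement (discounted return-to-go and Generalized Advantage Estimation with a zero value after the final step), not necessarily the exact code in this repository.

```python
import numpy as np


def calc_discount_sum_rewards(rewards, gamma):
    """Discounted return-to-go for one trajectory: G_t = r_t + gamma * G_{t+1}."""
    returns = np.zeros(len(rewards), dtype='float64')
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns


def calc_gae(rewards, values, gamma, lam):
    """Generalized Advantage Estimation for one trajectory.

    delta_t = r_t + gamma * V(s_{t+1}) - V(s_t), with V after the last step taken as 0;
    A_t = delta_t + gamma * lam * A_{t+1}.
    """
    next_values = np.append(values[1:], 0.0)
    deltas = rewards + gamma * next_values - values
    return calc_discount_sum_rewards(deltas, gamma * lam)
```

Both recursions assume the arrays end where the episode ends, so the value after the last step is treated as zero. That is why `build_train_data` computes returns and advantages per trajectory before concatenating; the previous code concatenated all episodes first, letting discounted sums and GAE leak across episode boundaries. The `rewards * (1 - args.gamma)` scaling keeps the discounted sums on roughly the same scale as single-step rewards, since a constant reward r sums to about r / (1 - gamma) under discounting.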
@@ -123,33 +155,22 @@ def main():
     test_flag = 0
     total_steps = 0
     while total_steps < args.train_total_steps:
-        obs, actions, rewards = collect_trajectories(
+        trajectories = collect_trajectories(
             env, agent, scaler, episodes=args.episodes_per_batch)
-        total_steps += obs.shape[0]
-
-        pred_values = agent.value_predict(obs)
-
-        # scale rewards
-        scale_rewards = rewards * (1 - args.gamma)
-
-        discount_sum_rewards = calc_discount_sum_rewards(
-            scale_rewards, args.gamma)
-        discount_sum_rewards = discount_sum_rewards.astype('float32')
-
-        advantages = calc_gae(scale_rewards, pred_values, args.gamma, args.lam)
-        # normalize advantages
-        advantages = (advantages - advantages.mean()) / (
-            advantages.std() + 1e-6)
-        advantages = advantages.astype('float32')
-
-        policy_loss, kl = agent.policy_learn(obs, actions, advantages)
-        value_loss = agent.value_learn(obs, discount_sum_rewards)
+        total_steps += sum([t['obs'].shape[0] for t in trajectories])
+        total_train_rewards = sum([np.sum(t['rewards']) for t in trajectories])
+
+        train_obs, train_actions, train_advantages, train_discount_sum_rewards = build_train_data(
+            trajectories, agent)
+
+        policy_loss, kl = agent.policy_learn(train_obs, train_actions,
+                                             train_advantages)
+        value_loss = agent.value_learn(train_obs, train_discount_sum_rewards)
 
         logger.info(
             'Steps {}, Train reward: {}, Policy loss: {}, KL: {}, Value loss: {}'
-            .format(total_steps,
-                    np.sum(rewards) / args.episodes_per_batch, policy_loss, kl,
-                    value_loss))
+            .format(total_steps, total_train_rewards / args.episodes_per_batch,
+                    policy_loss, kl, value_loss))
         if total_steps // args.test_every_steps >= test_flag:
             while total_steps // args.test_every_steps >= test_flag:
                 test_flag += 1
...
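To see the effect of the per-trajectory change in isolation, here is a small self-contained check using the same standard GAE recursion sketched above (the numbers are made up). Computing advantages over two concatenated episodes lets the last step of the first episode bootstrap from the first value prediction of the second episode, while per-episode computation treats it as terminal:

```python
import numpy as np


def gae(rewards, values, gamma=0.99, lam=0.95):
    # standard GAE recursion with a zero value after the last step
    deltas = rewards + gamma * np.append(values[1:], 0.0) - values
    advantages, running = np.zeros_like(deltas), 0.0
    for t in reversed(range(len(deltas))):
        running = deltas[t] + gamma * lam * running
        advantages[t] = running
    return advantages


r1, v1 = np.array([1.0, 1.0]), np.array([0.5, 0.4])  # episode 1 (made-up numbers)
r2, v2 = np.array([1.0, 1.0]), np.array([9.0, 0.3])  # episode 2 starts with a high predicted value

per_episode = np.concatenate([gae(r1, v1), gae(r2, v2)])
concatenated = gae(np.concatenate([r1, r2]), np.concatenate([v1, v2]))

print(per_episode[1])   # last step of episode 1, episode 2 ignored
print(concatenated[1])  # same step, but wrongly bootstrapped from v2[0] = 9.0
```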