Error when running DDQN
Created by: familyld
Version info: paddlepaddle==1.6.2 parl==1.3.1
The problem appears when the algorithm is DDQN, but it goes away as soon as I switch to DQN (changing only two lines: the import and the algorithm instantiation).
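For reference, a sketch of the two-line swap described above (the rest of the script is unchanged):
from parl.algorithms import DQN  # instead of: from parl.algorithms import DDQN
alg = DQN(model, act_dim=act_dim, gamma=opt.gamma, lr=opt.lr)  # instead of: DDQN(...)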
The error message is as follows:
D:\Anaconda3\lib\site-packages\paddle\fluid\executor.py:779: UserWarning: The following exception is not an EOF exception.
"The following exception is not an EOF exception.")
Traceback (most recent call last):
File "DDQN.py", line 212, in <module>
run_episode(env, agent, rpm)
File "DDQN.py", line 155, in run_episode
batch_next_obs, batch_done)
File "DDQN.py", line 108, in learn
self.learn_program, feed=feed, fetch_list=[self.cost])[0] # 训练一次网络
File "D:\Anaconda3\lib\site-packages\paddle\fluid\executor.py", line 780, in run
six.reraise(*sys.exc_info())
File "D:\Anaconda3\lib\site-packages\six.py", line 693, in reraise
raise value
File "D:\Anaconda3\lib\site-packages\paddle\fluid\executor.py", line 775, in run
use_program_cache=use_program_cache)
File "D:\Anaconda3\lib\site-packages\paddle\fluid\executor.py", line 822, in _run_impl
use_program_cache=use_program_cache)
File "D:\Anaconda3\lib\site-packages\paddle\fluid\executor.py", line 899, in _run_program
fetch_var_name)
paddle.fluid.core_avx.EnforceNotMet:
--------------------------------------------
C++ Call Stacks (More useful to developers):
--------------------------------------------
Windows not support stack backtrace yet.
------------------------------------------
Python Call Stacks (More useful to users):
------------------------------------------
File "D:\Anaconda3\lib\site-packages\paddle\fluid\framework.py", line 2488, in append_op
attrs=kwargs.get("attrs", None))
File "D:\Anaconda3\lib\site-packages\paddle\fluid\layer_helper.py", line 43, in append_op
return self.main_program.current_block().append_op(*args, **kwargs)
File "D:\Anaconda3\lib\site-packages\paddle\fluid\layers\tensor.py", line 1227, in range
outputs={'Out': [out]})
File "D:\Anaconda3\lib\site-packages\parl\algorithms\fluid\ddqn.py", line 80, in learn
start=0, end=batch_size, step=1, dtype='int64') * self.act_dim
File "DDQN.py", line 72, in build_program
self.cost = self.alg.learn(obs, act, reward, next_obs, terminal)
File "D:\Anaconda3\lib\site-packages\parl\core\fluid\agent.py", line 80, in __init__
self.build_program()
File "DDQN.py", line 47, in __init__
super(CartpoleAgent, self).__init__(algorithm)
File "DDQN.py", line 198, in <module>
e_greed_decrement=1e-6
----------------------
Error Message Summary:
----------------------
Error: The DataType of range Op's duplicable Variable Start must be consistent. The current variable type is (int64_t), but the previous variable type is (int). at (D:\1.6.2\paddle\paddle\fluid\framework\operator.cc:1151)
[operator < range > error]
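Judging from the Python call stack, the failing op is appended at parl/algorithms/fluid/ddqn.py line 80, which builds layers.range(start=0, end=batch_size, step=1, dtype='int64'). Below is a minimal sketch of the kind of dtype clash the summary describes, under the assumption that batch_size there is a shape-derived tensor (fluid's layers.shape yields an int32 tensor, while range fills its plain-int inputs as int64 constants):
import numpy as np
import paddle.fluid as fluid
from paddle.fluid import layers

main = fluid.Program()
with fluid.program_guard(main):
    x = layers.data(name='x', shape=[4], dtype='float32')
    batch_size = layers.shape(x)[0]  # int32 tensor
    # Start/Step get filled as int64 constants, End is the int32 tensor above:
    idx = layers.range(start=0, end=batch_size, step=1, dtype='int64')

exe = fluid.Executor(fluid.CPUPlace())
# Expected (untested assumption) to raise the same "DataType of range Op's
# duplicable Variable ... must be consistent" error as in the traceback:
exe.run(main, feed={'x': np.zeros((2, 4), dtype='float32')}, fetch_list=[idx])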
Original code:
import os
import gym
import argparse
import random
import collections
import numpy as np
import paddle.fluid as fluid
import parl
from parl import layers
from parl.utils import logger
from parl.algorithms import DDQN
parser = argparse.ArgumentParser()
parser.add_argument('--model', default='', help='index of a pretrained model checkpoint; unused by default')
parser.add_argument('--lr', type=float, default=5e-4, help='learning rate')
parser.add_argument('--max_episode', type=int, default=3000, help='maximum number of episodes to run')
parser.add_argument('--learn_freq', type=int, default=5, help='training frequency: learn once every N interactions')
parser.add_argument('--memory_size', type=int, default=20000, help='capacity of the replay memory')
parser.add_argument('--memory_warmup_size', type=int, default=200, help='number of experiences to pre-fill into the replay memory')
parser.add_argument('--batch_size', type=int, default=32, help='mini-batch size')
parser.add_argument('--gamma', type=float, default=0.999, help='discount factor')
opt = parser.parse_args()
print(opt)
class CartpoleModel(parl.Model):
def __init__(self, act_dim):
hid1_size = 128
hid2_size = 128
self.fc1 = layers.fc(size=hid1_size, act='relu')
self.fc2 = layers.fc(size=hid2_size, act='relu')
self.fc3 = layers.fc(size=act_dim, act=None)
def value(self, obs):
h1 = self.fc1(obs)
h2 = self.fc2(h1)
Q = self.fc3(h2)
return Q
class CartpoleAgent(parl.Agent):
def __init__(self, algorithm, obs_dim, act_dim, e_greed=0.1, e_greed_decrement=0):
assert isinstance(obs_dim, int)
assert isinstance(act_dim, int)
self.obs_dim = obs_dim
self.act_dim = act_dim
super(CartpoleAgent, self).__init__(algorithm)
self.global_step = 0
        self.update_target_steps = 200  # copy the model parameters to target_model every 200 training steps
        self.e_greed = e_greed  # with some probability, pick a random action to explore
        self.e_greed_decrement = e_greed_decrement  # gradually reduce exploration as training converges
def build_program(self):
self.pred_program = fluid.Program()
self.learn_program = fluid.Program()
with fluid.program_guard(self.pred_program):
obs = layers.data(
name='obs', shape=[self.obs_dim], dtype='float32')
self.value = self.alg.predict(obs)
with fluid.program_guard(self.learn_program):
obs = layers.data(
name='obs', shape=[self.obs_dim], dtype='float32')
act = layers.data(name='act', shape=[1], dtype='int64')
reward = layers.data(name='reward', shape=[], dtype='float32')
next_obs = layers.data(
name='next_obs', shape=[self.obs_dim], dtype='float32')
terminal = layers.data(name='terminal', shape=[], dtype='bool')
self.cost = self.alg.learn(obs, act, reward, next_obs, terminal)
def sample(self, obs):
        sample = np.random.rand()  # draw a float in [0, 1)
        if sample < self.e_greed:
            act = np.random.randint(self.act_dim)  # explore: every action has some probability of being picked
        else:
            act = self.predict(obs)  # exploit: pick the best action
        self.e_greed = max(0.01, self.e_greed - self.e_greed_decrement)  # gradually reduce exploration as training converges
        return act
    def predict(self, obs):  # pick the best action
obs = np.expand_dims(obs, axis=0)
pred_Q = self.fluid_executor.run(
self.pred_program,
feed={'obs': obs.astype('float32')},
fetch_list=[self.value])[0]
pred_Q = np.squeeze(pred_Q, axis=0)
        act = np.argmax(pred_Q)  # index of the largest Q value, i.e. the greedy action
return act
def learn(self, obs, act, reward, next_obs, terminal):
        # sync the model and target_model parameters every 200 training steps
if self.global_step % self.update_target_steps == 0:
self.alg.sync_target()
self.global_step += 1
act = np.expand_dims(act, -1)
feed = {
'obs': obs.astype('float32'),
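            # NOTE: build_program declares 'act' with dtype='int64', so the
            # int32 fed on the next line is inconsistent with the program
            # (editor's observation; not verified to be related to the crash)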
'act': act.astype('int32'),
'reward': reward,
'next_obs': next_obs.astype('float32'),
'terminal': terminal,
}
cost = self.fluid_executor.run(
            self.learn_program, feed=feed, fetch_list=[self.cost])[0]  # one training step
return cost
class ReplayMemory(object):
def __init__(self, max_size):
self.buffer = collections.deque(maxlen=max_size)
    # append one experience to the replay memory
def append(self, exp):
self.buffer.append(exp)
    # sample a batch of N experiences from the replay memory
def sample(self, batch_size):
mini_batch = random.sample(self.buffer, batch_size)
        obs_batch, action_batch, reward_batch, next_obs_batch, done_batch = [], [], [], [], []
        for experience in mini_batch:
            s, a, r, s_p, done = experience
obs_batch.append(s)
action_batch.append(a)
reward_batch.append(r)
next_obs_batch.append(s_p)
done_batch.append(done)
return np.array(obs_batch).astype('float32'), \
np.array(action_batch).astype('float32'), np.array(reward_batch).astype('float32'),\
np.array(next_obs_batch).astype('float32'), np.array(done_batch).astype('float32')
def __len__(self):
return len(self.buffer)
# run one training episode
def run_episode(env, agent, rpm):
total_reward = 0
obs = env.reset()
step = 0
while True:
step += 1
action = agent.sample(obs)
next_obs, reward, done, _ = env.step(action)
rpm.append((obs, action, reward, next_obs, done))
# train model
if (len(rpm) > opt.memory_warmup_size) and (step % opt.learn_freq == 0):
(batch_obs, batch_action, batch_reward, batch_next_obs,
batch_done) = rpm.sample(opt.batch_size)
train_loss = agent.learn(batch_obs, batch_action, batch_reward,
batch_next_obs, batch_done)
total_reward += reward
obs = next_obs
if done:
break
return total_reward
# evaluate the agent: run 5 episodes and average the total rewards
def evaluate(env, agent, num_episodes=5, render=False):
eval_reward = []
for i in range(num_episodes):
obs = env.reset()
episode_reward = 0
while True:
action = agent.predict(obs)
obs, reward, done, _ = env.step(action)
episode_reward += reward
if render:
env.render()
if done:
break
eval_reward.append(episode_reward)
return np.mean(eval_reward)
if __name__ == '__main__':
env = gym.make('CartPole-v1')
    # env = env.unwrapped  # remove the TimeLimit wrapper (the max episode length cap)
obs_dim = env.observation_space.shape[0]
act_dim = env.action_space.n
logger.info('obs_dim {}, act_dim {}'.format(obs_dim, act_dim))
    # build the agent with the parl framework
rpm = ReplayMemory(opt.memory_size)
model = CartpoleModel(act_dim=act_dim)
alg = DDQN(model, act_dim=act_dim, gamma=opt.gamma, lr=opt.lr)
agent = CartpoleAgent(
alg,
obs_dim=obs_dim,
act_dim=act_dim,
e_greed=0.1, # explore
e_greed_decrement=1e-6
) # probability of exploring is decreasing during training
    # load a pretrained model if one exists
if os.path.exists('./models/ddqn_{}.ckpt'.format(opt.model)):
agent.restore('./models/ddqn_{}.ckpt'.format(opt.model))
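        # interactive evaluation loop: render one episode at a time and keep
        # going while the user types 'T'; then quit without training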
RUN_NEXT_EPISODE = 'T'
while RUN_NEXT_EPISODE == 'T':
reward = evaluate(env, agent, num_episodes=1, render=True)
print("Reward is: {}".format(reward))
RUN_NEXT_EPISODE = input('Input T or N: ')
exit()
while len(rpm) < opt.memory_warmup_size: # warm up replay memory
run_episode(env, agent, rpm)
    # evaluate the initial model
total_reward = evaluate(env, agent)
logger.info('Test reward: {}'.format(total_reward))
agent.save('./models/ddqn_{}.ckpt'.format(0))
for i in range(opt.max_episode):
total_reward = run_episode(env, agent, rpm)
if i % 10 == 0:
logger.info("Episode {}, Reward Sum {}.".format(
i, total_reward))
if (i + 1) % 100 == 0:
total_reward = evaluate(env, agent)
logger.info('Test reward: {}'.format(total_reward))
agent.save('./models/ddqn_{}.ckpt'.format(i + 1))
# # save the parameters to ./model.ckpt
# agent.save('./model.ckpt')
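For what it's worth, here is a hypothetical patch sketch for the fragment around parl/algorithms/fluid/ddqn.py line 80 (the surrounding code is my assumption, reconstructed only from the traceback): cast the shape-derived batch size to int64 before calling layers.range, so that all of the op's inputs share one dtype.
greedy_action = layers.argmax(self.model.value(next_obs), axis=-1)        # argmax output is int64
batch_size = layers.cast(layers.shape(greedy_action)[0], dtype='int64')   # layers.shape yields int32; cast it
flat_index = layers.range(start=0, end=batch_size, step=1, dtype='int64') * self.act_dim
flat_index = flat_index + greedy_action  # gather indices into the flattened [batch, act_dim] Q table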