Unverified · Commit 7b5c5241 · Authored by: Bo Zhou · Committed by: GitHub

Add a tutorial of DeepES, written with NumPy in fewer than 100 lines (#225)

* add a tutorial of DeepES, written with NumPy in fewer than 100 lines

* make learning_rate an argument of Agent
Parent 7a16adc0
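The new file below implements the tutorial as a linear CartPole policy trained with evolution strategies: each iteration samples Gaussian noise around the current flat weight vector, evaluates every perturbed policy for one episode, normalizes the episode rewards to [-0.5, 0.5], and then moves the weights along the reward-weighted average of the noise, i.e. weights += learning_rate * mean_i(reward_i * noise_i).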
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gym
import numpy as np


class CartpoleAgent(object):
    """A linear policy for CartPole, trained with evolution strategies."""

    def __init__(self, obs_dim, act_dim, learning_rate):
        self.learning_rate = learning_rate
        # Init weights of a linear layer: action scores = w * obs + b.
        self.w = np.random.random((act_dim, obs_dim)) * 0.1
        self.b = np.zeros(act_dim)
        self.weights_total_size = self.w.size + self.b.size

    def predict(self, obs):
        # Greedy action: pick the action with the highest linear score.
        out = np.dot(self.w, obs) + self.b
        action = np.argmax(out)
        return action
    def learn(self, rewards, noises):
        # ES gradient estimate: the reward-weighted average of the sampled
        # noise vectors, mean_i(reward_i * noise_i).
        gradient = np.dot(
            np.asarray(rewards, dtype=np.float32),
            np.asarray(noises, dtype=np.float32))
        gradient /= rewards.size
        flat_weights = self.get_flat_weights()
        # Ascend the estimated gradient to increase the expected reward.
        new_weights = flat_weights + self.learning_rate * gradient
        self.set_flat_weights(new_weights)

    def set_flat_weights(self, flat_weights):
        # Unpack a flat vector back into the weight matrix and bias.
        self.w = flat_weights[:self.w.size].reshape(self.w.shape)
        self.b = flat_weights[self.w.size:]

    def get_flat_weights(self):
        # Flatten all parameters into a single vector so that noise can be
        # sampled and applied in one shot.
        flat_weights = np.concatenate([self.w.ravel(), self.b], axis=0)
        return flat_weights

def evaluate(env, agent):
    # Run one full episode with the current weights; return the total reward.
    ep_reward = 0
    obs = env.reset()
    while True:
        action = agent.predict(obs)
        obs, reward, done, _ = env.step(action)
        ep_reward += reward
        if done:
            break
    return ep_reward

def reward_normalize(reward):
    # Scale rewards to [-0.5, 0.5] so the update is invariant to reward scale.
    reward = np.asarray(reward)
    max_r = np.max(reward)
    min_r = np.min(reward)
    if max_r == min_r:
        # All episodes scored the same; there is no useful direction to move.
        reward = np.zeros(reward.shape)
    else:
        reward = (reward - min_r) / (max_r - min_r)
        reward -= 0.5
    return reward
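
# A worked example of reward_normalize (editorial illustration, not in the
# original tutorial): reward_normalize([1, 2, 3]) maps the minimum to -0.5,
# the midpoint to 0.0, and the maximum to 0.5, returning [-0.5, 0.0, 0.5].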

if __name__ == '__main__':
    env = gym.make('CartPole-v0')
    agent = CartpoleAgent(obs_dim=4, act_dim=2, learning_rate=0.1)
    for epoch in range(100):
        rewards = []
        noises = []
        latest_flat_weights = agent.get_flat_weights()
        # Sample 10 Gaussian perturbations of the current weights and
        # evaluate each perturbed policy for one episode.
        for episode in range(10):
            noise = np.random.randn(agent.weights_total_size)
            perturbation = noise * 0.05
            agent.set_flat_weights(latest_flat_weights + perturbation)
            ep_reward = evaluate(env, agent)
            noises.append(noise)
            rewards.append(ep_reward)
        normalized_rewards = reward_normalize(rewards)
        # Restore the unperturbed weights before applying the update.
        agent.set_flat_weights(latest_flat_weights)
        agent.learn(normalized_rewards, noises)
        # Periodically evaluate the unperturbed policy.
        if (epoch % 10) == 0:
            ep_reward = evaluate(env, agent)
            print('Epoch {}, Test reward {}'.format(epoch, ep_reward))
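
# Usage sketch (editorial addition; the file name is an assumption): save
# this file as es_cartpole.py and run it with a classic gym release
# (< 0.26, where env.step returns a 4-tuple and env.reset returns only the
# observation, as this script expects):
#   pip install 'gym<0.26' numpy
#   python es_cartpole.py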