Unverified · Commit 7b5c5241 · Authored by: Bo Zhou · Committed by: GitHub

Add a tutorial of DeepES, written with NumPy in fewer than 100 lines (#225)

* add a tutorial of DeepES, written with NumPy in fewer than 100 lines

* make learning_rate an argument of Agent
Parent 7a16adc0
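The new file below implements the tutorial as a linear CartPole policy trained with evolution strategies: each iteration samples Gaussian noise around the current flat weight vector, evaluates every perturbed policy for one episode, normalizes the episode rewards to [-0.5, 0.5], and then moves the weights along the reward-weighted average of the noise, i.e. weights += learning_rate * mean_i(reward_i * noise_i).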
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gym
import numpy as np


class CartpoleAgent(object):
    """A linear policy for CartPole, trained with evolution strategies."""

    def __init__(self, obs_dim, act_dim, learning_rate):
        self.learning_rate = learning_rate
        # Init weights of a linear layer: action scores = w * obs + b.
        self.w = np.random.random((act_dim, obs_dim)) * 0.1
        self.b = np.zeros(act_dim)
        self.weights_total_size = self.w.size + self.b.size

    def predict(self, obs):
        # Greedy action: pick the action with the highest linear score.
        out = np.dot(self.w, obs) + self.b
        action = np.argmax(out)
        return action
    def learn(self, rewards, noises):
        # ES gradient estimate: the reward-weighted average of the sampled
        # noise vectors, mean_i(reward_i * noise_i).
        gradient = np.dot(
            np.asarray(rewards, dtype=np.float32),
            np.asarray(noises, dtype=np.float32))
        gradient /= rewards.size
        flat_weights = self.get_flat_weights()
        # Ascend the estimated gradient to increase the expected reward.
        new_weights = flat_weights + self.learning_rate * gradient
        self.set_flat_weights(new_weights)

    def set_flat_weights(self, flat_weights):
        # Unpack a flat vector back into the weight matrix and bias.
        self.w = flat_weights[:self.w.size].reshape(self.w.shape)
        self.b = flat_weights[self.w.size:]

    def get_flat_weights(self):
        # Flatten all parameters into a single vector so that noise can be
        # sampled and applied in one shot.
        flat_weights = np.concatenate([self.w.ravel(), self.b], axis=0)
        return flat_weights

def evaluate(env, agent):
    # Run one full episode with the current weights; return the total reward.
    ep_reward = 0
    obs = env.reset()
    while True:
        action = agent.predict(obs)
        obs, reward, done, _ = env.step(action)
        ep_reward += reward
        if done:
            break
    return ep_reward

def reward_normalize(reward):
    # Scale rewards to [-0.5, 0.5] so the update is invariant to reward scale.
    reward = np.asarray(reward)
    max_r = np.max(reward)
    min_r = np.min(reward)
    if max_r == min_r:
        # All episodes scored the same; there is no useful direction to move.
        reward = np.zeros(reward.shape)
    else:
        reward = (reward - min_r) / (max_r - min_r)
        reward -= 0.5
    return reward
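
# A worked example of reward_normalize (editorial illustration, not in the
# original tutorial): reward_normalize([1, 2, 3]) maps the minimum to -0.5,
# the midpoint to 0.0, and the maximum to 0.5, returning [-0.5, 0.0, 0.5].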

if __name__ == '__main__':
    env = gym.make('CartPole-v0')
    agent = CartpoleAgent(obs_dim=4, act_dim=2, learning_rate=0.1)
    for epoch in range(100):
        rewards = []
        noises = []
        latest_flat_weights = agent.get_flat_weights()
        # Sample 10 Gaussian perturbations of the current weights and
        # evaluate each perturbed policy for one episode.
        for episode in range(10):
            noise = np.random.randn(agent.weights_total_size)
            perturbation = noise * 0.05
            agent.set_flat_weights(latest_flat_weights + perturbation)
            ep_reward = evaluate(env, agent)
            noises.append(noise)
            rewards.append(ep_reward)
        normalized_rewards = reward_normalize(rewards)
        # Restore the unperturbed weights before applying the update.
        agent.set_flat_weights(latest_flat_weights)
        agent.learn(normalized_rewards, noises)
        # Periodically evaluate the unperturbed policy.
        if (epoch % 10) == 0:
            ep_reward = evaluate(env, agent)
            print('Epoch {}, Test reward {}'.format(epoch, ep_reward))
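
# Usage sketch (editorial addition; the file name is an assumption): save
# this file as es_cartpole.py and run it with a classic gym release
# (< 0.26, where env.step returns a 4-tuple and env.reset returns only the
# observation, as this script expects):
#   pip install 'gym<0.26' numpy
#   python es_cartpole.py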