HW2 簡廷安 巫姿瑩 胡展維 #13

Open · wants to merge 5 commits into base: master
Binary file added 81.png
2 changes: 1 addition & 1 deletion HW2_Policy_Graident.ipynb
@@ -327,7 +327,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.11"
"version": "2.7.6"
}
},
"nbformat": 4,
228 changes: 228 additions & 0 deletions HW2_Policy_Graident.py
@@ -0,0 +1,228 @@
# coding: utf-8

# In this assignment, you will solve a classic control problem - CartPole using policy gradient methods.
#
# First, you will implement the "vanilla" policy gradient method, i.e., a method that repeatedly computes **unbiased** estimates $\hat{g}$ of $\nabla_{\theta} E[\sum_t r_t]$ and takes gradient ascent steps $\theta \rightarrow \theta + \epsilon \hat{g}$ so as to increase the total rewards collected in each episode. To make sure our code can solve multiple MDPs with different policy parameterizations, the provided code follows an object-oriented design and represents the MDP and the policy as classes.
#
# The following code constructs an instance of the MDP using OpenAI gym.

import gym
import tensorflow as tf
import numpy as np
from policy_gradient import util
from policy_gradient.policy import CategoricalPolicy
from policy_gradient.baselines.linear_feature_baseline import LinearFeatureBaseline
import pdb
import matplotlib.pyplot as plt

np.random.seed(0)
tf.set_random_seed(0)

# CartPole-v0 is an MDP with a four-dimensional continuous observation space and a finite (two-action) action space.
# In this environment, a pendulum is attached by an un-actuated joint to a cart,
# and the goal is to prevent it from falling over. You can apply a force of +1 or -1 to the cart.
# A reward of +1 is provided for every timestep that the pendulum remains upright.
# To visualize CartPole-v0, please see https://gym.openai.com/envs/CartPole-v0

env = gym.make('CartPole-v0')


# ## Problem 1: construct a neural network to represent policy
#
# Make sure you know how to construct a neural network using TensorFlow.
#
# 1. Open **homework2/policy_gradient/policy.py**.
# 2. Follow the instruction of Problem 1.

# ## Problem 2: compute the surrogate loss
#
# If there are $N$ episodes in an iteration, then for the $i$-th episode we define $R_t^i = \sum_{t'=t}^T \gamma^{t'-t} r(s_{t'}, a_{t'})$ as the accumulated discounted reward from timestep $t$ to the end of that episode, where $\gamma$ is the discount rate.
#
# The pseudocode for the REINFORCE algorithm is as below:
#
# 1. Initialize policy $\pi$ with parameter $\theta_1$.
# 2. For iteration $k = 1, 2, ...$:
# * Sample N episodes $\tau_1, \tau_2, ..., \tau_N$ under the current policy $\theta_k$, where $\tau_i = (s_t^i, a_t^i, R_t^i)_{t=0}^{T-1}$. Note that the last state is dropped since no action is taken after observing the last state.
# * Compute the empirical policy gradient using the formula: $$\hat{g} = E_{\pi_\theta}[\nabla_{\theta} \log \pi_\theta(a_t^i \mid s_t^i) R_t^i]$$
# * Take a gradient step: $\theta_{k+1} = \theta_k + \epsilon \hat{g}$.
#
#
# Note that we can transform the policy gradient formula as
#
# $$\hat{g} = \nabla_{\theta} \frac{1}{NT} \sum_{i=1}^N \sum_{t=0}^{T} \log \pi_\theta(a_t^i \mid s_t^i) R_t^i$$
#
# and $L(\theta) = \frac{1}{NT} \sum_{i=1}^N \sum_{t=0}^{T} \log \pi_\theta(a_t^i \mid s_t^i) R_t^i$ is called the surrogate loss.
#
# We can first construct the computation graph for $L(\theta)$, and then take its gradient as the empirical policy gradient.
#
#
# 1. Open **homework2/policy_gradient/policy.py**.
# 2. Follow the instruction of Problem 2.
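#
# As a concrete illustration, the surrogate loss can be wired up in TensorFlow 1.x
# roughly as follows. This is a minimal sketch only: `log_prob` stands for the
# log-likelihood of the chosen actions and `advantages` for the placeholder fed with
# $R_t^i$ (both names are illustrative); the graded implementation lives in
# **homework2/policy_gradient/policy.py**.
#
# ```python
# surr_loss = -tf.reduce_mean(log_prob * advantages)  # negate, since TF optimizers minimize
# train_op = tf.train.AdamOptimizer(learning_rate=0.01).minimize(surr_loss)
# ```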

sess = tf.Session()

# Construct a neural network to represent the policy, which maps an observed state to an action.
in_dim = util.flatten_space(env.observation_space)
out_dim = util.flatten_space(env.action_space)
hidden_dim = 8

opt = tf.train.AdamOptimizer(learning_rate=0.01)
policy = CategoricalPolicy(in_dim, out_dim, hidden_dim, opt, sess)

saver = tf.train.Saver()
sess.run(tf.initialize_all_variables())


# # Problem 3
#
# Implement a function that computes the accumulated discounted rewards of each timestep _t_ from _t_ to the end of the episode.
#
# For example:
#
# ```python
# rewards = [1, 1, 1]
# discount_rate = 0.99
# util.discount_cumsum(rewards, discount_rate)
# ```
#
# should return:
#
# `array([ 2.9701, 1.99 , 1. ])`
#
# 1. Open **homework2/policy_gradient/util.py**.
# 2. Implement the commented function.

# # Problem 4
#
# Use a baseline to reduce the variance of our gradient estimate.
#
# 1. Fill in the function `process_paths` of class `PolicyOptimizer` below.

class PolicyOptimizer(object):
    def __init__(self, env, policy, baseline, n_iter, n_episode, path_length,
                 discount_rate=.99):

        self.policy = policy
        self.baseline = baseline
        self.env = env
        self.n_iter = n_iter
        self.n_episode = n_episode
        self.path_length = path_length
        self.discount_rate = discount_rate

    def sample_path(self):
        obs = []
        actions = []
        rewards = []
        ob = self.env.reset()

        for _ in range(self.path_length):
            a = self.policy.act(ob.reshape(1, -1))
            next_ob, r, done, _ = self.env.step(a)
            obs.append(ob)
            actions.append(a)
            rewards.append(r)
            ob = next_ob
            if done:
                break

        return dict(
            observations=np.array(obs),
            actions=np.array(actions),
            rewards=np.array(rewards),
        )

    def process_paths(self, paths):
        for p in paths:
            if self.baseline is not None:
                b = self.baseline.predict(p)
            else:
                b = 0

            # `p["rewards"]` is a 1-D array containing the reward at each timestep of a sample path
            r = util.discount_cumsum(p["rewards"], self.discount_rate)

            """
            Problem 4:

            1. Variable `b` is the return predicted by our baseline
            2. Use it to reduce variance and then assign the result to the variable `a`

            Sample solution should be only 1 line.
            """
            # YOUR CODE HERE >>>>>>
            a = r - b
            # <<<<<<<<

            p["returns"] = r
            p["baselines"] = b
            p["advantages"] = (a - a.mean()) / (a.std() + 1e-8)  # normalize

        obs = np.concatenate([p["observations"] for p in paths])
        actions = np.concatenate([p["actions"] for p in paths])
        rewards = np.concatenate([p["rewards"] for p in paths])
        advantages = np.concatenate([p["advantages"] for p in paths])

        return dict(
            observations=obs,
            actions=actions,
            rewards=rewards,
            advantages=advantages,
        )

    def train(self):
        for i in range(1, self.n_iter + 1):
            paths = []
            for _ in range(self.n_episode):
                paths.append(self.sample_path())
            data = self.process_paths(paths)
            loss = self.policy.train(data["observations"], data["actions"], data["advantages"])
            avg_return = np.mean([sum(p["rewards"]) for p in paths])
            # `avg_return_` and `iteration` are module-level lists used for plotting below
            avg_return_.append(avg_return)
            iteration.append(i)
            print("Iteration {}: Average Return = {}".format(i, avg_return))
            print("Iteration {}: Loss = {}".format(i, loss))

            # CartPole-v0 defines "solving" as getting an average reward of 195.0 over 100 consecutive trials.
            if avg_return >= 195:
                print("Solved at {} iterations, which equals {} episodes.".format(i, i * self.n_episode))
                saver.save(sess, 'model', global_step=i)
                break

            if self.baseline is not None:
                self.baseline.fit(paths)

    def test(self, ob):
        return self.policy.test(ob.reshape(1, -1))


n_iter = 200
n_episode = 100
path_length = 200
discount_rate = 0.99
baseline = LinearFeatureBaseline(env.spec)

po = PolicyOptimizer(env, policy, baseline, n_iter, n_episode, path_length,
                     discount_rate)

# Train the policy optimizer
avg_return_ = []
iteration = []
po.train()
plt.plot(iteration, avg_return_)
plt.xlabel('Iteration')
plt.ylabel('avg_return')
plt.title('Without baseline')
plt.show()

'''
# Test
saver.restore(sess, 'model/model-88')
ob = env.reset()
done = False
while not done:
    a = po.test(ob)
    print(a)
    env.render()
    ob, r, done, _ = env.step(a)
'''
55 changes: 55 additions & 0 deletions Report.md
@@ -0,0 +1,55 @@
# Homework 2 - Policy Gradient
Members: 簡廷安, 巫姿瑩, 胡展維

## Problem 1
We construct a policy network with two fully-connected layers:

    W1 = tf.Variable(tf.random_normal([in_dim, hidden_dim]))
    b1 = tf.Variable(tf.zeros([hidden_dim]))
    W2 = tf.Variable(tf.random_normal([hidden_dim, out_dim]))
    b2 = tf.Variable(tf.zeros([out_dim]))
    output1 = tf.nn.tanh(tf.matmul(self._observations, W1) + b1)
    probs = tf.nn.softmax(tf.matmul(output1, W2) + b2)
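
For reference, a minimal sketch of how an action could then be sampled from `probs` at rollout time (the actual `act()` method sits in the unchanged part of `policy_gradient/policy.py`, so this is only an assumed shape, reusing the `self._sess` and `self._observations` members shown in the diff below):

    # evaluate the action distribution for one observation and sample an action index
    p = self._sess.run(probs, feed_dict={self._observations: ob.reshape(1, -1)})[0]
    action = np.random.choice(len(p), p=p)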

## Problem 2
It is straightforward to implement the surrogate loss in TensorFlow, but TensorFlow optimizers only minimize a loss, so we need to add a negative sign (since we want gradient ascent):

    surr_loss = -tf.reduce_mean(tf.mul(log_prob, self._advantages))
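
Note that `tf.mul` was renamed to `tf.multiply` in TensorFlow 1.0, so on newer TF 1.x releases the equivalent line (a drop-in sketch, not the version that was run here) would be:

    surr_loss = -tf.reduce_mean(tf.multiply(log_prob, self._advantages))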

## Problem 3
Accumulated discounted reward:

    def discount_cumsum(x, discount_rate):
        R = [x[-1]]
        for V in x[:-1][::-1]:
            R.append(V + discount_rate*R[-1])
        return np.array(R[::-1])
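
Since `policy_gradient/util.py` already imports `scipy.signal.lfilter`, an equivalent vectorized implementation (just a sketch of an alternative, not the submitted solution) would be:

    from scipy.signal import lfilter

    def discount_cumsum(x, discount_rate):
        # run an IIR filter over the reversed rewards: y[n] = x[n] + discount_rate * y[n-1]
        return lfilter([1], [1, -discount_rate], x[::-1])[::-1]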

## Problem 4
Simply subtract the baseline prediction from the accumulated discounted reward:

    a = r - b
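
To see why subtracting a baseline reduces the variance of the gradient estimate while leaving its expectation unchanged, here is a small self-contained illustration on a toy two-armed bandit with a uniform softmax policy (the reward numbers are made up):

    import numpy as np

    rewards = np.array([10.0, 11.0])             # deterministic reward of each action
    probs = np.array([0.5, 0.5])                 # uniform policy pi(a)
    score = np.array([1 - probs[0], -probs[0]])  # d/dtheta_0 log pi(a) for a softmax policy

    def grad_stats(baseline):
        g = score * (rewards - baseline)         # per-sample REINFORCE gradient estimates
        mean = np.sum(probs * g)
        var = np.sum(probs * g**2) - mean**2
        return mean, var

    print(grad_stats(0.0))             # (-0.25, 27.5625): no baseline, large variance
    print(grad_stats(rewards.mean()))  # (-0.25, 0.0): same mean, zero variance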

## Problem 5
Compare the variance and performance before and after adding the baseline (figures preferred).

Without the baseline, training converges faster (69 iterations vs. 81 iterations).

<img src="./w69.png" width="640">
<img src="./81.png" width="640">

* Observations:
  * The initial value of the policy is highly related to the convergence time: with a good initial value, it reaches avg_return >= 195 faster.
  * Because we are trying to determine the best action for each state, we need a baseline to compare against. In theory, training with a baseline should converge faster than training without one; for this game, however, we observed the opposite. Our guess is that this game is too simple.
  * Next, we fix the number of iterations to 200 (i.e., we keep training even after avg_return exceeds 195), as shown below:
<img src="./With_baseline.png" width="640">
<img src="./Without_baseline.png" width="640">
We found that after roughly 100 iterations the avg_return with the baseline converges to almost 200, while the avg_return without the baseline still oscillates. We think this is because the variance of training without a baseline is larger than with one, so it cannot converge as cleanly.

## Problem 6
The reason we need to normalize the advantage function is that the accumulated reward is discounted: rewards collected later in an episode are weighted by an exponentially smaller factor, so the raw advantages at later timesteps provide an exponentially weaker signal for reinforcing the actions taken there. Normalizing the advantages over timesteps keeps the learning signal for the action at every timestep on a comparable scale.
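
As a small numeric illustration of the scale problem (using the CartPole setting above, with a constant reward of 1 per step and no baseline):

    import numpy as np

    gamma, T = 0.99, 200
    # return-to-go at timestep t for a constant reward of 1: (1 - gamma**(T - t)) / (1 - gamma)
    rtg = np.array([(1 - gamma**(T - t)) / (1 - gamma) for t in range(T)])
    print(rtg[0], rtg[-1])                             # ~86.6 at t = 0 vs 1.0 at the last step

    a_norm = (rtg - rtg.mean()) / (rtg.std() + 1e-8)   # the normalization used in process_paths
    print(a_norm.mean(), a_norm.std())                 # ~0.0 and ~1.0 after normalization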

## Reference paper

[Continuous Deep Q-Learning with Model-based Acceleration](https://arxiv.org/abs/1603.00748)

[High-Dimensional Continuous Control Using Generalized Advantage Estimation](https://arxiv.org/pdf/1506.02438v5.pdf)
Binary file added With_baseline.png
Binary file added Without_baseline.png
15 changes: 13 additions & 2 deletions policy_gradient/policy.py
@@ -27,7 +27,12 @@ def __init__(self, in_dim, out_dim, hidden_dim, optimizer, session):
        Sample solution is about 2~4 lines.
        """
        # YOUR CODE HERE >>>>>>
        # probs = ???
        W1 = tf.Variable(tf.random_normal([in_dim, hidden_dim]))
        b1 = tf.Variable(tf.zeros([hidden_dim]))
        W2 = tf.Variable(tf.random_normal([hidden_dim, out_dim]))
        b2 = tf.Variable(tf.zeros([out_dim]))
        output1 = tf.nn.tanh(tf.matmul(self._observations, W1) + b1)
        probs = tf.nn.softmax(tf.matmul(output1, W2) + b2)
        # <<<<<<<<

# --------------------------------------------------
@@ -69,7 +74,7 @@ def __init__(self, in_dim, out_dim, hidden_dim, optimizer, session):
        Sample solution is about 1~3 lines.
        """
        # YOUR CODE HERE >>>>>>
        # surr_loss = ???
        surr_loss = -tf.reduce_mean(tf.mul(log_prob, self._advantages))
        # <<<<<<<<

        grads_and_vars = self._opt.compute_gradients(surr_loss)
@@ -98,3 +103,9 @@ def act(self, observation):
    def train(self, observations, actions, advantages):
        loss, _ = self._sess.run([self._loss_op, self._train_op], feed_dict={self._observations: observations, self._actions: actions, self._advantages: advantages})
        return loss

    def test(self, observation):
        assert observation.shape[0] == 1
        action_probs = self._sess.run(self._act_op, feed_dict={self._observations: observation})
        idx = np.argmax(action_probs)
        return idx
11 changes: 8 additions & 3 deletions policy_gradient/util.py
@@ -1,6 +1,7 @@
from gym.spaces import Box, Discrete
import numpy as np
from scipy.signal import lfilter
import math

def flatten_space(space):
    if isinstance(space, Box):
@@ -19,7 +20,11 @@ def flatten_space(space):
Sample solution is about 1~7 lines.
"""

# def discount_cumsum(x, discount_rate):
def discount_cumsum(x, discount_rate):
    # YOUR CODE HERE >>>>>>
    # return ???
    # <<<<<<<<
    R = [x[-1]]
    for V in x[:-1][::-1]:
        R.append(V + discount_rate*R[-1])

    return np.array(R[::-1])
    # <<<<<<<<
Binary file added w69.png