Homework 2 (林沅廷 石孟立 王福恩 王尊玄 (technical advisor)) #5

Open
wants to merge 17 commits into base: master
6 changes: 6 additions & 0 deletions .idea/vcs.xml


185 changes: 173 additions & 12 deletions HW2_Policy_Graident.ipynb
@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"metadata": {
"collapsed": false
},
@@ -26,21 +26,51 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:gym.envs.registration:Making new env: CartPole-v0\n",
"[2016-10-16 18:17:36,278] Making new env: CartPole-v0\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[name: \"/cpu:0\"\n",
"device_type: \"CPU\"\n",
"memory_limit: 268435456\n",
"bus_adjacency: BUS_ANY\n",
"incarnation: 5137755054001702941\n",
", name: \"/gpu:0\"\n",
"device_type: \"GPU\"\n",
"memory_limit: 22279392461\n",
"incarnation: 4594359714894799590\n",
"physical_device_desc: \"device: 0, name: Tesla M40, pci bus id: 0000:04:00.0\"\n",
"]\n"
]
}
],
"source": [
"import gym\n",
"import tensorflow as tf\n",
"import numpy as np\n",
"from policy_gradient import util\n",
"from policy_gradient.policy import CategoricalPolicy\n",
"from policy_gradient.baselines.linear_feature_baseline import LinearFeatureBaseline\n",
"import os\n",
"os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"0\"\n",
"\n",
"np.random.seed(0)\n",
"tf.set_random_seed(0)\n",
"from tensorflow.python.client import device_lib\n",
"print device_lib.list_local_devices()\n",
"\n",
"# CartPole-v0 is a MDP with finite state and action space. \n",
"# In this environment, A pendulum is attached by an un-actuated joint to a cart, \n",
@@ -95,11 +125,20 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/gradients.py:90: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.\n",
" \"Converting sparse IndexedSlices to a dense Tensor of unknown shape. \"\n"
]
}
],
"source": [
"sess = tf.Session()\n",
"\n",
@@ -151,7 +190,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 4,
"metadata": {
"collapsed": true
},
@@ -210,33 +249,42 @@
" Sample solution should be only 1 line.\n",
" \"\"\"\n",
" # YOUR CODE HERE >>>>>>\n",
" # a = ???\n",
" a = r-b\n",
" # <<<<<<<<\n",
"\n",
" p[\"returns\"] = r\n",
" p[\"baselines\"] = b\n",
" p[\"advantages_wo_norm\"] = a\n",
" p[\"advantages\"] = (a - a.mean()) / (a.std() + 1e-8) # normalize\n",
"\n",
" obs = np.concatenate([ p[\"observations\"] for p in paths ])\n",
" actions = np.concatenate([ p[\"actions\"] for p in paths ])\n",
" rewards = np.concatenate([ p[\"rewards\"] for p in paths ])\n",
" advantages = np.concatenate([ p[\"advantages\"] for p in paths ])\n",
" \n",
" advantages_wo_norm = np.concatenate([ p[\"advantages_wo_norm\"] for p in paths ])\n",
" returns = np.concatenate([ p[\"returns\"] for p in paths ])\n",
"\n",
" return dict(\n",
" observations=obs,\n",
" actions=actions,\n",
" rewards=rewards,\n",
" advantages=advantages,\n",
" advantages_wo_norm = advantages_wo_norm,\n",
" returns=returns\n",
" )\n",
"\n",
" def train(self):\n",
" avg_return_lst = []\n",
" episode = []\n",
" for i in range(1, self.n_iter + 1):\n",
" paths = []\n",
" for _ in range(self.n_episode):\n",
" paths.append(self.sample_path())\n",
" data = self.process_paths(paths)\n",
" loss = self.policy.train(data[\"observations\"], data[\"actions\"], data[\"advantages\"])\n",
" avg_return = np.mean([sum(p[\"rewards\"]) for p in paths])\n",
" avg_return_lst.append(avg_return)\n",
" print(\"Iteration {}: Average Return = {}\".format(i, avg_return))\n",
" \n",
" # CartPole-v0 defines \"solving\" as getting average reward of 195.0 over 100 consecutive trials.\n",
@@ -245,28 +293,141 @@
" break\n",
"\n",
" if self.baseline != None:\n",
" self.baseline.fit(paths)"
" self.baseline.fit(paths)\n",
" episode.append(data)\n",
" \n",
" return avg_return_lst,i,episode"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Iteration 1: Average Return = 11.08\n",
"Iteration 2: Average Return = 11.21\n",
"Iteration 3: Average Return = 11.68\n",
"Iteration 4: Average Return = 11.07\n",
"Iteration 5: Average Return = 11.56\n",
"Iteration 6: Average Return = 12.53\n",
"Iteration 7: Average Return = 13.32\n",
"Iteration 8: Average Return = 13.81\n",
"Iteration 9: Average Return = 14.65\n",
"Iteration 10: Average Return = 15.1\n",
"Iteration 11: Average Return = 16.4\n",
"Iteration 12: Average Return = 17.16\n",
"Iteration 13: Average Return = 18.38\n",
"Iteration 14: Average Return = 21.81\n",
"Iteration 15: Average Return = 23.06\n",
"Iteration 16: Average Return = 22.66\n",
"Iteration 17: Average Return = 27.0\n",
"Iteration 18: Average Return = 29.09\n",
"Iteration 19: Average Return = 30.13\n",
"Iteration 20: Average Return = 30.21\n",
"Iteration 21: Average Return = 33.67\n",
"Iteration 22: Average Return = 34.53\n",
"Iteration 23: Average Return = 33.08\n",
"Iteration 24: Average Return = 34.78\n",
"Iteration 25: Average Return = 35.53\n",
"Iteration 26: Average Return = 33.87\n",
"Iteration 27: Average Return = 37.25\n",
"Iteration 28: Average Return = 35.98\n",
"Iteration 29: Average Return = 35.48\n",
"Iteration 30: Average Return = 32.6\n",
"Iteration 31: Average Return = 38.6\n",
"Iteration 32: Average Return = 42.62\n",
"Iteration 33: Average Return = 37.86\n",
"Iteration 34: Average Return = 42.13\n",
"Iteration 35: Average Return = 46.49\n",
"Iteration 36: Average Return = 42.17\n",
"Iteration 37: Average Return = 48.06\n",
"Iteration 38: Average Return = 42.23\n",
"Iteration 39: Average Return = 45.62\n",
"Iteration 40: Average Return = 46.04\n",
"Iteration 41: Average Return = 45.86\n",
"Iteration 42: Average Return = 45.93\n",
"Iteration 43: Average Return = 46.37\n",
"Iteration 44: Average Return = 46.42\n",
"Iteration 45: Average Return = 49.61\n",
"Iteration 46: Average Return = 51.0\n",
"Iteration 47: Average Return = 52.25\n",
"Iteration 48: Average Return = 52.78\n",
"Iteration 49: Average Return = 52.69\n",
"Iteration 50: Average Return = 57.27\n",
"Iteration 51: Average Return = 54.55\n",
"Iteration 52: Average Return = 52.3\n",
"Iteration 53: Average Return = 55.47\n",
"Iteration 54: Average Return = 52.21\n",
"Iteration 55: Average Return = 59.62\n",
"Iteration 56: Average Return = 61.68\n",
"Iteration 57: Average Return = 56.72\n",
"Iteration 58: Average Return = 59.8\n",
"Iteration 59: Average Return = 59.21\n",
"Iteration 60: Average Return = 59.79\n",
"Iteration 61: Average Return = 61.55\n",
"Iteration 62: Average Return = 63.13\n",
"Iteration 63: Average Return = 64.52\n",
"Iteration 64: Average Return = 68.87\n",
"Iteration 65: Average Return = 66.39\n",
"Iteration 66: Average Return = 68.56\n",
"Iteration 67: Average Return = 71.38\n",
"Iteration 68: Average Return = 70.99\n",
"Iteration 69: Average Return = 73.13\n",
"Iteration 70: Average Return = 79.29\n",
"Iteration 71: Average Return = 73.96\n",
"Iteration 72: Average Return = 78.36\n",
"Iteration 73: Average Return = 75.7\n",
"Iteration 74: Average Return = 81.86\n",
"Iteration 75: Average Return = 82.47\n",
"Iteration 76: Average Return = 86.06\n",
"Iteration 77: Average Return = 88.11\n",
"Iteration 78: Average Return = 95.86\n",
"Iteration 79: Average Return = 93.99\n",
"Iteration 80: Average Return = 103.44\n",
"Iteration 81: Average Return = 110.93\n",
"Iteration 82: Average Return = 122.16\n",
"Iteration 83: Average Return = 127.28\n",
"Iteration 84: Average Return = 145.37\n",
"Iteration 85: Average Return = 152.45\n",
"Iteration 86: Average Return = 156.75\n",
"Iteration 87: Average Return = 174.1\n",
"Iteration 88: Average Return = 181.2\n",
"Iteration 89: Average Return = 187.34\n",
"Iteration 90: Average Return = 191.77\n",
"Iteration 91: Average Return = 190.04\n",
"Iteration 92: Average Return = 196.01\n",
"Solve at 92 iterations, which equals 9200 episodes.\n"
]
}
],
"source": [
"n_iter = 200\n",
"n_episode = 100\n",
"path_length = 200\n",
"discount_rate = 0.99\n",
"baseline = LinearFeatureBaseline(env.spec)\n",
"#baseline = None\n",
"\n",
"\n",
"po = PolicyOptimizer(env, policy, baseline, n_iter, n_episode, path_length,\n",
" discount_rate)\n",
"\n",
"# Train the policy optimizer\n",
"po.train()"
"\n",
"#sess.run(tf.initialize_all_variables())\n",
"N=14\n",
"[avg,i,data] = po.train()\n",
"np.savez('without_baseline_{}'.format(N),average_return_list = avg)\n",
"np.savez('substract',data=data)\n",
"\n",
"#for _ in range(10):\n",
" #sess.run(tf.initialize_all_variables())\n"
]
},
{
@@ -327,7 +488,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.11"
"version": "2.7.6"
}
},
"nbformat": 4,
88 changes: 88 additions & 0 deletions Report.md
@@ -0,0 +1,88 @@
# CEDL homework 2
### Team members: Juan-ting Lin, Meng-li Shih, Fu-en Wang

## Overview:

In this homework, we will try to solve the classic control problem - CartPole.

<img src="https://cloud.githubusercontent.com/assets/7057863/19025154/dd94466c-8946-11e6-977f-2db4ce478cf3.gif" width="400" height="200" />

CartPole is an environment in which a pendulum is attached by an un-actuated joint to a cart, and the goal is to prevent it from falling over. You can apply a force of +1 or -1 to the cart. A reward of +1 is provided for every timestep that the pendulum remains upright.
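
The environment follows the standard gym interface; a minimal interaction sketch (assuming `gym` is installed, as in the notebook):

```
import gym

env = gym.make("CartPole-v0")
obs = env.reset()                        # state: cart position, cart velocity, pole angle, pole tip velocity
done = False
total_reward = 0
while not done:
    action = env.action_space.sample()   # 0 pushes the cart left, 1 pushes it right
    obs, reward, done, info = env.step(action)
    total_reward += reward               # +1 for every timestep the pole stays upright
```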

## Implementation:

In Problems 1-4, we implement a simple agent and improve it with Monte-Carlo sampling and a policy-based method.
* **Problem 1:**

In Problem 1, we construct a simple 2-layer fully-connected feedforward network that outputs the action probabilities of the policy:

```
# 2-layer policy network: tanh hidden layer, softmax over the output layer
# (tf.contrib.layers.fully_connected applies a ReLU activation by default when
#  activation_fn is not given; pass activation_fn=None if pure logits are intended)
x = tf.contrib.layers.fully_connected(inputs=self._observations, num_outputs=hidden_dim,
                                      weights_initializer=tf.random_normal_initializer(),
                                      biases_initializer=tf.random_normal_initializer(),
                                      activation_fn=tf.tanh)
y = tf.contrib.layers.fully_connected(inputs=x, num_outputs=out_dim,
                                      weights_initializer=tf.random_normal_initializer(),
                                      biases_initializer=tf.random_normal_initializer())
probs = tf.nn.softmax(y)
```
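
At rollout time an action is drawn from `probs`. A minimal sketch of that sampling step (a hypothetical helper; the actual sampling op lives in `policy_gradient/policy.py`, and `sess`, the observation placeholder, and a single 4-dimensional CartPole observation are assumptions here):

```
import numpy as np

def sample_action(sess, probs, observations_ph, observation):
    # Evaluate the softmax output for one observation and draw a categorical sample.
    p = sess.run(probs, feed_dict={observations_ph: observation[None, :]})[0]
    return np.random.choice(len(p), p=p)   # 0 = push cart left, 1 = push cart right
```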

* **Problem 2:**

After constructing the policy prediction network, we need to compute the surrogate loss to obtain the policy gradient:

```
surr_loss = tf.reduce_mean(tf.mul(log_prob,self._advantages))
```
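
The quantity being averaged here is the per-timestep score-function term of the policy gradient. Written out (our notation; depending on how the optimizer in `policy_gradient/policy.py` is set up, the sign may be flipped so that minimizing the loss maximizes this objective):

```
L(\theta) = \frac{1}{T} \sum_{t=0}^{T-1} \log \pi_\theta(a_t \mid s_t)\, A_t
\qquad\Rightarrow\qquad
\nabla_\theta L(\theta) = \frac{1}{T} \sum_{t=0}^{T-1} \nabla_\theta \log \pi_\theta(a_t \mid s_t)\, A_t
```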

* **Problem 3:**

Implement a function that computes, for each timestep t, the discounted rewards accumulated from t to the end of the episode:

```
def discount_cumsum(x, discount_rate):
    # acc_sum[t] = x[t] + discount_rate * x[t+1] + discount_rate**2 * x[t+2] + ...
    # Accumulate backwards from the end of the episode:
    #   running_sum <- x[i] + discount_rate * running_sum
    acc_sum = [0 for _ in range(len(x))]
    running_sum = 0
    for i in range(len(x) - 1, -1, -1):
        running_sum = x[i] + discount_rate * running_sum
        acc_sum[i] = running_sum

    return np.asarray(acc_sum)
```
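
As a quick sanity check (a usage sketch, not part of the submitted code), three unit rewards with a discount rate of 0.99 give the discounted reward-to-go at each timestep:

```
import numpy as np

print(discount_cumsum(np.array([1.0, 1.0, 1.0]), 0.99))
# expected: [2.9701, 1.99, 1.0], since 1 + 0.99 * (1 + 0.99 * 1) = 2.9701
```
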
* **Problem 4:**

Use a baseline to reduce the variance of our gradient estimate. Subtracting it mitigates the high-variance shortcoming of the Monte-Carlo method:

```
a = r-b
```
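
Here `r` is the vector of discounted returns for one path and `b` is the baseline's prediction for the same states. For context, a sketch of how this line sits inside `process_paths`, reconstructed from the notebook diff (the `predict` call is an assumption about the baseline's interface; only `fit` is visible in the diff):

```
r = discount_cumsum(p["rewards"], self.discount_rate)               # discounted reward-to-go (Problem 3)
b = self.baseline.predict(p) if self.baseline is not None else 0    # assumed baseline interface
a = r - b                                                           # Problem 4: advantage estimate
p["advantages"] = (a - a.mean()) / (a.std() + 1e-8)                 # normalized advantages fed to the update
```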

## Discussion:

In Problems 5 and 6, we discuss the effect of subtracting the baseline.

* **Problem 5:**

Here we compare the results with and without subtracting the baseline:

|Subtract baseline|Without subtracting baseline|
|---|---|
|<img src="https://github.com/brade31919/homework2/blob/master/pic/with_baseline_10.png" width="700"> |<img src="https://github.com/brade31919/homework2/blob/master/pic/without_baseline_10.png" width="700"> |
|<img src="https://github.com/brade31919/homework2/blob/master/pic/with_baseline_std.png" width="700"> |<img src="https://github.com/brade31919/homework2/blob/master/pic/without_baseline_std.png" width="700">|

For each case in Problem 5, we repeat the training process 10 times to obtain statistics. The upper row shows the average-return curves of the 10 runs; the case without baseline subtraction clearly varies more from run to run. To support this observation, the lower row plots the mean of the 10 runs as the center curve, with vertical bars showing the standard deviation (variance^0.5) at each step. As the plots show, training without subtracting the baseline yields a larger standard deviation.
For a more detailed comparison, we record the returns and advantages of every sampled path and compute their variances. The results are shown in the table below:

|Variance of|Subtract baseline|Without subtracting baseline|
|---|---|---|
|returns|542.5|587.1|
|advantages|117|587.1|


If we compare the variance of the returns in the two cases, there is no significant difference. Nevertheless, if we compare the advantages (a = r - b), the case with baseline subtraction has a much lower variance.
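
The variance numbers above can be reproduced with a short post-processing sketch (hypothetical code: `episode_data` stands for the list of per-iteration dictionaries returned as the third value of `po.train()` and saved to `substract.npz` in the notebook; the field names match `process_paths`):

```
import numpy as np

# episode_data: list of dicts, one per training iteration, as returned by po.train()
returns    = np.concatenate([d["returns"] for d in episode_data])
advantages = np.concatenate([d["advantages_wo_norm"] for d in episode_data])

print("variance of returns:    {:.1f}".format(returns.var()))
print("variance of advantages: {:.1f}".format(advantages.var()))
```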

* **Problem 6:**

In each path, rewards collected at later timesteps are discounted exponentially, so even after subtracting the baseline from the returns, actions taken late in an episode contribute less to the learning signal. Normalizing the advantages rescales them at every iteration so that roughly half of the actions are encouraged and half are discouraged, which keeps the magnitude of the policy gradient under control.
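
In symbols (our notation, not taken from the assignment handout): a reward received k steps after timestep t is weighted by gamma^k in the discounted return, and the normalization used in the notebook rescales the advantages to zero mean and unit standard deviation at every iteration:

```
R_t = \sum_{k=0}^{T-t} \gamma^{k}\, r_{t+k},
\qquad
\hat{A}_t = \frac{A_t - \operatorname{mean}(A)}{\operatorname{std}(A) + 10^{-8}}
```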
