Homework2 林姿伶 汪叔慧 嚴敏誠 #17

Open · wants to merge 7 commits into base: master
131 changes: 120 additions & 11 deletions HW2_Policy_Graident.ipynb
@@ -2,11 +2,20 @@
"cells": [
{
"cell_type": "code",
"execution_count": null,
"execution_count": 15,
"metadata": {
"collapsed": false
},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The autoreload extension is already loaded. To reload it, use:\n",
" %reload_ext autoreload\n"
]
}
],
"source": [
"# Automatically reload changes to external code\n",
"%load_ext autoreload\n",
@@ -26,11 +35,19 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 16,
"metadata": {
"collapsed": false
},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[2016-10-08 01:10:27,725] Making new env: CartPole-v0\n"
]
}
],
"source": [
"import gym\n",
"import tensorflow as tf\n",
@@ -95,7 +112,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 125,
"metadata": {
"collapsed": false
},
@@ -151,7 +168,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 126,
"metadata": {
"collapsed": true
},
@@ -210,12 +227,14 @@
" Sample solution should be only 1 line.\n",
" \"\"\"\n",
" # YOUR CODE HERE >>>>>>\n",
" a = r - b\n",
" # a = ???\n",
" # <<<<<<<<\n",
"\n",
" p[\"returns\"] = r\n",
" p[\"baselines\"] = b\n",
" p[\"advantages\"] = (a - a.mean()) / (a.std() + 1e-8) # normalize\n",
" # p[\"advantages\"] = (a - a.mean()) / (a.std() + 1e-8) # normalize\n",
" p[\"advantages\"] = a\n",
"\n",
" obs = np.concatenate([ p[\"observations\"] for p in paths ])\n",
" actions = np.concatenate([ p[\"actions\"] for p in paths ])\n",
@@ -250,23 +269,104 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 127,
"metadata": {
"collapsed": false
},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Iteration 1: Average Return = 14.93\n",
"Iteration 2: Average Return = 14.86\n",
"Iteration 3: Average Return = 17.23\n",
"Iteration 4: Average Return = 17.25\n",
"Iteration 5: Average Return = 17.67\n",
"Iteration 6: Average Return = 19.76\n",
"Iteration 7: Average Return = 23.05\n",
"Iteration 8: Average Return = 23.82\n",
"Iteration 9: Average Return = 23.8\n",
"Iteration 10: Average Return = 28.29\n",
"Iteration 11: Average Return = 27.4\n",
"Iteration 12: Average Return = 34.0\n",
"Iteration 13: Average Return = 36.82\n",
"Iteration 14: Average Return = 38.68\n",
"Iteration 15: Average Return = 40.13\n",
"Iteration 16: Average Return = 49.41\n",
"Iteration 17: Average Return = 43.39\n",
"Iteration 18: Average Return = 45.71\n",
"Iteration 19: Average Return = 45.93\n",
"Iteration 20: Average Return = 49.56\n",
"Iteration 21: Average Return = 53.42\n",
"Iteration 22: Average Return = 55.32\n",
"Iteration 23: Average Return = 54.51\n",
"Iteration 24: Average Return = 53.18\n",
"Iteration 25: Average Return = 67.09\n",
"Iteration 26: Average Return = 63.37\n",
"Iteration 27: Average Return = 64.72\n",
"Iteration 28: Average Return = 61.77\n",
"Iteration 29: Average Return = 75.14\n",
"Iteration 30: Average Return = 69.76\n",
"Iteration 31: Average Return = 68.13\n",
"Iteration 32: Average Return = 74.82\n",
"Iteration 33: Average Return = 72.22\n",
"Iteration 34: Average Return = 78.73\n",
"Iteration 35: Average Return = 76.48\n",
"Iteration 36: Average Return = 80.88\n",
"Iteration 37: Average Return = 93.15\n",
"Iteration 38: Average Return = 87.53\n",
"Iteration 39: Average Return = 100.09\n",
"Iteration 40: Average Return = 93.73\n",
"Iteration 41: Average Return = 106.68\n",
"Iteration 42: Average Return = 109.8\n",
"Iteration 43: Average Return = 118.82\n",
"Iteration 44: Average Return = 130.64\n",
"Iteration 45: Average Return = 140.82\n",
"Iteration 46: Average Return = 146.44\n",
"Iteration 47: Average Return = 142.08\n",
"Iteration 48: Average Return = 158.68\n",
"Iteration 49: Average Return = 156.67\n",
"Iteration 50: Average Return = 158.33\n",
"Iteration 51: Average Return = 163.78\n",
"Iteration 52: Average Return = 172.12\n",
"Iteration 53: Average Return = 169.73\n",
"Iteration 54: Average Return = 173.06\n",
"Iteration 55: Average Return = 170.45\n",
"Iteration 56: Average Return = 182.6\n",
"Iteration 57: Average Return = 178.12\n",
"Iteration 58: Average Return = 188.06\n",
"Iteration 59: Average Return = 188.39\n",
"Iteration 60: Average Return = 189.94\n",
"Iteration 61: Average Return = 191.31\n",
"Iteration 62: Average Return = 194.09\n",
"Iteration 63: Average Return = 193.21\n",
"Iteration 64: Average Return = 186.86\n",
"Iteration 65: Average Return = 194.54\n",
"Iteration 66: Average Return = 192.87\n",
"Iteration 67: Average Return = 193.04\n",
"Iteration 68: Average Return = 193.49\n",
"Iteration 69: Average Return = 193.54\n",
"Iteration 70: Average Return = 193.76\n",
"Iteration 71: Average Return = 197.12\n",
"Solve at 71 iterations, which equals 7100 episodes.\n"
]
}
],
"source": [
"n_iter = 200\n",
"n_episode = 100\n",
"path_length = 200\n",
"discount_rate = 0.99\n",
"baseline = LinearFeatureBaseline(env.spec)\n",
"# baseline = None\n",
"\n",
"po = PolicyOptimizer(env, policy, baseline, n_iter, n_episode, path_length,\n",
" discount_rate)\n",
"\n",
"# Train the policy optimizer\n",
"po.train()"
"po.train()\n",
"sess.close()"
]
},
{
@@ -309,6 +409,15 @@
"\n",
"Include the answer in your report."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
@@ -327,7 +436,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.11"
"version": "2.7.12"
}
},
"nbformat": 4,
48 changes: 48 additions & 0 deletions Report.md
@@ -0,0 +1,48 @@
## Homework2 - Policy Gradient
Please complete each homework for each team, and <br>
mention who contributed which parts in your report.

## Member
* captain - <a href="https://github.com/maolin23?tab=repositories">林姿伶</a>: 104062546
* member - <a href="https://github.com/hedywang73?tab=repositories">汪叔慧</a>: 104062526
* member - <a href="https://github.com/yenmincheng0708?tab=repositories">嚴敏誠</a>: 104062595

`Contribution`
```
姿伶 and 叔慧 discussed the homework.
Finally, 姿伶 wrote up the final report on GitHub.
```

## Problem 5
```
Replacing line
baseline = LinearFeatureBaseline(env.spec)
with
baseline = None
can remove the baseline.
Modify the code to compare the variance and performance before and after adding baseline.
Then, write a report about your findings. (with figures is better)
```
In theory, subtracting the baseline reduces the variance, so we should get better prediction results.<br>
The two figures below show the results of 10 runs each, with and without the baseline.<br>
The middle curve is the average over the 10 runs, and the upper and lower values are the per-iteration maximum and minimum among those 10 runs.<br>
From these two results, we suspect the task may simply be too easy, so the baseline makes little difference to learning either way.<br>
![Fig. 1](https://github.com/CEDL739/homework2/blob/master/reward_with.png)<br>
**Fig. 1** With baseline<br>
![Fig. 2](https://github.com/CEDL739/homework2/blob/master/reward_without.png)<br>
**Fig. 2** Without baseline<br>
However, we think the most important factor is the initial weights: if the initial return is already high, the policy can converge within very few iterations.<br>
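
A minimal sketch of how the 10-run comparison behind these figures can be scripted. It reuses the notebook's `env`, `policy`, `PolicyOptimizer`, and `LinearFeatureBaseline` objects, and assumes purely for illustration that `train()` returns the list of per-iteration average returns and always runs the full `n_iter` iterations; the original code does not guarantee either.

```python
import numpy as np

def compare_baseline(n_trials=10):
    curves = {}
    for use_baseline in (True, False):
        runs = []
        for _ in range(n_trials):
            baseline = LinearFeatureBaseline(env.spec) if use_baseline else None
            po = PolicyOptimizer(env, policy, baseline, n_iter, n_episode,
                                 path_length, discount_rate)
            runs.append(po.train())  # assumed: one average return per iteration
        runs = np.array(runs)
        # Per-iteration mean / min / max over the runs: the three curves in Fig. 1 / Fig. 2.
        curves[use_baseline] = (runs.mean(axis=0), runs.min(axis=0), runs.max(axis=0))
    return curves
```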

## Problem 6
```
In function process_paths of class PolicyOptimizer, why we need to normalize the advantages?
i.e., what's the usage of this line:
p["advantages"] = (a - a.mean()) / (a.std() + 1e-8)
Include the answer in your report.
```
We normalize the advantage function because the accumulated reward is computed as a discounted reward.<br>
Each step's reward is multiplied by a discount factor, and this factor decays exponentially as the episode progresses.<br>
As a result, actions at later stages are scaled down by this discount factor, which makes learning from them less efficient.<br>
So normalizing the advantages over the whole trajectory makes the influence of each stage more even across all time steps.<br>
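
As a concrete illustration of what that line does (a NumPy sketch with made-up numbers, not part of the submitted code): the advantages are shifted to zero mean and rescaled to roughly unit variance before the gradient step, and the `1e-8` term guards against division by zero when all advantages are equal.

```python
import numpy as np

a = np.array([180.0, 95.0, 20.0, 5.0])     # example raw advantages (illustrative values only)
adv = (a - a.mean()) / (a.std() + 1e-8)    # zero mean, roughly unit variance
print(adv.mean(), adv.std())               # ~0.0 and ~1.0
```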


99 changes: 99 additions & 0 deletions Untitled.ipynb
@@ -0,0 +1,99 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(2.9701, <type 'numpy.float64'>)\n",
"(1.99, <type 'numpy.float64'>)\n",
"(1.0, <type 'numpy.float64'>)\n"
]
}
],
"source": [
"from gym.spaces import Box, Discrete\n",
"import numpy as np\n",
"from scipy.signal import lfilter\n",
"\n",
"def flatten_space(space):\n",
"\tif isinstance(space, Box):\n",
"\t\treturn np.prod(space.shape)\n",
"\telif isinstance(space, Discrete):\n",
"\t\treturn space.n\n",
"\telse:\n",
"\t\traise ValueError(\"Env must be either Box or Discrete.\")\n",
" \n",
"x = np.array([1, 1, 1])\n",
"y = np.zeros(len(x))\n",
"r = 0.99\n",
"# print(len(x))\n",
"for i in range(len(x)):\n",
" for j in range(i, len(x)):\n",
" y[i] = y[i] + (x[j]*(r**(j-i)))\n",
" print(y[i], type(y[i]))"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {
"collapsed": false
},
"outputs": [
{
"ename": "ImportError",
"evalue": "No module named base",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-32-b3173feef156>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mbase\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mBaseline\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mnumpy\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;32mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"rewards\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mImportError\u001b[0m: No module named base"
]
}
],
"source": [
"from base import Baseline\n",
"import numpy as np\n",
"print(path[\"rewards\"])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.12"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
8 changes: 8 additions & 0 deletions policy_gradient/policy.py
@@ -27,7 +27,14 @@ def __init__(self, in_dim, out_dim, hidden_dim, optimizer, session):
Sample solution is about 2~4 lines.
"""
# YOUR CODE HERE >>>>>>
# W = tf.Variable(tf.random_normal([in_dim, hidden_dim], stddev=0.01), name='W')
# b = tf.Variable(tf.zeros([hidden_dim]), name='b' )
# probs = tf.nn.softmax(tf.tanh(tf.matmul(self._observations, W) + b), name=None)
with tf.device('/cpu:0'):
    h1 = tf.contrib.layers.fully_connected(self._observations, hidden_dim, activation_fn=tf.tanh)
    probs = tf.contrib.layers.fully_connected(h1, out_dim, activation_fn=tf.nn.softmax)
# probs = ???

# <<<<<<<<

# --------------------------------------------------
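
For context, `probs` above is a batch of categorical distributions over the two CartPole actions (each row sums to 1 after the softmax layer). A self-contained toy sketch of sampling an action from one such row; the values are made up, and the repository's actual sampling code is outside this diff.

```python
import numpy as np

# Hypothetical softmax output for one state: probability of action 0 and action 1.
action_probs = np.array([0.7, 0.3])
action = np.random.choice(len(action_probs), p=action_probs)  # sample from the categorical distribution
print(action)
```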
@@ -69,6 +76,7 @@ def __init__(self, in_dim, out_dim, hidden_dim, optimizer, session):
Sample solution is about 1~3 lines.
"""
# YOUR CODE HERE >>>>>>
surr_loss = -tf.reduce_mean(tf.mul(log_prob, self._advantages))
# surr_loss = ???
# <<<<<<<<
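
For context on the minus sign in this hunk: the policy-gradient objective E[log π(a|s) · A] is to be maximized, but TensorFlow optimizers minimize, so the code minimizes its negation (`tf.mul` in the diff is the old name for element-wise multiplication, written as `*` below). A standalone sketch with hypothetical placeholder names, using the TF 1.x-era API as in the repo:

```python
import tensorflow as tf

log_prob = tf.placeholder(tf.float32, [None])    # log pi_theta(a_t | s_t) for the sampled actions
advantages = tf.placeholder(tf.float32, [None])  # estimated advantage A_t per time step

# Maximizing E[log_prob * advantage] is the same as minimizing its negation,
# which is what the optimizer passed to the Policy is then asked to do.
surr_loss = -tf.reduce_mean(log_prob * advantages)
```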

8 changes: 7 additions & 1 deletion policy_gradient/util.py
@@ -19,7 +19,13 @@ def flatten_space(space):
Sample solution is about 1~7 lines.
"""

# def discount_cumsum(x, discount_rate):
def discount_cumsum(x, discount_rate):
    # YOUR CODE HERE >>>>>>
    # print(x)
    y = np.zeros(len(x))
    for i in range(len(x)):
        for j in range(i, len(x)):
            y[i] = y[i] + (x[j]*(discount_rate**(j-i)))
    return y
    # return ???
    # <<<<<<<<
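
For reference, the same discounted cumulative sum can be computed without the nested loops using `scipy.signal.lfilter` (already imported in the scratch Untitled.ipynb above). This is a sketch of that variant, not the submitted implementation:

```python
import numpy as np
from scipy.signal import lfilter

def discount_cumsum_fast(x, discount_rate):
    # y[i] = x[i] + discount_rate * x[i+1] + discount_rate**2 * x[i+2] + ...
    # Running a first-order IIR filter over the reversed sequence gives the same result in O(n).
    x = np.asarray(x, dtype=float)
    return lfilter([1.0], [1.0, -discount_rate], x[::-1])[::-1]

# Matches the nested-loop version, e.g. discount_cumsum_fast([1, 1, 1], 0.99)
# returns [2.9701, 1.99, 1.0], as in the scratch notebook's check.
```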
Binary file added reward(b).png
Binary file added reward_with.png
Binary file added reward_without.png