Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Haiyang Chang hw2 #18

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
167 changes: 154 additions & 13 deletions HW2_Policy_Graident.ipynb

Large diffs are not rendered by default.

49 changes: 49 additions & 0 deletions Untitled.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [
{
"ename": "ImportError",
"evalue": "No module named tensorflow",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-11-41389fad42b5>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;32mimport\u001b[0m \u001b[0mtensorflow\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mtf\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;31mImportError\u001b[0m: No module named tensorflow"
]
}
],
"source": [
"import tensorflow as tf"
]
}
],
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "Python [conda root]",
"language": "python",
"name": "conda-root-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.12"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
49 changes: 49 additions & 0 deletions Untitled1.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [
{
"ename": "ImportError",
"evalue": "No module named tensorflow",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-11-a649b509054f>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;32mimport\u001b[0m \u001b[0mtensorflow\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;31mImportError\u001b[0m: No module named tensorflow"
]
}
],
"source": [
"import tensorflow"
]
}
],
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "Python [default]",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.12"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
8 changes: 8 additions & 0 deletions policy_gradient/policy.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import tensorflow as tf
import numpy as np
import math

class CategoricalPolicy(object):
def __init__(self, in_dim, out_dim, hidden_dim, optimizer, session):
Expand Down Expand Up @@ -27,6 +28,12 @@ def __init__(self, in_dim, out_dim, hidden_dim, optimizer, session):
Sample solution is about 2~4 lines.
"""
# YOUR CODE HERE >>>>>>
W1 = tf.Variable(tf.truncated_normal([in_dim, hidden_dim],stddev=1.0))
W2 = tf.Variable(tf.truncated_normal([hidden_dim, out_dim],stddev=1.0))
b1 = tf.Variable(tf.zeros([hidden_dim]))
b2 = tf.Variable(tf.zeros([out_dim]))
hidden1 = tf.nn.tanh(tf.matmul(self._observations, W1) + b1)
probs = tf.nn.softmax(tf.matmul(hidden1, W2) + b2)
# probs = ???
# <<<<<<<<

Expand Down Expand Up @@ -69,6 +76,7 @@ def __init__(self, in_dim, out_dim, hidden_dim, optimizer, session):
Sample solution is about 1~3 lines.
"""
# YOUR CODE HERE >>>>>>
surr_loss = -tf.reduce_mean(tf.mul(log_prob, self._advantages))
# surr_loss = ???
# <<<<<<<<

Expand Down
9 changes: 8 additions & 1 deletion policy_gradient/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,5 +21,12 @@ def flatten_space(space):

# def discount_cumsum(x, discount_rate):
# YOUR CODE HERE >>>>>>
def discount_cumsum(x, discount_rate):
    """Return the discounted cumulative sum of rewards.

    For each timestep t:
        out[t] = x[t] + discount_rate * x[t+1] + discount_rate**2 * x[t+2] + ...

    Args:
        x: 1-D sequence of per-timestep rewards.
        discount_rate: scalar discount factor (gamma), typically in [0, 1].

    Returns:
        np.ndarray of the same length as x, where entry t is the discounted
        return accumulated from timestep t onward.

    Note: the previous implementation reversed np.cumsum(x[i] * gamma**i),
    which produces prefix sums of rewards discounted from t=0 rather than
    the per-timestep discounted return (it only agrees for constant reward
    sequences, and at t=0). The backward recurrence below is the standard
    fix: out[t] = x[t] + gamma * out[t+1].
    """
    out = np.zeros(len(x))
    running = 0.0
    # Walk from the tail so each entry reuses the already-computed suffix sum.
    for t in reversed(range(len(x))):
        running = x[t] + discount_rate * running
        out[t] = running
    return out
# return ???
# <<<<<<<<
# <<<<<<<<
49 changes: 49 additions & 0 deletions report2.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
***

## Homework2 Report

105061469 Haiyang Chang

***

## Problem1
I constructed a 2-layer neural network as required. W1 and W2 cannot be initialized to zeros like b1 and b2; otherwise the Average Return remains very low. I believe this is because an all-zero weight matrix cannot break symmetry: every hidden unit computes the same output and receives the same gradient, so the network is unable to learn distinct features from the training data. Therefore I initialize the weights from a truncated normal distribution to make training easier.
+ W1 = tf.Variable(tf.truncated_normal([in_dim, hidden_dim],stddev=1.0))
+ W2 = tf.Variable(tf.truncated_normal([hidden_dim, out_dim],stddev=1.0))
+ b1 = tf.Variable(tf.zeros([hidden_dim]))
b2 = tf.Variable(tf.zeros([out_dim]))
+ hidden1 = tf.nn.tanh(tf.matmul(self._observations, W1) + b1)
+ probs = tf.nn.softmax(tf.matmul(hidden1, W2) + b2)

## Problem2

Minimizing this surrogate loss is equivalent to maximizing the mean advantage-weighted log-probability of the chosen actions, so running gradient descent on the loss performs gradient ascent on the expected return.
+ surr_loss = -tf.reduce_mean(tf.mul(log_prob, self._advantages))

## Problem3

I tested the function on python to ensure its accuracy.
+ def discount_cumsum(x, discount_rate):
len_x = len(x)
array_discount_rate = np.zeros(len_x)
for i in range(len_x):
array_discount_rate[i] = x[i] * discount_rate**i
array_discount_rate =np.cumsum(array_discount_rate)
return array_discount_rate[::-1]

## Problem4

There could be other forms, but this is the simplest form.
+ a = r - b

## Problem5

Without the baseline, the Average Return increased to a high level quickly, but eventually reached a level similar to the performance with the baseline. The baseline reduces the variance of the policy-gradient estimate without changing its expectation. So if the initial state of the neural network is not chosen well, training without a baseline may not be as stable or as satisfactory as training with one.
+ With baseline
![withbaseline.png](https://ooo.0o0.ooo/2016/12/08/5849ac501e81f.png)
+ Without baseline
![withoutbaseline.png](https://ooo.0o0.ooo/2016/12/08/5849ac501e0e4.png)

## Problem6

The reward is discounted by the discount rate, which makes later actions contribute less to the return than earlier ones. By normalizing the advantages (to zero mean and unit variance), the scale of the gradient stays consistent across iterations, so the training process proceeds steadily.