Homework2 林姿伶 汪叔慧 嚴敏誠 #17

Open · wants to merge 7 commits into base: master
131 changes: 120 additions & 11 deletions HW2_Policy_Graident.ipynb
@@ -2,11 +2,20 @@
"cells": [
{
"cell_type": "code",
"execution_count": null,
"execution_count": 15,
"metadata": {
"collapsed": false
},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The autoreload extension is already loaded. To reload it, use:\n",
" %reload_ext autoreload\n"
]
}
],
"source": [
"# Automatically reload changes to external code\n",
"%load_ext autoreload\n",
@@ -26,11 +35,19 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 16,
"metadata": {
"collapsed": false
},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[2016-10-08 01:10:27,725] Making new env: CartPole-v0\n"
]
}
],
"source": [
"import gym\n",
"import tensorflow as tf\n",
@@ -95,7 +112,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 125,
"metadata": {
"collapsed": false
},
@@ -151,7 +168,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 126,
"metadata": {
"collapsed": true
},
@@ -210,12 +227,14 @@
" Sample solution should be only 1 line.\n",
" \"\"\"\n",
" # YOUR CODE HERE >>>>>>\n",
" a = r - b\n",
" # a = ???\n",
" # <<<<<<<<\n",
"\n",
" p[\"returns\"] = r\n",
" p[\"baselines\"] = b\n",
" p[\"advantages\"] = (a - a.mean()) / (a.std() + 1e-8) # normalize\n",
" # p[\"advantages\"] = (a - a.mean()) / (a.std() + 1e-8) # normalize\n",
" p[\"advantages\"] = a\n",
"\n",
" obs = np.concatenate([ p[\"observations\"] for p in paths ])\n",
" actions = np.concatenate([ p[\"actions\"] for p in paths ])\n",
@@ -250,23 +269,104 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 127,
"metadata": {
"collapsed": false
},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Iteration 1: Average Return = 14.93\n",
"Iteration 2: Average Return = 14.86\n",
"Iteration 3: Average Return = 17.23\n",
"Iteration 4: Average Return = 17.25\n",
"Iteration 5: Average Return = 17.67\n",
"Iteration 6: Average Return = 19.76\n",
"Iteration 7: Average Return = 23.05\n",
"Iteration 8: Average Return = 23.82\n",
"Iteration 9: Average Return = 23.8\n",
"Iteration 10: Average Return = 28.29\n",
"Iteration 11: Average Return = 27.4\n",
"Iteration 12: Average Return = 34.0\n",
"Iteration 13: Average Return = 36.82\n",
"Iteration 14: Average Return = 38.68\n",
"Iteration 15: Average Return = 40.13\n",
"Iteration 16: Average Return = 49.41\n",
"Iteration 17: Average Return = 43.39\n",
"Iteration 18: Average Return = 45.71\n",
"Iteration 19: Average Return = 45.93\n",
"Iteration 20: Average Return = 49.56\n",
"Iteration 21: Average Return = 53.42\n",
"Iteration 22: Average Return = 55.32\n",
"Iteration 23: Average Return = 54.51\n",
"Iteration 24: Average Return = 53.18\n",
"Iteration 25: Average Return = 67.09\n",
"Iteration 26: Average Return = 63.37\n",
"Iteration 27: Average Return = 64.72\n",
"Iteration 28: Average Return = 61.77\n",
"Iteration 29: Average Return = 75.14\n",
"Iteration 30: Average Return = 69.76\n",
"Iteration 31: Average Return = 68.13\n",
"Iteration 32: Average Return = 74.82\n",
"Iteration 33: Average Return = 72.22\n",
"Iteration 34: Average Return = 78.73\n",
"Iteration 35: Average Return = 76.48\n",
"Iteration 36: Average Return = 80.88\n",
"Iteration 37: Average Return = 93.15\n",
"Iteration 38: Average Return = 87.53\n",
"Iteration 39: Average Return = 100.09\n",
"Iteration 40: Average Return = 93.73\n",
"Iteration 41: Average Return = 106.68\n",
"Iteration 42: Average Return = 109.8\n",
"Iteration 43: Average Return = 118.82\n",
"Iteration 44: Average Return = 130.64\n",
"Iteration 45: Average Return = 140.82\n",
"Iteration 46: Average Return = 146.44\n",
"Iteration 47: Average Return = 142.08\n",
"Iteration 48: Average Return = 158.68\n",
"Iteration 49: Average Return = 156.67\n",
"Iteration 50: Average Return = 158.33\n",
"Iteration 51: Average Return = 163.78\n",
"Iteration 52: Average Return = 172.12\n",
"Iteration 53: Average Return = 169.73\n",
"Iteration 54: Average Return = 173.06\n",
"Iteration 55: Average Return = 170.45\n",
"Iteration 56: Average Return = 182.6\n",
"Iteration 57: Average Return = 178.12\n",
"Iteration 58: Average Return = 188.06\n",
"Iteration 59: Average Return = 188.39\n",
"Iteration 60: Average Return = 189.94\n",
"Iteration 61: Average Return = 191.31\n",
"Iteration 62: Average Return = 194.09\n",
"Iteration 63: Average Return = 193.21\n",
"Iteration 64: Average Return = 186.86\n",
"Iteration 65: Average Return = 194.54\n",
"Iteration 66: Average Return = 192.87\n",
"Iteration 67: Average Return = 193.04\n",
"Iteration 68: Average Return = 193.49\n",
"Iteration 69: Average Return = 193.54\n",
"Iteration 70: Average Return = 193.76\n",
"Iteration 71: Average Return = 197.12\n",
"Solve at 71 iterations, which equals 7100 episodes.\n"
]
}
],
"source": [
"n_iter = 200\n",
"n_episode = 100\n",
"path_length = 200\n",
"discount_rate = 0.99\n",
"baseline = LinearFeatureBaseline(env.spec)\n",
"# baseline = None\n",
"\n",
"po = PolicyOptimizer(env, policy, baseline, n_iter, n_episode, path_length,\n",
" discount_rate)\n",
"\n",
"# Train the policy optimizer\n",
"po.train()"
"po.train()\n",
"sess.close()"
]
},
{
@@ -309,6 +409,15 @@
"\n",
"Include the answer in your report."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
@@ -327,7 +436,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.11"
"version": "2.7.12"
}
},
"nbformat": 4,
48 changes: 48 additions & 0 deletions Report.md
@@ -0,0 +1,48 @@
## Homework2 - Policy Gradient
Please complete each homework for each team, and <br>
mention who contributed which parts in your report.

## Member
* captain - <a href="https://github.com/maolin23?tab=repositories">林姿伶</a>: 104062546
* member - <a href="https://github.com/hedywang73?tab=repositories">汪叔慧</a>: 104062526
* member - <a href="https://github.com/yenmincheng0708?tab=repositories">嚴敏誠</a>: 104062595

`Contribution`
```
姿伶 and 叔慧 discussed the homework.
Finally, 姿伶 wrote up the final report on GitHub.
```

## Problem 5
```
Replacing line
baseline = LinearFeatureBaseline(env.spec)
with
baseline = None
can remove the baseline.
Modify the code to compare the variance and performance before and after adding baseline.
Then, write a report about your findings. (with figures is better)
```
In theory, subtracting the baseline reduces the variance, so we should get better prediction results.<br>
The two figures below show the results of 10 runs each, with and without the baseline.<br>
The middle curve is the average over the 10 runs, and the upper and lower values are the per-iteration maximum and minimum among those 10 runs.<br>
From these two results, we suspect the task may simply be too easy, so the baseline makes little difference to learning either way.<br>
![Fig. 1](https://github.com/CEDL739/homework2/blob/master/reward_with.png)<br>
**Fig. 1** With baseline<br>
![Fig. 2](https://github.com/CEDL739/homework2/blob/master/reward_without.png)<br>
**Fig. 2** Without baseline<br>
However, we think the most important factor is the initial weights: if the initial return is already high, the policy can converge within very few iterations.<br>
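
A minimal sketch of how the 10-run comparison behind these figures can be scripted. It reuses the notebook's `env`, `policy`, `PolicyOptimizer`, and `LinearFeatureBaseline` objects, and assumes purely for illustration that `train()` returns the list of per-iteration average returns and always runs the full `n_iter` iterations; the original code does not guarantee either.

```python
import numpy as np

def compare_baseline(n_trials=10):
    curves = {}
    for use_baseline in (True, False):
        runs = []
        for _ in range(n_trials):
            baseline = LinearFeatureBaseline(env.spec) if use_baseline else None
            po = PolicyOptimizer(env, policy, baseline, n_iter, n_episode,
                                 path_length, discount_rate)
            runs.append(po.train())  # assumed: one average return per iteration
        runs = np.array(runs)
        # Per-iteration mean / min / max over the runs: the three curves in Fig. 1 / Fig. 2.
        curves[use_baseline] = (runs.mean(axis=0), runs.min(axis=0), runs.max(axis=0))
    return curves
```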

## Problem 6
```
In function process_paths of class PolicyOptimizer, why we need to normalize the advantages?
i.e., what's the usage of this line:
p["advantages"] = (a - a.mean()) / (a.std() + 1e-8)
Include the answer in your report.
```
We normalize the advantage function because the accumulated reward is computed as a discounted reward.<br>
Each step's reward is multiplied by a discount factor, and this factor decays exponentially as the episode progresses.<br>
As a result, actions at later stages are scaled down by this discount factor, which makes learning from them less efficient.<br>
So normalizing the advantages over the whole trajectory makes the influence of each stage more even across all time steps.<br>
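
As a concrete illustration of what that line does (a NumPy sketch with made-up numbers, not part of the submitted code): the advantages are shifted to zero mean and rescaled to roughly unit variance before the gradient step, and the `1e-8` term guards against division by zero when all advantages are equal.

```python
import numpy as np

a = np.array([180.0, 95.0, 20.0, 5.0])     # example raw advantages (illustrative values only)
adv = (a - a.mean()) / (a.std() + 1e-8)    # zero mean, roughly unit variance
print(adv.mean(), adv.std())               # ~0.0 and ~1.0
```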


99 changes: 99 additions & 0 deletions Untitled.ipynb
@@ -0,0 +1,99 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(2.9701, <type 'numpy.float64'>)\n",
"(1.99, <type 'numpy.float64'>)\n",
"(1.0, <type 'numpy.float64'>)\n"
]
}
],
"source": [
"from gym.spaces import Box, Discrete\n",
"import numpy as np\n",
"from scipy.signal import lfilter\n",
"\n",
"def flatten_space(space):\n",
"\tif isinstance(space, Box):\n",
"\t\treturn np.prod(space.shape)\n",
"\telif isinstance(space, Discrete):\n",
"\t\treturn space.n\n",
"\telse:\n",
"\t\traise ValueError(\"Env must be either Box or Discrete.\")\n",
" \n",
"x = np.array([1, 1, 1])\n",
"y = np.zeros(len(x))\n",
"r = 0.99\n",
"# print(len(x))\n",
"for i in range(len(x)):\n",
" for j in range(i, len(x)):\n",
" y[i] = y[i] + (x[j]*(r**(j-i)))\n",
" print(y[i], type(y[i]))"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {
"collapsed": false
},
"outputs": [
{
"ename": "ImportError",
"evalue": "No module named base",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-32-b3173feef156>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mbase\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mBaseline\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mnumpy\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;32mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"rewards\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mImportError\u001b[0m: No module named base"
]
}
],
"source": [
"from base import Baseline\n",
"import numpy as np\n",
"print(path[\"rewards\"])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.12"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
8 changes: 8 additions & 0 deletions policy_gradient/policy.py
@@ -27,7 +27,14 @@ def __init__(self, in_dim, out_dim, hidden_dim, optimizer, session):
Sample solution is about 2~4 lines.
"""
# YOUR CODE HERE >>>>>>
# W = tf.Variable(tf.random_normal([in_dim, hidden_dim], stddev=0.01), name='W')
# b = tf.Variable(tf.zeros([hidden_dim]), name='b' )
# probs = tf.nn.softmax(tf.tanh(tf.matmul(self._observations, W) + b), name=None)
with tf.device('/cpu:0'):
    h1 = tf.contrib.layers.fully_connected(self._observations, hidden_dim, activation_fn=tf.tanh)
    probs = tf.contrib.layers.fully_connected(h1, out_dim, activation_fn=tf.nn.softmax)
# probs = ???

# <<<<<<<<

# --------------------------------------------------
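
For context, `probs` above is a batch of categorical distributions over the two CartPole actions (each row sums to 1 after the softmax layer). A self-contained toy sketch of sampling an action from one such row; the values are made up, and the repository's actual sampling code is outside this diff.

```python
import numpy as np

# Hypothetical softmax output for one state: probability of action 0 and action 1.
action_probs = np.array([0.7, 0.3])
action = np.random.choice(len(action_probs), p=action_probs)  # sample from the categorical distribution
print(action)
```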
@@ -69,6 +76,7 @@ def __init__(self, in_dim, out_dim, hidden_dim, optimizer, session):
Sample solution is about 1~3 lines.
"""
# YOUR CODE HERE >>>>>>
surr_loss = -tf.reduce_mean(tf.mul(log_prob, self._advantages))
# surr_loss = ???
# <<<<<<<<
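
For context on the minus sign in this hunk: the policy-gradient objective E[log π(a|s) · A] is to be maximized, but TensorFlow optimizers minimize, so the code minimizes its negation (`tf.mul` in the diff is the old name for element-wise multiplication, written as `*` below). A standalone sketch with hypothetical placeholder names, using the TF 1.x-era API as in the repo:

```python
import tensorflow as tf

log_prob = tf.placeholder(tf.float32, [None])    # log pi_theta(a_t | s_t) for the sampled actions
advantages = tf.placeholder(tf.float32, [None])  # estimated advantage A_t per time step

# Maximizing E[log_prob * advantage] is the same as minimizing its negation,
# which is what the optimizer passed to the Policy is then asked to do.
surr_loss = -tf.reduce_mean(log_prob * advantages)
```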

8 changes: 7 additions & 1 deletion policy_gradient/util.py
@@ -19,7 +19,13 @@ def flatten_space(space):
Sample solution is about 1~7 lines.
"""

# def discount_cumsum(x, discount_rate):
def discount_cumsum(x, discount_rate):
    # YOUR CODE HERE >>>>>>
    # print(x)
    y = np.zeros(len(x))
    for i in range(len(x)):
        for j in range(i, len(x)):
            y[i] = y[i] + (x[j]*(discount_rate**(j-i)))
    return y
    # return ???
    # <<<<<<<<
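
For reference, the same discounted cumulative sum can be computed without the nested loops using `scipy.signal.lfilter` (already imported in the scratch Untitled.ipynb above). This is a sketch of that variant, not the submitted implementation:

```python
import numpy as np
from scipy.signal import lfilter

def discount_cumsum_fast(x, discount_rate):
    # y[i] = x[i] + discount_rate * x[i+1] + discount_rate**2 * x[i+2] + ...
    # Running a first-order IIR filter over the reversed sequence gives the same result in O(n).
    x = np.asarray(x, dtype=float)
    return lfilter([1.0], [1.0, -discount_rate], x[::-1])[::-1]

# Matches the nested-loop version, e.g. discount_cumsum_fast([1, 1, 1], 0.99)
# returns [2.9701, 1.99, 1.0], as in the scratch notebook's check.
```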
Binary file added reward(b).png
Binary file added reward_with.png
Binary file added reward_without.png