Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

105062504 魏凱亞, 105065514 柯子逸, 105062536 劉祐欣, 105062518 林怡均 #19

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
164 changes: 147 additions & 17 deletions HW2_Policy_Graident.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"metadata": {
"collapsed": false
},
Expand All @@ -26,11 +26,19 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[2016-10-17 12:58:01,609] Making new env: CartPole-v0\n"
]
}
],
"source": [
"import gym\n",
"import tensorflow as tf\n",
Expand Down Expand Up @@ -95,11 +103,20 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/yoooosing/anaconda2/lib/python2.7/site-packages/tensorflow/python/ops/gradients.py:90: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.\n",
" \"Converting sparse IndexedSlices to a dense Tensor of unknown shape. \"\n"
]
}
],
"source": [
"sess = tf.Session()\n",
"\n",
Expand Down Expand Up @@ -151,9 +168,9 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 4,
"metadata": {
"collapsed": true
"collapsed": false
},
"outputs": [],
"source": [
Expand Down Expand Up @@ -197,9 +214,13 @@
" b = self.baseline.predict(p)\n",
" else:\n",
" b = 0\n",
" \n",
"\n",
" \n",
" # `p[\"rewards\"]` is a matrix contains the rewards of each timestep in a sample path\n",
" \n",
" r = util.discount_cumsum(p[\"rewards\"], self.discount_rate)\n",
"\n",
"\n",
" \n",
" \"\"\"\n",
" Problem 4:\n",
Expand All @@ -212,21 +233,28 @@
" # YOUR CODE HERE >>>>>>\n",
" # a = ???\n",
" # <<<<<<<<\n",
"\n",
" \n",
" a=r-b\n",
" \n",
" p[\"returns\"] = r\n",
" p[\"baselines\"] = b\n",
" p[\"advantages\"] = (a - a.mean()) / (a.std() + 1e-8) # normalize\n",
" #print \"before normalized :\",a\n",
" #print \"after normalized :\",p[\"advantages\"]\n",
" #p[\"advantages\"] = a \n",
"\n",
" obs = np.concatenate([ p[\"observations\"] for p in paths ])\n",
" actions = np.concatenate([ p[\"actions\"] for p in paths ])\n",
" rewards = np.concatenate([ p[\"rewards\"] for p in paths ])\n",
" advantages = np.concatenate([ p[\"advantages\"] for p in paths ])\n",
"\n",
" \n",
" \n",
" \n",
" return dict(\n",
" observations=obs,\n",
" actions=actions,\n",
" rewards=rewards,\n",
" advantages=advantages,\n",
" advantages=advantages\n",
" )\n",
"\n",
" def train(self):\n",
Expand All @@ -250,17 +278,118 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 5,
"metadata": {
"collapsed": false
"collapsed": false,
"scrolled": true
},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Iteration 1: Average Return = 15.44\n",
"Iteration 2: Average Return = 17.6\n",
"Iteration 3: Average Return = 17.74\n",
"Iteration 4: Average Return = 19.21\n",
"Iteration 5: Average Return = 20.07\n",
"Iteration 6: Average Return = 23.13\n",
"Iteration 7: Average Return = 23.53\n",
"Iteration 8: Average Return = 25.81\n",
"Iteration 9: Average Return = 25.49\n",
"Iteration 10: Average Return = 26.29\n",
"Iteration 11: Average Return = 25.49\n",
"Iteration 12: Average Return = 30.72\n",
"Iteration 13: Average Return = 33.17\n",
"Iteration 14: Average Return = 33.87\n",
"Iteration 15: Average Return = 36.76\n",
"Iteration 16: Average Return = 39.55\n",
"Iteration 17: Average Return = 39.2\n",
"Iteration 18: Average Return = 38.92\n",
"Iteration 19: Average Return = 41.87\n",
"Iteration 20: Average Return = 41.94\n",
"Iteration 21: Average Return = 43.86\n",
"Iteration 22: Average Return = 42.23\n",
"Iteration 23: Average Return = 44.84\n",
"Iteration 24: Average Return = 49.13\n",
"Iteration 25: Average Return = 45.72\n",
"Iteration 26: Average Return = 45.36\n",
"Iteration 27: Average Return = 51.72\n",
"Iteration 28: Average Return = 53.18\n",
"Iteration 29: Average Return = 48.63\n",
"Iteration 30: Average Return = 48.2\n",
"Iteration 31: Average Return = 52.59\n",
"Iteration 32: Average Return = 50.93\n",
"Iteration 33: Average Return = 52.7\n",
"Iteration 34: Average Return = 54.51\n",
"Iteration 35: Average Return = 51.8\n",
"Iteration 36: Average Return = 58.35\n",
"Iteration 37: Average Return = 55.49\n",
"Iteration 38: Average Return = 62.07\n",
"Iteration 39: Average Return = 58.62\n",
"Iteration 40: Average Return = 61.17\n",
"Iteration 41: Average Return = 62.93\n",
"Iteration 42: Average Return = 60.65\n",
"Iteration 43: Average Return = 58.44\n",
"Iteration 44: Average Return = 62.81\n",
"Iteration 45: Average Return = 61.65\n",
"Iteration 46: Average Return = 60.62\n",
"Iteration 47: Average Return = 58.42\n",
"Iteration 48: Average Return = 65.54\n",
"Iteration 49: Average Return = 65.38\n",
"Iteration 50: Average Return = 66.89\n",
"Iteration 51: Average Return = 68.54\n",
"Iteration 52: Average Return = 66.87\n",
"Iteration 53: Average Return = 63.61\n",
"Iteration 54: Average Return = 72.27\n",
"Iteration 55: Average Return = 72.03\n",
"Iteration 56: Average Return = 67.19\n",
"Iteration 57: Average Return = 77.55\n",
"Iteration 58: Average Return = 78.94\n",
"Iteration 59: Average Return = 74.54\n",
"Iteration 60: Average Return = 79.95\n",
"Iteration 61: Average Return = 87.68\n",
"Iteration 62: Average Return = 82.11\n",
"Iteration 63: Average Return = 84.65\n",
"Iteration 64: Average Return = 84.64\n",
"Iteration 65: Average Return = 94.46\n",
"Iteration 66: Average Return = 93.72\n",
"Iteration 67: Average Return = 102.79\n",
"Iteration 68: Average Return = 102.95\n",
"Iteration 69: Average Return = 109.52\n",
"Iteration 70: Average Return = 114.67\n",
"Iteration 71: Average Return = 127.23\n",
"Iteration 72: Average Return = 146.82\n",
"Iteration 73: Average Return = 154.05\n",
"Iteration 74: Average Return = 162.69\n",
"Iteration 75: Average Return = 164.27\n",
"Iteration 76: Average Return = 170.13\n",
"Iteration 77: Average Return = 173.03\n",
"Iteration 78: Average Return = 173.41\n",
"Iteration 79: Average Return = 178.39\n",
"Iteration 80: Average Return = 171.67\n",
"Iteration 81: Average Return = 181.47\n",
"Iteration 82: Average Return = 181.88\n",
"Iteration 83: Average Return = 177.43\n",
"Iteration 84: Average Return = 180.1\n",
"Iteration 85: Average Return = 184.74\n",
"Iteration 86: Average Return = 187.58\n",
"Iteration 87: Average Return = 184.49\n",
"Iteration 88: Average Return = 192.28\n",
"Iteration 89: Average Return = 192.14\n",
"Iteration 90: Average Return = 194.14\n",
"Iteration 91: Average Return = 197.31\n",
"Solve at 91 iterations, which equals 9100 episodes.\n"
]
}
],
"source": [
"n_iter = 200\n",
"n_episode = 100\n",
"path_length = 200\n",
"discount_rate = 0.99\n",
"baseline = LinearFeatureBaseline(env.spec)\n",
"#baseline = LinearFeatureBaseline(env.spec)\n",
"baseline = None\n",
"\n",
"po = PolicyOptimizer(env, policy, baseline, n_iter, n_episode, path_length,\n",
" discount_rate)\n",
Expand Down Expand Up @@ -312,8 +441,9 @@
}
],
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "Python 2",
"display_name": "Python [default]",
"language": "python",
"name": "python2"
},
Expand All @@ -327,7 +457,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.11"
"version": "2.7.12"
}
},
"nbformat": 4,
Expand Down
3 changes: 3 additions & 0 deletions Report.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
our homework2 report is on google doc:

https://docs.google.com/document/d/1cX5wAzI7poj7zC1-W2ySSEU10LgU7vxErt86D-0Lvgs/edit
9 changes: 8 additions & 1 deletion policy_gradient/policy.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,13 @@ def __init__(self, in_dim, out_dim, hidden_dim, optimizer, session):
# YOUR CODE HERE >>>>>>
# probs = ???
# <<<<<<<<


#W1 = tf.Variable(tf.zeros([in_dim, hidden_dim]))
#W2 = tf.Variable(tf.zeros([hidden_dim, out_dim]))
#h1_result = tf.tanh(tf.matmul(self._observations, W1))
#probs=tf.nn.softmax(tf.matmul(h1_result,W2))
h1 = tf.contrib.layers.fully_connected(self._observations, hidden_dim, activation_fn=tf.tanh)
probs = tf.contrib.layers.fully_connected(h1, out_dim, activation_fn=tf.nn.softmax)
# --------------------------------------------------
# This operation (variable) is used when choosing action during data sampling phase
# Shape of probs: [1, n_actions]
Expand Down Expand Up @@ -70,6 +76,7 @@ def __init__(self, in_dim, out_dim, hidden_dim, optimizer, session):
"""
# YOUR CODE HERE >>>>>>
# surr_loss = ???
surr_loss = -tf.reduce_mean(log_prob * self._advantages)
# <<<<<<<<

grads_and_vars = self._opt.compute_gradients(surr_loss)
Expand Down
16 changes: 12 additions & 4 deletions policy_gradient/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,15 @@ def flatten_space(space):
Sample solution is about 1~7 lines.
"""

# def discount_cumsum(x, discount_rate):
# YOUR CODE HERE >>>>>>
# return ???
# <<<<<<<<
def discount_cumsum(x, discount_rate):
    """Return the discounted cumulative sum of a reward sequence.

    out[i] = x[i] + discount_rate * x[i+1] + discount_rate**2 * x[i+2] + ...

    Parameters
    ----------
    x : np.ndarray
        1-D array of per-timestep rewards.
    discount_rate : float
        Discount factor (gamma), typically in [0, 1].

    Returns
    -------
    np.ndarray
        Float array, same shape as ``x``, of discounted returns-to-go.
    """
    # `np.zeros(x.shape) + x` makes a float copy regardless of x's dtype,
    # matching the original implementation's dtype promotion.
    out = np.zeros(x.shape) + x
    # Single backward pass: out[i] = x[i] + gamma * out[i+1].
    # Replaces the original O(n^2) double loop (which recomputed the
    # discount power for every (i, j) pair) with an O(n) recurrence.
    for i in range(x.shape[0] - 2, -1, -1):
        out[i] += discount_rate * out[i + 1]
    return out