diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..94a25f7 --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/HW2_Policy_Graident.ipynb b/HW2_Policy_Graident.ipynb index 3b51aa3..d9bdadb 100644 --- a/HW2_Policy_Graident.ipynb +++ b/HW2_Policy_Graident.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": { "collapsed": false }, @@ -26,11 +26,37 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:gym.envs.registration:Making new env: CartPole-v0\n", + "[2016-10-16 18:17:36,278] Making new env: CartPole-v0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[name: \"/cpu:0\"\n", + "device_type: \"CPU\"\n", + "memory_limit: 268435456\n", + "bus_adjacency: BUS_ANY\n", + "incarnation: 5137755054001702941\n", + ", name: \"/gpu:0\"\n", + "device_type: \"GPU\"\n", + "memory_limit: 22279392461\n", + "incarnation: 4594359714894799590\n", + "physical_device_desc: \"device: 0, name: Tesla M40, pci bus id: 0000:04:00.0\"\n", + "]\n" + ] + } + ], "source": [ "import gym\n", "import tensorflow as tf\n", @@ -38,9 +64,13 @@ "from policy_gradient import util\n", "from policy_gradient.policy import CategoricalPolicy\n", "from policy_gradient.baselines.linear_feature_baseline import LinearFeatureBaseline\n", + "import os\n", + "os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"0\"\n", "\n", "np.random.seed(0)\n", "tf.set_random_seed(0)\n", + "from tensorflow.python.client import device_lib\n", + "print device_lib.list_local_devices()\n", "\n", "# CartPole-v0 is a MDP with finite state and action space. \n", "# In this environment, A pendulum is attached by an un-actuated joint to a cart, \n", @@ -95,11 +125,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/gradients.py:90: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.\n", + " \"Converting sparse IndexedSlices to a dense Tensor of unknown shape. 
\"\n" + ] + } + ], "source": [ "sess = tf.Session()\n", "\n", @@ -151,7 +190,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": { "collapsed": true }, @@ -210,26 +249,34 @@ " Sample solution should be only 1 line.\n", " \"\"\"\n", " # YOUR CODE HERE >>>>>>\n", - " # a = ???\n", + " a = r-b\n", " # <<<<<<<<\n", "\n", " p[\"returns\"] = r\n", " p[\"baselines\"] = b\n", + " p[\"advantages_wo_norm\"] = a\n", " p[\"advantages\"] = (a - a.mean()) / (a.std() + 1e-8) # normalize\n", "\n", " obs = np.concatenate([ p[\"observations\"] for p in paths ])\n", " actions = np.concatenate([ p[\"actions\"] for p in paths ])\n", " rewards = np.concatenate([ p[\"rewards\"] for p in paths ])\n", " advantages = np.concatenate([ p[\"advantages\"] for p in paths ])\n", + " \n", + " advantages_wo_norm = np.concatenate([ p[\"advantages_wo_norm\"] for p in paths ])\n", + " returns = np.concatenate([ p[\"returns\"] for p in paths ])\n", "\n", " return dict(\n", " observations=obs,\n", " actions=actions,\n", " rewards=rewards,\n", " advantages=advantages,\n", + " advantages_wo_norm = advantages_wo_norm,\n", + " returns=returns\n", " )\n", "\n", " def train(self):\n", + " avg_return_lst = []\n", + " episode = []\n", " for i in range(1, self.n_iter + 1):\n", " paths = []\n", " for _ in range(self.n_episode):\n", @@ -237,6 +284,7 @@ " data = self.process_paths(paths)\n", " loss = self.policy.train(data[\"observations\"], data[\"actions\"], data[\"advantages\"])\n", " avg_return = np.mean([sum(p[\"rewards\"]) for p in paths])\n", + " avg_return_lst.append(avg_return)\n", " print(\"Iteration {}: Average Return = {}\".format(i, avg_return))\n", " \n", " # CartPole-v0 defines \"solving\" as getting average reward of 195.0 over 100 consecutive trials.\n", @@ -245,28 +293,141 @@ " break\n", "\n", " if self.baseline != None:\n", - " self.baseline.fit(paths)" + " self.baseline.fit(paths)\n", + " episode.append(data)\n", + " \n", + " return avg_return_lst,i,episode" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Iteration 1: Average Return = 11.08\n", + "Iteration 2: Average Return = 11.21\n", + "Iteration 3: Average Return = 11.68\n", + "Iteration 4: Average Return = 11.07\n", + "Iteration 5: Average Return = 11.56\n", + "Iteration 6: Average Return = 12.53\n", + "Iteration 7: Average Return = 13.32\n", + "Iteration 8: Average Return = 13.81\n", + "Iteration 9: Average Return = 14.65\n", + "Iteration 10: Average Return = 15.1\n", + "Iteration 11: Average Return = 16.4\n", + "Iteration 12: Average Return = 17.16\n", + "Iteration 13: Average Return = 18.38\n", + "Iteration 14: Average Return = 21.81\n", + "Iteration 15: Average Return = 23.06\n", + "Iteration 16: Average Return = 22.66\n", + "Iteration 17: Average Return = 27.0\n", + "Iteration 18: Average Return = 29.09\n", + "Iteration 19: Average Return = 30.13\n", + "Iteration 20: Average Return = 30.21\n", + "Iteration 21: Average Return = 33.67\n", + "Iteration 22: Average Return = 34.53\n", + "Iteration 23: Average Return = 33.08\n", + "Iteration 24: Average Return = 34.78\n", + "Iteration 25: Average Return = 35.53\n", + "Iteration 26: Average Return = 33.87\n", + "Iteration 27: Average Return = 37.25\n", + "Iteration 28: Average Return = 35.98\n", + "Iteration 29: Average Return = 35.48\n", + "Iteration 30: Average Return = 32.6\n", + "Iteration 31: Average Return = 
38.6\n", + "Iteration 32: Average Return = 42.62\n", + "Iteration 33: Average Return = 37.86\n", + "Iteration 34: Average Return = 42.13\n", + "Iteration 35: Average Return = 46.49\n", + "Iteration 36: Average Return = 42.17\n", + "Iteration 37: Average Return = 48.06\n", + "Iteration 38: Average Return = 42.23\n", + "Iteration 39: Average Return = 45.62\n", + "Iteration 40: Average Return = 46.04\n", + "Iteration 41: Average Return = 45.86\n", + "Iteration 42: Average Return = 45.93\n", + "Iteration 43: Average Return = 46.37\n", + "Iteration 44: Average Return = 46.42\n", + "Iteration 45: Average Return = 49.61\n", + "Iteration 46: Average Return = 51.0\n", + "Iteration 47: Average Return = 52.25\n", + "Iteration 48: Average Return = 52.78\n", + "Iteration 49: Average Return = 52.69\n", + "Iteration 50: Average Return = 57.27\n", + "Iteration 51: Average Return = 54.55\n", + "Iteration 52: Average Return = 52.3\n", + "Iteration 53: Average Return = 55.47\n", + "Iteration 54: Average Return = 52.21\n", + "Iteration 55: Average Return = 59.62\n", + "Iteration 56: Average Return = 61.68\n", + "Iteration 57: Average Return = 56.72\n", + "Iteration 58: Average Return = 59.8\n", + "Iteration 59: Average Return = 59.21\n", + "Iteration 60: Average Return = 59.79\n", + "Iteration 61: Average Return = 61.55\n", + "Iteration 62: Average Return = 63.13\n", + "Iteration 63: Average Return = 64.52\n", + "Iteration 64: Average Return = 68.87\n", + "Iteration 65: Average Return = 66.39\n", + "Iteration 66: Average Return = 68.56\n", + "Iteration 67: Average Return = 71.38\n", + "Iteration 68: Average Return = 70.99\n", + "Iteration 69: Average Return = 73.13\n", + "Iteration 70: Average Return = 79.29\n", + "Iteration 71: Average Return = 73.96\n", + "Iteration 72: Average Return = 78.36\n", + "Iteration 73: Average Return = 75.7\n", + "Iteration 74: Average Return = 81.86\n", + "Iteration 75: Average Return = 82.47\n", + "Iteration 76: Average Return = 86.06\n", + "Iteration 77: Average Return = 88.11\n", + "Iteration 78: Average Return = 95.86\n", + "Iteration 79: Average Return = 93.99\n", + "Iteration 80: Average Return = 103.44\n", + "Iteration 81: Average Return = 110.93\n", + "Iteration 82: Average Return = 122.16\n", + "Iteration 83: Average Return = 127.28\n", + "Iteration 84: Average Return = 145.37\n", + "Iteration 85: Average Return = 152.45\n", + "Iteration 86: Average Return = 156.75\n", + "Iteration 87: Average Return = 174.1\n", + "Iteration 88: Average Return = 181.2\n", + "Iteration 89: Average Return = 187.34\n", + "Iteration 90: Average Return = 191.77\n", + "Iteration 91: Average Return = 190.04\n", + "Iteration 92: Average Return = 196.01\n", + "Solve at 92 iterations, which equals 9200 episodes.\n" + ] + } + ], "source": [ "n_iter = 200\n", "n_episode = 100\n", "path_length = 200\n", "discount_rate = 0.99\n", "baseline = LinearFeatureBaseline(env.spec)\n", + "#baseline = None\n", + "\n", "\n", "po = PolicyOptimizer(env, policy, baseline, n_iter, n_episode, path_length,\n", " discount_rate)\n", "\n", "# Train the policy optimizer\n", - "po.train()" + "\n", + "#sess.run(tf.initialize_all_variables())\n", + "N=14\n", + "[avg,i,data] = po.train()\n", + "np.savez('without_baseline_{}'.format(N),average_return_list = avg)\n", + "np.savez('substract',data=data)\n", + "\n", + "#for _ in range(10):\n", + " #sess.run(tf.initialize_all_variables())\n" ] }, { @@ -327,7 +488,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", - "version": "2.7.11" + "version": 
"2.7.6" } }, "nbformat": 4, diff --git a/Report.md b/Report.md new file mode 100644 index 0000000..4f22a0b --- /dev/null +++ b/Report.md @@ -0,0 +1,88 @@ +# CEDL homework 2 +### Team member: Juan-ting Lin, Meng-li Shih, Fu-en, Wang + +## Overview: + + In this homework, we will try to solve the classic control problem - CartPole. + + + + CartPole is an environment which contains a pendulum attached by an un-actuated joint to a cart, and the goal is to prevent it from falling over. You can apply a force of +1 or -1 to the cart. A reward of +1 is provided for every timestep that the pendulum remains upright. + +## Implementation: + + In problem 1~4, we implement a simple agent and improve it with mote-carlo sampling and policy-based method + * **Problem 1:** + + In problem 1, we construct a simple 2-layer fully-connected feedforward network to perform the policy prediction: + + ``` + x=tf.contrib.layers.fully_connected(inputs=self._observations,num_outputs=hidden_dim, + weights_initializer=tf.random_normal_initializer(), + biases_initializer=tf.random_normal_initializer(), + activation_fn = tf.tanh) + y=tf.contrib.layers.fully_connected(inputs=x,num_outputs=out_dim, + weights_initializer=tf.random_normal_initializer(), + biases_initializer=tf.random_normal_initializer()) + probs=tf.nn.softmax(y) + ``` + + * **Problem 2:** + + After constructing the policy prediction network, we need to compute the surrogate loss to obtain the policy gradient: + + ``` + surr_loss = tf.reduce_mean(tf.mul(log_prob,self._advantages)) + ``` + + * **Problem 3:** + + Implement a function that computes the accumulated discounted rewards of each timestep t from t to the end of the episode: + + ``` + def discount_cumsum(x, discount_rate): + sum = 0 + acc_sum = [0 for _ in range(len(x))] + for i in range(len(x)-1,-1,-1): + exp = len(x)-i-1 + sum += x[i]*(discount_rate**exp) + acc_sum[i]=sum + + return np.asarray(acc_sum) + ``` + * **Problem 4:** + + Use baseline to reduce the variance of our gradient estimate. By doing so, we can imporove the shortcoming of the mote-carlo method: + + ``` + a = r-b + ``` + +# Discussion: + + In problem 5~6, we need to discuss the effect of having baseline substracted + +* **Problem 5:** + + here we compare the result with and without substracting baseline: + +|Substract baseline|Without substracting baseline| +|---|---| +| | | +| || + + In each case of problem 5, we re-conduct 10 training process to get statistical result. The upper part of the block is the average return plot of 10 experiments. It is quite obvious that the case without baseline substracted has larger variation during different training process. To prove our observation, we plot the standard deviation during the training process in the lower part of the block. The center curve is the mean of 10 training processes and the vertical line is the std ( variance^0.5) of each step. As the plot shows us, if we don't substract the baseline, we'll get larger std during training! + For detailed explanation, we record all the returns and rewards in each single path and calculate its variance. We show the result in the table below + +| |Substract baseline|Without substract baseline| +|---|---|---| +|returns|542.5|587.1| +|advantages|117|587.1| + + + If we compare the return of the 2 cases, there is no significant difference. Nevertheless, if we compare the advantages(a=r-b), the case with baseline substraction has a much lower variance. 
+
+* **Problem 6:**
+
+ In each path, the rewards collected at later timesteps are discounted exponentially, so even after subtracting the baseline from the accumulated returns, the actions taken late in an episode produce much smaller advantages and therefore have less influence on the learning process. Normalizing the advantages (subtracting their mean and dividing by their standard deviation) rescales them so that, at every iteration, roughly half of the actions are encouraged and half are discouraged, which helps keep the magnitude of the policy gradient under control.
+
diff --git a/pic/Draw_result.py b/pic/Draw_result.py
new file mode 100644
index 0000000..c576b13
--- /dev/null
+++ b/pic/Draw_result.py
@@ -0,0 +1,52 @@
+import numpy as np
+import matplotlib.pyplot as plt
+import seaborn
+
+type = "without_baseline"
+result_list = []
+max_len = 0
+for i in range(10):
+    f = np.load('./%s_%d.npz'%(type,i+1))
+    result_list.append(f['average_return_list'])
+    if f['average_return_list'].shape[0]>max_len:
+        max_len = f['average_return_list'].shape[0]
+
+for i in range(10):
+    x_axis = np.arange(1, len(result_list[i])+1)
+    plt.plot(x_axis,result_list[i],label='run %d'%(i+1))
+    plt.legend(bbox_to_anchor=(0.1, 1), loc=1, borderaxespad=0.)
+
+plt.xlabel('Iteration')
+plt.ylabel('Average_return')
+plt.title('10 different training runs')
+plt.show()
+
+# non_zero = np.zeros((max_len),dtype=np.float32)
+# sum_lst = np.zeros((max_len),dtype = np.float32)
+# std_lst = np.zeros((max_len),dtype=np.float32)
+# print non_zero.shape
+# for j in range(max_len):
+#     element = []
+#     for i in range(10):
+#
+#         try:
+#             if result_list[i][j]>0:
+#                 non_zero[j]+=1
+#                 element.append(result_list[i][j])
+#                 sum_lst[j] += result_list[i][j]
+#         except:
+#             pass
+#     element = np.asarray(element)
+#     v = (np.var(element))**0.5
+#     std_lst[j]=v
+# print non_zero
+# print std_lst
+# mean_lst = sum_lst/non_zero
+# x_axis=np.arange(1,max_len+1)
+# #plt.plot(x_axis,mean_lst,'b')
+# plt.errorbar(x_axis,mean_lst,yerr = [std_lst,std_lst])
+# plt.xlabel('Iteration')
+# plt.ylabel('Average_return')
+# plt.title('Average progress plot with std')
+# plt.show()
+
diff --git a/pic/Draw_variance.py b/pic/Draw_variance.py
new file mode 100644
index 0000000..e0b6962
--- /dev/null
+++ b/pic/Draw_variance.py
@@ -0,0 +1,23 @@
+import numpy as np
+import matplotlib.pyplot as plt
+import seaborn
+
+f=np.load("./substract.npz")["data"]
+return_lst=np.array([])
+adv_lst=np.array([])
+
+for i in range(len(f)):
+    return_lst = np.concatenate([return_lst,f[i]["returns"]])
+    print f[i]["returns"].shape
+    adv_lst = np.concatenate([adv_lst,f[i]["advantages_wo_norm"]])
+
+var = np.var(adv_lst)
+plt.plot(adv_lst)
+plt.show()
+print var
+
+#587.128956331
+#116.95866528
+
+#587.128956331
+#542.499453896
diff --git a/pic/README.md b/pic/README.md
new file mode 100644
index 0000000..ee32a64
--- /dev/null
+++ b/pic/README.md
@@ -0,0 +1 @@
+## We put images here
diff --git a/pic/with_baseline_10.png b/pic/with_baseline_10.png
new file mode 100644
index 0000000..f2a4128
Binary files /dev/null and b/pic/with_baseline_10.png differ
diff --git a/pic/with_baseline_std.png b/pic/with_baseline_std.png
new file mode 100644
index 0000000..a0186eb
Binary files /dev/null and b/pic/with_baseline_std.png differ
diff --git a/pic/without_baseline_10.png b/pic/without_baseline_10.png
new file mode 100644
index 0000000..4297a43
Binary files /dev/null and b/pic/without_baseline_10.png differ
diff --git a/pic/without_baseline_std.png b/pic/without_baseline_std.png
new file mode 100644
index 0000000..0c1158e
Binary files /dev/null and b/pic/without_baseline_std.png differ
diff --git a/policy_gradient/policy.py b/policy_gradient/policy.py
index 70b8cdc..0ec9b14 
100644 --- a/policy_gradient/policy.py +++ b/policy_gradient/policy.py @@ -27,7 +27,15 @@ def __init__(self, in_dim, out_dim, hidden_dim, optimizer, session): Sample solution is about 2~4 lines. """ # YOUR CODE HERE >>>>>> - # probs = ??? + x=tf.contrib.layers.fully_connected(inputs=self._observations,num_outputs=hidden_dim, + weights_initializer=tf.random_normal_initializer(), + biases_initializer=tf.random_normal_initializer(), + activation_fn = tf.tanh) + y=tf.contrib.layers.fully_connected(inputs=x,num_outputs=out_dim, + weights_initializer=tf.random_normal_initializer(), + biases_initializer=tf.random_normal_initializer()) + probs=tf.nn.softmax(y) + # <<<<<<<< # -------------------------------------------------- @@ -69,10 +77,10 @@ def __init__(self, in_dim, out_dim, hidden_dim, optimizer, session): Sample solution is about 1~3 lines. """ # YOUR CODE HERE >>>>>> - # surr_loss = ??? + surr_loss = tf.reduce_mean(tf.mul(log_prob,self._advantages)) # <<<<<<<< - grads_and_vars = self._opt.compute_gradients(surr_loss) + grads_and_vars = self._opt.compute_gradients(-surr_loss) train_op = self._opt.apply_gradients(grads_and_vars, name="train_op") # -------------------------------------------------- diff --git a/policy_gradient/util.py b/policy_gradient/util.py index 4c57674..dce6539 100644 --- a/policy_gradient/util.py +++ b/policy_gradient/util.py @@ -3,12 +3,12 @@ from scipy.signal import lfilter def flatten_space(space): - if isinstance(space, Box): - return np.prod(space.shape) - elif isinstance(space, Discrete): - return space.n - else: - raise ValueError("Env must be either Box or Discrete.") + if isinstance(space, Box): + return np.prod(space.shape) + elif isinstance(space, Discrete): + return space.n + else: + raise ValueError("Env must be either Box or Discrete.") """ Problem 3: @@ -19,7 +19,14 @@ def flatten_space(space): Sample solution is about 1~7 lines. """ -# def discount_cumsum(x, discount_rate): +def discount_cumsum(x, discount_rate): # YOUR CODE HERE >>>>>> - # return ??? - # <<<<<<<< \ No newline at end of file + sum = 0 + acc_sum = [0 for _ in range(len(x))] + for i in range(len(x)-1,-1,-1): + exp = len(x)-i-1 + sum += x[i]*(discount_rate**exp) + acc_sum[i]=sum + + return np.asarray(acc_sum) + # <<<<<<<
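Side note on `discount_cumsum`: `util.py` already imports `lfilter` from `scipy.signal`, so the discounted cumulative sum can also be computed without an explicit Python loop. Note that the loop above weights reward `x[i]` by `discount_rate**(len(x)-1-i)`, i.e. it discounts relative to the end of the episode, while the usual definition discounts relative to timestep `t` (`R_t = sum_{k>=t} discount_rate**(k-t) * x[k]`); for CartPole's constant +1 reward the two give identical values, but in general they differ. A minimal sketch of the standard form (the helper name `discount_cumsum_lfilter` is only for illustration and is not part of the commit above):

```python
import numpy as np
from scipy.signal import lfilter

def discount_cumsum_lfilter(x, discount_rate):
    """Discounted cumulative sum: out[t] = sum_{k >= t} discount_rate**(k - t) * x[k]."""
    x = np.asarray(x, dtype=np.float64)
    # Run a first-order IIR filter over the reversed sequence:
    # y[n] = x_rev[n] + discount_rate * y[n-1], then reverse the result back.
    return lfilter([1.0], [1.0, -discount_rate], x[::-1])[::-1]

# Quick check with constant rewards and discount_rate = 0.5: yields [1.75, 1.5, 1.0]
print(discount_cumsum_lfilter([1.0, 1.0, 1.0], 0.5))
```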