# ch02_ten_armed_testbed.py
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

def save_fig(name, ch='ch02'):
    plt.savefig('figures/{}_{}.png'.format(ch, name))
    plt.close()

# --------------------
# Bandit problem and functions
# --------------------
class BaseBandit:
"""
Base class implementation of Section 2.3: The 10-armed Testbed
"""
def __init__(
self,
k_arm=10, # number of arms
eps=0, # explore with prob eps; exploit with prob 1-eps
initial_q=0, # initial action-value estimates
true_q_mean=0 # true reward mean
):
self.k_arm = k_arm
self.possible_actions = np.arange(self.k_arm)
self.eps = eps
self.initial_q = initial_q
self.true_q_mean = true_q_mean
self.reset()
def reset(self):
# p21: for each bandit the action values q_*(a) were selected
# from a normal distribution with 0 mean and variance 1
self.q_true = np.random.randn(self.k_arm) + self.true_q_mean
# initialize 'A simple bandit algorithm' p24
self.q_estimate = np.zeros(self.k_arm) + self.initial_q
self.action_count = np.zeros(self.k_arm)
# record how often the optimal action is selected
self.optimal_action_freq = 0
def act(self):
# explore with prob eps; exploit with prob 1-eps
if np.random.rand() < self.eps: # explore
action = np.random.choice(self.possible_actions)
else:
action = np.argmax(self.q_estimate)
return action
def reward(self, action_idx):
# p21: the actual reward is selected from a normal distribution with mean q_*(A_t) and variance 1
return np.random.randn() + self.q_true[action_idx]
def update_q(self, action, reward):
# simple average update (eq. 2.3)
self.q_estimate[action] += 1/self.action_count[action] * (reward - self.q_estimate[action])
    def step(self):
        # single loop iteration of 'A simple bandit algorithm' p24
        action = self.act()
        reward = self.reward(action)
        self.action_count[action] += 1
        self.update_q(action, reward)
        # online update of the running frequency of optimal-action selection (eq. 2.3);
        # the indicator (action == argmax q_true) must enter the average on every step,
        # not only on steps where the optimal action happens to be chosen
        is_optimal = action == np.argmax(self.q_true)
        self.optimal_action_freq += 1/np.sum(self.action_count) * (is_optimal - self.optimal_action_freq)
        return action, reward
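
# Illustrative sketch (not part of the original module, and never called): drives the
# act -> reward -> update loop that BaseBandit.step() wraps, for a single eps-greedy
# bandit, to make the bookkeeping explicit. The function name is made up for the demo.
def _demo_base_bandit(n_steps=5):
    bandit = BaseBandit(eps=0.1)
    for _ in range(n_steps):
        action = bandit.act()            # eps-greedy selection over current Q estimates
        reward = bandit.reward(action)   # sample R ~ N(q_*(action), 1)
        bandit.action_count[action] += 1
        bandit.update_q(action, reward)  # incremental sample-average update (eq. 2.3)
        print('action={}, reward={:+.3f}'.format(action, reward))
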
class ExponentialAverageBandit(BaseBandit):
"""
Section 2.5 Tracking a Nonstationary Problem
"""
def __init__(
self,
step_size=0.1, # exponential weighted average param
**kwargs
):
super().__init__(**kwargs)
self.step_size = step_size
def update_q(self, action, reward):
# exponential average update (eq. 2.5)
self.q_estimate[action] += self.step_size * (reward - self.q_estimate[action])
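
# Worked expansion of the constant step-size update above (a sketch of the standard
# algebra behind the book's recency-weighted average): unrolling
#     Q_{n+1} = Q_n + alpha * (R_n - Q_n)
# gives
#     Q_{n+1} = (1 - alpha)^n * Q_1 + sum_{i=1..n} alpha * (1 - alpha)^(n-i) * R_i,
# so recent rewards receive exponentially more weight than old ones, which is why a
# constant step_size tracks a nonstationary q_*(a) better than the 1/n sample average.
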
class UCBBandit(BaseBandit):
"""
Implements Section 2.7 Upper Confidence Bound Action Selection and eq. (2.8)
"""
def __init__(
self,
c=2, # controls degree of exploration
**kwargs
):
super().__init__(**kwargs)
self.c = c
def act(self):
if np.random.rand() < self.eps: # explore
action = np.random.choice(self.possible_actions)
else: # exploit (eq. 2.8)
t = np.sum(self.action_count) + 1
q = self.q_estimate + self.c * np.sqrt(np.log(t) / (self.action_count + 1e-6))
action = np.argmax(q)
return action
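
# The exploit branch above is the UCB rule in the code's notation:
#     A_t = argmax_a [ Q_t(a) + c * sqrt( ln(t) / N_t(a) ) ],
# with N_t(a) = action_count[a]. The 1e-6 term is this implementation's guard against
# division by zero for as-yet-untried actions (the book instead treats an action with
# N_t(a) = 0 as a maximizing action).
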
class GradientBandit(BaseBandit):
"""
Implementation of Section 2.8 Gradient Bandit Algorithm
"""
def __init__(
self,
baseline=True, # use average returns as a baseline for gradient calculation
step_size=0.1, # exponential weighted avg param
**kwargs
):
super().__init__(**kwargs)
self.baseline = baseline
self.step_size = step_size
self.average_reward = 0
def act(self):
e = np.exp(self.q_estimate)
self.softmax = e / np.sum(e)
return np.random.choice(self.possible_actions, p=self.softmax)
def update_q(self, action, reward):
# avg rewards serve as a baseline; if reward > baseline then prob(action) is increased
# first do online update of average reward (2.3)
# (note n number of steps == sum of action counts since at each step only one action is chosen
self.average_reward += 1/np.sum(self.action_count) * (reward - self.average_reward)
baseline = self.average_reward if self.baseline else 0
# gradient update (eq 2.10):
mask = np.zeros_like(self.softmax)
mask[action] = 1
self.q_estimate += self.step_size * (reward - baseline) * (mask - self.softmax)
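
# Componentwise, the vectorized update above is the gradient-bandit rule (writing H for
# the preferences stored in q_estimate and pi for self.softmax):
#     H_{t+1}(A_t) = H_t(A_t) + alpha * (R_t - baseline) * (1 - pi_t(A_t))
#     H_{t+1}(a)   = H_t(a)   - alpha * (R_t - baseline) * pi_t(a)      for a != A_t,
# which is exactly what the (mask - self.softmax) term implements.
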
# --------------------
# Evaluate a list of bandit problems
# --------------------
def run_bandits(bandits, n_runs, n_steps):
""" simulates a list of bandit running each for n_teps and then averaging over n_runs """
rewards = np.zeros((len(bandits), n_runs, n_steps))
optimal_action_freqs = np.zeros_like(rewards)
for b, bandit in enumerate(bandits):
for run in tqdm(range(n_runs)):
# runs are independed; so reset bandit
bandit.reset()
for step in range(n_steps):
# step bandit (act -> reward)
action, reward = bandit.step()
# record reward averages and optimal action frequence
rewards[b, run, step] = reward
if action == np.argmax(bandit.q_true):
optimal_action_freqs[b, run, step] = 1
# average across the n_runs
avg_rewards = rewards.mean(axis=1)
avg_optimal_action_freqs = optimal_action_freqs.mean(axis=1)
return avg_rewards, avg_optimal_action_freqs
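
# Illustrative sketch (not part of the original module, and never called): run_bandits can
# also be used directly to compare agents outside the figure-generating functions below.
# The agent choices and run/step counts here are arbitrary examples.
def _demo_run_bandits(runs=50, steps=200):
    agents = [BaseBandit(eps=0.1), UCBBandit(c=2)]
    avg_rewards, avg_optimal = run_bandits(agents, runs, steps)
    print('final avg reward, eps-greedy (eps=0.1): {:.3f}'.format(avg_rewards[0, -1]))
    print('final avg reward, UCB (c=2):            {:.3f}'.format(avg_rewards[1, -1]))
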
# --------------------
# Figure 2.1: An example bandit problem from the 10-armed testbed.
# The true value q*(a) of each of the ten actions was selected according to a normal distribution
# with mean zero and unit variance, and then the actual rewards were selected according to a
# mean q*(a) unit variance normal distribution, as suggested by these gray distributions.
# --------------------
def fig_2_1():
    plt.violinplot(np.random.randn(200, 10) + np.random.randn(10), showmeans=True)
    plt.xticks(np.arange(1, 11), np.arange(1, 11))
    plt.xlabel('Action')
    plt.ylabel('Reward distribution')
    save_fig('fig_2_1')

# --------------------
# Figure 2.2: Average performance of eps-greedy action-value methods on the 10-armed testbed.
# These data are averages over 2000 runs with different bandit problems.
# All methods used sample averages as their action-value estimates.
# --------------------
def fig_2_2(runs=2000, steps=1000, epsilons=[0, 0.01, 0.1]):
    bandits = [BaseBandit(eps=eps) for eps in epsilons]
    avg_rewards, avg_optimal_action_freqs = run_bandits(bandits, runs, steps)

    # plot results
    plt.subplot(2, 1, 1)
    for eps, rewards in zip(epsilons, avg_rewards):
        plt.plot(rewards, label=r'$\epsilon$ = {}'.format(eps), lw=1)
    plt.xlabel('Steps')
    plt.ylabel('Average reward')
    plt.legend()

    plt.subplot(2, 1, 2)
    for eps, optimal_action_freq in zip(epsilons, avg_optimal_action_freqs):
        plt.plot(optimal_action_freq, label=r'$\epsilon$ = {}'.format(eps), lw=1)
    plt.xlabel('Steps')
    plt.ylabel('% Optimal action')
    plt.legend()

    plt.tight_layout()
    save_fig('fig_2_2')

# --------------------
# Figure 2.3: The effect of optimistic initial action-value estimates on the 10-armed testbed.
# Both methods used a constant step-size parameter, step_size = 0.1.
# --------------------
def fig_2_3(runs=2000, steps=1000, epsilons=[0, 0.1], initial_qs=[5, 0]):
    bandits = []
    for eps, initial_q in zip(epsilons, initial_qs):
        bandits.append(ExponentialAverageBandit(eps=eps, initial_q=initial_q, step_size=0.1))
    _, avg_optimal_action_freqs = run_bandits(bandits, runs, steps)

    # plot results
    for i, (eps, initial_q) in enumerate(zip(epsilons, initial_qs)):
        plt.plot(avg_optimal_action_freqs[i], label=r'Q1 = {}, $\epsilon$ = {}'.format(initial_q, eps))
    plt.xlabel('Steps')
    plt.ylabel('% Optimal action')
    plt.legend()
    save_fig('fig_2_3')

# --------------------
# Figure 2.4: Average performance of UCB action selection on the 10-armed testbed.
# As shown, UCB generally performs better than epsilon-greedy action selection,
# except in the first k steps, when it selects randomly among the as-yet-untried actions.
# --------------------
def fig_2_4(runs=2000, steps=1000):
    bandits = [UCBBandit(eps=0, c=2), BaseBandit(eps=0.1)]
    _, avg_optimal_action_freqs = run_bandits(bandits, runs, steps)

    # plot results
    plt.plot(avg_optimal_action_freqs[0], label='UCB c = {}'.format(bandits[0].c))
    plt.plot(avg_optimal_action_freqs[1], label=r'$\epsilon$-greedy $\epsilon$ = {}'.format(bandits[1].eps))
    plt.xlabel('Steps')
    plt.ylabel('% Optimal action')
    plt.legend()
    save_fig('fig_2_4')

# --------------------
# Figure 2.5: Average performance of the gradient bandit algorithm with and without a reward baseline
# on the 10-armed testbed when the q*(a) are chosen to be near +4 rather than near zero.
# --------------------
def fig_2_5(runs=2000, steps=1000):
    bandits = [
        GradientBandit(step_size=0.1, true_q_mean=4, baseline=True),
        GradientBandit(step_size=0.4, true_q_mean=4, baseline=True),
        GradientBandit(step_size=0.1, true_q_mean=4, baseline=False),
        GradientBandit(step_size=0.4, true_q_mean=4, baseline=False)]
    _, avg_optimal_action_freqs = run_bandits(bandits, runs, steps)

    # plot results
    for i, bandit in enumerate(bandits):
        plt.plot(avg_optimal_action_freqs[i],
                 label='step_size = {}, baseline = {}'.format(bandit.step_size, bandit.baseline))
    plt.xlabel('Steps')
    plt.ylabel('% Optimal action')
    plt.legend()
    save_fig('fig_2_5')

if __name__ == '__main__':
    fig_2_1()
    fig_2_2()
    fig_2_3()
    fig_2_4()
    fig_2_5()