# Agent.py
import numpy as np
import torch
from torch.distributions.categorical import Categorical
import torch.optim.lr_scheduler as lr_scheduler

from MyCustomEnv import Actions


# Initialize a layer with Xavier-normal weights (zero mean, sqrt(2 / (fan_in + fan_out)) standard deviation)
# and zero biases.
def layer_init(layer):
    torch.nn.init.xavier_normal_(layer.weight, gain=1.0)
    torch.nn.init.constant_(layer.bias, val=0.0)
    return layer
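
# Example (illustrative, not part of the original module): layer_init(torch.nn.Linear(10, 4))
# returns a layer whose weights are drawn from N(0, 2 / (10 + 4)) and whose biases are all zero.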


# A deep RL agent whose policy is a small feed-forward network.
class Agent(torch.nn.Module):
    def __init__(self, action_space_dim, observation_space_dim, learning_rate, num_hidden_nodes, use_lr_scheduler):
        super().__init__()
        # The number of rows and columns in the array representation of a state
        state_num_rows, state_num_cols = observation_space_dim
        # Number of nodes in the hidden layer
        self.num_hidden_nodes = num_hidden_nodes
        # A function (represented by a neural network) that takes a state as input and
        # outputs, for each possible action, the log-probability of taking that action.
        self.policy_function = torch.nn.Sequential(
            torch.nn.Flatten(start_dim=1, end_dim=-1),
            layer_init(torch.nn.Linear(in_features=state_num_rows * state_num_cols,
                                       out_features=num_hidden_nodes,
                                       dtype=torch.float64)),
            torch.nn.ReLU(),
            layer_init(torch.nn.Linear(in_features=num_hidden_nodes,
                                       out_features=action_space_dim,
                                       dtype=torch.float64)),
            torch.nn.LogSoftmax(dim=1),  # log-softmax instead of softmax for better numerical stability
        )
        # TODO: include a critic function value(state, action)
        for tensor in self.policy_function.parameters():
            tensor.requires_grad_(True)  # record operations on the tensor (parameters require grad by default)
        # Optimizer for training
        self.policy_optimizer = torch.optim.Adam(self.policy_function.parameters(), lr=learning_rate, eps=1e-5)
        self.use_lr_scheduler = use_lr_scheduler
        if self.use_lr_scheduler:
            self.scheduler = lr_scheduler.StepLR(self.policy_optimizer, step_size=2, gamma=0.1)

    # Compute the policy-gradient loss for a mini-batch of states and their reward weights.
    def policy_loss(self, states_batch, weights):
        """
        states_batch: tensor of stacked states from the stored batch, shape (batch_size, window_size, num_features)
        weights: R(tau) reward weights, shape (batch_size, action_space_dim), with the reward placed
                 in the column of the action taken and zeros elsewhere
        """
        # A tensor containing the log-probabilities of each action for each state in the mini-batch
        policies_batch = self.policy_function(states_batch)
        return -(policies_batch * weights).mean()
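
    # Shape sketch for policy_loss above (illustrative): with a batch of T = 2 states and 3 actions,
    #     policies_batch = [[log p0, log p1, log p2],     # (2, 3) log-probabilities
    #                       [log q0, log q1, log q2]]
    #     weights        = [[R_1,   0,   0],              # reward in the column of the
    #                       [0,     0, R_2]]              # action taken, zeros elsewhere
    # so the element-wise product keeps only log pi(a_t | s_t) * R_t, and the negated mean
    # gives a REINFORCE-style policy-gradient loss.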

    # Output an action for the given state
    def predict(self, state):
        # Before passing the state array to the policy function, convert it to a tensor of shape
        # (window_size, num_features): the window size corresponds to the weekdays, and the
        # signal features are our features.
        state = torch.tensor(state, dtype=torch.float64)
        # Convert the array from shape (window_size, num_features) to (1, window_size, num_features)
        state = torch.unsqueeze(state, dim=0)
        # Compute the (stochastic) policy: log-probabilities over actions
        policy = self.policy_function(state)
        # Sample an action from the probability distribution (policy).
        # This ensures exploration instead of always choosing the greedy action.
        # Categorical policies are used in discrete action spaces; the policy network
        # outputs log-probabilities, so they are passed as logits.
        action = Categorical(logits=policy).sample().item()
        prob = torch.exp(policy[0, action])  # probability of the sampled action
        return action, prob

    # Train for one epoch
    def train_one_epoch(self, env, epoch_num):
        # Empty lists for saving mini-batches of observations
        batch_obs = []      # for states
        batch_acts = []     # for actions
        batch_probs = []    # for probabilities of taking the actions we took
        batch_weights = []  # for R(tau) weighting in the policy gradient
        batch_rets = []     # for measuring episode returns
        batch_lens = []     # for measuring episode lengths
        # Reset episode-specific variables
        state = env.reset()  # the first observation comes from the starting distribution
        done = False         # signal from the environment that the episode is over
        ep_rews = []         # rewards accrued throughout the episode
        # Collect experience by executing trades in the environment, using the current policy
        buy_count = 0
        sell_count = 0
        hold_count = 0
        while not done:
            # Save the current state in our mini-batch
            batch_obs.append(state.copy())
            # Output an action
            action, prob = self.predict(state)
            # Take that action, collect a reward, and observe the new state
            state, reward, done, info = env.step(action)
            # Save the action we took, its probability, and the reward we collected
            batch_acts.append(action)
            batch_probs.append(prob)
            # Record the running total reward in the column of the action taken (buy, sell, or hold)
            if action == Actions.Sell.value:
                ep_rews.append([env._total_reward, 0, 0])
                sell_count += 1
            elif action == Actions.Hold.value:
                ep_rews.append([0, env._total_reward, 0])
                hold_count += 1
            elif action == Actions.Buy.value:
                ep_rews.append([0, 0, env._total_reward])
                buy_count += 1
        # if reward > 0:
        #     print("[training epoch {}] info = {}".format(epoch_num, info))
        print(f"[training epoch {epoch_num}] realized gain (total reward) = {np.round(env._total_reward, 2)}")
        # print(f"[training epoch {epoch_num}] buy count = {env.buy_count}, sell count = {env.sell_count}, hold count = {env.hold_count}")
        # source: https://spinningup.openai.com/en/latest/spinningup/rl_intro3.html#implementing-the-simplest-policy-gradient
        # Take a single policy-gradient update step
        self.policy_optimizer.zero_grad()  # set the gradients of all optimized tensors to zero
        states_batch = torch.as_tensor(np.stack(batch_obs), dtype=torch.float64)
        ep_rews = torch.as_tensor(ep_rews, dtype=torch.float64)  # the weight for each logprob(a|s) is R(tau)
        batch_loss = self.policy_loss(states_batch, ep_rews)
        batch_loss.backward()
        self.policy_optimizer.step()
        if self.use_lr_scheduler:
            self.scheduler.step()  # learning-rate scheduler step
        return batch_loss, batch_rets, batch_lens

    # Run training for n_epochs epochs
    def train(self, env, n_epochs):
        for i in range(1, n_epochs + 1):
            self.train_one_epoch(env, i)
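

# ---------------------------------------------------------------------------
# Usage sketch (assumption: a 5-day window with 2 signal features and the three
# Actions Sell/Hold/Buy from MyCustomEnv). The Agent can be built and queried
# without an environment; training requires a MyCustomEnv instance, whose
# construction is not shown here.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    agent = Agent(action_space_dim=3,
                  observation_space_dim=(5, 2),
                  learning_rate=1e-3,
                  num_hidden_nodes=64,
                  use_lr_scheduler=False)
    demo_state = np.zeros((5, 2), dtype=np.float64)  # placeholder state with the same shape as an env observation
    action, prob = agent.predict(demo_state)
    print(f"sampled action = {action}, probability = {prob.item():.4f}")
    # agent.train(env, n_epochs=50)  # env: a MyCustomEnv instance (construction omitted)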