AgileRL updates #1220

Closed
wants to merge 3 commits into from
219 changes: 121 additions & 98 deletions tutorials/AgileRL/agilerl_dqn_curriculum.py

Large diffs are not rendered by default.

204 changes: 128 additions & 76 deletions tutorials/AgileRL/agilerl_maddpg.py
@@ -2,6 +2,7 @@

Authors: Michael (https://github.com/mikepratt1), Nick (https://github.com/nicku-a)
"""

import os

import numpy as np
@@ -10,14 +11,14 @@
from agilerl.components.multi_agent_replay_buffer import MultiAgentReplayBuffer
from agilerl.hpo.mutation import Mutations
from agilerl.hpo.tournament import TournamentSelection
from agilerl.utils.utils import initialPopulation
from agilerl.utils.utils import create_population
from agilerl.wrappers.pettingzoo_wrappers import PettingZooVectorizationParallelWrapper
from tqdm import trange

from pettingzoo.atari import space_invaders_v2

if __name__ == "__main__":
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("===== AgileRL MADDPG Demo =====")

# Define the network configuration
NET_CONFIG = {
@@ -35,15 +36,21 @@
"ALGO": "MADDPG", # Algorithm
# Swap image channels dimension from last to first [H, W, C] -> [C, H, W]
"CHANNELS_LAST": True,
"BATCH_SIZE": 8, # Batch size
"BATCH_SIZE": 32, # Batch size
"O_U_NOISE": True, # Ornstein Uhlenbeck action noise
"EXPL_NOISE": 0.1, # Action noise scale
"MEAN_NOISE": 0.0, # Mean action noise
"THETA": 0.15, # Rate of mean reversion in OU noise
"DT": 0.01, # Timestep for OU noise
"LR_ACTOR": 0.001, # Actor learning rate
"LR_CRITIC": 0.01, # Critic learning rate
"LR_CRITIC": 0.001, # Critic learning rate
"GAMMA": 0.95, # Discount factor
"MEMORY_SIZE": 10000, # Max memory buffer size
"LEARN_STEP": 5, # Learning frequency
"MEMORY_SIZE": 100000, # Max memory buffer size
"LEARN_STEP": 100, # Learning frequency
"TAU": 0.01, # For soft update of target parameters
}
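
For reference, O_U_NOISE, EXPL_NOISE, MEAN_NOISE, THETA and DT above parameterise Ornstein-Uhlenbeck exploration noise. A minimal sketch of the standard OU update these values correspond to (an illustration only, not AgileRL's internal implementation):

import numpy as np

def ou_step(x, mu=0.0, theta=0.15, sigma=0.1, dt=0.01):
    # dx = theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, 1): noise that reverts to mu
    return x + theta * (mu - x) * dt + sigma * np.sqrt(dt) * np.random.standard_normal(x.shape)

noise = np.zeros(6)  # e.g. one noise value per action dimension
for _ in range(5):
    noise = ou_step(noise)  # drifts back towards MEAN_NOISE between calls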

num_envs = 8
# Define the space invaders environment as a parallel environment
env = space_invaders_v2.parallel_env()
if INIT_HP["CHANNELS_LAST"]:
@@ -53,6 +60,7 @@
env = ss.color_reduction_v0(env, mode="B")
env = ss.resize_v1(env, x_size=84, y_size=84)
env = ss.frame_stack_v1(env, 4)
env = PettingZooVectorizationParallelWrapper(env, n_envs=num_envs)
env.reset()
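
Since CHANNELS_LAST is enabled, observations are later moved from [H, W, C] to [C, H, W] before entering the CNN. A quick shape check, assuming the 84x84 resize and 4-frame stack configured above:

import numpy as np

obs = np.zeros((84, 84, 4))           # [H, W, C] after resize + frame stack
obs_first = np.moveaxis(obs, -1, -3)  # -> [C, H, W]
assert obs_first.shape == (4, 84, 84)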

# Configure the multi-agent algo input arguments
@@ -84,14 +92,15 @@
INIT_HP["AGENT_IDS"] = env.agents

# Create a population ready for evolutionary hyper-parameter optimisation
pop = initialPopulation(
pop = create_population(
INIT_HP["ALGO"],
state_dim,
action_dim,
one_hot,
NET_CONFIG,
INIT_HP,
population_size=INIT_HP["POPULATION_SIZE"],
num_envs=num_envs,
device=device,
)

@@ -109,8 +118,8 @@
tournament_size=2, # Tournament selection size
elitism=True, # Elitism in tournament selection
population_size=INIT_HP["POPULATION_SIZE"], # Population size
evo_step=1,
) # Evaluate using last N fitness scores
eval_loop=1, # Evaluate using last N fitness scores
)

# Instantiate a mutations object (used for HPO)
mutations = Mutations(
@@ -128,7 +137,7 @@
], # RL hyperparams selected for mutation
mutation_sd=0.1, # Mutation strength
# Define search space for each hyperparameter
min_lr=0.0001,
min_lr=0.00001,
max_lr=0.01,
min_learn_step=1,
max_learn_step=120,
@@ -141,26 +150,32 @@
)
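
Tournament selection repeatedly samples small groups of agents and keeps the fittest of each group, with elitism guaranteeing the overall best agent survives. A toy illustration of the idea (not the actual TournamentSelection class):

import random

def toy_tournament(population, fitnesses, tournament_size=2, elitism=True):
    # Keep the overall best agent, then fill the rest via small tournaments
    best_idx = max(range(len(population)), key=lambda i: fitnesses[i])
    new_pop = [population[best_idx]] if elitism else []
    while len(new_pop) < len(population):
        contenders = random.sample(range(len(population)), tournament_size)
        winner = max(contenders, key=lambda i: fitnesses[i])
        new_pop.append(population[winner])
    return population[best_idx], new_pop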

# Define training loop parameters
max_episodes = 5 # Total episodes (default: 6000)
max_steps = 900 # Maximum steps to take in each episode
epsilon = 1.0 # Starting epsilon value
eps_end = 0.1 # Final epsilon value
eps_decay = 0.995 # Epsilon decay
evo_epochs = 20 # Evolution frequency
evo_loop = 1 # Number of evaluation episodes
max_steps = 4500 # Max steps (default: 2000000)
learning_delay = 500 # Steps before starting learning
evo_steps = 10000 # Evolution frequency
eval_steps = None # Evaluation steps per episode - go until done
eval_loop = 1 # Number of evaluation episodes
elite = pop[0] # Assign a placeholder "elite" agent

# Training loop
for idx_epi in trange(max_episodes):
total_steps = 0

# TRAINING LOOP
print("Training...")
pbar = trange(max_steps, unit="step")
while np.less([agent.steps[-1] for agent in pop], max_steps).all():
pop_episode_scores = []
for agent in pop: # Loop through population
state, info = env.reset() # Reset environment at start of episode
agent_reward = {agent_id: 0 for agent_id in env.agents}
scores = np.zeros(num_envs)
completed_episode_scores = []
steps = 0
if INIT_HP["CHANNELS_LAST"]:
state = {
agent_id: np.moveaxis(np.expand_dims(s, 0), [-1], [-3])
agent_id: np.moveaxis(s, [-1], [-3])
for agent_id, s in state.items()
}
for _ in range(max_steps):

for idx_step in range(evo_steps // num_envs):
agent_mask = info["agent_mask"] if "agent_mask" in info.keys() else None
env_defined_actions = (
info["env_defined_actions"]
@@ -169,87 +184,124 @@
)

# Get next action from agent
cont_actions, discrete_action = agent.getAction(
state, epsilon, agent_mask, env_defined_actions
cont_actions, discrete_action = agent.get_action(
states=state,
training=True,
agent_mask=agent_mask,
env_defined_actions=env_defined_actions,
)
if agent.discrete_actions:
action = discrete_action
else:
action = cont_actions

next_state, reward, termination, truncation, info = env.step(
action
) # Act in environment
# Act in environment
next_state, reward, termination, truncation, info = env.step(action)

scores += np.sum(np.array(list(reward.values())).transpose(), axis=-1)
total_steps += num_envs
steps += num_envs

# Image processing if necessary for the environment
if INIT_HP["CHANNELS_LAST"]:
state = {agent_id: np.squeeze(s) for agent_id, s in state.items()}
next_state = {
agent_id: np.moveaxis(ns, [-1], [-3])
for agent_id, ns in next_state.items()
}

# Save experiences to replay buffer
memory.save2memory(state, cont_actions, reward, next_state, termination)

# Collect the reward
for agent_id, r in reward.items():
agent_reward[agent_id] += r
memory.save_to_memory(
state,
cont_actions,
reward,
next_state,
termination,
is_vectorised=True,
)

# Learn according to learning frequency
if (memory.counter % agent.learn_step == 0) and (
len(memory) >= agent.batch_size
# Handle learn steps > num_envs
if agent.learn_step > num_envs:
learn_step = agent.learn_step // num_envs
if (
idx_step % learn_step == 0
and len(memory) >= agent.batch_size
and memory.counter > learning_delay
):
# Sample replay buffer
experiences = memory.sample(agent.batch_size)
# Learn according to agent's RL algorithm
agent.learn(experiences)
# Handle num_envs > learn step; learn multiple times per step in env
elif (
len(memory) >= agent.batch_size and memory.counter > learning_delay
):
experiences = memory.sample(
agent.batch_size
) # Sample replay buffer
agent.learn(experiences) # Learn according to agent's RL algorithm
for _ in range(num_envs // agent.learn_step):
# Sample replay buffer
experiences = memory.sample(agent.batch_size)
# Learn according to agent's RL algorithm
agent.learn(experiences)
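# With the values above (LEARN_STEP=100, num_envs=8) the first branch applies:
# learn_step = 100 // 8 = 12, so each agent learns every 12 vectorised steps
# (roughly 96 transitions), once the buffer holds batch_size samples and
# memory.counter exceeds learning_delay.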

# Update the state
if INIT_HP["CHANNELS_LAST"]:
next_state = {
agent_id: np.expand_dims(ns, 0)
for agent_id, ns in next_state.items()
}
state = next_state

# Stop episode if any agents have terminated
if any(truncation.values()) or any(termination.values()):
break

# Save the total episode reward
score = sum(agent_reward.values())
agent.scores.append(score)

# Update epsilon for exploration
epsilon = max(eps_end, epsilon * eps_decay)

# Now evolve population if necessary
if (idx_epi + 1) % evo_epochs == 0:
# Evaluate population
fitnesses = [
agent.test(
env,
swap_channels=INIT_HP["CHANNELS_LAST"],
max_steps=max_steps,
loop=evo_loop,
)
for agent in pop
]
# Calculate scores and reset noise for finished episodes
reset_noise_indices = []
term_array = np.array(list(termination.values())).transpose()
trunc_array = np.array(list(truncation.values())).transpose()
for idx, (d, t) in enumerate(zip(term_array, trunc_array)):
if np.any(d) or np.any(t):
completed_episode_scores.append(scores[idx])
agent.scores.append(scores[idx])
scores[idx] = 0
reset_noise_indices.append(idx)
agent.reset_action_noise(reset_noise_indices)

pbar.update(evo_steps // len(pop))

print(f"Episode {idx_epi + 1}/{max_episodes}")
print(f'Fitnesses: {["%.2f" % fitness for fitness in fitnesses]}')
print(
f'100 fitness avgs: {["%.2f" % np.mean(agent.fitness[-100:]) for agent in pop]}'
agent.steps[-1] += steps
pop_episode_scores.append(completed_episode_scores)

# Evaluate population
fitnesses = [
agent.test(
env,
swap_channels=INIT_HP["CHANNELS_LAST"],
max_steps=eval_steps,
loop=eval_loop,
)
for agent in pop
]
mean_scores = [
(
np.mean(episode_scores)
if len(episode_scores) > 0
else "0 completed episodes"
)
for episode_scores in pop_episode_scores
]

print(f"--- Global steps {total_steps} ---")
print(f"Steps {[agent.steps[-1] for agent in pop]}")
print(f"Scores: {mean_scores}")
print(f'Fitnesses: {["%.2f"%fitness for fitness in fitnesses]}')
print(
f'5 fitness avgs: {["%.2f"%np.mean(agent.fitness[-5:]) for agent in pop]}'
)

# Tournament selection and population mutation
elite, pop = tournament.select(pop)
pop = mutations.mutation(pop)
# Tournament selection and population mutation
elite, pop = tournament.select(pop)
pop = mutations.mutation(pop)

# Update step counter
for agent in pop:
agent.steps.append(agent.steps[-1])

# Save the trained algorithm
path = "./models/MADDPG"
filename = "MADDPG_trained_agent.pt"
os.makedirs(path, exist_ok=True)
save_path = os.path.join(path, filename)
elite.saveCheckpoint(save_path)
elite.save_checkpoint(save_path)

pbar.close()
env.close()
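
To reuse the saved elite later, a round trip along these lines should work; load_checkpoint is assumed here to be the snake_case counterpart of save_checkpoint introduced by this PR, so verify it against the pinned AgileRL version:

# Hypothetical reload sketch (API assumed, not taken from this diff)
restored = pop[0]
restored.load_checkpoint(save_path)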