diff --git a/tutorials/AgileRL/agilerl_dqn_curriculum.py b/tutorials/AgileRL/agilerl_dqn_curriculum.py index a56464c5f..e2f378b15 100644 --- a/tutorials/AgileRL/agilerl_dqn_curriculum.py +++ b/tutorials/AgileRL/agilerl_dqn_curriculum.py @@ -2,6 +2,7 @@ Author: Nick (https://github.com/nicku-a) """ + import copy import os import random @@ -15,7 +16,7 @@ from agilerl.components.replay_buffer import ReplayBuffer from agilerl.hpo.mutation import Mutations from agilerl.hpo.tournament import TournamentSelection -from agilerl.utils.utils import initialPopulation +from agilerl.utils.utils import create_population from tqdm import tqdm, trange from pettingzoo.classic import connect_four_v3 @@ -66,27 +67,25 @@ def fill_replay_buffer(self, memory, opponent): while not (done or truncation): # Player 0's turn p0_action_mask = observation["action_mask"] - p0_state = np.moveaxis(observation["observation"], [-1], [-3]) - p0_state_flipped = np.expand_dims(np.flip(p0_state, 2), 0) - p0_state = np.expand_dims(p0_state, 0) + p0_state, p0_state_flipped = transform_and_flip(observation, player=0) if opponent_first: p0_action = self.env.action_space("player_0").sample(p0_action_mask) else: if self.lesson["warm_up_opponent"] == "random": - p0_action = opponent.getAction( + p0_action = opponent.get_action( p0_action_mask, p1_action, self.lesson["block_vert_coef"] ) else: - p0_action = opponent.getAction(player=0) + p0_action = opponent.get_action(player=0) self.step(p0_action) # Act in environment observation, env_reward, done, truncation, _ = self.last() - p0_next_state = np.moveaxis(observation["observation"], [-1], [-3]) - p0_next_state_flipped = np.expand_dims(np.flip(p0_next_state, 2), 0) - p0_next_state = np.expand_dims(p0_next_state, 0) + p0_next_state, p0_next_state_flipped = transform_and_flip( + observation, player=0 + ) if done or truncation: reward = self.reward(done=True, player=0) - memory.save2memoryVectEnvs( + memory.save_to_memory_vect_envs( np.concatenate( (p0_state, p1_state, p0_state_flipped, p1_state_flipped) ), @@ -110,7 +109,7 @@ def fill_replay_buffer(self, memory, opponent): else: # Play continues if p1_state is not None: reward = self.reward(done=False, player=1) - memory.save2memoryVectEnvs( + memory.save_to_memory_vect_envs( np.concatenate((p1_state, p1_state_flipped)), [p1_action, 6 - p1_action], [reward, reward], @@ -120,31 +119,29 @@ def fill_replay_buffer(self, memory, opponent): # Player 1's turn p1_action_mask = observation["action_mask"] - p1_state = np.moveaxis(observation["observation"], [-1], [-3]) - p1_state[[0, 1], :, :] = p1_state[[0, 1], :, :] - p1_state_flipped = np.expand_dims(np.flip(p1_state, 2), 0) - p1_state = np.expand_dims(p1_state, 0) + p1_state, p1_state_flipped = transform_and_flip( + observation, player=1 + ) if not opponent_first: p1_action = self.env.action_space("player_1").sample( p1_action_mask ) else: if self.lesson["warm_up_opponent"] == "random": - p1_action = opponent.getAction( + p1_action = opponent.get_action( p1_action_mask, p0_action, LESSON["block_vert_coef"] ) else: - p1_action = opponent.getAction(player=1) + p1_action = opponent.get_action(player=1) self.step(p1_action) # Act in environment observation, env_reward, done, truncation, _ = self.last() - p1_next_state = np.moveaxis(observation["observation"], [-1], [-3]) - p1_next_state[[0, 1], :, :] = p1_next_state[[0, 1], :, :] - p1_next_state_flipped = np.expand_dims(np.flip(p1_next_state, 2), 0) - p1_next_state = np.expand_dims(p1_next_state, 0) + p1_next_state, p1_next_state_flipped = 
transform_and_flip( + observation, player=1 + ) if done or truncation: reward = self.reward(done=True, player=1) - memory.save2memoryVectEnvs( + memory.save_to_memory_vect_envs( np.concatenate( (p0_state, p1_state, p0_state_flipped, p1_state_flipped) ), @@ -168,7 +165,7 @@ def fill_replay_buffer(self, memory, opponent): else: # Play continues reward = self.reward(done=False, player=0) - memory.save2memoryVectEnvs( + memory.save_to_memory_vect_envs( np.concatenate((p0_state, p0_state_flipped)), [p0_action, 6 - p0_action], [reward, reward], @@ -323,11 +320,11 @@ def __init__(self, env, difficulty): self.env = env.env self.difficulty = difficulty if self.difficulty == "random": - self.getAction = self.random_opponent + self.get_action = self.random_opponent elif self.difficulty == "weak": - self.getAction = self.weak_rule_based_opponent + self.get_action = self.weak_rule_based_opponent else: - self.getAction = self.strong_rule_based_opponent + self.get_action = self.strong_rule_based_opponent self.num_cols = 7 self.num_rows = 6 self.length = 4 @@ -482,6 +479,25 @@ def outcome(self, action, player, return_length=False): return (True, reward, ended) + ((lengths,) if return_length else ()) +def transform_and_flip(observation, player): + """Transforms and flips observation for input to agent's neural network. + + :param observation: Observation to preprocess + :type observation: dict[str, np.ndarray] + :param player: Player, 0 or 1 + :type player: int + """ + state = observation["observation"] + # Pre-process dimensions for PyTorch (N, C, H, W) + state = np.moveaxis(state, [-1], [-3]) + if player == 1: + # Swap pieces so that the agent always sees the board from the same perspective + state[[0, 1], :, :] = state[[1, 0], :, :] + state_flipped = np.expand_dims(np.flip(state, 2), 0) + state = np.expand_dims(state, 0) + return state, state_flipped + + if __name__ == "__main__": device = torch.device("cuda" if torch.cuda.is_available() else "cpu") print("===== AgileRL Curriculum Learning Demo =====") @@ -549,7 +565,7 @@ def outcome(self, action, player, return_length=False): action_dim = action_dim[0] # Create a population ready for evolutionary hyper-parameter optimisation - pop = initialPopulation( + pop = create_population( INIT_HP["ALGO"], state_dim, action_dim, @@ -563,7 +579,6 @@ def outcome(self, action, player, return_length=False): # Configure the replay buffer field_names = ["state", "action", "reward", "next_state", "done"] memory = ReplayBuffer( - action_dim=action_dim, # Number of agent actions memory_size=INIT_HP["MEMORY_SIZE"], # Max replay buffer size field_names=field_names, # Field names to store in memory device=device, @@ -574,8 +589,8 @@ def outcome(self, action, player, return_length=False): tournament_size=2, # Tournament selection size elitism=True, # Elitism in tournament selection population_size=INIT_HP["POPULATION_SIZE"], # Population size - evo_step=1, - ) # Evaluate using last N fitness scores + eval_loop=1, # Evaluate using last N fitness scores + ) # Instantiate a mutations object (used for HPO) mutations = Mutations( @@ -625,7 +640,7 @@ def outcome(self, action, player, return_length=False): if LESSON["pretrained_path"] is not None: for agent in pop: # Load pretrained checkpoint - agent.loadCheckpoint(LESSON["pretrained_path"]) + agent.load_checkpoint(LESSON["pretrained_path"]) # Reinit optimizer for new task agent.lr = INIT_HP["LR"] agent.optimizer = torch.optim.Adam( @@ -689,7 +704,7 @@ def outcome(self, action, player, return_length=False): for agent in pop: # Loop 
through population for episode in range(episodes_per_epoch): env.reset() # Reset environment at start of episode - observation, env_reward, done, truncation, _ = env.last() + observation, cumulative_reward, done, truncation, _ = env.last() ( p1_state, @@ -718,23 +733,23 @@ def outcome(self, action, player, return_length=False): for idx_step in range(max_steps): # Player 0"s turn p0_action_mask = observation["action_mask"] - p0_state = np.moveaxis(observation["observation"], [-1], [-3]) - p0_state_flipped = np.expand_dims(np.flip(p0_state, 2), 0) - p0_state = np.expand_dims(p0_state, 0) + p0_state, p0_state_flipped = transform_and_flip( + observation, player=0 + ) if opponent_first: if LESSON["opponent"] == "self": - p0_action = opponent.getAction( + p0_action = opponent.get_action( p0_state, 0, p0_action_mask )[0] elif LESSON["opponent"] == "random": - p0_action = opponent.getAction( + p0_action = opponent.get_action( p0_action_mask, p1_action, LESSON["block_vert_coef"] ) else: - p0_action = opponent.getAction(player=0) + p0_action = opponent.get_action(player=0) else: - p0_action = agent.getAction( + p0_action = agent.get_action( p0_state, epsilon, p0_action_mask )[ 0 @@ -742,23 +757,18 @@ def outcome(self, action, player, return_length=False): train_actions_hist[p0_action] += 1 env.step(p0_action) # Act in environment - observation, env_reward, done, truncation, _ = env.last() - p0_next_state = np.moveaxis( - observation["observation"], [-1], [-3] + observation, cumulative_reward, done, truncation, _ = env.last() + p0_next_state, p0_next_state_flipped = transform_and_flip( + observation, player=0 ) - p0_next_state_flipped = np.expand_dims( - np.flip(p0_next_state, 2), 0 - ) - p0_next_state = np.expand_dims(p0_next_state, 0) - if not opponent_first: - score += env_reward + score = cumulative_reward turns += 1 # Check if game is over (Player 0 win) if done or truncation: reward = env.reward(done=True, player=0) - memory.save2memoryVectEnvs( + memory.save_to_memory_vect_envs( np.concatenate( ( p0_state, @@ -787,7 +797,7 @@ def outcome(self, action, player, return_length=False): else: # Play continues if p1_state is not None: reward = env.reward(done=False, player=1) - memory.save2memoryVectEnvs( + memory.save_to_memory_vect_envs( np.concatenate((p1_state, p1_state_flipped)), [p1_action, 6 - p1_action], [reward, reward], @@ -799,29 +809,25 @@ def outcome(self, action, player, return_length=False): # Player 1"s turn p1_action_mask = observation["action_mask"] - p1_state = np.moveaxis( - observation["observation"], [-1], [-3] + p1_state, p1_state_flipped = transform_and_flip( + observation, player=1 ) - # Swap pieces so that the agent always sees the board from the same perspective - p1_state[[0, 1], :, :] = p1_state[[0, 1], :, :] - p1_state_flipped = np.expand_dims(np.flip(p1_state, 2), 0) - p1_state = np.expand_dims(p1_state, 0) if not opponent_first: if LESSON["opponent"] == "self": - p1_action = opponent.getAction( + p1_action = opponent.get_action( p1_state, 0, p1_action_mask )[0] elif LESSON["opponent"] == "random": - p1_action = opponent.getAction( + p1_action = opponent.get_action( p1_action_mask, p0_action, LESSON["block_vert_coef"], ) else: - p1_action = opponent.getAction(player=1) + p1_action = opponent.get_action(player=1) else: - p1_action = agent.getAction( + p1_action = agent.get_action( p1_state, epsilon, p1_action_mask )[ 0 @@ -829,24 +835,25 @@ def outcome(self, action, player, return_length=False): train_actions_hist[p1_action] += 1 env.step(p1_action) # Act in environment 
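# Illustrative sketch of the transform_and_flip contract used in this loop,
# assuming the (6, 7, 2) channels-last board observation that connect_four_v3
# returns for each agent (dummy_obs is a hypothetical stand-in observation):
import numpy as np
from agilerl_dqn_curriculum import transform_and_flip

dummy_obs = {
    "observation": np.zeros((6, 7, 2), dtype=np.int8),
    "action_mask": np.ones(7, dtype=np.int8),
}
state, state_flipped = transform_and_flip(dummy_obs, player=1)
assert state.shape == (1, 2, 6, 7)  # batched, channels-first for PyTorch
assert state_flipped.shape == (1, 2, 6, 7)  # board mirrored along the column axis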
- observation, env_reward, done, truncation, _ = env.last() - p1_next_state = np.moveaxis( - observation["observation"], [-1], [-3] - ) - p1_next_state[[0, 1], :, :] = p1_next_state[[0, 1], :, :] - p1_next_state_flipped = np.expand_dims( - np.flip(p1_next_state, 2), 0 + ( + observation, + cumulative_reward, + done, + truncation, + _, + ) = env.last() + p1_next_state, p1_next_state_flipped = transform_and_flip( + observation, player=1 ) - p1_next_state = np.expand_dims(p1_next_state, 0) if opponent_first: - score += env_reward + score = cumulative_reward turns += 1 # Check if game is over (Player 1 win) if done or truncation: reward = env.reward(done=True, player=1) - memory.save2memoryVectEnvs( + memory.save_to_memory_vect_envs( np.concatenate( ( p0_state, @@ -880,7 +887,7 @@ def outcome(self, action, player, return_length=False): else: # Play continues reward = env.reward(done=False, player=0) - memory.save2memoryVectEnvs( + memory.save_to_memory_vect_envs( np.concatenate((p0_state, p0_state_flipped)), [p0_action, 6 - p0_action], [reward, reward], @@ -935,7 +942,13 @@ def outcome(self, action, player, return_length=False): rewards = [] for i in range(evo_loop): env.reset() # Reset environment at start of episode - observation, reward, done, truncation, _ = env.last() + ( + observation, + cumulative_reward, + done, + truncation, + _, + ) = env.last() player = -1 # Tracker for which player"s turn it is @@ -955,42 +968,52 @@ def outcome(self, action, player, return_length=False): if player < 0: if opponent_first: if LESSON["eval_opponent"] == "random": - action = opponent.getAction(action_mask) + action = opponent.get_action(action_mask) else: - action = opponent.getAction(player=0) + action = opponent.get_action(player=0) else: state = np.moveaxis( observation["observation"], [-1], [-3] ) state = np.expand_dims(state, 0) - action = agent.getAction(state, 0, action_mask)[ + action = agent.get_action( + state, 0, action_mask + )[ 0 ] # Get next action from agent eval_actions_hist[action] += 1 if player > 0: if not opponent_first: if LESSON["eval_opponent"] == "random": - action = opponent.getAction(action_mask) + action = opponent.get_action(action_mask) else: - action = opponent.getAction(player=1) + action = opponent.get_action(player=1) else: state = np.moveaxis( observation["observation"], [-1], [-3] ) - state[[0, 1], :, :] = state[[0, 1], :, :] + state[[0, 1], :, :] = state[[1, 0], :, :] state = np.expand_dims(state, 0) - action = agent.getAction(state, 0, action_mask)[ + action = agent.get_action( + state, 0, action_mask + )[ 0 ] # Get next action from agent eval_actions_hist[action] += 1 env.step(action) # Act in environment - observation, reward, done, truncation, _ = env.last() + ( + observation, + cumulative_reward, + done, + truncation, + _, + ) = env.last() if (player > 0 and opponent_first) or ( player < 0 and not opponent_first ): - score += reward + score = cumulative_reward eval_turns += 1 @@ -1011,23 +1034,23 @@ def outcome(self, action, player, return_length=False): ) pbar.update(0) - # Format action histograms for visualisation - train_actions_hist = [ - freq / sum(train_actions_hist) for freq in train_actions_hist - ] - eval_actions_hist = [ - freq / sum(eval_actions_hist) for freq in eval_actions_hist - ] - train_actions_dict = { - f"train/action_{index}": action - for index, action in enumerate(train_actions_hist) - } - eval_actions_dict = { - f"eval/action_{index}": action - for index, action in enumerate(eval_actions_hist) - } - if wb: + # Format action histograms for 
visualisation + train_actions_hist = [ + freq / sum(train_actions_hist) for freq in train_actions_hist + ] + eval_actions_hist = [ + freq / sum(eval_actions_hist) for freq in eval_actions_hist + ] + train_actions_dict = { + f"train/action_{index}": action + for index, action in enumerate(train_actions_hist) + } + eval_actions_dict = { + f"eval/action_{index}": action + for index, action in enumerate(eval_actions_hist) + } + wandb_dict = { "global_step": total_steps, "train/mean_score": np.mean(agent.scores[-episodes_per_epoch:]), @@ -1053,5 +1076,5 @@ def outcome(self, action, player, return_length=False): # Save the trained agent save_path = LESSON["save_path"] os.makedirs(os.path.dirname(save_path), exist_ok=True) - elite.saveCheckpoint(save_path) + elite.save_checkpoint(save_path) print(f"Elite agent saved to '{save_path}'.") diff --git a/tutorials/AgileRL/agilerl_maddpg.py b/tutorials/AgileRL/agilerl_maddpg.py index 37e193f40..550a4baa3 100644 --- a/tutorials/AgileRL/agilerl_maddpg.py +++ b/tutorials/AgileRL/agilerl_maddpg.py @@ -2,6 +2,7 @@ Authors: Michael (https://github.com/mikepratt1), Nick (https://github.com/nicku-a) """ + import os import numpy as np @@ -10,14 +11,14 @@ from agilerl.components.multi_agent_replay_buffer import MultiAgentReplayBuffer from agilerl.hpo.mutation import Mutations from agilerl.hpo.tournament import TournamentSelection -from agilerl.utils.utils import initialPopulation +from agilerl.utils.utils import create_population +from agilerl.wrappers.pettingzoo_wrappers import PettingZooVectorizationParallelWrapper from tqdm import trange from pettingzoo.atari import space_invaders_v2 if __name__ == "__main__": device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - print("===== AgileRL MADDPG Demo =====") # Define the network configuration NET_CONFIG = { @@ -35,15 +36,21 @@ "ALGO": "MADDPG", # Algorithm # Swap image channels dimension from last to first [H, W, C] -> [C, H, W] "CHANNELS_LAST": True, - "BATCH_SIZE": 8, # Batch size + "BATCH_SIZE": 32, # Batch size + "O_U_NOISE": True, # Ornstein Uhlenbeck action noise + "EXPL_NOISE": 0.1, # Action noise scale + "MEAN_NOISE": 0.0, # Mean action noise + "THETA": 0.15, # Rate of mean reversion in OU noise + "DT": 0.01, # Timestep for OU noise "LR_ACTOR": 0.001, # Actor learning rate - "LR_CRITIC": 0.01, # Critic learning rate + "LR_CRITIC": 0.001, # Critic learning rate "GAMMA": 0.95, # Discount factor - "MEMORY_SIZE": 10000, # Max memory buffer size - "LEARN_STEP": 5, # Learning frequency + "MEMORY_SIZE": 100000, # Max memory buffer size + "LEARN_STEP": 100, # Learning frequency "TAU": 0.01, # For soft update of target parameters } + num_envs = 8 # Define the space invaders environment as a parallel environment env = space_invaders_v2.parallel_env() if INIT_HP["CHANNELS_LAST"]: @@ -53,6 +60,7 @@ env = ss.color_reduction_v0(env, mode="B") env = ss.resize_v1(env, x_size=84, y_size=84) env = ss.frame_stack_v1(env, 4) + env = PettingZooVectorizationParallelWrapper(env, n_envs=num_envs) env.reset() # Configure the multi-agent algo input arguments @@ -84,7 +92,7 @@ INIT_HP["AGENT_IDS"] = env.agents # Create a population ready for evolutionary hyper-parameter optimisation - pop = initialPopulation( + pop = create_population( INIT_HP["ALGO"], state_dim, action_dim, @@ -92,6 +100,7 @@ NET_CONFIG, INIT_HP, population_size=INIT_HP["POPULATION_SIZE"], + num_envs=num_envs, device=device, ) @@ -109,8 +118,8 @@ tournament_size=2, # Tournament selection size elitism=True, # Elitism in tournament selection 
population_size=INIT_HP["POPULATION_SIZE"], # Population size - evo_step=1, - ) # Evaluate using last N fitness scores + eval_loop=1, # Evaluate using last N fitness scores + ) # Instantiate a mutations object (used for HPO) mutations = Mutations( @@ -128,7 +137,7 @@ ], # RL hyperparams selected for mutation mutation_sd=0.1, # Mutation strength # Define search space for each hyperparameter - min_lr=0.0001, + min_lr=0.00001, max_lr=0.01, min_learn_step=1, max_learn_step=120, @@ -141,26 +150,32 @@ ) # Define training loop parameters - max_episodes = 5 # Total episodes (default: 6000) - max_steps = 900 # Maximum steps to take in each episode - epsilon = 1.0 # Starting epsilon value - eps_end = 0.1 # Final epsilon value - eps_decay = 0.995 # Epsilon decay - evo_epochs = 20 # Evolution frequency - evo_loop = 1 # Number of evaluation episodes + max_steps = 4500 # Max steps (default: 2000000) + learning_delay = 500 # Steps before starting learning + evo_steps = 10000 # Evolution frequency + eval_steps = None # Evaluation steps per episode - go until done + eval_loop = 1 # Number of evaluation episodes elite = pop[0] # Assign a placeholder "elite" agent - # Training loop - for idx_epi in trange(max_episodes): + total_steps = 0 + + # TRAINING LOOP + print("Training...") + pbar = trange(max_steps, unit="step") + while np.less([agent.steps[-1] for agent in pop], max_steps).all(): + pop_episode_scores = [] for agent in pop: # Loop through population state, info = env.reset() # Reset environment at start of episode - agent_reward = {agent_id: 0 for agent_id in env.agents} + scores = np.zeros(num_envs) + completed_episode_scores = [] + steps = 0 if INIT_HP["CHANNELS_LAST"]: state = { - agent_id: np.moveaxis(np.expand_dims(s, 0), [-1], [-3]) + agent_id: np.moveaxis(s, [-1], [-3]) for agent_id, s in state.items() } - for _ in range(max_steps): + + for idx_step in range(evo_steps // num_envs): agent_mask = info["agent_mask"] if "agent_mask" in info.keys() else None env_defined_actions = ( info["env_defined_actions"] @@ -169,87 +184,124 @@ ) # Get next action from agent - cont_actions, discrete_action = agent.getAction( - state, epsilon, agent_mask, env_defined_actions + cont_actions, discrete_action = agent.get_action( + states=state, + training=True, + agent_mask=agent_mask, + env_defined_actions=env_defined_actions, ) if agent.discrete_actions: action = discrete_action else: action = cont_actions - next_state, reward, termination, truncation, info = env.step( - action - ) # Act in environment + # Act in environment + next_state, reward, termination, truncation, info = env.step(action) + + scores += np.sum(np.array(list(reward.values())).transpose(), axis=-1) + total_steps += num_envs + steps += num_envs # Image processing if necessary for the environment if INIT_HP["CHANNELS_LAST"]: - state = {agent_id: np.squeeze(s) for agent_id, s in state.items()} next_state = { agent_id: np.moveaxis(ns, [-1], [-3]) for agent_id, ns in next_state.items() } # Save experiences to replay buffer - memory.save2memory(state, cont_actions, reward, next_state, termination) - - # Collect the reward - for agent_id, r in reward.items(): - agent_reward[agent_id] += r + memory.save_to_memory( + state, + cont_actions, + reward, + next_state, + termination, + is_vectorised=True, + ) # Learn according to learning frequency - if (memory.counter % agent.learn_step == 0) and ( - len(memory) >= agent.batch_size + # Handle learn steps > num_envs + if agent.learn_step > num_envs: + learn_step = agent.learn_step // num_envs + if ( + 
idx_step % learn_step == 0 + and len(memory) >= agent.batch_size + and memory.counter > learning_delay + ): + # Sample replay buffer + experiences = memory.sample(agent.batch_size) + # Learn according to agent's RL algorithm + agent.learn(experiences) + # Handle num_envs > learn step; learn multiple times per step in env + elif ( + len(memory) >= agent.batch_size and memory.counter > learning_delay ): - experiences = memory.sample( - agent.batch_size - ) # Sample replay buffer - agent.learn(experiences) # Learn according to agent's RL algorithm + for _ in range(num_envs // agent.learn_step): + # Sample replay buffer + experiences = memory.sample(agent.batch_size) + # Learn according to agent's RL algorithm + agent.learn(experiences) - # Update the state - if INIT_HP["CHANNELS_LAST"]: - next_state = { - agent_id: np.expand_dims(ns, 0) - for agent_id, ns in next_state.items() - } state = next_state - # Stop episode if any agents have terminated - if any(truncation.values()) or any(termination.values()): - break - - # Save the total episode reward - score = sum(agent_reward.values()) - agent.scores.append(score) - - # Update epsilon for exploration - epsilon = max(eps_end, epsilon * eps_decay) - - # Now evolve population if necessary - if (idx_epi + 1) % evo_epochs == 0: - # Evaluate population - fitnesses = [ - agent.test( - env, - swap_channels=INIT_HP["CHANNELS_LAST"], - max_steps=max_steps, - loop=evo_loop, - ) - for agent in pop - ] + # Calculate scores and reset noise for finished episodes + reset_noise_indices = [] + term_array = np.array(list(termination.values())).transpose() + trunc_array = np.array(list(truncation.values())).transpose() + for idx, (d, t) in enumerate(zip(term_array, trunc_array)): + if np.any(d) or np.any(t): + completed_episode_scores.append(scores[idx]) + agent.scores.append(scores[idx]) + scores[idx] = 0 + reset_noise_indices.append(idx) + agent.reset_action_noise(reset_noise_indices) + + pbar.update(evo_steps // len(pop)) - print(f"Episode {idx_epi + 1}/{max_episodes}") - print(f'Fitnesses: {["%.2f" % fitness for fitness in fitnesses]}') - print( - f'100 fitness avgs: {["%.2f" % np.mean(agent.fitness[-100:]) for agent in pop]}' + agent.steps[-1] += steps + pop_episode_scores.append(completed_episode_scores) + + # Evaluate population + fitnesses = [ + agent.test( + env, + swap_channels=INIT_HP["CHANNELS_LAST"], + max_steps=eval_steps, + loop=eval_loop, + ) + for agent in pop + ] + mean_scores = [ + ( + np.mean(episode_scores) + if len(episode_scores) > 0 + else "0 completed episodes" ) + for episode_scores in pop_episode_scores + ] + + print(f"--- Global steps {total_steps} ---") + print(f"Steps {[agent.steps[-1] for agent in pop]}") + print(f"Scores: {mean_scores}") + print(f'Fitnesses: {["%.2f"%fitness for fitness in fitnesses]}') + print( + f'5 fitness avgs: {["%.2f"%np.mean(agent.fitness[-5:]) for agent in pop]}' + ) - # Tournament selection and population mutation - elite, pop = tournament.select(pop) - pop = mutations.mutation(pop) + # Tournament selection and population mutation + elite, pop = tournament.select(pop) + pop = mutations.mutation(pop) + + # Update step counter + for agent in pop: + agent.steps.append(agent.steps[-1]) # Save the trained algorithm path = "./models/MADDPG" filename = "MADDPG_trained_agent.pt" os.makedirs(path, exist_ok=True) save_path = os.path.join(path, filename) - elite.saveCheckpoint(save_path) + elite.save_checkpoint(save_path) + + pbar.close() + env.close() diff --git a/tutorials/AgileRL/agilerl_matd3.py 
b/tutorials/AgileRL/agilerl_matd3.py index cc6ed9009..893a80fe7 100644 --- a/tutorials/AgileRL/agilerl_matd3.py +++ b/tutorials/AgileRL/agilerl_matd3.py @@ -2,6 +2,7 @@ Authors: Michael (https://github.com/mikepratt1), Nickua (https://github.com/nicku-a) """ + import os import numpy as np @@ -9,14 +10,15 @@ from agilerl.components.multi_agent_replay_buffer import MultiAgentReplayBuffer from agilerl.hpo.mutation import Mutations from agilerl.hpo.tournament import TournamentSelection -from agilerl.utils.utils import initialPopulation +from agilerl.utils.utils import create_population +from agilerl.wrappers.pettingzoo_wrappers import PettingZooVectorizationParallelWrapper from tqdm import trange from pettingzoo.mpe import simple_speaker_listener_v4 if __name__ == "__main__": device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - print("===== AgileRL MATD3 Demo =====") + print("===== AgileRL Online Multi-Agent Demo =====") # Define the network configuration NET_CONFIG = { @@ -31,17 +33,24 @@ # Swap image channels dimension from last to first [H, W, C] -> [C, H, W] "CHANNELS_LAST": False, "BATCH_SIZE": 32, # Batch size + "O_U_NOISE": True, # Ornstein Uhlenbeck action noise + "EXPL_NOISE": 0.1, # Action noise scale + "MEAN_NOISE": 0.0, # Mean action noise + "THETA": 0.15, # Rate of mean reversion in OU noise + "DT": 0.01, # Timestep for OU noise "LR_ACTOR": 0.001, # Actor learning rate - "LR_CRITIC": 0.01, # Critic learning rate + "LR_CRITIC": 0.001, # Critic learning rate "GAMMA": 0.95, # Discount factor "MEMORY_SIZE": 100000, # Max memory buffer size - "LEARN_STEP": 5, # Learning frequency + "LEARN_STEP": 100, # Learning frequency "TAU": 0.01, # For soft update of target parameters "POLICY_FREQ": 2, # Policy frequnecy } + num_envs = 8 # Define the simple speaker listener environment as a parallel environment env = simple_speaker_listener_v4.parallel_env(continuous_actions=True) + env = PettingZooVectorizationParallelWrapper(env, n_envs=num_envs) env.reset() # Configure the multi-agent algo input arguments @@ -73,7 +82,7 @@ INIT_HP["AGENT_IDS"] = env.agents # Create a population ready for evolutionary hyper-parameter optimisation - pop = initialPopulation( + pop = create_population( INIT_HP["ALGO"], state_dim, action_dim, @@ -81,6 +90,7 @@ NET_CONFIG, INIT_HP, population_size=INIT_HP["POPULATION_SIZE"], + num_envs=num_envs, device=device, ) @@ -98,8 +108,8 @@ tournament_size=2, # Tournament selection size elitism=True, # Elitism in tournament selection population_size=INIT_HP["POPULATION_SIZE"], # Population size - evo_step=1, - ) # Evaluate using last N fitness scores + eval_loop=1, # Evaluate using last N fitness scores + ) # Instantiate a mutations object (used for HPO) mutations = Mutations( @@ -123,27 +133,32 @@ ) # Define training loop parameters - max_episodes = 500 # Total episodes (default: 6000) - max_steps = 25 # Maximum steps to take in each episode - epsilon = 1.0 # Starting epsilon value - eps_end = 0.1 # Final epsilon value - eps_decay = 0.995 # Epsilon decay - evo_epochs = 20 # Evolution frequency - evo_loop = 1 # Number of evaluation episodes + max_steps = 13000 # Max steps (default: 2000000) + learning_delay = 0 # Steps before starting learning + evo_steps = 1000 # Evolution frequency + eval_steps = None # Evaluation steps per episode - go until done + eval_loop = 1 # Number of evaluation episodes elite = pop[0] # Assign a placeholder "elite" agent - # Training loop - for idx_epi in trange(max_episodes): + total_steps = 0 + + # TRAINING LOOP + 
print("Training...") + pbar = trange(max_steps, unit="step") + while np.less([agent.steps[-1] for agent in pop], max_steps).all(): + pop_episode_scores = [] for agent in pop: # Loop through population state, info = env.reset() # Reset environment at start of episode - agent_reward = {agent_id: 0 for agent_id in env.agents} + scores = np.zeros(num_envs) + completed_episode_scores = [] + steps = 0 if INIT_HP["CHANNELS_LAST"]: state = { - agent_id: np.moveaxis(np.expand_dims(s, 0), [-1], [-3]) + agent_id: np.moveaxis(s, [-1], [-3]) for agent_id, s in state.items() } - for _ in range(max_steps): + for idx_step in range(evo_steps // num_envs): agent_mask = info["agent_mask"] if "agent_mask" in info.keys() else None env_defined_actions = ( info["env_defined_actions"] @@ -152,87 +167,124 @@ ) # Get next action from agent - cont_actions, discrete_action = agent.getAction( - state, epsilon, agent_mask, env_defined_actions + cont_actions, discrete_action = agent.get_action( + states=state, + training=True, + agent_mask=agent_mask, + env_defined_actions=env_defined_actions, ) if agent.discrete_actions: action = discrete_action else: action = cont_actions - next_state, reward, termination, truncation, info = env.step( - action - ) # Act in environment + # Act in environment + next_state, reward, termination, truncation, info = env.step(action) + + scores += np.sum(np.array(list(reward.values())).transpose(), axis=-1) + total_steps += num_envs + steps += num_envs # Image processing if necessary for the environment if INIT_HP["CHANNELS_LAST"]: - state = {agent_id: np.squeeze(s) for agent_id, s in state.items()} next_state = { agent_id: np.moveaxis(ns, [-1], [-3]) for agent_id, ns in next_state.items() } # Save experiences to replay buffer - memory.save2memory(state, cont_actions, reward, next_state, termination) - - # Collect the reward - for agent_id, r in reward.items(): - agent_reward[agent_id] += r + memory.save_to_memory( + state, + cont_actions, + reward, + next_state, + termination, + is_vectorised=True, + ) # Learn according to learning frequency - if (memory.counter % agent.learn_step == 0) and ( - len(memory) >= agent.batch_size + # Handle learn steps > num_envs + if agent.learn_step > num_envs: + learn_step = agent.learn_step // num_envs + if ( + idx_step % learn_step == 0 + and len(memory) >= agent.batch_size + and memory.counter > learning_delay + ): + # Sample replay buffer + experiences = memory.sample(agent.batch_size) + # Learn according to agent's RL algorithm + agent.learn(experiences) + # Handle num_envs > learn step; learn multiple times per step in env + elif ( + len(memory) >= agent.batch_size and memory.counter > learning_delay ): - experiences = memory.sample( - agent.batch_size - ) # Sample replay buffer - agent.learn(experiences) # Learn according to agent's RL algorithm + for _ in range(num_envs // agent.learn_step): + # Sample replay buffer + experiences = memory.sample(agent.batch_size) + # Learn according to agent's RL algorithm + agent.learn(experiences) - # Update the state - if INIT_HP["CHANNELS_LAST"]: - next_state = { - agent_id: np.expand_dims(ns, 0) - for agent_id, ns in next_state.items() - } state = next_state - # Stop episode if any agents have terminated - if any(truncation.values()) or any(termination.values()): - break - - # Save the total episode reward - score = sum(agent_reward.values()) - agent.scores.append(score) - - # Update epsilon for exploration - epsilon = max(eps_end, epsilon * eps_decay) - - # Now evolve population if necessary - if (idx_epi + 1) 
% evo_epochs == 0: - # Evaluate population - fitnesses = [ - agent.test( - env, - swap_channels=INIT_HP["CHANNELS_LAST"], - max_steps=max_steps, - loop=evo_loop, - ) - for agent in pop - ] + # Calculate scores and reset noise for finished episodes + reset_noise_indices = [] + term_array = np.array(list(termination.values())).transpose() + trunc_array = np.array(list(truncation.values())).transpose() + for idx, (d, t) in enumerate(zip(term_array, trunc_array)): + if np.any(d) or np.any(t): + completed_episode_scores.append(scores[idx]) + agent.scores.append(scores[idx]) + scores[idx] = 0 + reset_noise_indices.append(idx) + agent.reset_action_noise(reset_noise_indices) + + pbar.update(evo_steps // len(pop)) + + agent.steps[-1] += steps + pop_episode_scores.append(completed_episode_scores) - print(f"Episode {idx_epi + 1}/{max_episodes}") - print(f'Fitnesses: {["%.2f" % fitness for fitness in fitnesses]}') - print( - f'100 fitness avgs: {["%.2f" % np.mean(agent.fitness[-100:]) for agent in pop]}' + # Evaluate population + fitnesses = [ + agent.test( + env, + swap_channels=INIT_HP["CHANNELS_LAST"], + max_steps=eval_steps, + loop=eval_loop, + ) + for agent in pop + ] + mean_scores = [ + ( + np.mean(episode_scores) + if len(episode_scores) > 0 + else "0 completed episodes" ) + for episode_scores in pop_episode_scores + ] + + print(f"--- Global steps {total_steps} ---") + print(f"Steps {[agent.steps[-1] for agent in pop]}") + print(f"Scores: {mean_scores}") + print(f'Fitnesses: {["%.2f"%fitness for fitness in fitnesses]}') + print( + f'5 fitness avgs: {["%.2f"%np.mean(agent.fitness[-5:]) for agent in pop]}' + ) - # Tournament selection and population mutation - elite, pop = tournament.select(pop) - pop = mutations.mutation(pop) + # Tournament selection and population mutation + elite, pop = tournament.select(pop) + pop = mutations.mutation(pop) + + # Update step counter + for agent in pop: + agent.steps.append(agent.steps[-1]) # Save the trained algorithm path = "./models/MATD3" filename = "MATD3_trained_agent.pt" os.makedirs(path, exist_ok=True) save_path = os.path.join(path, filename) - elite.saveCheckpoint(save_path) + elite.save_checkpoint(save_path) + + pbar.close() + env.close() diff --git a/tutorials/AgileRL/render_agilerl_dqn.py b/tutorials/AgileRL/render_agilerl_dqn.py index f5a2d4b38..67d3ad9cc 100644 --- a/tutorials/AgileRL/render_agilerl_dqn.py +++ b/tutorials/AgileRL/render_agilerl_dqn.py @@ -4,7 +4,7 @@ import numpy as np import torch from agilerl.algorithms.dqn import DQN -from agilerl_dqn_curriculum import Opponent +from agilerl_dqn_curriculum import Opponent, transform_and_flip from PIL import Image, ImageDraw, ImageFont from pettingzoo.classic import connect_four_v3 @@ -68,16 +68,8 @@ def resize_frames(frames, fraction): state_dim = np.zeros(state_dim[0]).flatten().shape action_dim = action_dim[0] - # Instantiate an DQN object - dqn = DQN( - state_dim, - action_dim, - one_hot, - device=device, - ) - - # Load the saved algorithm into the DQN object - dqn.loadCheckpoint(path) + # Load the saved agent + dqn = DQN.load(path, device) for opponent_difficulty in ["random", "weak", "strong", "self"]: # Create opponent @@ -120,38 +112,35 @@ def resize_frames(frames, fraction): for idx_step in range(max_steps): action_mask = observation["action_mask"] if player < 0: - state = np.moveaxis(observation["observation"], [-1], [-3]) - state = np.expand_dims(state, 0) + state, _ = transform_and_flip(observation, player=0) if opponent_first: if opponent_difficulty == "self": - action = 
opponent.getAction( + action = opponent.get_action( state, epsilon=0, action_mask=action_mask )[0] elif opponent_difficulty == "random": - action = opponent.getAction(action_mask) + action = opponent.get_action(action_mask) else: - action = opponent.getAction(player=0) + action = opponent.get_action(player=0) else: - action = dqn.getAction( + action = dqn.get_action( state, epsilon=0, action_mask=action_mask )[ 0 ] # Get next action from agent if player > 0: - state = np.moveaxis(observation["observation"], [-1], [-3]) - state[[0, 1], :, :] = state[[0, 1], :, :] - state = np.expand_dims(state, 0) + state, _ = transform_and_flip(observation, player=1) if not opponent_first: if opponent_difficulty == "self": - action = opponent.getAction( + action = opponent.get_action( state, epsilon=0, action_mask=action_mask )[0] elif opponent_difficulty == "random": - action = opponent.getAction(action_mask) + action = opponent.get_action(action_mask) else: - action = opponent.getAction(player=1) + action = opponent.get_action(player=1) else: - action = dqn.getAction( + action = dqn.get_action( state, epsilon=0, action_mask=action_mask )[ 0 diff --git a/tutorials/AgileRL/render_agilerl_maddpg.py b/tutorials/AgileRL/render_agilerl_maddpg.py index ca47349d5..f862ee56a 100644 --- a/tutorials/AgileRL/render_agilerl_maddpg.py +++ b/tutorials/AgileRL/render_agilerl_maddpg.py @@ -68,22 +68,9 @@ def _label_with_episode_number(frame, episode_num): n_agents = env.num_agents agent_ids = env.agents - # Instantiate an MADDPG object - maddpg = MADDPG( - state_dim, - action_dim, - one_hot, - n_agents, - agent_ids, - max_action, - min_action, - discrete_actions, - device=device, - ) - - # Load the saved algorithm into the MADDPG object + # Load the saved agent path = "./models/MADDPG/MADDPG_trained_agent.pt" - maddpg.loadCheckpoint(path) + maddpg = MADDPG.load(path, device) # Define test loop parameters episodes = 10 # Number of episodes to test agent on @@ -115,9 +102,9 @@ def _label_with_episode_number(frame, episode_num): ) # Get next action from agent - cont_actions, discrete_action = maddpg.getAction( + cont_actions, discrete_action = maddpg.get_action( state, - epsilon=0, + training=False, agent_mask=agent_mask, env_defined_actions=env_defined_actions, ) diff --git a/tutorials/AgileRL/render_agilerl_matd3.py b/tutorials/AgileRL/render_agilerl_matd3.py index efcc610cd..c096d661f 100644 --- a/tutorials/AgileRL/render_agilerl_matd3.py +++ b/tutorials/AgileRL/render_agilerl_matd3.py @@ -55,22 +55,9 @@ def _label_with_episode_number(frame, episode_num): n_agents = env.num_agents agent_ids = env.agents - # Instantiate an MADDPG object - matd3 = MATD3( - state_dim, - action_dim, - one_hot, - n_agents, - agent_ids, - max_action, - min_action, - discrete_actions, - device=device, - ) - - # Load the saved algorithm into the MADDPG object + # Load the saved agent path = "./models/MATD3/MATD3_trained_agent.pt" - matd3.loadCheckpoint(path) + matd3 = MATD3.load(path, device) # Define test loop parameters episodes = 10 # Number of episodes to test agent on @@ -102,9 +89,9 @@ def _label_with_episode_number(frame, episode_num): ) # Get next action from agent - cont_actions, discrete_action = matd3.getAction( + cont_actions, discrete_action = matd3.get_action( state, - epsilon=0, + training=False, agent_mask=agent_mask, env_defined_actions=env_defined_actions, ) diff --git a/tutorials/AgileRL/requirements.txt b/tutorials/AgileRL/requirements.txt index 1262ee83c..0b2144cfc 100644 --- a/tutorials/AgileRL/requirements.txt +++ 
b/tutorials/AgileRL/requirements.txt
@@ -1,4 +1,4 @@
-agilerl==0.1.22; python_version >= '3.9'
+agilerl==1.0.0; python_version >= '3.9'
 pettingzoo[classic,atari,mpe]>=1.23.1
 SuperSuit>=3.9.0
 torch>=2.0.1
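As a quick sanity check of the renamed AgileRL 1.0 API, a minimal sketch that loads a saved curriculum checkpoint and queries it in connect_four_v3, mirroring render_agilerl_dqn.py above; the checkpoint path is a placeholder, so point it at whatever LESSON["save_path"] produced:

import torch
from agilerl.algorithms.dqn import DQN
from agilerl_dqn_curriculum import transform_and_flip
from pettingzoo.classic import connect_four_v3

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
dqn = DQN.load("./models/DQN/checkpoint.pt", device)  # placeholder path

env = connect_four_v3.env()
env.reset()
observation, cumulative_reward, done, truncation, _ = env.last()
state, _ = transform_and_flip(observation, player=0)
# get_action replaces the old getAction; epsilon=0 gives a greedy action
action = dqn.get_action(state, epsilon=0, action_mask=observation["action_mask"])[0]
env.step(action)
env.close()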