From 2fa58e4f08b7c06c0b4f7bcb0192e72bbd634123 Mon Sep 17 00:00:00 2001
From: nicku-a <nickua@btinternet.com>
Date: Mon, 24 Jun 2024 14:42:34 +0100
Subject: [PATCH 1/3] Update AgileRL tutorials

---
 tutorials/AgileRL/agilerl_dqn_curriculum.py | 213 ++++++++++----------
 tutorials/AgileRL/agilerl_maddpg.py         | 210 +++++++++++--------
 tutorials/AgileRL/agilerl_matd3.py          | 204 ++++++++++++-------
 tutorials/AgileRL/render_agilerl_dqn.py     |  41 ++--
 tutorials/AgileRL/render_agilerl_maddpg.py  |  25 +--
 tutorials/AgileRL/render_agilerl_matd3.py   |  25 +--
 6 files changed, 398 insertions(+), 320 deletions(-)

diff --git a/tutorials/AgileRL/agilerl_dqn_curriculum.py b/tutorials/AgileRL/agilerl_dqn_curriculum.py
index a56464c5f..c0d79a511 100644
--- a/tutorials/AgileRL/agilerl_dqn_curriculum.py
+++ b/tutorials/AgileRL/agilerl_dqn_curriculum.py
@@ -2,6 +2,7 @@
 
 Author: Nick (https://github.com/nicku-a)
 """
+
 import copy
 import os
 import random
@@ -12,13 +13,13 @@
 import torch
 import wandb
 import yaml
+from pettingzoo.classic import connect_four_v3
+from tqdm import tqdm, trange
+
 from agilerl.components.replay_buffer import ReplayBuffer
 from agilerl.hpo.mutation import Mutations
 from agilerl.hpo.tournament import TournamentSelection
-from agilerl.utils.utils import initialPopulation
-from tqdm import tqdm, trange
-
-from pettingzoo.classic import connect_four_v3
+from agilerl.utils.utils import create_population
 
 
 class CurriculumEnv:
@@ -66,27 +67,25 @@ def fill_replay_buffer(self, memory, opponent):
             while not (done or truncation):
                 # Player 0's turn
                 p0_action_mask = observation["action_mask"]
-                p0_state = np.moveaxis(observation["observation"], [-1], [-3])
-                p0_state_flipped = np.expand_dims(np.flip(p0_state, 2), 0)
-                p0_state = np.expand_dims(p0_state, 0)
+                p0_state, p0_state_flipped = transform_and_flip(observation, player=0)
                 if opponent_first:
                     p0_action = self.env.action_space("player_0").sample(p0_action_mask)
                 else:
                     if self.lesson["warm_up_opponent"] == "random":
-                        p0_action = opponent.getAction(
+                        p0_action = opponent.get_action(
                             p0_action_mask, p1_action, self.lesson["block_vert_coef"]
                         )
                     else:
-                        p0_action = opponent.getAction(player=0)
+                        p0_action = opponent.get_action(player=0)
                 self.step(p0_action)  # Act in environment
                 observation, env_reward, done, truncation, _ = self.last()
-                p0_next_state = np.moveaxis(observation["observation"], [-1], [-3])
-                p0_next_state_flipped = np.expand_dims(np.flip(p0_next_state, 2), 0)
-                p0_next_state = np.expand_dims(p0_next_state, 0)
+                p0_next_state, p0_next_state_flipped = transform_and_flip(
+                    observation, player=0
+                )
 
                 if done or truncation:
                     reward = self.reward(done=True, player=0)
-                    memory.save2memoryVectEnvs(
+                    memory.save_to_memory_vect_envs(
                         np.concatenate(
                             (p0_state, p1_state, p0_state_flipped, p1_state_flipped)
                         ),
@@ -110,7 +109,7 @@ def fill_replay_buffer(self, memory, opponent):
                 else:  # Play continues
                     if p1_state is not None:
                         reward = self.reward(done=False, player=1)
-                        memory.save2memoryVectEnvs(
+                        memory.save_to_memory_vect_envs(
                             np.concatenate((p1_state, p1_state_flipped)),
                             [p1_action, 6 - p1_action],
                             [reward, reward],
@@ -120,31 +119,29 @@ def fill_replay_buffer(self, memory, opponent):
 
                     # Player 1's turn
                     p1_action_mask = observation["action_mask"]
-                    p1_state = np.moveaxis(observation["observation"], [-1], [-3])
-                    p1_state[[0, 1], :, :] = p1_state[[0, 1], :, :]
-                    p1_state_flipped = np.expand_dims(np.flip(p1_state, 2), 0)
-                    p1_state = np.expand_dims(p1_state, 0)
+                    p1_state, p1_state_flipped = transform_and_flip(
+                        observation, player=1
+                    )
                     if not opponent_first:
                         p1_action = self.env.action_space("player_1").sample(
                             p1_action_mask
                         )
                     else:
                         if self.lesson["warm_up_opponent"] == "random":
-                            p1_action = opponent.getAction(
+                            p1_action = opponent.get_action(
                                 p1_action_mask, p0_action, LESSON["block_vert_coef"]
                             )
                         else:
-                            p1_action = opponent.getAction(player=1)
+                            p1_action = opponent.get_action(player=1)
                     self.step(p1_action)  # Act in environment
                     observation, env_reward, done, truncation, _ = self.last()
-                    p1_next_state = np.moveaxis(observation["observation"], [-1], [-3])
-                    p1_next_state[[0, 1], :, :] = p1_next_state[[0, 1], :, :]
-                    p1_next_state_flipped = np.expand_dims(np.flip(p1_next_state, 2), 0)
-                    p1_next_state = np.expand_dims(p1_next_state, 0)
+                    p1_next_state, p1_next_state_flipped = transform_and_flip(
+                        observation, player=1
+                    )
 
                     if done or truncation:
                         reward = self.reward(done=True, player=1)
-                        memory.save2memoryVectEnvs(
+                        memory.save_to_memory_vect_envs(
                             np.concatenate(
                                 (p0_state, p1_state, p0_state_flipped, p1_state_flipped)
                             ),
@@ -168,7 +165,7 @@ def fill_replay_buffer(self, memory, opponent):
 
                     else:  # Play continues
                         reward = self.reward(done=False, player=0)
-                        memory.save2memoryVectEnvs(
+                        memory.save_to_memory_vect_envs(
                             np.concatenate((p0_state, p0_state_flipped)),
                             [p0_action, 6 - p0_action],
                             [reward, reward],
@@ -323,11 +320,11 @@ def __init__(self, env, difficulty):
         self.env = env.env
         self.difficulty = difficulty
         if self.difficulty == "random":
-            self.getAction = self.random_opponent
+            self.get_action = self.random_opponent
         elif self.difficulty == "weak":
-            self.getAction = self.weak_rule_based_opponent
+            self.get_action = self.weak_rule_based_opponent
         else:
-            self.getAction = self.strong_rule_based_opponent
+            self.get_action = self.strong_rule_based_opponent
         self.num_cols = 7
         self.num_rows = 6
         self.length = 4
@@ -482,6 +479,25 @@ def outcome(self, action, player, return_length=False):
         return (True, reward, ended) + ((lengths,) if return_length else ())
 
 
+def transform_and_flip(observation, player):
+    """Transforms and flips observation for input to agent's neural network.
+
+    :param dict observation: Observation to preprocess; its array is left unmodified
+    :param int player: Player, 0 or 1; for 1 the first two channels are swapped
+    :return: Channels-first state and its mirror along axis 2, each with a batch axis
+    :rtype: tuple[np.ndarray, np.ndarray]
+    """
+    # Pre-process dimensions for PyTorch (N, C, H, W)
+    state = np.moveaxis(observation["observation"], [-1], [-3])
+    if player == 1:
+        # Swap pieces so that the agent always sees the board from the same perspective
+        state = state.copy()  # moveaxis returns a view; keep the caller's array intact
+        state[[0, 1], :, :] = state[[1, 0], :, :]
+    state_flipped = np.expand_dims(np.flip(state, 2), 0)
+    state = np.expand_dims(state, 0)
+    return state, state_flipped
+
+
 if __name__ == "__main__":
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     print("===== AgileRL Curriculum Learning Demo =====")
@@ -549,7 +565,7 @@ def outcome(self, action, player, return_length=False):
         action_dim = action_dim[0]
 
         # Create a population ready for evolutionary hyper-parameter optimisation
-        pop = initialPopulation(
+        pop = create_population(
             INIT_HP["ALGO"],
             state_dim,
             action_dim,
@@ -563,7 +579,6 @@ def outcome(self, action, player, return_length=False):
         # Configure the replay buffer
         field_names = ["state", "action", "reward", "next_state", "done"]
         memory = ReplayBuffer(
-            action_dim=action_dim,  # Number of agent actions
             memory_size=INIT_HP["MEMORY_SIZE"],  # Max replay buffer size
             field_names=field_names,  # Field names to store in memory
             device=device,
@@ -574,8 +589,8 @@ def outcome(self, action, player, return_length=False):
             tournament_size=2,  # Tournament selection size
             elitism=True,  # Elitism in tournament selection
             population_size=INIT_HP["POPULATION_SIZE"],  # Population size
-            evo_step=1,
-        )  # Evaluate using last N fitness scores
+            eval_loop=1,  # Evaluate using last N fitness scores
+        )
 
         # Instantiate a mutations object (used for HPO)
         mutations = Mutations(
@@ -625,7 +640,7 @@ def outcome(self, action, player, return_length=False):
         if LESSON["pretrained_path"] is not None:
             for agent in pop:
                 # Load pretrained checkpoint
-                agent.loadCheckpoint(LESSON["pretrained_path"])
+                agent.load_checkpoint(LESSON["pretrained_path"])
                 # Reinit optimizer for new task
                 agent.lr = INIT_HP["LR"]
                 agent.optimizer = torch.optim.Adam(
@@ -689,7 +704,7 @@ def outcome(self, action, player, return_length=False):
             for agent in pop:  # Loop through population
                 for episode in range(episodes_per_epoch):
                     env.reset()  # Reset environment at start of episode
-                    observation, env_reward, done, truncation, _ = env.last()
+                    observation, cumulative_reward, done, truncation, _ = env.last()
 
                     (
                         p1_state,
@@ -718,23 +733,23 @@ def outcome(self, action, player, return_length=False):
                     for idx_step in range(max_steps):
                         # Player 0"s turn
                         p0_action_mask = observation["action_mask"]
-                        p0_state = np.moveaxis(observation["observation"], [-1], [-3])
-                        p0_state_flipped = np.expand_dims(np.flip(p0_state, 2), 0)
-                        p0_state = np.expand_dims(p0_state, 0)
+                        p0_state, p0_state_flipped = transform_and_flip(
+                            observation, player=0
+                        )
 
                         if opponent_first:
                             if LESSON["opponent"] == "self":
-                                p0_action = opponent.getAction(
+                                p0_action = opponent.get_action(
                                     p0_state, 0, p0_action_mask
                                 )[0]
                             elif LESSON["opponent"] == "random":
-                                p0_action = opponent.getAction(
+                                p0_action = opponent.get_action(
                                     p0_action_mask, p1_action, LESSON["block_vert_coef"]
                                 )
                             else:
-                                p0_action = opponent.getAction(player=0)
+                                p0_action = opponent.get_action(player=0)
                         else:
-                            p0_action = agent.getAction(
+                            p0_action = agent.get_action(
                                 p0_state, epsilon, p0_action_mask
                             )[
                                 0
@@ -742,23 +757,18 @@ def outcome(self, action, player, return_length=False):
                             train_actions_hist[p0_action] += 1
 
                         env.step(p0_action)  # Act in environment
-                        observation, env_reward, done, truncation, _ = env.last()
-                        p0_next_state = np.moveaxis(
-                            observation["observation"], [-1], [-3]
+                        observation, cumulative_reward, done, truncation, _ = env.last()
+                        p0_next_state, p0_next_state_flipped = transform_and_flip(
+                            observation, player=0
                         )
-                        p0_next_state_flipped = np.expand_dims(
-                            np.flip(p0_next_state, 2), 0
-                        )
-                        p0_next_state = np.expand_dims(p0_next_state, 0)
-
                         if not opponent_first:
-                            score += env_reward
+                            score = cumulative_reward
                         turns += 1
 
                         # Check if game is over (Player 0 win)
                         if done or truncation:
                             reward = env.reward(done=True, player=0)
-                            memory.save2memoryVectEnvs(
+                            memory.save_to_memory_vect_envs(
                                 np.concatenate(
                                     (
                                         p0_state,
@@ -787,7 +797,7 @@ def outcome(self, action, player, return_length=False):
                         else:  # Play continues
                             if p1_state is not None:
                                 reward = env.reward(done=False, player=1)
-                                memory.save2memoryVectEnvs(
+                                memory.save_to_memory_vect_envs(
                                     np.concatenate((p1_state, p1_state_flipped)),
                                     [p1_action, 6 - p1_action],
                                     [reward, reward],
@@ -799,29 +809,25 @@ def outcome(self, action, player, return_length=False):
 
                             # Player 1"s turn
                             p1_action_mask = observation["action_mask"]
-                            p1_state = np.moveaxis(
-                                observation["observation"], [-1], [-3]
+                            p1_state, p1_state_flipped = transform_and_flip(
+                                observation, player=1
                             )
-                            # Swap pieces so that the agent always sees the board from the same perspective
-                            p1_state[[0, 1], :, :] = p1_state[[0, 1], :, :]
-                            p1_state_flipped = np.expand_dims(np.flip(p1_state, 2), 0)
-                            p1_state = np.expand_dims(p1_state, 0)
 
                             if not opponent_first:
                                 if LESSON["opponent"] == "self":
-                                    p1_action = opponent.getAction(
+                                    p1_action = opponent.get_action(
                                         p1_state, 0, p1_action_mask
                                     )[0]
                                 elif LESSON["opponent"] == "random":
-                                    p1_action = opponent.getAction(
+                                    p1_action = opponent.get_action(
                                         p1_action_mask,
                                         p0_action,
                                         LESSON["block_vert_coef"],
                                     )
                                 else:
-                                    p1_action = opponent.getAction(player=1)
+                                    p1_action = opponent.get_action(player=1)
                             else:
-                                p1_action = agent.getAction(
+                                p1_action = agent.get_action(
                                     p1_state, epsilon, p1_action_mask
                                 )[
                                     0
@@ -829,24 +835,21 @@ def outcome(self, action, player, return_length=False):
                                 train_actions_hist[p1_action] += 1
 
                             env.step(p1_action)  # Act in environment
-                            observation, env_reward, done, truncation, _ = env.last()
-                            p1_next_state = np.moveaxis(
-                                observation["observation"], [-1], [-3]
+                            observation, cumulative_reward, done, truncation, _ = (
+                                env.last()
                             )
-                            p1_next_state[[0, 1], :, :] = p1_next_state[[0, 1], :, :]
-                            p1_next_state_flipped = np.expand_dims(
-                                np.flip(p1_next_state, 2), 0
+                            p1_next_state, p1_next_state_flipped = transform_and_flip(
+                                observation, player=1
                             )
-                            p1_next_state = np.expand_dims(p1_next_state, 0)
 
                             if opponent_first:
-                                score += env_reward
+                                score = cumulative_reward
                             turns += 1
 
                             # Check if game is over (Player 1 win)
                             if done or truncation:
                                 reward = env.reward(done=True, player=1)
-                                memory.save2memoryVectEnvs(
+                                memory.save_to_memory_vect_envs(
                                     np.concatenate(
                                         (
                                             p0_state,
@@ -880,7 +883,7 @@ def outcome(self, action, player, return_length=False):
 
                             else:  # Play continues
                                 reward = env.reward(done=False, player=0)
-                                memory.save2memoryVectEnvs(
+                                memory.save_to_memory_vect_envs(
                                     np.concatenate((p0_state, p0_state_flipped)),
                                     [p0_action, 6 - p0_action],
                                     [reward, reward],
@@ -935,7 +938,9 @@ def outcome(self, action, player, return_length=False):
                         rewards = []
                         for i in range(evo_loop):
                             env.reset()  # Reset environment at start of episode
-                            observation, reward, done, truncation, _ = env.last()
+                            observation, cumulative_reward, done, truncation, _ = (
+                                env.last()
+                            )
 
                             player = -1  # Tracker for which player"s turn it is
 
@@ -955,42 +960,48 @@ def outcome(self, action, player, return_length=False):
                                 if player < 0:
                                     if opponent_first:
                                         if LESSON["eval_opponent"] == "random":
-                                            action = opponent.getAction(action_mask)
+                                            action = opponent.get_action(action_mask)
                                         else:
-                                            action = opponent.getAction(player=0)
+                                            action = opponent.get_action(player=0)
                                     else:
                                         state = np.moveaxis(
                                             observation["observation"], [-1], [-3]
                                         )
                                         state = np.expand_dims(state, 0)
-                                        action = agent.getAction(state, 0, action_mask)[
+                                        action = agent.get_action(
+                                            state, 0, action_mask
+                                        )[
                                             0
                                         ]  # Get next action from agent
                                         eval_actions_hist[action] += 1
                                 if player > 0:
                                     if not opponent_first:
                                         if LESSON["eval_opponent"] == "random":
-                                            action = opponent.getAction(action_mask)
+                                            action = opponent.get_action(action_mask)
                                         else:
-                                            action = opponent.getAction(player=1)
+                                            action = opponent.get_action(player=1)
                                     else:
                                         state = np.moveaxis(
                                             observation["observation"], [-1], [-3]
                                         )
-                                        state[[0, 1], :, :] = state[[0, 1], :, :]
+                                        state[[0, 1], :, :] = state[[1, 0], :, :]
                                         state = np.expand_dims(state, 0)
-                                        action = agent.getAction(state, 0, action_mask)[
+                                        action = agent.get_action(
+                                            state, 0, action_mask
+                                        )[
                                             0
                                         ]  # Get next action from agent
                                         eval_actions_hist[action] += 1
 
                                 env.step(action)  # Act in environment
-                                observation, reward, done, truncation, _ = env.last()
+                                observation, cumulative_reward, done, truncation, _ = (
+                                    env.last()
+                                )
 
                                 if (player > 0 and opponent_first) or (
                                     player < 0 and not opponent_first
                                 ):
-                                    score += reward
+                                    score = cumulative_reward
 
                                 eval_turns += 1
 
@@ -1010,24 +1021,24 @@ def outcome(self, action, player, return_length=False):
                     f"    Train Mean Score: {np.mean(agent.scores[-episodes_per_epoch:])}   Train Mean Turns: {mean_turns}   Eval Mean Fitness: {np.mean(fitnesses)}   Eval Best Fitness: {np.max(fitnesses)}   Eval Mean Turns: {eval_turns}   Total Steps: {total_steps}"
                 )
                 pbar.update(0)
-
-                # Format action histograms for visualisation
-                train_actions_hist = [
-                    freq / sum(train_actions_hist) for freq in train_actions_hist
-                ]
-                eval_actions_hist = [
-                    freq / sum(eval_actions_hist) for freq in eval_actions_hist
-                ]
-                train_actions_dict = {
-                    f"train/action_{index}": action
-                    for index, action in enumerate(train_actions_hist)
-                }
-                eval_actions_dict = {
-                    f"eval/action_{index}": action
-                    for index, action in enumerate(eval_actions_hist)
-                }
-
+
                 if wb:
+                    # Format action histograms for visualisation
+                    train_actions_hist = [
+                        freq / sum(train_actions_hist) for freq in train_actions_hist
+                    ]
+                    eval_actions_hist = [
+                        freq / sum(eval_actions_hist) for freq in eval_actions_hist
+                    ]
+                    train_actions_dict = {
+                        f"train/action_{index}": action
+                        for index, action in enumerate(train_actions_hist)
+                    }
+                    eval_actions_dict = {
+                        f"eval/action_{index}": action
+                        for index, action in enumerate(eval_actions_hist)
+                    }
+
                     wandb_dict = {
                         "global_step": total_steps,
                         "train/mean_score": np.mean(agent.scores[-episodes_per_epoch:]),
@@ -1053,5 +1064,5 @@ def outcome(self, action, player, return_length=False):
         # Save the trained agent
         save_path = LESSON["save_path"]
         os.makedirs(os.path.dirname(save_path), exist_ok=True)
-        elite.saveCheckpoint(save_path)
+        elite.save_checkpoint(save_path)
         print(f"Elite agent saved to '{save_path}'.")
diff --git a/tutorials/AgileRL/agilerl_maddpg.py b/tutorials/AgileRL/agilerl_maddpg.py
index 37e193f40..b71e09767 100644
--- a/tutorials/AgileRL/agilerl_maddpg.py
+++ b/tutorials/AgileRL/agilerl_maddpg.py
@@ -2,22 +2,23 @@
 
 Authors: Michael (https://github.com/mikepratt1), Nick (https://github.com/nicku-a)
 """
+
 import os
 
 import numpy as np
 import supersuit as ss
 import torch
+from pettingzoo.atari import space_invaders_v2
+from tqdm import trange
+
 from agilerl.components.multi_agent_replay_buffer import MultiAgentReplayBuffer
 from agilerl.hpo.mutation import Mutations
 from agilerl.hpo.tournament import TournamentSelection
-from agilerl.utils.utils import initialPopulation
-from tqdm import trange
-
-from pettingzoo.atari import space_invaders_v2
+from agilerl.utils.utils import create_population
+from agilerl.wrappers.pettingzoo_wrappers import PettingZooVectorizationParallelWrapper
 
 if __name__ == "__main__":
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    print("===== AgileRL MADDPG Demo =====")
 
     # Define the network configuration
     NET_CONFIG = {
@@ -35,15 +36,21 @@
         "ALGO": "MADDPG",  # Algorithm
         # Swap image channels dimension from last to first [H, W, C] -> [C, H, W]
         "CHANNELS_LAST": True,
-        "BATCH_SIZE": 8,  # Batch size
+        "BATCH_SIZE": 32,  # Batch size
+        "O_U_NOISE": True,  # Ornstein Uhlenbeck action noise
+        "EXPL_NOISE": 0.1,  # Action noise scale
+        "MEAN_NOISE": 0.0,  # Mean action noise
+        "THETA": 0.15,  # Rate of mean reversion in OU noise
+        "DT": 0.01,  # Timestep for OU noise
         "LR_ACTOR": 0.001,  # Actor learning rate
-        "LR_CRITIC": 0.01,  # Critic learning rate
+        "LR_CRITIC": 0.001,  # Critic learning rate
         "GAMMA": 0.95,  # Discount factor
-        "MEMORY_SIZE": 10000,  # Max memory buffer size
-        "LEARN_STEP": 5,  # Learning frequency
+        "MEMORY_SIZE": 100000,  # Max memory buffer size
+        "LEARN_STEP": 100,  # Learning frequency
         "TAU": 0.01,  # For soft update of target parameters
     }
 
+    num_envs = 8
     # Define the space invaders environment as a parallel environment
     env = space_invaders_v2.parallel_env()
     if INIT_HP["CHANNELS_LAST"]:
@@ -53,6 +60,7 @@
         env = ss.color_reduction_v0(env, mode="B")
         env = ss.resize_v1(env, x_size=84, y_size=84)
         env = ss.frame_stack_v1(env, 4)
+    env = PettingZooVectorizationParallelWrapper(env, n_envs=num_envs)
     env.reset()
 
     # Configure the multi-agent algo input arguments
@@ -84,7 +92,7 @@
     INIT_HP["AGENT_IDS"] = env.agents
 
     # Create a population ready for evolutionary hyper-parameter optimisation
-    pop = initialPopulation(
+    pop = create_population(
         INIT_HP["ALGO"],
         state_dim,
         action_dim,
@@ -92,6 +100,7 @@
         NET_CONFIG,
         INIT_HP,
         population_size=INIT_HP["POPULATION_SIZE"],
+        num_envs=num_envs,
         device=device,
     )
 
@@ -109,8 +118,8 @@
         tournament_size=2,  # Tournament selection size
         elitism=True,  # Elitism in tournament selection
         population_size=INIT_HP["POPULATION_SIZE"],  # Population size
-        evo_step=1,
-    )  # Evaluate using last N fitness scores
+        eval_loop=1,  # Evaluate using last N fitness scores
+    )
 
     # Instantiate a mutations object (used for HPO)
     mutations = Mutations(
@@ -128,7 +137,7 @@
         ],  # RL hyperparams selected for mutation
         mutation_sd=0.1,  # Mutation strength
         # Define search space for each hyperparameter
-        min_lr=0.0001,
+        min_lr=0.00001,
         max_lr=0.01,
         min_learn_step=1,
         max_learn_step=120,
@@ -141,26 +150,32 @@
     )
 
     # Define training loop parameters
-    max_episodes = 5  # Total episodes (default: 6000)
-    max_steps = 900  # Maximum steps to take in each episode
-    epsilon = 1.0  # Starting epsilon value
-    eps_end = 0.1  # Final epsilon value
-    eps_decay = 0.995  # Epsilon decay
-    evo_epochs = 20  # Evolution frequency
-    evo_loop = 1  # Number of evaluation episodes
+    max_steps = 4500  # Max steps (default: 2000000)
+    learning_delay = 500  # Steps before starting learning
+    evo_steps = 10000  # Evolution frequency
+    eval_steps = None  # Evaluation steps per episode - go until done
+    eval_loop = 1  # Number of evaluation episodes
     elite = pop[0]  # Assign a placeholder "elite" agent
 
-    # Training loop
-    for idx_epi in trange(max_episodes):
+    total_steps = 0
+
+    # TRAINING LOOP
+    print("Training...")
+    pbar = trange(max_steps, unit="step")
+    while np.less([agent.steps[-1] for agent in pop], max_steps).all():
+        pop_episode_scores = []
         for agent in pop:  # Loop through population
             state, info = env.reset()  # Reset environment at start of episode
-            agent_reward = {agent_id: 0 for agent_id in env.agents}
+            scores = np.zeros(num_envs)
+            completed_episode_scores = []
+            steps = 0
             if INIT_HP["CHANNELS_LAST"]:
                 state = {
-                    agent_id: np.moveaxis(np.expand_dims(s, 0), [-1], [-3])
+                    agent_id: np.moveaxis(s, [-1], [-3])
                     for agent_id, s in state.items()
                 }
-            for _ in range(max_steps):
+
+            for idx_step in range(evo_steps // num_envs):
                 agent_mask = info["agent_mask"] if "agent_mask" in info.keys() else None
                 env_defined_actions = (
                     info["env_defined_actions"]
@@ -169,87 +184,124 @@
                 )
 
                 # Get next action from agent
-                cont_actions, discrete_action = agent.getAction(
-                    state, epsilon, agent_mask, env_defined_actions
+                cont_actions, discrete_action = agent.get_action(
+                    states=state,
+                    training=True,
+                    agent_mask=agent_mask,
+                    env_defined_actions=env_defined_actions,
                 )
                 if agent.discrete_actions:
                     action = discrete_action
                 else:
                     action = cont_actions
 
-                next_state, reward, termination, truncation, info = env.step(
-                    action
-                )  # Act in environment
+                # Act in environment
+                next_state, reward, termination, truncation, info = env.step(action)
+
+                scores += np.sum(np.array(list(reward.values())).transpose(), axis=-1)
+                total_steps += num_envs
+                steps += num_envs
 
                 # Image processing if necessary for the environment
                 if INIT_HP["CHANNELS_LAST"]:
-                    state = {agent_id: np.squeeze(s) for agent_id, s in state.items()}
                     next_state = {
                         agent_id: np.moveaxis(ns, [-1], [-3])
                         for agent_id, ns in next_state.items()
                     }
 
                 # Save experiences to replay buffer
-                memory.save2memory(state, cont_actions, reward, next_state, termination)
-
-                # Collect the reward
-                for agent_id, r in reward.items():
-                    agent_reward[agent_id] += r
+                memory.save_to_memory(
+                    state,
+                    cont_actions,
+                    reward,
+                    next_state,
+                    termination,
+                    is_vectorised=True,
+                )
 
                 # Learn according to learning frequency
-                if (memory.counter % agent.learn_step == 0) and (
-                    len(memory) >= agent.batch_size
+                # Handle learn steps > num_envs
+                if agent.learn_step > num_envs:
+                    learn_step = agent.learn_step // num_envs
+                    if (
+                        idx_step % learn_step == 0
+                        and len(memory) >= agent.batch_size
+                        and memory.counter > learning_delay
+                    ):
+                        # Sample replay buffer
+                        experiences = memory.sample(agent.batch_size)
+                        # Learn according to agent's RL algorithm
+                        agent.learn(experiences)
+                # Handle num_envs > learn step; learn multiple times per step in env
+                elif (
+                    len(memory) >= agent.batch_size and memory.counter > learning_delay
                 ):
-                    experiences = memory.sample(
-                        agent.batch_size
-                    )  # Sample replay buffer
-                    agent.learn(experiences)  # Learn according to agent's RL algorithm
+                    for _ in range(num_envs // agent.learn_step):
+                        # Sample replay buffer
+                        experiences = memory.sample(agent.batch_size)
+                        # Learn according to agent's RL algorithm
+                        agent.learn(experiences)
 
-                # Update the state
-                if INIT_HP["CHANNELS_LAST"]:
-                    next_state = {
-                        agent_id: np.expand_dims(ns, 0)
-                        for agent_id, ns in next_state.items()
-                    }
                 state = next_state
 
-                # Stop episode if any agents have terminated
-                if any(truncation.values()) or any(termination.values()):
-                    break
-
-            # Save the total episode reward
-            score = sum(agent_reward.values())
-            agent.scores.append(score)
-
-        # Update epsilon for exploration
-        epsilon = max(eps_end, epsilon * eps_decay)
-
-        # Now evolve population if necessary
-        if (idx_epi + 1) % evo_epochs == 0:
-            # Evaluate population
-            fitnesses = [
-                agent.test(
-                    env,
-                    swap_channels=INIT_HP["CHANNELS_LAST"],
-                    max_steps=max_steps,
-                    loop=evo_loop,
-                )
-                for agent in pop
-            ]
+                # Calculate scores and reset noise for finished episodes
+                reset_noise_indices = []
+                term_array = np.array(list(termination.values())).transpose()
+                trunc_array = np.array(list(truncation.values())).transpose()
+                for idx, (d, t) in enumerate(zip(term_array, trunc_array)):
+                    if np.any(d) or np.any(t):
+                        completed_episode_scores.append(scores[idx])
+                        agent.scores.append(scores[idx])
+                        scores[idx] = 0
+                        reset_noise_indices.append(idx)
+                agent.reset_action_noise(reset_noise_indices)
+
+            pbar.update(evo_steps // len(pop))
+
+            agent.steps[-1] += steps
+            pop_episode_scores.append(completed_episode_scores)
 
-            print(f"Episode {idx_epi + 1}/{max_episodes}")
-            print(f'Fitnesses: {["%.2f" % fitness for fitness in fitnesses]}')
-            print(
-                f'100 fitness avgs: {["%.2f" % np.mean(agent.fitness[-100:]) for agent in pop]}'
+        # Evaluate population
+        fitnesses = [
+            agent.test(
+                env,
+                swap_channels=INIT_HP["CHANNELS_LAST"],
+                max_steps=eval_steps,
+                loop=eval_loop,
             )
+            for agent in pop
+        ]
+        mean_scores = [
+            (
+                np.mean(episode_scores)
+                if len(episode_scores) > 0
+                else "0 completed episodes"
+            )
+            for episode_scores in pop_episode_scores
+        ]
 
-            # Tournament selection and population mutation
-            elite, pop = tournament.select(pop)
-            pop = mutations.mutation(pop)
+        print(f"--- Global steps {total_steps} ---")
+        print(f"Steps {[agent.steps[-1] for agent in pop]}")
+        print(f"Scores: {mean_scores}")
+        print(f'Fitnesses: {["%.2f"%fitness for fitness in fitnesses]}')
+        print(
+            f'5 fitness avgs: {["%.2f"%np.mean(agent.fitness[-5:]) for agent in pop]}'
+        )
+
+        # Tournament selection and population mutation
+        elite, pop = tournament.select(pop)
+        pop = mutations.mutation(pop)
+
+        # Update step counter
+        for agent in pop:
+            agent.steps.append(agent.steps[-1])
 
     # Save the trained algorithm
     path = "./models/MADDPG"
     filename = "MADDPG_trained_agent.pt"
     os.makedirs(path, exist_ok=True)
     save_path = os.path.join(path, filename)
-    elite.saveCheckpoint(save_path)
+    elite.save_checkpoint(save_path)
+
+    pbar.close()
+    env.close()
diff --git a/tutorials/AgileRL/agilerl_matd3.py b/tutorials/AgileRL/agilerl_matd3.py
index cc6ed9009..0791995ce 100644
--- a/tutorials/AgileRL/agilerl_matd3.py
+++ b/tutorials/AgileRL/agilerl_matd3.py
@@ -2,21 +2,23 @@
 
 Authors: Michael (https://github.com/mikepratt1), Nickua (https://github.com/nicku-a)
 """
+
 import os
 
 import numpy as np
 import torch
+from pettingzoo.mpe import simple_speaker_listener_v4
+from tqdm import trange
+
 from agilerl.components.multi_agent_replay_buffer import MultiAgentReplayBuffer
 from agilerl.hpo.mutation import Mutations
 from agilerl.hpo.tournament import TournamentSelection
-from agilerl.utils.utils import initialPopulation
-from tqdm import trange
-
-from pettingzoo.mpe import simple_speaker_listener_v4
+from agilerl.utils.utils import create_population
+from agilerl.wrappers.pettingzoo_wrappers import PettingZooVectorizationParallelWrapper
 
 if __name__ == "__main__":
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    print("===== AgileRL MATD3 Demo =====")
+    print("===== AgileRL Online Multi-Agent Demo =====")
 
     # Define the network configuration
     NET_CONFIG = {
@@ -31,17 +33,24 @@
         # Swap image channels dimension from last to first [H, W, C] -> [C, H, W]
         "CHANNELS_LAST": False,
         "BATCH_SIZE": 32,  # Batch size
+        "O_U_NOISE": True,  # Ornstein Uhlenbeck action noise
+        "EXPL_NOISE": 0.1,  # Action noise scale
+        "MEAN_NOISE": 0.0,  # Mean action noise
+        "THETA": 0.15,  # Rate of mean reversion in OU noise
+        "DT": 0.01,  # Timestep for OU noise
         "LR_ACTOR": 0.001,  # Actor learning rate
-        "LR_CRITIC": 0.01,  # Critic learning rate
+        "LR_CRITIC": 0.001,  # Critic learning rate
         "GAMMA": 0.95,  # Discount factor
         "MEMORY_SIZE": 100000,  # Max memory buffer size
-        "LEARN_STEP": 5,  # Learning frequency
+        "LEARN_STEP": 100,  # Learning frequency
         "TAU": 0.01,  # For soft update of target parameters
-        "POLICY_FREQ": 2,  # Policy frequnecy
+        "POLICY_FREQ": 2,  # Policy frequency
     }
 
+    num_envs = 8
     # Define the simple speaker listener environment as a parallel environment
     env = simple_speaker_listener_v4.parallel_env(continuous_actions=True)
+    env = PettingZooVectorizationParallelWrapper(env, n_envs=num_envs)
     env.reset()
 
     # Configure the multi-agent algo input arguments
@@ -73,7 +82,7 @@
     INIT_HP["AGENT_IDS"] = env.agents
 
     # Create a population ready for evolutionary hyper-parameter optimisation
-    pop = initialPopulation(
+    pop = create_population(
         INIT_HP["ALGO"],
         state_dim,
         action_dim,
@@ -81,6 +90,7 @@
         NET_CONFIG,
         INIT_HP,
         population_size=INIT_HP["POPULATION_SIZE"],
+        num_envs=num_envs,
         device=device,
     )
 
@@ -98,8 +108,8 @@
         tournament_size=2,  # Tournament selection size
         elitism=True,  # Elitism in tournament selection
         population_size=INIT_HP["POPULATION_SIZE"],  # Population size
-        evo_step=1,
-    )  # Evaluate using last N fitness scores
+        eval_loop=1,  # Evaluate using last N fitness scores
+    )
 
     # Instantiate a mutations object (used for HPO)
     mutations = Mutations(
@@ -123,27 +133,32 @@
     )
 
     # Define training loop parameters
-    max_episodes = 500  # Total episodes (default: 6000)
-    max_steps = 25  # Maximum steps to take in each episode
-    epsilon = 1.0  # Starting epsilon value
-    eps_end = 0.1  # Final epsilon value
-    eps_decay = 0.995  # Epsilon decay
-    evo_epochs = 20  # Evolution frequency
-    evo_loop = 1  # Number of evaluation episodes
+    max_steps = 13000  # Max steps (default: 2000000)
+    learning_delay = 0  # Steps before starting learning
+    evo_steps = 1000  # Evolution frequency: env steps per agent between evolutions
+    eval_steps = None  # Evaluation steps per episode - go until done
+    eval_loop = 1  # Number of evaluation episodes
     elite = pop[0]  # Assign a placeholder "elite" agent
 
-    # Training loop
-    for idx_epi in trange(max_episodes):
+    total_steps = 0
+
+    # TRAINING LOOP
+    print("Training...")
+    pbar = trange(max_steps, unit="step")
+    while np.less([agent.steps[-1] for agent in pop], max_steps).all():
+        pop_episode_scores = []
         for agent in pop:  # Loop through population
             state, info = env.reset()  # Reset environment at start of episode
-            agent_reward = {agent_id: 0 for agent_id in env.agents}
+            scores = np.zeros(num_envs)
+            completed_episode_scores = []
+            steps = 0
             if INIT_HP["CHANNELS_LAST"]:
                 state = {
-                    agent_id: np.moveaxis(np.expand_dims(s, 0), [-1], [-3])
+                    agent_id: np.moveaxis(s, [-1], [-3])
                     for agent_id, s in state.items()
                 }
 
-            for _ in range(max_steps):
+            for idx_step in range(evo_steps // num_envs):
                 agent_mask = info["agent_mask"] if "agent_mask" in info.keys() else None
                 env_defined_actions = (
                     info["env_defined_actions"]
@@ -152,87 +167,124 @@
                 )
 
                 # Get next action from agent
-                cont_actions, discrete_action = agent.getAction(
-                    state, epsilon, agent_mask, env_defined_actions
+                cont_actions, discrete_action = agent.get_action(
+                    states=state,
+                    training=True,
+                    agent_mask=agent_mask,
+                    env_defined_actions=env_defined_actions,
                 )
                 if agent.discrete_actions:
                     action = discrete_action
                 else:
                     action = cont_actions
 
-                next_state, reward, termination, truncation, info = env.step(
-                    action
-                )  # Act in environment
+                # Act in environment
+                next_state, reward, termination, truncation, info = env.step(action)
+
+                scores += np.sum(np.array(list(reward.values())).transpose(), axis=-1)
+                total_steps += num_envs
+                steps += num_envs
 
                 # Image processing if necessary for the environment
                 if INIT_HP["CHANNELS_LAST"]:
-                    state = {agent_id: np.squeeze(s) for agent_id, s in state.items()}
                     next_state = {
                         agent_id: np.moveaxis(ns, [-1], [-3])
                         for agent_id, ns in next_state.items()
                     }
 
                 # Save experiences to replay buffer
-                memory.save2memory(state, cont_actions, reward, next_state, termination)
-
-                # Collect the reward
-                for agent_id, r in reward.items():
-                    agent_reward[agent_id] += r
+                memory.save_to_memory(
+                    state,
+                    cont_actions,
+                    reward,
+                    next_state,
+                    termination,
+                    is_vectorised=True,
+                )
 
                 # Learn according to learning frequency
-                if (memory.counter % agent.learn_step == 0) and (
-                    len(memory) >= agent.batch_size
+                # Handle learn steps > num_envs
+                if agent.learn_step > num_envs:
+                    learn_step = agent.learn_step // num_envs
+                    if (
+                        idx_step % learn_step == 0
+                        and len(memory) >= agent.batch_size
+                        and memory.counter > learning_delay
+                    ):
+                        # Sample replay buffer
+                        experiences = memory.sample(agent.batch_size)
+                        # Learn according to agent's RL algorithm
+                        agent.learn(experiences)
+                # Handle num_envs > learn step; learn multiple times per step in env
+                elif (
+                    len(memory) >= agent.batch_size and memory.counter > learning_delay
                 ):
-                    experiences = memory.sample(
-                        agent.batch_size
-                    )  # Sample replay buffer
-                    agent.learn(experiences)  # Learn according to agent's RL algorithm
+                    for _ in range(num_envs // agent.learn_step):
+                        # Sample replay buffer
+                        experiences = memory.sample(agent.batch_size)
+                        # Learn according to agent's RL algorithm
+                        agent.learn(experiences)
 
-                # Update the state
-                if INIT_HP["CHANNELS_LAST"]:
-                    next_state = {
-                        agent_id: np.expand_dims(ns, 0)
-                        for agent_id, ns in next_state.items()
-                    }
                 state = next_state
 
-                # Stop episode if any agents have terminated
-                if any(truncation.values()) or any(termination.values()):
-                    break
-
-            # Save the total episode reward
-            score = sum(agent_reward.values())
-            agent.scores.append(score)
-
-        # Update epsilon for exploration
-        epsilon = max(eps_end, epsilon * eps_decay)
-
-        # Now evolve population if necessary
-        if (idx_epi + 1) % evo_epochs == 0:
-            # Evaluate population
-            fitnesses = [
-                agent.test(
-                    env,
-                    swap_channels=INIT_HP["CHANNELS_LAST"],
-                    max_steps=max_steps,
-                    loop=evo_loop,
-                )
-                for agent in pop
-            ]
+                # Calculate scores and reset noise for finished episodes
+                reset_noise_indices = []
+                term_array = np.array(list(termination.values())).transpose()
+                trunc_array = np.array(list(truncation.values())).transpose()
+                for idx, (d, t) in enumerate(zip(term_array, trunc_array)):
+                    if np.any(d) or np.any(t):
+                        completed_episode_scores.append(scores[idx])
+                        agent.scores.append(scores[idx])
+                        scores[idx] = 0
+                        reset_noise_indices.append(idx)
+                agent.reset_action_noise(reset_noise_indices)
+
+            pbar.update(evo_steps // len(pop))
+
+            agent.steps[-1] += steps
+            pop_episode_scores.append(completed_episode_scores)
 
-            print(f"Episode {idx_epi + 1}/{max_episodes}")
-            print(f'Fitnesses: {["%.2f" % fitness for fitness in fitnesses]}')
-            print(
-                f'100 fitness avgs: {["%.2f" % np.mean(agent.fitness[-100:]) for agent in pop]}'
+        # Evaluate population
+        fitnesses = [
+            agent.test(
+                env,
+                swap_channels=INIT_HP["CHANNELS_LAST"],
+                max_steps=eval_steps,
+                loop=eval_loop,
             )
+            for agent in pop
+        ]
+        mean_scores = [
+            (
+                np.mean(episode_scores)
+                if len(episode_scores) > 0
+                else "0 completed episodes"
+            )
+            for episode_scores in pop_episode_scores
+        ]
 
-            # Tournament selection and population mutation
-            elite, pop = tournament.select(pop)
-            pop = mutations.mutation(pop)
+        print(f"--- Global steps {total_steps} ---")
+        print(f"Steps {[agent.steps[-1] for agent in pop]}")
+        print(f"Scores: {mean_scores}")
+        print(f'Fitnesses: {["%.2f"%fitness for fitness in fitnesses]}')
+        print(
+            f'5 fitness avgs: {["%.2f"%np.mean(agent.fitness[-5:]) for agent in pop]}'
+        )
+
+        # Tournament selection and population mutation
+        elite, pop = tournament.select(pop)
+        pop = mutations.mutation(pop)
+
+        # Update step counter
+        for agent in pop:
+            agent.steps.append(agent.steps[-1])
 
     # Save the trained algorithm
     path = "./models/MATD3"
     filename = "MATD3_trained_agent.pt"
     os.makedirs(path, exist_ok=True)
     save_path = os.path.join(path, filename)
-    elite.saveCheckpoint(save_path)
+    elite.save_checkpoint(save_path)
+
+    pbar.close()
+    env.close()
diff --git a/tutorials/AgileRL/render_agilerl_dqn.py b/tutorials/AgileRL/render_agilerl_dqn.py
index f5a2d4b38..8b075e7d7 100644
--- a/tutorials/AgileRL/render_agilerl_dqn.py
+++ b/tutorials/AgileRL/render_agilerl_dqn.py
@@ -3,11 +3,11 @@
 import imageio
 import numpy as np
 import torch
-from agilerl.algorithms.dqn import DQN
-from agilerl_dqn_curriculum import Opponent
+from agilerl_dqn_curriculum import Opponent, transform_and_flip
+from pettingzoo.classic import connect_four_v3
 from PIL import Image, ImageDraw, ImageFont
 
-from pettingzoo.classic import connect_four_v3
+from agilerl.algorithms.dqn import DQN
 
 
 # Define function to return image
@@ -68,16 +68,8 @@ def resize_frames(frames, fraction):
     state_dim = np.zeros(state_dim[0]).flatten().shape
     action_dim = action_dim[0]
 
-    # Instantiate an DQN object
-    dqn = DQN(
-        state_dim,
-        action_dim,
-        one_hot,
-        device=device,
-    )
-
-    # Load the saved algorithm into the DQN object
-    dqn.loadCheckpoint(path)
+    # Load the saved agent
+    dqn = DQN.load(path, device)
 
     for opponent_difficulty in ["random", "weak", "strong", "self"]:
         # Create opponent
@@ -120,38 +112,35 @@ def resize_frames(frames, fraction):
             for idx_step in range(max_steps):
                 action_mask = observation["action_mask"]
                 if player < 0:
-                    state = np.moveaxis(observation["observation"], [-1], [-3])
-                    state = np.expand_dims(state, 0)
+                    state, _ = transform_and_flip(observation, player=0)
                     if opponent_first:
                         if opponent_difficulty == "self":
-                            action = opponent.getAction(
+                            action = opponent.get_action(
                                 state, epsilon=0, action_mask=action_mask
                             )[0]
                         elif opponent_difficulty == "random":
-                            action = opponent.getAction(action_mask)
+                            action = opponent.get_action(action_mask)
                         else:
-                            action = opponent.getAction(player=0)
+                            action = opponent.get_action(player=0)
                     else:
-                        action = dqn.getAction(
+                        action = dqn.get_action(
                             state, epsilon=0, action_mask=action_mask
                         )[
                             0
                         ]  # Get next action from agent
                 if player > 0:
-                    state = np.moveaxis(observation["observation"], [-1], [-3])
-                    state[[0, 1], :, :] = state[[0, 1], :, :]
-                    state = np.expand_dims(state, 0)
+                    state, _ = transform_and_flip(observation, player=1)
                     if not opponent_first:
                         if opponent_difficulty == "self":
-                            action = opponent.getAction(
+                            action = opponent.get_action(
                                 state, epsilon=0, action_mask=action_mask
                             )[0]
                         elif opponent_difficulty == "random":
-                            action = opponent.getAction(action_mask)
+                            action = opponent.get_action(action_mask)
                         else:
-                            action = opponent.getAction(player=1)
+                            action = opponent.get_action(player=1)
                     else:
-                        action = dqn.getAction(
+                        action = dqn.get_action(
                             state, epsilon=0, action_mask=action_mask
                         )[
                             0
diff --git a/tutorials/AgileRL/render_agilerl_maddpg.py b/tutorials/AgileRL/render_agilerl_maddpg.py
index ca47349d5..c9a05df6a 100644
--- a/tutorials/AgileRL/render_agilerl_maddpg.py
+++ b/tutorials/AgileRL/render_agilerl_maddpg.py
@@ -4,10 +4,10 @@
 import numpy as np
 import supersuit as ss
 import torch
-from agilerl.algorithms.maddpg import MADDPG
+from pettingzoo.atari import space_invaders_v2
 from PIL import Image, ImageDraw
 
-from pettingzoo.atari import space_invaders_v2
+from agilerl.algorithms.maddpg import MADDPG
 
 
 # Define function to return image
@@ -68,22 +68,9 @@ def _label_with_episode_number(frame, episode_num):
     n_agents = env.num_agents
     agent_ids = env.agents
 
-    # Instantiate an MADDPG object
-    maddpg = MADDPG(
-        state_dim,
-        action_dim,
-        one_hot,
-        n_agents,
-        agent_ids,
-        max_action,
-        min_action,
-        discrete_actions,
-        device=device,
-    )
-
-    # Load the saved algorithm into the MADDPG object
+    # Load the saved agent
     path = "./models/MADDPG/MADDPG_trained_agent.pt"
-    maddpg.loadCheckpoint(path)
+    maddpg = MADDPG.load(path, device)
 
     # Define test loop parameters
     episodes = 10  # Number of episodes to test agent on
@@ -115,9 +102,9 @@ def _label_with_episode_number(frame, episode_num):
             )
 
             # Get next action from agent
-            cont_actions, discrete_action = maddpg.getAction(
+            cont_actions, discrete_action = maddpg.get_action(
                 state,
-                epsilon=0,
+                training=False,
                 agent_mask=agent_mask,
                 env_defined_actions=env_defined_actions,
             )
diff --git a/tutorials/AgileRL/render_agilerl_matd3.py b/tutorials/AgileRL/render_agilerl_matd3.py
index efcc610cd..90a7e92bc 100644
--- a/tutorials/AgileRL/render_agilerl_matd3.py
+++ b/tutorials/AgileRL/render_agilerl_matd3.py
@@ -3,10 +3,10 @@
 import imageio
 import numpy as np
 import torch
-from agilerl.algorithms.matd3 import MATD3
+from pettingzoo.mpe import simple_speaker_listener_v4
 from PIL import Image, ImageDraw
 
-from pettingzoo.mpe import simple_speaker_listener_v4
+from agilerl.algorithms.matd3 import MATD3
 
 
 # Define function to return image
@@ -55,22 +55,9 @@ def _label_with_episode_number(frame, episode_num):
     n_agents = env.num_agents
     agent_ids = env.agents
 
-    # Instantiate an MADDPG object
-    matd3 = MATD3(
-        state_dim,
-        action_dim,
-        one_hot,
-        n_agents,
-        agent_ids,
-        max_action,
-        min_action,
-        discrete_actions,
-        device=device,
-    )
-
-    # Load the saved algorithm into the MADDPG object
+    # Load the saved agent
     path = "./models/MATD3/MATD3_trained_agent.pt"
-    matd3.loadCheckpoint(path)
+    matd3 = MATD3.load(path, device)
 
     # Define test loop parameters
     episodes = 10  # Number of episodes to test agent on
@@ -102,9 +89,9 @@ def _label_with_episode_number(frame, episode_num):
             )
 
             # Get next action from agent
-            cont_actions, discrete_action = matd3.getAction(
+            cont_actions, discrete_action = matd3.get_action(
                 state,
-                epsilon=0,
+                training=False,
                 agent_mask=agent_mask,
                 env_defined_actions=env_defined_actions,
             )

From c9b5189b3af464746b2283925acdbbcac311d081 Mon Sep 17 00:00:00 2001
From: nicku-a <nickua@btinternet.com>
Date: Mon, 24 Jun 2024 14:44:09 +0100
Subject: [PATCH 2/3] Update agilerl version

---
 tutorials/AgileRL/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tutorials/AgileRL/requirements.txt b/tutorials/AgileRL/requirements.txt
index 1262ee83c..0b2144cfc 100644
--- a/tutorials/AgileRL/requirements.txt
+++ b/tutorials/AgileRL/requirements.txt
@@ -1,4 +1,4 @@
-agilerl==0.1.22; python_version >= '3.9'
+agilerl==1.0.0; python_version >= '3.9'
 pettingzoo[classic,atari,mpe]>=1.23.1
 SuperSuit>=3.9.0
 torch>=2.0.1

From f0eda2edb15759de04dfc078817061989f20b343 Mon Sep 17 00:00:00 2001
From: nicku-a <nickua@btinternet.com>
Date: Mon, 24 Jun 2024 15:04:51 +0100
Subject: [PATCH 3/3] agilerl precommit formatting changes

---
 tutorials/AgileRL/agilerl_dqn_curriculum.py | 38 ++++++++++++++-------
 tutorials/AgileRL/agilerl_maddpg.py         |  6 ++--
 tutorials/AgileRL/agilerl_matd3.py          |  6 ++--
 tutorials/AgileRL/render_agilerl_dqn.py     |  4 +--
 tutorials/AgileRL/render_agilerl_maddpg.py  |  4 +--
 tutorials/AgileRL/render_agilerl_matd3.py   |  4 +--
 6 files changed, 37 insertions(+), 25 deletions(-)

diff --git a/tutorials/AgileRL/agilerl_dqn_curriculum.py b/tutorials/AgileRL/agilerl_dqn_curriculum.py
index c0d79a511..e2f378b15 100644
--- a/tutorials/AgileRL/agilerl_dqn_curriculum.py
+++ b/tutorials/AgileRL/agilerl_dqn_curriculum.py
@@ -13,13 +13,13 @@
 import torch
 import wandb
 import yaml
-from pettingzoo.classic import connect_four_v3
-from tqdm import tqdm, trange
-
 from agilerl.components.replay_buffer import ReplayBuffer
 from agilerl.hpo.mutation import Mutations
 from agilerl.hpo.tournament import TournamentSelection
 from agilerl.utils.utils import create_population
+from tqdm import tqdm, trange
+
+from pettingzoo.classic import connect_four_v3
 
 
 class CurriculumEnv:
@@ -835,9 +835,13 @@ def transform_and_flip(observation, player):
                                 train_actions_hist[p1_action] += 1
 
                             env.step(p1_action)  # Act in environment
-                            observation, cumulative_reward, done, truncation, _ = (
-                                env.last()
-                            )
+                            (
+                                observation,
+                                cumulative_reward,
+                                done,
+                                truncation,
+                                _,
+                            ) = env.last()
                             p1_next_state, p1_next_state_flipped = transform_and_flip(
                                 observation, player=1
                             )
@@ -938,9 +942,13 @@ def transform_and_flip(observation, player):
                         rewards = []
                         for i in range(evo_loop):
                             env.reset()  # Reset environment at start of episode
-                            observation, cumulative_reward, done, truncation, _ = (
-                                env.last()
-                            )
+                            (
+                                observation,
+                                cumulative_reward,
+                                done,
+                                truncation,
+                                _,
+                            ) = env.last()
 
-                            player = -1  # Tracker for which player"s turn it is
+                            player = -1  # Tracker for which player's turn it is
 
@@ -994,9 +1002,13 @@ def transform_and_flip(observation, player):
                                         eval_actions_hist[action] += 1
 
                                 env.step(action)  # Act in environment
-                                observation, cumulative_reward, done, truncation, _ = (
-                                    env.last()
-                                )
+                                (
+                                    observation,
+                                    cumulative_reward,
+                                    done,
+                                    truncation,
+                                    _,
+                                ) = env.last()
 
                                 if (player > 0 and opponent_first) or (
                                     player < 0 and not opponent_first
@@ -1021,7 +1033,7 @@ def transform_and_flip(observation, player):
                     f"    Train Mean Score: {np.mean(agent.scores[-episodes_per_epoch:])}   Train Mean Turns: {mean_turns}   Eval Mean Fitness: {np.mean(fitnesses)}   Eval Best Fitness: {np.max(fitnesses)}   Eval Mean Turns: {eval_turns}   Total Steps: {total_steps}"
                 )
                 pbar.update(0)
-                
+
                 if wb:
                     # Format action histograms for visualisation
                     train_actions_hist = [
diff --git a/tutorials/AgileRL/agilerl_maddpg.py b/tutorials/AgileRL/agilerl_maddpg.py
index b71e09767..550a4baa3 100644
--- a/tutorials/AgileRL/agilerl_maddpg.py
+++ b/tutorials/AgileRL/agilerl_maddpg.py
@@ -8,14 +8,14 @@
 import numpy as np
 import supersuit as ss
 import torch
-from pettingzoo.atari import space_invaders_v2
-from tqdm import trange
-
 from agilerl.components.multi_agent_replay_buffer import MultiAgentReplayBuffer
 from agilerl.hpo.mutation import Mutations
 from agilerl.hpo.tournament import TournamentSelection
 from agilerl.utils.utils import create_population
 from agilerl.wrappers.pettingzoo_wrappers import PettingZooVectorizationParallelWrapper
+from tqdm import trange
+
+from pettingzoo.atari import space_invaders_v2
 
 if __name__ == "__main__":
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
diff --git a/tutorials/AgileRL/agilerl_matd3.py b/tutorials/AgileRL/agilerl_matd3.py
index 0791995ce..893a80fe7 100644
--- a/tutorials/AgileRL/agilerl_matd3.py
+++ b/tutorials/AgileRL/agilerl_matd3.py
@@ -7,14 +7,14 @@
 
 import numpy as np
 import torch
-from pettingzoo.mpe import simple_speaker_listener_v4
-from tqdm import trange
-
 from agilerl.components.multi_agent_replay_buffer import MultiAgentReplayBuffer
 from agilerl.hpo.mutation import Mutations
 from agilerl.hpo.tournament import TournamentSelection
 from agilerl.utils.utils import create_population
 from agilerl.wrappers.pettingzoo_wrappers import PettingZooVectorizationParallelWrapper
+from tqdm import trange
+
+from pettingzoo.mpe import simple_speaker_listener_v4
 
 if __name__ == "__main__":
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
diff --git a/tutorials/AgileRL/render_agilerl_dqn.py b/tutorials/AgileRL/render_agilerl_dqn.py
index 8b075e7d7..67d3ad9cc 100644
--- a/tutorials/AgileRL/render_agilerl_dqn.py
+++ b/tutorials/AgileRL/render_agilerl_dqn.py
@@ -3,11 +3,11 @@
 import imageio
 import numpy as np
 import torch
+from agilerl.algorithms.dqn import DQN
 from agilerl_dqn_curriculum import Opponent, transform_and_flip
-from pettingzoo.classic import connect_four_v3
 from PIL import Image, ImageDraw, ImageFont
 
-from agilerl.algorithms.dqn import DQN
+from pettingzoo.classic import connect_four_v3
 
 
 # Define function to return image
diff --git a/tutorials/AgileRL/render_agilerl_maddpg.py b/tutorials/AgileRL/render_agilerl_maddpg.py
index c9a05df6a..f862ee56a 100644
--- a/tutorials/AgileRL/render_agilerl_maddpg.py
+++ b/tutorials/AgileRL/render_agilerl_maddpg.py
@@ -4,10 +4,10 @@
 import numpy as np
 import supersuit as ss
 import torch
-from pettingzoo.atari import space_invaders_v2
+from agilerl.algorithms.maddpg import MADDPG
 from PIL import Image, ImageDraw
 
-from agilerl.algorithms.maddpg import MADDPG
+from pettingzoo.atari import space_invaders_v2
 
 
 # Define function to return image
diff --git a/tutorials/AgileRL/render_agilerl_matd3.py b/tutorials/AgileRL/render_agilerl_matd3.py
index 90a7e92bc..c096d661f 100644
--- a/tutorials/AgileRL/render_agilerl_matd3.py
+++ b/tutorials/AgileRL/render_agilerl_matd3.py
@@ -3,10 +3,10 @@
 import imageio
 import numpy as np
 import torch
-from pettingzoo.mpe import simple_speaker_listener_v4
+from agilerl.algorithms.matd3 import MATD3
 from PIL import Image, ImageDraw
 
-from agilerl.algorithms.matd3 import MATD3
+from pettingzoo.mpe import simple_speaker_listener_v4
 
 
 # Define function to return image