
Commit

Merge branch 'main' into pre-commit-ci-update-config
belerico authored Sep 18, 2023
2 parents 57de9b2 + e95960b commit f7b52db
Showing 44 changed files with 1,911 additions and 1,168 deletions.
9 changes: 8 additions & 1 deletion README.md
@@ -168,7 +168,14 @@ That's all it takes to train an agent with SheepRL! 🎉

> **Note**
>
- > You can find more information about the observation space by checking [the related howto section](./howto/select_observations.md).
+ > Before you start using the SheepRL framework, it is **highly recommended** that you read the following instructional documents:
+ >
+ > 1. How to [run experiments](./howto/run_experiments.md)
+ > 2. How to [modify the default configs](./howto/configs.md)
+ > 3. How to [work with steps](./howto/work_with_steps.md)
+ > 4. How to [select observations](./howto/select_observations.md)
+ >
+ > Moreover, there are other useful documents in the [`howto` folder](./howto/), which contains some guidance on how to properly use the framework.

### :chart_with_upwards_trend: Check your results

2 changes: 1 addition & 1 deletion examples/architecture_template.py
@@ -139,7 +139,7 @@ def main():
if devices is None or devices in ("1", "2"):
    raise RuntimeError(
        "Please run the script with the number of devices greater than 2: "
-       "`lightning run model --devices=3 sheeprl.py ...`"
+       "`lightning run model --devices=3 examples/architecture_template.py ...`"
    )

world_collective = TorchCollective()
4 changes: 2 additions & 2 deletions howto/configs.md
@@ -138,8 +138,8 @@ horizon: 15
# Training recipe
learning_starts: 65536
- pretrain_steps: 1
- gradient_steps: 1
+ per_rank_pretrain_steps: 1
+ per_rank_gradient_steps: 1
train_every: 16
# Model related parameters
2 changes: 0 additions & 2 deletions howto/register_new_algorithm.md
@@ -129,7 +129,6 @@ def main(cfg: DictConfig):
{
"Rewards/rew_avg": MeanMetric(),
"Game/ep_len_avg": MeanMetric(),
"Time/step_per_second": MeanMetric(),
"Loss/value_loss": MeanMetric(),
"Loss/policy_loss": MeanMetric(),
"Loss/entropy_loss": MeanMetric(),
@@ -222,7 +221,6 @@ def main(cfg: DictConfig):

# Log metrics
metrics_dict = aggregator.compute()
fabric.log("Time/step_per_second", int(global_step / (time.perf_counter() - start_time)), global_step)
fabric.log_dict(metrics_dict, global_step)
aggregator.reset()

32 changes: 32 additions & 0 deletions howto/work_with_steps.md
@@ -0,0 +1,32 @@
# Work with steps
In this document we discuss the hyper-parameters that refer to the concept of a step.
A step can be interpreted in several ways, so it is necessary to specify clearly which interpretation is used.

## Policy steps
We start from the concept of a *policy step*: a policy step is the step in which the policy selects the action to perform in the environment, given the observation it has received from the environment.

> **Note**
>
> The environment step is the step performed by the environment: the environment takes an action as input and computes the next observation and the next reward.

Now that we have introduced the concept of a policy step, some aspects need to be clarified:

1. When there are multiple parallel environments, the number of policy steps is proportional to the number of parallel environments. E.g., if there are $m$ environments, then the actor has to choose $m$ actions and each environment performs an environment step: this means that $\bold{m}$ **policy steps** are performed.
2. When there are multiple parallel processes (i.e. the script has been run with `lightning run model --devices>=2 ...`), the number of policy steps is proportional to the number of parallel processes. E.g., let us assume that there are $n$ processes, each containing a single environment: each of the $n$ actors selects an action and a (per-process) environment step is performed. In this case $\bold{n}$ **policy steps** are performed.

In general, if we have $n$ parallel processes, each one with $m$ independent environments, the policy step increases **globally** by $n \cdot m$ at each iteration.

The hyper-parameters which refer to the *policy steps* are:

* `total_steps`: the total number of policy steps to perform in an experiment. In practice, this number is divided by $n \cdot m$ to obtain the number of iterations to be performed by each process.
* `exploration_steps`: the number of policy steps in which the agent explores the environment in the P2E algorithms.
* `max_episode_steps`: the maximum number of policy steps an episode can last ($\text{max\_steps}$); when this number is reached, the environment returns `terminated=True`. This means that if you decide to have an action repeat greater than one ($\text{action\_repeat} > 1$), then the environment performs a maximum number of steps equal to: $\text{env\_steps} = \text{max\_steps} \cdot \text{action\_repeat}$.
* `learning_starts`: how many policy steps the agent has to perform before starting the training.
* `train_every`: how many policy steps the agent has to perform between one training and the next.
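
To see how these quantities fit together, here is a minimal sketch (not taken from the SheepRL codebase; all names and values are illustrative assumptions) of how the policy-step counters relate to the number of processes and environments:

```python
# Minimal sketch (not SheepRL code): how policy steps accumulate.
# All names and values below are illustrative assumptions.
n_processes = 2          # parallel processes, e.g. `lightning run model --devices=2 ...`
n_envs = 4               # parallel environments per process
total_steps = 65536      # total policy steps of the experiment
action_repeat = 2
max_episode_steps = 1000

# At every iteration each process steps its environments once,
# so the policy-step counter grows globally by n * m.
policy_steps_per_iteration = n_processes * n_envs

# Number of iterations each process runs so that `total_steps` is reached globally.
iterations_per_process = total_steps // policy_steps_per_iteration

# With action repeat, an episode can last up to max_steps * action_repeat environment steps.
max_env_steps_per_episode = max_episode_steps * action_repeat

print(policy_steps_per_iteration, iterations_per_process, max_env_steps_per_episode)
# prints: 8 8192 2000
```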

## Gradient steps
A *gradient step* consists of an update of the parameters of the agent, i.e., a call to the *train* function. The number of gradient steps is proportional to the number of parallel processes: if there are $n$ parallel processes, $n \cdot \text{gradient\_steps}$ calls to the *train* method are executed.

The hyper-parameters which refer to the *gradient steps* are:
* `algo.per_rank_gradient_steps`: the number of gradient steps per rank to perform in a single iteration.
* `algo.per_rank_pretrain_steps`: the number of gradient steps per rank to perform in the first iteration.
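
As a rough sketch of how these two hyper-parameters could drive the per-rank update loop (illustrative only; this is not the actual SheepRL training loop, and `train`, `agent`, and `buffer` are hypothetical placeholders):

```python
# Minimal sketch (not the actual SheepRL loop): how the per-rank gradient-step
# hyper-parameters could be used. `train`, `agent` and `buffer` are hypothetical placeholders.
def train(agent, buffer):
    """Perform exactly one gradient step, i.e. one update of the agent's parameters."""
    ...

def update(agent, buffer, update_idx, per_rank_gradient_steps=1, per_rank_pretrain_steps=1):
    # The very first update uses the pretrain budget, every later update the regular one.
    steps = per_rank_pretrain_steps if update_idx == 0 else per_rank_gradient_steps
    for _ in range(steps):
        train(agent, buffer)

# With n parallel processes each calling `update`, n * per_rank_gradient_steps
# calls to `train` are executed globally per iteration.
```
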
6 changes: 4 additions & 2 deletions pyproject.toml
@@ -26,9 +26,11 @@ dependencies = [
"tensorboard>=2.10",
"python-dotenv>=1.0.0",
"lightning==2.0.*",
- "lightning-utilities<0.9",
+ "lightning-utilities<=0.9",
"hydra-core==1.3.0",
- "torchmetrics==1.1.*"
+ "torchmetrics==1.1.*",
+ "rich==13.5.*",
+ "opencv-python==4.8.0.*"
]
dynamic = ["version"]

2 changes: 1 addition & 1 deletion sheeprl/__init__.py
@@ -31,4 +31,4 @@
np.int = np.int64
np.bool = bool

- __version__ = "0.3.0"
+ __version__ = "0.3.2"
