From 21b025693413ddbf0ae1ab264ecfbe581a9565da Mon Sep 17 00:00:00 2001
From: Ryan
Date: Fri, 1 Nov 2024 09:56:13 -0600
Subject: [PATCH] docs: update docs for non-Trial-centric world (#10174)

The model debugging guide was completely out-of-date, and needed a near-total rewrite.

Additionally, the Core API user guide had additional details that needed updating, which I missed
in my first pass.

Also, there were issues with two examples:

- the iris example was not configured to train long enough to actually converge, which looks bad
  for an example
- The core_api_mnist_pytorch example had a couple show-stopper bugs, so not all of its stages ran
  at all.

Finally, several examples touched in the searcher-context-removal project needed `make fmt` applied
to them.
---
 .../api-guides/apis-howto/api-core-ug.rst   |  47 ++-
 docs/model-dev-guide/create-experiment.rst  |   2 +
 docs/model-dev-guide/debug-models.rst       | 302 +++++-------------
 .../computer_vision/iris_tf_keras/train.py  |   3 +-
 examples/deepspeed/dcgan/data.py            |   7 +-
 examples/deepspeed/dcgan/model.py           |   5 +-
 .../detsd/trainer.py                        |   4 +-
 .../core_api_pytorch_mnist/adaptive.yaml    |   4 +-
 .../core_api_pytorch_mnist/distributed.yaml |   3 +-
 .../model_def_adaptive.py                   |   2 +
 10 files changed, 116 insertions(+), 263 deletions(-)

diff --git a/docs/model-dev-guide/api-guides/apis-howto/api-core-ug.rst b/docs/model-dev-guide/api-guides/apis-howto/api-core-ug.rst
index 5b51199eac9..c13dadb4578 100644
--- a/docs/model-dev-guide/api-guides/apis-howto/api-core-ug.rst
+++ b/docs/model-dev-guide/api-guides/apis-howto/api-core-ug.rst
@@ -305,46 +305,26 @@ configuration file.
  Step 4: Hyperparameter Search
 *******************************
 
-With the Core API you can run advanced hyperparameter searches with arbitrary training code. The
-hyperparameter search logic is in the master, which coordinates many different Trials. Each trial
-runs a train-validate-report loop:
-
-.. table::
-
-   +----------+--------------------------------------------------------------------------+
-   | Train    | Train until a point chosen by the hyperparameter search algorithm and   |
-   |          | obtained via the Core API. The length of training is absolute, so you   |
-   |          | have to keep track of how much you have already trained to know how much|
-   |          | more to train.                                                           |
-   +----------+--------------------------------------------------------------------------+
-   | Validate | Validate your model to obtain the metric you configured in the          |
-   |          | ``searcher.metric`` field of your experiment config.                    |
-   +----------+--------------------------------------------------------------------------+
-   | Report   | Use the Core API to report results to the master.                       |
-   +----------+--------------------------------------------------------------------------+
+With the Core API you can run advanced hyperparameter searches with any training loop. The
+hyperparameter search logic is in the master, which can create trials and can decide to preempt them
+if they are underperforming.
 
 To perform a hyperparameter search, we'll update our script to define the hyperparameter search
 settings we want to use for our experiment. More specifically, we'll need to define the following
 settings in our experiment configuration file:
 
-- ``name:`` ``adaptive_asha`` (name of our searcher. For all options, visit :ref:`search-methods`.
-- ``metric``: ``test_loss``
-- ``smaller_is_better``: ``True`` (This is equivalent to minimization vs. maximization of
-  objective.)
-- ``max_trials``: 500 (This is the maximum number of trials the searcher should run.)
-- ``time_metric``: ``epochs`` (This is the name of the "time" metric which we report in validation
+- ``name: adaptive_asha`` (name of our searcher. For all options, visit :ref:`search-methods`).
+- ``metric: test_loss``
+- ``smaller_is_better: true`` (This is equivalent to minimization vs. maximization of objective.)
+- ``max_trials: 50`` (This is the maximum number of trials the searcher should run.)
+- ``time_metric: epochs`` (This is the name of the "time" metric which we report in validation
   metrics).
-- ``max_time``: 20 (The max number of epochs a trial will report. For more information, visit
+- ``max_time: 20`` (The max number of epochs a trial will report. For more information, visit
   Adaptive ASHA in the :ref:`Experiment Configuration Reference `.
 
 In addition, we also need to define the hyperparameters themselves. Adaptive ASHA will pick values
 between the ``minval`` and ``maxval`` for each hyperparameter for each trial.
 
-.. note::
-
-   To see early stopping in action, try setting ``max_trials`` to over 500 and playing around with
-   the hyperparameter search values.
-
 In this step, we’ll run our experiment using the ``model_def_adaptive.py`` script and its
 accompanying ``adaptive.yaml`` experiment configuration file.
 
@@ -375,6 +355,15 @@ hardcoded values:
    :end-before: # Docs snippet end: per trial basis
    :dedent:
 
+Lastly, to comply with the requirements of the ASHA search, we must report an ``epochs`` metric with
+our validation metrics, since we set ``time_metric: epochs`` in our searcher:
+
+.. literalinclude:: ../../../../examples/tutorials/core_api_pytorch_mnist/model_def_adaptive.py
+   :language: python
+   :start-after: # Docs snippet start: report epochs
+   :end-before: # Docs snippet end: report epochs
+   :dedent:
+
 Step 4.1: Run the Experiment
 ============================
 
diff --git a/docs/model-dev-guide/create-experiment.rst b/docs/model-dev-guide/create-experiment.rst
index bd62ac1b320..1bfe607d6b9 100644
--- a/docs/model-dev-guide/create-experiment.rst
+++ b/docs/model-dev-guide/create-experiment.rst
@@ -110,6 +110,8 @@ Example Python script command:
 
       script.py [args...]
 
+.. _predefined-launchers:
+
 **********************
  Predefined Launchers
 **********************
diff --git a/docs/model-dev-guide/debug-models.rst b/docs/model-dev-guide/debug-models.rst
index 74385d612e9..3ae8cc15b19 100644
--- a/docs/model-dev-guide/debug-models.rst
+++ b/docs/model-dev-guide/debug-models.rst
@@ -4,37 +4,21 @@
  Debugging Models
 ##################
 
-Using Determined to debug models depends on your environment.
-
-Your code on a Determined cluster differs from typical training scripts in the following ways:
-
-- The code conforms to the Trial APIs as a subclass of the Determined ``Trial`` class, indirectly,
-  by using one of the concrete subclasses, such as
-  :class:`~determined.pytorch.deepspeed.DeepSpeedTrial`.
-
-- The code runs in a Docker container on another machine.
+Using Determined to train your model can introduce a number of failure points that aren't present
+when running training scripts locally. Running your code on a Determined cluster differs from
+running a typical training script in the following ways:
 
+- The code runs in a Docker container, possibly on another machine.
 - Your model can run many times in a hyperparameter search.
-
 - Your model can be distributed across multiple GPUs or machines.
 
-These debugging steps introduce code changes incrementally, working toward a fully functioning
-Determined model.
Follow the nine steps as applicable to your environment: - -- Model-related Issues - - `Step 1 - Verify that your code runs locally`_ - - `Step 2 - Verify that each Trial subclass method works locally`_ - - `Step 3 - Verify local test mode`_ - -- Docker- or Cluster-related Issues - - `Step 4 - Verify that the original code runs in a notebook or shell`_ - - `Step 5 - Verify that each Trial subclass method works in a notebook or shell`_ - - `Step 6 - Verify that local test mode works in a notebook or shell`_ +These debugging steps introduce environment and code changes incrementally, working toward fully +functioning distributed training on a Determined cluster: -- Higher-level Issues - - `Step 7 - Verify that cluster test mode works with slots_per_trial set to 1`_ - - `Step 8 - Verify that a single-GPU experiment works`_ - - `Step 9 - Verify that a multi-GPU experiment works`_ +- `Step 1 - Verify that your training script runs locally`_ +- `Step 2 - Verify that your training script runs in a notebook or shell`_ +- `Step 3 - Verify that a single-GPU experiment works`_ +- `Step 4 - Verify that a multi-GPU experiment works`_ ************** Prerequisite @@ -45,90 +29,29 @@ development machine, on-prem, or on cloud. For installation guides, visit :ref:` .. _step1: -********************************************* - Step 1 - Verify that your code runs locally -********************************************* +******************************************************** + Step 1 - Verify that your training script runs locally +******************************************************** -This step assumes you have ported (converted) your model from code outside of Determined. Otherwise, -skip to :ref:`Step 2 `. +Determined's training APIs are designed to work both on-cluster and locally (that is, without +interacting with a Determined master), so you should be able to run your training script on the same +local machine that you ran your model before integrating with Determined APIs. + +If you ported your model to :class:`~determined.pytorch.PyTorchTrial` or +:class:`~determined.pytorch.deepspeed.DeepSpeedTrial` and are having trouble getting your ported +model to work, one debugging strategy is to manually call the various methods of your Trial directly +before calling ``Trainer.fit()``. Confirm that your code works as expected before continuing. .. _step2: -*************************************************************** - Step 2 - Verify that each Trial subclass method works locally -*************************************************************** - -This step assumes you have a working local environment for training. If you typically run your code -in a Docker environment, skip to :ref:`Step 4 `. This step also ensures that your class -performs as expected by calling its methods and verifying the output. - -:class:`~determined.pytorch.PyTorchTrial` supports a fully-local training mode, which can be useful -for debugging. See :ref:`pytorch_trainer_ug` for usage details. - -#. Create simple tests to verify each ``Trial`` subclass method. - - Examples of what these tests might look like for - :class:`~determined.pytorch.deepspeed.DeepSpeedTrial` can be found in the - :meth:`determined.TrialContext.from_config` documentation, but only you can verify what is - reasonable for your test. - -#. Diagnose failures: - - If you experience issues running the ``Trial`` subclass methods locally, it is likely there are - errors are in your trial class or the ``hyperparameters`` section of your configuration file. 
-   Ideally, method-by-method evaluation makes it easier to find and solve issues.
-
-.. _step3:
-
-*********************************
- Step 3 - Verify local test mode
-*********************************
-
-:ref:`Step 2 ` validated that your Trial API calls work as expected. This step uses your code
-to run an actual Determined training loop with abbreviated workloads to make sure that it meets
-Determined requirements.
-
-This step assumes you have a working local environment for training. If you do not, skip to
-:ref:`Step 4 `.
-
-#. Create an experiment using the following command:
-
-   .. code:: bash
-
-      det experiment create myconfig.yaml my_model_dir --local --test
-
-   The ``--local`` argument specifies that training occurs where you launched the experiment instead
-   of occurring on a cluster. The ``--test`` argument runs abbreviated workloads to try to detect
-   bugs sooner and exits immediately.
-
-   The test is considered to have passed if the command completes successfully.
-
-#. Diagnose failures:
-
-   Local test mode performs the following actions:
-
-   #. Builds a model.
-   #. Runs a single batch of training data.
-   #. Evaluates the model.
-   #. Saves a checkpoint to a dummy location.
-
-   If your per-method checks in :ref:`Step 2 ` passed but local test mode fails, your
-   ``Trial`` subclass might not be implemented correctly. Double-check the documentation. It is also
-   possible that you have found a bug or an invalid assumption in the Determined software and should
-   `file a GitHub issue `__ or contact
-   Determined on `Slack
-   `__.
-
-.. _step4:
-
-********************************************************************
- Step 4 - Verify that the original code runs in a notebook or shell
-********************************************************************
+***********************************************************************
+ Step 2 - Verify that your training script runs in a notebook or shell
+***********************************************************************
 
-This step is the same as :ref:`Step 1 `, except the original code runs on the Determined
-cluster instead of locally.
+This step is the same as :ref:`Step 1 `, except that your training script runs on the
+Determined cluster instead of locally.
 
 #. Launch a notebook or shell on the cluster:
 
@@ -154,7 +77,8 @@ cluster instead of locally.
 
 #. Verify code execution:
 
-   After you are on the cluster, testing is the same as :ref:`Step 1 `.
+   After you are on the cluster, you can test your script by just running it, as in :ref:`Step 1
+   `.
 
 #. Diagnose failures:
 
@@ -176,72 +100,24 @@ cluster instead of locally.
 
 - If you need environment variables to be set for your model to work, see
   :ref:`command-notebook-configuration`.
 
-.. _step5:
-
-******************************************************************************
- Step 5 - Verify that each Trial subclass method works in a notebook or shell
-******************************************************************************
-
-This step is the same as :ref:`Step 2 `, except the original code runs on the Determined
-cluster instead of locally.
-
-#. Launch a notebook or shell:
-
-   If you prefer to use Jupyter notebook, enter:
-
-   .. code:: bash
-
-      det notebook start --context my_model_dir
-      # Your browser should automatically open the notebook.
-
-   If you prefer to use SSH to interact with your model, enter:
-
-   .. code:: bash
-
-      det shell start --context my_model_dir
-      # Your terminal should automatically connect to the shell.
- - When interacting with the shell or notebook, testing is the same as :ref:`Step 2 `. - -#. Diagnose failures: - - Combine the failure diagnosis steps used in :ref:`Step 2 ` and :ref:`Step 4 `. - -.. _step6: - -******************************************************************* - Step 6 - Verify that local test mode works in a notebook or shell -******************************************************************* - -This step is the same as :ref:`Step 3 `, except the original code runs on the Determined -cluster instead of locally. - -#. Launch a notebook or shell as described in :ref:`Step 4 `. - - On the cluster, testing is the same as :ref:`Step 3 `, except that the second model - definition argument of the ``det experiment create`` command should be - ``/run/determined/workdir`` or ``.`` if you have not changed the working directory after - connecting to the cluster. This is because the ``--context`` specified when creating the shell or - notebook is copied to the ``/run/determined/workdir`` directory inside the container, the same as - the model definition argument is copied to ``det experiment create``. +.. _step3: -#. Diagnose failures following the same steps described in :ref:`Step 3 ` and :ref:`Step 4 - `. +**************************************************** + Step 3 - Verify that a single-GPU experiment works +**************************************************** -.. _step7: +In this step, instead of launching the command from an interactive environment, it is submitted to +the cluster and managed by Determined. -**************************************************************************** - Step 7 - Verify that cluster test mode works with slots_per_trial set to 1 -**************************************************************************** +#. Apply customizations: -This step is similar to :ref:`Step 6 `, except instead of launching the command from an -interactive environment, it is submitted to the cluster and managed by Determined. + If you customized your command environment in testing :ref:`Step 2 `, make sure to apply + the same customizations in your experiment configuration file. -#. Apply customizations: +#. Set ``entrypoint``: - If you customized your command environment in testing :ref:`Step 3 `, :ref:`Step 4 - `, or :ref:`Step 5 `, make sure to apply the same customizations in your experiment - configuration file. + Set the ``entrypoint`` of your experiment config to match the way you call your training script + in your environment, including all arguments. #. Set ``resources.slots_per_trial``: @@ -253,17 +129,21 @@ interactive environment, it is submitted to the cluster and managed by Determine resources: slots_per_trial: 1 -#. Create an experiment with the ``--test`` argument, omitting the ``--local`` argument: +#. Submit your experiment: .. code:: bash - det experiment create myconfig.yaml my_model_dir --test + det experiment create myconfig.yaml my_model_dir -f #. Diagnose failures: - If you can run local test mode inside a notebook or shell but are unable to successfully submit - an experiment, make sure that notebook or shell customizations you might have made are replicated - in your :ref:`experiment configuration `, such as: + The experiment configuration is validated upon submission. If you see errors about ``invalid + experiment configuration`` during submission, review the :ref:`experiment configuration + `. 
+
+   If your training script runs inside a notebook or shell, but fails on the cluster, make sure
+   that notebook or shell customizations you might have made are replicated in your experiment
+   config, such as:
 
    - If required, a custom Docker image is set in the experiment configuration.
 
@@ -292,73 +172,53 @@ interactive environment, it is submitted to the cluster and managed by Determine
      before training starts. The message ``Checkpoint storage validation failed``, indicates that
      you should review the ``checkpoint_storage`` setting values.
 
-   - The experiment configuration is more strictly validated for cluster-managed experiments than
-     for ``--local --test`` mode. Errors related to ``invalid experiment configuration`` when
-     attempting to submit the experiment to the cluster indicate that the experiment configuration
-     has errors. Review the :ref:`experiment configuration `.
-
    If you are unable to identify the cause of the problem, contact Determined `community support
    `__!
 
-.. _step8:
-
-****************************************************
- Step 8 - Verify that a single-GPU experiment works
-****************************************************
-
-This step is similar to :ref:`Step 7 `, except that it introduces hyperparameter search and
-executes full training for each trial.
-
-#. Configure your system the same as :ref:`Step 7 `:
-
-   Confirm that your experiment configuration does not specify ``resources.slots_per_trial`` or that
-   it is set to ``1``. For example:
-
-   .. code:: yaml
-
-      resources:
-         slots_per_trial: 1
+.. _step4:
 
-#. Create an experiment without the ``--test`` or ``--local`` arguments:
+***************************************************
+ Step 4 - Verify that a multi-GPU experiment works
+***************************************************
 
-   You might find the ``--follow``, or ``-f``, argument helpful:
+This step introduces distributed training.
 
-   .. code:: bash
+#. Make any necessary code changes:
 
-      det experiment create myconfig.yaml my_model_dir -f
+   - If you are using the Core API for training, distributed training may take extra work. The
+     :ref:`api-core-ug-basic` and :ref:`api-core-ug` examples can help you understand what is
+     required.
 
-#. Diagnose failures:
+   - If you are using Determined's :class:`keras.DeterminedCallback
+     ` for training, you will have to take the `standard steps
+     for enabling distributed training in Keras
+     `__, except that you don't need to
+     configure the ``TF_CONFIG`` environment variable because it is configured by Determined's
+     :ref:`launch-tensorflow`.
 
-   If :ref:`Step 7 ` worked but this step does not, check:
+   - For the remaining training APIs, distributed training should work without additional code
+     changes.
 
-   - Check if the error happens when the experiment configuration has ``searcher.source_trial_id``
-     set. One possibility in an actual experiment that does not occur in a ``--test`` experiment is
-     the loading of a previous checkpoint. Errors when loading from a checkpoint can be caused by
-     architectural changes, where the new model code is not architecturally compatible with the old
-     model code.
+#. Wrap your training script in ``entrypoint`` with the correct launcher for the training API you
+   are using. For example, if you are using PyTorchTrial, you should use Determined's
+   :ref:`pytorch-dist-launcher`:
 
-   - Generally, issues in this step are caused by doing training and evaluation continuously. Focus
-     on how that change can cause issues with your code.
+   .. code:: yaml
 
-..
_step9: - -*************************************************** - Step 9 - Verify that a multi-GPU experiment works -*************************************************** + .. code:: yaml -This step is similar to :ref:`Step 8 `, except that it introduces distributed training. This -step only applies if you have multiple GPUs and want to use distributed training. + entrypoint: >- + python3 -m determined.launch.torch_distributed -- + python3 ./my_train_script.py --my-option=value -#. Configure your system the same as :ref:`Step 7 `: + See :ref:`predefined-launchers` for more launcher options. - Set ``resources.slots_per_trial`` to a number greater than ``1``. For example: +#. Configure ``resources.slots_per_trial`` to a number greater than ``1``. For example: .. code:: yaml resources: slots_per_trial: 2 -#. Create your experiment: +#. Submit your experiment: .. code:: bash @@ -366,8 +226,8 @@ step only applies if you have multiple GPUs and want to use distributed training #. Diagnose failures: - If you are using the ``determined`` library APIs correctly, distributed training should work - without error. Otherwise, common problems might be: + Double-check that any code changes you made are correct, and also that you wrapped your code with + the correct launcher. Otherwise, common problems might be: - If your experiment is not being scheduled on the cluster, ensure that the ``slots_per_trial`` setting is valid for your cluster. For example: @@ -384,9 +244,9 @@ step only applies if you have multiple GPUs and want to use distributed training Ensure that there are no other notebooks, shells, or experiments on the cluster that might consume too many resources and prevent the experiment from starting. - - Determined is designed to control the details of distributed training for you. If you also try - to control those details, such as by calling ``tf.config.set_visible_devices()`` while - training a Keras model, it is likely to cause issues. + - Determined is designed to control many of the details of distributed training for you. If you + also try to control those details, such as by calling ``tf.config.set_visible_devices()`` + while training a Keras model, it is likely to cause issues. - Some classes of metrics must be specially calculated during distributed training. 
Most metrics, such as loss or accuracy, can be calculated piecemeal on each worker in a distributed
diff --git a/examples/computer_vision/iris_tf_keras/train.py b/examples/computer_vision/iris_tf_keras/train.py
index 2e5fccc252c..2ec868d2d0e 100644
--- a/examples/computer_vision/iris_tf_keras/train.py
+++ b/examples/computer_vision/iris_tf_keras/train.py
@@ -112,11 +112,12 @@ def main(core_context, strategy, checkpoint, continue_id, hparams, epochs):
         callbacks=[det_cb, tb_cb],
     )
 
+
 if __name__ == "__main__":
     logging.basicConfig(level=logging.INFO, format=det.LOG_FORMAT)
 
     parser = argparse.ArgumentParser()
-    parser.add_argument("--epochs", type=int, default=100, help="how long to train for")
+    parser.add_argument("--epochs", type=int, default=500, help="how long to train for")
     args = parser.parse_args()
 
     info = det.get_cluster_info()
diff --git a/examples/deepspeed/dcgan/data.py b/examples/deepspeed/dcgan/data.py
index c950df584d1..5467a43ff18 100644
--- a/examples/deepspeed/dcgan/data.py
+++ b/examples/deepspeed/dcgan/data.py
@@ -21,9 +21,10 @@ def get_dataset(data_config: dict) -> torch.utils.data.Dataset:
     if data_config.get("dataroot", None) is None:
-        if str(data_config.get("dataset"),"").lower() != "fake":
-            raise ValueError('`dataroot` parameter is required for dataset "%s"'
-                             % data_config.get("dataset", ""))
+        if str(data_config.get("dataset", "")).lower() != "fake":
+            raise ValueError(
+                '`dataroot` parameter is required for dataset "%s"' % data_config.get("dataset", "")
+            )
         else:
             context = contextlib.nullcontext()
     else:
diff --git a/examples/deepspeed/dcgan/model.py b/examples/deepspeed/dcgan/model.py
index 8ceab93dc6a..99322dd5a9c 100644
--- a/examples/deepspeed/dcgan/model.py
+++ b/examples/deepspeed/dcgan/model.py
@@ -17,8 +17,9 @@
 
 
 class DCGANTrial(det_ds.DeepSpeedTrial):
-    def __init__(self, context: det_ds.DeepSpeedTrialContext,
-                 hparams: dict, data_config: dict) -> None:
+    def __init__(
+        self, context: det_ds.DeepSpeedTrialContext, hparams: dict, data_config: dict
+    ) -> None:
         self.context = context
         self.hparams = hparams
         self.data_config = data_config
diff --git a/examples/diffusion/textual_inversion_stable_diffusion/detsd/trainer.py b/examples/diffusion/textual_inversion_stable_diffusion/detsd/trainer.py
index f8204250060..926c08cdbd6 100644
--- a/examples/diffusion/textual_inversion_stable_diffusion/detsd/trainer.py
+++ b/examples/diffusion/textual_inversion_stable_diffusion/detsd/trainer.py
@@ -246,9 +246,7 @@ def train_on_cluster(cls) -> None:
                     trainer.logger.info(f"Step {trainer.steps_completed} completed.")
 
                 is_end_of_training = trainer.steps_completed == trainer.num_sgd_steps
-                time_to_report = (
-                    trainer.steps_completed % trainer.metric_report_freq == 0
-                )
+                time_to_report = trainer.steps_completed % trainer.metric_report_freq == 0
                 time_to_ckpt = trainer.steps_completed % trainer.checkpoint_freq == 0
 
                 # Report metrics, checkpoint, and preempt as appropriate.
diff --git a/examples/tutorials/core_api_pytorch_mnist/adaptive.yaml b/examples/tutorials/core_api_pytorch_mnist/adaptive.yaml index b87936cc26d..6dd1465f55a 100644 --- a/examples/tutorials/core_api_pytorch_mnist/adaptive.yaml +++ b/examples/tutorials/core_api_pytorch_mnist/adaptive.yaml @@ -28,7 +28,7 @@ searcher: name: adaptive_asha metric: test_loss smaller_is_better: true - max_trials: 500 + max_trials: 50 time_metric: epochs max_time: 20 -entrypoint: python3 model_def_adaptive.py +entrypoint: python3 model_def_adaptive.py --epochs 20 diff --git a/examples/tutorials/core_api_pytorch_mnist/distributed.yaml b/examples/tutorials/core_api_pytorch_mnist/distributed.yaml index 7c4511ca114..b9ab44e480a 100644 --- a/examples/tutorials/core_api_pytorch_mnist/distributed.yaml +++ b/examples/tutorials/core_api_pytorch_mnist/distributed.yaml @@ -28,9 +28,8 @@ hyperparameters: max_restarts: 0 records_per_epoch: 60000 searcher: - name: adaptive_asha + name: single metric: test_loss smaller_is_better: true - max_trials: 500 resources: slots_per_trial: 4 diff --git a/examples/tutorials/core_api_pytorch_mnist/model_def_adaptive.py b/examples/tutorials/core_api_pytorch_mnist/model_def_adaptive.py index b599673d2d7..a9d2fcac035 100644 --- a/examples/tutorials/core_api_pytorch_mnist/model_def_adaptive.py +++ b/examples/tutorials/core_api_pytorch_mnist/model_def_adaptive.py @@ -99,10 +99,12 @@ def test(args, model, device, test_loader, core_context, steps_completed, epochs ) ) + # Docs snippet start: report epochs core_context.train.report_validation_metrics( steps_completed=steps_completed, metrics={"test_loss": test_loss, "epochs": epochs_completed}, ) + # Docs snippet end: report epochs def load_state(checkpoint_directory, trial_id):
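
For reference, the ``time_metric: epochs`` requirement that the updated Core API guide and
``adaptive.yaml`` rely on reduces to the reporting pattern sketched below. This is an illustrative
sketch rather than code from the patch: ``build_model``, ``train_one_epoch``, and ``evaluate`` are
hypothetical stand-ins for the tutorial's training and test functions, and the report call mirrors
the one in ``model_def_adaptive.py``.

.. code:: python

   import determined as det


   def main(core_context, hparams, epochs):
       model = build_model(hparams)  # hypothetical helper
       for epoch in range(epochs):
           steps_completed = train_one_epoch(model, epoch)  # hypothetical helper
           test_loss = evaluate(model)  # hypothetical helper
           # Report the searcher metric ("test_loss") together with the "time" metric
           # named by ``time_metric: epochs`` in adaptive.yaml.
           core_context.train.report_validation_metrics(
               steps_completed=steps_completed,
               metrics={"test_loss": test_loss, "epochs": epoch + 1},
           )
           # Stop early if the master decides to preempt an underperforming trial.
           if core_context.preempt.should_preempt():
               break


   if __name__ == "__main__":
       info = det.get_cluster_info()
       hparams = info.trial.hparams
       with det.core.init() as core_context:
           # 20 epochs matches `--epochs 20` and `max_time: 20` in adaptive.yaml.
           main(core_context, hparams, epochs=20)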