From 32f402cf511bf6ba56d336e89b4aa7c0c3253c33 Mon Sep 17 00:00:00 2001 From: Anda Zhou <83614683+azhou-determined@users.noreply.github.com> Date: Fri, 1 Nov 2024 16:24:59 -0700 Subject: [PATCH] docs: docs changes for searcher context removal (#10182) update docs and add release note for searcher context removal in 0.38.0 (cherry picked from commit f02872a42855d41261b1914256f7b4503c0ced23) --- .../apis-howto/deepspeed/deepspeed.rst | 231 ++++++++++++++++++ .../deepspeed/pytorch2deepspeed.rst | 10 +- .../deploy/helm-config-reference.rst | 2 +- .../reference/experiment-config-reference.rst | 105 ++++---- .../training/api-deepspeed-reference.rst | 13 + .../training/api-pytorch-reference.rst | 14 ++ .../searcher-context-removal.rst | 72 ++++++ examples/deepspeed/dcgan/README.md | 8 +- examples/deepspeed/dcgan/model.py | 18 +- harness/determined/pytorch/_trainer_utils.py | 18 +- .../pytorch/deepspeed/_deepspeed_context.py | 23 ++ .../determined/pytorch/deepspeed/_trainer.py | 2 + master/pkg/searcher/searcher.go | 6 +- 13 files changed, 447 insertions(+), 75 deletions(-) create mode 100644 docs/release-notes/searcher-context-removal.rst diff --git a/docs/model-dev-guide/api-guides/apis-howto/deepspeed/deepspeed.rst b/docs/model-dev-guide/api-guides/apis-howto/deepspeed/deepspeed.rst index 6c5f25bc8f9..083e7cc8dce 100644 --- a/docs/model-dev-guide/api-guides/apis-howto/deepspeed/deepspeed.rst +++ b/docs/model-dev-guide/api-guides/apis-howto/deepspeed/deepspeed.rst @@ -365,6 +365,237 @@ profiling batches 3 and 4. rendering times for TensorBoard and memory issues. For long-running experiments, it is recommended to configure a profiling schedule. +******************* + DeepSpeed Trainer +******************* + +With the DeepSpeed Trainer API, you can implement and iterate on model training code locally before +running on cluster. When you are satisfied with your model code, you configure and submit the code +on cluster. + +The DeepSpeed Trainer API lets you do the following: + +- Work locally, iterating on your model code. +- Debug models in your favorite debug environment (e.g., directly on your machine, IDE, or Jupyter + notebook). +- Run training scripts without needing to use an experiment configuration file. +- Load previously saved checkpoints directly into your model. + +Initializing the Trainer +======================== + +After defining the PyTorch Trial, initialize the trial and the trainer. +:meth:`~determined.pytorch.deepspeed.init` returns a +:class:`~determined.pytorch.deepspeed.DeepSpeedTrialContext` for instantiating +:class:`~determined.pytorch.deepspeed.DeepSpeedTrial`. Initialize +:class:`~determined.pytorch.deepspeed.Trainer` with the trial and context. + +.. code:: python + + from determined.pytorch import deepspeed as det_ds + + def main(): + with det_ds.init() as train_context: + trial = MyTrial(train_context) + trainer = det_ds.Trainer(trial, train_context) + + if __name__ == "__main__": + # Configure logging + logging.basicConfig(level=logging.INFO, format=det.LOG_FORMAT) + main() + +Training is configured with a call to :meth:`~determined.pytorch.deepspeed.Trainer.fit` with +training loop arguments, such as checkpointing periods, validation periods, and checkpointing +policy. + +.. 
code:: diff + + from determined import pytorch + from determined.pytorch import deepspeed as det_ds + + def main(): + with det_ds.init() as train_context: + trial = MyTrial(train_context) + trainer = det_ds.Trainer(trial, train_context) + + trainer.fit( + + max_length=pytorch.Epoch(10), + + checkpoint_period=pytorch.Batch(100), + + validation_period=pytorch.Batch(100), + + checkpoint_policy="all" + + ) + + + if __name__ == "__main__": + # Configure logging + logging.basicConfig(level=logging.INFO, format=det.LOG_FORMAT) + main() + +Run Your Training Script Locally +================================ + +Run training scripts locally without submitting to a cluster or defining an experiment configuration +file. + +.. code:: python + + from determined import pytorch + from determined.pytorch import deepspeed as det_ds + + def main(): + with det_ds.init() as train_context: + trial = MyTrial(train_context) + trainer = det_ds.Trainer(trial, train_context) + trainer.fit( + max_length=pytorch.Epoch(10), + checkpoint_period=pytorch.Batch(100), + validation_period=pytorch.Batch(100), + checkpoint_policy="all", + ) + + + if __name__ == "__main__": + # Configure logging + logging.basicConfig(level=logging.INFO, format=det.LOG_FORMAT) + main() + +You can run this Python script directly (``python3 train.py``), or in a Jupyter notebook. This code +will train for ten epochs, checkpointing and validating every 100 batches. + +Local Distributed Training +========================== + +Local training can utilize multiple GPUs on a single node with a few modifications to the above +code. + +.. code:: diff + + import deepspeed + + def main(): + + # Initialize distributed backend before det_ds.init() + + deepspeed.init_distributed() + + # Initialize DistributedContext + with det_ds.init( + + distributed=core.DistributedContext.from_deepspeed() + ) as train_context: + trial = MyTrial(train_context) + trainer = det_ds.Trainer(trial, train_context) + trainer.fit( + max_length=pytorch.Epoch(10), + checkpoint_period=pytorch.Batch(100), + validation_period=pytorch.Batch(100), + checkpoint_policy="all" + ) + +This code can be directly invoked with your distributed backend's launcher: ``deepspeed --num_gpus=4 +trainer.py --deepspeed --deepspeed_config ds_config.json`` + +Test Mode +========= + +Trainer accepts a test_mode parameter which, if true, trains and validates your training code for +only one batch, checkpoints, then exits. This is helpful for debugging code or writing automated +tests around your model code. + +.. code:: diff + + trainer.fit( + max_length=pytorch.Epoch(10), + checkpoint_period=pytorch.Batch(100), + validation_period=pytorch.Batch(100), + + test_mode=True + ) + +Prepare Your Training Code for Deploying to a Determined Cluster +================================================================ + +Once you are satisfied with the results of training the model locally, you submit the code to a +cluster. This example allows for distributed training locally and on cluster without having to make +code changes. + +Example workflow of frequent iterations between local debugging and cluster deployment: + +.. code:: diff + + def main(): + + info = det.get_cluster_info() + + if info is None: + + # Local: configure local distributed training. + + deepspeed.init_distributed() + + distributed_context = core.DistributedContext.from_deepspeed() + + latest_checkpoint = None + + else: + + # On-cluster: Determined will automatically detect distributed context. 
+   +           distributed_context = None
+   +           # On-cluster: configure the latest checkpoint for pause/resume training functionality.
+   +           latest_checkpoint = info.latest_checkpoint
+
+       with det_ds.init(
+   +       distributed=distributed_context
+       ) as train_context:
+           trial = DCGANTrial(train_context)
+           trainer = det_ds.Trainer(trial, train_context)
+           trainer.fit(
+               max_length=pytorch.Epoch(11),
+               checkpoint_period=pytorch.Batch(100),
+               validation_period=pytorch.Batch(100),
+   +           latest_checkpoint=latest_checkpoint,
+           )
+
+To run the Trainer API solely on-cluster, the code is simpler:
+
+.. code:: python
+
+   def main():
+       with det_ds.init() as train_context:
+           trial_inst = gan_model.DCGANTrial(train_context)
+           trainer = det_ds.Trainer(trial_inst, train_context)
+           trainer.fit(
+               max_length=pytorch.Epoch(11),
+               checkpoint_period=pytorch.Batch(100),
+               validation_period=pytorch.Batch(100),
+               latest_checkpoint=det.get_cluster_info().latest_checkpoint,
+           )
+
+Submit Your Trial for Training on Cluster
+=========================================
+
+To run your experiment on cluster, you'll need to create an experiment configuration (YAML) file.
+Your experiment configuration file must contain a searcher configuration and an entrypoint.
+
+.. code:: yaml
+
+   name: dcgan_deepspeed_mnist
+   searcher:
+     name: single
+     metric: validation_loss
+   resources:
+     slots_per_trial: 2
+   entrypoint: python3 -m determined.launch.deepspeed python3 train.py
+
+Submit the trial to the cluster:
+
+.. code:: bash
+
+   det e create det.yaml .
+
+If your training code needs to read some values from the experiment configuration, you can set the
+``data`` field and read from ``det.get_cluster_info().trial.user_data`` or set ``hyperparameters``
+and read from ``det.get_cluster_info().trial.hparams``.
+
+Profiling
+=========
+
+When training on cluster, you can enable the system metrics profiler by adding a parameter to your
+``fit()`` call:
+
+.. code:: diff
+
+   trainer.fit(
+       ...,
+   +   profiling_enabled=True
+   )
+
 *****************************
  Known DeepSpeed Constraints
 *****************************
diff --git a/docs/model-dev-guide/api-guides/apis-howto/deepspeed/pytorch2deepspeed.rst b/docs/model-dev-guide/api-guides/apis-howto/deepspeed/pytorch2deepspeed.rst
index 5596616d896..79ade0f1f82 100644
--- a/docs/model-dev-guide/api-guides/apis-howto/deepspeed/pytorch2deepspeed.rst
+++ b/docs/model-dev-guide/api-guides/apis-howto/deepspeed/pytorch2deepspeed.rst
@@ -18,8 +18,14 @@ Reference conversion example:

 .. code:: diff

-   -class MyTrial(PyTorchTrial):
-   +class MyTrial(DeepSpeedTrial):
+   +import deepspeed
+
+   -from determined import pytorch
+   +from determined.pytorch import deepspeed as det_ds
+
+
+   -class MyTrial(pytorch.PyTorchTrial):
+   +class MyTrial(det_ds.DeepSpeedTrial):
        def __init__(self, context):
            self.context = context
            self.args = AttrDict(self.context.get_hparams())
diff --git a/docs/reference/deploy/helm-config-reference.rst b/docs/reference/deploy/helm-config-reference.rst
index bede49d2f1b..5548c9ce93f 100644
--- a/docs/reference/deploy/helm-config-reference.rst
+++ b/docs/reference/deploy/helm-config-reference.rst
@@ -248,7 +248,7 @@ to: ``determinedai/pytorch-ngc-dev:0736b6d``.

 -  ``logPolicies``: Sets log policies for trials. For details, visit :ref:`log_policies
-   `.
+   `.

 ..
code:: yaml diff --git a/docs/reference/experiment-config-reference.rst b/docs/reference/experiment-config-reference.rst index 65896f57cb4..6b3222b2581 100644 --- a/docs/reference/experiment-config-reference.rst +++ b/docs/reference/experiment-config-reference.rst @@ -182,19 +182,20 @@ Example: .. _scheduling-unit: -``scheduling_unit`` -=================== +``scheduling_unit`` (deprecated) +================================ Optional. Instructs how frequent to perform system operations, such as periodic checkpointing and -preemption, in the unit of batches. The number of records in a batch is controlled by the -:ref:`global_batch_size ` hyperparameter. Defaults to ``100``. +preemption, in the unit of batches. This field has been deprecated and the behavior should be +configured in training code directly. Please see :ref:`apis-howto-overview` for details specific to +your training framework. + +.. _config-records-per-epoch: + +``records_per_epoch`` (deprecated) +================================== -- Setting this value too small can increase the overhead of system operations and decrease training - throughput. -- Setting this value too large might prevent the system from reallocating resources from this - workload to another, potentially more important, workload. -- As a rule of thumb, it should be set to the number of batches that can be trained in roughly - 60--180 seconds. +Optional. The number of records in the training data set. This field has been deprecated. .. _max-restarts: @@ -321,22 +322,12 @@ While debugging, the logger will display lines highlighted in blue for easy iden .. _experiment-config-min-validation-period: -``min_validation_period`` -========================= - -Optional. Specifies the minimum frequency at which validation should be run for each trial. +``min_validation_period`` (deprecated) +====================================== -- The frequency should be defined using a nested dictionary indicating the unit as records, - batches, or epochs. For example: - -.. code:: yaml - - min_validation_period: - epochs: 2 - -- :class:`~determined.pytorch.deepspeed.DeepSpeedTrial` and - :class:`~determined.keras.TFKerasTrial`: If this is in the unit of epochs, ``records_per_epoch`` - must be specified. +Optional. Specifies the minimum frequency at which validation should be run for each trial. This +field has been deprecated and should be specified directly in training code. Please see +:ref:`apis-howto-overview` for details specific to your training framework. .. _experiment-config-perform-initial-validation: @@ -362,22 +353,12 @@ Determined checkpoints in the following situations: .. _experiment-config-min-checkpoint-period: -``min_checkpoint_period`` -========================= - -Optional. Specifies the minimum frequency for running checkpointing for each trial. - -- This value should be set using a nested dictionary in the form of records, batches, or epochs. - For example: +``min_checkpoint_period`` (deprecated) +====================================== - .. code:: yaml - - min_checkpoint_period: - epochs: 2 - -- :class:`~determined.pytorch.deepspeed.DeepSpeedTrial` and - :class:`~determined.keras.TFKerasTrial`: If the unit is in epochs, you must also specify - ``records_per_epoch``. +Optional. Specifies the minimum frequency for running checkpointing for each trial. This field has +been deprecated and should be specified directly in training code. Please see +:ref:`apis-howto-overview` for details specific to your training framework. 
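+
+For example, with the Trainer-based training APIs the same behavior is configured directly in the
+training script. A minimal sketch (the ``trainer`` object and ``pytorch`` import follow the Trainer
+examples in the training API guides; other frameworks expose equivalent options):
+
+.. code:: python
+
+   from determined import pytorch
+
+   # Periods formerly expressed as min_checkpoint_period / min_validation_period
+   # are now passed to the training loop itself.
+   trainer.fit(
+       checkpoint_period=pytorch.Batch(100),  # checkpoint every 100 batches
+       validation_period=pytorch.Epoch(1),    # validate once per epoch
+   )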
``checkpoint_policy`` ===================== @@ -394,8 +375,7 @@ Should be set to one of the following values: - ``none``: A checkpoint will never be taken *due* to a validation. However, even with this policy selected, checkpoints are still expected to be taken after the trial is finished training, due to - cluster scheduling decisions, before search method decisions, or due to - :ref:`min_checkpoint_period `. + cluster scheduling decisions, or when specified in training code. .. _checkpoint-storage: @@ -835,9 +815,6 @@ Single The ``single`` search method does not perform a hyperparameter search at all; rather, it trains a single trial for a fixed length. When using this search method, all of the hyperparameters specified in the :ref:`hyperparameters ` section must be constants. -By default, validation metrics are only computed once, after the specified length of training has -been completed; :ref:`min_validation_period ` can be used -to specify that validation metrics should be computed more frequently. ``metric`` ---------- @@ -847,6 +824,12 @@ configuration. .. _experiment-configuration_single-searcher-max-length: +``max_length`` (deprecated) +--------------------------- + +Previously, ``max_length`` was required to determine the length of each trial. This field has been +deprecated and all training lengths should be specified directly in training code. + **Optional Fields** ``smaller_is_better`` @@ -873,10 +856,7 @@ Random The ``random`` search method implements a simple random search. The user specifies how many hyperparameter configurations should be trained and how long each configuration should be trained -for; the configurations are sampled randomly from the hyperparameter space. Each trial is trained -for the specified length and then validation metrics are computed. :ref:`min_validation_period -` can be used to specify that validation metrics should be -computed more frequently. +for; the configurations are sampled randomly from the hyperparameter space. ``metric`` ---------- @@ -889,6 +869,12 @@ configuration. Required. The number of trials, i.e., hyperparameter configurations, to evaluate. +``max_length`` (deprecated) +--------------------------- + +Previously, ``max_length`` was required to determine the length of each trial. This field has been +deprecated and all training lengths should be specified directly in training code. + **Optional Fields** ``smaller_is_better`` @@ -929,6 +915,12 @@ specified via the ``hyperparameters`` field. For more details see the Required. The name of the validation metric used to evaluate the performance of a hyperparameter configuration. +``max_length`` (deprecated) +--------------------------- + +Previously, ``max_length`` was required to determine the length of each trial. This field has been +deprecated and all training lengths should be specified directly in training code. + **Optional Fields** ``smaller_is_better`` @@ -971,6 +963,12 @@ to terminate training. Once trials are stopped, they will not be resumed. Required. The name of the validation metric used to evaluate the performance of a hyperparameter configuration. +``max_length`` (deprecated) +--------------------------- + +The length of the trial. This field has been deprecated and should be replaced with ``time_metric`` +and ``max_time`` below. + ``time_metric`` --------------- @@ -1401,12 +1399,13 @@ details. .. 
_exp-config-optimizations: -*************** - Optimizations -*************** +**************************** + Optimizations (deprecated) +**************************** The ``optimizations`` section contains configuration options that influence the performance of the -experiment. +experiment. This section has been deprecated and should be configured in training code. Please see +:ref:`apis-howto-overview` for details specific to your training framework. .. _config-aggregation-frequency: diff --git a/docs/reference/training/api-deepspeed-reference.rst b/docs/reference/training/api-deepspeed-reference.rst index 0fa7fbe8f87..6d00a8253ec 100644 --- a/docs/reference/training/api-deepspeed-reference.rst +++ b/docs/reference/training/api-deepspeed-reference.rst @@ -48,3 +48,16 @@ documentation): - :ref:`determined.pytorch.samplers ` - :ref:`determined.pytorch.MetricReducer ` - :ref:`determined.pytorch.PyTorchCallback ` + +****************************************** + ``determined.pytorch.deepspeed.Trainer`` +****************************************** + +.. autoclass:: determined.pytorch.deepspeed.Trainer + :members: + +***************************************** + ``determined.pytorch.deepspeed.init()`` +***************************************** + +.. autofunction:: determined.pytorch.deepspeed.init diff --git a/docs/reference/training/api-pytorch-reference.rst b/docs/reference/training/api-pytorch-reference.rst index 5b33714ae03..31a710048a8 100644 --- a/docs/reference/training/api-pytorch-reference.rst +++ b/docs/reference/training/api-pytorch-reference.rst @@ -120,3 +120,17 @@ platform which includes: ******************************* .. autofunction:: determined.pytorch.init + +****************************** + ``determined.pytorch.Batch`` +****************************** + +.. autoclass:: determined.pytorch.Batch + :members: + +****************************** + ``determined.pytorch.Epoch`` +****************************** + +.. autoclass:: determined.pytorch.Epoch + :members: diff --git a/docs/release-notes/searcher-context-removal.rst b/docs/release-notes/searcher-context-removal.rst new file mode 100644 index 00000000000..74c81a746b2 --- /dev/null +++ b/docs/release-notes/searcher-context-removal.rst @@ -0,0 +1,72 @@ +:orphan: + +**Breaking Changes** + +- ASHA: All experiments using ASHA hyperparameter search must now configure ``max_time`` and + ``time_metric`` in the experiment config, instead of ``max_length``. Additionally, training code + must report the configured ``time_metric`` in validation metrics. As a convenience, Determined + training loops now automatically report ``batches`` and ``epochs`` with metrics, which you can + use as your ``time_metric``. ASHA experiments without this modification will no longer run. + +- Custom Searchers: all custom searchers including DeepSpeed Autotune were deprecated in ``0.36.0`` + and are now being removed. Users are encouraged to use a preset searcher, which can be easily + :ref:`configured ` for any experiment. + +**New Features** + +- API: introduce ``keras.DeterminedCallback``, a new high-level training API for TF Keras that + integrates Keras training code with Determined through a single :ref:`Keras Callback + `. + +- API: introduce ``deepspeed.Trainer``, a new high-level training API for DeepSpeedTrial that + allows for Python-side training loop configurations and includes support for local training. 
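+
+  As a brief, hedged sketch of the new entry point (``MyTrial`` stands in for any
+  ``DeepSpeedTrial`` subclass; see the DeepSpeed training API guide for a complete example):
+
+  .. code:: python
+
+     from determined import pytorch
+     from determined.pytorch import deepspeed as det_ds
+
+     with det_ds.init() as train_context:
+         trainer = det_ds.Trainer(MyTrial(train_context), train_context)
+         trainer.fit(max_length=pytorch.Epoch(1))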
+
+**Deprecations**
+
+- Experiment Config: the ``max_length`` field of the searcher configuration section has been
+  deprecated for all experiments and searchers. Users are expected to configure the desired
+  training length directly in training code.
+
+- Experiment Config: the ``optimizations`` config has been deprecated. Please see :ref:`Training
+  APIs ` to configure supported optimizations through training code directly.
+
+- Experiment Config: the ``scheduling_unit``, ``min_checkpoint_period``, and
+  ``min_validation_period`` config fields have been deprecated. Instead, these configuration
+  options should be specified in training code.
+
+- Experiment Config: the ``entrypoint`` field no longer accepts ``model_def:TrialClass`` as trial
+  definitions. Please invoke your training script directly (``python3 train.py``).
+
+- Core API: the ``SearcherContext`` (``core.searcher``) has been deprecated. Training code no
+  longer requires ``core.searcher.operations`` to run, and progress should be reported through
+  ``core.train.report_progress``.
+
+- DeepSpeed: the ``num_micro_batches_per_slot`` and ``train_micro_batch_size_per_gpu`` attributes
+  on ``DeepSpeedContext`` have been replaced with ``get_train_micro_batch_size_per_gpu()`` and
+  ``get_num_micro_batches_per_slot()``.
+
+- Horovod: the horovod distributed training backend has been deprecated. Users are encouraged to
+  migrate to the native distributed backend of their training framework (``torch.distributed`` or
+  ``tf.distribute``).
+
+- Trial APIs: ``TFKerasTrial`` has been deprecated. Users are encouraged to migrate to the new
+  :ref:`Keras Callback `.
+
+- Launchers: the ``--trial`` argument in Determined launchers has been deprecated. Please invoke
+  your training script directly.
+
+- ASHA: the ``stop_once`` field of the ``searcher`` config for ASHA searchers has been deprecated.
+  All ASHA searches are now early-stopping based (``stop_once: true``) instead of promotion based.
+
+- CLI: the ``--test`` and ``--local`` flags for ``det experiment create`` have been deprecated. All
+  training APIs now support local execution (``python3 train.py``). Please see
+  :ref:`apis-howto-overview` for details specific to your framework.
+
+- Web UI: previously, trials that reported an ``epoch`` metric enabled an epoch X-axis in the Web
+  UI metrics tab. This metric name has been changed to ``epochs``, with ``epoch`` as a fallback
+  option.
+
+**Removed Features**
+
+- Web UI: "Continue Training" no longer supports a configurable number of batches in the Web UI
+  and will simply resume the trial from the last checkpoint.
diff --git a/examples/deepspeed/dcgan/README.md b/examples/deepspeed/dcgan/README.md
index f0b9811b9c9..31481d432c3 100644
--- a/examples/deepspeed/dcgan/README.md
+++ b/examples/deepspeed/dcgan/README.md
@@ -25,10 +25,16 @@ After installing docker and pulling an image, users can launch a container via

 Install necessary dependencies via `pip install determined mpi4py`

-Then, run the following command:
+Then, if running on a single node with a single GPU, run the following command:
 ```
 python trainer.py
 ```
+For multiple nodes and/or multiple GPUs per node, use the following:
+```
+deepspeed --num_nodes=<num_nodes> --num_gpus=<num_gpus> trainer.py --deepspeed --deepspeed_config ds_config.json
+```
+Where `num_nodes` corresponds to the number of nodes on your local cluster and `num_gpus` corresponds to
+the number of GPUs per node. An illustrative `ds_config.json` sketch is shown below.

 Any additional configs can be specified in `mnist.yaml` and `ds_config.json` accordingly.
 
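+
+As a starting point, a minimal `ds_config.json` might look like the sketch below (illustrative
+values only: the batch size, optimizer settings, and fp16 flag are assumptions, not the shipped
+configuration; adjust them to your hardware and to the values in `mnist.yaml`):
+```
+{
+  "train_batch_size": 64,
+  "gradient_accumulation_steps": 1,
+  "optimizer": {
+    "type": "Adam",
+    "params": { "lr": 0.0002, "betas": [0.5, 0.999] }
+  },
+  "fp16": { "enabled": false }
+}
+```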
diff --git a/examples/deepspeed/dcgan/model.py b/examples/deepspeed/dcgan/model.py index 99322dd5a9c..8b3e08d7ebb 100644 --- a/examples/deepspeed/dcgan/model.py +++ b/examples/deepspeed/dcgan/model.py @@ -47,7 +47,7 @@ def __init__( self.discriminator = self.context.wrap_model_engine(discriminator) self.fixed_noise = self.context.to_device( torch.randn( - self.context.train_micro_batch_size_per_gpu, self.hparams["noise_length"], 1, 1 + self.context.get_train_micro_batch_size_per_gpu(), self.hparams["noise_length"], 1, 1 ) ) self.criterion = nn.BCELoss() @@ -63,7 +63,7 @@ def _get_noise(self, dtype: torch.dtype) -> torch.Tensor: torch.Tensor, self.context.to_device( torch.randn( - self.context.train_micro_batch_size_per_gpu, + self.context.get_train_micro_batch_size_per_gpu(), self.hparams["noise_length"], 1, 1, @@ -94,7 +94,7 @@ def train_batch( else: dtype = torch.float32 real_label, fake_label = self._get_label_constants( - self.context.train_micro_batch_size_per_gpu, dtype + self.context.get_train_micro_batch_size_per_gpu(), dtype ) ############################ # (1) Update D network: maximize log(D(x)) + log(1 - D(G(z))) @@ -107,7 +107,7 @@ def train_batch( D_x = 0.0 D_G_z1 = 0.0 fake_sample_count = ( - self.context.train_micro_batch_size_per_gpu * self.gradient_accumulation_steps + self.context.get_train_micro_batch_size_per_gpu() * self.gradient_accumulation_steps ) for i in range(self.gradient_accumulation_steps): @@ -133,7 +133,7 @@ def train_batch( output = self.discriminator(fake.detach()) errD_fake = self.criterion(output, fake_label) self.discriminator.backward(errD_fake) - errD_fake_sum += errD_fake * self.context.train_micro_batch_size_per_gpu + errD_fake_sum += errD_fake * self.context.get_train_micro_batch_size_per_gpu() D_G_z1 += output.sum().item() # update self.discriminator.step() @@ -154,7 +154,7 @@ def train_batch( output = self.discriminator(fake) errG = self.criterion(output, real_label) # fake labels are real for generator cost self.generator.backward(errG) - errG_sum += errG * self.context._train_micro_batch_size_per_gpu + errG_sum += errG * self.context.get_train_micro_batch_size_per_gpu() D_G_z2_sum += output.sum().item() self.generator.step() @@ -189,7 +189,7 @@ def build_training_data_loader(self) -> Any: dataset = data.get_dataset(self.data_config) return DataLoader( dataset, - batch_size=self.context.train_micro_batch_size_per_gpu, + batch_size=self.context.get_train_micro_batch_size_per_gpu(), shuffle=True, num_workers=int(self.hparams["data_workers"]), ) @@ -201,9 +201,9 @@ def build_validation_data_loader(self) -> Any: dataset, list( range( - self.context.train_micro_batch_size_per_gpu + self.context.get_train_micro_batch_size_per_gpu() * self.context.distributed.get_size() ) ), ) - return DataLoader(dataset, batch_size=self.context.train_micro_batch_size_per_gpu) + return DataLoader(dataset, batch_size=self.context.get_train_micro_batch_size_per_gpu()) diff --git a/harness/determined/pytorch/_trainer_utils.py b/harness/determined/pytorch/_trainer_utils.py index 254fad6e150..9a27642b9cb 100644 --- a/harness/determined/pytorch/_trainer_utils.py +++ b/harness/determined/pytorch/_trainer_utils.py @@ -12,10 +12,6 @@ class TrainUnit: the value of unit, where the value can be an int or an implementable collections.abc.Container. TrainUnits are used to define periodic training behavior such as checkpointing and validating. - - int values are treated as periods, e.g. Batch(100) will checkpoint/validate every 100 batches. 
- collections.abc.Container values are treated as schedules, e.g. Batch(1,5,10) will - checkpoint/validate on batches 1, 5, and 10. """ def __init__(self, value: Union[int, abc.Container]): @@ -87,7 +83,12 @@ def _divides(self, steps: int) -> bool: class Epoch(TrainUnit): """ - Epoch step type (e.g. Epoch(1) defines 1 epoch) + Defines an Epoch unit for specifying length to PyTorch trainers. + + Epoch(int) values are treated as periods, e.g. Epoch(100) will checkpoint/validate every 100 + epochs. + Epoch(collections.abc.Container) values are treated as schedules, e.g. Epoch([1,5,10]) will + checkpoint/validate on epochs 1, 5, and 10. """ pass @@ -95,7 +96,12 @@ class Epoch(TrainUnit): class Batch(TrainUnit): """ - Batch step type (e.g. Batch(1) defines 1 batch) + Defines a Batch unit for specifying length to PyTorch trainers. + + Batch(int) values are treated as periods, e.g. Batch(100) will checkpoint/validate every 100 + batches. + Batch(collections.abc.Container) values are treated as schedules, e.g. Batch([1,5,10]) will + checkpoint/validate on batches 1, 5, and 10. """ @staticmethod diff --git a/harness/determined/pytorch/deepspeed/_deepspeed_context.py b/harness/determined/pytorch/deepspeed/_deepspeed_context.py index b71f44e31da..6e15ea278e8 100644 --- a/harness/determined/pytorch/deepspeed/_deepspeed_context.py +++ b/harness/determined/pytorch/deepspeed/_deepspeed_context.py @@ -2,6 +2,7 @@ import logging import pathlib import time +import warnings from importlib import util as importutil from typing import Any, Dict, List, Optional, Set, Type, Union, cast @@ -239,6 +240,17 @@ def get_train_micro_batch_size_per_gpu(self) -> int: ) return self._train_micro_batch_size_per_gpu + @property + def train_micro_batch_size_per_gpu(self) -> int: + warnings.warn( + "DeepSpeedTrialContext.train_micro_batch_size_per_gpu has been deprecated in " + "Determined 0.38.0; please use the context.get_train_micro_batch_size_per_gpu() getter " + "instead.", + FutureWarning, + stacklevel=2, + ) + return self.get_train_micro_batch_size_per_gpu() + def get_num_micro_batches_per_slot(self) -> int: if self._num_micro_batches_per_slot is None: raise det.errors.InvalidExperimentException( @@ -246,6 +258,17 @@ def get_num_micro_batches_per_slot(self) -> int: ) return self._num_micro_batches_per_slot + @property + def num_micro_batches_per_slot(self) -> int: + warnings.warn( + "DeepSpeedTrialContext.num_micro_batches_per_slot has been deprecated in " + "Determined 0.38.0; please use the context.get_num_micro_batches_per_slot() getter " + "instead.", + FutureWarning, + stacklevel=2, + ) + return self.get_num_micro_batches_per_slot() + def _init_device(self) -> None: if not self._num_gpus: raise det.errors.InvalidExperimentException("GPUs required for DeepSpeedTrial.") diff --git a/harness/determined/pytorch/deepspeed/_trainer.py b/harness/determined/pytorch/deepspeed/_trainer.py index 8e36f345235..587a1b41999 100644 --- a/harness/determined/pytorch/deepspeed/_trainer.py +++ b/harness/determined/pytorch/deepspeed/_trainer.py @@ -65,7 +65,9 @@ def fit( max_length: The maximum number of steps to train for. This is a ``TrainUnit`` type (``Batch`` or ``Epoch``) which takes an ``int``. For example, ``Epoch(1)`` would train for a maximum length of one epoch. + .. note:: + If using an ASHA searcher, this value should match the searcher config values in the experiment config (i.e. ``Epoch(1)`` = `max_time: 1` and `time_metric: "epochs"`). 
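+
+               For example, a hedged sketch of a matching pair (illustrative values; the searcher
+               settings live in the experiment config YAML, not in Python)::
+
+                   # experiment config:
+                   #   searcher:
+                   #     name: adaptive_asha
+                   #     metric: validation_loss
+                   #     time_metric: epochs
+                   #     max_time: 10
+                   trainer.fit(max_length=pytorch.Epoch(10), ...)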
diff --git a/master/pkg/searcher/searcher.go b/master/pkg/searcher/searcher.go index 8b282b369f7..95479e2b9a6 100644 --- a/master/pkg/searcher/searcher.go +++ b/master/pkg/searcher/searcher.go @@ -86,7 +86,7 @@ func (s *Searcher) TrialCreated(requestID model.RequestID) ([]Action, error) { operations, err := s.method.trialCreated(s.context(), requestID) if err != nil { return nil, errors.Wrapf(err, - "error while handling a trial created event: %d", requestID) + "error while handling a trial created event: %s", requestID) } s.record(operations) return operations, nil @@ -156,7 +156,7 @@ func (s *Searcher) ValidationCompleted( operations, err := s.method.validationCompleted(s.context(), requestID, metrics) if err != nil { - return nil, errors.Wrapf(err, "error while handling a validation completed event: %d", requestID) + return nil, errors.Wrapf(err, "error while handling a validation completed event: %s", requestID) } s.record(operations) return operations, nil @@ -170,7 +170,7 @@ func (s *Searcher) TrialExited(requestID model.RequestID) ([]Action, error) { s.state.TrialsClosed[requestID] = true actions, err := s.method.trialExited(s.context(), requestID) if err != nil { - return nil, errors.Wrapf(err, "error while handling a trial closed event: %d", requestID) + return nil, errors.Wrapf(err, "error while handling a trial closed event: %s", requestID) } s.record(actions)