feat: remove searcher context from harness and master [MD-498] (#10131)

Co-authored-by: Ryan <[email protected]> Co-authored-by: Guangqing Tang <[email protected]> Co-authored-by: Michael Kardash <[email protected]>
determined-ai · Oct 26, 2024 · 3910426 · 3910426
1 parent 27bebdd
commit 3910426
Show file tree

Hide file tree

Showing 337 changed files with 11,242 additions and 22,501 deletions.
diff --git a/.circleci/real_config.yml b/.circleci/real_config.yml
@@ -2603,6 +2603,7 @@ jobs:
       - run: pip install mypy pytest coverage
       - install-codecov
       - setup-paths
+      - run: make -C harness install
       - run: COVERAGE_FILE=$PWD/test-unit-harness-tf2-pycov make -C harness test-tf2
       - run: coverage xml -i --data-file=./test-unit-harness-tf2-pycov
       - run: codecov -v -t $CODECOV_TOKEN -F harness

diff --git a/docs/.redirects/redirects.json b/docs/.redirects/redirects.json
@@ -69,7 +69,6 @@
     "reference/deploy/config/helm-config-reference": "../helm-config-reference.html",
     "reference/deploy/config/common-config-options": "../common-config-options.html",
     "reference/deploy/config/agent-config-reference": "../agent-config-reference.html",
-    "reference/searcher/custom-searcher-reference": "../custom-searcher-reference.html",
     "setup-cluster/security/tls": "../../manage/security/tls.html",
     "setup-cluster/security/scim": "../../manage/security/scim.html",
     "setup-cluster/security/saml": "../../manage/security/saml.html",
@@ -86,13 +85,15 @@
     "model-hub-library/transformers/tutorial": "../../model-dev-guide/api-guides/_index.html",
     "model-hub-library/mmdetection/overview": "../../model-dev-guide/api-guides/_index.html",
     "model-dev-guide/hyperparameter/search-methods/index": "_index.html",
+    "model-dev-guide/hyperparameter/search-methods/hp-custom": "_index.html",
     "model-dev-guide/api-guides/batch-processing/batch-process-api-ug": "../batch-process-api-ug.html",
     "model-dev-guide/best-practices/index": "../_index.html",
     "model-dev-guide/best-practices/_index": "../_index.html",
     "model-dev-guide/model-management/index": "_index.html",
     "model-dev-guide/hyperparameter/index": "_index.html",
     "model-dev-guide/prepare-container/index": "_index.html",
     "model-dev-guide/dtrain/index": "_index.html",
+    "model-dev-guide/api-guides/apis-howto/deepspeed/autotuning": "_index.html",
     "model-dev-guide/api-guides/apis-howto/deepspeed/index": "_index.html",
     "model-dev-guide/api-guides/apis-howto/index": "_index.html",
     "model-dev-guide/api-guides/index": "_index.html",
@@ -110,6 +111,8 @@
     "integrations/pachyderm/pachyderm": "../data-transformers/pachyderm.html",
     "architecture/index": "../get-started/architecture/_index.html",
     "reference/index": "_index.html",
+    "reference/custom-searcher-reference": "_index.html",
+    "reference/searcher/custom-searcher-reference": "../_index.html",
     "model-dev-guide/index": "_index.html",
     "model-hub-library/index": "../model-dev-guide/api-guides/_index.html",
     "tutorials/index": "_index.html",
@@ -133,9 +136,9 @@
     "tutorials/tf-mnist-tutorial": "_index.html",
     "model-dev-guide/batch-processing/batch-process-api-ug": "../api-guides/batch-process-api-ug.html",
     "model-dev-guide/apis-howto/deepspeed/advanced": "../../api-guides/apis-howto/deepspeed/advanced.html",
+    "model-dev-guide/apis-howto/deepspeed/autotuning": "../../api-guides/apis-howto/deepspeed/_index.html",
     "model-dev-guide/apis-howto/deepspeed/deepspeed": "../../api-guides/apis-howto/deepspeed/deepspeed.html",
     "model-dev-guide/apis-howto/deepspeed/overview": "../../api-guides/apis-howto/deepspeed/_index.html",
-    "model-dev-guide/apis-howto/deepspeed/autotuning": "../../api-guides/apis-howto/deepspeed/autotuning.html",
     "model-dev-guide/apis-howto/deepspeed/pytorch2deepspeed": "../../api-guides/apis-howto/deepspeed/pytorch2deepspeed.html",
     "model-dev-guide/apis-howto/api-core-ug": "../api-guides/apis-howto/api-core-ug.html",
     "model-dev-guide/apis-howto/api-pytorch-ug": "../api-guides/apis-howto/api-pytorch-ug.html",
@@ -150,11 +153,11 @@
     "training/model-management/overview": "../../model-dev-guide/model-management/_index.html",
     "training/model-management/checkpoints": "../../model-dev-guide/model-management/checkpoints.html",
     "training/best-practices/overview": "../../model-dev-guide/_index.html",
+    "training/hyperparameter/search-methods/hp-custom": "../../../model-dev-guide/hyperparameter/search-methods/_index.html",
     "training/hyperparameter/search-methods/hp-random": "../../../model-dev-guide/hyperparameter/search-methods/hp-random.html",
     "training/hyperparameter/search-methods/hp-adaptive-asha": "../../../model-dev-guide/hyperparameter/search-methods/hp-adaptive-asha.html",
     "training/hyperparameter/search-methods/hp-grid": "../../../model-dev-guide/hyperparameter/search-methods/hp-grid.html",
     "training/hyperparameter/search-methods/hp-single": "../../../model-dev-guide/hyperparameter/search-methods/hp-single.html",
-    "training/hyperparameter/search-methods/hp-custom": "../../../model-dev-guide/hyperparameter/search-methods/hp-custom.html",
     "training/hyperparameter/search-methods/overview": "../../../model-dev-guide/hyperparameter/search-methods/_index.html",
     "training/hyperparameter/hp-constraints-det": "../../model-dev-guide/hyperparameter/hp-constraints-det.html",
     "training/hyperparameter/handle-trial-errors": "../../model-dev-guide/hyperparameter/handle-trial-errors.html",
@@ -223,7 +226,6 @@
     "cluster-setup-guide/historical-cluster-usage-data": "../manage/historical-cluster-usage-data.html",
     "cluster-setup-guide/workspaces": "../manage/workspaces.html",
     "quickstart-mdldev": "tutorials/quickstart-mdldev.html",
-    "reference/reference-searcher/custom-searcher-reference": "../custom-searcher-reference.html",
     "reference/reference-model-hub/modelhub/transformers-api": "../../training/_index.html",
     "reference/reference-model-hub/modelhub/mmdetection-api": "../../training/_index.html",
     "reference/reference-model-hub/index": "../training/_index.html",
@@ -233,6 +235,7 @@
     "reference/reference-deploy/config/helm-config-reference": "../../deploy/helm-config-reference.html",
     "reference/reference-deploy/config/common-config-options": "../../deploy/common-config-options.html",
     "reference/reference-deploy/index": "../deploy/_index.html",
+    "reference/reference-searcher/custom-searcher-reference": "../_index.html",
     "reference/reference-training/training/api-deepspeed-reference": "../../training/api-deepspeed-reference.html",
     "reference/reference-training/training/api-pytorch-reference": "../../training/api-pytorch-reference.html",
     "reference/reference-training/training/api-det-reference": "../../training/api-det-reference.html",

diff --git a/docs/get-started/architecture/introduction.rst b/docs/get-started/architecture/introduction.rst
@@ -810,8 +810,6 @@ In this example experiment configuration, numbers, strings, maps, and an array a
    searcher:
      name: single
      metric: error
-     max_length:
-       batches: 500
      smaller_is_better: true
    environment:
      environment_variables:

diff --git a/docs/get-started/example-solutions/_index.rst b/docs/get-started/example-solutions/_index.rst
@@ -55,21 +55,6 @@ For an introduction to using the training APIs, please visit :ref:`Training APIs
       -  Enron Email Corpus
       -  :download:`gpt_neox.tgz </examples/gpt_neox.tgz>`
 
-********************
- DeepSpeed Autotune
-********************
-
-.. list-table::
-   :header-rows: 1
-
-   -  -  Framework
-      -  Dataset
-      -  Filename
-
-   -  -  DeepSpeed (PyTorch)
-      -  ImageNet (Generated)
-      -  :download:`torchvision.tgz </examples/torchvision.tgz>`
-
    -  -  Hugging Face (DeepSpeed/PyTorch)
       -  Beans (Hugging Face)
       -  :download:`hf_image_classification.tgz </examples/hf_image_classification.tgz>`

diff --git a/docs/get-started/webui-qs.rst b/docs/get-started/webui-qs.rst
@@ -158,8 +158,6 @@ our multi-trial search. Finally, we'll run a remote distributed training job.
                  name: random
                  metric: validation_loss
                  max_trials: 20
-                 max_length:
-                   batches: 1000
                  smaller_is_better: true
 
                entrypoint: python3 train.py

diff --git a/docs/model-dev-guide/api-guides/apis-howto/api-core-ug-basic.rst b/docs/model-dev-guide/api-guides/apis-howto/api-core-ug-basic.rst
@@ -25,8 +25,8 @@ the following capabilities:
 -  hyperparameter search
 -  distributing work across multiple GPUs and/or nodes
 
-These are the same features provided by the higher-level PyTorchTrial, DeepSpeedTrial, and
-TFKerasTrial APIs: those APIs are implemented using the Core API.
+These features are also available in the higher-level PyTorchTrial and DeepSpeedTrial APIs, both of
+which are built on top of the Core API.
 
 This user guide shows you how to get started using the Core API.
 
@@ -85,7 +85,7 @@ with only a few new lines of code.
    .. literalinclude:: ../../../../examples/tutorials/core_api/1_metrics.py
       :language: python
       :start-after: NEW: import determined
-      :end-before: def main
+      :end-at: import determined as det
 
 #. Enable ``logging``, using the ``det.LOG_FORMAT`` for logs. This enables useful log messages from
    the ``determined`` library, and ``det.LOG_FORMAT`` enables filter-by-level in the WebUI.
@@ -250,27 +250,6 @@ runs a train-validate-report loop:
       :dedent:
       :start-at: hparams = info.trial.hparams
 
-#. Modify ``main()`` to run the train-validate-report loop mentioned above by iterating through
-   ``core_context.searcher.operations()``. Each :class:`~determined.core.SearcherOperation` from
-   :meth:`~determined.core.SearcherContext.operations` has a ``length`` attribute that specifies the
-   absolute length of training to complete. After validating, report the searcher metric value using
-   ``op.report_completed()``.
-
-   .. literalinclude:: ../../../../examples/tutorials/core_api/3_hpsearch.py
-      :language: python
-      :dedent:
-      :start-at: batch = starting_batch
-      :end-at: op.report_completed
-
-#. Because the training length can vary, you might exit the train-validate-report loop before saving
-   the last of your progress. To handle this, add a conditional save after the loop ends:
-
-   .. literalinclude:: ../../../../examples/tutorials/core_api/3_hpsearch.py
-      :language: python
-      :dedent:
-      :start-at: if last_checkpoint_batch != steps_completed
-      :end-at: save_state
-
 #. Create a new ``3_hpsearch.yaml`` file and add an ``entrypoint`` that invokes ``3_hpsearch.py``:
 
    .. literalinclude:: ../../../../examples/tutorials/core_api/3_hpsearch.yaml
@@ -365,32 +344,15 @@ considerations are:
       :start-after: some logs are easier to read
       :end-at: logging.info
 
-#. Only the chief worker is permitted to report training metrics, report validation metrics, upload
-   checkpoints, or report searcher operations completed. This rule applies to the steps you take
-   periodically during training:
+#. Only the chief worker is permitted to report metrics, upload checkpoints, or report progress.
+   This rule applies to the steps you take periodically during training:
 
    .. literalinclude:: ../../../../examples/tutorials/core_api/4_distributed.py
       :language: python
       :dedent:
       :start-at: if steps_completed % 10 == 0
       :end-at: return
 
-   The rule also applies to the steps you take after validating:
-
-   .. literalinclude:: ../../../../examples/tutorials/core_api/4_distributed.py
-      :language: python
-      :dedent:
-      :start-after: only the chief may report validation metrics
-      :end-at: op.report_completed
-
-   The rule also applies to the conditional save after the main loop completes:
-
-   .. literalinclude:: ../../../../examples/tutorials/core_api/4_distributed.py
-      :language: python
-      :dedent:
-      :start-at: again, only the chief may upload checkpoints
-      :end-at: save_state
-
 #. Create a ``4_distributed.yaml`` file by copying the ``3_hpsearch.yaml`` file and changing the
    first couple of lines:
 
@@ -411,7 +373,7 @@ considerations are:
    .. literalinclude:: ../../../../examples/tutorials/core_api/4_distributed.yaml
       :language: yaml
       :start-at: searcher:
-      :end-at: max_length:
+      :end-at: metric:
 
 #. Run the code using the Determined CLI with the following command:
 

diff --git a/docs/model-dev-guide/api-guides/apis-howto/api-core-ug.rst b/docs/model-dev-guide/api-guides/apis-howto/api-core-ug.rst
@@ -332,8 +332,10 @@ settings in our experiment configuration file:
 -  ``smaller_is_better``: ``True`` (This is equivalent to minimization vs. maximization of
    objective.)
 -  ``max_trials``: 500 (This is the maximum number of trials the searcher should run.)
--  ``max_length``: 20 epochs (The max length of a trial. For more information, visit Adaptive ASHA
-   in the :ref:`Experiment Configuration Reference <experiment-configuration>`.
+-  ``time_metric``: ``epochs`` (This is the name of the "time" metric which we report in validation
+   metrics).
+-  ``max_time``: 20 (The max number of epochs a trial will report. For more information, visit
+   Adaptive ASHA in the :ref:`Experiment Configuration Reference <experiment-configuration>`.)
 
 In addition, we also need to define the hyperparameters themselves. Adaptive ASHA will pick values
 between the ``minval`` and ``maxval`` for each hyperparameter for each trial.