From 2594d904202ed8a710bd77ef795a1e6d2b6d0975 Mon Sep 17 00:00:00 2001
From: Ryan
Date: Thu, 10 Oct 2024 09:44:58 -0600
Subject: [PATCH] chore: remove e2e_slurm_gpu series tests (#10021)

Note that there are nightly tests decorated with:
- @e2e_slurm
- skipif(not torch.cuda.is_available())

So we still have some GPU-specific slurm tests at this point. But those
tests were not actually running as part of the e2e_slurm_gpu tests anyway.

This is part of a larger effort to get rid of our znode tests, which are
notoriously unreliable.
---
 .circleci/real_config.yml                    | 22 -----
 e2e_tests/pytest.ini                         |  1 -
 e2e_tests/tests/cluster/test_checkpoints.py  |  2 +-
 e2e_tests/tests/experiment/test_profiling.py |  1 -
 e2e_tests/tests/experiment/test_pytorch.py   | 64 ---------------
 .../fixtures/pytorch_identity/__init__.py    |  0
 .../pytorch_identity/distributed.yaml        | 24 ------
 .../fixtures/pytorch_identity/model_def.py   | 79 ------------------
 e2e_tests/tests/nightly/test_pytorch2.py     |  3 +-
 .../pytorch_identity/distributed.yaml        |  1 -
 .../fixtures/pytorch_identity/model_def.py   | 80 ++++++++++++++++++-
 .../experiment/pytorch/test_pytorch_trial.py |  4 +-
 tools/slurm/README.md                        |  1 -
 13 files changed, 82 insertions(+), 200 deletions(-)
 delete mode 100644 e2e_tests/tests/experiment/test_pytorch.py
 delete mode 100644 e2e_tests/tests/fixtures/pytorch_identity/__init__.py
 delete mode 100644 e2e_tests/tests/fixtures/pytorch_identity/distributed.yaml
 delete mode 100644 e2e_tests/tests/fixtures/pytorch_identity/model_def.py
 delete mode 120000 harness/tests/experiment/fixtures/pytorch_identity/distributed.yaml
 mode change 120000 => 100644 harness/tests/experiment/fixtures/pytorch_identity/model_def.py

diff --git a/.circleci/real_config.yml b/.circleci/real_config.yml
index 55564d674c1..e47a75d5969 100644
--- a/.circleci/real_config.yml
+++ b/.circleci/real_config.yml
@@ -4208,18 +4208,6 @@ workflows:
           security:
             initial_user_password: ${INITIAL_USER_PASSWORD}
 
-      - test-e2e-slurm:
-          name: test-e2e-slurm-gpu
-          mark: "e2e_slurm_gpu"
-          requires:
-            - package-and-push-system-local-ee
-          context:
-            - dev-ci-cluster-default-user-credentials
-          filters:
-            branches:
-              only:
-                - main
-
       # Singularity over SLURM test on GCP
       - test-e2e-hpc-gcp:
           context:
@@ -5106,16 +5094,6 @@ workflows:
             extra-pytest-flags: "--no-compare-stats"
             collect-det-job-logs: false
 
-      - test-e2e-slurm:
-          name: test-e2e-slurm-gpu
-          context:
-            - dev-ci-cluster-default-user-credentials
-          filters: *upstream-feature-branch
-          mark: "e2e_slurm_gpu"
-          requires:
-            - package-and-push-system-local-ee
-            - request-hpc-tests
-
       - test-e2e-slurm:
           name: test-e2e-slurm-enroot-znode
           context:
diff --git a/e2e_tests/pytest.ini b/e2e_tests/pytest.ini
index d3cc7142b67..d7212c4bb24 100644
--- a/e2e_tests/pytest.ini
+++ b/e2e_tests/pytest.ini
@@ -20,7 +20,6 @@ markers =
     e2e_pbs: end to end pbs integration tests
     e2e_saml: tests for saml with okta
     e2e_slurm: end to end slurm integration tests
-    e2e_slurm_gpu: end to end slurm GPU tests
     e2e_slurm_restart: slurm integration tests that require restarting the master
     e2e_slurm_preemption: hpc integration test to ensure preemption is working
     e2e_slurm_internet_connected_cluster: slurm integrations for clusters with internet access
diff --git a/e2e_tests/tests/cluster/test_checkpoints.py b/e2e_tests/tests/cluster/test_checkpoints.py
index a375452944d..cd766612f12 100644
--- a/e2e_tests/tests/cluster/test_checkpoints.py
+++ b/e2e_tests/tests/cluster/test_checkpoints.py
@@ -47,7 +47,6 @@ def wait_for_gc_to_finish(sess: api.Session, experiment_ids: List[int]) -> None:
 
 
 @pytest.mark.e2e_gpu
-@pytest.mark.e2e_slurm_gpu
 def test_set_gc_policy() -> None:
     sess = api_utils.user_session()
     save_exp_best = 3
@@ -121,6 +120,7 @@ def test_gc_checkpoints_lfs() -> None:
 
 
 @pytest.mark.e2e_cpu
+@pytest.mark.e2e_slurm
 def test_delete_checkpoints() -> None:
     sess = api_utils.user_session()
     config = {
diff --git a/e2e_tests/tests/experiment/test_profiling.py b/e2e_tests/tests/experiment/test_profiling.py
index 68c724a359a..e7c1081cb08 100644
--- a/e2e_tests/tests/experiment/test_profiling.py
+++ b/e2e_tests/tests/experiment/test_profiling.py
@@ -12,7 +12,6 @@
 
 
 @pytest.mark.e2e_gpu
-@pytest.mark.e2e_slurm_gpu
 @pytest.mark.timeout(30 * 60)
 def test_streaming_observability_metrics_apis() -> None:
     sess = api_utils.user_session()
diff --git a/e2e_tests/tests/experiment/test_pytorch.py b/e2e_tests/tests/experiment/test_pytorch.py
deleted file mode 100644
index 9daa0fbd673..00000000000
--- a/e2e_tests/tests/experiment/test_pytorch.py
+++ /dev/null
@@ -1,64 +0,0 @@
-from typing import List
-
-import pytest
-
-from tests import api_utils
-from tests import config as conf
-from tests import experiment as exp
-
-
-@pytest.mark.parallel
-@pytest.mark.e2e_slurm_gpu
-def test_pytorch_gradient_aggregation() -> None:
-    sess = api_utils.user_session()
-    config = conf.load_config(conf.fixtures_path("pytorch_identity/distributed.yaml"))
-
-    exp_id = exp.run_basic_test_with_temp_config(
-        sess, config, conf.fixtures_path("pytorch_identity"), 1
-    )
-    trials = exp.experiment_trials(sess, exp_id)
-    assert len(trials) == 1
-    workloads = exp.workloads_with_validation(trials[0].workloads)
-    actual_weights = []
-    for wl in workloads:
-        if wl.metrics:
-            actual_weights.append(wl.metrics.avgMetrics["weight"])
-
-    # independently compute expected metrics
-    batch_size = 4
-    epoch_size = 64
-    num_epochs = 3
-    batches = [
-        (v[:], v[:])
-        for v in (
-            [x * 0.1 + 1.0 for x in range(y, y + batch_size)]
-            for y in (z % epoch_size for z in range(0, epoch_size * num_epochs, batch_size))
-        )
-    ]
-
-    lr = 0.001
-
-    def compute_expected_weight(data: List[float], label: List[float], w: float) -> float:
-        n = len(data)
-        expected_step = 2.0 * lr * sum((d * (l - d * w) for d, l in zip(data, label))) / n
-        return w + expected_step
-
-    expected_weights = []
-    weight = 0.0
-    data: List[float] = []
-    label: List[float] = []
-    for i, batch in enumerate(batches):
-        if i % 2 == 0:
-            # for even-numbered batches the optimizer step is a no-op:
-            # the weights don't change
-            data, label = batch
-        else:
-            additional_data, additional_label = batch
-            data += additional_data
-            label += additional_label
-            weight = compute_expected_weight(data, label, weight)
-        expected_weights.append(weight)
-
-    assert actual_weights == pytest.approx(
-        expected_weights
-    ), f"{actual_weights} != {expected_weights}"
diff --git a/e2e_tests/tests/fixtures/pytorch_identity/__init__.py b/e2e_tests/tests/fixtures/pytorch_identity/__init__.py
deleted file mode 100644
index e69de29bb2d..00000000000
diff --git a/e2e_tests/tests/fixtures/pytorch_identity/distributed.yaml b/e2e_tests/tests/fixtures/pytorch_identity/distributed.yaml
deleted file mode 100644
index 6d9d63dd36a..00000000000
--- a/e2e_tests/tests/fixtures/pytorch_identity/distributed.yaml
+++ /dev/null
@@ -1,24 +0,0 @@
-name: pytorch-identity
-data:
-  model_type: single_output
-entrypoint: model_def:IdentityPyTorchTrial
-hyperparameters:
-  global_batch_size: 4
-records_per_epoch: 64
-resources:
-  slots_per_trial: 2
-scheduling_unit: 1
-searcher:
-  name: single
-  metric: val_loss
-  max_length:
-    epochs: 3
-  smaller_is_better: true
-max_restarts: 0
-min_checkpoint_period:
-  batches: 1
-min_validation_period:
-  batches: 1
-optimizations:
-  aggregation_frequency: 2
-  average_aggregated_gradients: true
diff --git a/e2e_tests/tests/fixtures/pytorch_identity/model_def.py b/e2e_tests/tests/fixtures/pytorch_identity/model_def.py
deleted file mode 100644
index 570ccf5a2b5..00000000000
--- a/e2e_tests/tests/fixtures/pytorch_identity/model_def.py
+++ /dev/null
@@ -1,79 +0,0 @@
-from typing import Any, Dict, Tuple
-
-import torch.utils.data
-
-from determined import pytorch
-
-
-class MetricsCallback(pytorch.PyTorchCallback):
-    def __init__(self):
-        self.validation_metrics = []
-
-    def on_validation_end(self, metrics: Dict[str, Any]) -> None:
-        self.validation_metrics.append(metrics)
-
-
-class IdentityDataset(torch.utils.data.Dataset):
-    def __init__(self, initial_value: int = 1):
-        self.initial_value = initial_value
-
-    def __len__(self) -> int:
-        return 64
-
-    def __getitem__(self, index: int) -> Tuple:
-        v = float(self.initial_value + 0.1 * index)
-        return torch.Tensor([v]), torch.Tensor([v])
-
-
-class IdentityPyTorchTrial(pytorch.PyTorchTrial):
-    def __init__(self, context: pytorch.PyTorchTrialContext) -> None:
-        self.context = context
-
-        model = torch.nn.Linear(1, 1, False)
-        model.weight.data.fill_(0)
-        self.model = context.wrap_model(model)
-
-        self.lr = 0.001
-
-        optimizer = torch.optim.SGD(self.model.parameters(), self.lr)
-        self.opt = context.wrap_optimizer(optimizer)
-
-        self.loss_fn = torch.nn.MSELoss(reduction="mean")
-        self.metrics_callback = MetricsCallback()
-
-    def train_batch(
-        self, batch: pytorch.TorchData, epoch_idx: int, batch_idx: int
-    ) -> Dict[str, torch.Tensor]:
-        data, label = batch
-
-        loss = self.loss_fn(self.model(data), label)
-
-        self.context.backward(loss)
-
-        self.context.step_optimizer(self.opt)
-
-        return {
-            "loss": loss,
-        }
-
-    def evaluate_batch(self, batch: pytorch.TorchData) -> Dict[str, Any]:
-        data, label = batch
-
-        loss = self.loss_fn(self.model(data), label)
-
-        weight = self.model.weight.data.item()
-
-        return {"val_loss": loss, "weight": weight}
-
-    def build_training_data_loader(self) -> pytorch.DataLoader:
-        return pytorch.DataLoader(
-            IdentityDataset(), batch_size=self.context.get_per_slot_batch_size()
-        )
-
-    def build_validation_data_loader(self) -> pytorch.DataLoader:
-        return pytorch.DataLoader(
-            IdentityDataset(20), batch_size=self.context.get_per_slot_batch_size()
-        )
-
-    def build_callbacks(self) -> Dict[str, pytorch.PyTorchCallback]:
-        return {"metrics": self.metrics_callback}
diff --git a/e2e_tests/tests/nightly/test_pytorch2.py b/e2e_tests/tests/nightly/test_pytorch2.py
index c0689e6894b..086f5159693 100644
--- a/e2e_tests/tests/nightly/test_pytorch2.py
+++ b/e2e_tests/tests/nightly/test_pytorch2.py
@@ -7,14 +7,13 @@
 
 @pytest.mark.distributed
 @pytest.mark.gpu_required
-@pytest.mark.e2e_slurm_gpu
 def test_pytorch2_hf_language_modeling_distributed() -> None:
     sess = api_utils.user_session()
     test_dir = "hf_language_modeling"
 
     config = conf.load_config(conf.hf_trainer_examples_path(f"{test_dir}/distributed.yaml"))
     config = conf.set_pt2_image(config)
-    config = conf.set_slots_per_trial(config, 4)
+    config = conf.set_slots_per_trial(config, 8)
 
     # Our hardware GPUs have only 16gb memory, lower memory use with smaller batches.
     config = conf.set_entrypoint(
diff --git a/harness/tests/experiment/fixtures/pytorch_identity/distributed.yaml b/harness/tests/experiment/fixtures/pytorch_identity/distributed.yaml
deleted file mode 120000
index f995af3ce37..00000000000
--- a/harness/tests/experiment/fixtures/pytorch_identity/distributed.yaml
+++ /dev/null
@@ -1 +0,0 @@
-../../../../../e2e_tests/tests/fixtures/pytorch_identity/distributed.yaml
\ No newline at end of file
diff --git a/harness/tests/experiment/fixtures/pytorch_identity/model_def.py b/harness/tests/experiment/fixtures/pytorch_identity/model_def.py
deleted file mode 120000
index 7e14b820b38..00000000000
--- a/harness/tests/experiment/fixtures/pytorch_identity/model_def.py
+++ /dev/null
@@ -1 +0,0 @@
-../../../../../e2e_tests/tests/fixtures/pytorch_identity/model_def.py
\ No newline at end of file
diff --git a/harness/tests/experiment/fixtures/pytorch_identity/model_def.py b/harness/tests/experiment/fixtures/pytorch_identity/model_def.py
new file mode 100644
index 00000000000..570ccf5a2b5
--- /dev/null
+++ b/harness/tests/experiment/fixtures/pytorch_identity/model_def.py
@@ -0,0 +1,79 @@
+from typing import Any, Dict, Tuple
+
+import torch.utils.data
+
+from determined import pytorch
+
+
+class MetricsCallback(pytorch.PyTorchCallback):
+    def __init__(self):
+        self.validation_metrics = []
+
+    def on_validation_end(self, metrics: Dict[str, Any]) -> None:
+        self.validation_metrics.append(metrics)
+
+
+class IdentityDataset(torch.utils.data.Dataset):
+    def __init__(self, initial_value: int = 1):
+        self.initial_value = initial_value
+
+    def __len__(self) -> int:
+        return 64
+
+    def __getitem__(self, index: int) -> Tuple:
+        v = float(self.initial_value + 0.1 * index)
+        return torch.Tensor([v]), torch.Tensor([v])
+
+
+class IdentityPyTorchTrial(pytorch.PyTorchTrial):
+    def __init__(self, context: pytorch.PyTorchTrialContext) -> None:
+        self.context = context
+
+        model = torch.nn.Linear(1, 1, False)
+        model.weight.data.fill_(0)
+        self.model = context.wrap_model(model)
+
+        self.lr = 0.001
+
+        optimizer = torch.optim.SGD(self.model.parameters(), self.lr)
+        self.opt = context.wrap_optimizer(optimizer)
+
+        self.loss_fn = torch.nn.MSELoss(reduction="mean")
+        self.metrics_callback = MetricsCallback()
+
+    def train_batch(
+        self, batch: pytorch.TorchData, epoch_idx: int, batch_idx: int
+    ) -> Dict[str, torch.Tensor]:
+        data, label = batch
+
+        loss = self.loss_fn(self.model(data), label)
+
+        self.context.backward(loss)
+
+        self.context.step_optimizer(self.opt)
+
+        return {
+            "loss": loss,
+        }
+
+    def evaluate_batch(self, batch: pytorch.TorchData) -> Dict[str, Any]:
+        data, label = batch
+
+        loss = self.loss_fn(self.model(data), label)
+
+        weight = self.model.weight.data.item()
+
+        return {"val_loss": loss, "weight": weight}
+
+    def build_training_data_loader(self) -> pytorch.DataLoader:
+        return pytorch.DataLoader(
+            IdentityDataset(), batch_size=self.context.get_per_slot_batch_size()
+        )
+
+    def build_validation_data_loader(self) -> pytorch.DataLoader:
+        return pytorch.DataLoader(
+            IdentityDataset(20), batch_size=self.context.get_per_slot_batch_size()
+        )
+
+    def build_callbacks(self) -> Dict[str, pytorch.PyTorchCallback]:
+        return {"metrics": self.metrics_callback}
diff --git a/harness/tests/experiment/pytorch/test_pytorch_trial.py b/harness/tests/experiment/pytorch/test_pytorch_trial.py
index 32c5abface0..daa7f0463f8 100644
--- a/harness/tests/experiment/pytorch/test_pytorch_trial.py
+++ b/harness/tests/experiment/pytorch/test_pytorch_trial.py
@@ -1468,8 +1468,7 @@ def amp_metrics_test(trial_class, training_metrics, agg_freq=1):
 
 def run_identity(tmp_path: pathlib.Path):
     checkpoint_dir = str(tmp_path.joinpath("checkpoint"))
-    config = utils.load_config(utils.fixtures_path("pytorch_identity/distributed.yaml"))
-    hparams = config["hyperparameters"]
+    hparams = {"global_batch_size": 4}
 
     exp_config = utils.make_default_exp_config(
         hparams,
@@ -1477,7 +1476,6 @@ def run_identity(tmp_path: pathlib.Path):
         searcher_metric="validation_loss",
         checkpoint_dir=checkpoint_dir,
     )
-    exp_config.update(config)
     exp_config["searcher"]["smaller_is_better"] = True
 
     # each subprocess must import separately as trial_class cannot be pickled.
diff --git a/tools/slurm/README.md b/tools/slurm/README.md
index e88e4d7a39d..e8be4acce48 100644
--- a/tools/slurm/README.md
+++ b/tools/slurm/README.md
@@ -149,7 +149,6 @@ By default, the `test-e2e-*-gcp` jobs are not run within the `test-e2e` workflow
 **On branch `main` and `release/rc` branches, these jobs always run without needing to set the `ci-run-allgcp` label.**
 
 The following test suites currently run only on hardware. They do not run successfully with `make slurmcluster` and thus are not executed via GCP as part of the CI/CD gate:
-- `test-e2e-slurm-gpu`: Test is skipped because the compute instance that the tests run on do not have any GPUs.
 - `test-e2e-slurm-misconfigured`: This test could be made to work, but requires passing in a misconfigured `master.yaml` to the launcher on GCP, which could be tedious.
 - `test-e2e-slurm-preemption-quarantine`: Currently runs on znode as a part of the nightly test suite.
 - `test-e2e-slurm-restart`: Dependent upon znode configuration, so not worth testing on GCP.
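
For reference, the GPU-specific slurm coverage that survives this change is gated
the way the commit message describes: the e2e_slurm marker combined with a CUDA
availability skip. A minimal sketch of that pattern, assuming pytest and torch are
importable; the test name and body are hypothetical, not taken from this patch:

    import pytest
    import torch

    @pytest.mark.e2e_slurm
    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA device required")
    def test_requires_gpu_on_slurm() -> None:  # hypothetical name, for illustration
        # Collected only in the slurm suite; skipped on nodes without a CUDA device.
        assert torch.cuda.device_count() >= 1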
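
The deleted test_pytorch_gradient_aggregation verified gradient aggregation in
closed form: for the identity model y = w * x under mean-reduced MSE loss, one SGD
step over an aggregated batch of n pairs (d, l) is
w <- w + 2 * lr * sum(d * (l - d * w)) / n, which is exactly what its
compute_expected_weight helper computed, and with aggregation_frequency: 2 the
optimizer only steps on every second batch, over both batches' combined data. A
standalone sketch checking that closed form against torch autograd (the helper and
values below are illustrative, not part of the patch):

    from typing import List

    import torch

    # Closed-form SGD step for y = w * x under mean-reduced MSE loss:
    # dL/dw = (2 / n) * sum(d * (d * w - l)), so
    # w_next = w - lr * dL/dw = w + 2 * lr * sum(d * (l - d * w)) / n
    def expected_weight(data: List[float], label: List[float], w: float, lr: float = 0.001) -> float:
        n = len(data)
        return w + 2.0 * lr * sum(d * (l - d * w) for d, l in zip(data, label)) / n

    # Verify against autograd on a toy batch shaped like the fixture's first batch.
    data = [1.0, 1.1, 1.2, 1.3]
    label = data[:]  # identity task: label equals input
    w = torch.zeros(1, requires_grad=True)
    loss = torch.nn.functional.mse_loss(w * torch.tensor(data), torch.tensor(label))
    loss.backward()
    w_next = w.item() - 0.001 * w.grad.item()
    assert abs(w_next - expected_weight(data, label, 0.0)) < 1e-6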