From 2594d904202ed8a710bd77ef795a1e6d2b6d0975 Mon Sep 17 00:00:00 2001
From: Ryan
Date: Thu, 10 Oct 2024 09:44:58 -0600
Subject: [PATCH] chore: remove e2e_slurm_gpu series tests (#10021)

Note that there are nightly tests decorated with:
- @e2e_slurm
- skipif(not torch.cuda.is_available())

So we still have some GPU-specific slurm tests at this point. But those
tests were not actually running as part of the e2e_slurm_gpu tests anyway.

This is part of a larger effort to get rid of our znode tests, which are
notoriously unreliable.
---
 .circleci/real_config.yml                    | 22 -----
 e2e_tests/pytest.ini                         |  1 -
 e2e_tests/tests/cluster/test_checkpoints.py  |  2 +-
 e2e_tests/tests/experiment/test_profiling.py |  1 -
 e2e_tests/tests/experiment/test_pytorch.py   | 64 ---------------
 .../fixtures/pytorch_identity/__init__.py    |  0
 .../pytorch_identity/distributed.yaml        | 24 ------
 .../fixtures/pytorch_identity/model_def.py   | 79 ------------------
 e2e_tests/tests/nightly/test_pytorch2.py     |  3 +-
 .../pytorch_identity/distributed.yaml        |  1 -
 .../fixtures/pytorch_identity/model_def.py   | 80 ++++++++++++++++++-
 .../experiment/pytorch/test_pytorch_trial.py |  4 +-
 tools/slurm/README.md                        |  1 -
 13 files changed, 82 insertions(+), 200 deletions(-)
 delete mode 100644 e2e_tests/tests/experiment/test_pytorch.py
 delete mode 100644 e2e_tests/tests/fixtures/pytorch_identity/__init__.py
 delete mode 100644 e2e_tests/tests/fixtures/pytorch_identity/distributed.yaml
 delete mode 100644 e2e_tests/tests/fixtures/pytorch_identity/model_def.py
 delete mode 120000 harness/tests/experiment/fixtures/pytorch_identity/distributed.yaml
 mode change 120000 => 100644 harness/tests/experiment/fixtures/pytorch_identity/model_def.py

diff --git a/.circleci/real_config.yml b/.circleci/real_config.yml
index 55564d674c1..e47a75d5969 100644
--- a/.circleci/real_config.yml
+++ b/.circleci/real_config.yml
@@ -4208,18 +4208,6 @@ workflows:
           security:
             initial_user_password: ${INITIAL_USER_PASSWORD}
 
-      - test-e2e-slurm:
-          name: test-e2e-slurm-gpu
-          mark: "e2e_slurm_gpu"
-          requires:
-            - package-and-push-system-local-ee
-          context:
-            - dev-ci-cluster-default-user-credentials
-          filters:
-            branches:
-              only:
-                - main
-
       # Singularity over SLURM test on GCP
       - test-e2e-hpc-gcp:
           context:
@@ -5106,16 +5094,6 @@ workflows:
             extra-pytest-flags: "--no-compare-stats"
             collect-det-job-logs: false
 
-      - test-e2e-slurm:
-          name: test-e2e-slurm-gpu
-          context:
-            - dev-ci-cluster-default-user-credentials
-          filters: *upstream-feature-branch
-          mark: "e2e_slurm_gpu"
-          requires:
-            - package-and-push-system-local-ee
-            - request-hpc-tests
-
       - test-e2e-slurm:
           name: test-e2e-slurm-enroot-znode
           context:
diff --git a/e2e_tests/pytest.ini b/e2e_tests/pytest.ini
index d3cc7142b67..d7212c4bb24 100644
--- a/e2e_tests/pytest.ini
+++ b/e2e_tests/pytest.ini
@@ -20,7 +20,6 @@ markers =
     e2e_pbs: end to end pbs integration tests
     e2e_saml: tests for saml with okta
     e2e_slurm: end to end slurm integration tests
-    e2e_slurm_gpu: end to end slurm GPU tests
     e2e_slurm_restart: slurm integration tests that require restarting the master
     e2e_slurm_preemption: hpc integration test to ensure preemption is working
     e2e_slurm_internet_connected_cluster: slurm integrations for clusters with internet access
diff --git a/e2e_tests/tests/cluster/test_checkpoints.py b/e2e_tests/tests/cluster/test_checkpoints.py
index a375452944d..cd766612f12 100644
--- a/e2e_tests/tests/cluster/test_checkpoints.py
+++ b/e2e_tests/tests/cluster/test_checkpoints.py
@@ -47,7 +47,6 @@ def wait_for_gc_to_finish(sess: api.Session, experiment_ids: List[int]) -> None:
 
 
 @pytest.mark.e2e_gpu
-@pytest.mark.e2e_slurm_gpu
 def test_set_gc_policy() -> None:
     sess = api_utils.user_session()
     save_exp_best = 3
@@ -121,6 +120,7 @@ def test_gc_checkpoints_lfs() -> None:
 
 
 @pytest.mark.e2e_cpu
+@pytest.mark.e2e_slurm
 def test_delete_checkpoints() -> None:
     sess = api_utils.user_session()
     config = {
diff --git a/e2e_tests/tests/experiment/test_profiling.py b/e2e_tests/tests/experiment/test_profiling.py
index 68c724a359a..e7c1081cb08 100644
--- a/e2e_tests/tests/experiment/test_profiling.py
+++ b/e2e_tests/tests/experiment/test_profiling.py
@@ -12,7 +12,6 @@
 
 
 @pytest.mark.e2e_gpu
-@pytest.mark.e2e_slurm_gpu
 @pytest.mark.timeout(30 * 60)
 def test_streaming_observability_metrics_apis() -> None:
     sess = api_utils.user_session()
diff --git a/e2e_tests/tests/experiment/test_pytorch.py b/e2e_tests/tests/experiment/test_pytorch.py
deleted file mode 100644
index 9daa0fbd673..00000000000
--- a/e2e_tests/tests/experiment/test_pytorch.py
+++ /dev/null
@@ -1,64 +0,0 @@
-from typing import List
-
-import pytest
-
-from tests import api_utils
-from tests import config as conf
-from tests import experiment as exp
-
-
-@pytest.mark.parallel
-@pytest.mark.e2e_slurm_gpu
-def test_pytorch_gradient_aggregation() -> None:
-    sess = api_utils.user_session()
-    config = conf.load_config(conf.fixtures_path("pytorch_identity/distributed.yaml"))
-
-    exp_id = exp.run_basic_test_with_temp_config(
-        sess, config, conf.fixtures_path("pytorch_identity"), 1
-    )
-    trials = exp.experiment_trials(sess, exp_id)
-    assert len(trials) == 1
-    workloads = exp.workloads_with_validation(trials[0].workloads)
-    actual_weights = []
-    for wl in workloads:
-        if wl.metrics:
-            actual_weights.append(wl.metrics.avgMetrics["weight"])
-
-    # independently compute expected metrics
-    batch_size = 4
-    epoch_size = 64
-    num_epochs = 3
-    batches = [
-        (v[:], v[:])
-        for v in (
-            [x * 0.1 + 1.0 for x in range(y, y + batch_size)]
-            for y in (z % epoch_size for z in range(0, epoch_size * num_epochs, batch_size))
-        )
-    ]
-
-    lr = 0.001
-
-    def compute_expected_weight(data: List[float], label: List[float], w: float) -> float:
-        n = len(data)
-        expected_step = 2.0 * lr * sum((d * (l - d * w) for d, l in zip(data, label))) / n
-        return w + expected_step
-
-    expected_weights = []
-    weight = 0.0
-    data: List[float] = []
-    label: List[float] = []
-    for i, batch in enumerate(batches):
-        if i % 2 == 0:
-            # for even-numbered batches the optimizer step is a no-op:
-            # the weights don't change
-            data, label = batch
-        else:
-            additional_data, additional_label = batch
-            data += additional_data
-            label += additional_label
-            weight = compute_expected_weight(data, label, weight)
-        expected_weights.append(weight)
-
-    assert actual_weights == pytest.approx(
-        expected_weights
-    ), f"{actual_weights} != {expected_weights}"
diff --git a/e2e_tests/tests/fixtures/pytorch_identity/__init__.py b/e2e_tests/tests/fixtures/pytorch_identity/__init__.py
deleted file mode 100644
index e69de29bb2d..00000000000
diff --git a/e2e_tests/tests/fixtures/pytorch_identity/distributed.yaml b/e2e_tests/tests/fixtures/pytorch_identity/distributed.yaml
deleted file mode 100644
index 6d9d63dd36a..00000000000
--- a/e2e_tests/tests/fixtures/pytorch_identity/distributed.yaml
+++ /dev/null
@@ -1,24 +0,0 @@
-name: pytorch-identity
-data:
-  model_type: single_output
-entrypoint: model_def:IdentityPyTorchTrial
-hyperparameters:
-  global_batch_size: 4
-records_per_epoch: 64
-resources:
-  slots_per_trial: 2
-scheduling_unit: 1
-searcher:
-  name: single
-  metric: val_loss
-  max_length:
-    epochs: 3
-  smaller_is_better: true
-max_restarts: 0
-min_checkpoint_period:
-  batches: 1
-min_validation_period:
-  batches: 1
-optimizations:
-  aggregation_frequency: 2
-  average_aggregated_gradients: true
diff --git a/e2e_tests/tests/fixtures/pytorch_identity/model_def.py b/e2e_tests/tests/fixtures/pytorch_identity/model_def.py
deleted file mode 100644
index 570ccf5a2b5..00000000000
--- a/e2e_tests/tests/fixtures/pytorch_identity/model_def.py
+++ /dev/null
@@ -1,79 +0,0 @@
-from typing import Any, Dict, Tuple
-
-import torch.utils.data
-
-from determined import pytorch
-
-
-class MetricsCallback(pytorch.PyTorchCallback):
-    def __init__(self):
-        self.validation_metrics = []
-
-    def on_validation_end(self, metrics: Dict[str, Any]) -> None:
-        self.validation_metrics.append(metrics)
-
-
-class IdentityDataset(torch.utils.data.Dataset):
-    def __init__(self, initial_value: int = 1):
-        self.initial_value = initial_value
-
-    def __len__(self) -> int:
-        return 64
-
-    def __getitem__(self, index: int) -> Tuple:
-        v = float(self.initial_value + 0.1 * index)
-        return torch.Tensor([v]), torch.Tensor([v])
-
-
-class IdentityPyTorchTrial(pytorch.PyTorchTrial):
-    def __init__(self, context: pytorch.PyTorchTrialContext) -> None:
-        self.context = context
-
-        model = torch.nn.Linear(1, 1, False)
-        model.weight.data.fill_(0)
-        self.model = context.wrap_model(model)
-
-        self.lr = 0.001
-
-        optimizer = torch.optim.SGD(self.model.parameters(), self.lr)
-        self.opt = context.wrap_optimizer(optimizer)
-
-        self.loss_fn = torch.nn.MSELoss(reduction="mean")
-        self.metrics_callback = MetricsCallback()
-
-    def train_batch(
-        self, batch: pytorch.TorchData, epoch_idx: int, batch_idx: int
-    ) -> Dict[str, torch.Tensor]:
-        data, label = batch
-
-        loss = self.loss_fn(self.model(data), label)
-
-        self.context.backward(loss)
-
-        self.context.step_optimizer(self.opt)
-
-        return {
-            "loss": loss,
-        }
-
-    def evaluate_batch(self, batch: pytorch.TorchData) -> Dict[str, Any]:
-        data, label = batch
-
-        loss = self.loss_fn(self.model(data), label)
-
-        weight = self.model.weight.data.item()
-
-        return {"val_loss": loss, "weight": weight}
-
-    def build_training_data_loader(self) -> pytorch.DataLoader:
-        return pytorch.DataLoader(
-            IdentityDataset(), batch_size=self.context.get_per_slot_batch_size()
-        )
-
-    def build_validation_data_loader(self) -> pytorch.DataLoader:
-        return pytorch.DataLoader(
-            IdentityDataset(20), batch_size=self.context.get_per_slot_batch_size()
-        )
-
-    def build_callbacks(self) -> Dict[str, pytorch.PyTorchCallback]:
-        return {"metrics": self.metrics_callback}
diff --git a/e2e_tests/tests/nightly/test_pytorch2.py b/e2e_tests/tests/nightly/test_pytorch2.py
index c0689e6894b..086f5159693 100644
--- a/e2e_tests/tests/nightly/test_pytorch2.py
+++ b/e2e_tests/tests/nightly/test_pytorch2.py
@@ -7,14 +7,13 @@
 
 @pytest.mark.distributed
 @pytest.mark.gpu_required
-@pytest.mark.e2e_slurm_gpu
 def test_pytorch2_hf_language_modeling_distributed() -> None:
     sess = api_utils.user_session()
     test_dir = "hf_language_modeling"
 
     config = conf.load_config(conf.hf_trainer_examples_path(f"{test_dir}/distributed.yaml"))
     config = conf.set_pt2_image(config)
-    config = conf.set_slots_per_trial(config, 4)
+    config = conf.set_slots_per_trial(config, 8)
 
     # Our hardware GPUs have only 16gb memory, lower memory use with smaller batches.
     config = conf.set_entrypoint(
diff --git a/harness/tests/experiment/fixtures/pytorch_identity/distributed.yaml b/harness/tests/experiment/fixtures/pytorch_identity/distributed.yaml
deleted file mode 120000
index f995af3ce37..00000000000
--- a/harness/tests/experiment/fixtures/pytorch_identity/distributed.yaml
+++ /dev/null
@@ -1 +0,0 @@
-../../../../../e2e_tests/tests/fixtures/pytorch_identity/distributed.yaml
\ No newline at end of file
diff --git a/harness/tests/experiment/fixtures/pytorch_identity/model_def.py b/harness/tests/experiment/fixtures/pytorch_identity/model_def.py
deleted file mode 120000
index 7e14b820b38..00000000000
--- a/harness/tests/experiment/fixtures/pytorch_identity/model_def.py
+++ /dev/null
@@ -1 +0,0 @@
-../../../../../e2e_tests/tests/fixtures/pytorch_identity/model_def.py
\ No newline at end of file
diff --git a/harness/tests/experiment/fixtures/pytorch_identity/model_def.py b/harness/tests/experiment/fixtures/pytorch_identity/model_def.py
new file mode 100644
index 00000000000..570ccf5a2b5
--- /dev/null
+++ b/harness/tests/experiment/fixtures/pytorch_identity/model_def.py
@@ -0,0 +1,79 @@
+from typing import Any, Dict, Tuple
+
+import torch.utils.data
+
+from determined import pytorch
+
+
+class MetricsCallback(pytorch.PyTorchCallback):
+    def __init__(self):
+        self.validation_metrics = []
+
+    def on_validation_end(self, metrics: Dict[str, Any]) -> None:
+        self.validation_metrics.append(metrics)
+
+
+class IdentityDataset(torch.utils.data.Dataset):
+    def __init__(self, initial_value: int = 1):
+        self.initial_value = initial_value
+
+    def __len__(self) -> int:
+        return 64
+
+    def __getitem__(self, index: int) -> Tuple:
+        v = float(self.initial_value + 0.1 * index)
+        return torch.Tensor([v]), torch.Tensor([v])
+
+
+class IdentityPyTorchTrial(pytorch.PyTorchTrial):
+    def __init__(self, context: pytorch.PyTorchTrialContext) -> None:
+        self.context = context
+
+        model = torch.nn.Linear(1, 1, False)
+        model.weight.data.fill_(0)
+        self.model = context.wrap_model(model)
+
+        self.lr = 0.001
+
+        optimizer = torch.optim.SGD(self.model.parameters(), self.lr)
+        self.opt = context.wrap_optimizer(optimizer)
+
+        self.loss_fn = torch.nn.MSELoss(reduction="mean")
+        self.metrics_callback = MetricsCallback()
+
+    def train_batch(
+        self, batch: pytorch.TorchData, epoch_idx: int, batch_idx: int
+    ) -> Dict[str, torch.Tensor]:
+        data, label = batch
+
+        loss = self.loss_fn(self.model(data), label)
+
+        self.context.backward(loss)
+
+        self.context.step_optimizer(self.opt)
+
+        return {
+            "loss": loss,
+        }
+
+    def evaluate_batch(self, batch: pytorch.TorchData) -> Dict[str, Any]:
+        data, label = batch
+
+        loss = self.loss_fn(self.model(data), label)
+
+        weight = self.model.weight.data.item()
+
+        return {"val_loss": loss, "weight": weight}
+
+    def build_training_data_loader(self) -> pytorch.DataLoader:
+        return pytorch.DataLoader(
+            IdentityDataset(), batch_size=self.context.get_per_slot_batch_size()
+        )
+
+    def build_validation_data_loader(self) -> pytorch.DataLoader:
+        return pytorch.DataLoader(
+            IdentityDataset(20), batch_size=self.context.get_per_slot_batch_size()
+        )
+
+    def build_callbacks(self) -> Dict[str, pytorch.PyTorchCallback]:
+        return {"metrics": self.metrics_callback}
diff --git a/harness/tests/experiment/pytorch/test_pytorch_trial.py b/harness/tests/experiment/pytorch/test_pytorch_trial.py
index 32c5abface0..daa7f0463f8 100644
--- a/harness/tests/experiment/pytorch/test_pytorch_trial.py
+++ b/harness/tests/experiment/pytorch/test_pytorch_trial.py
@@ -1468,8 +1468,7 @@ def amp_metrics_test(trial_class, training_metrics, agg_freq=1):
 
 def run_identity(tmp_path: pathlib.Path):
     checkpoint_dir = str(tmp_path.joinpath("checkpoint"))
-    config = utils.load_config(utils.fixtures_path("pytorch_identity/distributed.yaml"))
-    hparams = config["hyperparameters"]
+    hparams = {"global_batch_size": 4}
 
     exp_config = utils.make_default_exp_config(
         hparams,
@@ -1477,7 +1476,6 @@ def run_identity(tmp_path: pathlib.Path):
         searcher_metric="validation_loss",
         checkpoint_dir=checkpoint_dir,
     )
-    exp_config.update(config)
     exp_config["searcher"]["smaller_is_better"] = True
 
     # each subprocess must import separately as trial_class cannot be pickled.
diff --git a/tools/slurm/README.md b/tools/slurm/README.md
index e88e4d7a39d..e8be4acce48 100644
--- a/tools/slurm/README.md
+++ b/tools/slurm/README.md
@@ -149,7 +149,6 @@ By default, the `test-e2e-*-gcp` jobs are not run within the `test-e2e` workflow
 **On branch `main` and `release/rc` branches, these jobs always run without needing to set the `ci-run-allgcp` label.**
 
 The following test suites currently run only on hardware. They do not run successfully with `make slurmcluster` and thus are not executed via GCP as part of the CI/CD gate:
-- `test-e2e-slurm-gpu`: Test is skipped because the compute instance that the tests run on do not have any GPUs.
 - `test-e2e-slurm-misconfigured`: This test could be made to work, but requires passing in a misconfigured `master.yaml` to the launcher on GCP, which could be tedious.
 - `test-e2e-slurm-preemption-quarantine`: Currently runs on znode as a part of the nightly test suite.
 - `test-e2e-slurm-restart`: Dependent upon znode configuration, so not worth testing on GCP.
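
For reference, the GPU-specific slurm coverage that survives this change is gated
the way the commit message describes: the e2e_slurm marker combined with a CUDA
availability skip. A minimal sketch of that pattern, assuming pytest and torch are
importable; the test name and body are hypothetical, not taken from this patch:

    import pytest
    import torch

    @pytest.mark.e2e_slurm
    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA device required")
    def test_requires_gpu_on_slurm() -> None:  # hypothetical name, for illustration
        # Collected only in the slurm suite; skipped on nodes without a CUDA device.
        assert torch.cuda.device_count() >= 1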
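
The deleted test_pytorch_gradient_aggregation verified gradient aggregation in
closed form: for the identity model y = w * x under mean-reduced MSE loss, one SGD
step over an aggregated batch of n pairs (d, l) is
w <- w + 2 * lr * sum(d * (l - d * w)) / n, which is exactly what its
compute_expected_weight helper computed, and with aggregation_frequency: 2 the
optimizer only steps on every second batch, over both batches' combined data. A
standalone sketch checking that closed form against torch autograd (the helper and
values below are illustrative, not part of the patch):

    from typing import List

    import torch

    # Closed-form SGD step for y = w * x under mean-reduced MSE loss:
    # dL/dw = (2 / n) * sum(d * (d * w - l)), so
    # w_next = w - lr * dL/dw = w + 2 * lr * sum(d * (l - d * w)) / n
    def expected_weight(data: List[float], label: List[float], w: float, lr: float = 0.001) -> float:
        n = len(data)
        return w + 2.0 * lr * sum(d * (l - d * w) for d, l in zip(data, label)) / n

    # Verify against autograd on a toy batch shaped like the fixture's first batch.
    data = [1.0, 1.1, 1.2, 1.3]
    label = data[:]  # identity task: label equals input
    w = torch.zeros(1, requires_grad=True)
    loss = torch.nn.functional.mse_loss(w * torch.tensor(data), torch.tensor(label))
    loss.backward()
    w_next = w.item() - 0.001 * w.grad.item()
    assert abs(w_next - expected_weight(data, label, 0.0)) < 1e-6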