diff --git a/.circleci/real_config.yml b/.circleci/real_config.yml
index 55564d674c17..e47a75d59696 100644
--- a/.circleci/real_config.yml
+++ b/.circleci/real_config.yml
@@ -4208,18 +4208,6 @@ workflows:
           security:
             initial_user_password: ${INITIAL_USER_PASSWORD}
 
-      - test-e2e-slurm:
-          name: test-e2e-slurm-gpu
-          mark: "e2e_slurm_gpu"
-          requires:
-            - package-and-push-system-local-ee
-          context:
-            - dev-ci-cluster-default-user-credentials
-          filters:
-            branches:
-              only:
-                - main
-
       # Singularity over SLURM test on GCP
       - test-e2e-hpc-gcp:
           context:
@@ -5106,16 +5094,6 @@ workflows:
             extra-pytest-flags: "--no-compare-stats"
             collect-det-job-logs: false
 
-      - test-e2e-slurm:
-          name: test-e2e-slurm-gpu
-          context:
-            - dev-ci-cluster-default-user-credentials
-          filters: *upstream-feature-branch
-          mark: "e2e_slurm_gpu"
-          requires:
-            - package-and-push-system-local-ee
-            - request-hpc-tests
-
       - test-e2e-slurm:
           name: test-e2e-slurm-enroot-znode
           context:
diff --git a/e2e_tests/pytest.ini b/e2e_tests/pytest.ini
index d3cc7142b671..d7212c4bb249 100644
--- a/e2e_tests/pytest.ini
+++ b/e2e_tests/pytest.ini
@@ -20,7 +20,6 @@ markers =
     e2e_pbs: end to end pbs integration tests
     e2e_saml: tests for saml with okta
     e2e_slurm: end to end slurm integration tests
-    e2e_slurm_gpu: end to end slurm GPU tests
     e2e_slurm_restart: slurm integration tests that require restarting the master
     e2e_slurm_preemption: hpc integration test to ensure preemption is working
     e2e_slurm_internet_connected_cluster: slurm integrations for clusters with internet access
diff --git a/e2e_tests/tests/cluster/test_checkpoints.py b/e2e_tests/tests/cluster/test_checkpoints.py
index a375452944d9..cd766612f121 100644
--- a/e2e_tests/tests/cluster/test_checkpoints.py
+++ b/e2e_tests/tests/cluster/test_checkpoints.py
@@ -47,7 +47,6 @@ def wait_for_gc_to_finish(sess: api.Session, experiment_ids: List[int]) -> None:
 
 
 @pytest.mark.e2e_gpu
-@pytest.mark.e2e_slurm_gpu
 def test_set_gc_policy() -> None:
     sess = api_utils.user_session()
     save_exp_best = 3
@@ -121,6 +120,7 @@ def test_gc_checkpoints_lfs() -> None:
 
 
 @pytest.mark.e2e_cpu
+@pytest.mark.e2e_slurm
 def test_delete_checkpoints() -> None:
     sess = api_utils.user_session()
     config = {
diff --git a/e2e_tests/tests/experiment/test_profiling.py b/e2e_tests/tests/experiment/test_profiling.py
index 68c724a359a4..e7c1081cb086 100644
--- a/e2e_tests/tests/experiment/test_profiling.py
+++ b/e2e_tests/tests/experiment/test_profiling.py
@@ -12,7 +12,6 @@
 
 
 @pytest.mark.e2e_gpu
-@pytest.mark.e2e_slurm_gpu
 @pytest.mark.timeout(30 * 60)
 def test_streaming_observability_metrics_apis() -> None:
     sess = api_utils.user_session()
diff --git a/e2e_tests/tests/experiment/test_pytorch.py b/e2e_tests/tests/experiment/test_pytorch.py
deleted file mode 100644
index 9daa0fbd673c..000000000000
--- a/e2e_tests/tests/experiment/test_pytorch.py
+++ /dev/null
@@ -1,64 +0,0 @@
-from typing import List
-
-import pytest
-
-from tests import api_utils
-from tests import config as conf
-from tests import experiment as exp
-
-
-@pytest.mark.parallel
-@pytest.mark.e2e_slurm_gpu
-def test_pytorch_gradient_aggregation() -> None:
-    sess = api_utils.user_session()
-    config = conf.load_config(conf.fixtures_path("pytorch_identity/distributed.yaml"))
-
-    exp_id = exp.run_basic_test_with_temp_config(
-        sess, config, conf.fixtures_path("pytorch_identity"), 1
-    )
-    trials = exp.experiment_trials(sess, exp_id)
-    assert len(trials) == 1
-    workloads = exp.workloads_with_validation(trials[0].workloads)
-    actual_weights = []
-    for wl in workloads:
-        if wl.metrics:
-            actual_weights.append(wl.metrics.avgMetrics["weight"])
-
-    # independently compute expected metrics
-    batch_size = 4
-    epoch_size = 64
-    num_epochs = 3
-    batches = [
-        (v[:], v[:])
-        for v in (
-            [x * 0.1 + 1.0 for x in range(y, y + batch_size)]
-            for y in (z % epoch_size for z in range(0, epoch_size * num_epochs, batch_size))
-        )
-    ]
-
-    lr = 0.001
-
-    def compute_expected_weight(data: List[float], label: List[float], w: float) -> float:
-        n = len(data)
-        expected_step = 2.0 * lr * sum((d * (l - d * w) for d, l in zip(data, label))) / n
-        return w + expected_step
-
-    expected_weights = []
-    weight = 0.0
-    data: List[float] = []
-    label: List[float] = []
-    for i, batch in enumerate(batches):
-        if i % 2 == 0:
-            # for even-numbered batches the optimizer step is a no-op:
-            # the weights don't change
-            data, label = batch
-        else:
-            additional_data, additional_label = batch
-            data += additional_data
-            label += additional_label
-            weight = compute_expected_weight(data, label, weight)
-        expected_weights.append(weight)
-
-    assert actual_weights == pytest.approx(
-        expected_weights
-    ), f"{actual_weights} != {expected_weights}"
diff --git a/e2e_tests/tests/fixtures/pytorch_identity/__init__.py b/e2e_tests/tests/fixtures/pytorch_identity/__init__.py
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/e2e_tests/tests/fixtures/pytorch_identity/distributed.yaml b/e2e_tests/tests/fixtures/pytorch_identity/distributed.yaml
deleted file mode 100644
index 6d9d63dd36ab..000000000000
--- a/e2e_tests/tests/fixtures/pytorch_identity/distributed.yaml
+++ /dev/null
@@ -1,24 +0,0 @@
-name: pytorch-identity
-data:
-  model_type: single_output
-entrypoint: model_def:IdentityPyTorchTrial
-hyperparameters:
-  global_batch_size: 4
-records_per_epoch: 64
-resources:
-  slots_per_trial: 2
-scheduling_unit: 1
-searcher:
-  name: single
-  metric: val_loss
-  max_length:
-    epochs: 3
-  smaller_is_better: true
-max_restarts: 0
-min_checkpoint_period:
-  batches: 1
-min_validation_period:
-  batches: 1
-optimizations:
-  aggregation_frequency: 2
-  average_aggregated_gradients: true
diff --git a/e2e_tests/tests/fixtures/pytorch_identity/model_def.py b/e2e_tests/tests/fixtures/pytorch_identity/model_def.py
deleted file mode 100644
index 570ccf5a2b57..000000000000
--- a/e2e_tests/tests/fixtures/pytorch_identity/model_def.py
+++ /dev/null
@@ -1,79 +0,0 @@
-from typing import Any, Dict, Tuple
-
-import torch.utils.data
-
-from determined import pytorch
-
-
-class MetricsCallback(pytorch.PyTorchCallback):
-    def __init__(self):
-        self.validation_metrics = []
-
-    def on_validation_end(self, metrics: Dict[str, Any]) -> None:
-        self.validation_metrics.append(metrics)
-
-
-class IdentityDataset(torch.utils.data.Dataset):
-    def __init__(self, initial_value: int = 1):
-        self.initial_value = initial_value
-
-    def __len__(self) -> int:
-        return 64
-
-    def __getitem__(self, index: int) -> Tuple:
-        v = float(self.initial_value + 0.1 * index)
-        return torch.Tensor([v]), torch.Tensor([v])
-
-
-class IdentityPyTorchTrial(pytorch.PyTorchTrial):
-    def __init__(self, context: pytorch.PyTorchTrialContext) -> None:
-        self.context = context
-
-        model = torch.nn.Linear(1, 1, False)
-        model.weight.data.fill_(0)
-        self.model = context.wrap_model(model)
-
-        self.lr = 0.001
-
-        optimizer = torch.optim.SGD(self.model.parameters(), self.lr)
-        self.opt = context.wrap_optimizer(optimizer)
-
-        self.loss_fn = torch.nn.MSELoss(reduction="mean")
-        self.metrics_callback = MetricsCallback()
-
-    def train_batch(
-        self, batch: pytorch.TorchData, epoch_idx: int, batch_idx: int
-    ) -> Dict[str, torch.Tensor]:
-        data, label = batch
-
-        loss = self.loss_fn(self.model(data), label)
-
-        self.context.backward(loss)
-
-        self.context.step_optimizer(self.opt)
-
-        return {
-            "loss": loss,
-        }
-
-    def evaluate_batch(self, batch: pytorch.TorchData) -> Dict[str, Any]:
-        data, label = batch
-
-        loss = self.loss_fn(self.model(data), label)
-
-        weight = self.model.weight.data.item()
-
-        return {"val_loss": loss, "weight": weight}
-
-    def build_training_data_loader(self) -> pytorch.DataLoader:
-        return pytorch.DataLoader(
-            IdentityDataset(), batch_size=self.context.get_per_slot_batch_size()
-        )
-
-    def build_validation_data_loader(self) -> pytorch.DataLoader:
-        return pytorch.DataLoader(
-            IdentityDataset(20), batch_size=self.context.get_per_slot_batch_size()
-        )
-
-    def build_callbacks(self) -> Dict[str, pytorch.PyTorchCallback]:
-        return {"metrics": self.metrics_callback}
diff --git a/e2e_tests/tests/nightly/test_pytorch2.py b/e2e_tests/tests/nightly/test_pytorch2.py
index c0689e6894b3..086f51596937 100644
--- a/e2e_tests/tests/nightly/test_pytorch2.py
+++ b/e2e_tests/tests/nightly/test_pytorch2.py
@@ -7,14 +7,13 @@
 
 @pytest.mark.distributed
 @pytest.mark.gpu_required
-@pytest.mark.e2e_slurm_gpu
 def test_pytorch2_hf_language_modeling_distributed() -> None:
     sess = api_utils.user_session()
     test_dir = "hf_language_modeling"
 
     config = conf.load_config(conf.hf_trainer_examples_path(f"{test_dir}/distributed.yaml"))
     config = conf.set_pt2_image(config)
-    config = conf.set_slots_per_trial(config, 4)
+    config = conf.set_slots_per_trial(config, 8)
 
     # Our hardware GPUs have only 16gb memory, lower memory use with smaller batches.
     config = conf.set_entrypoint(
diff --git a/tools/slurm/README.md b/tools/slurm/README.md
index e88e4d7a39d6..e8be4acce482 100644
--- a/tools/slurm/README.md
+++ b/tools/slurm/README.md
@@ -149,7 +149,6 @@ By default, the `test-e2e-*-gcp` jobs are not run within the `test-e2e` workflow
 **On branch `main` and `release/rc` branches, these jobs always run without needing to set the `ci-run-allgcp` label.**
 
 The following test suites currently run only on hardware. They do not run successfully with `make slurmcluster` and thus are not executed via GCP as part of the CI/CD gate:
-  - `test-e2e-slurm-gpu`: Test is skipped because the compute instance that the tests run on do not have any GPUs.
   - `test-e2e-slurm-misconfigured`: This test could be made to work, but requires passing in a misconfigured `master.yaml` to the launcher on GCP, which could be tedious.
   - `test-e2e-slurm-preemption-quarantine`: Currently runs on znode as a part of the nightly test suite.
   - `test-e2e-slurm-restart`: Dependent upon znode configuration, so not worth testing on GCP.