diff --git a/.circleci/real_config.yml b/.circleci/real_config.yml index 55564d674c17..e47a75d59696 100644 --- a/.circleci/real_config.yml +++ b/.circleci/real_config.yml @@ -4208,18 +4208,6 @@ workflows: security: initial_user_password: ${INITIAL_USER_PASSWORD} - - test-e2e-slurm: - name: test-e2e-slurm-gpu - mark: "e2e_slurm_gpu" - requires: - - package-and-push-system-local-ee - context: - - dev-ci-cluster-default-user-credentials - filters: - branches: - only: - - main - # Singularity over SLURM test on GCP - test-e2e-hpc-gcp: context: @@ -5106,16 +5094,6 @@ workflows: extra-pytest-flags: "--no-compare-stats" collect-det-job-logs: false - - test-e2e-slurm: - name: test-e2e-slurm-gpu - context: - - dev-ci-cluster-default-user-credentials - filters: *upstream-feature-branch - mark: "e2e_slurm_gpu" - requires: - - package-and-push-system-local-ee - - request-hpc-tests - - test-e2e-slurm: name: test-e2e-slurm-enroot-znode context: diff --git a/e2e_tests/pytest.ini b/e2e_tests/pytest.ini index d3cc7142b671..d7212c4bb249 100644 --- a/e2e_tests/pytest.ini +++ b/e2e_tests/pytest.ini @@ -20,7 +20,6 @@ markers = e2e_pbs: end to end pbs integration tests e2e_saml: tests for saml with okta e2e_slurm: end to end slurm integration tests - e2e_slurm_gpu: end to end slurm GPU tests e2e_slurm_restart: slurm integration tests that require restarting the master e2e_slurm_preemption: hpc integration test to ensure preemption is working e2e_slurm_internet_connected_cluster: slurm integrations for clusters with internet access diff --git a/e2e_tests/tests/cluster/test_checkpoints.py b/e2e_tests/tests/cluster/test_checkpoints.py index a375452944d9..cd766612f121 100644 --- a/e2e_tests/tests/cluster/test_checkpoints.py +++ b/e2e_tests/tests/cluster/test_checkpoints.py @@ -47,7 +47,6 @@ def wait_for_gc_to_finish(sess: api.Session, experiment_ids: List[int]) -> None: @pytest.mark.e2e_gpu -@pytest.mark.e2e_slurm_gpu def test_set_gc_policy() -> None: sess = api_utils.user_session() save_exp_best = 3 @@ -121,6 +120,7 @@ def test_gc_checkpoints_lfs() -> None: @pytest.mark.e2e_cpu +@pytest.mark.e2e_slurm def test_delete_checkpoints() -> None: sess = api_utils.user_session() config = { diff --git a/e2e_tests/tests/experiment/test_profiling.py b/e2e_tests/tests/experiment/test_profiling.py index 68c724a359a4..e7c1081cb086 100644 --- a/e2e_tests/tests/experiment/test_profiling.py +++ b/e2e_tests/tests/experiment/test_profiling.py @@ -12,7 +12,6 @@ @pytest.mark.e2e_gpu -@pytest.mark.e2e_slurm_gpu @pytest.mark.timeout(30 * 60) def test_streaming_observability_metrics_apis() -> None: sess = api_utils.user_session() diff --git a/e2e_tests/tests/experiment/test_pytorch.py b/e2e_tests/tests/experiment/test_pytorch.py index 9daa0fbd673c..a31fafcd961a 100644 --- a/e2e_tests/tests/experiment/test_pytorch.py +++ b/e2e_tests/tests/experiment/test_pytorch.py @@ -8,7 +8,6 @@ @pytest.mark.parallel -@pytest.mark.e2e_slurm_gpu def test_pytorch_gradient_aggregation() -> None: sess = api_utils.user_session() config = conf.load_config(conf.fixtures_path("pytorch_identity/distributed.yaml")) diff --git a/e2e_tests/tests/nightly/test_pytorch2.py b/e2e_tests/tests/nightly/test_pytorch2.py index c0689e6894b3..086f51596937 100644 --- a/e2e_tests/tests/nightly/test_pytorch2.py +++ b/e2e_tests/tests/nightly/test_pytorch2.py @@ -7,14 +7,13 @@ @pytest.mark.distributed @pytest.mark.gpu_required -@pytest.mark.e2e_slurm_gpu def test_pytorch2_hf_language_modeling_distributed() -> None: sess = api_utils.user_session() test_dir = "hf_language_modeling" config = conf.load_config(conf.hf_trainer_examples_path(f"{test_dir}/distributed.yaml")) config = conf.set_pt2_image(config) - config = conf.set_slots_per_trial(config, 4) + config = conf.set_slots_per_trial(config, 8) # Our hardware GPUs have only 16gb memory, lower memory use with smaller batches. config = conf.set_entrypoint(