Commit
chore: remove e2e_slurm_gpu series tests
Note that there are nightly tests decorated with:

   - @e2e_slurm
   - skipif(not torch.cuda.is_available())

So we still have some GPU-specific slurm tests at this point.  But those
tests were not actually running as part of the e2e_slurm_gpu tests
anyway.

This is part of a larger effort to get rid of our znode tests, which
are notoriously unreliable.
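
As an illustrative sketch only (not part of this commit): a nightly GPU-on-Slurm test of the kind described above could be decorated roughly as follows. The test name, body, and skip reason are hypothetical; the `@pytest.mark.e2e_slurm` marker and the `torch.cuda.is_available()` guard are the ones named in the message.

```python
import pytest
import torch


@pytest.mark.e2e_slurm
@pytest.mark.skipif(not torch.cuda.is_available(), reason="requires a CUDA-capable GPU")
def test_gpu_feature_on_slurm() -> None:
    # Collected under the e2e_slurm marker and skipped on hosts without a GPU,
    # so GPU-specific Slurm coverage survives the removal of e2e_slurm_gpu.
    assert torch.cuda.is_available()
```

Such tests would typically be selected with pytest's marker expression, e.g. something like `pytest -m e2e_slurm e2e_tests/tests`.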
rb-determined-ai committed Oct 4, 2024
1 parent a0cc818 commit acd6929
Showing 4 changed files with 2 additions and 5 deletions.
2 changes: 1 addition & 1 deletion e2e_tests/tests/cluster/test_checkpoints.py
@@ -47,7 +47,6 @@ def wait_for_gc_to_finish(sess: api.Session, experiment_ids: List[int]) -> None:


 @pytest.mark.e2e_gpu
-@pytest.mark.e2e_slurm_gpu
 def test_set_gc_policy() -> None:
     sess = api_utils.user_session()
     save_exp_best = 3
@@ -121,6 +120,7 @@ def test_gc_checkpoints_lfs() -> None:


 @pytest.mark.e2e_cpu
+@pytest.mark.e2e_slurm
 def test_delete_checkpoints() -> None:
     sess = api_utils.user_session()
     config = {
1 change: 0 additions & 1 deletion e2e_tests/tests/experiment/test_profiling.py
@@ -12,7 +12,6 @@


 @pytest.mark.e2e_gpu
-@pytest.mark.e2e_slurm_gpu
 @pytest.mark.timeout(30 * 60)
 def test_streaming_observability_metrics_apis() -> None:
     sess = api_utils.user_session()
1 change: 0 additions & 1 deletion e2e_tests/tests/experiment/test_pytorch.py
@@ -8,7 +8,6 @@


 @pytest.mark.parallel
-@pytest.mark.e2e_slurm_gpu
 def test_pytorch_gradient_aggregation() -> None:
     sess = api_utils.user_session()
     config = conf.load_config(conf.fixtures_path("pytorch_identity/distributed.yaml"))
3 changes: 1 addition & 2 deletions e2e_tests/tests/nightly/test_pytorch2.py
@@ -7,14 +7,13 @@

 @pytest.mark.distributed
 @pytest.mark.gpu_required
-@pytest.mark.e2e_slurm_gpu
 def test_pytorch2_hf_language_modeling_distributed() -> None:
     sess = api_utils.user_session()
     test_dir = "hf_language_modeling"

     config = conf.load_config(conf.hf_trainer_examples_path(f"{test_dir}/distributed.yaml"))
     config = conf.set_pt2_image(config)
-    config = conf.set_slots_per_trial(config, 4)
+    config = conf.set_slots_per_trial(config, 8)

     # Our hardware GPUs have only 16gb memory, lower memory use with smaller batches.
     config = conf.set_entrypoint(
