Commit
chore: remove e2e_slurm_gpu series tests
Note that there are nightly tests decorated with:

   - @e2e_slurm
   - skipif(not torch.cuda.is_available())

So we still have some GPU-specific slurm tests at this point.  But those
tests were not actually running as part of the e2e_slurm_gpu tests
anyway.

This is part of a larger effort to get rid of our znode tests, which
are notoriously unreliable.
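
As an illustrative sketch only (not part of this commit): a nightly GPU-on-Slurm test of the kind described above could be decorated roughly as follows. The test name, body, and skip reason are hypothetical; the `@pytest.mark.e2e_slurm` marker and the `torch.cuda.is_available()` guard are the ones named in the message.

```python
import pytest
import torch


@pytest.mark.e2e_slurm
@pytest.mark.skipif(not torch.cuda.is_available(), reason="requires a CUDA-capable GPU")
def test_gpu_feature_on_slurm() -> None:
    # Collected under the e2e_slurm marker and skipped on hosts without a GPU,
    # so GPU-specific Slurm coverage survives the removal of e2e_slurm_gpu.
    assert torch.cuda.is_available()
```

Such tests would typically be selected with pytest's marker expression, e.g. something like `pytest -m e2e_slurm e2e_tests/tests`.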
rb-determined-ai committed Oct 4, 2024
1 parent a0cc818 commit acd6929
Showing 4 changed files with 2 additions and 5 deletions.
2 changes: 1 addition & 1 deletion e2e_tests/tests/cluster/test_checkpoints.py
@@ -47,7 +47,6 @@ def wait_for_gc_to_finish(sess: api.Session, experiment_ids: List[int]) -> None:


 @pytest.mark.e2e_gpu
-@pytest.mark.e2e_slurm_gpu
 def test_set_gc_policy() -> None:
     sess = api_utils.user_session()
     save_exp_best = 3
@@ -121,6 +120,7 @@ def test_gc_checkpoints_lfs() -> None:


 @pytest.mark.e2e_cpu
+@pytest.mark.e2e_slurm
 def test_delete_checkpoints() -> None:
     sess = api_utils.user_session()
     config = {
1 change: 0 additions & 1 deletion e2e_tests/tests/experiment/test_profiling.py
@@ -12,7 +12,6 @@


 @pytest.mark.e2e_gpu
-@pytest.mark.e2e_slurm_gpu
 @pytest.mark.timeout(30 * 60)
 def test_streaming_observability_metrics_apis() -> None:
     sess = api_utils.user_session()
1 change: 0 additions & 1 deletion e2e_tests/tests/experiment/test_pytorch.py
@@ -8,7 +8,6 @@


 @pytest.mark.parallel
-@pytest.mark.e2e_slurm_gpu
 def test_pytorch_gradient_aggregation() -> None:
     sess = api_utils.user_session()
     config = conf.load_config(conf.fixtures_path("pytorch_identity/distributed.yaml"))
3 changes: 1 addition & 2 deletions e2e_tests/tests/nightly/test_pytorch2.py
@@ -7,14 +7,13 @@

 @pytest.mark.distributed
 @pytest.mark.gpu_required
-@pytest.mark.e2e_slurm_gpu
 def test_pytorch2_hf_language_modeling_distributed() -> None:
     sess = api_utils.user_session()
     test_dir = "hf_language_modeling"

     config = conf.load_config(conf.hf_trainer_examples_path(f"{test_dir}/distributed.yaml"))
     config = conf.set_pt2_image(config)
-    config = conf.set_slots_per_trial(config, 4)
+    config = conf.set_slots_per_trial(config, 8)

     # Our hardware GPUs have only 16gb memory, lower memory use with smaller batches.
     config = conf.set_entrypoint(
