Skip to content

Commit

Permalink
chore: remove e2e_slurm_gpu series tests
Browse files Browse the repository at this point in the history
Note that there are nightly tests decorated with:

   - @e2e_slurm
   - skipif(not torch.cuda.is_available())

So we still have some GPU-specific Slurm tests at this point. However, those
tests were not actually running as part of the e2e_slurm_gpu CI job
anyway.

This is part of a larger effort to get rid of our znode tests, which
are notoriously unreliable.
  • Loading branch information
rb-determined-ai committed Oct 4, 2024
1 parent a0cc818 commit ecf0aac
Show file tree
Hide file tree
Showing 6 changed files with 2 additions and 28 deletions.
22 changes: 0 additions & 22 deletions .circleci/real_config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4208,18 +4208,6 @@ workflows:
security:
initial_user_password: ${INITIAL_USER_PASSWORD}
- test-e2e-slurm:
name: test-e2e-slurm-gpu
mark: "e2e_slurm_gpu"
requires:
- package-and-push-system-local-ee
context:
- dev-ci-cluster-default-user-credentials
filters:
branches:
only:
- main

# Singularity over SLURM test on GCP
- test-e2e-hpc-gcp:
context:
Expand Down Expand Up @@ -5106,16 +5094,6 @@ workflows:
extra-pytest-flags: "--no-compare-stats"
collect-det-job-logs: false

- test-e2e-slurm:
name: test-e2e-slurm-gpu
context:
- dev-ci-cluster-default-user-credentials
filters: *upstream-feature-branch
mark: "e2e_slurm_gpu"
requires:
- package-and-push-system-local-ee
- request-hpc-tests

- test-e2e-slurm:
name: test-e2e-slurm-enroot-znode
context:
Expand Down
1 change: 0 additions & 1 deletion e2e_tests/pytest.ini
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ markers =
e2e_pbs: end to end pbs integration tests
e2e_saml: tests for saml with okta
e2e_slurm: end to end slurm integration tests
e2e_slurm_gpu: end to end slurm GPU tests
e2e_slurm_restart: slurm integration tests that require restarting the master
e2e_slurm_preemption: hpc integration test to ensure preemption is working
e2e_slurm_internet_connected_cluster: slurm integrations for clusters with internet access
Expand Down
2 changes: 1 addition & 1 deletion e2e_tests/tests/cluster/test_checkpoints.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,6 @@ def wait_for_gc_to_finish(sess: api.Session, experiment_ids: List[int]) -> None:


@pytest.mark.e2e_gpu
@pytest.mark.e2e_slurm_gpu
def test_set_gc_policy() -> None:
sess = api_utils.user_session()
save_exp_best = 3
Expand Down Expand Up @@ -121,6 +120,7 @@ def test_gc_checkpoints_lfs() -> None:


@pytest.mark.e2e_cpu
@pytest.mark.e2e_slurm
def test_delete_checkpoints() -> None:
sess = api_utils.user_session()
config = {
Expand Down
1 change: 0 additions & 1 deletion e2e_tests/tests/experiment/test_profiling.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@


@pytest.mark.e2e_gpu
@pytest.mark.e2e_slurm_gpu
@pytest.mark.timeout(30 * 60)
def test_streaming_observability_metrics_apis() -> None:
sess = api_utils.user_session()
Expand Down
1 change: 0 additions & 1 deletion e2e_tests/tests/experiment/test_pytorch.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@


@pytest.mark.parallel
@pytest.mark.e2e_slurm_gpu
def test_pytorch_gradient_aggregation() -> None:
sess = api_utils.user_session()
config = conf.load_config(conf.fixtures_path("pytorch_identity/distributed.yaml"))
Expand Down
3 changes: 1 addition & 2 deletions e2e_tests/tests/nightly/test_pytorch2.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,13 @@

@pytest.mark.distributed
@pytest.mark.gpu_required
@pytest.mark.e2e_slurm_gpu
def test_pytorch2_hf_language_modeling_distributed() -> None:
sess = api_utils.user_session()
test_dir = "hf_language_modeling"

config = conf.load_config(conf.hf_trainer_examples_path(f"{test_dir}/distributed.yaml"))
config = conf.set_pt2_image(config)
config = conf.set_slots_per_trial(config, 4)
config = conf.set_slots_per_trial(config, 8)

# Our hardware GPUs have only 16gb memory, lower memory use with smaller batches.
config = conf.set_entrypoint(
Expand Down

0 comments on commit ecf0aac

Please sign in to comment.