chore: remove e2e_slurm_misconfigured series tests (#10023)
The only test in this series was an e2e test checking that the log shipper
ships a specific error log ("Unable to reach the master ...") when the
master address is misconfigured.

We have unit tests for that, so delete the e2e test.

Now, I actually failed this e2e test when I first landed the log shipper, so
it wasn't totally good-for-nothing. But these days, all it was effectively
testing was that we do in fact use the new log shipper in Slurm workloads.

Now that the EE code is merged into the OSS code and all three classes of
resource manager can use the same LogShipperWrappedEntrypoint() function, I
don't see any real justification for keeping this test.

This is part of a larger effort to get rid of our znode tests, which are
notoriously unreliable.
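
For context, here is a minimal, hedged sketch of what a shared LogShipperWrappedEntrypoint() method on TaskSpec plausibly does: prepend the log-shipper wrapper scripts to the task's own entrypoint so that every resource manager wraps tasks identically. This is an illustration only, not the actual Determined implementation; the constant names and paths (runDir, taskShipLogsShell, taskShipLogsPython, /run/determined) are stand-ins modeled on identifiers visible in the dispatcher_task.go diff below.

package tasks

import "path/filepath"

// Stand-in constants for illustration; the real definitions live elsewhere
// in the tasks package.
const (
	runDir             = "/run/determined"
	taskShipLogsShell  = "ship-logs.sh"
	taskShipLogsPython = "ship_logs.py"
)

// TaskSpec is reduced here to the single field this sketch needs.
type TaskSpec struct {
	Entrypoint []string
}

// LogShipperWrappedEntrypoint prepends the log-shipper shell wrapper and its
// Python worker to the task's entrypoint, mirroring the order used by the
// old dispatcher code: ship-logs.sh, ship_logs.py, then the original command.
func (t *TaskSpec) LogShipperWrappedEntrypoint() []string {
	wrapper := []string{
		filepath.Join(runDir, taskShipLogsShell),
		filepath.Join(runDir, taskShipLogsPython),
	}
	return append(wrapper, t.Entrypoint...)
}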
rb-determined-ai authored Oct 10, 2024
1 parent b243c26 commit 2356f91
Showing 6 changed files with 2 additions and 128 deletions.
98 changes: 0 additions & 98 deletions .circleci/real_config.yml
@@ -4158,56 +4158,6 @@ workflows:
only:
- main

- test-e2e-slurm:
name: test-e2e-slurm-misconfigured
requires:
- package-and-push-system-local-ee
context:
- dev-ci-cluster-default-user-credentials
filters:
branches:
only:
- main
mark: e2e_slurm_misconfigured
master_config: |
task_container_defaults:
slurm:
sbatch_args:
- --time=04:00:00
environment_variables:
# Some ports are not working, disable them so distributed jobs work.
- NCCL_IB_HCA=mlx6_0:0
checkpoint_storage:
type: shared_fs
host_path: /scratch/launcher/.launcher.$HOSTNAME/checkpoints
storage_path: determined-checkpoint
save_experiment_best: 0
save_trial_best: 1
save_trial_latest: 1
db:
user: postgres
host: localhost
port: 5432
name: determined
password: ${HPC_DB_PASSWD}
resource_manager:
type: slurm
master_host: junkmaster
master_port: 8080
host: localhost
port: 8181
protocol: http
slot_type: cuda
user_name: launcher
group_name: hpcd
singularity_image_root: /lustre/hdd/foundation_engineering/images
job_storage_root: /scratch/launcher/.launcher.$HOSTNAME
auth_file: /home/launcher/.launcher.$HOSTNAME.token
path: /opt/singularity/bin:/usr/local/bin:${PATH}
ld_library_path:
security:
initial_user_password: ${INITIAL_USER_PASSWORD}
# Singularity over SLURM test on GCP
- test-e2e-hpc-gcp:
context:
@@ -5034,54 +4984,6 @@ workflows:
type: approval
filters: *upstream-feature-branch

- test-e2e-slurm:
name: test-e2e-slurm-misconfigured
context:
- dev-ci-cluster-default-user-credentials
filters: *upstream-feature-branch
requires:
- package-and-push-system-local-ee
- request-hpc-tests
mark: e2e_slurm_misconfigured
master_config: |
task_container_defaults:
slurm:
sbatch_args:
- --time=04:00:00
environment_variables:
# Some ports are not working, disable them so distributed jobs work.
- NCCL_IB_HCA=mlx6_0:0
checkpoint_storage:
type: shared_fs
host_path: /scratch/launcher/.launcher.$HOSTNAME/checkpoints
storage_path: determined-checkpoint
save_experiment_best: 0
save_trial_best: 1
save_trial_latest: 1
db:
user: postgres
host: localhost
port: 5432
name: determined
password: ${HPC_DB_PASSWD}
resource_manager:
type: slurm
master_host: junkmaster
master_port: 8080
host: localhost
port: 8181
protocol: http
slot_type: cuda
user_name: launcher
group_name: hpcd
singularity_image_root: /lustre/hdd/foundation_engineering/images
job_storage_root: /scratch/launcher/.launcher.$HOSTNAME
auth_file: /home/launcher/.launcher.$HOSTNAME.token
path: /opt/singularity/bin:/usr/local/bin:${PATH}
ld_library_path:
security:
initial_user_password: ${INITIAL_USER_PASSWORD}
- test-e2e-slurm:
name: test-e2e-slurm-restart
context:
1 change: 0 additions & 1 deletion e2e_tests/pytest.ini
@@ -23,7 +23,6 @@ markers =
e2e_slurm_restart: slurm integration tests that require restarting the master
e2e_slurm_preemption: hpc integration test to ensure preemption is working
e2e_slurm_internet_connected_cluster: slurm integrations for clusters with internet access
e2e_slurm_misconfigured: end to end slurm integration tests to test
test_oauth: end to end test for oauth client, add, remove in EE.
test_model_registry_rbac: end to end test for RBAC model registry.
gpu_required: tests with a hard CUDA requirement
21 changes: 0 additions & 21 deletions e2e_tests/tests/cluster/test_slurm.py
@@ -7,11 +7,9 @@

from determined.common import api
from determined.common.api import bindings
from determined.experimental import client
from tests import api_utils, command
from tests import config as conf
from tests import experiment as exp
from tests.experiment import noop


def run_failure_test_multiple(
@@ -172,25 +170,6 @@ def test_docker_login() -> None:
)


# A devcluster needs to be run with the master host entered incorrectly
# (with an unreachable master_host name).
@pytest.mark.e2e_slurm_misconfigured
@api_utils.skipif_not_slurm()
def test_master_host() -> None:
sess = api_utils.user_session()
# Creates an experiment normally, should error if the back communication channel is broken
exp_ref = noop.create_experiment(sess)
assert exp_ref.wait(interval=0.01) == client.ExperimentState.ERROR
msg = (
"Unable to reach the master at DET_MASTER=http://junkmaster:8080. "
"This may be due to an address "
"resolution problem, a certificate problem, a firewall problem, "
"a proxy problem, or some other networking error."
)
trial = exp_ref.get_trials()[0]
assert exp.check_if_string_present_in_trial_logs(sess, trial.id, msg)


@pytest.mark.skipif(not torch.cuda.is_available(), reason="no gpu available")
@pytest.mark.e2e_slurm
@pytest.mark.e2e_pbs
1 change: 0 additions & 1 deletion e2e_tests/tests/conftest.py
@@ -37,7 +37,6 @@
"e2e_slurm_restart",
"e2e_slurm_preemption",
"e2e_slurm_internet_connected_cluster",
"e2e_slurm_misconfigured",
"det_deploy_local",
"test_oauth",
"test_model_registry_rbac",
8 changes: 2 additions & 6 deletions master/pkg/tasks/dispatcher_task.go
@@ -263,14 +263,10 @@ func (t *TaskSpec) ToDispatcherManifest(

launchParameters.SetCustom(customParams)

// Prepend the entrypoint like: `ship-logs.sh "$@"`.
shipLogsShell := filepath.Join(RunDir, taskShipLogsShell)
shipLogsPython := filepath.Join(RunDir, taskShipLogsPython)

// Add entrypoint command as argument
wrappedEntryPoint := append(
[]string{determinedLocalFs + "/" + dispatcherEntrypointScriptResource, shipLogsShell, shipLogsPython},
t.Entrypoint...)
[]string{determinedLocalFs + "/" + dispatcherEntrypointScriptResource},
t.LogShipperWrappedEntrypoint()...)
launchParameters.SetArguments(wrappedEntryPoint)

// We just pass through the image reference here. It may be any scheme that
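To make the dispatcher_task.go change above concrete, the following self-contained sketch shows the shape of the argument list that SetArguments now receives: the dispatcher entrypoint script first, then whatever LogShipperWrappedEntrypoint() returns. All paths and the entrypoint-script name are hypothetical placeholders; only the ordering reflects the diff.

package main

import "fmt"

func main() {
	// Hypothetical stand-in for determinedLocalFs + "/" + dispatcherEntrypointScriptResource.
	entrypointScript := "/determined_local_fs/dispatcher-wrapper.sh"

	// Stand-in for t.LogShipperWrappedEntrypoint(): the wrapper scripts, then
	// the task's own command.
	logShipperWrapped := []string{
		"/run/determined/ship-logs.sh",
		"/run/determined/ship_logs.py",
		"python3", "train.py",
	}

	// Same construction as the new wrappedEntryPoint in ToDispatcherManifest.
	wrappedEntryPoint := append([]string{entrypointScript}, logShipperWrapped...)
	fmt.Println(wrappedEntryPoint)
	// [/determined_local_fs/dispatcher-wrapper.sh /run/determined/ship-logs.sh /run/determined/ship_logs.py python3 train.py]
}
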
1 change: 0 additions & 1 deletion tools/slurm/README.md
@@ -149,7 +149,6 @@ By default, the `test-e2e-*-gcp` jobs are not run within the `test-e2e` workflow
**On branch `main` and `release/rc` branches, these jobs always run without needing to set the `ci-run-allgcp` label.**
The following test suites currently run only on hardware. They do not run successfully with `make slurmcluster` and thus are not executed via GCP as part of the CI/CD gate:
- `test-e2e-slurm-misconfigured`: This test could be made to work, but requires passing in a misconfigured `master.yaml` to the launcher on GCP, which could be tedious.
- `test-e2e-slurm-preemption-quarantine`: Currently runs on znode as a part of the nightly test suite.
- `test-e2e-slurm-restart`: Dependent upon znode configuration, so not worth testing on GCP.
