diff --git a/.circleci/real_config.yml b/.circleci/real_config.yml index e47a75d5969..3657b0b4e1f 100644 --- a/.circleci/real_config.yml +++ b/.circleci/real_config.yml @@ -4158,56 +4158,6 @@ workflows: only: - main - - test-e2e-slurm: - name: test-e2e-slurm-misconfigured - requires: - - package-and-push-system-local-ee - context: - - dev-ci-cluster-default-user-credentials - filters: - branches: - only: - - main - mark: e2e_slurm_misconfigured - master_config: | - task_container_defaults: - slurm: - sbatch_args: - - --time=04:00:00 - environment_variables: - # Some ports are not working, disable them so distributed jobs work. - - NCCL_IB_HCA=mlx6_0:0 - checkpoint_storage: - type: shared_fs - host_path: /scratch/launcher/.launcher.$HOSTNAME/checkpoints - storage_path: determined-checkpoint - save_experiment_best: 0 - save_trial_best: 1 - save_trial_latest: 1 - db: - user: postgres - host: localhost - port: 5432 - name: determined - password: ${HPC_DB_PASSWD} - resource_manager: - type: slurm - master_host: junkmaster - master_port: 8080 - host: localhost - port: 8181 - protocol: http - slot_type: cuda - user_name: launcher - group_name: hpcd - singularity_image_root: /lustre/hdd/foundation_engineering/images - job_storage_root: /scratch/launcher/.launcher.$HOSTNAME - auth_file: /home/launcher/.launcher.$HOSTNAME.token - path: /opt/singularity/bin:/usr/local/bin:${PATH} - ld_library_path: - security: - initial_user_password: ${INITIAL_USER_PASSWORD} - # Singularity over SLURM test on GCP - test-e2e-hpc-gcp: context: @@ -5034,54 +4984,6 @@ workflows: type: approval filters: *upstream-feature-branch - - test-e2e-slurm: - name: test-e2e-slurm-misconfigured - context: - - dev-ci-cluster-default-user-credentials - filters: *upstream-feature-branch - requires: - - package-and-push-system-local-ee - - request-hpc-tests - mark: e2e_slurm_misconfigured - master_config: | - task_container_defaults: - slurm: - sbatch_args: - - --time=04:00:00 - environment_variables: - # Some ports are not working, disable them so distributed jobs work. - - NCCL_IB_HCA=mlx6_0:0 - checkpoint_storage: - type: shared_fs - host_path: /scratch/launcher/.launcher.$HOSTNAME/checkpoints - storage_path: determined-checkpoint - save_experiment_best: 0 - save_trial_best: 1 - save_trial_latest: 1 - db: - user: postgres - host: localhost - port: 5432 - name: determined - password: ${HPC_DB_PASSWD} - resource_manager: - type: slurm - master_host: junkmaster - master_port: 8080 - host: localhost - port: 8181 - protocol: http - slot_type: cuda - user_name: launcher - group_name: hpcd - singularity_image_root: /lustre/hdd/foundation_engineering/images - job_storage_root: /scratch/launcher/.launcher.$HOSTNAME - auth_file: /home/launcher/.launcher.$HOSTNAME.token - path: /opt/singularity/bin:/usr/local/bin:${PATH} - ld_library_path: - security: - initial_user_password: ${INITIAL_USER_PASSWORD} - - test-e2e-slurm: name: test-e2e-slurm-restart context: diff --git a/e2e_tests/pytest.ini b/e2e_tests/pytest.ini index d7212c4bb24..3305b6b1c66 100644 --- a/e2e_tests/pytest.ini +++ b/e2e_tests/pytest.ini @@ -23,7 +23,6 @@ markers = e2e_slurm_restart: slurm integration tests that require restarting the master e2e_slurm_preemption: hpc integration test to ensure preemption is working e2e_slurm_internet_connected_cluster: slurm integrations for clusters with internet access - e2e_slurm_misconfigured: end to end slurm integration tests to test test_oauth: end to end test for oauth client, add, remove in EE. test_model_registry_rbac: end to end test for RBAC model registry. gpu_required: tests with a hard CUDA requirement diff --git a/e2e_tests/tests/cluster/test_slurm.py b/e2e_tests/tests/cluster/test_slurm.py index 9268c8d14b3..f2ca3ff4ba7 100644 --- a/e2e_tests/tests/cluster/test_slurm.py +++ b/e2e_tests/tests/cluster/test_slurm.py @@ -7,11 +7,9 @@ from determined.common import api from determined.common.api import bindings -from determined.experimental import client from tests import api_utils, command from tests import config as conf from tests import experiment as exp -from tests.experiment import noop def run_failure_test_multiple( @@ -172,25 +170,6 @@ def test_docker_login() -> None: ) -# A devcluster needs to be run with the master host entered incorrectly -# (with an unreachable master_host name). -@pytest.mark.e2e_slurm_misconfigured -@api_utils.skipif_not_slurm() -def test_master_host() -> None: - sess = api_utils.user_session() - # Creates an experiment normally, should error if the back communication channel is broken - exp_ref = noop.create_experiment(sess) - assert exp_ref.wait(interval=0.01) == client.ExperimentState.ERROR - msg = ( - "Unable to reach the master at DET_MASTER=http://junkmaster:8080. " - "This may be due to an address " - "resolution problem, a certificate problem, a firewall problem, " - "a proxy problem, or some other networking error." - ) - trial = exp_ref.get_trials()[0] - assert exp.check_if_string_present_in_trial_logs(sess, trial.id, msg) - - @pytest.mark.skipif(not torch.cuda.is_available(), reason="no gpu available") @pytest.mark.e2e_slurm @pytest.mark.e2e_pbs diff --git a/e2e_tests/tests/conftest.py b/e2e_tests/tests/conftest.py index 170cdb5b538..a1f84a0413f 100644 --- a/e2e_tests/tests/conftest.py +++ b/e2e_tests/tests/conftest.py @@ -37,7 +37,6 @@ "e2e_slurm_restart", "e2e_slurm_preemption", "e2e_slurm_internet_connected_cluster", - "e2e_slurm_misconfigured", "det_deploy_local", "test_oauth", "test_model_registry_rbac", diff --git a/master/pkg/tasks/dispatcher_task.go b/master/pkg/tasks/dispatcher_task.go index 61c1c565505..eefd11ef82f 100644 --- a/master/pkg/tasks/dispatcher_task.go +++ b/master/pkg/tasks/dispatcher_task.go @@ -263,14 +263,10 @@ func (t *TaskSpec) ToDispatcherManifest( launchParameters.SetCustom(customParams) - // Prepend the entrypoint like: `ship-logs.sh "$@"`. - shipLogsShell := filepath.Join(RunDir, taskShipLogsShell) - shipLogsPython := filepath.Join(RunDir, taskShipLogsPython) - // Add entrypoint command as argument wrappedEntryPoint := append( - []string{determinedLocalFs + "/" + dispatcherEntrypointScriptResource, shipLogsShell, shipLogsPython}, - t.Entrypoint...) + []string{determinedLocalFs + "/" + dispatcherEntrypointScriptResource}, + t.LogShipperWrappedEntrypoint()...) launchParameters.SetArguments(wrappedEntryPoint) // We just pass through the image reference here. It may be any scheme that diff --git a/tools/slurm/README.md b/tools/slurm/README.md index e8be4acce48..eefdaca3df4 100644 --- a/tools/slurm/README.md +++ b/tools/slurm/README.md @@ -149,7 +149,6 @@ By default, the `test-e2e-*-gcp` jobs are not run within the `test-e2e` workflow **On branch `main` and `release/rc` branches, these jobs always run without needing to set the `ci-run-allgcp` label.** The following test suites currently run only on hardware. They do not run successfully with `make slurmcluster` and thus are not executed via GCP as part of the CI/CD gate: - - `test-e2e-slurm-misconfigured`: This test could be made to work, but requires passing in a misconfigured `master.yaml` to the launcher on GCP, which could be tedious. - `test-e2e-slurm-preemption-quarantine`: Currently runs on znode as a part of the nightly test suite. - `test-e2e-slurm-restart`: Dependent upon znode configuration, so not worth testing on GCP.