diff --git a/.circleci/real_config.yml b/.circleci/real_config.yml
index 25d5d0f4226..3ba282ebaf7 100644
--- a/.circleci/real_config.yml
+++ b/.circleci/real_config.yml
@@ -5351,14 +5351,6 @@ workflows:
             - package-and-push-system-local-ee
           extra-pytest-flags: "--no-compare-stats"
           collect-det-job-logs: false
-      - test-e2e-slurm:
-          name: test-e2e-slurm-preemption
-          context:
-            - dev-ci-cluster-default-user-credentials
-          mark: "e2e_slurm_preemption"
-          requires:
-            - package-and-push-system-local-ee
-          extra-pytest-flags: "--no-compare-stats"
       - test-e2e-slurm:
           name: test-e2e-slurm-znode
           context:
diff --git a/e2e_tests/pytest.ini b/e2e_tests/pytest.ini
index 3305b6b1c66..7ec4f8e50a9 100644
--- a/e2e_tests/pytest.ini
+++ b/e2e_tests/pytest.ini
@@ -21,7 +21,6 @@ markers =
     e2e_saml: tests for saml with okta
     e2e_slurm: end to end slurm integration tests
     e2e_slurm_restart: slurm integration tests that require restarting the master
-    e2e_slurm_preemption: hpc integration test to ensure preemption is working
     e2e_slurm_internet_connected_cluster: slurm integrations for clusters with internet access
     test_oauth: end to end test for oauth client, add, remove in EE.
     test_model_registry_rbac: end to end test for RBAC model registry.
diff --git a/e2e_tests/tests/cluster/test_slurm.py b/e2e_tests/tests/cluster/test_slurm.py
index f2ca3ff4ba7..54b26f0de2f 100644
--- a/e2e_tests/tests/cluster/test_slurm.py
+++ b/e2e_tests/tests/cluster/test_slurm.py
@@ -184,62 +184,6 @@ def test_mnist_pytorch_distributed() -> None:
     exp.run_basic_test_with_temp_config(sess, config, conf.fixtures_path("mnist_pytorch"), 1)
 
 
-# Test to ensure that determined is able to handle preemption gracefully when using dispatcher RM.
-# Preemption:
-# When users launch a set of experiments requesting different levels of priorities
-# and resources, and when there are a limited set of resources, high priority experiments can
-# cancel or requeue low priority experiments.
-# Preemption is dependent upon the underlying HPC system and the WLM (SLURM/PBS) setup.
-# Nodes in an HPC systems are typically divided into multiple partitions (logical grouping of
-# nodes into possibly overlapping sets) used for different purposes. Using WLMs sysadmins
-# typically assign varying levels of priority for each partition. Also, users can request the
-# WLM to provide specific partition and priority level for their jobs.
-# In the following test case we test an example preemption scenario. We launch the two experiments
-# iris_tf_keras_cancellable and iris_tf_keras_high_priority in order. Ensure that the
-# iris_tf_keras_cancellable experiment is requeued, iris_tf_keras_high_priority experiment
-# runs to completion. After that, iris_tf_keras_cancellable experiment is resumed and it runs
-# to completion.
-# NB: The clusters casablanca-login and znode have one node (8-GPUs) being used in two partitions:
-# 1. defq_GPU_cancellable - partition for low priority and jobs are requeued if necessary
-# 2. defq_GPU_hipri - partition for high priority non-cancellable jobs
-@pytest.mark.e2e_slurm_preemption
-@api_utils.skipif_not_slurm()
-def test_slurm_preemption() -> None:
-    sess = api_utils.user_session()
-    # Launch the iris_tf_keras_cancellable experiment requesting 8 GPUs on defq_GPU_cancellable
-    # partition
-    cancelable_exp_id = exp.create_experiment(
-        sess,
-        conf.cv_examples_path("iris_tf_keras/iris_tf_keras_cancelable.yaml"),
-        conf.cv_examples_path("iris_tf_keras"),
-        None,
-    )
-    # Wait for the first cancellable experiment to enter RUNNING state.
-    exp.wait_for_experiment_state(sess, cancelable_exp_id, bindings.experimentv1State.RUNNING)
-    # Wait for the first cancellable experiment to complete at least one checkpoint.
-    exp.wait_for_at_least_one_checkpoint(sess, cancelable_exp_id, 300)
-    # Launch the iris_tf_keras_high_priority experiment requesting 8 GPUs on defq_GPU_hipri
-    # partition
-    high_priority_exp_id = exp.create_experiment(
-        sess,
-        conf.cv_examples_path("iris_tf_keras/iris_tf_keras_high_priority.yaml"),
-        conf.cv_examples_path("iris_tf_keras"),
-        None,
-    )
-    # In this scenario, iris_tf_keras_high_priority experiment will cause the
-    # iris_tf_keras_cancelable experiment to get requeued. The experiment
-    # iris_tf_keras_high_priority will execute to completion.
-    exp.wait_for_experiment_state(sess, cancelable_exp_id, bindings.experimentv1State.QUEUED)
-    exp.wait_for_experiment_state(sess, high_priority_exp_id, bindings.experimentv1State.RUNNING)
-    exp.wait_for_experiment_state(sess, high_priority_exp_id, bindings.experimentv1State.COMPLETED)
-    # Now, the experiment iris_tf_keras_cancelable will resume as soon as the requested
-    # resources are available.
-    exp.wait_for_experiment_state(sess, cancelable_exp_id, bindings.experimentv1State.RUNNING)
-    # Finally, the experiment iris_tf_keras_cancelable will complete if there are no other
-    # interruptions.
-    exp.wait_for_experiment_state(sess, cancelable_exp_id, bindings.experimentv1State.COMPLETED)
-
-
 @pytest.mark.e2e_slurm
 @pytest.mark.e2e_pbs
 @api_utils.skipif_not_hpc()
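The two-partition arrangement described in the removed test's comments lives in the cluster's WLM configuration, not in Determined. For orientation, a minimal slurm.conf sketch of that kind of setup, assuming partition-priority preemption with requeue (the parameters are real Slurm options, but the node name is hypothetical and not taken from the actual CI clusters):

    # Preempt between partitions based on their PriorityTier, requeuing by default.
    PreemptType=preempt/partition_prio
    PreemptMode=REQUEUE
    # Low-priority partition: running jobs may be requeued when the high-priority
    # partition needs the same node's GPUs.
    PartitionName=defq_GPU_cancelable Nodes=gpu-node01 PriorityTier=1 PreemptMode=requeue
    # High-priority partition on the same node: its jobs are not preempted.
    PartitionName=defq_GPU_hipri Nodes=gpu-node01 PriorityTier=2 PreemptMode=off
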
diff --git a/e2e_tests/tests/conftest.py b/e2e_tests/tests/conftest.py
index a1f84a0413f..92a99c651d3 100644
--- a/e2e_tests/tests/conftest.py
+++ b/e2e_tests/tests/conftest.py
@@ -35,7 +35,6 @@
     "e2e_saml",
     "e2e_slurm",
     "e2e_slurm_restart",
-    "e2e_slurm_preemption",
     "e2e_slurm_internet_connected_cluster",
     "det_deploy_local",
     "test_oauth",
diff --git a/examples/computer_vision/iris_tf_keras/iris_tf_keras_cancelable.yaml b/examples/computer_vision/iris_tf_keras/iris_tf_keras_cancelable.yaml
deleted file mode 100644
index 579ada49c17..00000000000
--- a/examples/computer_vision/iris_tf_keras/iris_tf_keras_cancelable.yaml
+++ /dev/null
@@ -1,28 +0,0 @@
-name: iris_tf_keras_cancelable
-data:
-  train_url: http://download.tensorflow.org/data/iris_training.csv
-  test_url: http://download.tensorflow.org/data/iris_test.csv
-environment:
-  image:
-    cpu: determinedai/tensorflow-ngc-dev:0736b6d
-    gpu: determinedai/tensorflow-ngc-dev:0736b6d
-resources:
-  slots_per_trial: 8
-  resource_pool: defq_GPU_cancelable
-hyperparameters:
-  learning_rate: 1.0e-4
-  learning_rate_decay: 1.0e-6
-  layer1_dense_size: 16
-  global_batch_size: 32
-searcher:
-  name: single
-  metric: val_categorical_accuracy
-  smaller_is_better: false
-  max_length:
-    batches: 500
-entrypoint: python3 -m determined.launch.horovod --autohorovod --trial model_def:IrisTrial
-min_validation_period:
-  batches: 50
-min_checkpoint_period:
-  batches: 50
-max_restarts: 0
diff --git a/examples/computer_vision/iris_tf_keras/iris_tf_keras_high_priority.yaml b/examples/computer_vision/iris_tf_keras/iris_tf_keras_high_priority.yaml
deleted file mode 100644
index 9e1ce5855b8..00000000000
--- a/examples/computer_vision/iris_tf_keras/iris_tf_keras_high_priority.yaml
+++ /dev/null
@@ -1,28 +0,0 @@
-name: iris_tf_keras_high_priority
-data:
-  train_url: http://download.tensorflow.org/data/iris_training.csv
-  test_url: http://download.tensorflow.org/data/iris_test.csv
-environment:
-  image:
-    cpu: determinedai/tensorflow-ngc-dev:0736b6d
-    gpu: determinedai/tensorflow-ngc-dev:0736b6d
-resources:
-  slots_per_trial: 8
-  resource_pool: defq_GPU_hipri
-hyperparameters:
-  learning_rate: 1.0e-4
-  learning_rate_decay: 1.0e-6
-  layer1_dense_size: 16
-  global_batch_size: 32
-searcher:
-  name: single
-  metric: val_categorical_accuracy
-  smaller_is_better: false
-  max_length:
-    batches: 500
-entrypoint: python3 -m determined.launch.horovod --autohorovod --trial model_def:IrisTrial
-min_validation_period:
-  batches: 50
-min_checkpoint_period:
-  batches: 50
-max_restarts: 0
diff --git a/examples/legacy/computer_vision/cifar10_pytorch/cifar10_pytorch_cancelable.yaml b/examples/legacy/computer_vision/cifar10_pytorch/cifar10_pytorch_cancelable.yaml
deleted file mode 100644
index 7b588d48b42..00000000000
--- a/examples/legacy/computer_vision/cifar10_pytorch/cifar10_pytorch_cancelable.yaml
+++ /dev/null
@@ -1,23 +0,0 @@
-name: cifar10_pytorch_cancelable
-resources:
-  slots_per_trial: 8
-  resource_pool: defq_GPU_cancelable
-hyperparameters:
-  learning_rate: 1.0e-4
-  learning_rate_decay: 1.0e-6
-  layer1_dropout: 0.25
-  layer2_dropout: 0.25
-  layer3_dropout: 0.5
-  global_batch_size: 32
-records_per_epoch: 500
-searcher:
-  name: single
-  metric: validation_error
-  max_length:
-    epochs: 2
-entrypoint: model_def:CIFARTrial
-min_validation_period:
-  epochs: 1
-min_checkpoint_period:
-  epochs: 1
-max_restarts: 0
diff --git a/examples/legacy/computer_vision/cifar10_pytorch/cifar10_pytorch_high_priority.yaml b/examples/legacy/computer_vision/cifar10_pytorch/cifar10_pytorch_high_priority.yaml
deleted file mode 100644
index 5bd0c5f920a..00000000000
--- a/examples/legacy/computer_vision/cifar10_pytorch/cifar10_pytorch_high_priority.yaml
+++ /dev/null
@@ -1,23 +0,0 @@
-name: cifar10_pytorch_high_priority
-resources:
-  slots_per_trial: 8
-  resource_pool: defq_GPU_hipri
-hyperparameters:
-  learning_rate: 1.0e-4
-  learning_rate_decay: 1.0e-6
-  layer1_dropout: 0.25
-  layer2_dropout: 0.25
-  layer3_dropout: 0.5
-  global_batch_size: 32
-records_per_epoch: 500
-searcher:
-  name: single
-  metric: validation_error
-  max_length:
-    epochs: 2
-entrypoint: model_def:CIFARTrial
-min_validation_period:
-  epochs: 1
-min_checkpoint_period:
-  epochs: 1
-max_restarts: 0
diff --git a/harness/tests/launch/test_launch.py b/harness/tests/launch/test_launch.py
index 9bd29b0c7aa..8d8ad365afb 100644
--- a/harness/tests/launch/test_launch.py
+++ b/harness/tests/launch/test_launch.py
@@ -1,4 +1,7 @@
+import os
 import runpy
+import signal
+import time
 from typing import Any, Dict, List
 from unittest import mock
 
@@ -55,3 +58,41 @@ def test_launch_script(mock_validate_config: mock.MagicMock) -> None:
     with pytest.raises(SystemExit) as e:
         runpy.run_module("determined.exec.launch", run_name="__main__", alter_sys=True)
     assert e.value.code == 1, e
+
+
+@mock.patch("subprocess.Popen")
+@mock.patch("determined.common.api.authentication.login_from_task")
+def test_launch_catches_slurm_preemption(
+    mock_login_from_task: mock.MagicMock,
+    mock_popen: mock.MagicMock,
+) -> None:
+    """
+    Determined's Slurm preemption logic involves catching the SIGTERM sent by Slurm and
+    reporting it to the master; the Python code then detects it through the normal
+    should_preempt() logic.
+
+    Here, we need to test that the launch script actually catches SIGTERM and makes the
+    right API call to the master.
+    """
+
+    # Send a SIGTERM to this process during the Popen.wait().
+    def wait_side_effect() -> int:
+        os.kill(os.getpid(), signal.SIGTERM)
+        # Make any syscall at all to ensure the signal handler has a good chance to fire.
+        time.sleep(0.000001)
+        # The SIGTERM handler should have fired; make sure the right API call was made.
+        mock_login_from_task.return_value.post.assert_called_once_with(
+            "/api/v1/allocations/allocationId/signals/pending_preemption"
+        )
+        # Return a unique value to make sure our mocks are wired up right.
+        return 789
+
+    mock_popen.return_value.wait.side_effect = wait_side_effect
+
+    old_handler = signal.getsignal(signal.SIGTERM)
+    try:
+        with test_util.set_env_vars({"DET_RESOURCES_TYPE": "slurm-job"}):
+            with test_util.set_mock_cluster_info(["0.0.0.1"], 0, 1):
+                config = {"entrypoint": "yo"}
+                assert launch.launch(det.ExperimentConfig(config)) == 789
+    finally:
+        signal.signal(signal.SIGTERM, old_handler)
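
The test above pins down the launcher side of this handshake: the SIGTERM arrives while the launcher is blocked in Popen.wait(), the handler reports pending preemption to the master, and the unique return value (789) confirms the child's exit code is what launch() ultimately returns. As a rough illustration of the mechanism being exercised (a minimal sketch, not the actual determined.exec.launch source; the function name and session object are hypothetical):

    import signal
    import subprocess


    def run_with_preemption_reporting(cmd, sess, allocation_id):
        # Launch the worker process that actually runs the trial code.
        proc = subprocess.Popen(cmd)

        def handle_sigterm(signum, frame):
            # Slurm sends SIGTERM ahead of a forced preemption; reporting it to the
            # master is what makes should_preempt() start returning True in the trial.
            sess.post(f"/api/v1/allocations/{allocation_id}/signals/pending_preemption")

        old_handler = signal.signal(signal.SIGTERM, handle_sigterm)
        try:
            # The signal can arrive at any point during this blocking wait.
            return proc.wait()
        finally:
            signal.signal(signal.SIGTERM, old_handler)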