Skip to content

Commit

Permalink
chore: remove e2e_slurm_preemption test series (#10053)
Browse files Browse the repository at this point in the history
The only test in this series was an e2e test sure that we caught slurm's
SIGTERM, reported that to the master via an API call, and that training
could would then see should_preempt() return True.

This didn't need to be an e2e test; it can be a unit test to make sure
our SIGTERM handler is working, plus rerouting the API logic to pass
through the our normal preemption handling (#10071).  There is already
plenty of coverage ensuring should_preempt() works in python code.

So by adding that unit test here and following the refactor of #10071,
it is safe to remove the e2e_slurm_preemption series entirely.

This is part of a larger effort to get rid of our znode tests, which
are notoriously unreliable.
  • Loading branch information
rb-determined-ai authored Oct 22, 2024
1 parent e6182ed commit c6c476c
Show file tree
Hide file tree
Showing 9 changed files with 41 additions and 168 deletions.
8 changes: 0 additions & 8 deletions .circleci/real_config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5351,14 +5351,6 @@ workflows:
- package-and-push-system-local-ee
extra-pytest-flags: "--no-compare-stats"
collect-det-job-logs: false
- test-e2e-slurm:
name: test-e2e-slurm-preemption
context:
- dev-ci-cluster-default-user-credentials
mark: "e2e_slurm_preemption"
requires:
- package-and-push-system-local-ee
extra-pytest-flags: "--no-compare-stats"
- test-e2e-slurm:
name: test-e2e-slurm-znode
context:
Expand Down
1 change: 0 additions & 1 deletion e2e_tests/pytest.ini
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ markers =
e2e_saml: tests for saml with okta
e2e_slurm: end to end slurm integration tests
e2e_slurm_restart: slurm integration tests that require restarting the master
e2e_slurm_preemption: hpc integration test to ensure preemption is working
e2e_slurm_internet_connected_cluster: slurm integrations for clusters with internet access
test_oauth: end to end test for oauth client, add, remove in EE.
test_model_registry_rbac: end to end test for RBAC model registry.
Expand Down
56 changes: 0 additions & 56 deletions e2e_tests/tests/cluster/test_slurm.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,62 +184,6 @@ def test_mnist_pytorch_distributed() -> None:
exp.run_basic_test_with_temp_config(sess, config, conf.fixtures_path("mnist_pytorch"), 1)


# Test to ensure that determined is able to handle preemption gracefully when using dispatcher RM.
# Preemption:
# When users launch a set of experiments requesting different levels of priorities
# and resources, and when there are a limited set of resources, high priority experiments can
# cancel or requeue low priority experiments.
# Preemption is dependent upon the underlying HPC system and the WLM (SLURM/PBS) setup.
# Nodes in an HPC systems are typically divided into multiple partitions (logical grouping of
# nodes into possibly overlapping sets) used for different purposes. Using WLMs sysadmins
# typically assign varying levels of priority for each partition. Also, users can request the
# WLM to provide specific partition and priority level for their jobs.
# In the following test case we test an example preemption scenario. We launch the two experiments
# iris_tf_keras_cancellable and iris_tf_keras_high_priority in order. Ensure that the
# iris_tf_keras_cancellable experiment is requeued, iris_tf_keras_high_priority experiment
# runs to completion. After that, iris_tf_keras_cancellable experiment is resumed and it runs
# to completion.
# NB: The clusters casablanca-login and znode have one node (8-GPUs) being used in two partitions:
# 1. defq_GPU_cancellable - partition for low priority and jobs are requeued if necessary
# 2. defq_GPU_hipri - partition for high priority non-cancellable jobs
@pytest.mark.e2e_slurm_preemption
@api_utils.skipif_not_slurm()
def test_slurm_preemption() -> None:
sess = api_utils.user_session()
# Launch the iris_tf_keras_cancellable experiment requesting 8 GPUs on defq_GPU_cancellable
# partition
cancelable_exp_id = exp.create_experiment(
sess,
conf.cv_examples_path("iris_tf_keras/iris_tf_keras_cancelable.yaml"),
conf.cv_examples_path("iris_tf_keras"),
None,
)
# Wait for the first cancellable experiment to enter RUNNING state.
exp.wait_for_experiment_state(sess, cancelable_exp_id, bindings.experimentv1State.RUNNING)
# Wait for the first cancellable experiment to complete at least one checkpoint.
exp.wait_for_at_least_one_checkpoint(sess, cancelable_exp_id, 300)
# Launch the iris_tf_keras_high_priority experiment requesting 8 GPUs on defq_GPU_hipri
# partition
high_priority_exp_id = exp.create_experiment(
sess,
conf.cv_examples_path("iris_tf_keras/iris_tf_keras_high_priority.yaml"),
conf.cv_examples_path("iris_tf_keras"),
None,
)
# In this scenario, iris_tf_keras_high_priority experiment will cause the
# iris_tf_keras_cancelable experiment to get requeued. The experiment
# iris_tf_keras_high_priority will execute to completion.
exp.wait_for_experiment_state(sess, cancelable_exp_id, bindings.experimentv1State.QUEUED)
exp.wait_for_experiment_state(sess, high_priority_exp_id, bindings.experimentv1State.RUNNING)
exp.wait_for_experiment_state(sess, high_priority_exp_id, bindings.experimentv1State.COMPLETED)
# Now, the experiment iris_tf_keras_cancelable will resume as soon as the requested
# resources are available.
exp.wait_for_experiment_state(sess, cancelable_exp_id, bindings.experimentv1State.RUNNING)
# Finally, the experiment iris_tf_keras_cancelable will complete if there are no other
# interruptions.
exp.wait_for_experiment_state(sess, cancelable_exp_id, bindings.experimentv1State.COMPLETED)


@pytest.mark.e2e_slurm
@pytest.mark.e2e_pbs
@api_utils.skipif_not_hpc()
Expand Down
1 change: 0 additions & 1 deletion e2e_tests/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@
"e2e_saml",
"e2e_slurm",
"e2e_slurm_restart",
"e2e_slurm_preemption",
"e2e_slurm_internet_connected_cluster",
"det_deploy_local",
"test_oauth",
Expand Down

This file was deleted.

This file was deleted.

This file was deleted.

This file was deleted.

41 changes: 41 additions & 0 deletions harness/tests/launch/test_launch.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
import os
import runpy
import signal
import time
from typing import Any, Dict, List
from unittest import mock

Expand Down Expand Up @@ -55,3 +58,41 @@ def test_launch_script(mock_validate_config: mock.MagicMock) -> None:
with pytest.raises(SystemExit) as e:
runpy.run_module("determined.exec.launch", run_name="__main__", alter_sys=True)
assert e.value.code == 1, e


@mock.patch("subprocess.Popen")
@mock.patch("determined.common.api.authentication.login_from_task")
def test_launch_catches_slurm_preemption(
mock_login_from_task: mock.MagicMock,
mock_popen: mock.MagicMock,
) -> None:
"""
Determined's slurm preemption logic involves catching the SIGTERM sent by slurm, reporting it
to the master, then the python code will detect that through the normal should_preempt() logic.
Here, we need to test that the launch script actually catches SIGTERM and makes the right API
call to the master.
"""

# Send a SIGTERM to this process during the Popen.wait().
def wait_side_effect() -> int:
os.kill(os.getpid(), signal.SIGTERM)
# Make any syscall at all to ensure the signal handler has a good chance to fire.
time.sleep(0.000001)
# SIGTERM handler should have fired, make sure the right API call was made.
mock_login_from_task.return_value.post.assert_called_once_with(
"/api/v1/allocations/allocationId/signals/pending_preemption"
)
# Return a unique value to make sure our mocks are wired up right.
return 789

mock_popen.return_value.wait.side_effect = wait_side_effect

old_handler = signal.getsignal(signal.SIGTERM)
try:
with test_util.set_env_vars({"DET_RESOURCES_TYPE": "slurm-job"}):
with test_util.set_mock_cluster_info(["0.0.0.1"], 0, 1):
config = {"entrypoint": "yo"}
assert launch.launch(det.ExperimentConfig(config)) == 789
finally:
signal.signal(signal.SIGTERM, old_handler)

0 comments on commit c6c476c

Please sign in to comment.