From eed83be46b41c7ce55ea9ede0b3681b6ff7f0bd8 Mon Sep 17 00:00:00 2001 From: Daniel Janicek Date: Wed, 6 Nov 2024 08:52:54 -0800 Subject: [PATCH 1/3] znode delete --- .circleci/real_config.yml | 402 ------------------ e2e_tests/pytest.ini | 1 - .../tests/cluster/managed_slurm_cluster.py | 110 ----- .../tests/cluster/test_master_restart.py | 44 -- e2e_tests/tests/cluster/test_slurm.py | 14 - e2e_tests/tests/conftest.py | 1 - tools/slurm/README.md | 4 +- 7 files changed, 1 insertion(+), 575 deletions(-) delete mode 100644 e2e_tests/tests/cluster/managed_slurm_cluster.py diff --git a/.circleci/real_config.yml b/.circleci/real_config.yml index a4451177768..64a609ca2e8 100644 --- a/.circleci/real_config.yml +++ b/.circleci/real_config.yml @@ -2723,15 +2723,6 @@ jobs: - store_test_results: path: /tmp/test-results - test-e2e-slurm-disabled: - parameters: - master_config: - type: string - default: - docker: - - image: <> - steps: - - run: echo "Test suite disabled." # By default, this job only runs on the main branch unless otherwise # specified. To invoke this job on a developer branch, add the 'ci-run-allgcp' @@ -2906,252 +2897,6 @@ jobs: - store_test_results: path: /tmp/test-results/ - test-e2e-slurm: - parameters: - mark: - type: string - default: e2e_slurm - runner_class: - type: string - default: determined-ai/znode-cluster - master_config: - type: string - default: | - task_container_defaults: - slurm: - sbatch_args: - - --time=04:00:00 - environment_variables: - # Some ports are not working, disable them so distributed jobs work. - - NCCL_IB_HCA=mlx6_0:0 - checkpoint_storage: - type: shared_fs - host_path: /scratch/launcher/.launcher.$HOSTNAME/checkpoints - storage_path: determined-checkpoint - save_experiment_best: 0 - save_trial_best: 1 - save_trial_latest: 1 - db: - user: postgres - host: localhost - port: 5432 - name: determined - password: ${HPC_DB_PASSWD} - resource_manager: - type: slurm - master_host: $HOSTNAME - master_port: 8080 - host: localhost - port: 8181 - protocol: http - slot_type: cuda - user_name: launcher - group_name: hpcd - singularity_image_root: /lustre/hdd/foundation_engineering/images - job_storage_root: /scratch/launcher/.launcher.$HOSTNAME - auth_file: /home/launcher/.launcher.$HOSTNAME.token - path: /opt/singularity/bin:/usr/local/bin:${PATH} - ld_library_path: - security: - initial_user_password: ${INITIAL_USER_PASSWORD} - reserved_ports_znode50: - type: string - default: | - reserved_ports: - - 12350 - - 12351 - - 12360 - - 12361 - - 12365 - - 12366 - - 29400 - determined_master_host: - type: string - default: localhost:8080 - cluster_unix_user: - type: string - default: launcher - cluster_determined_user: - type: string - default: determined - determined_admin_username: - type: string - default: admin - database_username: - type: string - default: postgres - database_password: - type: string - default: launcher - extra-pytest-flags: - type: string - default: "" - collect-det-job-logs: - type: boolean - default: true - # Following https://circleci.com/docs/2.0/runner-installation-linux/index.html#start-the-service - machine: true - resource_class: <> - environment: - SHARED_CLUSTER: True - steps: - - checkout - - attach_workspace: - at: . - - - set-slack-user-id - - run: sudo yum install -y xmlsec1 - - run: - name: Remove previous HPE MLDE Master RPM - # FE-35: the slow exit of a container, or NFS caching can cause incorrect response - # and failure executing command "rm -rf ${dir}". Wait for 5 seconds and retry the command. - # Ignore the failure on retry, cause the failure from "rm -rf" should not stop the test suite to run. - command: | - export DET_PKG_NAME=$(rpm -qp --queryformat "%{NAME}" master/dist/hpe-mlde-master_*-ee_linux_amd64.rpm) - if rpm -q $DET_PKG_NAME; then - sudo rpm -e $DET_PKG_NAME - fi - echo "Cleanup state from prior runs on HOSTNAME=$HOSTNAME" - - dir="/scratch/launcher/.launcher.$HOSTNAME/checkpoints/determined-checkpoint" - if sudo rm -rf ${dir}; then - echo "Removed ${dir}" - else - sleep 5 - echo "Retry cleanup ${dir}" - sudo rm -rf ${dir} || true - fi - - dir="/scratch/launcher/.launcher.$HOSTNAME/archiveVolumes/" - if sudo rm -rf ${dir}; then - echo "Removed ${dir}" - else - sleep 5 - echo "Retry cleanup ${dir}" - sudo rm -rf ${dir} || true - fi - - dir="/scratch/launcher/.launcher.$HOSTNAME/jobs" - if sudo rm -rf ${dir}; then - echo "Removed ${dir}" - else - sleep 5 - echo "Retry cleanup ${dir}" - sudo rm -rf ${dir} || true - fi - - - setup-python-venv: - determined: True - extra-requirements-file: "e2e_tests/tests/requirements.txt" - install-python: false - executor: <> - - - run: - name: Recreate Fresh Database - command: | - if systemctl is-active --quiet determined-master; then - sudo systemctl stop determined-master - fi - PGPASSWORD=<> dropdb --host=localhost --port=5432 --username=<> --if-exists determined - PGPASSWORD=<> createdb --host=localhost --port=5432 --username=<> determined - - - run: - name: Install/Configure HPE MLDE Master - command: | - sudo rpm -i master/dist/hpe-mlde-master_*-ee_linux_amd64.rpm - cat \<< EOF > ./master.yaml - <> - EOF - cat \<< EOF > ./reserved.yaml - <> - EOF - echo "hostname is $(hostname)" - # Disallow certain ports on znode50 for fewer conflicts with znode51 - if [[ "$(hostname)" == "znode50" ]] ; then cat ./reserved.yaml >> ./master.yaml ; fi - cat ./master.yaml - sudo cp ./master.yaml /etc/determined/master.yaml - sudo systemctl daemon-reload - sudo systemctl start determined-master - # Show if there are any drained nodes - sinfo -R - # Resume any drained nodes due to problems killing podman processes. - # This will return an error if all nodes do not require a resume (ignore status) - sudo scontrol update nodename=znode5[0-1,3-4] state=resume || true - sudo su - launcher -c "pdsh -R ssh -w znode50,znode51,znode53,znode54 mkdir -p /tmp/launcher_podman" || true - # Cleanup podman state from any issues in prior runs - #sudo su - launcher -c "pdsh -R ssh -w znode50,znode51,znode53,znode54 XDG_RUNTIME_DIR=/tmp/launcher_podman podman system migrate" || true - sinfo -R - - - run: - name: Download Apptainer/Enroot image - command: | - # rocm images are not required at present - if sudo su - launcher -c "/etc/launcher/scripts/manage-singularity-cache <>"; then - echo "Downloaded Singularity image <> " - else - EXIT_STATUS=$? - echo "Failed downloading Singularity image. Received exit code $EXIT_STATUS" - #Ignore the other failures except for IMAGE_REF_NOT_FOUND_IN_DOC=18 or DOC_FILE_NOT_FOUND=11 - if [[ $EXIT_STATUS -eq 18 || $EXIT_STATUS -eq 11 ]]; then - exit $EXIT_STATUS - else - exit 0 - fi - fi - if sudo su - launcher -c "ENROOT_RUNTIME_PATH=/tmp/launcher /etc/launcher/scripts/manage-enroot-cache -s /lustre/ssd/foundation_engineering/ <>"; then - echo "Downloaded Enroot image <> " - else - EXIT_STATUS=$? - echo "Failed downloading Enroot image. Received exit code $EXIT_STATUS" - #Ignore the other failures except for IMAGE_REF_NOT_FOUND_IN_DOC=18 or DOC_FILE_NOT_FOUND=11 - if [[ $EXIT_STATUS -eq 18 || $EXIT_STATUS -eq 11 ]]; then - exit $EXIT_STATUS - else - exit 0 - fi - fi - - - wait-for-master: - host: localhost - - - run: - name: Populate determined user agent values - command: | - id <> || sudo useradd <> - TOKEN=$( - curl "<>/api/v1/auth/login" \ - -f \ - -X POST \ - --data-binary @- \<< EOF | jq -r '.token' - { - "username": "<>", - "password": "$INITIAL_USER_PASSWORD" - } - EOF - ) - curl "<>/api/v1/users/2" \ - -f \ - -X PATCH \ - -H "Authorization: Bearer ${TOKEN}" \ - --data-binary @- \<< EOF - {"agentUserGroup": { - "agentUid": $(id -u <>), - "agentUser": "<>", - "agentGid": $(id -g <>), - "agentGroup": "$(id -gn <>)" - } - } - EOF - - - run-e2e-tests: - mark: <> - master-host: localhost - managed-devcluster: false - extra-pytest-flags: <> - collect-det-job-logs: <> - - - store_test_results: - path: /tmp/test-results/ test-e2e: parameters: @@ -5112,77 +4857,6 @@ workflows: type: approval filters: *upstream-feature-branch - - test-e2e-slurm: - name: test-e2e-slurm-restart - context: - - dev-ci-cluster-default-user-credentials - mark: "e2e_slurm_restart" - filters: *upstream-feature-branch - requires: - - package-and-push-system-local-ee - - request-hpc-tests - extra-pytest-flags: "--no-compare-stats" - collect-det-job-logs: false - - - test-e2e-slurm: - name: test-e2e-slurm-enroot-znode - context: - - dev-ci-cluster-default-user-credentials - matrix: - parameters: - mark: ["e2e_slurm and not deepspeed"] - requires: - - package-and-push-system-local-ee - - request-hpc-tests - master_config: | - task_container_defaults: - slurm: - sbatch_args: - - --time=04:00:00 - # The current image must be created in the launcher account before running this test - # cd /lustre/ssd/foundation_engineering/ - # enroot import docker://determinedai/environments:cuda-11.3-pytorch-1.10-tf-2.8-gpu-24586f0.sqsh - # enroot create /lustre/ssd/foundation_engineering/determinedai+environments+cuda-11.3-pytorch-1.10-tf-2.8-gpu-24586f0.sqsh - # image: determinedai+environments+cuda-11.3-pytorch-1.10-tf-2.8-gpu-24586f0 - image: - cpu: <> - cuda: <> - environment_variables: - # Some ports are not working, disable them so distributed jobs work. - - NCCL_IB_HCA=mlx6_0:0 - # Workaround XDG_RUNTIME_DIR not provided by Slurm - - ENROOT_RUNTIME_PATH=/tmp/launcher - checkpoint_storage: - type: shared_fs - host_path: /scratch/launcher/.launcher.$HOSTNAME/checkpoints - storage_path: determined-checkpoint - save_experiment_best: 0 - save_trial_best: 1 - save_trial_latest: 1 - db: - user: postgres - host: localhost - port: 5432 - name: determined - password: ${HPC_DB_PASSWD} - resource_manager: - type: slurm - master_host: $HOSTNAME - master_port: 8080 - host: localhost - port: 8181 - protocol: http - slot_type: cuda - user_name: launcher - container_run_type: enroot - group_name: hpcd - singularity_image_root: /lustre/hdd/foundation_engineering/images - job_storage_root: /scratch/launcher/.launcher.$HOSTNAME - auth_file: /home/launcher/.launcher.$HOSTNAME.token - path: /opt/singularity/bin:/usr/local/bin:${PATH} - ld_library_path: - security: - initial_user_password: ${INITIAL_USER_PASSWORD} # Singularity over SLURM test on GCP - test-e2e-hpc-gcp: @@ -5385,82 +5059,6 @@ workflows: - build-docs - build-react-ee context: github-read - - test-e2e-slurm: - name: test-e2e-slurm-restart - context: - - dev-ci-cluster-default-user-credentials - mark: "e2e_slurm_restart" - requires: - - package-and-push-system-local-ee - extra-pytest-flags: "--no-compare-stats" - collect-det-job-logs: false - - test-e2e-slurm: - name: test-e2e-slurm-znode - context: - - dev-ci-cluster-default-user-credentials - requires: - - package-and-push-system-local-ee - extra-pytest-flags: "--no-compare-stats" - - test-e2e-slurm: - name: test-e2e-slurm-enroot-znode - context: - - dev-ci-cluster-default-user-credentials - matrix: - parameters: - mark: ["e2e_slurm and not deepspeed"] - requires: - - package-and-push-system-local-ee - master_config: | - task_container_defaults: - slurm: - sbatch_args: - - --time=04:00:00 - # The current image must be created in the launcher account before running this test - # cd /lustre/ssd/foundation_engineering/ - # enroot import docker://determinedai/environments:cuda-11.3-pytorch-1.10-tf-2.8-gpu-24586f0.sqsh - # enroot create /lustre/ssd/foundation_engineering/determinedai+environments+cuda-11.3-pytorch-1.10-tf-2.8-gpu-24586f0.sqsh - # image: determinedai+environments+cuda-11.3-pytorch-1.10-tf-2.8-gpu-24586f0 - image: - cpu: <> - cuda: <> - environment_variables: - # Some ports are not working, disable them so distributed jobs work. - - NCCL_IB_HCA=mlx6_0:0 - # Workaround XDG_RUNTIME_DIR not provided by Slurm - - ENROOT_RUNTIME_PATH=/tmp/launcher - checkpoint_storage: - type: shared_fs - host_path: /scratch/launcher/.launcher.$HOSTNAME/checkpoints - storage_path: determined-checkpoint - save_experiment_best: 0 - save_trial_best: 1 - save_trial_latest: 1 - db: - user: postgres - host: localhost - port: 5432 - name: determined - password: ${HPC_DB_PASSWD} - resource_manager: - type: slurm - master_host: $HOSTNAME - master_port: 8080 - host: localhost - port: 8181 - protocol: http - slot_type: cuda - user_name: launcher - container_run_type: enroot - group_name: hpcd - singularity_image_root: /lustre/hdd/foundation_engineering/images - job_storage_root: /scratch/launcher/.launcher.$HOSTNAME - auth_file: /home/launcher/.launcher.$HOSTNAME.token - path: /opt/singularity/bin:/usr/local/bin:${PATH} - ld_library_path: - security: - initial_user_password: ${INITIAL_USER_PASSWORD} - - terminate-vpc-circleci: - context: ["gcp"] manual-e2e-react: when: << pipeline.parameters.e2e-react >> diff --git a/e2e_tests/pytest.ini b/e2e_tests/pytest.ini index 7ec4f8e50a9..db2f35ea6c8 100644 --- a/e2e_tests/pytest.ini +++ b/e2e_tests/pytest.ini @@ -20,7 +20,6 @@ markers = e2e_pbs: end to end pbs integration tests e2e_saml: tests for saml with okta e2e_slurm: end to end slurm integration tests - e2e_slurm_restart: slurm integration tests that require restarting the master e2e_slurm_internet_connected_cluster: slurm integrations for clusters with internet access test_oauth: end to end test for oauth client, add, remove in EE. test_model_registry_rbac: end to end test for RBAC model registry. diff --git a/e2e_tests/tests/cluster/managed_slurm_cluster.py b/e2e_tests/tests/cluster/managed_slurm_cluster.py deleted file mode 100644 index 0d04df9968b..00000000000 --- a/e2e_tests/tests/cluster/managed_slurm_cluster.py +++ /dev/null @@ -1,110 +0,0 @@ -import os -import shlex -import subprocess -import time -from typing import Any, Iterator - -import pytest - -from tests import config as conf -from tests.cluster import abstract_cluster, utils - - -# ManagedSlurmCluster is an implementation of the abstract class Cluster, to suit a slurm based -# devcluster instance. It is used as part of the e2e slurm tests that require the master to be -# restarted. -class ManagedSlurmCluster(abstract_cluster.Cluster): - def __init__(self) -> None: - self.is_circleci_job = os.getenv("IS_CIRCLECI_JOB") - self.dc = None - return - - def __enter__(self) -> "ManagedSlurmCluster": - self._start_devcluster() - return self - - def __exit__(self, *_: Any) -> None: - self.kill_master() - return - - def kill_master(self) -> None: - if self.is_circleci_job: - # Use the pre-installed determined master service when running the tests as part of a - # CircleCI job. - subprocess.run(shlex.split("sudo systemctl stop determined-master")) - else: - # Use the local instance of devcluster. - if self.dc: - self.dc.kill() - self.dc = None - time.sleep(10) - - def restart_master(self) -> None: - try: - self.kill_master() - self._start_devcluster() - except Exception as e: - print(e) - raise - - def _start_devcluster(self) -> None: - try: - if self.is_circleci_job: - # Use the pre-installed determined master service when running the tests as part - # of a CircleCI job. - subprocess.run(shlex.split("sudo systemctl start determined-master")) - else: - # Use a local instance of the devcluster. - master_config_file = os.getenv("MASTER_CONFIG_FILE") - if not master_config_file: - raise Exception( - "MASTER_CONFIG_FILE is not set. Please set the MASTER_CONFIG_FILE to point " - "to the master config file you want to use. Use ./tools/slurmcluster.sh -s " - " to create a new one." - ) - if not os.path.exists(master_config_file): - raise Exception( - f"Master config file {master_config_file} is missing. Please use " - "./tools/slurmcluster.sh -s to create one." - ) - self.dc = subprocess.Popen( # type: ignore - ["devcluster", "-c", master_config_file, "--oneshot"], - cwd="..", - ) - time.sleep(30) - except Exception as e: - print(e) - raise - - def ensure_agent_ok(self) -> None: - pass - - def restart_agent(self, wait_for_amnesia: bool = True, wait_for_agent: bool = True) -> None: - pass - - -# Create a pytest fixture that returns a ManagedSlurmCluster instance and set it's scope equal as -# session (active for entire duration of the pytest command execution). -@pytest.fixture(scope="session") -def managed_slurm_cluster_session(request: Any) -> Iterator[ManagedSlurmCluster]: - with ManagedSlurmCluster() as msc: - yield msc - - -# Create a pytest fixture that returns a fixture of kind managed_slurm_cluster_session, defined -# above. Additionally, log the timestamp and the nodeid (pytest identifier for each test) before -# and after every test. -@pytest.fixture -def managed_slurm_cluster_restarts( - managed_slurm_cluster_session: ManagedSlurmCluster, request: Any -) -> Iterator[ManagedSlurmCluster]: - if os.getenv("IS_CIRCLECI_JOB"): - # CircleCI job has master running on port 8080 - conf.MASTER_PORT = "8080" - else: - # Local instance of devcluster is run on port 8081 - conf.MASTER_PORT = "8081" - nodeid = request.node.nodeid - managed_slurm_cluster_session.log_marker(f"pytest [{utils.now_ts()}] {nodeid} setup\n") - yield managed_slurm_cluster_session - managed_slurm_cluster_session.log_marker(f"pytest [{utils.now_ts()}] {nodeid} teardown\n") diff --git a/e2e_tests/tests/cluster/test_master_restart.py b/e2e_tests/tests/cluster/test_master_restart.py index 59441c0f294..9150ab15c2e 100644 --- a/e2e_tests/tests/cluster/test_master_restart.py +++ b/e2e_tests/tests/cluster/test_master_restart.py @@ -19,7 +19,6 @@ abstract_cluster, managed_cluster, managed_cluster_k8s, - managed_slurm_cluster, utils, ) from tests.experiment import noop @@ -28,18 +27,6 @@ logger = logging.getLogger(__name__) -# Create a pytest fixture that returns a restartable instance of ManagedSlurmCluster. -@pytest.fixture -def restartable_managed_slurm_cluster( - managed_slurm_cluster_restarts: managed_slurm_cluster.ManagedSlurmCluster, -) -> Iterator[managed_slurm_cluster.ManagedSlurmCluster]: - try: - yield managed_slurm_cluster_restarts - except Exception: - managed_slurm_cluster_restarts.restart_master() - raise - - @pytest.mark.managed_devcluster def test_master_restart_ok(restartable_managed_cluster: managed_cluster.ManagedCluster) -> None: _test_master_restart_ok(restartable_managed_cluster) @@ -90,13 +77,6 @@ def test_master_restart_ok_k8s(k8s_managed_cluster: managed_cluster_k8s.ManagedK _test_master_restart_ok(k8s_managed_cluster) -# Test to ensure master restarts successfully. -@pytest.mark.e2e_slurm_restart -def test_master_restart_ok_slurm( - managed_slurm_cluster_restarts: managed_slurm_cluster.ManagedSlurmCluster, -) -> None: - _test_master_restart_ok(managed_slurm_cluster_restarts) - def _test_master_restart_ok(managed_cluster: abstract_cluster.Cluster) -> None: # - Kill master @@ -143,17 +123,6 @@ def test_master_restart_reattach_recover_experiment_k8s( _test_master_restart_reattach_recover_experiment(k8s_managed_cluster, downtime) -# Test to ensure that master can reattach to the experiment and resume it, after the determined -# master has restarted. -@pytest.mark.e2e_slurm_restart -@pytest.mark.parametrize("downtime", [0, 20, 60]) -def test_master_restart_reattach_recover_experiment_slurm( - managed_slurm_cluster_restarts: managed_slurm_cluster.ManagedSlurmCluster, downtime: int -) -> None: - _test_master_restart_reattach_recover_experiment( - managed_slurm_cluster_restarts, downtime, max_workload_ticks=500 - ) - @pytest.mark.managed_devcluster def test_master_agent_restart_reattach_recover_experiment( @@ -544,19 +513,6 @@ def test_master_restart_cmd_k8s( _test_master_restart_cmd(k8s_managed_cluster, slots, downtime) -# Test to ensure that master can recover and complete a command that was in running state -# when the master has restarted. -@pytest.mark.e2e_slurm_restart -@pytest.mark.parametrize("slots", [0, 1]) -@pytest.mark.parametrize("downtime", [0, 20, 60]) -def test_master_restart_cmd_slurm( - restartable_managed_slurm_cluster: managed_slurm_cluster.ManagedSlurmCluster, - slots: int, - downtime: int, -) -> None: - _test_master_restart_cmd(restartable_managed_slurm_cluster, slots, downtime) - - def _test_master_restart_cmd( managed_cluster: abstract_cluster.Cluster, slots: int, downtime: int ) -> None: diff --git a/e2e_tests/tests/cluster/test_slurm.py b/e2e_tests/tests/cluster/test_slurm.py index 5a8166e0d56..ddd61fe9234 100644 --- a/e2e_tests/tests/cluster/test_slurm.py +++ b/e2e_tests/tests/cluster/test_slurm.py @@ -170,20 +170,6 @@ def test_docker_login() -> None: ) -@pytest.mark.skipif(not torch.cuda.is_available(), reason="no gpu available") -@pytest.mark.e2e_slurm -@pytest.mark.e2e_pbs -@api_utils.skipif_not_hpc() -def test_mnist_pytorch_distributed() -> None: - sess = api_utils.user_session() - config = conf.load_config(conf.tutorials_path("mnist_pytorch/distributed.yaml")) - assert "--epochs 1" in config["entrypoint"], "update test to match tutorial" - config["entrypoint"] = config["entrypoint"].replace("--epochs 1", "--batches 64") - config["max_restarts"] = 0 - - exp.run_basic_test_with_temp_config(sess, config, conf.fixtures_path("mnist_pytorch"), 1) - - @pytest.mark.e2e_slurm @pytest.mark.e2e_pbs @api_utils.skipif_not_hpc() diff --git a/e2e_tests/tests/conftest.py b/e2e_tests/tests/conftest.py index 92a99c651d3..85e0e1cfeb9 100644 --- a/e2e_tests/tests/conftest.py +++ b/e2e_tests/tests/conftest.py @@ -34,7 +34,6 @@ "e2e_pbs", "e2e_saml", "e2e_slurm", - "e2e_slurm_restart", "e2e_slurm_internet_connected_cluster", "det_deploy_local", "test_oauth", diff --git a/tools/slurm/README.md b/tools/slurm/README.md index eefdaca3df4..3a234254a7a 100644 --- a/tools/slurm/README.md +++ b/tools/slurm/README.md @@ -148,9 +148,7 @@ By default, the `test-e2e-*-gcp` jobs are not run within the `test-e2e` workflow **On branch `main` and `release/rc` branches, these jobs always run without needing to set the `ci-run-allgcp` label.** -The following test suites currently run only on hardware. They do not run successfully with `make slurmcluster` and thus are not executed via GCP as part of the CI/CD gate: - - `test-e2e-slurm-preemption-quarantine`: Currently runs on znode as a part of the nightly test suite. - - `test-e2e-slurm-restart`: Dependent upon znode configuration, so not worth testing on GCP. + ## Important Workaround Explained From 9cd59f17a8ea58d04aac3b6d1bf4207b85e69688 Mon Sep 17 00:00:00 2001 From: Daniel Janicek Date: Wed, 6 Nov 2024 09:29:22 -0800 Subject: [PATCH 2/3] lint --- e2e_tests/tests/cluster/conftest.py | 4 ---- e2e_tests/tests/cluster/test_master_restart.py | 9 +-------- 2 files changed, 1 insertion(+), 12 deletions(-) diff --git a/e2e_tests/tests/cluster/conftest.py b/e2e_tests/tests/cluster/conftest.py index ba545dd0546..dfbb7601d5d 100644 --- a/e2e_tests/tests/cluster/conftest.py +++ b/e2e_tests/tests/cluster/conftest.py @@ -11,7 +11,3 @@ restartable_managed_cluster_multi_resource_pools, ) from .managed_cluster_k8s import k8s_managed_cluster # noqa -from .managed_slurm_cluster import ( # noqa - managed_slurm_cluster_restarts, - managed_slurm_cluster_session, -) diff --git a/e2e_tests/tests/cluster/test_master_restart.py b/e2e_tests/tests/cluster/test_master_restart.py index 9150ab15c2e..8dad51701d3 100644 --- a/e2e_tests/tests/cluster/test_master_restart.py +++ b/e2e_tests/tests/cluster/test_master_restart.py @@ -15,12 +15,7 @@ from tests import config as conf from tests import detproc from tests import experiment as exp -from tests.cluster import ( - abstract_cluster, - managed_cluster, - managed_cluster_k8s, - utils, -) +from tests.cluster import abstract_cluster, managed_cluster, managed_cluster_k8s, utils from tests.experiment import noop from tests.task import task @@ -77,7 +72,6 @@ def test_master_restart_ok_k8s(k8s_managed_cluster: managed_cluster_k8s.ManagedK _test_master_restart_ok(k8s_managed_cluster) - def _test_master_restart_ok(managed_cluster: abstract_cluster.Cluster) -> None: # - Kill master # - Restart master @@ -123,7 +117,6 @@ def test_master_restart_reattach_recover_experiment_k8s( _test_master_restart_reattach_recover_experiment(k8s_managed_cluster, downtime) - @pytest.mark.managed_devcluster def test_master_agent_restart_reattach_recover_experiment( restartable_managed_cluster: managed_cluster.ManagedCluster, From f2a6f355e223b12a412ac452ba86312f202e8077 Mon Sep 17 00:00:00 2001 From: Daniel Janicek Date: Wed, 6 Nov 2024 10:19:16 -0800 Subject: [PATCH 3/3] lint --- e2e_tests/tests/cluster/test_master_restart.py | 1 - 1 file changed, 1 deletion(-) diff --git a/e2e_tests/tests/cluster/test_master_restart.py b/e2e_tests/tests/cluster/test_master_restart.py index 8dad51701d3..fb420bb9a8d 100644 --- a/e2e_tests/tests/cluster/test_master_restart.py +++ b/e2e_tests/tests/cluster/test_master_restart.py @@ -1,6 +1,5 @@ import logging import time -from typing import Iterator import docker import pytest