From eed83be46b41c7ce55ea9ede0b3681b6ff7f0bd8 Mon Sep 17 00:00:00 2001
From: Daniel Janicek <daniel.janicek@hpe.com>
Date: Wed, 6 Nov 2024 08:52:54 -0800
Subject: [PATCH 1/3] Remove znode Slurm e2e CI jobs and related tests

---
 .circleci/real_config.yml                     | 402 ------------------
 e2e_tests/pytest.ini                          |   1 -
 .../tests/cluster/managed_slurm_cluster.py    | 110 -----
 .../tests/cluster/test_master_restart.py      |  44 --
 e2e_tests/tests/cluster/test_slurm.py         |  14 -
 e2e_tests/tests/conftest.py                   |   1 -
 tools/slurm/README.md                         |   4 +-
 7 files changed, 1 insertion(+), 575 deletions(-)
 delete mode 100644 e2e_tests/tests/cluster/managed_slurm_cluster.py

diff --git a/.circleci/real_config.yml b/.circleci/real_config.yml
index a4451177768..64a609ca2e8 100644
--- a/.circleci/real_config.yml
+++ b/.circleci/real_config.yml
@@ -2723,15 +2723,6 @@ jobs:
       - store_test_results:
           path: /tmp/test-results
 
-  test-e2e-slurm-disabled:
-    parameters:
-      master_config:
-        type: string
-        default:
-    docker:
-      - image: <<pipeline.parameters.docker-image>>
-    steps:
-      - run: echo "Test suite disabled."
 
   # By default, this job only runs on the main branch unless otherwise
   # specified. To invoke this job on a developer branch, add the 'ci-run-allgcp'
@@ -2906,252 +2897,6 @@ jobs:
       - store_test_results:
           path: /tmp/test-results/
 
-  test-e2e-slurm:
-    parameters:
-      mark:
-        type: string
-        default: e2e_slurm
-      runner_class:
-        type: string
-        default: determined-ai/znode-cluster
-      master_config:
-        type: string
-        default: |
-          task_container_defaults:
-            slurm:
-              sbatch_args:
-                - --time=04:00:00
-            environment_variables:
-              # Some ports are not working, disable them so distributed jobs work.
-              - NCCL_IB_HCA=mlx6_0:0
-          checkpoint_storage:
-            type: shared_fs
-            host_path: /scratch/launcher/.launcher.$HOSTNAME/checkpoints
-            storage_path: determined-checkpoint
-            save_experiment_best: 0
-            save_trial_best: 1
-            save_trial_latest: 1
-          db:
-            user: postgres
-            host: localhost
-            port: 5432
-            name: determined
-            password: ${HPC_DB_PASSWD}
-          resource_manager:
-            type: slurm
-            master_host: $HOSTNAME
-            master_port: 8080
-            host: localhost
-            port: 8181
-            protocol: http
-            slot_type: cuda
-            user_name: launcher
-            group_name: hpcd
-            singularity_image_root: /lustre/hdd/foundation_engineering/images
-            job_storage_root: /scratch/launcher/.launcher.$HOSTNAME
-            auth_file: /home/launcher/.launcher.$HOSTNAME.token
-            path: /opt/singularity/bin:/usr/local/bin:${PATH}
-            ld_library_path:
-          security:
-            initial_user_password: ${INITIAL_USER_PASSWORD}
-      reserved_ports_znode50:
-        type: string
-        default: |
-          reserved_ports:
-            - 12350
-            - 12351
-            - 12360
-            - 12361
-            - 12365
-            - 12366
-            - 29400
-      determined_master_host:
-        type: string
-        default: localhost:8080
-      cluster_unix_user:
-        type: string
-        default: launcher
-      cluster_determined_user:
-        type: string
-        default: determined
-      determined_admin_username:
-        type: string
-        default: admin
-      database_username:
-        type: string
-        default: postgres
-      database_password:
-        type: string
-        default: launcher
-      extra-pytest-flags:
-        type: string
-        default: ""
-      collect-det-job-logs:
-        type: boolean
-        default: true
-    # Following https://circleci.com/docs/2.0/runner-installation-linux/index.html#start-the-service
-    machine: true
-    resource_class: <<parameters.runner_class>>
-    environment:
-      SHARED_CLUSTER: True
-    steps:
-      - checkout
-      - attach_workspace:
-          at: .
-
-      - set-slack-user-id
-      - run: sudo yum install -y xmlsec1
-      - run:
-          name: Remove previous HPE MLDE Master RPM
-          # FE-35: the slow exit of a container, or NFS caching can cause incorrect response
-          # and failure executing command "rm -rf ${dir}". Wait for 5 seconds and retry the command.
-          # Ignore the failure on retry, cause the failure from "rm -rf" should not stop the test suite to run.
-          command: |
-            export DET_PKG_NAME=$(rpm -qp --queryformat "%{NAME}" master/dist/hpe-mlde-master_*-ee_linux_amd64.rpm)
-            if rpm -q $DET_PKG_NAME; then
-              sudo rpm -e $DET_PKG_NAME
-            fi
-            echo "Cleanup state from prior runs on HOSTNAME=$HOSTNAME"
-
-            dir="/scratch/launcher/.launcher.$HOSTNAME/checkpoints/determined-checkpoint"
-            if sudo rm -rf ${dir}; then
-              echo "Removed ${dir}"
-            else
-              sleep 5
-              echo "Retry cleanup ${dir}"
-              sudo rm -rf ${dir} || true
-            fi
-
-            dir="/scratch/launcher/.launcher.$HOSTNAME/archiveVolumes/"
-            if sudo rm -rf ${dir}; then
-              echo "Removed ${dir}"
-            else
-              sleep 5
-              echo "Retry cleanup ${dir}"
-              sudo rm -rf ${dir} || true
-            fi
-
-            dir="/scratch/launcher/.launcher.$HOSTNAME/jobs"
-            if sudo rm -rf ${dir}; then
-              echo "Removed ${dir}"
-            else
-              sleep 5
-              echo "Retry cleanup ${dir}"
-              sudo rm -rf ${dir} || true
-            fi
-
-      - setup-python-venv:
-          determined: True
-          extra-requirements-file: "e2e_tests/tests/requirements.txt"
-          install-python: false
-          executor: <<parameters.runner_class>>
-
-      - run:
-          name: Recreate Fresh Database
-          command: |
-            if systemctl is-active --quiet determined-master; then
-              sudo systemctl stop determined-master
-            fi
-            PGPASSWORD=<<parameters.database_password>> dropdb --host=localhost --port=5432 --username=<<parameters.database_username>> --if-exists determined
-            PGPASSWORD=<<parameters.database_password>> createdb --host=localhost --port=5432 --username=<<parameters.database_username>> determined
-
-      - run:
-          name: Install/Configure HPE MLDE Master
-          command: |
-            sudo rpm -i master/dist/hpe-mlde-master_*-ee_linux_amd64.rpm
-            cat \<< EOF > ./master.yaml
-            <<parameters.master_config>>
-            EOF
-            cat \<< EOF > ./reserved.yaml
-            <<parameters.reserved_ports_znode50>>
-            EOF
-            echo "hostname is $(hostname)"
-            # Disallow certain ports on znode50 for fewer conflicts with znode51
-            if [[ "$(hostname)" == "znode50" ]] ; then cat ./reserved.yaml >> ./master.yaml ; fi
-            cat ./master.yaml
-            sudo cp ./master.yaml /etc/determined/master.yaml
-            sudo systemctl daemon-reload
-            sudo systemctl start determined-master
-            # Show if there are any drained nodes
-            sinfo -R
-            # Resume any drained nodes due to problems killing podman processes.
-            # This will return an error if all nodes do not require a resume (ignore status)
-            sudo scontrol update nodename=znode5[0-1,3-4] state=resume || true
-            sudo su - launcher -c "pdsh -R ssh -w znode50,znode51,znode53,znode54 mkdir -p /tmp/launcher_podman" || true
-            # Cleanup podman state from any issues in prior runs
-            #sudo su - launcher -c "pdsh -R ssh -w znode50,znode51,znode53,znode54 XDG_RUNTIME_DIR=/tmp/launcher_podman podman system migrate" || true
-            sinfo -R
-
-      - run:
-          name: Download Apptainer/Enroot image
-          command: |
-            # rocm images are not required at present
-            if sudo su - launcher -c "/etc/launcher/scripts/manage-singularity-cache <<pipeline.parameters.default-pt-gpu-hpc-image>>"; then
-                echo "Downloaded Singularity image <<pipeline.parameters.default-pt-gpu-hpc-image>> "
-            else
-                EXIT_STATUS=$?
-                echo "Failed downloading Singularity image. Received exit code $EXIT_STATUS"
-                #Ignore the other failures except for IMAGE_REF_NOT_FOUND_IN_DOC=18 or DOC_FILE_NOT_FOUND=11
-                if [[ $EXIT_STATUS -eq 18 || $EXIT_STATUS -eq 11 ]]; then
-                    exit $EXIT_STATUS
-                else
-                    exit 0
-                fi
-            fi
-            if sudo su - launcher -c "ENROOT_RUNTIME_PATH=/tmp/launcher /etc/launcher/scripts/manage-enroot-cache -s /lustre/ssd/foundation_engineering/ <<pipeline.parameters.default-pt-tf-cpu-hpc-image>>"; then
-                echo "Downloaded Enroot image <<pipeline.parameters.default-pt-tf-cpu-hpc-image>> "
-            else
-                EXIT_STATUS=$?
-                echo "Failed downloading Enroot image. Received exit code $EXIT_STATUS"
-                #Ignore the other failures except for IMAGE_REF_NOT_FOUND_IN_DOC=18 or DOC_FILE_NOT_FOUND=11
-                if [[ $EXIT_STATUS -eq 18 || $EXIT_STATUS -eq 11 ]]; then
-                    exit $EXIT_STATUS
-                else
-                    exit 0
-                fi
-            fi
-
-      - wait-for-master:
-          host: localhost
-
-      - run:
-          name: Populate determined user agent values
-          command: |
-            id <<parameters.cluster_unix_user>> || sudo useradd <<parameters.cluster_unix_user>>
-            TOKEN=$(
-              curl "<<parameters.determined_master_host>>/api/v1/auth/login" \
-                -f \
-                -X POST \
-                --data-binary @- \<< EOF | jq -r '.token'
-              {
-                  "username": "<<parameters.determined_admin_username>>",
-                  "password": "$INITIAL_USER_PASSWORD"
-              }
-            EOF
-            )
-            curl "<<parameters.determined_master_host>>/api/v1/users/2" \
-              -f \
-              -X PATCH \
-              -H "Authorization: Bearer ${TOKEN}" \
-              --data-binary @- \<< EOF
-              {"agentUserGroup": {
-                "agentUid": $(id -u <<parameters.cluster_unix_user>>),
-                "agentUser": "<<parameters.cluster_unix_user>>",
-                "agentGid": $(id -g <<parameters.cluster_unix_user>>),
-                "agentGroup": "$(id -gn <<parameters.cluster_unix_user>>)"
-                }
-              }
-            EOF
-
-      - run-e2e-tests:
-          mark: <<parameters.mark>>
-          master-host: localhost
-          managed-devcluster: false
-          extra-pytest-flags: <<parameters.extra-pytest-flags>>
-          collect-det-job-logs: <<parameters.collect-det-job-logs>>
-
-      - store_test_results:
-          path: /tmp/test-results/
 
   test-e2e:
     parameters:
@@ -5112,77 +4857,6 @@ workflows:
           type: approval
           filters: *upstream-feature-branch
 
-      - test-e2e-slurm:
-          name: test-e2e-slurm-restart
-          context:
-            - dev-ci-cluster-default-user-credentials
-          mark: "e2e_slurm_restart"
-          filters: *upstream-feature-branch
-          requires:
-            - package-and-push-system-local-ee
-            - request-hpc-tests
-          extra-pytest-flags: "--no-compare-stats"
-          collect-det-job-logs: false
-
-      - test-e2e-slurm:
-          name: test-e2e-slurm-enroot-znode
-          context:
-            - dev-ci-cluster-default-user-credentials
-          matrix:
-            parameters:
-              mark: ["e2e_slurm and not deepspeed"]
-          requires:
-            - package-and-push-system-local-ee
-            - request-hpc-tests
-          master_config: |
-            task_container_defaults:
-              slurm:
-                sbatch_args:
-                  - --time=04:00:00
-              # The current image must be created in the launcher account before running this test
-              # cd /lustre/ssd/foundation_engineering/
-              # enroot import docker://determinedai/environments:cuda-11.3-pytorch-1.10-tf-2.8-gpu-24586f0.sqsh
-              # enroot create /lustre/ssd/foundation_engineering/determinedai+environments+cuda-11.3-pytorch-1.10-tf-2.8-gpu-24586f0.sqsh
-              # image: determinedai+environments+cuda-11.3-pytorch-1.10-tf-2.8-gpu-24586f0
-              image:
-                cpu: <<pipeline.parameters.default-pt-tf-cpu-hpc-image>>
-                cuda: <<pipeline.parameters.default-pt-gpu-hpc-image>>
-              environment_variables:
-                  # Some ports are not working, disable them so distributed jobs work.
-                  - NCCL_IB_HCA=mlx6_0:0
-                  # Workaround XDG_RUNTIME_DIR not provided by Slurm
-                  - ENROOT_RUNTIME_PATH=/tmp/launcher
-            checkpoint_storage:
-              type: shared_fs
-              host_path: /scratch/launcher/.launcher.$HOSTNAME/checkpoints
-              storage_path: determined-checkpoint
-              save_experiment_best: 0
-              save_trial_best: 1
-              save_trial_latest: 1
-            db:
-              user: postgres
-              host: localhost
-              port: 5432
-              name: determined
-              password: ${HPC_DB_PASSWD}
-            resource_manager:
-              type: slurm
-              master_host: $HOSTNAME
-              master_port: 8080
-              host: localhost
-              port: 8181
-              protocol: http
-              slot_type: cuda
-              user_name: launcher
-              container_run_type: enroot
-              group_name: hpcd
-              singularity_image_root: /lustre/hdd/foundation_engineering/images
-              job_storage_root: /scratch/launcher/.launcher.$HOSTNAME
-              auth_file: /home/launcher/.launcher.$HOSTNAME.token
-              path: /opt/singularity/bin:/usr/local/bin:${PATH}
-              ld_library_path:
-            security:
-              initial_user_password: ${INITIAL_USER_PASSWORD}
 
       # Singularity over SLURM test on GCP
       - test-e2e-hpc-gcp:
@@ -5385,82 +5059,6 @@ workflows:
             - build-docs
             - build-react-ee
           context: github-read
-      - test-e2e-slurm:
-          name: test-e2e-slurm-restart
-          context:
-            - dev-ci-cluster-default-user-credentials
-          mark: "e2e_slurm_restart"
-          requires:
-            - package-and-push-system-local-ee
-          extra-pytest-flags: "--no-compare-stats"
-          collect-det-job-logs: false
-      - test-e2e-slurm:
-          name: test-e2e-slurm-znode
-          context:
-            - dev-ci-cluster-default-user-credentials
-          requires:
-            - package-and-push-system-local-ee
-          extra-pytest-flags: "--no-compare-stats"
-      - test-e2e-slurm:
-          name: test-e2e-slurm-enroot-znode
-          context:
-            - dev-ci-cluster-default-user-credentials
-          matrix:
-            parameters:
-              mark: ["e2e_slurm and not deepspeed"]
-          requires:
-            - package-and-push-system-local-ee
-          master_config: |
-            task_container_defaults:
-              slurm:
-                sbatch_args:
-                  - --time=04:00:00
-              # The current image must be created in the launcher account before running this test
-              # cd /lustre/ssd/foundation_engineering/
-              # enroot import docker://determinedai/environments:cuda-11.3-pytorch-1.10-tf-2.8-gpu-24586f0.sqsh
-              # enroot create /lustre/ssd/foundation_engineering/determinedai+environments+cuda-11.3-pytorch-1.10-tf-2.8-gpu-24586f0.sqsh
-              # image: determinedai+environments+cuda-11.3-pytorch-1.10-tf-2.8-gpu-24586f0
-              image:
-                cpu: <<pipeline.parameters.default-pt-tf-cpu-hpc-image>>
-                cuda: <<pipeline.parameters.default-pt-gpu-hpc-image>>
-              environment_variables:
-                  # Some ports are not working, disable them so distributed jobs work.
-                  - NCCL_IB_HCA=mlx6_0:0
-                  # Workaround XDG_RUNTIME_DIR not provided by Slurm
-                  - ENROOT_RUNTIME_PATH=/tmp/launcher
-            checkpoint_storage:
-              type: shared_fs
-              host_path: /scratch/launcher/.launcher.$HOSTNAME/checkpoints
-              storage_path: determined-checkpoint
-              save_experiment_best: 0
-              save_trial_best: 1
-              save_trial_latest: 1
-            db:
-              user: postgres
-              host: localhost
-              port: 5432
-              name: determined
-              password: ${HPC_DB_PASSWD}
-            resource_manager:
-              type: slurm
-              master_host: $HOSTNAME
-              master_port: 8080
-              host: localhost
-              port: 8181
-              protocol: http
-              slot_type: cuda
-              user_name: launcher
-              container_run_type: enroot
-              group_name: hpcd
-              singularity_image_root: /lustre/hdd/foundation_engineering/images
-              job_storage_root: /scratch/launcher/.launcher.$HOSTNAME
-              auth_file: /home/launcher/.launcher.$HOSTNAME.token
-              path: /opt/singularity/bin:/usr/local/bin:${PATH}
-              ld_library_path:
-            security:
-              initial_user_password: ${INITIAL_USER_PASSWORD}
-      - terminate-vpc-circleci:
-            context: ["gcp"]
 
   manual-e2e-react:
     when: << pipeline.parameters.e2e-react >>
diff --git a/e2e_tests/pytest.ini b/e2e_tests/pytest.ini
index 7ec4f8e50a9..db2f35ea6c8 100644
--- a/e2e_tests/pytest.ini
+++ b/e2e_tests/pytest.ini
@@ -20,7 +20,6 @@ markers =
     e2e_pbs: end to end pbs integration tests
     e2e_saml: tests for saml with okta
     e2e_slurm: end to end slurm integration tests
-    e2e_slurm_restart: slurm integration tests that require restarting the master
     e2e_slurm_internet_connected_cluster: slurm integrations for clusters with internet access
     test_oauth: end to end test for oauth client, add, remove in EE.
     test_model_registry_rbac: end to end test for RBAC model registry.
diff --git a/e2e_tests/tests/cluster/managed_slurm_cluster.py b/e2e_tests/tests/cluster/managed_slurm_cluster.py
deleted file mode 100644
index 0d04df9968b..00000000000
--- a/e2e_tests/tests/cluster/managed_slurm_cluster.py
+++ /dev/null
@@ -1,110 +0,0 @@
-import os
-import shlex
-import subprocess
-import time
-from typing import Any, Iterator
-
-import pytest
-
-from tests import config as conf
-from tests.cluster import abstract_cluster, utils
-
-
-# ManagedSlurmCluster is an implementation of the abstract class Cluster, to suit a slurm based
-# devcluster instance. It is used as part of the e2e slurm tests that require the master to be
-# restarted.
-class ManagedSlurmCluster(abstract_cluster.Cluster):
-    def __init__(self) -> None:
-        self.is_circleci_job = os.getenv("IS_CIRCLECI_JOB")
-        self.dc = None
-        return
-
-    def __enter__(self) -> "ManagedSlurmCluster":
-        self._start_devcluster()
-        return self
-
-    def __exit__(self, *_: Any) -> None:
-        self.kill_master()
-        return
-
-    def kill_master(self) -> None:
-        if self.is_circleci_job:
-            # Use the pre-installed determined master service when running the tests as part of a
-            # CircleCI job.
-            subprocess.run(shlex.split("sudo systemctl stop determined-master"))
-        else:
-            # Use the local instance of devcluster.
-            if self.dc:
-                self.dc.kill()
-            self.dc = None
-        time.sleep(10)
-
-    def restart_master(self) -> None:
-        try:
-            self.kill_master()
-            self._start_devcluster()
-        except Exception as e:
-            print(e)
-            raise
-
-    def _start_devcluster(self) -> None:
-        try:
-            if self.is_circleci_job:
-                # Use the pre-installed determined master service when running the tests as part
-                # of a CircleCI job.
-                subprocess.run(shlex.split("sudo systemctl start determined-master"))
-            else:
-                # Use a local instance of the devcluster.
-                master_config_file = os.getenv("MASTER_CONFIG_FILE")
-                if not master_config_file:
-                    raise Exception(
-                        "MASTER_CONFIG_FILE is not set. Please set the MASTER_CONFIG_FILE to point "
-                        "to the master config file you want to use. Use ./tools/slurmcluster.sh -s "
-                        "<machine name> to create a new one."
-                    )
-                if not os.path.exists(master_config_file):
-                    raise Exception(
-                        f"Master config file {master_config_file} is missing. Please use "
-                        "./tools/slurmcluster.sh -s <machine name> to create one."
-                    )
-                self.dc = subprocess.Popen(  # type: ignore
-                    ["devcluster", "-c", master_config_file, "--oneshot"],
-                    cwd="..",
-                )
-            time.sleep(30)
-        except Exception as e:
-            print(e)
-            raise
-
-    def ensure_agent_ok(self) -> None:
-        pass
-
-    def restart_agent(self, wait_for_amnesia: bool = True, wait_for_agent: bool = True) -> None:
-        pass
-
-
-# Create a pytest fixture that returns a ManagedSlurmCluster instance and set it's scope equal as
-# session (active for entire duration of the pytest command execution).
-@pytest.fixture(scope="session")
-def managed_slurm_cluster_session(request: Any) -> Iterator[ManagedSlurmCluster]:
-    with ManagedSlurmCluster() as msc:
-        yield msc
-
-
-# Create a pytest fixture that returns a fixture of kind managed_slurm_cluster_session, defined
-# above. Additionally, log the timestamp and the nodeid (pytest identifier for each test) before
-# and after every test.
-@pytest.fixture
-def managed_slurm_cluster_restarts(
-    managed_slurm_cluster_session: ManagedSlurmCluster, request: Any
-) -> Iterator[ManagedSlurmCluster]:
-    if os.getenv("IS_CIRCLECI_JOB"):
-        # CircleCI job has master running on port 8080
-        conf.MASTER_PORT = "8080"
-    else:
-        # Local instance of devcluster is run on port 8081
-        conf.MASTER_PORT = "8081"
-    nodeid = request.node.nodeid
-    managed_slurm_cluster_session.log_marker(f"pytest [{utils.now_ts()}] {nodeid} setup\n")
-    yield managed_slurm_cluster_session
-    managed_slurm_cluster_session.log_marker(f"pytest [{utils.now_ts()}] {nodeid} teardown\n")
diff --git a/e2e_tests/tests/cluster/test_master_restart.py b/e2e_tests/tests/cluster/test_master_restart.py
index 59441c0f294..9150ab15c2e 100644
--- a/e2e_tests/tests/cluster/test_master_restart.py
+++ b/e2e_tests/tests/cluster/test_master_restart.py
@@ -19,7 +19,6 @@
     abstract_cluster,
     managed_cluster,
     managed_cluster_k8s,
-    managed_slurm_cluster,
     utils,
 )
 from tests.experiment import noop
@@ -28,18 +27,6 @@
 logger = logging.getLogger(__name__)
 
 
-# Create a pytest fixture that returns a restartable instance of ManagedSlurmCluster.
-@pytest.fixture
-def restartable_managed_slurm_cluster(
-    managed_slurm_cluster_restarts: managed_slurm_cluster.ManagedSlurmCluster,
-) -> Iterator[managed_slurm_cluster.ManagedSlurmCluster]:
-    try:
-        yield managed_slurm_cluster_restarts
-    except Exception:
-        managed_slurm_cluster_restarts.restart_master()
-        raise
-
-
 @pytest.mark.managed_devcluster
 def test_master_restart_ok(restartable_managed_cluster: managed_cluster.ManagedCluster) -> None:
     _test_master_restart_ok(restartable_managed_cluster)
@@ -90,13 +77,6 @@ def test_master_restart_ok_k8s(k8s_managed_cluster: managed_cluster_k8s.ManagedK
     _test_master_restart_ok(k8s_managed_cluster)
 
 
-# Test to ensure master restarts successfully.
-@pytest.mark.e2e_slurm_restart
-def test_master_restart_ok_slurm(
-    managed_slurm_cluster_restarts: managed_slurm_cluster.ManagedSlurmCluster,
-) -> None:
-    _test_master_restart_ok(managed_slurm_cluster_restarts)
-
 
 def _test_master_restart_ok(managed_cluster: abstract_cluster.Cluster) -> None:
     # - Kill master
@@ -143,17 +123,6 @@ def test_master_restart_reattach_recover_experiment_k8s(
     _test_master_restart_reattach_recover_experiment(k8s_managed_cluster, downtime)
 
 
-# Test to ensure that master can reattach to the experiment and resume it, after the determined
-# master has restarted.
-@pytest.mark.e2e_slurm_restart
-@pytest.mark.parametrize("downtime", [0, 20, 60])
-def test_master_restart_reattach_recover_experiment_slurm(
-    managed_slurm_cluster_restarts: managed_slurm_cluster.ManagedSlurmCluster, downtime: int
-) -> None:
-    _test_master_restart_reattach_recover_experiment(
-        managed_slurm_cluster_restarts, downtime, max_workload_ticks=500
-    )
-
 
 @pytest.mark.managed_devcluster
 def test_master_agent_restart_reattach_recover_experiment(
@@ -544,19 +513,6 @@ def test_master_restart_cmd_k8s(
     _test_master_restart_cmd(k8s_managed_cluster, slots, downtime)
 
 
-# Test to ensure that master can recover and complete a command that was in running state
-# when the master has restarted.
-@pytest.mark.e2e_slurm_restart
-@pytest.mark.parametrize("slots", [0, 1])
-@pytest.mark.parametrize("downtime", [0, 20, 60])
-def test_master_restart_cmd_slurm(
-    restartable_managed_slurm_cluster: managed_slurm_cluster.ManagedSlurmCluster,
-    slots: int,
-    downtime: int,
-) -> None:
-    _test_master_restart_cmd(restartable_managed_slurm_cluster, slots, downtime)
-
-
 def _test_master_restart_cmd(
     managed_cluster: abstract_cluster.Cluster, slots: int, downtime: int
 ) -> None:
diff --git a/e2e_tests/tests/cluster/test_slurm.py b/e2e_tests/tests/cluster/test_slurm.py
index 5a8166e0d56..ddd61fe9234 100644
--- a/e2e_tests/tests/cluster/test_slurm.py
+++ b/e2e_tests/tests/cluster/test_slurm.py
@@ -170,20 +170,6 @@ def test_docker_login() -> None:
     )
 
 
-@pytest.mark.skipif(not torch.cuda.is_available(), reason="no gpu available")
-@pytest.mark.e2e_slurm
-@pytest.mark.e2e_pbs
-@api_utils.skipif_not_hpc()
-def test_mnist_pytorch_distributed() -> None:
-    sess = api_utils.user_session()
-    config = conf.load_config(conf.tutorials_path("mnist_pytorch/distributed.yaml"))
-    assert "--epochs 1" in config["entrypoint"], "update test to match tutorial"
-    config["entrypoint"] = config["entrypoint"].replace("--epochs 1", "--batches 64")
-    config["max_restarts"] = 0
-
-    exp.run_basic_test_with_temp_config(sess, config, conf.fixtures_path("mnist_pytorch"), 1)
-
-
 @pytest.mark.e2e_slurm
 @pytest.mark.e2e_pbs
 @api_utils.skipif_not_hpc()
diff --git a/e2e_tests/tests/conftest.py b/e2e_tests/tests/conftest.py
index 92a99c651d3..85e0e1cfeb9 100644
--- a/e2e_tests/tests/conftest.py
+++ b/e2e_tests/tests/conftest.py
@@ -34,7 +34,6 @@
     "e2e_pbs",
     "e2e_saml",
     "e2e_slurm",
-    "e2e_slurm_restart",
     "e2e_slurm_internet_connected_cluster",
     "det_deploy_local",
     "test_oauth",
diff --git a/tools/slurm/README.md b/tools/slurm/README.md
index eefdaca3df4..3a234254a7a 100644
--- a/tools/slurm/README.md
+++ b/tools/slurm/README.md
@@ -148,9 +148,7 @@ By default, the `test-e2e-*-gcp` jobs are not run within the `test-e2e` workflow
 
 **On branch `main` and `release/rc` branches, these jobs always run without needing to set the `ci-run-allgcp` label.**
 
-The following test suites currently run only on hardware. They do not run successfully with `make slurmcluster` and thus are not executed via GCP as part of the CI/CD gate:
-  - `test-e2e-slurm-preemption-quarantine`: Currently runs on znode as a part of the nightly test suite.
-  - `test-e2e-slurm-restart`: Dependent upon znode configuration, so not worth testing on GCP.
+
 
 ## Important Workaround Explained
 

From 9cd59f17a8ea58d04aac3b6d1bf4207b85e69688 Mon Sep 17 00:00:00 2001
From: Daniel Janicek <daniel.janicek@hpe.com>
Date: Wed, 6 Nov 2024 09:29:22 -0800
Subject: [PATCH 2/3] lint: drop imports of removed managed_slurm_cluster module, fix formatting

---
 e2e_tests/tests/cluster/conftest.py            | 4 ----
 e2e_tests/tests/cluster/test_master_restart.py | 9 +--------
 2 files changed, 1 insertion(+), 12 deletions(-)

diff --git a/e2e_tests/tests/cluster/conftest.py b/e2e_tests/tests/cluster/conftest.py
index ba545dd0546..dfbb7601d5d 100644
--- a/e2e_tests/tests/cluster/conftest.py
+++ b/e2e_tests/tests/cluster/conftest.py
@@ -11,7 +11,3 @@
     restartable_managed_cluster_multi_resource_pools,
 )
 from .managed_cluster_k8s import k8s_managed_cluster  # noqa
-from .managed_slurm_cluster import (  # noqa
-    managed_slurm_cluster_restarts,
-    managed_slurm_cluster_session,
-)
diff --git a/e2e_tests/tests/cluster/test_master_restart.py b/e2e_tests/tests/cluster/test_master_restart.py
index 9150ab15c2e..8dad51701d3 100644
--- a/e2e_tests/tests/cluster/test_master_restart.py
+++ b/e2e_tests/tests/cluster/test_master_restart.py
@@ -15,12 +15,7 @@
 from tests import config as conf
 from tests import detproc
 from tests import experiment as exp
-from tests.cluster import (
-    abstract_cluster,
-    managed_cluster,
-    managed_cluster_k8s,
-    utils,
-)
+from tests.cluster import abstract_cluster, managed_cluster, managed_cluster_k8s, utils
 from tests.experiment import noop
 from tests.task import task
 
@@ -77,7 +72,6 @@ def test_master_restart_ok_k8s(k8s_managed_cluster: managed_cluster_k8s.ManagedK
     _test_master_restart_ok(k8s_managed_cluster)
 
 
-
 def _test_master_restart_ok(managed_cluster: abstract_cluster.Cluster) -> None:
     # - Kill master
     # - Restart master
@@ -123,7 +117,6 @@ def test_master_restart_reattach_recover_experiment_k8s(
     _test_master_restart_reattach_recover_experiment(k8s_managed_cluster, downtime)
 
 
-
 @pytest.mark.managed_devcluster
 def test_master_agent_restart_reattach_recover_experiment(
     restartable_managed_cluster: managed_cluster.ManagedCluster,

From f2a6f355e223b12a412ac452ba86312f202e8077 Mon Sep 17 00:00:00 2001
From: Daniel Janicek <daniel.janicek@hpe.com>
Date: Wed, 6 Nov 2024 10:19:16 -0800
Subject: [PATCH 3/3] lint: remove typing.Iterator import from test_master_restart.py

---
 e2e_tests/tests/cluster/test_master_restart.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/e2e_tests/tests/cluster/test_master_restart.py b/e2e_tests/tests/cluster/test_master_restart.py
index 8dad51701d3..fb420bb9a8d 100644
--- a/e2e_tests/tests/cluster/test_master_restart.py
+++ b/e2e_tests/tests/cluster/test_master_restart.py
@@ -1,6 +1,5 @@
 import logging
 import time
-from typing import Iterator
 
 import docker
 import pytest