determined-ai · djanicekpach · Nov 13, 2024 · Nov 6, 2024 · Nov 6, 2024 · Nov 6, 2024
diff --git a/e2e_tests/pytest.ini b/e2e_tests/pytest.ini
@@ -20,7 +20,6 @@ markers =
     e2e_pbs: end to end pbs integration tests
     e2e_saml: tests for saml with okta
     e2e_slurm: end to end slurm integration tests
-    e2e_slurm_restart: slurm integration tests that require restarting the master
     e2e_slurm_internet_connected_cluster: slurm integrations for clusters with internet access
     test_oauth: end to end test for oauth client, add, remove in EE.
     test_model_registry_rbac: end to end test for RBAC model registry.

@@ -11,7 +11,3 @@
     restartable_managed_cluster_multi_resource_pools,
 )
 from .managed_cluster_k8s import k8s_managed_cluster  # noqa
-from .managed_slurm_cluster import (  # noqa
-    managed_slurm_cluster_restarts,
-    managed_slurm_cluster_session,
-)
@@ -1,6 +1,5 @@
 import logging
 import time
-from typing import Iterator
 
 import docker
 import pytest
@@ -15,31 +14,13 @@
 from tests import config as conf
 from tests import detproc
 from tests import experiment as exp
-from tests.cluster import (
-    abstract_cluster,
-    managed_cluster,
-    managed_cluster_k8s,
-    managed_slurm_cluster,
-    utils,
-)
+from tests.cluster import abstract_cluster, managed_cluster, managed_cluster_k8s, utils
 from tests.experiment import noop
 from tests.task import task
 
 logger = logging.getLogger(__name__)
 
 
-# Create a pytest fixture that returns a restartable instance of ManagedSlurmCluster.
-@pytest.fixture
-def restartable_managed_slurm_cluster(
-    managed_slurm_cluster_restarts: managed_slurm_cluster.ManagedSlurmCluster,
-) -> Iterator[managed_slurm_cluster.ManagedSlurmCluster]:
-    try:
-        yield managed_slurm_cluster_restarts
-    except Exception:
-        managed_slurm_cluster_restarts.restart_master()
-        raise
-
-
 @pytest.mark.managed_devcluster
 def test_master_restart_ok(restartable_managed_cluster: managed_cluster.ManagedCluster) -> None:
     _test_master_restart_ok(restartable_managed_cluster)
@@ -90,14 +71,6 @@ def test_master_restart_ok_k8s(k8s_managed_cluster: managed_cluster_k8s.ManagedK
     _test_master_restart_ok(k8s_managed_cluster)
 
 
-# Test to ensure master restarts successfully.
-@pytest.mark.e2e_slurm_restart
-def test_master_restart_ok_slurm(
-    managed_slurm_cluster_restarts: managed_slurm_cluster.ManagedSlurmCluster,
-) -> None:
-    _test_master_restart_ok(managed_slurm_cluster_restarts)
-
-
 def _test_master_restart_ok(managed_cluster: abstract_cluster.Cluster) -> None:
     # - Kill master
     # - Restart master
@@ -143,18 +116,6 @@ def test_master_restart_reattach_recover_experiment_k8s(
     _test_master_restart_reattach_recover_experiment(k8s_managed_cluster, downtime)
 
 
-# Test to ensure that master can reattach to the experiment and resume it, after the determined
-# master has restarted.
-@pytest.mark.e2e_slurm_restart
-@pytest.mark.parametrize("downtime", [0, 20, 60])
-def test_master_restart_reattach_recover_experiment_slurm(
-    managed_slurm_cluster_restarts: managed_slurm_cluster.ManagedSlurmCluster, downtime: int
-) -> None:
-    _test_master_restart_reattach_recover_experiment(
-        managed_slurm_cluster_restarts, downtime, max_workload_ticks=500
-    )
-
-
 @pytest.mark.managed_devcluster
 def test_master_agent_restart_reattach_recover_experiment(
     restartable_managed_cluster: managed_cluster.ManagedCluster,
@@ -544,19 +505,6 @@ def test_master_restart_cmd_k8s(
     _test_master_restart_cmd(k8s_managed_cluster, slots, downtime)
 
 
-# Test to ensure that master can recover and complete a command that was in running state
-# when the master has restarted.
-@pytest.mark.e2e_slurm_restart
-@pytest.mark.parametrize("slots", [0, 1])
-@pytest.mark.parametrize("downtime", [0, 20, 60])
-def test_master_restart_cmd_slurm(
-    restartable_managed_slurm_cluster: managed_slurm_cluster.ManagedSlurmCluster,
-    slots: int,
-    downtime: int,
-) -> None:
-    _test_master_restart_cmd(restartable_managed_slurm_cluster, slots, downtime)
-
-
 def _test_master_restart_cmd(
     managed_cluster: abstract_cluster.Cluster, slots: int, downtime: int
 ) -> None:

@@ -170,20 +170,6 @@ def test_docker_login() -> None:
     )
 
 
-@pytest.mark.skipif(not torch.cuda.is_available(), reason="no gpu available")
-@pytest.mark.e2e_slurm
-@pytest.mark.e2e_pbs
-@api_utils.skipif_not_hpc()
-def test_mnist_pytorch_distributed() -> None:
-    sess = api_utils.user_session()
-    config = conf.load_config(conf.tutorials_path("mnist_pytorch/distributed.yaml"))
-    assert "--epochs 1" in config["entrypoint"], "update test to match tutorial"
-    config["entrypoint"] = config["entrypoint"].replace("--epochs 1", "--batches 64")
-    config["max_restarts"] = 0
-
-    exp.run_basic_test_with_temp_config(sess, config, conf.fixtures_path("mnist_pytorch"), 1)
-
-
 @pytest.mark.e2e_slurm
 @pytest.mark.e2e_pbs
 @api_utils.skipif_not_hpc()

@@ -34,7 +34,6 @@
     "e2e_pbs",
     "e2e_saml",
     "e2e_slurm",
-    "e2e_slurm_restart",
     "e2e_slurm_internet_connected_cluster",
     "det_deploy_local",
     "test_oauth",

diff --git a/tools/slurm/README.md b/tools/slurm/README.md
@@ -148,9 +148,7 @@ By default, the `test-e2e-*-gcp` jobs are not run within the `test-e2e` workflow
 
 **On branch `main` and `release/rc` branches, these jobs always run without needing to set the `ci-run-allgcp` label.**
 
-The following test suites currently run only on hardware. They do not run successfully with `make slurmcluster` and thus are not executed via GCP as part of the CI/CD gate:
-  - `test-e2e-slurm-preemption-quarantine`: Currently runs on znode as a part of the nightly test suite.
-  - `test-e2e-slurm-restart`: Dependent upon znode configuration, so not worth testing on GCP.
+
 
 ## Important Workaround Explained