Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ci: delete znode tests #10201

Merged
merged 3 commits into from
Nov 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
402 changes: 0 additions & 402 deletions .circleci/real_config.yml

Large diffs are not rendered by default.

1 change: 0 additions & 1 deletion e2e_tests/pytest.ini
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ markers =
e2e_pbs: end to end pbs integration tests
e2e_saml: tests for saml with okta
e2e_slurm: end to end slurm integration tests
e2e_slurm_restart: slurm integration tests that require restarting the master
e2e_slurm_internet_connected_cluster: slurm integration tests for clusters with internet access
test_oauth: end to end test for oauth client, add, remove in EE.
test_model_registry_rbac: end to end test for RBAC model registry.
Expand Down
4 changes: 0 additions & 4 deletions e2e_tests/tests/cluster/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,3 @@
restartable_managed_cluster_multi_resource_pools,
)
from .managed_cluster_k8s import k8s_managed_cluster # noqa
from .managed_slurm_cluster import ( # noqa
managed_slurm_cluster_restarts,
managed_slurm_cluster_session,
)
110 changes: 0 additions & 110 deletions e2e_tests/tests/cluster/managed_slurm_cluster.py

This file was deleted.

54 changes: 1 addition & 53 deletions e2e_tests/tests/cluster/test_master_restart.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import logging
import time
from typing import Iterator

import docker
import pytest
Expand All @@ -15,31 +14,13 @@
from tests import config as conf
from tests import detproc
from tests import experiment as exp
from tests.cluster import (
abstract_cluster,
managed_cluster,
managed_cluster_k8s,
managed_slurm_cluster,
utils,
)
from tests.cluster import abstract_cluster, managed_cluster, managed_cluster_k8s, utils
from tests.experiment import noop
from tests.task import task

logger = logging.getLogger(__name__)


@pytest.fixture
def restartable_managed_slurm_cluster(
    managed_slurm_cluster_restarts: managed_slurm_cluster.ManagedSlurmCluster,
) -> Iterator[managed_slurm_cluster.ManagedSlurmCluster]:
    """Yield the ``managed_slurm_cluster_restarts`` cluster to the test.

    If an exception is raised into this generator at the ``yield`` point,
    the master is restarted via ``restart_master()`` and the exception is
    re-raised unchanged.
    """
    # NOTE(review): pytest appears to resume yield-fixtures with next() at
    # teardown rather than throwing the test's exception into them, so this
    # handler may never run when a test fails -- confirm against pytest docs.
    slurm_cluster = managed_slurm_cluster_restarts
    try:
        yield slurm_cluster
    except Exception:
        slurm_cluster.restart_master()
        raise


@pytest.mark.managed_devcluster
def test_master_restart_ok(restartable_managed_cluster: managed_cluster.ManagedCluster) -> None:
_test_master_restart_ok(restartable_managed_cluster)
Expand Down Expand Up @@ -90,14 +71,6 @@ def test_master_restart_ok_k8s(k8s_managed_cluster: managed_cluster_k8s.ManagedK
_test_master_restart_ok(k8s_managed_cluster)


@pytest.mark.e2e_slurm_restart
def test_master_restart_ok_slurm(
    managed_slurm_cluster_restarts: managed_slurm_cluster.ManagedSlurmCluster,
) -> None:
    """Check that the master restarts successfully on a managed Slurm cluster."""
    # Delegates to the cluster-agnostic restart check used by the other variants.
    _test_master_restart_ok(managed_cluster=managed_slurm_cluster_restarts)


def _test_master_restart_ok(managed_cluster: abstract_cluster.Cluster) -> None:
# - Kill master
# - Restart master
Expand Down Expand Up @@ -143,18 +116,6 @@ def test_master_restart_reattach_recover_experiment_k8s(
_test_master_restart_reattach_recover_experiment(k8s_managed_cluster, downtime)


@pytest.mark.e2e_slurm_restart
@pytest.mark.parametrize("downtime", [0, 20, 60])
def test_master_restart_reattach_recover_experiment_slurm(
    managed_slurm_cluster_restarts: managed_slurm_cluster.ManagedSlurmCluster, downtime: int
) -> None:
    """Check that a restarted master reattaches to an experiment and resumes it.

    Parametrized over ``downtime`` values 0, 20 and 60, which are forwarded
    to the shared helper together with ``max_workload_ticks=500``.
    """
    slurm_cluster = managed_slurm_cluster_restarts
    _test_master_restart_reattach_recover_experiment(
        slurm_cluster,
        downtime,
        max_workload_ticks=500,
    )


@pytest.mark.managed_devcluster
def test_master_agent_restart_reattach_recover_experiment(
restartable_managed_cluster: managed_cluster.ManagedCluster,
Expand Down Expand Up @@ -544,19 +505,6 @@ def test_master_restart_cmd_k8s(
_test_master_restart_cmd(k8s_managed_cluster, slots, downtime)


@pytest.mark.e2e_slurm_restart
@pytest.mark.parametrize("slots", [0, 1])
@pytest.mark.parametrize("downtime", [0, 20, 60])
def test_master_restart_cmd_slurm(
    restartable_managed_slurm_cluster: managed_slurm_cluster.ManagedSlurmCluster,
    slots: int,
    downtime: int,
) -> None:
    """Check that a command running during a master restart is recovered and completes.

    Runs once for every combination of ``slots`` (0, 1) and ``downtime``
    (0, 20, 60); all three values are passed through to the shared helper.
    """
    _test_master_restart_cmd(
        managed_cluster=restartable_managed_slurm_cluster,
        slots=slots,
        downtime=downtime,
    )


def _test_master_restart_cmd(
managed_cluster: abstract_cluster.Cluster, slots: int, downtime: int
) -> None:
Expand Down
14 changes: 0 additions & 14 deletions e2e_tests/tests/cluster/test_slurm.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,20 +170,6 @@ def test_docker_login() -> None:
)


@pytest.mark.skipif(not torch.cuda.is_available(), reason="no gpu available")
@pytest.mark.e2e_slurm
@pytest.mark.e2e_pbs
@api_utils.skipif_not_hpc()
def test_mnist_pytorch_distributed() -> None:
    """Run the distributed MNIST PyTorch tutorial config with a shortened entrypoint.

    The tutorial's ``--epochs 1`` argument is swapped for ``--batches 64`` and
    ``max_restarts`` is set to 0 before the experiment is submitted. Skipped
    when CUDA is not available.
    """
    session = api_utils.user_session()
    tutorial_cfg = conf.load_config(conf.tutorials_path("mnist_pytorch/distributed.yaml"))

    # Guard against the tutorial drifting: the replacement below only works
    # while the tutorial entrypoint still contains "--epochs 1".
    entrypoint = tutorial_cfg["entrypoint"]
    assert "--epochs 1" in entrypoint, "update test to match tutorial"
    tutorial_cfg["entrypoint"] = entrypoint.replace("--epochs 1", "--batches 64")
    tutorial_cfg["max_restarts"] = 0

    model_dir = conf.fixtures_path("mnist_pytorch")
    # The trailing 1 is presumably the expected number of trials -- TODO confirm.
    exp.run_basic_test_with_temp_config(session, tutorial_cfg, model_dir, 1)


@pytest.mark.e2e_slurm
@pytest.mark.e2e_pbs
@api_utils.skipif_not_hpc()
Expand Down
1 change: 0 additions & 1 deletion e2e_tests/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@
"e2e_pbs",
"e2e_saml",
"e2e_slurm",
"e2e_slurm_restart",
"e2e_slurm_internet_connected_cluster",
"det_deploy_local",
"test_oauth",
Expand Down
4 changes: 1 addition & 3 deletions tools/slurm/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -148,9 +148,7 @@ By default, the `test-e2e-*-gcp` jobs are not run within the `test-e2e` workflow

**On branch `main` and `release/rc` branches, these jobs always run without needing to set the `ci-run-allgcp` label.**

The following test suites currently run only on hardware. They do not run successfully with `make slurmcluster` and thus are not executed via GCP as part of the CI/CD gate:
- `test-e2e-slurm-preemption-quarantine`: Currently runs on znode as a part of the nightly test suite.
- `test-e2e-slurm-restart`: Dependent upon znode configuration, so not worth testing on GCP.


## Important Workaround Explained

Expand Down
Loading