Skip to content

Commit

Permalink
Increase Nanny close timeout in LocalCUDACluster tests
Browse files Browse the repository at this point in the history
Tests in CI may fail at times, possibly only under high load, due to the
`Nanny` close timeout, whose internal mechanism for establishing the
timeout to kill processes may leave too little time for a process to
shut down properly.

Dask-CUDA introduced a new `IncreasedCloseTimeoutNanny` class intended
to be used with `LocalCUDACluster` in tests to reduce the chances that
such timeouts occur. This new class is now used in tests to improve the
situation in CI.
  • Loading branch information
pentschev committed Oct 30, 2023
1 parent ff635fc commit 23c3755
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 2 deletions.
8 changes: 7 additions & 1 deletion python/cuml/benchmark/automated/dask/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

from dask_cuda import initialize
from dask_cuda import LocalCUDACluster
from dask_cuda.utils_test import IncreasedCloseTimeoutNanny
from dask.distributed import Client

enable_tcp_over_ucx = True
Expand All @@ -28,7 +29,11 @@
@pytest.fixture(scope="module")
def cluster():

cluster = LocalCUDACluster(protocol="tcp", scheduler_port=0)
cluster = LocalCUDACluster(
protocol="tcp",
scheduler_port=0,
worker_class=IncreasedCloseTimeoutNanny,
)
yield cluster
cluster.close()

Expand All @@ -54,6 +59,7 @@ def ucx_cluster():
enable_tcp_over_ucx=enable_tcp_over_ucx,
enable_nvlink=enable_nvlink,
enable_infiniband=enable_infiniband,
worker_class=IncreasedCloseTimeoutNanny,
)
yield cluster
cluster.close()
Expand Down
8 changes: 7 additions & 1 deletion python/cuml/tests/dask/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from dask_cuda import initialize
from dask_cuda import LocalCUDACluster
from dask_cuda.utils_test import IncreasedCloseTimeoutNanny
from dask.distributed import Client

enable_tcp_over_ucx = True
Expand All @@ -14,7 +15,11 @@
@pytest.fixture(scope="module")
def cluster():

cluster = LocalCUDACluster(protocol="tcp", scheduler_port=0)
cluster = LocalCUDACluster(
protocol="tcp",
scheduler_port=0,
worker_class=IncreasedCloseTimeoutNanny,
)
yield cluster
cluster.close()

Expand All @@ -40,6 +45,7 @@ def ucx_cluster():
enable_tcp_over_ucx=enable_tcp_over_ucx,
enable_nvlink=enable_nvlink,
enable_infiniband=enable_infiniband,
worker_class=IncreasedCloseTimeoutNanny,
)
yield cluster
cluster.close()
Expand Down

0 comments on commit 23c3755

Please sign in to comment.