From 95f0a33e377eff36bf2b20f25748489dbdb5e5b2 Mon Sep 17 00:00:00 2001 From: Ray Douglass Date: Thu, 19 Sep 2024 11:46:22 -0400 Subject: [PATCH 01/13] DOC v24.12 Updates [skip ci] --- .github/workflows/build.yaml | 10 +++---- .github/workflows/pr.yaml | 12 ++++----- .github/workflows/test.yaml | 2 +- VERSION | 2 +- ci/build_docs.sh | 2 +- .../all_cuda-114_arch-x86_64.yaml | 14 +++++----- .../all_cuda-118_arch-x86_64.yaml | 14 +++++----- .../all_cuda-125_arch-x86_64.yaml | 14 +++++----- dependencies.yaml | 26 +++++++++---------- docs/source/explicit_comms.rst | 2 +- pyproject.toml | 10 +++---- 11 files changed, 54 insertions(+), 54 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 67bbd027..3d097bcd 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: conda-python-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -38,7 +38,7 @@ jobs: if: github.ref_type == 'branch' needs: [conda-python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12 with: arch: "amd64" branch: ${{ inputs.branch }} @@ -51,7 +51,7 @@ jobs: upload-conda: needs: [conda-python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -59,7 +59,7 @@ jobs: sha: ${{ inputs.sha }} wheel-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -72,7 +72,7 @@ jobs: wheel-publish: needs: wheel-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 76014652..0e20bdaf 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -18,26 +18,26 @@ jobs: - docs-build - wheel-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.12 checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.12 conda-python-build: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.12 with: build_type: pull-request conda-python-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10 + uses: 
rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.12 with: build_type: pull-request docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -46,7 +46,7 @@ jobs: run_script: "ci/build_docs.sh" wheel-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.12 with: build_type: pull-request # Package is pure Python and only ever requires one build. diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 1a0e7d87..631a6173 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-python-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.12 with: build_type: nightly branch: ${{ inputs.branch }} diff --git a/VERSION b/VERSION index 7c7ba044..af28c42b 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -24.10.00 +24.12.00 diff --git a/ci/build_docs.sh b/ci/build_docs.sh index 42103004..7850211e 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -23,7 +23,7 @@ rapids-mamba-retry install \ --channel "${PYTHON_CHANNEL}" \ dask-cuda -export RAPIDS_VERSION_NUMBER="24.10" +export RAPIDS_VERSION_NUMBER="24.12" export RAPIDS_DOCS_DIR="$(mktemp -d)" rapids-logger "Build Python docs" diff --git a/conda/environments/all_cuda-114_arch-x86_64.yaml b/conda/environments/all_cuda-114_arch-x86_64.yaml index 3cfd9cb2..3c327ff0 100644 --- a/conda/environments/all_cuda-114_arch-x86_64.yaml +++ b/conda/environments/all_cuda-114_arch-x86_64.yaml @@ -10,10 +10,10 @@ dependencies: - click >=8.1 - cuda-version=11.4 - cudatoolkit -- cudf==24.10.*,>=0.0.0a0 -- dask-cudf==24.10.*,>=0.0.0a0 -- distributed-ucxx==0.40.*,>=0.0.0a0 -- kvikio==24.10.*,>=0.0.0a0 +- cudf==24.12.*,>=0.0.0a0 +- dask-cudf==24.12.*,>=0.0.0a0 +- distributed-ucxx==0.41.*,>=0.0.0a0 +- kvikio==24.12.*,>=0.0.0a0 - numactl-devel-cos7-x86_64 - numba>=0.57 - numpy>=1.23,<3.0a0 @@ -25,13 +25,13 @@ dependencies: - pytest-cov - python>=3.10,<3.13 - rapids-build-backend>=0.3.0,<0.4.0dev0 -- rapids-dask-dependency==24.10.*,>=0.0.0a0 +- rapids-dask-dependency==24.12.*,>=0.0.0a0 - setuptools>=64.0.0 - sphinx - sphinx-click>=2.7.1 - sphinx-rtd-theme>=0.5.1 - ucx-proc=*=gpu -- ucx-py==0.40.*,>=0.0.0a0 -- ucxx==0.40.*,>=0.0.0a0 +- ucx-py==0.41.*,>=0.0.0a0 +- ucxx==0.41.*,>=0.0.0a0 - zict>=2.0.0 name: all_cuda-114_arch-x86_64 diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index b7b99751..3931f3bf 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -10,10 +10,10 @@ dependencies: - click >=8.1 - cuda-version=11.8 - cudatoolkit -- cudf==24.10.*,>=0.0.0a0 -- dask-cudf==24.10.*,>=0.0.0a0 -- distributed-ucxx==0.40.*,>=0.0.0a0 -- kvikio==24.10.*,>=0.0.0a0 +- cudf==24.12.*,>=0.0.0a0 +- dask-cudf==24.12.*,>=0.0.0a0 +- distributed-ucxx==0.41.*,>=0.0.0a0 +- kvikio==24.12.*,>=0.0.0a0 - numactl-devel-cos7-x86_64 - numba>=0.57 - numpy>=1.23,<3.0a0 @@ -25,13 +25,13 @@ dependencies: - pytest-cov - python>=3.10,<3.13 - rapids-build-backend>=0.3.0,<0.4.0dev0 -- 
rapids-dask-dependency==24.10.*,>=0.0.0a0 +- rapids-dask-dependency==24.12.*,>=0.0.0a0 - setuptools>=64.0.0 - sphinx - sphinx-click>=2.7.1 - sphinx-rtd-theme>=0.5.1 - ucx-proc=*=gpu -- ucx-py==0.40.*,>=0.0.0a0 -- ucxx==0.40.*,>=0.0.0a0 +- ucx-py==0.41.*,>=0.0.0a0 +- ucxx==0.41.*,>=0.0.0a0 - zict>=2.0.0 name: all_cuda-118_arch-x86_64 diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index 652a8f0c..760ae971 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -11,10 +11,10 @@ dependencies: - cuda-nvcc-impl - cuda-nvrtc - cuda-version=12.5 -- cudf==24.10.*,>=0.0.0a0 -- dask-cudf==24.10.*,>=0.0.0a0 -- distributed-ucxx==0.40.*,>=0.0.0a0 -- kvikio==24.10.*,>=0.0.0a0 +- cudf==24.12.*,>=0.0.0a0 +- dask-cudf==24.12.*,>=0.0.0a0 +- distributed-ucxx==0.41.*,>=0.0.0a0 +- kvikio==24.12.*,>=0.0.0a0 - numactl-devel-cos7-x86_64 - numba>=0.57 - numpy>=1.23,<3.0a0 @@ -26,13 +26,13 @@ dependencies: - pytest-cov - python>=3.10,<3.13 - rapids-build-backend>=0.3.0,<0.4.0dev0 -- rapids-dask-dependency==24.10.*,>=0.0.0a0 +- rapids-dask-dependency==24.12.*,>=0.0.0a0 - setuptools>=64.0.0 - sphinx - sphinx-click>=2.7.1 - sphinx-rtd-theme>=0.5.1 - ucx-proc=*=gpu -- ucx-py==0.40.*,>=0.0.0a0 -- ucxx==0.40.*,>=0.0.0a0 +- ucx-py==0.41.*,>=0.0.0a0 +- ucxx==0.41.*,>=0.0.0a0 - zict>=2.0.0 name: all_cuda-125_arch-x86_64 diff --git a/dependencies.yaml b/dependencies.yaml index 9e6b3a10..59ac8c01 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -158,7 +158,7 @@ dependencies: - numpy>=1.23,<3.0a0 - pandas>=1.3 - pynvml>=11.0.0,<11.5 - - rapids-dask-dependency==24.10.*,>=0.0.0a0 + - rapids-dask-dependency==24.12.*,>=0.0.0a0 - zict>=2.0.0 test_python: common: @@ -168,13 +168,13 @@ dependencies: - pytest-cov - output_types: [conda] packages: - - &cudf_unsuffixed cudf==24.10.*,>=0.0.0a0 - - &dask_cudf_unsuffixed dask-cudf==24.10.*,>=0.0.0a0 - - distributed-ucxx==0.40.*,>=0.0.0a0 - - &kvikio_unsuffixed kvikio==24.10.*,>=0.0.0a0 - - &ucx_py_unsuffixed ucx-py==0.40.*,>=0.0.0a0 + - &cudf_unsuffixed cudf==24.12.*,>=0.0.0a0 + - &dask_cudf_unsuffixed dask-cudf==24.12.*,>=0.0.0a0 + - distributed-ucxx==0.41.*,>=0.0.0a0 + - &kvikio_unsuffixed kvikio==24.12.*,>=0.0.0a0 + - &ucx_py_unsuffixed ucx-py==0.41.*,>=0.0.0a0 - ucx-proc=*=gpu - - ucxx==0.40.*,>=0.0.0a0 + - ucxx==0.41.*,>=0.0.0a0 specific: - output_types: conda matrices: @@ -194,16 +194,16 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - cudf-cu12==24.10.*,>=0.0.0a0 - - dask-cudf-cu12==24.10.*,>=0.0.0a0 - - ucx-py-cu12==0.40.*,>=0.0.0a0 + - cudf-cu12==24.12.*,>=0.0.0a0 + - dask-cudf-cu12==24.12.*,>=0.0.0a0 + - ucx-py-cu12==0.41.*,>=0.0.0a0 - matrix: cuda: "11.*" cuda_suffixed: "true" packages: - - cudf-cu11==24.10.*,>=0.0.0a0 - - dask-cudf-cu11==24.10.*,>=0.0.0a0 - - ucx-py-cu11==0.40.*,>=0.0.0a0 + - cudf-cu11==24.12.*,>=0.0.0a0 + - dask-cudf-cu11==24.12.*,>=0.0.0a0 + - ucx-py-cu11==0.41.*,>=0.0.0a0 - matrix: packages: - *cudf_unsuffixed diff --git a/docs/source/explicit_comms.rst b/docs/source/explicit_comms.rst index af317056..db621977 100644 --- a/docs/source/explicit_comms.rst +++ b/docs/source/explicit_comms.rst @@ -14,4 +14,4 @@ Usage In order to use explicit-comms in Dask/Distributed automatically, simply define the environment variable ``DASK_EXPLICIT_COMMS=True`` or setting the ``"explicit-comms"`` key in the `Dask configuration `_. 
-It is also possible to use explicit-comms in tasks manually, see the `API <../api/#explicit-comms>`_ and our `implementation of shuffle `_ for guidance.
+It is also possible to use explicit-comms in tasks manually, see the `API <../api/#explicit-comms>`_ and our `implementation of shuffle `_ for guidance.
diff --git a/pyproject.toml b/pyproject.toml
index 730225ad..fcf57276 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -21,7 +21,7 @@ dependencies = [
     "numpy>=1.23,<3.0a0",
     "pandas>=1.3",
     "pynvml>=11.0.0,<11.5",
-    "rapids-dask-dependency==24.10.*,>=0.0.0a0",
+    "rapids-dask-dependency==24.12.*,>=0.0.0a0",
     "zict>=2.0.0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
@@ -50,12 +50,12 @@ docs = [
     "sphinx-rtd-theme>=0.5.1",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit dependencies.yaml and run `rapids-dependency-file-generator`.
 test = [
-    "cudf==24.10.*,>=0.0.0a0",
-    "dask-cudf==24.10.*,>=0.0.0a0",
-    "kvikio==24.10.*,>=0.0.0a0",
+    "cudf==24.12.*,>=0.0.0a0",
+    "dask-cudf==24.12.*,>=0.0.0a0",
+    "kvikio==24.12.*,>=0.0.0a0",
     "pytest",
     "pytest-cov",
-    "ucx-py==0.40.*,>=0.0.0a0",
+    "ucx-py==0.41.*,>=0.0.0a0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit dependencies.yaml and run `rapids-dependency-file-generator`.
 
 [project.urls]

From 93a1ee23a43563f33fba8a5a8761c03ccef25a1c Mon Sep 17 00:00:00 2001
From: Peter Andreas Entschev
Date: Wed, 9 Oct 2024 18:12:23 +0200
Subject: [PATCH 02/13] Limit output of pytest durations (#1393)

Durations output was previously increased to show all tests, which
helped us debug timeouts. Those timeouts have since become much less
frequent, so limiting the report to the 50 longest-running tests keeps
log lengths down; we may remove the option entirely if it remains
unimportant.

Authors:
  - Peter Andreas Entschev (https://github.com/pentschev)

Approvers:
  - James Lamb (https://github.com/jameslamb)

URL: https://github.com/rapidsai/dask-cuda/pull/1393
---
 ci/test_python.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ci/test_python.sh b/ci/test_python.sh
index 78330a40..32c0d940 100755
--- a/ci/test_python.sh
+++ b/ci/test_python.sh
@@ -52,7 +52,7 @@ UCX_WARN_UNUSED_ENV_VARS=n \
 UCX_MEMTYPE_CACHE=n \
 timeout 60m pytest \
   -vv \
-  --durations=0 \
+  --durations=50 \
   --capture=no \
   --cache-clear \
   --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cuda.xml" \
@@ -73,7 +73,7 @@ UCX_WARN_UNUSED_ENV_VARS=n \
 UCX_MEMTYPE_CACHE=n \
 timeout 30m pytest \
   -vv \
-  --durations=0 \
+  --durations=50 \
   --capture=no \
   --cache-clear \
   --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cuda-legacy.xml" \

From f775d883c1149b00a462a041cf6589f9081aa4fb Mon Sep 17 00:00:00 2001
From: James Lamb
Date: Thu, 10 Oct 2024 12:59:31 -0500
Subject: [PATCH 03/13] make conda installs in CI stricter (#1395)

Contributes to https://github.com/rapidsai/build-planning/issues/106

Proposes specifying the RAPIDS version in `conda install` calls that
install CI artifacts, to reduce the risk of CI jobs picking up
artifacts from other releases.
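
As a sketch of the pattern (this mirrors the change to `ci/build_docs.sh`
in the diff below; `rapids-version` is the CI helper used there to print
the full RAPIDS version being built):

```
# before: any available dask-cuda package could be picked up
rapids-mamba-retry install \
  --channel "${PYTHON_CHANNEL}" \
  dask-cuda

# after: pin to the exact version produced by this CI run
RAPIDS_VERSION="$(rapids-version)"
rapids-mamba-retry install \
  --channel "${PYTHON_CHANNEL}" \
  "dask-cuda=${RAPIDS_VERSION}"
```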
Authors:
  - James Lamb (https://github.com/jameslamb)

Approvers:
  - Ray Douglass (https://github.com/raydouglass)

URL: https://github.com/rapidsai/dask-cuda/pull/1395
---
 ci/build_docs.sh             | 7 ++++---
 ci/release/update-version.sh | 1 -
 ci/test_python.sh            | 4 +++-
 3 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/ci/build_docs.sh b/ci/build_docs.sh
index 7850211e..58da36c7 100755
--- a/ci/build_docs.sh
+++ b/ci/build_docs.sh
@@ -5,6 +5,8 @@ set -euo pipefail
 rapids-logger "Create test conda environment"
 . /opt/conda/etc/profile.d/conda.sh
 
+RAPIDS_VERSION="$(rapids-version)"
+
 rapids-dependency-file-generator \
   --output conda \
   --file-key docs \
@@ -21,9 +23,8 @@ PYTHON_CHANNEL=$(rapids-download-conda-from-s3 python)
 
 rapids-mamba-retry install \
   --channel "${PYTHON_CHANNEL}" \
-  dask-cuda
+  "dask-cuda=${RAPIDS_VERSION}"
 
-export RAPIDS_VERSION_NUMBER="24.12"
 export RAPIDS_DOCS_DIR="$(mktemp -d)"
 
 rapids-logger "Build Python docs"
@@ -33,4 +34,4 @@ mkdir -p "${RAPIDS_DOCS_DIR}/dask-cuda/"html
 mv _html/* "${RAPIDS_DOCS_DIR}/dask-cuda/html"
 popd
 
-rapids-upload-docs
+RAPIDS_VERSION_NUMBER="$(rapids-version-major-minor)" rapids-upload-docs
diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh
index 2dbe504c..b229d280 100755
--- a/ci/release/update-version.sh
+++ b/ci/release/update-version.sh
@@ -68,7 +68,6 @@ done
 for FILE in .github/workflows/*.yaml; do
   sed_runner "/shared-workflows/ s/@.*/@branch-${NEXT_SHORT_TAG}/g" "${FILE}"
 done
-sed_runner "s/RAPIDS_VERSION_NUMBER=\".*/RAPIDS_VERSION_NUMBER=\"${NEXT_SHORT_TAG}\"/g" ci/build_docs.sh
 
 # Docs referencing source code
 find docs/source/ -type f -name *.rst -print0 | while IFS= read -r -d '' filename; do
diff --git a/ci/test_python.sh b/ci/test_python.sh
index 32c0d940..33914172 100755
--- a/ci/test_python.sh
+++ b/ci/test_python.sh
@@ -5,6 +5,8 @@ set -euo pipefail
 
 . /opt/conda/etc/profile.d/conda.sh
 
+RAPIDS_VERSION="$(rapids-version)"
+
 rapids-logger "Generate Python testing dependencies"
 rapids-dependency-file-generator \
   --output conda \
@@ -29,7 +31,7 @@ rapids-print-env
 
 rapids-mamba-retry install \
   --channel "${PYTHON_CHANNEL}" \
-  dask-cuda
+  "dask-cuda=${RAPIDS_VERSION}"
 
 rapids-logger "Check GPU usage"
 nvidia-smi

From 8d88006a6a064165e8408dcb9c288059c6f98a7f Mon Sep 17 00:00:00 2001
From: Vibhu Jawa
Date: Sat, 12 Oct 2024 13:51:38 -0600
Subject: [PATCH 04/13] Enable Pytorch to share same memory pool as RMM via cli (#1392)

This PR closes: https://github.com/rapidsai/dask-cuda/issues/1281

Usage example:
```
from dask_cuda import LocalCUDACluster
from dask.distributed import Client

cluster = LocalCUDACluster(rmm_allocator_external_lib_list=["torch", "cupy"])
client = Client(cluster)
```

Verify that it is working:
```
def get_torch_allocator():
    import torch
    return torch.cuda.get_allocator_backend()

client.run(get_torch_allocator)
```
```
{'tcp://127.0.0.1:37167': 'pluggable',
 'tcp://127.0.0.1:38749': 'pluggable',
 'tcp://127.0.0.1:43109': 'pluggable',
 'tcp://127.0.0.1:44259': 'pluggable',
 'tcp://127.0.0.1:44953': 'pluggable',
 'tcp://127.0.0.1:45087': 'pluggable',
 'tcp://127.0.0.1:45623': 'pluggable',
 'tcp://127.0.0.1:45847': 'pluggable'}
```

Without it, the reported backend is `native`.

Context: This helps NeMo-Curator achieve more stable use of PyTorch
together with dask-cuda.

CC: @pentschev
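
For reference, a minimal sketch of the same setup from the command line,
using the `--set-rmm-allocator-for-libs` flag added in this PR (assuming
the `dask cuda worker` entrypoint and a scheduler already running at
`SCHEDULER_ADDRESS`):
```
dask cuda worker "${SCHEDULER_ADDRESS}" --set-rmm-allocator-for-libs "torch,cupy"
```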
Authors: - Vibhu Jawa (https://github.com/VibhuJawa) Approvers: - Peter Andreas Entschev (https://github.com/pentschev) URL: https://github.com/rapidsai/dask-cuda/pull/1392 --- dask_cuda/cli.py | 14 ++++++- dask_cuda/cuda_worker.py | 2 + dask_cuda/local_cuda_cluster.py | 22 +++++++++++ dask_cuda/plugins.py | 67 +++++++++++++++++++++++++++++++++ dask_cuda/utils.py | 11 ++++++ 5 files changed, 115 insertions(+), 1 deletion(-) diff --git a/dask_cuda/cli.py b/dask_cuda/cli.py index a8c6d972..8101f020 100644 --- a/dask_cuda/cli.py +++ b/dask_cuda/cli.py @@ -13,7 +13,7 @@ from distributed.utils import import_term from .cuda_worker import CUDAWorker -from .utils import print_cluster_config +from .utils import CommaSeparatedChoice, print_cluster_config logger = logging.getLogger(__name__) @@ -164,6 +164,16 @@ def cuda(): incompatible with RMM pools and managed memory, trying to enable both will result in failure.""", ) +@click.option( + "--set-rmm-allocator-for-libs", + "rmm_allocator_external_lib_list", + type=CommaSeparatedChoice(["cupy", "torch"]), + default=None, + show_default=True, + help=""" + Set RMM as the allocator for external libraries. Provide a comma-separated + list of libraries to set, e.g., "torch,cupy".""", +) @click.option( "--rmm-release-threshold", default=None, @@ -351,6 +361,7 @@ def worker( rmm_maximum_pool_size, rmm_managed_memory, rmm_async, + rmm_allocator_external_lib_list, rmm_release_threshold, rmm_log_directory, rmm_track_allocations, @@ -425,6 +436,7 @@ def worker( rmm_maximum_pool_size, rmm_managed_memory, rmm_async, + rmm_allocator_external_lib_list, rmm_release_threshold, rmm_log_directory, rmm_track_allocations, diff --git a/dask_cuda/cuda_worker.py b/dask_cuda/cuda_worker.py index 3e03ed29..30c14450 100644 --- a/dask_cuda/cuda_worker.py +++ b/dask_cuda/cuda_worker.py @@ -47,6 +47,7 @@ def __init__( rmm_maximum_pool_size=None, rmm_managed_memory=False, rmm_async=False, + rmm_allocator_external_lib_list=None, rmm_release_threshold=None, rmm_log_directory=None, rmm_track_allocations=False, @@ -231,6 +232,7 @@ def del_pid_file(): release_threshold=rmm_release_threshold, log_directory=rmm_log_directory, track_allocations=rmm_track_allocations, + external_lib_list=rmm_allocator_external_lib_list, ), PreImport(pre_import), CUDFSetup(spill=enable_cudf_spill, spill_stats=cudf_spill_stats), diff --git a/dask_cuda/local_cuda_cluster.py b/dask_cuda/local_cuda_cluster.py index c037223b..7a24df43 100644 --- a/dask_cuda/local_cuda_cluster.py +++ b/dask_cuda/local_cuda_cluster.py @@ -143,6 +143,11 @@ class LocalCUDACluster(LocalCluster): The asynchronous allocator requires CUDA Toolkit 11.2 or newer. It is also incompatible with RMM pools and managed memory. Trying to enable both will result in an exception. + rmm_allocator_external_lib_list: str, list or None, default None + List of external libraries for which to set RMM as the allocator. + Supported options are: ``["torch", "cupy"]``. Can be a comma-separated string + (like ``"torch,cupy"``) or a list of strings (like ``["torch", "cupy"]``). + If ``None``, no external libraries will use RMM as their allocator. rmm_release_threshold: int, str or None, default None When ``rmm.async is True`` and the pool size grows beyond this value, unused memory held by the pool will be released at the next synchronization point. 
@@ -231,6 +236,7 @@ def __init__( rmm_maximum_pool_size=None, rmm_managed_memory=False, rmm_async=False, + rmm_allocator_external_lib_list=None, rmm_release_threshold=None, rmm_log_directory=None, rmm_track_allocations=False, @@ -265,6 +271,19 @@ def __init__( n_workers = len(CUDA_VISIBLE_DEVICES) if n_workers < 1: raise ValueError("Number of workers cannot be less than 1.") + + if rmm_allocator_external_lib_list is not None: + if isinstance(rmm_allocator_external_lib_list, str): + rmm_allocator_external_lib_list = [ + v.strip() for v in rmm_allocator_external_lib_list.split(",") + ] + elif not isinstance(rmm_allocator_external_lib_list, list): + raise ValueError( + "rmm_allocator_external_lib_list must be either a comma-separated " + "string or a list of strings. Examples: 'torch,cupy' " + "or ['torch', 'cupy']" + ) + # Set nthreads=1 when parsing mem_limit since it only depends on n_workers logger = logging.getLogger(__name__) self.memory_limit = parse_memory_limit( @@ -284,6 +303,8 @@ def __init__( self.rmm_managed_memory = rmm_managed_memory self.rmm_async = rmm_async self.rmm_release_threshold = rmm_release_threshold + self.rmm_allocator_external_lib_list = rmm_allocator_external_lib_list + if rmm_pool_size is not None or rmm_managed_memory or rmm_async: try: import rmm # noqa F401 @@ -437,6 +458,7 @@ def new_worker_spec(self): release_threshold=self.rmm_release_threshold, log_directory=self.rmm_log_directory, track_allocations=self.rmm_track_allocations, + external_lib_list=self.rmm_allocator_external_lib_list, ), PreImport(self.pre_import), CUDFSetup(self.enable_cudf_spill, self.cudf_spill_stats), diff --git a/dask_cuda/plugins.py b/dask_cuda/plugins.py index 122f93ff..cd1928af 100644 --- a/dask_cuda/plugins.py +++ b/dask_cuda/plugins.py @@ -1,5 +1,6 @@ import importlib import os +from typing import Callable, Dict from distributed import WorkerPlugin @@ -39,6 +40,7 @@ def __init__( release_threshold, log_directory, track_allocations, + external_lib_list, ): if initial_pool_size is None and maximum_pool_size is not None: raise ValueError( @@ -61,6 +63,7 @@ def __init__( self.logging = log_directory is not None self.log_directory = log_directory self.rmm_track_allocations = track_allocations + self.external_lib_list = external_lib_list def setup(self, worker=None): if self.initial_pool_size is not None: @@ -123,6 +126,70 @@ def setup(self, worker=None): mr = rmm.mr.get_current_device_resource() rmm.mr.set_current_device_resource(rmm.mr.TrackingResourceAdaptor(mr)) + if self.external_lib_list is not None: + for lib in self.external_lib_list: + enable_rmm_memory_for_library(lib) + + +def enable_rmm_memory_for_library(lib_name: str) -> None: + """Enable RMM memory pool support for a specified third-party library. + + This function allows the given library to utilize RMM's memory pool if it supports + integration with RMM. The library name is passed as a string argument, and if the + library is compatible, its memory allocator will be configured to use RMM. + + Parameters + ---------- + lib_name : str + The name of the third-party library to enable RMM memory pool support for. + Supported libraries are "cupy" and "torch". + + Raises + ------ + ValueError + If the library name is not supported or does not have RMM integration. + ImportError + If the required library is not installed. 
+ """ + + # Mapping of supported libraries to their respective setup functions + setup_functions: Dict[str, Callable[[], None]] = { + "torch": _setup_rmm_for_torch, + "cupy": _setup_rmm_for_cupy, + } + + if lib_name not in setup_functions: + supported_libs = ", ".join(setup_functions.keys()) + raise ValueError( + f"The library '{lib_name}' is not supported for RMM integration. " + f"Supported libraries are: {supported_libs}." + ) + + # Call the setup function for the specified library + setup_functions[lib_name]() + + +def _setup_rmm_for_torch() -> None: + try: + import torch + except ImportError as e: + raise ImportError("PyTorch is not installed.") from e + + from rmm.allocators.torch import rmm_torch_allocator + + torch.cuda.memory.change_current_allocator(rmm_torch_allocator) + + +def _setup_rmm_for_cupy() -> None: + try: + import cupy + except ImportError as e: + raise ImportError("CuPy is not installed.") from e + + from rmm.allocators.cupy import rmm_cupy_allocator + + cupy.cuda.set_allocator(rmm_cupy_allocator) + class PreImport(WorkerPlugin): def __init__(self, libraries): diff --git a/dask_cuda/utils.py b/dask_cuda/utils.py index ff4dbbae..74596fe2 100644 --- a/dask_cuda/utils.py +++ b/dask_cuda/utils.py @@ -9,6 +9,7 @@ from multiprocessing import cpu_count from typing import Optional +import click import numpy as np import pynvml import toolz @@ -764,3 +765,13 @@ def get_rmm_memory_resource_stack(mr) -> list: if isinstance(mr, rmm.mr.StatisticsResourceAdaptor): return mr.allocation_counts["current_bytes"] return None + + +class CommaSeparatedChoice(click.Choice): + def convert(self, value, param, ctx): + values = [v.strip() for v in value.split(",")] + for v in values: + if v not in self.choices: + choices_str = ", ".join(f"'{c}'" for c in self.choices) + self.fail(f"invalid choice(s): {v}. (choices are: {choices_str})") + return values From dfcd399171cdaca93155fe7a1f47812db63c780c Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Mon, 14 Oct 2024 19:06:16 +0200 Subject: [PATCH 05/13] Reenable UCXX in CI (#1396) UCXX CI tests had been previously disabled due to instabilities, see https://github.com/rapidsai/dask-cuda/pull/1270#issuecomment-1806295358, it should now be much more resilient so we should reenable them in preparation for the permanent migration to UCXX. 
Authors:
  - Peter Andreas Entschev (https://github.com/pentschev)

Approvers:
  - Jake Awe (https://github.com/AyodeAwe)

URL: https://github.com/rapidsai/dask-cuda/pull/1396
---
 ci/test_python.sh | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/ci/test_python.sh b/ci/test_python.sh
index 33914172..18dd88cf 100755
--- a/ci/test_python.sh
+++ b/ci/test_python.sh
@@ -52,7 +52,7 @@ DASK_CUDA_WAIT_WORKERS_MIN_TIMEOUT=20 \
 UCXPY_IFNAME=eth0 \
 UCX_WARN_UNUSED_ENV_VARS=n \
 UCX_MEMTYPE_CACHE=n \
-timeout 60m pytest \
+timeout 90m pytest \
   -vv \
   --durations=50 \
   --capture=no \
@@ -62,7 +62,7 @@ timeout 90m pytest \
   --cov=dask_cuda \
   --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/dask-cuda-coverage.xml" \
   --cov-report=term \
-  tests -k "not ucxx"
+  tests
 popd
 
 rapids-logger "pytest explicit-comms (legacy dd)"
@@ -73,7 +73,7 @@ DASK_CUDA_WAIT_WORKERS_MIN_TIMEOUT=20 \
 UCXPY_IFNAME=eth0 \
 UCX_WARN_UNUSED_ENV_VARS=n \
 UCX_MEMTYPE_CACHE=n \
-timeout 30m pytest \
+timeout 60m pytest \
   -vv \
   --durations=50 \
   --capture=no \
@@ -83,7 +83,7 @@ timeout 60m pytest \
   --cov=dask_cuda \
   --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/dask-cuda-coverage-legacy.xml" \
   --cov-report=term \
-  tests/test_explicit_comms.py -k "not ucxx"
+  tests/test_explicit_comms.py
 popd
 
 rapids-logger "Run local benchmark (dask-expr)"

From 0f78f5d23029313ecb3647faca6c28933b52d130 Mon Sep 17 00:00:00 2001
From: Peter Andreas Entschev
Date: Tue, 22 Oct 2024 23:39:51 +0200
Subject: [PATCH 06/13] Ignore legacy Dask dataframe warnings (#1397)

Ignore the legacy Dask DataFrame warnings, introduced in
https://github.com/dask/dask/pull/11437, stating that the legacy
implementation is soon going to be removed. The warning is only raised
when `DASK_DATAFRAME__QUERY_PLANNING=False`.

Authors:
  - Peter Andreas Entschev (https://github.com/pentschev)

Approvers:
  - Richard (Rick) Zamora (https://github.com/rjzamora)
  - James Lamb (https://github.com/jameslamb)

URL: https://github.com/rapidsai/dask-cuda/pull/1397
---
 pyproject.toml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/pyproject.toml b/pyproject.toml
index fcf57276..2266fb5b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -128,6 +128,9 @@ filterwarnings = [
     # is enabled in both dask-cudf and dask-cuda.
     # See: https://github.com/rapidsai/dask-cuda/issues/1311
     "ignore:Dask DataFrame implementation is deprecated:DeprecationWarning",
+    # Dask now loudly throws warnings: https://github.com/dask/dask/pull/11437
+    # When the legacy implementation is removed we can remove this warning and stop running pytests with `DASK_DATAFRAME__QUERY_PLANNING=False`
+    "ignore:The legacy Dask DataFrame implementation is deprecated and will be removed in a future version.*:FutureWarning",
 ]
 
 [tool.rapids-build-backend]

From 4639a968bcbf9837085be5c8df40ef27d00bf009 Mon Sep 17 00:00:00 2001
From: James Lamb
Date: Wed, 23 Oct 2024 14:12:46 -0500
Subject: [PATCH 07/13] remove unnecessary cmake and sccache configuration (#1400)

Contributes to https://github.com/rapidsai/build-planning/issues/108

This is a pure Python project, so it doesn't need any CMake or `sccache`
configuration. This proposes removing that configuration to simplify the
build scripts a bit.

It also proposes updating the `rapids-dependency-file-generator`
pre-commit hook to its latest version, something I'm trying to roll out
across RAPIDS as part of
https://github.com/rapidsai/build-planning/issues/108.
Authors:
  - James Lamb (https://github.com/jameslamb)

Approvers:
  - Jake Awe (https://github.com/AyodeAwe)

URL: https://github.com/rapidsai/dask-cuda/pull/1400
---
 .pre-commit-config.yaml | 2 +-
 ci/build_python.sh      | 4 ----
 ci/build_wheel.sh       | 3 +--
 3 files changed, 2 insertions(+), 7 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 4707492a..a2202df3 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -37,7 +37,7 @@ repos:
     hooks:
       - id: verify-alpha-spec
   - repo: https://github.com/rapidsai/dependency-file-generator
-    rev: v1.13.11
+    rev: v1.16.0
    hooks:
       - id: rapids-dependency-file-generator
         args: ["--clean"]
diff --git a/ci/build_python.sh b/ci/build_python.sh
index 48cece32..c12a0dde 100755
--- a/ci/build_python.sh
+++ b/ci/build_python.sh
@@ -5,12 +5,8 @@ set -euo pipefail
 
 rapids-configure-conda-channels
 
-source rapids-configure-sccache
-
 source rapids-date-string
 
-export CMAKE_GENERATOR=Ninja
-
 rapids-print-env
 
 rapids-generate-version > ./VERSION
diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh
index 828972dc..91c57231 100755
--- a/ci/build_wheel.sh
+++ b/ci/build_wheel.sh
@@ -3,11 +3,10 @@
 
 set -euo pipefail
 
-source rapids-configure-sccache
 source rapids-date-string
 
 rapids-generate-version > ./VERSION
 
-python -m pip wheel . -w dist -vvv --no-deps --disable-pip-version-check
+python -m pip wheel . -w dist -v --no-deps --disable-pip-version-check
 
 RAPIDS_PY_WHEEL_NAME="dask-cuda" rapids-upload-wheels-to-s3 dist

From fc80d43bf22db405fe123be8324aaee7978d4956 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR
Date: Fri, 1 Nov 2024 13:02:58 -0500
Subject: [PATCH 08/13] Switch pytest `traceback` to `native` (#1389)

In cudf & cuml we have observed ~10% and ~20% speedups, respectively, of
pytest suite execution by switching the pytest traceback to `--native`:

```
currently: 102474 passed, 2117 skipped, 902 xfailed in 892.16s (0:14:52)
--tb=short: 102474 passed, 2117 skipped, 902 xfailed in 898.99s (0:14:58)
--tb=no: 102474 passed, 2117 skipped, 902 xfailed in 815.98s (0:13:35)
--tb=native: 102474 passed, 2117 skipped, 902 xfailed in 820.92s (0:13:40)
```

This PR makes a similar change to the `dask-cuda` repo.

xref: https://github.com/rapidsai/cudf/pull/16851

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Peter Andreas Entschev (https://github.com/pentschev)

Approvers:
  - Peter Andreas Entschev (https://github.com/pentschev)

URL: https://github.com/rapidsai/dask-cuda/pull/1389
---
 dask_cuda/tests/pytest.ini | 4 ++++
 1 file changed, 4 insertions(+)
 create mode 100644 dask_cuda/tests/pytest.ini

diff --git a/dask_cuda/tests/pytest.ini b/dask_cuda/tests/pytest.ini
new file mode 100644
index 00000000..7b0a9f29
--- /dev/null
+++ b/dask_cuda/tests/pytest.ini
@@ -0,0 +1,4 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+[pytest]
+addopts = --tb=native

From 233376d8f111e2571f745e6f31729db9bc2183ac Mon Sep 17 00:00:00 2001
From: Peter Andreas Entschev
Date: Tue, 5 Nov 2024 16:37:40 +0100
Subject: [PATCH 09/13] Add warmup runs and profile all iterations to benchmarks (#1402)

Add support for initial warmup runs in benchmarks and allow profiling
all iterations or just the last one. This is technically a breaking
change, since `--profile` now profiles all iterations, while the new
`--profile-last` option profiles only the last one, as `--profile` used
to.

Authors:
  - Peter Andreas Entschev (https://github.com/pentschev)

Approvers:
  - Mads R. B. 
Kristensen (https://github.com/madsbk) URL: https://github.com/rapidsai/dask-cuda/pull/1402 --- dask_cuda/benchmarks/common.py | 21 ++++++++++++----- dask_cuda/benchmarks/local_cudf_groupby.py | 13 +++-------- dask_cuda/benchmarks/local_cudf_merge.py | 8 +------ dask_cuda/benchmarks/local_cudf_shuffle.py | 13 +++-------- dask_cuda/benchmarks/local_cupy.py | 15 ++++-------- .../benchmarks/local_cupy_map_overlap.py | 15 ++++-------- dask_cuda/benchmarks/utils.py | 23 ++++++++++++++++++- 7 files changed, 52 insertions(+), 56 deletions(-) diff --git a/dask_cuda/benchmarks/common.py b/dask_cuda/benchmarks/common.py index 7f48d4fa..49676fee 100644 --- a/dask_cuda/benchmarks/common.py +++ b/dask_cuda/benchmarks/common.py @@ -1,3 +1,4 @@ +import contextlib from argparse import Namespace from functools import partial from typing import Any, Callable, List, Mapping, NamedTuple, Optional, Tuple @@ -7,7 +8,7 @@ import pandas as pd import dask -from distributed import Client +from distributed import Client, performance_report from dask_cuda.benchmarks.utils import ( address_to_index, @@ -87,12 +88,20 @@ def run_benchmark(client: Client, args: Namespace, config: Config): If ``args.profile`` is set, the final run is profiled. """ + results = [] - for _ in range(max(1, args.runs) - 1): - res = config.bench_once(client, args, write_profile=None) - results.append(res) - results.append(config.bench_once(client, args, write_profile=args.profile)) - return results + for _ in range(max(0, args.warmup_runs)): + config.bench_once(client, args, write_profile=None) + + ctx = contextlib.nullcontext() + if args.profile is not None: + ctx = performance_report(filename=args.profile) + with ctx: + for _ in range(max(1, args.runs) - 1): + res = config.bench_once(client, args, write_profile=None) + results.append(res) + results.append(config.bench_once(client, args, write_profile=args.profile_last)) + return results def gather_bench_results(client: Client, args: Namespace, config: Config): diff --git a/dask_cuda/benchmarks/local_cudf_groupby.py b/dask_cuda/benchmarks/local_cudf_groupby.py index f094ff18..a9e7d833 100644 --- a/dask_cuda/benchmarks/local_cudf_groupby.py +++ b/dask_cuda/benchmarks/local_cudf_groupby.py @@ -98,10 +98,9 @@ def bench_once(client, args, write_profile=None): "False": False, }.get(args.shuffle, args.shuffle) - if write_profile is None: - ctx = contextlib.nullcontext() - else: - ctx = performance_report(filename=args.profile) + ctx = contextlib.nullcontext() + if write_profile is not None: + ctx = performance_report(filename=write_profile) with ctx: t1 = clock() @@ -260,12 +259,6 @@ def parse_args(): "type": str, "help": "Do shuffle with GPU or CPU dataframes (default 'gpu')", }, - { - "name": "--runs", - "default": 3, - "type": int, - "help": "Number of runs", - }, ] return parse_benchmark_args( diff --git a/dask_cuda/benchmarks/local_cudf_merge.py b/dask_cuda/benchmarks/local_cudf_merge.py index e2b03520..6ebe005a 100644 --- a/dask_cuda/benchmarks/local_cudf_merge.py +++ b/dask_cuda/benchmarks/local_cudf_merge.py @@ -190,7 +190,7 @@ def bench_once(client, args, write_profile=None): if args.backend == "explicit-comms": ctx1 = dask.config.set(explicit_comms=True) if write_profile is not None: - ctx2 = performance_report(filename=args.profile) + ctx2 = performance_report(filename=write_profile) with ctx1: with ctx2: @@ -346,12 +346,6 @@ def parse_args(): "action": "store_true", "help": "Don't shuffle the keys of the left (base) dataframe.", }, - { - "name": "--runs", - "default": 3, - "type": int, - 
"help": "Number of runs", - }, { "name": [ "-s", diff --git a/dask_cuda/benchmarks/local_cudf_shuffle.py b/dask_cuda/benchmarks/local_cudf_shuffle.py index 25f42e59..3a0955c4 100644 --- a/dask_cuda/benchmarks/local_cudf_shuffle.py +++ b/dask_cuda/benchmarks/local_cudf_shuffle.py @@ -121,10 +121,9 @@ def create_data( def bench_once(client, args, write_profile=None): data_processed, df = create_data(client, args) - if write_profile is None: - ctx = contextlib.nullcontext() - else: - ctx = performance_report(filename=args.profile) + ctx = contextlib.nullcontext() + if write_profile is not None: + ctx = performance_report(filename=write_profile) with ctx: if args.backend in {"dask", "dask-noop"}: @@ -228,12 +227,6 @@ def parse_args(): "type": str, "help": "Do shuffle with GPU or CPU dataframes (default 'gpu')", }, - { - "name": "--runs", - "default": 3, - "type": int, - "help": "Number of runs", - }, { "name": "--ignore-index", "action": "store_true", diff --git a/dask_cuda/benchmarks/local_cupy.py b/dask_cuda/benchmarks/local_cupy.py index c9c8fe1c..ba88db30 100644 --- a/dask_cuda/benchmarks/local_cupy.py +++ b/dask_cuda/benchmarks/local_cupy.py @@ -141,12 +141,11 @@ def bench_once(client, args, write_profile=None): chunksize = x.chunksize data_processed = sum(arg.nbytes for arg in func_args) - # Execute the operations to benchmark - if args.profile is not None and write_profile is not None: - ctx = performance_report(filename=args.profile) - else: - ctx = contextlib.nullcontext() + ctx = contextlib.nullcontext() + if write_profile is not None: + ctx = performance_report(filename=write_profile) + # Execute the operations to benchmark with ctx: rng = start_range(message=args.operation, color="purple") result = func(*func_args) @@ -297,12 +296,6 @@ def parse_args(): "type": int, "help": "Chunk size (default 2500).", }, - { - "name": "--runs", - "default": 3, - "type": int, - "help": "Number of runs (default 3).", - }, { "name": [ "-b", diff --git a/dask_cuda/benchmarks/local_cupy_map_overlap.py b/dask_cuda/benchmarks/local_cupy_map_overlap.py index 8b975a24..ecefa52a 100644 --- a/dask_cuda/benchmarks/local_cupy_map_overlap.py +++ b/dask_cuda/benchmarks/local_cupy_map_overlap.py @@ -42,12 +42,11 @@ def bench_once(client, args, write_profile=None): data_processed = x.nbytes - # Execute the operations to benchmark - if args.profile is not None and write_profile is not None: - ctx = performance_report(filename=args.profile) - else: - ctx = contextlib.nullcontext() + ctx = contextlib.nullcontext() + if write_profile is not None: + ctx = performance_report(filename=write_profile) + # Execute the operations to benchmark with ctx: result = x.map_overlap(mean_filter, args.kernel_size, shape=ks) if args.backend == "dask-noop": @@ -168,12 +167,6 @@ def parse_args(): "type": int, "help": "Kernel size, 2*k+1, in each dimension (default 1)", }, - { - "name": "--runs", - "default": 3, - "type": int, - "help": "Number of runs", - }, { "name": [ "-b", diff --git a/dask_cuda/benchmarks/utils.py b/dask_cuda/benchmarks/utils.py index de7e2ae1..4f87a025 100644 --- a/dask_cuda/benchmarks/utils.py +++ b/dask_cuda/benchmarks/utils.py @@ -323,7 +323,16 @@ def parse_benchmark_args( metavar="PATH", default=None, type=str, - help="Write dask profile report (E.g. dask-report.html)", + help="Write dask profile report (E.g. dask-report.html) on all " + "iterations (excluding warmup).", + ) + parser.add_argument( + "--profile-last", + metavar="PATH", + default=None, + type=str, + help="Write dask profile report (E.g. 
dask-report.html) on last " + "iteration only.", ) # See save_benchmark_data for more information parser.add_argument( @@ -344,6 +353,18 @@ def parse_benchmark_args( type=parse_bytes, help="Bandwidth statistics: ignore messages smaller than this (default '1 MB')", ) + parser.add_argument( + "--runs", + default=3, + type=int, + help="Number of runs", + ) + parser.add_argument( + "--warmup-runs", + default=1, + type=int, + help="Number of warmup runs", + ) for args in args_list: name = args.pop("name") From 9e7a926bc305f32aa0463a44eb96d4494fe55fc0 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Wed, 13 Nov 2024 09:13:11 -0600 Subject: [PATCH 10/13] enforce wheel size limits, README formatting in CI (#1404) Contributes to https://github.com/rapidsai/build-planning/issues/110 Proposes adding 2 types of validation on wheels in CI, to ensure we continue to produce wheels that are suitable for PyPI. * checks on wheel size (compressed), - *to be sure they're under PyPI limits* - *and to prompt discussion on PRs that significantly increase wheel sizes* * checks on README formatting - *to ensure they'll render properly as the PyPI project homepages* - *e.g. like how https://github.com/scikit-learn/scikit-learn/blob/main/README.rst becomes https://pypi.org/project/scikit-learn/* Authors: - James Lamb (https://github.com/jameslamb) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/dask-cuda/pull/1404 --- ci/build_wheel.sh | 1 + ci/validate_wheel.sh | 18 ++++++++++++++++++ pyproject.toml | 8 ++++++++ 3 files changed, 27 insertions(+) create mode 100755 ci/validate_wheel.sh diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh index 91c57231..760e46e3 100755 --- a/ci/build_wheel.sh +++ b/ci/build_wheel.sh @@ -8,5 +8,6 @@ source rapids-date-string rapids-generate-version > ./VERSION python -m pip wheel . -w dist -v --no-deps --disable-pip-version-check +./ci/validate_wheel.sh dist RAPIDS_PY_WHEEL_NAME="dask-cuda" rapids-upload-wheels-to-s3 dist diff --git a/ci/validate_wheel.sh b/ci/validate_wheel.sh new file mode 100755 index 00000000..60a80fce --- /dev/null +++ b/ci/validate_wheel.sh @@ -0,0 +1,18 @@ +#!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION. + +set -euo pipefail + +wheel_dir_relative_path=$1 + +rapids-logger "validate packages with 'pydistcheck'" + +pydistcheck \ + --inspect \ + "$(echo ${wheel_dir_relative_path}/*.whl)" + +rapids-logger "validate packages with 'twine'" + +twine check \ + --strict \ + "$(echo ${wheel_dir_relative_path}/*.whl)" diff --git a/pyproject.toml b/pyproject.toml index 2266fb5b..7025ca4e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -152,3 +152,11 @@ exclude = [ "docs.*", "tests.*", ] + +[tool.pydistcheck] +select = [ + "distro-too-large-compressed", +] + +# PyPI limit is 100 MiB, fail CI before we get too close to that +max_allowed_size_compressed = '75M' From af1a2f7f20000796965653a8cb3d799cefd0c58d Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Wed, 20 Nov 2024 18:42:17 +0100 Subject: [PATCH 11/13] Disable UCXX tests in CI (#1406) Temporarily disable UCXX tests in CI due to some non-deterministic failures during code freeze phase. They will be reenabled after 24.12 release. 
Authors: - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - Jake Awe (https://github.com/AyodeAwe) URL: https://github.com/rapidsai/dask-cuda/pull/1406 --- ci/test_python.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/test_python.sh b/ci/test_python.sh index 18dd88cf..319efef2 100755 --- a/ci/test_python.sh +++ b/ci/test_python.sh @@ -62,7 +62,7 @@ timeout 90m pytest \ --cov=dask_cuda \ --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/dask-cuda-coverage.xml" \ --cov-report=term \ - tests + tests -k "not ucxx" popd rapids-logger "pytest explicit-comms (legacy dd)" @@ -83,7 +83,7 @@ timeout 60m pytest \ --cov=dask_cuda \ --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/dask-cuda-coverage-legacy.xml" \ --cov-report=term \ - tests/test_explicit_comms.py + tests/test_explicit_comms.py -k "not ucxx" popd rapids-logger "Run local benchmark (dask-expr)" From 075f8beb4098359ac72f88b88ae89621e41f5774 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Wed, 20 Nov 2024 19:49:30 +0000 Subject: [PATCH 12/13] Update PyNVML and set upper pin (#1130) Handling the str vs. bytes discrepancy should have been covered by the changes in #1118. Authors: - Lawrence Mitchell (https://github.com/wence-) - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) - https://github.com/jakirkham URL: https://github.com/rapidsai/dask-cuda/pull/1130 --- conda/environments/all_cuda-114_arch-x86_64.yaml | 2 +- conda/environments/all_cuda-118_arch-x86_64.yaml | 2 +- conda/environments/all_cuda-125_arch-x86_64.yaml | 2 +- dependencies.yaml | 2 +- pyproject.toml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/conda/environments/all_cuda-114_arch-x86_64.yaml b/conda/environments/all_cuda-114_arch-x86_64.yaml index 3c327ff0..c7b20c69 100644 --- a/conda/environments/all_cuda-114_arch-x86_64.yaml +++ b/conda/environments/all_cuda-114_arch-x86_64.yaml @@ -20,7 +20,7 @@ dependencies: - numpydoc>=1.1.0 - pandas>=1.3 - pre-commit -- pynvml>=11.0.0,<11.5 +- pynvml>=11.0.0,<12.0.0a0 - pytest - pytest-cov - python>=3.10,<3.13 diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 3931f3bf..9fd24d4e 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -20,7 +20,7 @@ dependencies: - numpydoc>=1.1.0 - pandas>=1.3 - pre-commit -- pynvml>=11.0.0,<11.5 +- pynvml>=11.0.0,<12.0.0a0 - pytest - pytest-cov - python>=3.10,<3.13 diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index 760ae971..cd7c1679 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -21,7 +21,7 @@ dependencies: - numpydoc>=1.1.0 - pandas>=1.3 - pre-commit -- pynvml>=11.0.0,<11.5 +- pynvml>=11.0.0,<12.0.0a0 - pytest - pytest-cov - python>=3.10,<3.13 diff --git a/dependencies.yaml b/dependencies.yaml index 59ac8c01..fa6a56e0 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -157,7 +157,7 @@ dependencies: - numba>=0.57 - numpy>=1.23,<3.0a0 - pandas>=1.3 - - pynvml>=11.0.0,<11.5 + - pynvml>=11.0.0,<12.0.0a0 - rapids-dask-dependency==24.12.*,>=0.0.0a0 - zict>=2.0.0 test_python: diff --git a/pyproject.toml b/pyproject.toml index 7025ca4e..f6332875 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,7 @@ dependencies = [ "numba>=0.57", "numpy>=1.23,<3.0a0", "pandas>=1.3", - "pynvml>=11.0.0,<11.5", + 
"pynvml>=11.0.0,<12.0.0a0", "rapids-dask-dependency==24.12.*,>=0.0.0a0", "zict>=2.0.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit dependencies.yaml and run `rapids-dependency-file-generator`. From e68afed3983bc652f05e91661887f071ddf72ff1 Mon Sep 17 00:00:00 2001 From: Ray Douglass Date: Wed, 11 Dec 2024 13:11:43 -0500 Subject: [PATCH 13/13] Update Changelog [skip ci] --- CHANGELOG.md | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index f8c992fb..3b0d08d3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,29 @@ +# dask-cuda 24.12.00 (11 Dec 2024) + +## 🚨 Breaking Changes + +- Add warmup runs and profile all iterations to benchmarks ([#1402](https://github.com/rapidsai/dask-cuda/pull/1402)) [@pentschev](https://github.com/pentschev) + +## 🐛 Bug Fixes + +- Disable UCXX tests in CI ([#1406](https://github.com/rapidsai/dask-cuda/pull/1406)) [@pentschev](https://github.com/pentschev) +- Ignore legacy Dask dataframe warnings ([#1397](https://github.com/rapidsai/dask-cuda/pull/1397)) [@pentschev](https://github.com/pentschev) +- Reenable UCXX in CI ([#1396](https://github.com/rapidsai/dask-cuda/pull/1396)) [@pentschev](https://github.com/pentschev) + +## 🚀 New Features + +- Enable Pytorch to share same memory pool as RMM via cli ([#1392](https://github.com/rapidsai/dask-cuda/pull/1392)) [@VibhuJawa](https://github.com/VibhuJawa) + +## 🛠️ Improvements + +- enforce wheel size limits, README formatting in CI ([#1404](https://github.com/rapidsai/dask-cuda/pull/1404)) [@jameslamb](https://github.com/jameslamb) +- Add warmup runs and profile all iterations to benchmarks ([#1402](https://github.com/rapidsai/dask-cuda/pull/1402)) [@pentschev](https://github.com/pentschev) +- remove unnecessary cmake and sccache configuration ([#1400](https://github.com/rapidsai/dask-cuda/pull/1400)) [@jameslamb](https://github.com/jameslamb) +- make conda installs in CI stricter ([#1395](https://github.com/rapidsai/dask-cuda/pull/1395)) [@jameslamb](https://github.com/jameslamb) +- Limit output of pytest durations ([#1393](https://github.com/rapidsai/dask-cuda/pull/1393)) [@pentschev](https://github.com/pentschev) +- Switch pytest `traceback` to `native` ([#1389](https://github.com/rapidsai/dask-cuda/pull/1389)) [@galipremsagar](https://github.com/galipremsagar) +- Update PyNVML and set upper pin ([#1130](https://github.com/rapidsai/dask-cuda/pull/1130)) [@wence-](https://github.com/wence-) + # dask-cuda 24.10.00 (9 Oct 2024) ## 🚨 Breaking Changes