From 3e418dd05d4f84472bca4d80902e1b7476f0e0d4 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Mon, 2 Dec 2024 13:13:16 -0500 Subject: [PATCH 01/78] Move make_strings_column benchmark to nvbench (#17340) Moves the `cpp/benchmarks/string/factory.cu` implementation from google-bench to nvbench. Also renames to `.cpp` by recoding without device code. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/17340 --- cpp/benchmarks/CMakeLists.txt | 3 +- cpp/benchmarks/string/factory.cpp | 60 ++++++++++++++++++++ cpp/benchmarks/string/factory.cu | 92 ------------------------------- 3 files changed, 61 insertions(+), 94 deletions(-) create mode 100644 cpp/benchmarks/string/factory.cpp delete mode 100644 cpp/benchmarks/string/factory.cu diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index d3de9b39977..8e5ea900efa 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -360,8 +360,6 @@ ConfigureNVBench( # ################################################################################################## # * strings benchmark ------------------------------------------------------------------- -ConfigureBench(STRINGS_BENCH string/factory.cu) - ConfigureNVBench( STRINGS_NVBENCH string/case.cpp @@ -377,6 +375,7 @@ ConfigureNVBench( string/copy_range.cpp string/count.cpp string/extract.cpp + string/factory.cpp string/filter.cpp string/find.cpp string/find_multiple.cpp diff --git a/cpp/benchmarks/string/factory.cpp b/cpp/benchmarks/string/factory.cpp new file mode 100644 index 00000000000..03870b0ae23 --- /dev/null +++ b/cpp/benchmarks/string/factory.cpp @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include <benchmarks/common/generate_input.hpp>
+
+#include <cudf/column/column_factories.hpp>
+#include <cudf/strings/detail/utilities.hpp>
+#include <cudf/strings/strings_column_view.hpp>
+#include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
+
+#include <rmm/device_uvector.hpp>
+
+#include <nvbench/nvbench.cuh>
+
+#include <vector>
+
+static void bench_factory(nvbench::state& state)
+{
+  auto const num_rows  = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const min_width = static_cast<cudf::size_type>(state.get_int64("min_width"));
+  auto const max_width = static_cast<cudf::size_type>(state.get_int64("max_width"));
+
+  data_profile const profile = data_profile_builder().distribution(
+    cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width);
+  auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile);
+  auto const sv     = cudf::strings_column_view(column->view());
+
+  auto stream    = cudf::get_default_stream();
+  auto mr        = cudf::get_current_device_resource_ref();
+  auto d_strings = cudf::strings::detail::create_string_vector_from_column(sv, stream, mr);
+
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
+  auto chars_size = sv.chars_size(stream);
+  state.add_global_memory_reads<nvbench::int8_t>(chars_size);
+  state.add_global_memory_writes<nvbench::int8_t>(chars_size);
+
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    cudf::make_strings_column(d_strings, cudf::string_view{nullptr, 0});
+  });
+}
+
+NVBENCH_BENCH(bench_factory)
+  .set_name("factory")
+  .add_int64_axis("min_width", {0})
+  .add_int64_axis("max_width", {32, 64, 128, 256})
+  .add_int64_axis("num_rows", {32768, 262144, 2097152});
diff --git a/cpp/benchmarks/string/factory.cu b/cpp/benchmarks/string/factory.cu
deleted file mode 100644
index c4e74c4d97e..00000000000
--- a/cpp/benchmarks/string/factory.cu
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
- * Copyright (c) 2021-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "string_bench_args.hpp"
-
-#include <benchmarks/common/generate_input.hpp>
-#include <benchmarks/fixture/benchmark_fixture.hpp>
-#include <benchmarks/synchronization/synchronization.hpp>
-
-#include <cudf_test/column_wrapper.hpp>
-
-#include <cudf/column/column_device_view.cuh>
-#include <cudf/strings/string_view.cuh>
-#include <cudf/strings/strings_column_view.hpp>
-
-#include <rmm/device_uvector.hpp>
-
-#include <thrust/execution_policy.h>
-#include <thrust/pair.h>
-#include <thrust/transform.h>
-
-#include <limits>
-
-namespace {
-using string_pair = thrust::pair<char const*, cudf::size_type>;
-struct string_view_to_pair {
-  __device__ string_pair operator()(thrust::pair<cudf::string_view, bool> const& p)
-  {
-    return (p.second) ? string_pair{p.first.data(), p.first.size_bytes()}
-                      : string_pair{nullptr, 0};
-  }
-};
-}  // namespace
-
-class StringsFactory : public cudf::benchmark {};
-
-static void BM_factory(benchmark::State& state)
-{
-  cudf::size_type const n_rows{static_cast<cudf::size_type>(state.range(0))};
-  cudf::size_type const max_str_length{static_cast<cudf::size_type>(state.range(1))};
-  data_profile const profile = data_profile_builder().distribution(
-    cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length);
-  auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows}, profile);
-  auto d_column     = cudf::column_device_view::create(column->view());
-  rmm::device_uvector<string_pair> pairs(d_column->size(), cudf::get_default_stream());
-  thrust::transform(thrust::device,
-                    d_column->pair_begin<cudf::string_view, true>(),
-                    d_column->pair_end<cudf::string_view, true>(),
-                    pairs.data(),
-                    string_view_to_pair{});
-
-  for (auto _ : state) {
-    cuda_event_timer raii(state, true, cudf::get_default_stream());
-    cudf::make_strings_column(pairs, cudf::get_default_stream());
-  }
-
-  cudf::strings_column_view input(column->view());
-  state.SetBytesProcessed(state.iterations() * input.chars_size(cudf::get_default_stream()));
-}
-
-static void generate_bench_args(benchmark::internal::Benchmark* b)
-{
-  int const min_rows   = 1 << 12;
-  int const max_rows   = 1 << 24;
-  int const row_mult   = 8;
-  int const min_rowlen = 1 << 5;
-  int const max_rowlen = 1 << 13;
-  int const len_mult   = 4;
-  generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult);
-}
-
-#define STRINGS_BENCHMARK_DEFINE(name)          \
-  BENCHMARK_DEFINE_F(StringsFactory, name)      \
-  (::benchmark::State & st) { BM_factory(st); } \
-  BENCHMARK_REGISTER_F(StringsFactory, name)    \
-    ->Apply(generate_bench_args)                \
-    ->UseManualTime()                           \
-    ->Unit(benchmark::kMillisecond);
-
-STRINGS_BENCHMARK_DEFINE(factory)

From 5190b4460ba86151521de9f4415c5eb55781371e Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell
Date: Mon, 2 Dec 2024 18:42:47 +0000
Subject: [PATCH 02/78] Temporarily skip tests due to dask/distributed#8953
 (#17472)

Temporarily skip tests failing due to an upstream dask change.
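For reference, a minimal, self-contained sketch of the runtime-xfail pattern this patch applies throughout the custreamz suite. The `DaskStream` class and `stream` fixture below are illustrative stand-ins for the real custreamz test fixtures, not part of the patch:

```python
import pytest


class DaskStream:
    """Illustrative stand-in for streamz.dask.DaskStream."""


@pytest.fixture(params=["core", "dask"])
def stream(request):
    # The real suite parametrizes each test over a plain Stream and a
    # DaskStream; this fixture mimics that shape with dummy objects.
    return DaskStream() if request.param == "dask" else object()


def test_example(request, stream):
    # Apply an xfail marker at runtime, but only for the Dask-backed
    # variant; the other parametrizations still run and must pass.
    request.applymarker(
        pytest.mark.xfail(
            isinstance(stream, DaskStream),
            reason="https://github.com/dask/distributed/issues/8953",
        )
    )
```

Using `request.applymarker` rather than a decorator lets the xfail condition depend on the fixture value each test actually received, and the marker block can be deleted wholesale once the upstream fix lands.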
Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/17472 --- .../custreamz/tests/test_dataframes.py | 56 ++++++++++++++++--- 1 file changed, 49 insertions(+), 7 deletions(-) diff --git a/python/custreamz/custreamz/tests/test_dataframes.py b/python/custreamz/custreamz/tests/test_dataframes.py index 8c0130d2818..6905044039c 100644 --- a/python/custreamz/custreamz/tests/test_dataframes.py +++ b/python/custreamz/custreamz/tests/test_dataframes.py @@ -216,7 +216,13 @@ def test_set_index(): assert_eq(b[0], df.set_index(df.y + 1)) -def test_binary_stream_operators(stream): +def test_binary_stream_operators(request, stream): + request.applymarker( + pytest.mark.xfail( + isinstance(stream, DaskStream), + reason="https://github.com/dask/distributed/issues/8953", + ) + ) df = cudf.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) expected = df.x + df.y @@ -242,7 +248,13 @@ def test_index(stream): assert_eq(L[1], df.index + 5) -def test_pair_arithmetic(stream): +def test_pair_arithmetic(request, stream): + request.applymarker( + pytest.mark.xfail( + isinstance(stream, DaskStream), + reason="https://github.com/dask/distributed/issues/8953", + ) + ) df = cudf.DataFrame({"x": list(range(10)), "y": [1] * 10}) a = DataFrame(example=df.iloc[:0], stream=stream) @@ -255,7 +267,13 @@ def test_pair_arithmetic(stream): assert_eq(cudf.concat(L), (df.x + df.y) * 2) -def test_getitem(stream): +def test_getitem(request, stream): + request.applymarker( + pytest.mark.xfail( + isinstance(stream, DaskStream), + reason="https://github.com/dask/distributed/issues/8953", + ) + ) df = cudf.DataFrame({"x": list(range(10)), "y": [1] * 10}) a = DataFrame(example=df.iloc[:0], stream=stream) @@ -332,7 +350,13 @@ def test_repr_html(stream): assert "1" in html -def test_setitem(stream): +def test_setitem(request, stream): + request.applymarker( + pytest.mark.xfail( + isinstance(stream, DaskStream), + reason="https://github.com/dask/distributed/issues/8953", + ) + ) df = cudf.DataFrame({"x": list(range(10)), "y": [1] * 10}) sdf = DataFrame(example=df.iloc[:0], stream=stream) @@ -356,7 +380,13 @@ def test_setitem(stream): assert_eq(L[-1], df.mean()) -def test_setitem_overwrites(stream): +def test_setitem_overwrites(request, stream): + request.applymarker( + pytest.mark.xfail( + isinstance(stream, DaskStream), + reason="https://github.com/dask/distributed/issues/8953", + ) + ) df = cudf.DataFrame({"x": list(range(10))}) sdf = DataFrame(example=df.iloc[:0], stream=stream) stream = sdf.stream @@ -413,8 +443,14 @@ def test_setitem_overwrites(stream): ], ) def test_rolling_count_aggregations( - op, window, m, pre_get, post_get, kwargs, stream + request, op, window, m, pre_get, post_get, kwargs, stream ): + request.applymarker( + pytest.mark.xfail( + isinstance(stream, DaskStream) and len(kwargs) == 0, + reason="https://github.com/dask/distributed/issues/8953", + ) + ) index = pd.DatetimeIndex( pd.date_range("2000-01-01", "2000-01-03", freq="1h") ) @@ -808,7 +844,13 @@ def test_reductions_with_start_state(stream): assert output2[0] == 360 -def test_rolling_aggs_with_start_state(stream): +def test_rolling_aggs_with_start_state(request, stream): + request.applymarker( + pytest.mark.xfail( + isinstance(stream, DaskStream), + reason="https://github.com/dask/distributed/issues/8953", + ) + ) example = cudf.DataFrame({"name": [], "amount": []}, dtype="float64") sdf = DataFrame(stream, 
example=example) output0 = ( From 68848673e879436139484461508fab8c1b4d021a Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 2 Dec 2024 11:49:38 -0800 Subject: [PATCH 03/78] Remove cudf._lib.replace in favor of inlining pylibcudf (#17428) Contributes to https://github.com/rapidsai/cudf/issues/17317 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/17428 --- python/cudf/cudf/_lib/CMakeLists.txt | 1 - python/cudf/cudf/_lib/__init__.py | 1 - python/cudf/cudf/_lib/replace.pyx | 193 -------------------- python/cudf/cudf/core/column/categorical.py | 10 +- python/cudf/cudf/core/column/column.py | 53 +++++- python/cudf/cudf/core/column/numerical.py | 8 +- python/cudf/cudf/core/column/string.py | 2 +- 7 files changed, 54 insertions(+), 214 deletions(-) delete mode 100644 python/cudf/cudf/_lib/replace.pyx diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index 8ed5d5b896c..de483b3070d 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -25,7 +25,6 @@ set(cython_sources orc.pyx parquet.pyx reduce.pyx - replace.pyx round.pyx scalar.pyx sort.pyx diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py index b71c5ea73d6..ee1bd13f2c4 100644 --- a/python/cudf/cudf/_lib/__init__.py +++ b/python/cudf/cudf/_lib/__init__.py @@ -13,7 +13,6 @@ orc, parquet, reduce, - replace, round, sort, stream_compaction, diff --git a/python/cudf/cudf/_lib/replace.pyx b/python/cudf/cudf/_lib/replace.pyx deleted file mode 100644 index b50c6dd25e3..00000000000 --- a/python/cudf/cudf/_lib/replace.pyx +++ /dev/null @@ -1,193 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- -from cudf.api.types import is_scalar -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar - -import pylibcudf - -from cudf._lib.scalar import as_device_scalar - - -@acquire_spill_lock() -def replace(Column input_col, Column values_to_replace, - Column replacement_values): - """ - Replaces values from values_to_replace with corresponding value from - replacement_values in input_col - - Parameters - ---------- - input_col : Column whose value will be updated - values_to_replace : Column with values which needs to be replaced - replacement_values : Column with values which will replace - """ - - return Column.from_pylibcudf( - pylibcudf.replace.find_and_replace_all( - input_col.to_pylibcudf(mode="read"), - values_to_replace.to_pylibcudf(mode="read"), - replacement_values.to_pylibcudf(mode="read"), - ) - ) - - -@acquire_spill_lock() -def replace_nulls_column(Column input_col, Column replacement_values): - """ - Replaces null values in input_col with corresponding values from - replacement_values - - Parameters - ---------- - input_col : Column whose value will be updated - replacement_values : Column with values which will replace nulls - """ - return Column.from_pylibcudf( - pylibcudf.replace.replace_nulls( - input_col.to_pylibcudf(mode="read"), - replacement_values.to_pylibcudf(mode="read"), - ) - ) - - -@acquire_spill_lock() -def replace_nulls_scalar(Column input_col, DeviceScalar replacement_value): - """ - Replaces null values in input_col with replacement_value - - Parameters - ---------- - input_col : Column whose value will be updated - replacement_value : DeviceScalar with value which will replace nulls - """ - return Column.from_pylibcudf( - pylibcudf.replace.replace_nulls( - input_col.to_pylibcudf(mode="read"), - replacement_value.c_value, - ) - ) - - -@acquire_spill_lock() -def replace_nulls_fill(Column input_col, object method): - """ - Replaces null values in input_col with replacement_value - - Parameters - ---------- - input_col : Column whose value will be updated - method : 'ffill' or 'bfill' - """ - return Column.from_pylibcudf( - pylibcudf.replace.replace_nulls( - input_col.to_pylibcudf(mode="read"), - pylibcudf.replace.ReplacePolicy.PRECEDING - if method == 'ffill' - else pylibcudf.replace.ReplacePolicy.FOLLOWING, - ) - ) - - -def replace_nulls( - Column input_col, - object replacement=None, - object method=None, - object dtype=None -): - """ - Calls one of the version of replace_nulls depending on type - of replacement - """ - - if replacement is None and method is None: - raise ValueError("Must specify a fill 'value' or 'method'.") - - if replacement and method: - raise ValueError("Cannot specify both 'value' and 'method'.") - - if method: - return replace_nulls_fill(input_col, method) - elif is_scalar(replacement): - return replace_nulls_scalar( - input_col, - as_device_scalar(replacement, dtype=dtype) - ) - else: - return replace_nulls_column(input_col, replacement) - - -@acquire_spill_lock() -def clamp(Column input_col, DeviceScalar lo, DeviceScalar hi): - """ - Clip the input_col such that values < lo will be replaced by lo - and > hi will be replaced by hi - - Parameters - ---------- - input_col : Column whose value will be updated - lo : DeviceScalar value for clipping lower values - hi : DeviceScalar value for clipping upper values - """ - return Column.from_pylibcudf( - pylibcudf.replace.clamp( - input_col.to_pylibcudf(mode="read"), - lo.c_value, - hi.c_value, - ) - ) - - 
-@acquire_spill_lock() -def clip(Column input_col, object lo, object hi): - """ - Clip the input_col such that values < lo will be replaced by lo - and > hi will be replaced by hi - """ - - lo_scalar = as_device_scalar(lo, dtype=input_col.dtype) - hi_scalar = as_device_scalar(hi, dtype=input_col.dtype) - - return clamp(input_col, lo_scalar, hi_scalar) - - -@acquire_spill_lock() -def normalize_nans_and_zeros_inplace(Column input_col): - """ - Inplace normalizing - """ - pylibcudf.replace.normalize_nans_and_zeros( - input_col.to_pylibcudf(mode="write"), inplace=True - ) - - -@acquire_spill_lock() -def normalize_nans_and_zeros_column(Column input_col): - """ - Returns a new normalized Column - """ - return Column.from_pylibcudf( - pylibcudf.replace.normalize_nans_and_zeros( - input_col.to_pylibcudf(mode="read") - ) - ) - - -def normalize_nans_and_zeros(Column input_col, in_place=False): - """ - Normalize the NaN and zeros in input_col - Convert -NaN -> NaN - Convert -0.0 -> 0.0 - - Parameters - ---------- - input_col : Column that needs to be normalized - in_place : boolean whether to normalize in place or return new column - """ - - if in_place is True: - normalize_nans_and_zeros_inplace(input_col) - else: - return normalize_nans_and_zeros_column(input_col) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 7354b917f90..7551703c53e 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -843,9 +843,9 @@ def values(self): """ raise NotImplementedError("cudf.Categorical is not yet implemented") - def clip(self, lo: ScalarLike, hi: ScalarLike) -> "column.ColumnBase": + def clip(self, lo: ScalarLike, hi: ScalarLike) -> Self: return ( - self.astype(self.categories.dtype).clip(lo, hi).astype(self.dtype) + self.astype(self.categories.dtype).clip(lo, hi).astype(self.dtype) # type: ignore[return-value] ) def data_array_view( @@ -989,10 +989,8 @@ def find_and_replace( replacement_col = catmap._data["index"].astype(replaced.codes.dtype) replaced_codes = column.as_column(replaced.codes) - output = libcudf.replace.replace( - replaced_codes, to_replace_col, replacement_col - ) - codes = as_unsigned_codes(len(new_cats["cats"]), output) + output = replaced_codes.replace(to_replace_col, replacement_col) + codes = as_unsigned_codes(len(new_cats["cats"]), output) # type: ignore[arg-type] result = type(self)( data=self.data, # type: ignore[arg-type] diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 8ddfd4a54ae..d1938f47d66 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -236,8 +236,14 @@ def find_and_replace( ) -> Self: raise NotImplementedError - def clip(self, lo: ScalarLike, hi: ScalarLike) -> ColumnBase: - return libcudf.replace.clip(self, lo, hi) + @acquire_spill_lock() + def clip(self, lo: ScalarLike, hi: ScalarLike) -> Self: + plc_column = plc.replace.clamp( + self.to_pylibcudf(mode="read"), + cudf.Scalar(lo, self.dtype).device_value.c_value, + cudf.Scalar(hi, self.dtype).device_value.c_value, + ) + return type(self).from_pylibcudf(plc_column) # type: ignore[return-value] def equals(self, other: ColumnBase, check_dtypes: bool = False) -> bool: if self is other: @@ -685,6 +691,18 @@ def _validate_fillna_value( return cudf.Scalar(fill_value, dtype=self.dtype) return as_column(fill_value) + @acquire_spill_lock() + def replace( + self, values_to_replace: Self, replacement_values: Self + ) -> Self: + 
return type(self).from_pylibcudf( # type: ignore[return-value] + plc.replace.find_and_replace_all( + self.to_pylibcudf(mode="read"), + values_to_replace.to_pylibcudf(mode="read"), + replacement_values.to_pylibcudf(mode="read"), + ) + ) + def fillna( self, fill_value: ScalarLike | ColumnLike, @@ -703,11 +721,32 @@ def fillna( return self.copy() else: fill_value = self._validate_fillna_value(fill_value) - return libcudf.replace.replace_nulls( - input_col=self.nans_to_nulls(), - replacement=fill_value, - method=method, - )._with_type_metadata(self.dtype) + + if fill_value is None and method is None: + raise ValueError("Must specify a fill 'value' or 'method'.") + + if fill_value and method: + raise ValueError("Cannot specify both 'value' and 'method'.") + + input_col = self.nans_to_nulls() + + with acquire_spill_lock(): + if method: + plc_replace = ( + plc.replace.ReplacePolicy.PRECEDING + if method == "ffill" + else plc.replace.ReplacePolicy.FOLLOWING + ) + elif is_scalar(fill_value): + plc_replace = cudf.Scalar(fill_value).device_value.c_value + else: + plc_replace = fill_value.to_pylibcudf(mode="read") + plc_column = plc.replace.replace_nulls( + input_col.to_pylibcudf(mode="read"), + plc_replace, + ) + result = type(self).from_pylibcudf(plc_column) + return result._with_type_metadata(self.dtype) # type: ignore[return-value] def isnull(self) -> ColumnBase: """Identify missing values in a Column.""" diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index a7538c1c947..c8f859596b2 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -482,7 +482,7 @@ def find_and_replace( to_replace: ColumnLike, replacement: ColumnLike, all_nan: bool = False, - ) -> NumericalColumn: + ) -> Self: """ Return col with *to_replace* replaced with *value*. """ @@ -547,7 +547,7 @@ def find_and_replace( ) elif len(replacement_col) == 1 and len(to_replace_col) == 0: return self.copy() - replaced = self.astype(common_type) + replaced = cast(Self, self.astype(common_type)) df = cudf.DataFrame._from_data( { "old": to_replace_col.astype(common_type), @@ -563,9 +563,7 @@ def find_and_replace( ) df = df.dropna(subset=["old"]) - return libcudf.replace.replace( - replaced, df._data["old"], df._data["new"] - ) + return replaced.replace(df._data["old"], df._data["new"]) def _validate_fillna_value( self, fill_value: ScalarLike | ColumnLike diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index d45c76d3ddb..fa5f0dd99fa 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -6185,7 +6185,7 @@ def find_and_replace( df = df.dropna(subset=["old"]) else: res = self - return libcudf.replace.replace(res, df._data["old"], df._data["new"]) + return res.replace(df._data["old"], df._data["new"]) def normalize_binop_value(self, other) -> column.ColumnBase | cudf.Scalar: if ( From d1bad33caef34b8fa95543c7494780f2084ee603 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 2 Dec 2024 21:48:26 +0000 Subject: [PATCH 04/78] Update the hook versions in pre-commit (#17462) The major change here is to move to ruff 0.8 which, among other things, introduces automatic sorting for `__all__` and `__slots__` (so I've turned those on and fixed things). 
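As a hedged illustration of the new `RUF` export sorting (the rule codes RUF022 for `__all__` and RUF023 for `__slots__` are cited from the ruff documentation, not from this diff), this is the kind of rewrite `ruff check --fix` performs; compare the sorted `__all__` in the `cudf/core/column/__init__.py` hunk later in this patch:

```python
# Before: ruff 0.8 flags this list under RUF022 (unsorted __all__).
__all__ = [
    "serialize_columns",
    "ColumnBase",
    "as_column",
]

# After `ruff check --fix`: an isort-style sort, so SCREAMING_CASE
# constants come first, then CamelCase class names, then lowercase
# names, each group alphabetical.
__all__ = [
    "ColumnBase",
    "as_column",
    "serialize_columns",
]
```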
Notable actual bug fix: https://github.com/rapidsai/cudf/commit/b2cfb9c88db13228a94628970c4c8c01a5527d56 Authors: - Lawrence Mitchell (https://github.com/wence-) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) - Matthew Murray (https://github.com/Matt711) - Jake Awe (https://github.com/AyodeAwe) - Nghia Truong (https://github.com/ttnghia) - Richard (Rick) Zamora (https://github.com/rjzamora) URL: https://github.com/rapidsai/cudf/pull/17462 --- .pre-commit-config.yaml | 14 ++--- cpp/src/lists/set_operations.cu | 2 +- pyproject.toml | 25 +++++++-- python/cudf/benchmarks/common/config.py | 4 +- python/cudf/benchmarks/conftest.py | 16 +++--- python/cudf/cudf/__init__.py | 2 +- python/cudf/cudf/_fuzz_testing/fuzzer.py | 2 +- python/cudf/cudf/core/_base_index.py | 8 +-- python/cudf/cudf/core/buffer/spill_manager.py | 4 +- .../cudf/cudf/core/buffer/spillable_buffer.py | 2 +- python/cudf/cudf/core/column/__init__.py | 48 ++++++++++++----- python/cudf/cudf/core/column/categorical.py | 2 +- python/cudf/cudf/core/column/datetime.py | 4 +- python/cudf/cudf/core/column/decimal.py | 11 ++-- python/cudf/cudf/core/column/interval.py | 3 +- python/cudf/cudf/core/column/lists.py | 3 +- python/cudf/cudf/core/column/numerical.py | 9 ++-- .../cudf/cudf/core/column/numerical_base.py | 2 +- python/cudf/cudf/core/column/string.py | 13 ++--- python/cudf/cudf/core/column/struct.py | 2 +- python/cudf/cudf/core/column/timedelta.py | 6 ++- python/cudf/cudf/core/column_accessor.py | 6 +-- python/cudf/cudf/core/cut.py | 2 +- python/cudf/cudf/core/dataframe.py | 50 ++++++++++++------ python/cudf/cudf/core/dtypes.py | 18 +++---- python/cudf/cudf/core/frame.py | 2 +- python/cudf/cudf/core/groupby/groupby.py | 4 +- python/cudf/cudf/core/index.py | 22 ++++---- python/cudf/cudf/core/indexed_frame.py | 42 +++++++-------- python/cudf/cudf/core/mixins/scans.py | 4 +- python/cudf/cudf/core/multiindex.py | 2 +- python/cudf/cudf/core/reshape.py | 2 +- python/cudf/cudf/core/scalar.py | 2 +- python/cudf/cudf/core/series.py | 14 ++--- python/cudf/cudf/core/single_column_frame.py | 6 +-- python/cudf/cudf/core/udf/masked_typing.py | 4 +- python/cudf/cudf/datasets.py | 2 +- python/cudf/cudf/io/parquet.py | 7 +-- python/cudf/cudf/options.py | 2 +- python/cudf/cudf/pandas/__init__.py | 4 +- python/cudf/cudf/pandas/__main__.py | 2 +- python/cudf/cudf/pandas/_wrappers/pandas.py | 6 +-- python/cudf/cudf/pandas/fast_slow_proxy.py | 6 +-- .../pandas/scripts/analyze-test-failures.py | 2 +- python/cudf/cudf/testing/dataset_generator.py | 4 +- python/cudf/cudf/testing/testing.py | 4 +- .../cudf/tests/series/test_datetimelike.py | 4 +- python/cudf/cudf/tests/test_binops.py | 12 ++--- python/cudf/cudf/tests/test_categorical.py | 6 +-- python/cudf/cudf/tests/test_concat.py | 6 +-- python/cudf/cudf/tests/test_csv.py | 4 +- .../cudf/tests/test_cuda_array_interface.py | 2 +- python/cudf/cudf/tests/test_dataframe.py | 28 +++++----- python/cudf/cudf/tests/test_feather.py | 2 +- python/cudf/cudf/tests/test_groupby.py | 7 ++- python/cudf/cudf/tests/test_hdf.py | 2 +- python/cudf/cudf/tests/test_index.py | 14 ++--- python/cudf/cudf/tests/test_joining.py | 6 +-- python/cudf/cudf/tests/test_json.py | 14 ++--- python/cudf/cudf/tests/test_orc.py | 4 +- python/cudf/cudf/tests/test_parquet.py | 2 +- python/cudf/cudf/tests/test_quantiles.py | 2 +- python/cudf/cudf/tests/test_replace.py | 4 +- python/cudf/cudf/tests/test_reshape.py | 6 +-- python/cudf/cudf/tests/test_scalar.py | 4 +- 
python/cudf/cudf/tests/test_series.py | 6 +-- python/cudf/cudf/tests/test_setitem.py | 2 +- python/cudf/cudf/tests/test_spilling.py | 2 +- python/cudf/cudf/tests/test_string.py | 4 +- python/cudf/cudf/tests/test_testing.py | 2 +- .../cudf/cudf/tests/text/test_text_methods.py | 40 +++++++------- python/cudf/cudf/utils/ioutils.py | 2 +- python/cudf/cudf/utils/queryutils.py | 4 +- python/cudf/cudf/utils/utils.py | 2 +- .../cudf_pandas_tests/test_cudf_pandas.py | 4 +- .../tests/test_matplotlib.py | 2 +- .../tests/test_plotly.py | 2 +- .../tests/test_seaborn.py | 2 +- python/cudf_polars/cudf_polars/__init__.py | 2 +- .../cudf_polars/containers/__init__.py | 2 +- python/cudf_polars/cudf_polars/dsl/expr.py | 32 ++++++------ .../dsl/expressions/aggregation.py | 2 +- .../cudf_polars/dsl/expressions/base.py | 2 +- .../cudf_polars/dsl/expressions/boolean.py | 2 +- .../cudf_polars/dsl/expressions/rolling.py | 2 +- .../cudf_polars/dsl/expressions/selection.py | 2 +- .../cudf_polars/dsl/expressions/string.py | 2 +- .../cudf_polars/dsl/expressions/unary.py | 2 +- python/cudf_polars/cudf_polars/dsl/ir.py | 52 +++++++++---------- .../cudf_polars/cudf_polars/dsl/traversal.py | 6 +-- .../cudf_polars/typing/__init__.py | 10 ++-- .../cudf_polars/cudf_polars/utils/dtypes.py | 16 +++--- python/cudf_polars/pyproject.toml | 1 + python/dask_cudf/dask_cudf/__init__.py | 20 +++---- .../dask_cudf/dask_cudf/_expr/collection.py | 9 ++-- python/dask_cudf/dask_cudf/core.py | 4 +- python/dask_cudf/dask_cudf/io/__init__.py | 5 +- python/dask_cudf/dask_cudf/io/parquet.py | 2 +- python/dask_cudf/dask_cudf/tests/test_core.py | 4 +- .../dask_cudf/tests/test_dispatch.py | 2 +- .../dask_cudf/dask_cudf/tests/test_groupby.py | 2 +- python/libcudf/libcudf/__init__.py | 2 + python/pylibcudf/pylibcudf/__init__.py | 6 +-- python/pylibcudf/pylibcudf/nvtext/__init__.py | 2 +- .../pylibcudf/pylibcudf/tests/io/test_csv.py | 2 +- 105 files changed, 431 insertions(+), 368 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 37b26949804..39869b67547 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -2,7 +2,7 @@ repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.6.0 + rev: v5.0.0 hooks: - id: trailing-whitespace exclude: | @@ -17,11 +17,11 @@ repos: ^python/cudf/cudf/tests/data/subword_tokenizer_data/.* ) - repo: https://github.com/MarcoGorelli/cython-lint - rev: v0.16.2 + rev: v0.16.6 hooks: - id: cython-lint - repo: https://github.com/pre-commit/mirrors-mypy - rev: 'v1.10.0' + rev: 'v1.13.0' hooks: - id: mypy additional_dependencies: [types-cachetools] @@ -33,7 +33,7 @@ repos: "python/dask_cudf/dask_cudf"] pass_filenames: false - repo: https://github.com/nbQA-dev/nbQA - rev: 1.8.5 + rev: 1.9.1 hooks: - id: nbqa-isort # Use the cudf_kafka isort orderings in notebooks so that dask @@ -52,7 +52,7 @@ repos: ^cpp/include/cudf_test/cxxopts.hpp ) - repo: https://github.com/sirosen/texthooks - rev: 0.6.6 + rev: 0.6.7 hooks: - id: fix-smartquotes exclude: | @@ -133,7 +133,7 @@ repos: pass_filenames: false verbose: true - repo: https://github.com/codespell-project/codespell - rev: v2.2.6 + rev: v2.3.0 hooks: - id: codespell additional_dependencies: [tomli] @@ -144,7 +144,7 @@ repos: ^CHANGELOG.md$ ) - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.4.8 + rev: v0.8.0 hooks: - id: ruff args: ["--fix"] diff --git a/cpp/src/lists/set_operations.cu b/cpp/src/lists/set_operations.cu index c0bc10dd266..6f2acbb0712 100644 --- a/cpp/src/lists/set_operations.cu +++ 
b/cpp/src/lists/set_operations.cu
@@ -72,7 +72,7 @@ std::unique_ptr<column> have_overlap(lists_column_view const& lhs,
   // - Generate labels for lhs and rhs child elements.
   // - Check existence for rows of the table {rhs_labels, rhs_child} in the table
   //   {lhs_labels, lhs_child}.
-  // - `reduce_by_key` with keys are rhs_labels and `logical_or` reduction on the existence reults
+  // - `reduce_by_key` with keys are rhs_labels and `logical_or` reduction on the existence results
   //   computed in the previous step.
 
   auto const lhs_child = lhs.get_sliced_child(stream);
diff --git a/pyproject.toml b/pyproject.toml
index 6933484f4e7..0c95ea60408 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -18,12 +18,13 @@ exclude = [
 skip = "./.git,./.github,./cpp/build,.*egg-info.*,./.mypy_cache,./cpp/tests,./python/cudf/cudf/tests,./java/src/test,./cpp/include/cudf_test/cxxopts.hpp"
 # ignore short words, and typename parameters like OffsetT
 ignore-regex = "\\b(.{1,4}|[A-Z]\\w*T)\\b"
-ignore-words-list = "inout,unparseable,falsy,couldn,Couldn"
+ignore-words-list = "inout,unparseable,falsy,couldn,Couldn,thirdparty"
 builtin = "clear"
 quiet-level = 3
 
 [tool.ruff]
 line-length = 79
+target-version = "py310"
 
 [tool.ruff.lint]
 typing-modules = ["cudf._typing"]
@@ -94,17 +95,35 @@ select = [
   "UP035",
   # usage of legacy `np.random` function calls
   "NPY002",
+  # Ruff-specific rules
+  "RUF",
 ]
 ignore = [
   # whitespace before :
   "E203",
   # line-too-long (due to Copyright header)
   "E501",
+  # type-comparison, disabled because we compare types to numpy dtypes
+  "E721",
+  # String contains ambiguous character
+  "RUF001",
+  # Parenthesize `a and b` expressions when chaining `and` and `or`
+  # together, to make the precedence clear
+  "RUF021",
+  # Mutable class attributes should be annotated with
+  # `typing.ClassVar`
+  "RUF012",
 ]
 fixable = ["ALL"]
 exclude = [
-    # TODO: Remove this in a follow-up where we fix __all__.
-    "__init__.py",
+    # TODO: https://github.com/rapidsai/cudf/issues/17461
+    "**/*.ipynb",
+]
+
+[tool.ruff.format]
+exclude = [
+    # TODO: https://github.com/rapidsai/cudf/issues/17461
+    "**/*.ipynb",
 ]
 
 [tool.ruff.lint.per-file-ignores]
diff --git a/python/cudf/benchmarks/common/config.py b/python/cudf/benchmarks/common/config.py
index c1e9d4d6116..872ba424d20 100644
--- a/python/cudf/benchmarks/common/config.py
+++ b/python/cudf/benchmarks/common/config.py
@@ -42,9 +42,9 @@ def pytest_collection_modifyitems(session, config, items):
 
     items[:] = list(filter(is_pandas_compatible, items))
 else:
-    import cupy  # noqa: W0611, F401
+    import cupy  # noqa: F401
 
-    import cudf  # noqa: W0611, F401
+    import cudf  # noqa: F401
 
     def pytest_collection_modifyitems(session, config, items):
         pass
diff --git a/python/cudf/benchmarks/conftest.py b/python/cudf/benchmarks/conftest.py
index 0e4afadccf5..24ff211387c 100644
--- a/python/cudf/benchmarks/conftest.py
+++ b/python/cudf/benchmarks/conftest.py
@@ -56,18 +56,16 @@
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "common")) -# Turn off isort until we upgrade to 5.8.0 -# https://github.com/pycqa/isort/issues/1594 -from config import ( # noqa: W0611, E402, F401 +from config import ( NUM_COLS, NUM_ROWS, - collect_ignore, - cudf, # noqa: W0611, E402, F401 - pytest_collection_modifyitems, - pytest_sessionfinish, - pytest_sessionstart, + collect_ignore, # noqa: F401 + cudf, + pytest_collection_modifyitems, # noqa: F401 + pytest_sessionfinish, # noqa: F401 + pytest_sessionstart, # noqa: F401 ) -from utils import ( # noqa: E402 +from utils import ( OrderedSet, collapse_fixtures, column_generators, diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py index 99b759e2166..843f2670b4d 100644 --- a/python/cudf/cudf/__init__.py +++ b/python/cudf/cudf/__init__.py @@ -99,6 +99,7 @@ __all__ = [ + "NA", "BaseIndex", "CategoricalDtype", "CategoricalIndex", @@ -114,7 +115,6 @@ "IntervalIndex", "ListDtype", "MultiIndex", - "NA", "NaT", "RangeIndex", "Scalar", diff --git a/python/cudf/cudf/_fuzz_testing/fuzzer.py b/python/cudf/cudf/_fuzz_testing/fuzzer.py index ee1b2c1f1c4..4b080937a17 100644 --- a/python/cudf/cudf/_fuzz_testing/fuzzer.py +++ b/python/cudf/cudf/_fuzz_testing/fuzzer.py @@ -95,7 +95,7 @@ def start(self): else: self._data_handler.set_rand_params(self.params) kwargs = self._data_handler._current_params["test_kwargs"] - logging.info(f"Parameters passed: {str(kwargs)}") + logging.info(f"Parameters passed: {kwargs!s}") self._target(file_name, **kwargs) except KeyboardInterrupt: logging.info( diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index a6abd63d042..2df154ee112 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -133,7 +133,7 @@ def memory_usage(self, deep=False): """ raise NotImplementedError - def tolist(self): # noqa: D102 + def tolist(self): raise TypeError( "cuDF does not support conversion to host memory " "via the `tolist()` method. 
Consider using " @@ -148,7 +148,7 @@ def name(self): raise NotImplementedError @property # type: ignore - def ndim(self) -> int: # noqa: D401 + def ndim(self) -> int: """Number of dimensions of the underlying data, by definition 1.""" return 1 @@ -265,7 +265,7 @@ def get_loc(self, key): slice(1, 3, None) >>> multi_index.get_loc(('b', 'e')) 1 - """ # noqa: E501 + """ def max(self): """The maximum value of the index.""" @@ -1473,7 +1473,7 @@ def _intersection(self, other, sort=None): ._data ) - if sort is {None, True} and len(other): + if sort in {None, True} and len(other): return intersection_result.sort_values() return intersection_result diff --git a/python/cudf/cudf/core/buffer/spill_manager.py b/python/cudf/cudf/core/buffer/spill_manager.py index ed351a6b107..07d0d698cb8 100644 --- a/python/cudf/cudf/core/buffer/spill_manager.py +++ b/python/cudf/cudf/core/buffer/spill_manager.py @@ -54,7 +54,7 @@ def get_rmm_memory_resource_stack( """ if hasattr(mr, "upstream_mr"): - return [mr] + get_rmm_memory_resource_stack(mr.upstream_mr) + return [mr, *get_rmm_memory_resource_stack(mr.upstream_mr)] return [mr] @@ -275,7 +275,7 @@ def _out_of_memory_handle(self, nbytes: int, *, retry_once=True) -> bool: print( f"[WARNING] RMM allocation of {format_bytes(nbytes)} bytes " "failed, spill-on-demand couldn't find any device memory to " - f"spill:\n{repr(self)}\ntraceback:\n{get_traceback()}\n" + f"spill:\n{self!r}\ntraceback:\n{get_traceback()}\n" f"{self.statistics}" ) return False # Since we didn't find anything to spill, we give up diff --git a/python/cudf/cudf/core/buffer/spillable_buffer.py b/python/cudf/cudf/core/buffer/spillable_buffer.py index b40c56c9a6b..7305ff651c6 100644 --- a/python/cudf/cudf/core/buffer/spillable_buffer.py +++ b/python/cudf/cudf/core/buffer/spillable_buffer.py @@ -366,7 +366,7 @@ def __str__(self) -> str: f"<{self.__class__.__name__} size={format_bytes(self._size)} " f"spillable={self.spillable} exposed={self.exposed} " f"num-spill-locks={len(self._spill_locks)} " - f"ptr={ptr_info} owner={repr(self._owner)}>" + f"ptr={ptr_info} owner={self._owner!r}>" ) diff --git a/python/cudf/cudf/core/column/__init__.py b/python/cudf/cudf/core/column/__init__.py index a1e87d04bc9..0a9d339a6a8 100644 --- a/python/cudf/cudf/core/column/__init__.py +++ b/python/cudf/cudf/core/column/__init__.py @@ -1,9 +1,5 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
-""" -isort: skip_file -""" - from cudf.core.column.categorical import CategoricalColumn from cudf.core.column.column import ( ColumnBase, @@ -15,17 +11,43 @@ deserialize_columns, serialize_columns, ) -from cudf.core.column.datetime import DatetimeColumn # noqa: F401 -from cudf.core.column.datetime import DatetimeTZColumn # noqa: F401 -from cudf.core.column.lists import ListColumn # noqa: F401 -from cudf.core.column.numerical import NumericalColumn # noqa: F401 -from cudf.core.column.string import StringColumn # noqa: F401 -from cudf.core.column.struct import StructColumn # noqa: F401 -from cudf.core.column.timedelta import TimeDeltaColumn # noqa: F401 -from cudf.core.column.interval import IntervalColumn # noqa: F401 -from cudf.core.column.decimal import ( # noqa: F401 +from cudf.core.column.datetime import ( + DatetimeColumn, + DatetimeTZColumn, +) +from cudf.core.column.decimal import ( Decimal32Column, Decimal64Column, Decimal128Column, DecimalBaseColumn, ) +from cudf.core.column.interval import IntervalColumn +from cudf.core.column.lists import ListColumn +from cudf.core.column.numerical import NumericalColumn +from cudf.core.column.string import StringColumn +from cudf.core.column.struct import StructColumn +from cudf.core.column.timedelta import TimeDeltaColumn + +__all__ = [ + "CategoricalColumn", + "ColumnBase", + "DatetimeColumn", + "DatetimeTZColumn", + "Decimal32Column", + "Decimal64Column", + "Decimal128Column", + "DecimalBaseColumn", + "IntervalColumn", + "ListColumn", + "NumericalColumn", + "StringColumn", + "StructColumn", + "TimeDeltaColumn", + "as_column", + "build_column", + "column_empty", + "column_empty_like", + "concat_columns", + "deserialize_columns", + "serialize_columns", +] diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 7551703c53e..cbbe01f7289 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -888,7 +888,7 @@ def find_and_replace( if len(replacement_col) == replacement_col.null_count: replacement_col = replacement_col.astype(self.categories.dtype) - if type(to_replace_col) != type(replacement_col): + if type(to_replace_col) is not type(replacement_col): raise TypeError( f"to_replace and value should be of same types," f"got to_replace dtype: {to_replace_col.dtype} and " diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 24b55fe1bc2..c9be3f239f9 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -18,6 +18,8 @@ import pylibcudf as plc import cudf +import cudf.core.column.column as column +import cudf.core.column.string as string from cudf import _lib as libcudf from cudf.core._compat import PANDAS_GE_220 from cudf.core._internals import unary @@ -28,7 +30,7 @@ get_tz_data, ) from cudf.core.buffer import Buffer, acquire_spill_lock -from cudf.core.column import ColumnBase, as_column, column, string +from cudf.core.column.column import ColumnBase, as_column from cudf.core.column.timedelta import _unit_to_nanoseconds_conversion from cudf.utils.dtypes import _get_base_dtype from cudf.utils.utils import ( diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index ce7aa91f775..ac9a2caad50 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -18,7 +18,8 @@ from cudf.api.types import is_scalar from cudf.core._internals import unary from cudf.core.buffer import 
as_buffer -from cudf.core.column import ColumnBase +from cudf.core.column.column import ColumnBase +from cudf.core.column.numerical_base import NumericalBaseColumn from cudf.core.dtypes import ( Decimal32Dtype, Decimal64Dtype, @@ -28,8 +29,6 @@ from cudf.core.mixins import BinaryOperand from cudf.utils.utils import pa_mask_buffer_to_mask -from .numerical_base import NumericalBaseColumn - if TYPE_CHECKING: from cudf._typing import ColumnBinaryOperand, ColumnLike, Dtype, ScalarLike from cudf.core.buffer import Buffer @@ -435,7 +434,7 @@ def _get_decimal_type( `op` for the given dtypes. For precision & scale calculations see : https://docs.microsoft.com/en-us/sql/t-sql/data-types/precision-scale-and-length-transact-sql - """ # noqa: E501 + """ # This should at some point be hooked up to libcudf's # binary_operation_fixed_point_scale @@ -506,8 +505,8 @@ def _get_decimal_type( # if we've reached this point, we cannot create a decimal type without # overflow; raise an informative error raise ValueError( - f"Performing {op} between columns of type {repr(lhs_dtype)} and " - f"{repr(rhs_dtype)} would result in overflow" + f"Performing {op} between columns of type {lhs_dtype!r} and " + f"{rhs_dtype!r} would result in overflow" ) diff --git a/python/cudf/cudf/core/column/interval.py b/python/cudf/cudf/core/column/interval.py index 9147270c289..34975fc94f4 100644 --- a/python/cudf/cudf/core/column/interval.py +++ b/python/cudf/cudf/core/column/interval.py @@ -7,7 +7,8 @@ import pyarrow as pa import cudf -from cudf.core.column import StructColumn, as_column +from cudf.core.column.column import as_column +from cudf.core.column.struct import StructColumn from cudf.core.dtypes import IntervalDtype if TYPE_CHECKING: diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 42df5123014..789c4a7f3cb 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -13,11 +13,12 @@ import pylibcudf as plc import cudf +import cudf.core.column.column as column from cudf._lib.strings.convert.convert_lists import format_list_column from cudf._lib.types import size_type_dtype from cudf.api.types import _is_non_decimal_numeric_dtype, is_scalar from cudf.core.buffer import acquire_spill_lock -from cudf.core.column import ColumnBase, as_column, column +from cudf.core.column.column import ColumnBase, as_column from cudf.core.column.methods import ColumnMethods, ParentType from cudf.core.column.numerical import NumericalColumn from cudf.core.dtypes import ListDtype diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index c8f859596b2..8ca42debb72 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -12,10 +12,13 @@ import pylibcudf import cudf +import cudf.core.column.column as column +import cudf.core.column.string as string from cudf import _lib as libcudf from cudf.api.types import is_integer, is_scalar from cudf.core._internals import unary -from cudf.core.column import ColumnBase, as_column, column, string +from cudf.core.column.column import ColumnBase, as_column +from cudf.core.column.numerical_base import NumericalBaseColumn from cudf.core.dtypes import CategoricalDtype from cudf.core.mixins import BinaryOperand from cudf.errors import MixedTypeError @@ -26,8 +29,6 @@ np_dtypes_to_pandas_dtypes, ) -from .numerical_base import NumericalBaseColumn - if TYPE_CHECKING: from collections.abc import Callable, Sequence @@ -226,7 +227,7 @@ def 
_binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: # If `other` is a Python integer and it is out-of-bounds # promotion could fail but we can trivially define the result # in terms of `notnull` or `NULL_NOT_EQUALS`. - if type(other) is int and self.dtype.kind in "iu": # noqa: E721 + if type(other) is int and self.dtype.kind in "iu": truthiness = None iinfo = np.iinfo(self.dtype) if iinfo.min > other: diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py index 6d639337401..ea242e34edb 100644 --- a/python/cudf/cudf/core/column/numerical_base.py +++ b/python/cudf/cudf/core/column/numerical_base.py @@ -12,7 +12,7 @@ import cudf from cudf import _lib as libcudf from cudf.core.buffer import Buffer, acquire_spill_lock -from cudf.core.column import ColumnBase +from cudf.core.column.column import ColumnBase from cudf.core.missing import NA from cudf.core.mixins import Scannable diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index fa5f0dd99fa..76d67585609 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -17,13 +17,14 @@ import cudf import cudf.api.types +import cudf.core.column.column as column +import cudf.core.column.datetime as datetime from cudf import _lib as libcudf from cudf._lib import string_casting as str_cast, strings as libstrings from cudf._lib.column import Column from cudf._lib.types import size_type_dtype from cudf.api.types import is_integer, is_scalar, is_string_dtype from cudf.core.buffer import acquire_spill_lock -from cudf.core.column import column, datetime from cudf.core.column.column import ColumnBase from cudf.core.column.methods import ColumnMethods from cudf.utils.docutils import copy_docstring @@ -548,7 +549,7 @@ def join( 2 3 c-d dtype: object - """ # noqa E501 + """ if sep is None: sep = "" @@ -694,7 +695,7 @@ def extract( The `flags` parameter currently only supports re.DOTALL and re.MULTILINE. - """ # noqa W605 + """ if not _is_supported_regex_flags(flags): raise NotImplementedError( "unsupported value for `flags` parameter" @@ -830,7 +831,7 @@ def contains( value is set. The `flags` parameter currently only supports re.DOTALL and re.MULTILINE. - """ # noqa W605 + """ if na is not np.nan: raise NotImplementedError("`na` parameter is not yet supported") if regex and isinstance(pat, re.Pattern): @@ -3675,7 +3676,7 @@ def count(self, pat: str, flags: int = 0) -> SeriesOrIndex: - Some characters need to be escaped when passing in pat. e.g. ``'$'`` has a special meaning in regex and must be escaped when finding this literal character. 
- """ # noqa W605 + """ if isinstance(pat, re.Pattern): flags = pat.flags & ~re.U pat = pat.pattern @@ -6160,7 +6161,7 @@ def find_and_replace( to_replace_col = column.as_column(to_replace) replacement_col = column.as_column(replacement) - if type(to_replace_col) != type(replacement_col): + if type(to_replace_col) is not type(replacement_col): raise TypeError( f"to_replace and value should be of same types," f"got to_replace dtype: {to_replace_col.dtype} and " diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py index 2adc6b54bab..db6ad72ab56 100644 --- a/python/cudf/cudf/core/column/struct.py +++ b/python/cudf/cudf/core/column/struct.py @@ -8,7 +8,7 @@ import pyarrow as pa import cudf -from cudf.core.column import ColumnBase +from cudf.core.column.column import ColumnBase from cudf.core.column.methods import ColumnMethods from cudf.core.dtypes import StructDtype from cudf.core.missing import NA diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 620fe31c30f..ccc9ef2b3f6 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -11,11 +11,13 @@ import pyarrow as pa import cudf +import cudf.core.column.column as column +import cudf.core.column.string as string from cudf import _lib as libcudf from cudf.api.types import is_scalar from cudf.core._internals import unary from cudf.core.buffer import Buffer, acquire_spill_lock -from cudf.core.column import ColumnBase, column, string +from cudf.core.column.column import ColumnBase from cudf.utils.dtypes import np_to_pa_dtype from cudf.utils.utils import ( _all_bools_with_nulls, @@ -468,7 +470,7 @@ def components(self) -> dict[str, ColumnBase]: 2 13000 10 12 48 712 0 0 3 0 0 35 35 656 0 0 4 37 13 12 14 234 0 0 - """ # noqa: E501 + """ date_meta = { "seconds": ["m", "s"], diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index 496e86ed709..e4fd82e819b 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -49,7 +49,7 @@ def from_zip(cls, data: abc.Iterator): def __getitem__(self, key): """Recursively apply dict.__getitem__ for nested elements.""" # As described in the pandas docs - # https://pandas.pydata.org/pandas-docs/stable/user_guide/advanced.html#advanced-indexing-with-hierarchical-index # noqa: E501 + # https://pandas.pydata.org/pandas-docs/stable/user_guide/advanced.html#advanced-indexing-with-hierarchical-index # accessing nested elements of a multiindex must be done using a tuple. # Lists and other sequences are treated as accessing multiple elements # at the top level of the index. 
@@ -62,10 +62,10 @@ def _to_flat_dict_inner(d: dict, parents: tuple = ()):
     for k, v in d.items():
         if not isinstance(v, d.__class__):
             if parents:
-                k = parents + (k,)
+                k = (*parents, k)
             yield (k, v)
         else:
-            yield from _to_flat_dict_inner(d=v, parents=parents + (k,))
+            yield from _to_flat_dict_inner(d=v, parents=(*parents, k))
 
 
 class ColumnAccessor(abc.MutableMapping):
diff --git a/python/cudf/cudf/core/cut.py b/python/cudf/cudf/core/cut.py
index a4d12cfc7f0..5bfea45a946 100644
--- a/python/cudf/cudf/core/cut.py
+++ b/python/cudf/cudf/core/cut.py
@@ -149,7 +149,7 @@ def cut(
     if len(set(bins)) is not len(bins):
         if duplicates == "raise":
             raise ValueError(
-                f"Bin edges must be unique: {repr(bins)}.\n"
+                f"Bin edges must be unique: {bins!r}.\n"
                 f"You can drop duplicate edges by setting the 'duplicates'"
                 "kwarg"
             )
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index b58ab13be93..fa8d517a9ef 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -13,7 +13,13 @@
 import textwrap
 import warnings
 from collections import abc, defaultdict
-from collections.abc import Callable, Iterator, MutableMapping
+from collections.abc import (
+    Callable,
+    Hashable,
+    Iterator,
+    MutableMapping,
+    Sequence,
+)
 from typing import TYPE_CHECKING, Any, Literal, cast
 
 import cupy
@@ -1131,7 +1137,7 @@ def _from_data(
         data: MutableMapping,
         index: BaseIndex | None = None,
         columns: Any = None,
-    ) -> DataFrame:
+    ) -> Self:
         out = super()._from_data(data=data, index=index)
         if columns is not None:
             out.columns = columns
@@ -2242,7 +2248,7 @@ def from_dict(
            n1  n2
         a  b    1   3
            c    2   4
-        """  # noqa: E501
+        """
         orient = orient.lower()
 
         if orient == "index":
@@ -2399,7 +2405,7 @@ def to_dict(
         >>> df.to_dict('records', into=dd)
         [defaultdict(<class 'list'>, {'col1': 1, 'col2': 0.5}),
          defaultdict(<class 'list'>, {'col1': 2, 'col2': 0.75})]
-        """  # noqa: E501
+        """
         orient = orient.lower()
 
         if orient == "series":
@@ -3027,7 +3033,7 @@ def set_index(
         if len(keys) == 0:
             raise ValueError("No valid columns to be added to index.")
         if append:
-            keys = [self.index] + keys
+            keys = [self.index, *keys]
 
         # Preliminary type check
         labels_not_found = []
@@ -3093,7 +3099,7 @@ def set_index(
     @_performance_tracking
     def fillna(
         self, value=None, method=None, axis=None, inplace=False, limit=None
-    ):  # noqa: D102
+    ):
         if isinstance(value, (pd.Series, pd.DataFrame)):
             value = cudf.from_pandas(value)
         if isinstance(value, cudf.Series):
@@ -3574,7 +3580,7 @@ def drop_duplicates(
         1    Yum Yum   cup     4.0
         2    Indomie   cup     3.5
         4    Indomie  pack     5.0
-        """  # noqa: E501
+        """
         outdf = super().drop_duplicates(
             subset=subset,
             keep=keep,
@@ -4854,7 +4860,7 @@ def map(
 
         if na_action not in {"ignore", None}:
             raise ValueError(
-                f"na_action must be 'ignore' or None. Got {repr(na_action)}"
+                f"na_action must be 'ignore' or None. Got {na_action!r}"
             )
 
         if na_action == "ignore":
@@ -5727,7 +5733,7 @@ def to_arrow(self, preserve_index=None) -> pa.Table:
         """
         data = self
 
-        index_descr = []
+        index_descr: Sequence[dict[str, Any]] | Sequence[str] = []
         write_index = preserve_index is not False
         keep_range_index = write_index and preserve_index is None
         index = self.index
@@ -5934,7 +5940,7 @@ def _from_arrays(
         index=None,
         columns=None,
         nan_as_null=False,
-    ):
+    ) -> Self:
         """
         Convert an object implementing an array interface to DataFrame.
@@ -5987,6 +5993,12 @@ def _from_arrays( raise ValueError("Duplicate column names are not allowed") names = columns + # Mapping/MutableMapping are invariant in the key type, so + # dict[int, ColumnBase] (the inferred type of ca_data) is not + # a valid type to pass to a function accepting + # Mapping[Hashable, ColumnBase] even though int is Hashable. + # See: https://github.com/python/typing/issues/445 + ca_data: dict[Hashable, ColumnBase] if array_data.ndim == 2: ca_data = { k: column.as_column(array_data[:, i], nan_as_null=nan_as_null) @@ -6133,7 +6145,7 @@ def quantile( non-numeric types and result is expected to be a Series in case of Pandas. cuDF will return a DataFrame as it doesn't support mixed types under Series. - """ # noqa: E501 + """ if axis not in (0, None): raise NotImplementedError("axis is not implemented yet") @@ -6832,7 +6844,7 @@ def select_dtypes(self, include=None, exclude=None): 3 False 2.0 4 True 1.0 5 False 2.0 - """ # noqa: E501 + """ # code modified from: # https://github.com/pandas-dev/pandas/blob/master/pandas/core/frame.py#L3196 @@ -7035,7 +7047,9 @@ def to_orc( ) @_performance_tracking - def stack(self, level=-1, dropna=no_default, future_stack=False): + def stack( + self, level=-1, dropna=no_default, future_stack=False + ) -> DataFrame | Series: """Stack the prescribed level(s) from columns to index Return a reshaped DataFrame or Series having a multi-level @@ -7282,11 +7296,13 @@ def stack(self, level=-1, dropna=no_default, future_stack=False): ) if has_unnamed_levels: - unnamed_level_values = list( - map(column_name_idx.get_level_values, unnamed_levels_indices) - ) unnamed_level_values = pd.MultiIndex.from_arrays( - unnamed_level_values + list( + map( + column_name_idx.get_level_values, + unnamed_levels_indices, + ) + ) ) def unnamed_group_generator(): diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 2110e610c37..801020664da 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -237,7 +237,7 @@ def from_pandas(cls, dtype: pd.CategoricalDtype) -> "CategoricalDtype": >>> cudf_dtype = cudf.CategoricalDtype.from_pandas(pd_dtype) >>> cudf_dtype CategoricalDtype(categories=['b', 'a'], ordered=True, categories_dtype=object) - """ # noqa: E501 + """ return CategoricalDtype( categories=dtype.categories, ordered=dtype.ordered ) @@ -254,7 +254,7 @@ def to_pandas(self) -> pd.CategoricalDtype: CategoricalDtype(categories=['b', 'a'], ordered=True, categories_dtype=object) >>> dtype.to_pandas() CategoricalDtype(categories=['b', 'a'], ordered=True, categories_dtype=object) - """ # noqa: E501 + """ if self._categories is None: categories = None elif self._categories.dtype.kind == "f": @@ -399,7 +399,7 @@ def element_type(self) -> Dtype: ListDtype(float32) >>> deep_nested_type.element_type.element_type.element_type 'float32' - """ # noqa: E501 + """ if isinstance(self._typ.value_type, pa.ListType): return ListDtype.from_arrow(self._typ.value_type) elif isinstance(self._typ.value_type, pa.StructType): @@ -420,7 +420,7 @@ def leaf_type(self): ListDtype(ListDtype(ListDtype(float32))) >>> deep_nested_type.leaf_type 'float32' - """ # noqa: E501 + """ if isinstance(self.element_type, ListDtype): return self.element_type.leaf_type else: @@ -486,7 +486,7 @@ def __eq__(self, other): def __repr__(self): if isinstance(self.element_type, (ListDtype, StructDtype)): - return f"{type(self).__name__}({repr(self.element_type)})" + return f"{type(self).__name__}({self.element_type!r})" else: return 
f"{type(self).__name__}({self.element_type})" @@ -556,7 +556,7 @@ class StructDtype(_BaseDtype): >>> nested_struct_dtype = cudf.StructDtype({"dict_data": struct_dtype, "c": "uint8"}) >>> nested_struct_dtype StructDtype({'dict_data': StructDtype({'a': dtype('int64'), 'b': dtype('O')}), 'c': dtype('uint8')}) - """ # noqa: E501 + """ name = "struct" @@ -730,7 +730,7 @@ def itemsize(self): >>> decimal{size}_dtype = cudf.Decimal{size}Dtype(precision=9, scale=2) >>> decimal{size}_dtype Decimal{size}Dtype(precision=9, scale=2) - """ # noqa: E501 + """ ) @@ -743,7 +743,7 @@ def __init__(self, precision, scale=0): @property def str(self): - return f"{str(self.name)}({self.precision}, {self.scale})" + return f"{self.name!s}({self.precision}, {self.scale})" @property def precision(self): @@ -950,7 +950,7 @@ def __eq__(self, other): # This means equality isn't transitive but mimics pandas return other in (self.name, str(self)) return ( - type(self) == type(other) + type(self) is type(other) and self.subtype == other.subtype and self.closed == other.closed ) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 0c0f271fe6f..70789160cb6 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1392,7 +1392,7 @@ def argsort( >>> idx = cudf.Index([3, 1, 2]) >>> idx.argsort() array([1, 2, 0], dtype=int32) - """ # noqa: E501 + """ if na_position not in {"first", "last"}: raise ValueError(f"invalid na_position: {na_position}") if kind != "quicksort": diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index e977f037b79..29ab3b60d9d 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -1467,9 +1467,7 @@ def _iterative_groupby_apply( RuntimeWarning, ) - chunks = [ - grouped_values[s:e] for s, e in zip(offsets[:-1], offsets[1:]) - ] + chunks = [grouped_values[s:e] for s, e in itertools.pairwise(offsets)] chunk_results = [function(chk, *args) for chk in chunks] return self._post_process_chunk_results( chunk_results, group_names, group_keys, grouped_values diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index ff9cd310aef..eac04cf36ec 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1619,7 +1619,7 @@ def argsort( Returns ------- cupy.ndarray: The indices sorted based on input. 
- """ # noqa: E501 + """ return super().argsort( axis=axis, kind=kind, @@ -2218,7 +2218,7 @@ def year(self) -> Index: DatetimeIndex(['2000-12-31', '2001-12-31', '2002-12-31'], dtype='datetime64[ns]') >>> datetime_index.year Index([2000, 2001, 2002], dtype='int16') - """ # noqa: E501 + """ return Index._from_column(self._column.year, name=self.name) @property # type: ignore @@ -2237,7 +2237,7 @@ def month(self) -> Index: DatetimeIndex(['2000-01-31', '2000-02-29', '2000-03-31'], dtype='datetime64[ns]') >>> datetime_index.month Index([1, 2, 3], dtype='int16') - """ # noqa: E501 + """ return Index._from_column(self._column.month, name=self.name) @property # type: ignore @@ -2256,7 +2256,7 @@ def day(self) -> Index: DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03'], dtype='datetime64[ns]') >>> datetime_index.day Index([1, 2, 3], dtype='int16') - """ # noqa: E501 + """ return Index._from_column(self._column.day, name=self.name) @property # type: ignore @@ -2340,7 +2340,7 @@ def microsecond(self) -> Index: dtype='datetime64[ns]') >>> datetime_index.microsecond Index([0, 1, 2], dtype='int32') - """ # noqa: E501 + """ return Index._from_column( ( # Need to manually promote column to int32 because @@ -2615,7 +2615,7 @@ def ceil(self, freq: str) -> Self: ... ]) >>> gIndex.ceil("T") DatetimeIndex(['2020-05-31 08:06:00', '1999-12-31 18:41:00'], dtype='datetime64[ns]') - """ # noqa: E501 + """ return type(self)._from_column(self._column.ceil(freq), name=self.name) @_performance_tracking @@ -2646,7 +2646,7 @@ def floor(self, freq: str) -> Self: ... ]) >>> gIndex.floor("T") DatetimeIndex(['2020-05-31 08:59:00', '1999-12-31 18:44:00'], dtype='datetime64[ns]') - """ # noqa: E501 + """ return type(self)._from_column( self._column.floor(freq), name=self.name ) @@ -2686,7 +2686,7 @@ def round(self, freq: str) -> Self: DatetimeIndex(['2001-01-01', '2001-01-01', '2001-01-01'], dtype='datetime64[ns]') >>> dt_idx.round('T') DatetimeIndex(['2001-01-01 00:05:00', '2001-01-01 00:05:00', '2001-01-01 00:05:00'], dtype='datetime64[ns]') - """ # noqa: E501 + """ return type(self)._from_column( self._column.round(freq), name=self.name ) @@ -2737,7 +2737,7 @@ def tz_localize( ``ambiguous`` and ``nonexistent`` arguments. Any ambiguous or nonexistent timestamps are converted to 'NaT'. - """ # noqa: E501 + """ result_col = self._column.tz_localize(tz, ambiguous, nonexistent) return DatetimeIndex._from_column( result_col, name=self.name, freq=self._freq @@ -2774,7 +2774,7 @@ def tz_convert(self, tz: str | None) -> Self: '2018-03-02 14:00:00+00:00', '2018-03-03 14:00:00+00:00'], dtype='datetime64[ns, Europe/London]') - """ # noqa: E501 + """ result_col = self._column.tz_convert(tz) return DatetimeIndex._from_column(result_col, name=self.name) @@ -3118,7 +3118,7 @@ class CategoricalIndex(Index): >>> cudf.CategoricalIndex( ... 
data=[1, 2, 3, 4], dtype=pd.CategoricalDtype([1, 2, 3]), name="a") CategoricalIndex([1, 2, 3, <NA>], categories=[1, 2, 3], ordered=False, dtype='category', name='a') - """ # noqa: E501 + """ @_performance_tracking def __init__( diff --git a/python/cudf/cudf/core/indexed_frame.py index 2f8c2587937..21ac009e7ff 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -607,7 +607,7 @@ def copy(self, deep: bool = True) -> Self: ) @_performance_tracking - def equals(self, other) -> bool: # noqa: D102 + def equals(self, other) -> bool: return super().equals(other) and self.index.equals(other.index) @property @@ -5474,7 +5474,7 @@ def groupby( ), ) ) - def add(self, other, axis, level=None, fill_value=None): # noqa: D102 + def add(self, other, axis, level=None, fill_value=None): if level is not None: raise NotImplementedError("level parameter is not supported yet.") @@ -5515,7 +5515,7 @@ def add(self, other, axis, level=None, fill_value=None): # noqa: D102 ), ) ) - def radd(self, other, axis, level=None, fill_value=None): # noqa: D102 + def radd(self, other, axis, level=None, fill_value=None): if level is not None: raise NotImplementedError("level parameter is not supported yet.") @@ -5556,7 +5556,7 @@ def radd(self, other, axis, level=None, fill_value=None): # noqa: D102 ), ) ) - def subtract(self, other, axis, level=None, fill_value=None): # noqa: D102 + def subtract(self, other, axis, level=None, fill_value=None): if level is not None: raise NotImplementedError("level parameter is not supported yet.") @@ -5599,7 +5599,7 @@ def subtract(self, other, axis, level=None, fill_value=None): # noqa: D102 ), ) ) - def rsub(self, other, axis, level=None, fill_value=None): # noqa: D102 + def rsub(self, other, axis, level=None, fill_value=None): if level is not None: raise NotImplementedError("level parameter is not supported yet.") @@ -5640,7 +5640,7 @@ def rsub(self, other, axis, level=None, fill_value=None): # noqa: D102 ), ) ) - def multiply(self, other, axis, level=None, fill_value=None): # noqa: D102 + def multiply(self, other, axis, level=None, fill_value=None): if level is not None: raise NotImplementedError("level parameter is not supported yet.") @@ -5683,7 +5683,7 @@ def multiply(self, other, axis, level=None, fill_value=None): # noqa: D102 ), ) ) - def rmul(self, other, axis, level=None, fill_value=None): # noqa: D102 + def rmul(self, other, axis, level=None, fill_value=None): if level is not None: raise NotImplementedError("level parameter is not supported yet.") @@ -5724,7 +5724,7 @@ def rmul(self, other, axis, level=None, fill_value=None): # noqa: D102 ), ) ) - def mod(self, other, axis, level=None, fill_value=None): # noqa: D102 + def mod(self, other, axis, level=None, fill_value=None): if level is not None: raise NotImplementedError("level parameter is not supported yet.") @@ -5765,7 +5765,7 @@ def mod(self, other, axis, level=None, fill_value=None): # noqa: D102 ), ) ) - def rmod(self, other, axis, level=None, fill_value=None): # noqa: D102 + def rmod(self, other, axis, level=None, fill_value=None): if level is not None: raise NotImplementedError("level parameter is not supported yet.") @@ -5806,7 +5806,7 @@ def rmod(self, other, axis, level=None, fill_value=None): # noqa: D102 ), ) ) - def pow(self, other, axis, level=None, fill_value=None): # noqa: D102 + def pow(self, other, axis, level=None, fill_value=None): if level is not None: raise NotImplementedError("level parameter is not supported yet.") @@
-5847,7 +5847,7 @@ def pow(self, other, axis, level=None, fill_value=None): # noqa: D102 ), ) ) - def rpow(self, other, axis, level=None, fill_value=None): # noqa: D102 + def rpow(self, other, axis, level=None, fill_value=None): if level is not None: raise NotImplementedError("level parameter is not supported yet.") @@ -5888,7 +5888,7 @@ def rpow(self, other, axis, level=None, fill_value=None): # noqa: D102 ), ) ) - def floordiv(self, other, axis, level=None, fill_value=None): # noqa: D102 + def floordiv(self, other, axis, level=None, fill_value=None): if level is not None: raise NotImplementedError("level parameter is not supported yet.") @@ -5929,7 +5929,7 @@ def floordiv(self, other, axis, level=None, fill_value=None): # noqa: D102 ), ) ) - def rfloordiv(self, other, axis, level=None, fill_value=None): # noqa: D102 + def rfloordiv(self, other, axis, level=None, fill_value=None): if level is not None: raise NotImplementedError("level parameter is not supported yet.") @@ -5970,7 +5970,7 @@ def rfloordiv(self, other, axis, level=None, fill_value=None): # noqa: D102 ), ) ) - def truediv(self, other, axis, level=None, fill_value=None): # noqa: D102 + def truediv(self, other, axis, level=None, fill_value=None): if level is not None: raise NotImplementedError("level parameter is not supported yet.") @@ -6015,7 +6015,7 @@ def truediv(self, other, axis, level=None, fill_value=None): # noqa: D102 ), ) ) - def rtruediv(self, other, axis, level=None, fill_value=None): # noqa: D102 + def rtruediv(self, other, axis, level=None, fill_value=None): if level is not None: raise NotImplementedError("level parameter is not supported yet.") @@ -6059,7 +6059,7 @@ def rtruediv(self, other, axis, level=None, fill_value=None): # noqa: D102 ), ) ) - def eq(self, other, axis="columns", level=None, fill_value=None): # noqa: D102 + def eq(self, other, axis="columns", level=None, fill_value=None): return self._binaryop( other=other, op="__eq__", fill_value=fill_value, can_reindex=True ) @@ -6099,7 +6099,7 @@ def eq(self, other, axis="columns", level=None, fill_value=None): # noqa: D102 ), ) ) - def ne(self, other, axis="columns", level=None, fill_value=None): # noqa: D102 + def ne(self, other, axis="columns", level=None, fill_value=None): return self._binaryop( other=other, op="__ne__", fill_value=fill_value, can_reindex=True ) @@ -6139,7 +6139,7 @@ def ne(self, other, axis="columns", level=None, fill_value=None): # noqa: D102 ), ) ) - def lt(self, other, axis="columns", level=None, fill_value=None): # noqa: D102 + def lt(self, other, axis="columns", level=None, fill_value=None): return self._binaryop( other=other, op="__lt__", fill_value=fill_value, can_reindex=True ) @@ -6179,7 +6179,7 @@ def lt(self, other, axis="columns", level=None, fill_value=None): # noqa: D102 ), ) ) - def le(self, other, axis="columns", level=None, fill_value=None): # noqa: D102 + def le(self, other, axis="columns", level=None, fill_value=None): return self._binaryop( other=other, op="__le__", fill_value=fill_value, can_reindex=True ) @@ -6219,7 +6219,7 @@ def le(self, other, axis="columns", level=None, fill_value=None): # noqa: D102 ), ) ) - def gt(self, other, axis="columns", level=None, fill_value=None): # noqa: D102 + def gt(self, other, axis="columns", level=None, fill_value=None): return self._binaryop( other=other, op="__gt__", fill_value=fill_value, can_reindex=True ) @@ -6259,7 +6259,7 @@ def gt(self, other, axis="columns", level=None, fill_value=None): # noqa: D102 ), ) ) - def ge(self, other, axis="columns", level=None, 
fill_value=None): # noqa: D102 + def ge(self, other, axis="columns", level=None, fill_value=None): return self._binaryop( other=other, op="__ge__", fill_value=fill_value, can_reindex=True ) diff --git a/python/cudf/cudf/core/mixins/scans.py b/python/cudf/cudf/core/mixins/scans.py index b0f606e32e6..289fcb84d91 100644 --- a/python/cudf/cudf/core/mixins/scans.py +++ b/python/cudf/cudf/core/mixins/scans.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. from .mixin_factory import _create_delegating_mixin @@ -12,5 +12,5 @@ "cumprod", "cummin", "cummax", - }, # noqa: E231 + }, ) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 19a53af018d..173d4e1c584 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -567,7 +567,7 @@ def levels(self) -> list[cudf.Index]: names=['a', 'b']) >>> midx.levels [Index([1, 2, 3], dtype='int64', name='a'), Index([10, 11, 12], dtype='int64', name='b')] - """ # noqa: E501 + """ return [ idx.rename(name) for idx, name in zip(self._levels, self.names) ] diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 016bd1225cd..f37b44b1100 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -1013,7 +1013,7 @@ def as_tuple(x): ca = ColumnAccessor( result, multiindex=True, - level_names=(None,) + columns._column_names, + level_names=(None, *columns._column_names), verify=False, ) return cudf.DataFrame._from_data(ca, index=index_labels) diff --git a/python/cudf/cudf/core/scalar.py b/python/cudf/cudf/core/scalar.py index f6331aa1f49..80dd0921f9c 100644 --- a/python/cudf/cudf/core/scalar.py +++ b/python/cudf/cudf/core/scalar.py @@ -304,7 +304,7 @@ def __repr__(self): # https://github.com/numpy/numpy/issues/17552 return ( f"{self.__class__.__name__}" - f"({str(self.value)}, dtype={self.dtype})" + f"({self.value!s}, dtype={self.dtype})" ) def _binop_result_dtype_or_error(self, other, op): diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 95ea22b5ad5..928f3c3d666 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -517,7 +517,7 @@ def from_categorical(cls, categorical, codes=None): 3 a dtype: category Categories (3, object): ['a', 'b', 'c'] - """ # noqa: E501 + """ col = as_column(categorical) if codes is not None: codes = as_column(codes) @@ -942,7 +942,7 @@ def drop( labels, axis, index, columns, level, inplace, errors ) - def tolist(self): # noqa: D102 + def tolist(self): raise TypeError( "cuDF does not support conversion to host memory " "via the `tolist()` method. Consider using " @@ -1087,7 +1087,7 @@ def reindex( DataFrame, followed by the original Series values. When `drop` is True, a `Series` is returned. In either case, if ``inplace=True``, no value is returned. 
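The `str()` and `repr()` calls stripped out of f-strings in the hunks above and below have direct spellings as conversion flags; a quick sketch with illustrative values:

```python
# !s and !r are the built-in f-string equivalents of str() and repr().
value, element_type = 42, "int64"

assert f"({value!s})" == f"({str(value)})" == "(42)"
assert f"({element_type!r})" == f"({repr(element_type)})" == "('int64')"
```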
-""", # noqa: E501 +""", example=""" >>> series = cudf.Series(['a', 'b', 'c', 'd'], index=[10, 11, 12, 13]) >>> series @@ -1196,7 +1196,7 @@ def to_frame(self, name: abc.Hashable = no_default) -> cudf.DataFrame: 12 c 13 15 d - """ # noqa: E501 + """ return self._to_frame(name=name, index=self.index) @_performance_tracking @@ -2122,7 +2122,7 @@ def data(self): >>> np.array(series.data.memoryview()) array([1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0], dtype=uint8) - """ # noqa: E501 + """ return self._column.data @property # type: ignore @@ -4590,7 +4590,7 @@ def is_month_end(self) -> Series: 7 False 8 False dtype: bool - """ # noqa: E501 + """ return self._return_result_like_self(self.series._column.is_month_end) @property # type: ignore @@ -5169,7 +5169,7 @@ def components(self) -> cudf.DataFrame: 2 13000 10 12 48 712 0 0 3 0 0 35 35 656 0 0 4 37 13 12 14 234 0 0 - """ # noqa: E501 + """ ca = ColumnAccessor(self.series._column.components(), verify=False) return self.series._constructor_expanddim._from_data( ca, index=self.series.index diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index 0e66f383ca0..f6d0664758f 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -83,7 +83,7 @@ def name(self, value): @property # type: ignore @_performance_tracking - def ndim(self) -> int: # noqa: D401 + def ndim(self) -> int: """Number of dimensions of the underlying data, by definition 1.""" return 1 @@ -105,12 +105,12 @@ def _column(self) -> ColumnBase: @property # type: ignore @_performance_tracking - def values(self) -> cupy.ndarray: # noqa: D102 + def values(self) -> cupy.ndarray: return self._column.values @property # type: ignore @_performance_tracking - def values_host(self) -> numpy.ndarray: # noqa: D102 + def values_host(self) -> numpy.ndarray: return self._column.values_host @classmethod diff --git a/python/cudf/cudf/core/udf/masked_typing.py b/python/cudf/cudf/core/udf/masked_typing.py index 4c90c5bbba0..3a1e01caf28 100644 --- a/python/cudf/cudf/core/udf/masked_typing.py +++ b/python/cudf/cudf/core/udf/masked_typing.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
import operator @@ -50,7 +50,7 @@ SUPPORTED_NUMPY_TYPES = ( NUMERIC_TYPES | DATETIME_TYPES | TIMEDELTA_TYPES | STRING_TYPES ) -supported_type_str = "\n".join(sorted(list(SUPPORTED_NUMPY_TYPES) + ["bool"])) +supported_type_str = "\n".join(sorted([*list(SUPPORTED_NUMPY_TYPES), "bool"])) _units = ["ns", "ms", "us", "s"] _datetime_cases = {types.NPDatetime(u) for u in _units} diff --git a/python/cudf/cudf/datasets.py b/python/cudf/cudf/datasets.py index dbabaacf6b5..e8d634598f4 100644 --- a/python/cudf/cudf/datasets.py +++ b/python/cudf/cudf/datasets.py @@ -6,7 +6,7 @@ import cudf from cudf._lib.transform import bools_to_mask -__all__ = ["timeseries", "randomdata"] +__all__ = ["randomdata", "timeseries"] # TODO: diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index 750c6cec180..2382e9f12ed 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -1062,10 +1062,7 @@ def to_parquet( ) partition_info = ( - [ - (i, j - i) - for i, j in zip(partition_offsets, partition_offsets[1:]) - ] + [(i, j - i) for i, j in itertools.pairwise(partition_offsets)] if partition_offsets is not None else None ) @@ -1485,7 +1482,7 @@ def write_table(self, df): ) existing_cw_batch = defaultdict(dict) new_cw_paths = [] - partition_info = [(i, j - i) for i, j in zip(offsets, offsets[1:])] + partition_info = [(i, j - i) for i, j in itertools.pairwise(offsets)] for path, part_info, meta_path in zip( paths, diff --git a/python/cudf/cudf/options.py b/python/cudf/cudf/options.py index e206c8bca08..79a3a794af3 100644 --- a/python/cudf/cudf/options.py +++ b/python/cudf/cudf/options.py @@ -380,7 +380,7 @@ class option_context(ContextDecorator): >>> from cudf import option_context >>> with option_context('mode.pandas_compatible', True, 'default_float_bitwidth', 32): ... pass - """ # noqa: E501 + """ def __init__(self, *args) -> None: if len(args) % 2 != 0: diff --git a/python/cudf/cudf/pandas/__init__.py b/python/cudf/cudf/pandas/__init__.py index bacf1f7e77b..fec181e85d7 100644 --- a/python/cudf/cudf/pandas/__init__.py +++ b/python/cudf/cudf/pandas/__init__.py @@ -12,7 +12,7 @@ from .magics import load_ipython_extension from .profiler import Profiler -__all__ = ["Profiler", "load_ipython_extension", "install", "is_proxy_object"] +__all__ = ["Profiler", "install", "is_proxy_object", "load_ipython_extension"] LOADED = False @@ -57,7 +57,7 @@ def install(): current_mr = rmm.mr.get_current_device_resource() if not isinstance(current_mr, rmm.mr.CudaMemoryResource): warnings.warn( - f"cudf.pandas detected an already configured memory resource, ignoring 'CUDF_PANDAS_RMM_MODE'={str(rmm_mode)}", + f"cudf.pandas detected an already configured memory resource, ignoring 'CUDF_PANDAS_RMM_MODE'={rmm_mode!s}", UserWarning, ) return diff --git a/python/cudf/cudf/pandas/__main__.py b/python/cudf/cudf/pandas/__main__.py index e0d3d9101a9..619ee822a54 100644 --- a/python/cudf/cudf/pandas/__main__.py +++ b/python/cudf/cudf/pandas/__main__.py @@ -96,7 +96,7 @@ def main(): (module,) = args.module # run the module passing the remaining arguments # as if it were run with python -m - sys.argv[:] = [module] + args.args # not thread safe? + sys.argv[:] = [module, *args.args] # not thread safe? 
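The `zip(offsets[:-1], offsets[1:])` rewrites above lean on `itertools.pairwise` (available since Python 3.10) yielding the same successive pairs without materializing two throwaway slices; a small sketch with an illustrative offsets list:

```python
import itertools

offsets = [0, 3, 7, 12]  # illustrative partition offsets

# Old form: pair each offset with its successor via slicing.
old = [(i, j - i) for i, j in zip(offsets[:-1], offsets[1:])]

# New form: identical (offset, length) pairs, one pass, no copies.
new = [(i, j - i) for i, j in itertools.pairwise(offsets)]

assert old == new == [(0, 3), (3, 4), (7, 5)]
```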
runpy.run_module(module, run_name="__main__") elif len(args.args) >= 1: # Remove ourself from argv and continue diff --git a/python/cudf/cudf/pandas/_wrappers/pandas.py b/python/cudf/cudf/pandas/_wrappers/pandas.py index 05e7d159c63..e763875adb8 100644 --- a/python/cudf/cudf/pandas/_wrappers/pandas.py +++ b/python/cudf/cudf/pandas/_wrappers/pandas.py @@ -77,8 +77,8 @@ def _pandas_util_dir(): # https://github.com/pandas-dev/pandas/blob/2.2.x/pandas/util/__init__.py res = list( set( - list(importlib.import_module("pandas.util").__dict__.keys()) - + [ + [ + *list(importlib.import_module("pandas.util").__dict__.keys()), "Appender", "Substitution", "_exceptions", @@ -219,7 +219,7 @@ def Timestamp_Timedelta__new__(cls, *args, **kwargs): def _DataFrame__dir__(self): # Column names that are string identifiers are added to the dir of the # DataFrame - # See https://github.com/pandas-dev/pandas/blob/43691a2f5d235b08f0f3aa813d8fdcb7c4ce1e47/pandas/core/indexes/base.py#L878 # noqa: E501 + # See https://github.com/pandas-dev/pandas/blob/43691a2f5d235b08f0f3aa813d8fdcb7c4ce1e47/pandas/core/indexes/base.py#L878 _pd_df_dir = dir(pd.DataFrame) return _pd_df_dir + [ colname diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py index 40893ee2614..d32d388b975 100644 --- a/python/cudf/cudf/pandas/fast_slow_proxy.py +++ b/python/cudf/cudf/pandas/fast_slow_proxy.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. # noqa: E501 +# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. # All rights reserved. # SPDX-License-Identifier: Apache-2.0 @@ -247,7 +247,7 @@ def _fsproxy_state(self) -> _State: if metaclasses: metaclass = types.new_class( # type: ignore f"{name}_Meta", - metaclasses + (_FastSlowProxyMeta,), + (*metaclasses, _FastSlowProxyMeta), {}, ) cls = types.new_class( @@ -1301,7 +1301,7 @@ def _replace_closurevars( return functools.update_wrapper( g, f, - assigned=functools.WRAPPER_ASSIGNMENTS + ("__kwdefaults__",), + assigned=(*functools.WRAPPER_ASSIGNMENTS, "__kwdefaults__"), ) diff --git a/python/cudf/cudf/pandas/scripts/analyze-test-failures.py b/python/cudf/cudf/pandas/scripts/analyze-test-failures.py index bb2fc00d9fc..e4ee0ce1ca4 100644 --- a/python/cudf/cudf/pandas/scripts/analyze-test-failures.py +++ b/python/cudf/cudf/pandas/scripts/analyze-test-failures.py @@ -41,7 +41,7 @@ def count_failures(log_file_name, pattern): PANDAS_TEST_PREFIX ) if fnmatch(line_module_name, pattern): - if "longrepr" in line and line["longrepr"]: + if line.get("longrepr"): if isinstance(line["longrepr"], (tuple, list)): message = line["longrepr"][2].splitlines()[0] elif isinstance(line["longrepr"], str): diff --git a/python/cudf/cudf/testing/dataset_generator.py b/python/cudf/cudf/testing/dataset_generator.py index 99b686406fb..01a75a2efb0 100644 --- a/python/cudf/cudf/testing/dataset_generator.py +++ b/python/cudf/cudf/testing/dataset_generator.py @@ -237,9 +237,9 @@ def generate( def get_dataframe(parameters, use_threads): # Initialize seeds if parameters.seed is not None: - rng = np.random.default_rng(seed=parameters.seed) # noqa: F841 + rng = np.random.default_rng(seed=parameters.seed) else: - rng = np.random.default_rng(seed=0) # noqa: F841 + rng = np.random.default_rng(seed=0) # For each column, invoke the data generator for column_params in parameters.column_parameters: diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py index 8d342f8e6c6..0b09cf7dc34 
100644 --- a/python/cudf/cudf/testing/testing.py +++ b/python/cudf/cudf/testing/testing.py @@ -77,7 +77,7 @@ def _check_types( ): return - if type(left) != type(right): + if type(left) is not type(right): raise_assert_detail( obj, "Class types are different", f"{type(left)}", f"{type(right)}" ) @@ -149,7 +149,7 @@ def assert_column_equal( ): pass else: - if type(left) != type(right) or left.dtype != right.dtype: + if type(left) is not type(right) or left.dtype != right.dtype: msg1 = f"{left.dtype}" msg2 = f"{right.dtype}" raise_assert_detail(obj, "Dtypes are different", msg1, msg2) diff --git a/python/cudf/cudf/tests/series/test_datetimelike.py b/python/cudf/cudf/tests/series/test_datetimelike.py index 691da224f44..81ba61b31dc 100644 --- a/python/cudf/cudf/tests/series/test_datetimelike.py +++ b/python/cudf/cudf/tests/series/test_datetimelike.py @@ -57,7 +57,7 @@ def test_localize_ambiguous(request, unit, zone_name): request.applymarker( pytest.mark.xfail( condition=(zone_name == "America/Metlakatla"), - reason="https://www.timeanddate.com/news/time/metlakatla-quits-dst.html", # noqa: E501 + reason="https://www.timeanddate.com/news/time/metlakatla-quits-dst.html", ) ) s = cudf.Series( @@ -83,7 +83,7 @@ def test_localize_nonexistent(request, unit, zone_name): request.applymarker( pytest.mark.xfail( condition=(zone_name == "America/Grand_Turk"), - reason="https://www.worldtimezone.com/dst_news/dst_news_turkscaicos03.html", # noqa: E501 + reason="https://www.worldtimezone.com/dst_news/dst_news_turkscaicos03.html", ) ) s = cudf.Series( diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py index 71b6bbd688d..0712a0de635 100644 --- a/python/cudf/cudf/tests/test_binops.py +++ b/python/cudf/cudf/tests/test_binops.py @@ -167,11 +167,11 @@ _operators_arithmetic = _operators_arithmetic[:1] _operators_comparison = _operators_comparison[:1] _cudf_scalar_reflected_ops = _cudf_scalar_reflected_ops[:1] - DATETIME_TYPES = {"datetime64[ms]"} # noqa: F811 - NUMERIC_TYPES = {"float32"} # noqa: F811 - FLOAT_TYPES = {"float64"} # noqa: F811 - INTEGER_TYPES = {"int16"} # noqa: F811 - TIMEDELTA_TYPES = {"timedelta64[s]"} # noqa: F811 + DATETIME_TYPES = {"datetime64[ms]"} + NUMERIC_TYPES = {"float32"} + FLOAT_TYPES = {"float64"} + INTEGER_TYPES = {"int16"} + TIMEDELTA_TYPES = {"timedelta64[s]"} # To save time, we skip tests marked "pytest.mark.xfail" pytest_xfail = pytest.mark.skipif @@ -444,7 +444,7 @@ def test_str_series_compare_num_reflected( @pytest.mark.parametrize("obj_class", ["Series", "Index"]) @pytest.mark.parametrize("nelem", [1, 2, 100]) @pytest.mark.parametrize("cmpop", _cmpops) -@pytest.mark.parametrize("dtype", utils.NUMERIC_TYPES + ["datetime64[ms]"]) +@pytest.mark.parametrize("dtype", [*utils.NUMERIC_TYPES, "datetime64[ms]"]) @pytest.mark.parametrize("use_cudf_scalar", [True, False]) def test_series_compare_scalar( nelem, cmpop, obj_class, dtype, use_cudf_scalar diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py index db41f689255..db24fdd2a29 100644 --- a/python/cudf/cudf/tests/test_categorical.py +++ b/python/cudf/cudf/tests/test_categorical.py @@ -98,7 +98,7 @@ def test_categorical_compare_unordered(): # test equal out = sr == sr assert out.dtype == np.bool_ - assert type(out[0]) == np.bool_ + assert type(out[0]) is np.bool_ assert np.all(out.to_numpy()) assert np.all(pdsr == pdsr) @@ -134,7 +134,7 @@ def test_categorical_compare_ordered(): # test equal out = sr1 == sr1 assert out.dtype == np.bool_ - assert 
type(out[0]) == np.bool_ + assert type(out[0]) is np.bool_ assert np.all(out.to_numpy()) assert np.all(pdsr1 == pdsr1) @@ -768,7 +768,7 @@ def test_categorical_setitem_with_nan(): assert_eq(gs, expected_series) -@pytest.mark.parametrize("dtype", list(NUMERIC_TYPES) + ["object"]) +@pytest.mark.parametrize("dtype", [*list(NUMERIC_TYPES), "object"]) @pytest.mark.parametrize("input_obj", [[1, cudf.NA, 3]]) def test_series_construction_with_nulls(input_obj, dtype): dtype = cudf.dtype(dtype) diff --git a/python/cudf/cudf/tests/test_concat.py b/python/cudf/cudf/tests/test_concat.py index ab0f1767cd6..f57f256d55c 100644 --- a/python/cudf/cudf/tests/test_concat.py +++ b/python/cudf/cudf/tests/test_concat.py @@ -625,7 +625,7 @@ def test_concat_series_dataframe_input_str(objs): ) @pytest.mark.parametrize("ignore_index", [True, False]) def test_concat_empty_dataframes(df, other, ignore_index): - other_pd = [df] + other + other_pd = [df, *other] gdf = cudf.from_pandas(df) other_gd = [gdf] + [cudf.from_pandas(o) for o in other] @@ -1224,7 +1224,7 @@ def test_concat_join_empty_dataframes( request, df, other, ignore_index, join, sort ): axis = 0 - other_pd = [df] + other + other_pd = [df, *other] gdf = cudf.from_pandas(df) other_gd = [gdf] + [cudf.from_pandas(o) for o in other] @@ -1312,7 +1312,7 @@ def test_concat_join_empty_dataframes_axis_1( df, other, ignore_index, axis, join, sort ): # no duplicate columns - other_pd = [df] + other + other_pd = [df, *other] gdf = cudf.from_pandas(df) other_gd = [gdf] + [cudf.from_pandas(o) for o in other] diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index ac772c47e3a..e18112d03ea 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -916,10 +916,10 @@ def test_csv_reader_nrows(tmpdir): str(fname), dtype=dtypes, skiprows=skip_rows + 1, nrows=read_rows ) assert df.shape == (read_rows, 2) - assert str(skip_rows) in list(df)[0] + assert str(skip_rows) in next(iter(df)) assert str(2 * skip_rows) in list(df)[1] for row in range(0, read_rows // sample_skip, sample_skip): - assert df[list(df)[0]][row] == row + skip_rows + 1 + assert df[next(iter(df))][row] == row + skip_rows + 1 assert df[list(df)[1]][row] == 2 * (row + skip_rows + 1) assert df[list(df)[1]][read_rows - 1] == 2 * (read_rows + skip_rows) diff --git a/python/cudf/cudf/tests/test_cuda_array_interface.py b/python/cudf/cudf/tests/test_cuda_array_interface.py index 29f2f46e3c7..381ca45de31 100644 --- a/python/cudf/cudf/tests/test_cuda_array_interface.py +++ b/python/cudf/cudf/tests/test_cuda_array_interface.py @@ -170,7 +170,7 @@ def test_column_from_ephemeral_cupy_try_lose_reference(): # CuPy array a = cudf.Series(cupy.asarray([1, 2, 3]))._column a = cudf.core.column.as_column(a) - b = cupy.asarray([1, 1, 1]) # noqa: F841 + b = cupy.asarray([1, 1, 1]) assert_eq(pd.Index([1, 2, 3]), a.to_pandas()) a = cudf.Series(cupy.asarray([1, 2, 3]))._column diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 509ee0d65a5..d04fd97dcbd 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -56,9 +56,9 @@ # If spilling is enabled globally, we skip many test permutations # to reduce running time. 
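The `type(...) == ...` comparisons rewritten to `is` throughout these tests follow ruff's E721: classes are singleton objects, so identity is the precise check, and unlike `==` it cannot be intercepted by a metaclass. A self-contained sketch (the `Weird` class is illustrative):

```python
import numpy as np

out = np.bool_(True)
assert type(out) is np.bool_  # identity: the very same class object


class AnythingEqual(type):
    # A metaclass can make == lie; `is` cannot be overridden.
    def __eq__(cls, other):
        return True

    __hash__ = type.__hash__


class Weird(metaclass=AnythingEqual):
    pass


assert type(Weird()) == int      # misleadingly "equal"
assert type(Weird()) is not int  # identity reports the truth
```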
if get_global_manager() is not None: - ALL_TYPES = ["float32"] # noqa: F811 - DATETIME_TYPES = ["datetime64[ms]"] # noqa: F811 - NUMERIC_TYPES = ["float32"] # noqa: F811 + ALL_TYPES = ["float32"] + DATETIME_TYPES = ["datetime64[ms]"] + NUMERIC_TYPES = ["float32"] # To save time, we skip tests marked "xfail" pytest_xfail = pytest.mark.skipif @@ -452,8 +452,8 @@ def test_dataframe_basic(): df = cudf.concat([df, df2]) assert len(df) == 11 - hkeys = np.asarray(np.arange(10, dtype=np.float64).tolist() + [123]) - hvals = np.asarray(rnd_vals.tolist() + [321]) + hkeys = np.asarray([*np.arange(10, dtype=np.float64).tolist(), 123]) + hvals = np.asarray([*rnd_vals.tolist(), 321]) np.testing.assert_equal(df["keys"].to_numpy(), hkeys) np.testing.assert_equal(df["vals"].to_numpy(), hvals) @@ -1118,7 +1118,7 @@ def test_dataframe_to_string_wide(monkeypatch): 1 1 1 1 1 1 1 1 1 ... 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 ... 2 2 2 2 2 2 2 2 - [3 rows x 100 columns]""" # noqa: E501 + [3 rows x 100 columns]""" ) assert got == expect @@ -2197,7 +2197,7 @@ def test_dataframe_shape_empty(): @pytest.mark.parametrize("num_cols", [1, 2, 10]) @pytest.mark.parametrize("num_rows", [1, 2, 20]) -@pytest.mark.parametrize("dtype", dtypes + ["object"]) +@pytest.mark.parametrize("dtype", [*dtypes, "object"]) @pytest.mark.parametrize("nulls", ["none", "some", "all"]) def test_dataframe_transpose(nulls, num_cols, num_rows, dtype): # In case of `bool` dtype: pandas <= 1.2.5 type-casts @@ -2842,7 +2842,7 @@ def test_arrow_round_trip(preserve_index, index): assert_eq(gdf_out, pdf_out) -@pytest.mark.parametrize("dtype", NUMERIC_TYPES + ["bool"]) +@pytest.mark.parametrize("dtype", [*NUMERIC_TYPES, "bool"]) def test_cuda_array_interface(dtype): np_data = np.arange(10).astype(dtype) cupy_data = cupy.array(np_data) @@ -3707,7 +3707,7 @@ def test_sort_index_axis_1_ignore_index_true_columnaccessor_state_names(): assert result._data.names == tuple(result._data.keys()) -@pytest.mark.parametrize("dtype", dtypes + ["category"]) +@pytest.mark.parametrize("dtype", [*dtypes, "category"]) def test_dataframe_0_row_dtype(dtype): if dtype == "category": data = pd.Series(["a", "b", "c", "d", "e"], dtype="category") @@ -7910,10 +7910,10 @@ def test_dataframe_concat_dataframe_lists(df, other, sort, ignore_index): with _hide_concat_empty_dtype_warning(): expected = pd.concat( - [pdf] + other_pd, sort=sort, ignore_index=ignore_index + [pdf, *other_pd], sort=sort, ignore_index=ignore_index ) actual = cudf.concat( - [gdf] + other_gd, sort=sort, ignore_index=ignore_index + [gdf, *other_gd], sort=sort, ignore_index=ignore_index ) # In some cases, Pandas creates an empty Index([], dtype="object") for @@ -8026,10 +8026,10 @@ def test_dataframe_concat_lists(df, other, sort, ignore_index): with _hide_concat_empty_dtype_warning(): expected = pd.concat( - [pdf] + other_pd, sort=sort, ignore_index=ignore_index + [pdf, *other_pd], sort=sort, ignore_index=ignore_index ) actual = cudf.concat( - [gdf] + other_gd, sort=sort, ignore_index=ignore_index + [gdf, *other_gd], sort=sort, ignore_index=ignore_index ) if expected.shape != df.shape: @@ -10892,7 +10892,7 @@ def test_dataframe_from_ndarray_dup_columns(): @pytest.mark.parametrize("contains", ["a", 0, None, np.nan, cudf.NA]) @pytest.mark.parametrize("other_names", [[], ["b", "c"], [1, 2]]) def test_dataframe_contains(name, contains, other_names): - column_names = [name] + other_names + column_names = [name, *other_names] gdf = cudf.DataFrame({c: [0] for c in column_names}) pdf = pd.DataFrame({c: [0] for c in 
column_names}) diff --git a/python/cudf/cudf/tests/test_feather.py b/python/cudf/cudf/tests/test_feather.py index f93bd2c5d32..6a9dd4c4a66 100644 --- a/python/cudf/cudf/tests/test_feather.py +++ b/python/cudf/cudf/tests/test_feather.py @@ -16,7 +16,7 @@ @pytest.fixture(params=[0, 1, 10, 100]) def pdf(request): rng = np.random.default_rng(seed=0) - types = NUMERIC_TYPES + ["bool"] + types = [*NUMERIC_TYPES, "bool"] nrows = request.param # Create a pandas dataframe with random data of mixed types diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index e4422e204bc..eae0fd23ef8 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -917,7 +917,6 @@ def test_groupby_apply_return_col_from_df(): # tests a UDF that consists of purely colwise # ops, such as `lambda group: group.x + group.y` # which returns a column - func = lambda group: group.x + group.y # noqa:E731 df = cudf.DataFrame( { "id": range(10), @@ -1222,7 +1221,7 @@ def test_groupby_column_numeral(): pd.Series([0, 2, 0]), pd.Series([0, 2, 0], index=[0, 2, 1]), ], -) # noqa: E501 +) def test_groupby_external_series(series): pdf = pd.DataFrame({"x": [1.0, 2.0, 3.0], "y": [1, 2, 1]}) gdf = DataFrame.from_pandas(pdf) @@ -2016,8 +2015,8 @@ def test_multi_agg(): @pytest.mark.parametrize( "agg", ( - list(itertools.combinations(["count", "max", "min", "nunique"], 2)) - + [ + [ + *itertools.combinations(["count", "max", "min", "nunique"], 2), {"b": "min", "c": "mean"}, {"b": "max", "c": "mean"}, {"b": "count", "c": "mean"}, diff --git a/python/cudf/cudf/tests/test_hdf.py b/python/cudf/cudf/tests/test_hdf.py index 430ed973f19..4921b7b51fc 100644 --- a/python/cudf/cudf/tests/test_hdf.py +++ b/python/cudf/cudf/tests/test_hdf.py @@ -16,7 +16,7 @@ @pytest.fixture(params=[0, 1, 10, 100]) def pdf(request): - types = set(NUMERIC_TYPES + ["datetime64[ns]"] + ["bool"]) - set( + types = set([*NUMERIC_TYPES, "datetime64[ns]", "bool"]) - set( UNSIGNED_TYPES ) typer = {"col_" + val: val for val in types} diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 24d42d9eb4c..11f6d687931 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -1274,7 +1274,7 @@ def test_index_append_list(data, other): @pytest.mark.parametrize("data", [[1, 2, 3, 4], []]) @pytest.mark.parametrize( - "dtype", NUMERIC_TYPES + ["str", "category", "datetime64[ns]"] + "dtype", [*NUMERIC_TYPES, "str", "category", "datetime64[ns]"] ) @pytest.mark.parametrize("name", [1, "a", None]) def test_index_basic(data, dtype, name): @@ -1399,7 +1399,7 @@ def test_multiindex_append(data, other): @pytest.mark.parametrize("data", [[1, 2, 3, 4], []]) @pytest.mark.parametrize( - "dtype", NUMERIC_TYPES + ["str", "category", "datetime64[ns]"] + "dtype", [*NUMERIC_TYPES, "str", "category", "datetime64[ns]"] ) def test_index_empty(data, dtype): pdi = pd.Index(data, dtype=dtype) @@ -1410,7 +1410,7 @@ def test_index_empty(data, dtype): @pytest.mark.parametrize("data", [[1, 2, 3, 4], []]) @pytest.mark.parametrize( - "dtype", NUMERIC_TYPES + ["str", "category", "datetime64[ns]"] + "dtype", [*NUMERIC_TYPES, "str", "category", "datetime64[ns]"] ) def test_index_size(data, dtype): pdi = pd.Index(data, dtype=dtype) @@ -1421,7 +1421,7 @@ def test_index_size(data, dtype): @pytest.mark.parametrize("data", [[1, 2, 3, 1, 2, 3, 4], [], [1], [1, 2, 3]]) @pytest.mark.parametrize( - "dtype", NUMERIC_TYPES + ["str", "category", "datetime64[ns]"] + "dtype", 
[*NUMERIC_TYPES, "str", "category", "datetime64[ns]"] ) def test_index_drop_duplicates(data, dtype): pdi = pd.Index(data, dtype=dtype) @@ -1437,7 +1437,7 @@ def test_dropna_bad_how(): @pytest.mark.parametrize("data", [[1, 2, 3, 1, 2, 3, 4], []]) @pytest.mark.parametrize( - "dtype", NUMERIC_TYPES + ["str", "category", "datetime64[ns]"] + "dtype", [*NUMERIC_TYPES, "str", "category", "datetime64[ns]"] ) def test_index_tolist(data, dtype): gdi = cudf.Index(data, dtype=dtype) @@ -1455,7 +1455,7 @@ def test_index_tolist(data, dtype): @pytest.mark.parametrize("data", [[], [1], [1, 2, 3]]) @pytest.mark.parametrize( - "dtype", NUMERIC_TYPES + ["str", "category", "datetime64[ns]"] + "dtype", [*NUMERIC_TYPES, "str", "category", "datetime64[ns]"] ) def test_index_iter_error(data, dtype): gdi = cudf.Index(data, dtype=dtype) @@ -1473,7 +1473,7 @@ def test_index_iter_error(data, dtype): @pytest.mark.parametrize("data", [[], [1], [1, 2, 3, 4, 5]]) @pytest.mark.parametrize( - "dtype", NUMERIC_TYPES + ["str", "category", "datetime64[ns]"] + "dtype", [*NUMERIC_TYPES, "str", "category", "datetime64[ns]"] ) def test_index_values_host(data, dtype): gdi = cudf.Index(data, dtype=dtype) diff --git a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/test_joining.py index f6941ce7fae..f8e61651f37 100644 --- a/python/cudf/cudf/tests/test_joining.py +++ b/python/cudf/cudf/tests/test_joining.py @@ -1527,7 +1527,7 @@ def test_categorical_typecast_outer(): result = left.merge(right, how="outer", on="key") -@pytest.mark.parametrize("dtype", NUMERIC_TYPES + ["str"]) +@pytest.mark.parametrize("dtype", [*NUMERIC_TYPES, "str"]) def test_categorical_typecast_inner_one_cat(dtype): data = np.array([1, 2, 3], dtype=dtype) @@ -1538,7 +1538,7 @@ def test_categorical_typecast_inner_one_cat(dtype): assert result["key"].dtype == left["key"].dtype.categories.dtype -@pytest.mark.parametrize("dtype", NUMERIC_TYPES + ["str"]) +@pytest.mark.parametrize("dtype", [*NUMERIC_TYPES, "str"]) def test_categorical_typecast_left_one_cat(dtype): data = np.array([1, 2, 3], dtype=dtype) @@ -1549,7 +1549,7 @@ def test_categorical_typecast_left_one_cat(dtype): assert result["key"].dtype == left["key"].dtype -@pytest.mark.parametrize("dtype", NUMERIC_TYPES + ["str"]) +@pytest.mark.parametrize("dtype", [*NUMERIC_TYPES, "str"]) def test_categorical_typecast_outer_one_cat(dtype): data = np.array([1, 2, 3], dtype=dtype) diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index b48be6b2c2f..aaa8d7d07ee 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -58,12 +58,14 @@ def gdf(pdf): @pytest.fixture(params=[0, 1, 10, 100]) def gdf_writer_types(request): # datetime64[us], datetime64[ns] are unsupported due to a bug in parser - types = ( - NUMERIC_TYPES - + ["datetime64[s]", "datetime64[ms]"] - + TIMEDELTA_TYPES - + ["bool", "str"] - ) + types = [ + *NUMERIC_TYPES, + "datetime64[s]", + "datetime64[ms]", + *TIMEDELTA_TYPES, + "bool", + "str", + ] typer = {"col_" + val: val for val in types} ncols = len(types) nrows = request.param diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index 41c1c3ccb20..c4b4ef60184 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -606,7 +606,7 @@ def normalized_equals(value1, value2): def test_orc_write_statistics(tmpdir, datadir, nrows, stats_freq): from pyarrow import orc - supported_stat_types = supported_numpy_dtypes + ["str"] + supported_stat_types = 
[*supported_numpy_dtypes, "str"] # Writing bool columns to multiple row groups is disabled # until #6763 is fixed if nrows == 100000: @@ -681,7 +681,7 @@ def test_orc_write_statistics(tmpdir, datadir, nrows, stats_freq): def test_orc_chunked_write_statistics(tmpdir, datadir, nrows, stats_freq): from pyarrow import orc - supported_stat_types = supported_numpy_dtypes + ["str"] + supported_stat_types = [*supported_numpy_dtypes, "str"] # Writing bool columns to multiple row groups is disabled # until #6763 is fixed if nrows == 200000: diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 659d2ebd89a..de3636f7526 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -2313,7 +2313,7 @@ def test_parquet_writer_criteo(tmpdir): cont_names = ["I" + str(x) for x in range(1, 14)] cat_names = ["C" + str(x) for x in range(1, 27)] - cols = ["label"] + cont_names + cat_names + cols = ["label", *cont_names, *cat_names] df = cudf.read_csv(fname, sep="\t", names=cols, byte_range=(0, 1000000000)) df = df.drop(columns=cont_names) diff --git a/python/cudf/cudf/tests/test_quantiles.py b/python/cudf/cudf/tests/test_quantiles.py index 7d8303df0c3..9a2816f5444 100644 --- a/python/cudf/cudf/tests/test_quantiles.py +++ b/python/cudf/cudf/tests/test_quantiles.py @@ -90,4 +90,4 @@ def test_quantile_type_int_float(interpolation): actual = gsr.quantile(0.5, interpolation=interpolation) assert expected == actual - assert type(expected) == type(actual) + assert type(expected) is type(actual) diff --git a/python/cudf/cudf/tests/test_replace.py b/python/cudf/cudf/tests/test_replace.py index d9f4ceaf3f7..8ea0d205e8b 100644 --- a/python/cudf/cudf/tests/test_replace.py +++ b/python/cudf/cudf/tests/test_replace.py @@ -31,7 +31,7 @@ [ cudf.Series([5, 1, 2, 3, None, 243, None, 4]), cudf.Series(["one", "two", "three", None, "one"], dtype="category"), - cudf.Series(list(range(400)) + [None]), + cudf.Series([*list(range(400)), None]), ], ) @pytest.mark.parametrize( @@ -128,7 +128,7 @@ def test_series_replace(): assert_eq(a8, sr8.to_numpy()) # large input containing null - sr9 = cudf.Series(list(range(400)) + [None]) + sr9 = cudf.Series([*list(range(400)), None]) sr10 = sr9.replace([22, 323, 27, 0], None) assert sr10.null_count == 5 assert len(sr10.dropna().to_numpy()) == (401 - 5) diff --git a/python/cudf/cudf/tests/test_reshape.py b/python/cudf/cudf/tests/test_reshape.py index 53fe5f7f30d..5cebdf37c9f 100644 --- a/python/cudf/cudf/tests/test_reshape.py +++ b/python/cudf/cudf/tests/test_reshape.py @@ -28,9 +28,9 @@ # If spilling is enabled globally, we skip many test permutations # to reduce running time. 
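The repeated `seq + [extra]` to `[*seq, extra]` rewrites in these test files follow ruff's RUF005: star-unpacking builds the result in one pass and, unlike list `+`, accepts any iterable on the left. A sketch with an illustrative type collection:

```python
NUMERIC_TYPES = {"int32", "int64", "float32"}  # illustrative; a set here

# NUMERIC_TYPES + ["str"] would raise TypeError for a set, but
# unpacking works for any iterable:
dtypes = [*NUMERIC_TYPES, "str", "category"]
assert dtypes[-2:] == ["str", "category"]
assert set(dtypes[:-2]) == NUMERIC_TYPES
```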
if get_global_manager() is not None: - ALL_TYPES = ["float32"] # noqa: F811 - DATETIME_TYPES = ["datetime64[ms]"] # noqa: F811 - NUMERIC_TYPES = ["float32"] # noqa: F811 + ALL_TYPES = ["float32"] + DATETIME_TYPES = ["datetime64[ms]"] + NUMERIC_TYPES = ["float32"] # To save time, we skip tests marked "pytest.mark.xfail" pytest_xfail = pytest.mark.skipif diff --git a/python/cudf/cudf/tests/test_scalar.py b/python/cudf/cudf/tests/test_scalar.py index f2faf4343b6..fcd98831686 100644 --- a/python/cudf/cudf/tests/test_scalar.py +++ b/python/cudf/cudf/tests/test_scalar.py @@ -351,7 +351,7 @@ def test_scalar_implicit_float_conversion(value): got = float(cudf.Scalar(value)) assert expect == got - assert type(expect) == type(got) + assert type(expect) is type(got) @pytest.mark.parametrize("value", [1, -1, 1.5, 0, "1", True, False]) @@ -360,7 +360,7 @@ def test_scalar_implicit_int_conversion(value): got = int(cudf.Scalar(value)) assert expect == got - assert type(expect) == type(got) + assert type(expect) is type(got) @pytest.mark.parametrize("cls", [int, float, bool]) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index a040d1dc57f..99bd9adb034 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -282,8 +282,8 @@ def test_series_concat_list_series_with_index(data, others, ignore_index): other_ps = others other_gs = [cudf.from_pandas(obj) for obj in others] - expected = pd.concat([psr] + other_ps, ignore_index=ignore_index) - actual = cudf.concat([gsr] + other_gs, ignore_index=ignore_index) + expected = pd.concat([psr, *other_ps], ignore_index=ignore_index) + actual = cudf.concat([gsr, *other_gs], ignore_index=ignore_index) assert_eq(expected, actual) @@ -1942,7 +1942,7 @@ def test_diff_many_dtypes(data): @pytest.mark.parametrize("num_rows", [1, 100]) @pytest.mark.parametrize("num_bins", [1, 10]) @pytest.mark.parametrize("right", [True, False]) -@pytest.mark.parametrize("dtype", NUMERIC_TYPES + ["bool"]) +@pytest.mark.parametrize("dtype", [*NUMERIC_TYPES, "bool"]) @pytest.mark.parametrize("series_bins", [True, False]) def test_series_digitize(num_rows, num_bins, right, dtype, series_bins): rng = np.random.default_rng(seed=0) diff --git a/python/cudf/cudf/tests/test_setitem.py b/python/cudf/cudf/tests/test_setitem.py index 5406836ba61..6119fda0752 100644 --- a/python/cudf/cudf/tests/test_setitem.py +++ b/python/cudf/cudf/tests/test_setitem.py @@ -472,7 +472,7 @@ def test_loc_setitem_series_index_alignment_13031(other_index): ), ], ) -@pytest.mark.parametrize("arg", list(range(-20, 20)) + [5.6, 3.1]) +@pytest.mark.parametrize("arg", [*list(range(-20, 20)), 5.6, 3.1]) def test_series_set_item_range_index(ps, arg): gsr = cudf.from_pandas(ps) psr = ps.copy(deep=True) diff --git a/python/cudf/cudf/tests/test_spilling.py b/python/cudf/cudf/tests/test_spilling.py index 7af83a99d60..13d98e43ddc 100644 --- a/python/cudf/cudf/tests/test_spilling.py +++ b/python/cudf/cudf/tests/test_spilling.py @@ -669,7 +669,7 @@ def test_statistics_expose(manager: SpillManager): # Expose the first buffer buffers[0].owner.mark_exposed() assert len(manager.statistics.exposes) == 1 - stat = list(manager.statistics.exposes.values())[0] + stat = next(iter(manager.statistics.exposes.values())) assert stat.count == 1 assert stat.total_nbytes == buffers[0].nbytes assert stat.spilled_nbytes == 0 diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index 9700f548a16..bdc9e695844 100644 --- 
a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -536,8 +536,8 @@ def test_string_cat(ps_gs, others, sep, na_rep, index): assert_eq(expect, got) - expect = ps.str.cat(others=[ps.index] + [ps.index], sep=sep, na_rep=na_rep) - got = gs.str.cat(others=[gs.index] + [gs.index], sep=sep, na_rep=na_rep) + expect = ps.str.cat(others=[ps.index, ps.index], sep=sep, na_rep=na_rep) + got = gs.str.cat(others=[gs.index, gs.index], sep=sep, na_rep=na_rep) assert_eq(expect, got) diff --git a/python/cudf/cudf/tests/test_testing.py b/python/cudf/cudf/tests/test_testing.py index c3620db3880..87734ebed58 100644 --- a/python/cudf/cudf/tests/test_testing.py +++ b/python/cudf/cudf/tests/test_testing.py @@ -69,7 +69,7 @@ def test_basic_assert_index_equal( msg = str(e) if kind is not None: - if (kind == TypeError) and ( + if (kind is TypeError) and ( msg == ( "Categoricals can only be compared " diff --git a/python/cudf/cudf/tests/text/test_text_methods.py b/python/cudf/cudf/tests/text/test_text_methods.py index 47e541fdcef..3637ef075f2 100644 --- a/python/cudf/cudf/tests/text/test_text_methods.py +++ b/python/cudf/cudf/tests/text/test_text_methods.py @@ -44,7 +44,7 @@ def test_tokenize(): actual = strings.str.tokenize() - assert type(expected) == type(actual) + assert type(expected) is type(actual) assert_eq(expected, actual) @@ -71,7 +71,7 @@ def test_tokenize_delimiter(): actual = strings.str.tokenize(delimiter="o") - assert type(expected) == type(actual) + assert type(expected) is type(actual) assert_eq(expected, actual) @@ -106,7 +106,7 @@ def test_detokenize(): "the siamésé cat jumped under the sofa", ] ) - assert type(expected) == type(actual) + assert type(expected) is type(actual) assert_eq(expected, actual) indices = cudf.Series( @@ -122,7 +122,7 @@ def test_detokenize(): "the+the+the+the", ] ) - assert type(expected) == type(actual) + assert type(expected) is type(actual) assert_eq(expected, actual) @@ -150,7 +150,7 @@ def test_token_count(delimiter, expected_token_counts): actual = strings.str.token_count(delimiter) - assert type(expected) == type(actual) + assert type(expected) is type(actual) assert_eq(expected, actual, check_dtype=False) @@ -208,7 +208,7 @@ def test_tokenize_with_vocabulary(delimiter, input, default_id, results): ) actual = tokenizer.tokenize(strings, delimiter, default_id) - assert type(expected) == type(actual) + assert type(expected) is type(actual) assert_eq(expected, actual) @@ -232,7 +232,7 @@ def test_normalize_spaces(): actual = strings.str.normalize_spaces() - assert type(expected) == type(actual) + assert type(expected) is type(actual) assert_eq(expected, actual) @@ -252,7 +252,7 @@ def test_normalize_characters(): ) actual = strings.str.normalize_characters() - assert type(expected) == type(actual) + assert type(expected) is type(actual) assert_eq(expected, actual) expected = cudf.Series( @@ -266,7 +266,7 @@ def test_normalize_characters(): ] ) actual = strings.str.normalize_characters(do_lower=False) - assert type(expected) == type(actual) + assert type(expected) is type(actual) assert_eq(expected, actual) @@ -309,7 +309,7 @@ def test_ngrams(n, separator, expected_values): actual = strings.str.ngrams(n=n, separator=separator) - assert type(expected) == type(actual) + assert type(expected) is type(actual) assert_eq(expected, actual) @@ -364,7 +364,7 @@ def test_character_ngrams(n, expected_values, expected_index, as_list): actual = strings.str.character_ngrams(n=n, as_list=as_list) - assert type(expected) == type(actual) + assert 
type(expected) is type(actual) assert_eq(expected, actual) @@ -379,12 +379,12 @@ def test_hash_character_ngrams(): ] ) actual = strings.str.hash_character_ngrams(5, True) - assert type(expected) == type(actual) + assert type(expected) is type(actual) assert_eq(expected, actual) actual = strings.str.hash_character_ngrams(5) expected = expected.explode() - assert type(expected) == type(actual) + assert type(expected) is type(actual) assert_eq(expected, actual) @@ -417,7 +417,7 @@ def test_ngrams_tokenize(n, separator, expected_values): actual = strings.str.ngrams_tokenize(n=n, separator=separator) - assert type(expected) == type(actual) + assert type(expected) is type(actual) assert_eq(expected, actual) @@ -844,7 +844,7 @@ def test_porter_stemmer_measure(): actual = strings.str.porter_stemmer_measure() - assert type(expected) == type(actual) + assert type(expected) is type(actual) assert_eq(expected, actual) @@ -856,14 +856,14 @@ def test_is_vowel_consonant(): [False, False, True, False, False, False, True, False, None, False] ) actual = strings.str.is_vowel(2) - assert type(expected) == type(actual) + assert type(expected) is type(actual) assert_eq(expected, actual) expected = cudf.Series( [True, False, True, False, False, False, True, True, None, False] ) actual = strings.str.is_consonant(1) - assert type(expected) == type(actual) + assert type(expected) is type(actual) assert_eq(expected, actual) indices = cudf.Series([2, 1, 0, 0, 1, 2, 0, 3, 0, 0]) @@ -871,14 +871,14 @@ def test_is_vowel_consonant(): [False, True, False, False, True, False, True, True, None, False] ) actual = strings.str.is_vowel(indices) - assert type(expected) == type(actual) + assert type(expected) is type(actual) assert_eq(expected, actual) expected = cudf.Series( [False, False, True, True, False, True, False, False, None, False] ) actual = strings.str.is_consonant(indices) - assert type(expected) == type(actual) + assert type(expected) is type(actual) assert_eq(expected, actual) @@ -1097,5 +1097,5 @@ def test_byte_pair_encoding(separator, input, results): expected = cudf.Series([results, None, "", results]) actual = encoder(strings, separator) - assert type(expected) == type(actual) + assert type(expected) is type(actual) assert_eq(expected, actual) diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 252bb19063a..5681601d2be 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -829,7 +829,7 @@ >>> cudf.read_json(json_str, engine='cudf', lines=True, dtype={'k1':float, 'k2':cudf.ListDtype(int)}) k1 k2 0 1.0 [1] -""" # noqa: E501 +""" doc_read_json: Callable = docfmt_partial(docstring=_docstring_read_json) _docstring_to_json = """ diff --git a/python/cudf/cudf/utils/queryutils.py b/python/cudf/cudf/utils/queryutils.py index 78aeac425f7..8966789fee8 100644 --- a/python/cudf/cudf/utils/queryutils.py +++ b/python/cudf/cudf/utils/queryutils.py @@ -64,7 +64,7 @@ def query_parser(text): Returns ------- info: a `dict` of the parsed info - """ # noqa + """ # convert any '@' to text = text.replace("@", ENVREF_PREFIX) tree = ast.parse(text) @@ -249,7 +249,7 @@ def query_execute(df, expr, callenv): nrows = len(df) out = column_empty(nrows, dtype=np.bool_) # run kernel - args = [out] + colarrays + envargs + args = [out, *colarrays, *envargs] with _CUDFNumbaConfig(): kernel.forall(nrows)(*args) out_mask = applyutils.make_aggregate_nullmask(df, columns=columns) diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index 
e6d252b8807..c83c1cbe895 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -210,7 +210,7 @@ class GetAttrGetItemMixin: # Tracking of protected keys by each subclass is necessary to make the # `__getattr__`->`__getitem__` call safe. See - # https://nedbatchelder.com/blog/201010/surprising_getattr_recursion.html # noqa: E501 + # https://nedbatchelder.com/blog/201010/surprising_getattr_recursion.html # for an explanation. In brief, defining the `_PROTECTED_KEYS` allows this # class to avoid calling `__getitem__` inside `__getattr__` when # `__getitem__` will internally again call `__getattr__`, resulting in an diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py index 4473a0e6f12..d494e157a18 100644 --- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py +++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py @@ -1589,8 +1589,8 @@ def test_numpy_cupy_flatiter(series): _, s = series arr = s.values - assert type(arr.flat._fsproxy_fast) == cp.flatiter - assert type(arr.flat._fsproxy_slow) == np.flatiter + assert type(arr.flat._fsproxy_fast) is cp.flatiter + assert type(arr.flat._fsproxy_slow) is np.flatiter @pytest.mark.xfail( diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_matplotlib.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_matplotlib.py index 665b9d6fb08..1909392b9f7 100644 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_matplotlib.py +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_matplotlib.py @@ -15,7 +15,7 @@ def assert_plots_equal(expect, got): for expect_ch, got_ch in zip( expect.get_children(), got.get_children() ): - assert type(expect_ch) == type(got_ch) + assert type(expect_ch) is type(got_ch) if isinstance(expect_ch, Line2D): assert_equal(expect_ch.get_xdata(), got_ch.get_xdata()) assert_equal(expect_ch.get_ydata(), got_ch.get_ydata()) diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_plotly.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_plotly.py index 27d9df83476..2a0f6697f3a 100644 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_plotly.py +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_plotly.py @@ -8,7 +8,7 @@ def assert_plotly_equal(expect, got): - assert type(expect) == type(got) + assert type(expect) is type(got) if isinstance(expect, dict): assert expect.keys() == got.keys() for k in expect.keys(): diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_seaborn.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_seaborn.py index 4b272900acd..021c5bac9b7 100644 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_seaborn.py +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_seaborn.py @@ -14,7 +14,7 @@ def assert_plots_equal(expect, got): for expect_ch, got_ch in zip( expect.get_children(), got.get_children() ): - assert type(expect_ch) == type(got_ch) + assert type(expect_ch) is type(got_ch) if isinstance(expect_ch, Line2D): assert_equal(expect_ch.get_xdata(), got_ch.get_xdata()) assert_equal(expect_ch.get_ydata(), got_ch.get_ydata()) diff --git a/python/cudf_polars/cudf_polars/__init__.py b/python/cudf_polars/cudf_polars/__init__.py index ba4858c5619..72e09b872d5 100644 --- a/python/cudf_polars/cudf_polars/__init__.py +++ 
b/python/cudf_polars/cudf_polars/__init__.py @@ -21,8 +21,8 @@ del _ensure_polars_version __all__: list[str] = [ - "execute_with_cudf", "Translator", "__git_commit__", "__version__", + "execute_with_cudf", ] diff --git a/python/cudf_polars/cudf_polars/containers/__init__.py b/python/cudf_polars/cudf_polars/containers/__init__.py index 3b1eff4a0d0..9dff8822376 100644 --- a/python/cudf_polars/cudf_polars/containers/__init__.py +++ b/python/cudf_polars/cudf_polars/containers/__init__.py @@ -5,7 +5,7 @@ from __future__ import annotations -__all__: list[str] = ["DataFrame", "Column"] +__all__: list[str] = ["Column", "DataFrame"] from cudf_polars.containers.column import Column from cudf_polars.containers.dataframe import DataFrame diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index 326d6b65cbe..98d49e36fb1 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -36,27 +36,27 @@ from cudf_polars.dsl.expressions.unary import Cast, Len, UnaryFunction __all__ = [ - "Expr", + "Agg", + "AggInfo", + "BinOp", + "BooleanFunction", + "Cast", + "Col", + "ColRef", "ErrorExpr", - "NamedExpr", + "Expr", + "Filter", + "Gather", + "GroupedRollingWindow", + "Len", "Literal", "LiteralColumn", - "Len", - "Col", - "ColRef", - "BooleanFunction", - "StringFunction", - "TemporalFunction", + "NamedExpr", + "RollingWindow", "Sort", "SortBy", - "Gather", - "Filter", - "RollingWindow", - "GroupedRollingWindow", - "Cast", - "Agg", - "AggInfo", + "StringFunction", + "TemporalFunction", "Ternary", - "BinOp", "UnaryFunction", ] diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py b/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py index 2af9fdaacc5..624a9bd87ea 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py @@ -31,7 +31,7 @@ class Agg(Expr): - __slots__ = ("name", "options", "op", "request") + __slots__ = ("name", "op", "options", "request") _non_child = ("dtype", "name", "options") def __init__( diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/base.py b/python/cudf_polars/cudf_polars/dsl/expressions/base.py index 23851f91938..4c7ae007070 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/base.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/base.py @@ -20,7 +20,7 @@ from cudf_polars.containers import Column, DataFrame -__all__ = ["Expr", "NamedExpr", "Col", "AggInfo", "ExecutionContext", "ColRef"] +__all__ = ["AggInfo", "Col", "ColRef", "ExecutionContext", "Expr", "NamedExpr"] class AggInfo(NamedTuple): diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py b/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py index 1682e7a8a9c..5aa35ead127 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py @@ -195,7 +195,7 @@ def do_evaluate( # If the input null count was non-zero, we must # post-process the result to insert the correct value. 
h_result = plc.interop.to_arrow(result).as_py() - if is_any and not h_result or not is_any and h_result: + if (is_any and not h_result) or (not is_any and h_result): # Any All # False || Null => Null True && Null => Null return Column(plc.Column.all_null_like(column.obj, 1)) diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/rolling.py b/python/cudf_polars/cudf_polars/dsl/expressions/rolling.py index fa68bcb9426..48c37d101f4 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/rolling.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/rolling.py @@ -13,7 +13,7 @@ if TYPE_CHECKING: import pylibcudf as plc -__all__ = ["RollingWindow", "GroupedRollingWindow"] +__all__ = ["GroupedRollingWindow", "RollingWindow"] class RollingWindow(Expr): diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/selection.py b/python/cudf_polars/cudf_polars/dsl/expressions/selection.py index 77d7d4c0d22..12326740f74 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/selection.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/selection.py @@ -20,7 +20,7 @@ from cudf_polars.containers import DataFrame -__all__ = ["Gather", "Filter"] +__all__ = ["Filter", "Gather"] class Gather(Expr): diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/string.py b/python/cudf_polars/cudf_polars/dsl/expressions/string.py index 92c3c658c21..124a6e8d71c 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/string.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/string.py @@ -92,7 +92,7 @@ def from_polars(cls, obj: pl_expr.StringFunction) -> Self: raise ValueError("StringFunction required") return getattr(cls, name) - __slots__ = ("name", "options", "_regex_program") + __slots__ = ("_regex_program", "name", "options") _non_child = ("dtype", "name", "options") def __init__( diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/unary.py b/python/cudf_polars/cudf_polars/dsl/expressions/unary.py index 7999ec86068..10caaff6811 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/unary.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/unary.py @@ -21,7 +21,7 @@ from cudf_polars.containers import DataFrame -__all__ = ["Cast", "UnaryFunction", "Len"] +__all__ = ["Cast", "Len", "UnaryFunction"] class Cast(Expr): diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index e8d9691f2a0..a28b4cf25b2 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -42,24 +42,24 @@ __all__ = [ "IR", - "ErrorNode", - "PythonScan", - "Scan", "Cache", - "DataFrameScan", - "Select", - "GroupBy", - "Join", "ConditionalJoin", - "HStack", + "DataFrameScan", "Distinct", - "Sort", - "Slice", + "ErrorNode", "Filter", - "Projection", + "GroupBy", + "HConcat", + "HStack", + "Join", "MapFunction", + "Projection", + "PythonScan", + "Scan", + "Select", + "Slice", + "Sort", "Union", - "HConcat", ] @@ -130,7 +130,7 @@ def broadcast(*columns: Column, target_length: int | None = None) -> list[Column class IR(Node["IR"]): """Abstract plan node, representing an unevaluated dataframe.""" - __slots__ = ("schema", "_non_child_args") + __slots__ = ("_non_child_args", "schema") # This annotation is needed because of https://github.com/python/mypy/issues/17981 _non_child: ClassVar[tuple[str, ...]] = ("schema",) # Concrete classes should set this up with the arguments that will @@ -253,16 +253,16 @@ class Scan(IR): """Input from files.""" __slots__ = ( - "typ", - "reader_options", "cloud_options", "config_options", - 
"paths", - "with_columns", - "skip_rows", "n_rows", - "row_index", + "paths", "predicate", + "reader_options", + "row_index", + "skip_rows", + "typ", + "with_columns", ) _non_child = ( "schema", @@ -688,7 +688,7 @@ class DataFrameScan(IR): This typically arises from ``q.collect().lazy()`` """ - __slots__ = ("df", "projection", "predicate") + __slots__ = ("df", "predicate", "projection") _non_child = ("schema", "df", "projection", "predicate") df: Any """Polars LazyFrame object.""" @@ -819,11 +819,11 @@ class GroupBy(IR): """Perform a groupby.""" __slots__ = ( + "agg_infos", "agg_requests", "keys", "maintain_order", "options", - "agg_infos", ) _non_child = ("schema", "keys", "agg_requests", "maintain_order", "options") keys: tuple[expr.NamedExpr, ...] @@ -993,7 +993,7 @@ def do_evaluate( class ConditionalJoin(IR): """A conditional inner join of two dataframes on a predicate.""" - __slots__ = ("predicate", "options", "ast_predicate") + __slots__ = ("ast_predicate", "options", "predicate") _non_child = ("schema", "predicate", "options") predicate: expr.Expr options: tuple @@ -1053,7 +1053,7 @@ def do_evaluate( class Join(IR): """A join of two dataframes.""" - __slots__ = ("left_on", "right_on", "options") + __slots__ = ("left_on", "options", "right_on") _non_child = ("schema", "left_on", "right_on", "options") left_on: tuple[expr.NamedExpr, ...] """List of expressions used as keys in the left frame.""" @@ -1337,7 +1337,7 @@ def do_evaluate( class Distinct(IR): """Produce a new dataframe with distinct rows.""" - __slots__ = ("keep", "subset", "zlice", "stable") + __slots__ = ("keep", "stable", "subset", "zlice") _non_child = ("schema", "keep", "subset", "zlice", "stable") keep: plc.stream_compaction.DuplicateKeepOption """Which distinct value to keep.""" @@ -1424,7 +1424,7 @@ def do_evaluate( class Sort(IR): """Sort a dataframe.""" - __slots__ = ("by", "order", "null_order", "stable", "zlice") + __slots__ = ("by", "null_order", "order", "stable", "zlice") _non_child = ("schema", "by", "order", "null_order", "stable", "zlice") by: tuple[expr.NamedExpr, ...] 
"""Sort keys.""" @@ -1505,7 +1505,7 @@ def do_evaluate( class Slice(IR): """Slice a dataframe.""" - __slots__ = ("offset", "length") + __slots__ = ("length", "offset") _non_child = ("schema", "offset", "length") offset: int """Start of the slice.""" diff --git a/python/cudf_polars/cudf_polars/dsl/traversal.py b/python/cudf_polars/cudf_polars/dsl/traversal.py index be8338cb9a9..b3248dae93c 100644 --- a/python/cudf_polars/cudf_polars/dsl/traversal.py +++ b/python/cudf_polars/cudf_polars/dsl/traversal.py @@ -16,10 +16,10 @@ __all__: list[str] = [ - "traversal", - "reuse_if_unchanged", - "make_recursive", "CachingVisitor", + "make_recursive", + "reuse_if_unchanged", + "traversal", ] diff --git a/python/cudf_polars/cudf_polars/typing/__init__.py b/python/cudf_polars/cudf_polars/typing/__init__.py index 57c5fdaa7cf..52be130ab90 100644 --- a/python/cudf_polars/cudf_polars/typing/__init__.py +++ b/python/cudf_polars/cudf_polars/typing/__init__.py @@ -21,13 +21,13 @@ from cudf_polars.dsl import expr, ir, nodebase __all__: list[str] = [ - "PolarsIR", - "PolarsExpr", - "NodeTraverser", - "OptimizationArgs", - "GenericTransformer", "ExprTransformer", + "GenericTransformer", "IRTransformer", + "NodeTraverser", + "OptimizationArgs", + "PolarsExpr", + "PolarsIR", ] PolarsIR: TypeAlias = Union[ diff --git a/python/cudf_polars/cudf_polars/utils/dtypes.py b/python/cudf_polars/cudf_polars/utils/dtypes.py index e7ac72df609..6bb5d78c488 100644 --- a/python/cudf_polars/cudf_polars/utils/dtypes.py +++ b/python/cudf_polars/cudf_polars/utils/dtypes.py @@ -19,9 +19,9 @@ ) __all__ = [ - "from_polars", - "downcast_arrow_lists", "can_cast", + "downcast_arrow_lists", + "from_polars", "is_order_preserving_cast", ] import pylibcudf as plc @@ -75,11 +75,13 @@ def can_cast(from_: plc.DataType, to: plc.DataType) -> bool: return ( ( from_ == to - or not has_empty - and ( - plc.traits.is_fixed_width(to) - and plc.traits.is_fixed_width(from_) - and plc.unary.is_supported_cast(from_, to) + or ( + not has_empty + and ( + plc.traits.is_fixed_width(to) + and plc.traits.is_fixed_width(from_) + and plc.unary.is_supported_cast(from_, to) + ) ) ) or (from_.id() == plc.TypeId.STRING and is_numeric_not_bool(to)) diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index f050a7c568a..b781b13ec10 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -158,6 +158,7 @@ ignore = [ "ISC002", # multi-line-implicit-string-concatenation ] fixable = ["ALL"] +typing-modules = ["cudf_polars.typing"] [tool.ruff.lint.per-file-ignores] "**/tests/**/*.py" = ["D"] diff --git a/python/dask_cudf/dask_cudf/__init__.py b/python/dask_cudf/dask_cudf/__init__.py index cc17e71039a..20eb2404b77 100644 --- a/python/dask_cudf/dask_cudf/__init__.py +++ b/python/dask_cudf/dask_cudf/__init__.py @@ -3,15 +3,15 @@ import warnings from importlib import import_module -from dask import config import dask.dataframe as dd -from dask.dataframe import from_delayed # noqa: E402 +from dask import config +from dask.dataframe import from_delayed -import cudf # noqa: E402 +import cudf -from . import backends # noqa: E402, F401 -from ._version import __git_commit__, __version__ # noqa: E402, F401 -from .core import concat, from_cudf, DataFrame, Index, Series # noqa: F401 +from . 
import backends # noqa: F401 +from ._version import __git_commit__, __version__ # noqa: F401 +from .core import DataFrame, Index, Series, concat, from_cudf QUERY_PLANNING_ON = dd.DASK_EXPR_ENABLED @@ -56,17 +56,17 @@ def inner_func(*args, **kwargs): if QUERY_PLANNING_ON: + from . import io from ._expr.expr import _patch_dask_expr - from . import io # noqa: F401 groupby_agg = _deprecated_api("dask_cudf.groupby_agg") read_text = DataFrame.read_text _patch_dask_expr() else: + from . import io # noqa: F401 from ._legacy.groupby import groupby_agg # noqa: F401 from ._legacy.io import read_text # noqa: F401 - from . import io # noqa: F401 to_orc = _deprecated_api( @@ -78,10 +78,10 @@ def inner_func(*args, **kwargs): __all__ = [ "DataFrame", - "Series", "Index", - "from_cudf", + "Series", "concat", + "from_cudf", "from_delayed", ] diff --git a/python/dask_cudf/dask_cudf/_expr/collection.py b/python/dask_cudf/dask_cudf/_expr/collection.py index 89c0d108743..2dc4031b876 100644 --- a/python/dask_cudf/dask_cudf/_expr/collection.py +++ b/python/dask_cudf/dask_cudf/_expr/collection.py @@ -213,8 +213,9 @@ def _create_array_collection_with_meta(expr): name = result._name meta = result._meta divisions = result.divisions - chunks = ((np.nan,) * (len(divisions) - 1),) + tuple( - (d,) for d in meta.shape[1:] + chunks = ( + (np.nan,) * (len(divisions) - 1), + *tuple((d,) for d in meta.shape[1:]), ) if len(chunks) > 1: if isinstance(dsk, HighLevelGraph): @@ -224,11 +225,11 @@ def _create_array_collection_with_meta(expr): layer = dsk if isinstance(layer, Blockwise): layer.new_axes["j"] = chunks[1][0] - layer.output_indices = layer.output_indices + ("j",) + layer.output_indices = (*layer.output_indices, "j") else: suffix = (0,) * (len(chunks) - 1) for i in range(len(chunks[0])): - layer[(name, i) + suffix] = layer.pop((name, i)) + layer[(name, i, *suffix)] = layer.pop((name, i)) return da.Array(dsk, name=name, chunks=chunks, meta=meta) diff --git a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py index 7d6d5c05cbe..5fd217209ec 100644 --- a/python/dask_cudf/dask_cudf/core.py +++ b/python/dask_cudf/dask_cudf/core.py @@ -10,7 +10,7 @@ # This module provides backward compatibility for legacy import patterns. if dd.DASK_EXPR_ENABLED: - from dask_cudf._expr.collection import ( # noqa: E402 + from dask_cudf._expr.collection import ( DataFrame, Index, Series, @@ -19,7 +19,7 @@ from dask_cudf._legacy.core import DataFrame, Index, Series # noqa: F401 -concat = dd.concat # noqa: F401 +concat = dd.concat @_dask_cudf_performance_tracking diff --git a/python/dask_cudf/dask_cudf/io/__init__.py b/python/dask_cudf/dask_cudf/io/__init__.py index 212951336c9..9bca33e414a 100644 --- a/python/dask_cudf/dask_cudf/io/__init__.py +++ b/python/dask_cudf/dask_cudf/io/__init__.py @@ -1,9 +1,8 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from dask_cudf import _deprecated_api, QUERY_PLANNING_ON - -from . import csv, orc, json, parquet, text # noqa: F401 +from dask_cudf import QUERY_PLANNING_ON, _deprecated_api +from . 
import csv, json, orc, parquet, text # noqa: F401 read_csv = _deprecated_api( "dask_cudf.io.read_csv", new_api="dask_cudf.read_csv" diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py index ce9935c8b3c..ba6209c4820 100644 --- a/python/dask_cudf/dask_cudf/io/parquet.py +++ b/python/dask_cudf/dask_cudf/io/parquet.py @@ -40,7 +40,7 @@ def TaskList(*x): from dask_cudf import QUERY_PLANNING_ON, _deprecated_api # Dask-expr imports CudfEngine from this module -from dask_cudf._legacy.io.parquet import CudfEngine # noqa: F401 +from dask_cudf._legacy.io.parquet import CudfEngine if TYPE_CHECKING: from collections.abc import MutableMapping diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py index 5130b804179..cda7e2d134d 100644 --- a/python/dask_cudf/dask_cudf/tests/test_core.py +++ b/python/dask_cudf/dask_cudf/tests/test_core.py @@ -489,7 +489,7 @@ def test_repartition_hash_staged(npartitions): ) # Make sure we are getting a dask_cudf dataframe - assert type(ddf_new) == type(ddf) + assert type(ddf_new) is type(ddf) # Check that the length was preserved assert len(ddf_new) == len(ddf) @@ -956,7 +956,7 @@ def func(x): # NOTE: The calculation here doesn't need to make sense. # We just need to make sure we get the right type back. - assert type(result) == type(expect) + assert type(result) is type(expect) @pytest.mark.parametrize("data", [[1, 2, 3], [1.1, 2.3, 4.5]]) diff --git a/python/dask_cudf/dask_cudf/tests/test_dispatch.py b/python/dask_cudf/dask_cudf/tests/test_dispatch.py index fe57d4a4f00..d91b9defc1c 100644 --- a/python/dask_cudf/dask_cudf/tests/test_dispatch.py +++ b/python/dask_cudf/dask_cudf/tests/test_dispatch.py @@ -44,7 +44,7 @@ def test_pyarrow_conversion_dispatch(preserve_index, index): if not preserve_index and index is not None: df1.index.name = None - assert type(df1) == type(df2) + assert type(df1) is type(df2) assert_eq(df1, df2) # Check that preserve_index does not produce a RangeIndex diff --git a/python/dask_cudf/dask_cudf/tests/test_groupby.py b/python/dask_cudf/dask_cudf/tests/test_groupby.py index 918290aa6fa..9bd3b506db0 100644 --- a/python/dask_cudf/dask_cudf/tests/test_groupby.py +++ b/python/dask_cudf/dask_cudf/tests/test_groupby.py @@ -58,7 +58,7 @@ def pdf(request): # deprecation check for "collect". 
@pytest.mark.parametrize( "aggregation", - sorted(tuple(set(OPTIMIZED_AGGS) - {list}) + ("collect",)), + sorted((*tuple(set(OPTIMIZED_AGGS) - {list}), "collect")), ) @pytest.mark.parametrize("series", [False, True]) def test_groupby_basic(series, aggregation, pdf): diff --git a/python/libcudf/libcudf/__init__.py b/python/libcudf/libcudf/__init__.py index 10c476cbe89..4077fa8fbf9 100644 --- a/python/libcudf/libcudf/__init__.py +++ b/python/libcudf/libcudf/__init__.py @@ -14,3 +14,5 @@ from libcudf._version import __git_commit__, __version__ from libcudf.load import load_library + +__all__ = ["__git_commit__", "__version__", "load_library"] diff --git a/python/pylibcudf/pylibcudf/__init__.py b/python/pylibcudf/pylibcudf/__init__.py index 62a2170f83e..8ea176a6b07 100644 --- a/python/pylibcudf/pylibcudf/__init__.py +++ b/python/pylibcudf/pylibcudf/__init__.py @@ -65,8 +65,8 @@ "aggregation", "binaryop", "column_factories", - "contiguous_split", "concatenate", + "contiguous_split", "copying", "datetime", "experimental", @@ -83,6 +83,7 @@ "lists", "merge", "null_mask", + "nvtext", "partitioning", "quantiles", "reduce", @@ -91,13 +92,12 @@ "rolling", "round", "search", + "sorting", "stream_compaction", "strings", - "sorting", "traits", "transform", "transpose", "types", "unary", - "nvtext", ] diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.py b/python/pylibcudf/pylibcudf/nvtext/__init__.py index 4f125d3a733..d88a7d4b825 100644 --- a/python/pylibcudf/pylibcudf/nvtext/__init__.py +++ b/python/pylibcudf/pylibcudf/nvtext/__init__.py @@ -15,11 +15,11 @@ ) __all__ = [ + "byte_pair_encode", "edit_distance", "generate_ngrams", "jaccard", "minhash", - "byte_pair_encode", "ngrams_tokenize", "normalize", "replace", diff --git a/python/pylibcudf/pylibcudf/tests/io/test_csv.py b/python/pylibcudf/pylibcudf/tests/io/test_csv.py index 1cbaac57315..555ca2fb02c 100644 --- a/python/pylibcudf/pylibcudf/tests/io/test_csv.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_csv.py @@ -281,7 +281,7 @@ def test_read_csv_header(csv_table_data, source_or_sink, header): new_tbl_dict = {} for i, (name, vals) in enumerate(tbl_dict.items()): str_vals = [str(val) for val in vals] - new_tbl_dict[str(i)] = [name] + str_vals + new_tbl_dict[str(i)] = [name, *str_vals] pa_table = pa.table(new_tbl_dict) assert_table_and_meta_eq( From 852338e71dae9833a53507bd4b1470798f0a5c4b Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Mon, 2 Dec 2024 16:41:35 -0600 Subject: [PATCH 05/78] Update PyTorch to >=2.4.0 to get fix for CUDA array interface bug, and drop CUDA 11 PyTorch tests. (#17475) This PR updates our PyTorch lower bound to 2.4.0 to get the bugfix from https://github.com/pytorch/pytorch/pull/121458. Also, this PR drops CUDA 11 tests because conda-forge no longer produces CUDA 11 builds of PyTorch. This was causing a failure on Hopper GPUs because the last available CUDA 11 builds from conda-forge do not include sm90 support. 
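For reference, a minimal sketch (not part of this patch) of the interchange this change re-enables. It mirrors the zero-length round-trip test restored below and assumes a CUDA build of PyTorch >= 2.4:

```python
import cudf
import torch

# Zero-length columns now round-trip through __cuda_array_interface__.
# Older PyTorch rejected zero-size buffers because it validated that the
# data pointer was device-accessible even when size == 0
# (see pytorch/pytorch#98133, fixed by pytorch/pytorch#121458).
index = cudf.Index([], dtype="float64")
tensor = torch.tensor(index)  # converts via the CUDA array interface
assert cudf.Index(tensor).equals(index)
```
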
Authors: - Bradley Dice (https://github.com/bdice) Approvers: - James Lamb (https://github.com/jameslamb) - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/17475 --- conda/environments/all_cuda-118_arch-x86_64.yaml | 3 --- conda/environments/all_cuda-125_arch-x86_64.yaml | 2 +- dependencies.yaml | 7 ++----- .../cudf/cudf/tests/test_cuda_array_interface.py | 15 +++++---------- .../dependencies.yaml | 2 +- 5 files changed, 9 insertions(+), 20 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 97c72ec8042..2be64b7cd70 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -80,7 +80,6 @@ dependencies: - python-confluent-kafka>=2.5.0,<2.6.0a0 - python-xxhash - python>=3.10,<3.13 -- pytorch>=2.1.0 - rapids-build-backend>=0.3.0,<0.4.0.dev0 - rapids-dask-dependency==25.2.*,>=0.0.0a0 - rich @@ -97,8 +96,6 @@ dependencies: - sphinxcontrib-websupport - streamz - sysroot_linux-64==2.17 -- tokenizers==0.15.2 -- transformers==4.39.3 - typing_extensions>=4.0.0 - zlib>=1.2.13 name: all_cuda-118_arch-x86_64 diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index 84b58b6d7a4..6b5ca04c015 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -78,7 +78,7 @@ dependencies: - python-confluent-kafka>=2.5.0,<2.6.0a0 - python-xxhash - python>=3.10,<3.13 -- pytorch>=2.1.0 +- pytorch>=2.4.0 - rapids-build-backend>=0.3.0,<0.4.0.dev0 - rapids-dask-dependency==25.2.*,>=0.0.0a0 - rich diff --git a/dependencies.yaml b/dependencies.yaml index 3976696a41c..259d41b59fe 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -885,12 +885,9 @@ dependencies: - output_types: conda matrices: - matrix: - arch: x86_64 + cuda: "12.*" packages: - # Currently, CUDA + aarch64 builds of pytorch do not exist on conda-forge. - - pytorch>=2.1.0 - # We only install these on x86_64 to avoid pulling pytorch as a - # dependency of transformers. + - pytorch>=2.4.0 - *tokenizers - *transformers - matrix: diff --git a/python/cudf/cudf/tests/test_cuda_array_interface.py b/python/cudf/cudf/tests/test_cuda_array_interface.py index 381ca45de31..dcde0dab83d 100644 --- a/python/cudf/cudf/tests/test_cuda_array_interface.py +++ b/python/cudf/cudf/tests/test_cuda_array_interface.py @@ -187,7 +187,7 @@ def test_column_from_ephemeral_cupy_try_lose_reference(): ), ) def test_cuda_array_interface_pytorch(): - torch = pytest.importorskip("torch", minversion="1.6.0") + torch = pytest.importorskip("torch", minversion="2.4.0") if not torch.cuda.is_available(): pytest.skip("need gpu version of pytorch to be installed") @@ -202,15 +202,10 @@ def test_cuda_array_interface_pytorch(): assert_eq(got, cudf.Series(buffer, dtype=np.bool_)) - # TODO: This test fails with PyTorch 2. It appears that PyTorch - # checks that the pointer is device-accessible even when the - # size is zero. 
See - # https://github.com/pytorch/pytorch/issues/98133 - # - # index = cudf.Index([], dtype="float64") - # tensor = torch.tensor(index) - # got = cudf.Index(tensor) - # assert_eq(got, index) + index = cudf.Index([], dtype="float64") + tensor = torch.tensor(index) + got = cudf.Index(tensor) + assert_eq(got, index) index = cudf.core.index.RangeIndex(start=0, stop=100) tensor = torch.tensor(index) diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml b/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml index 6b317cc13fb..e726b7fdca1 100644 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml @@ -213,7 +213,7 @@ dependencies: - output_types: conda packages: - numpy - - pytorch>=2.1.0 + - pytorch>=2.4.0 test_seaborn: common: - output_types: conda From da72cf609f61fa4dd154be377a8b591ea1773e04 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 2 Dec 2024 15:08:37 -0800 Subject: [PATCH 06/78] Remove cudf._lib.filling in favor of inlining pylibcudf (#17459) Contributes to https://github.com/rapidsai/cudf/issues/17317 Authors: - Matthew Roeschke (https://github.com/mroeschke) - Lawrence Mitchell (https://github.com/wence-) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/17459 --- python/cudf/cudf/_lib/CMakeLists.txt | 1 - python/cudf/cudf/_lib/__init__.py | 1 - python/cudf/cudf/_lib/filling.pyx | 57 --------------------- python/cudf/cudf/core/column/categorical.py | 7 +-- python/cudf/cudf/core/column/column.py | 47 +++++++++++------ python/cudf/cudf/core/frame.py | 11 +++- python/cudf/cudf/core/index.py | 14 ++--- 7 files changed, 51 insertions(+), 87 deletions(-) delete mode 100644 python/cudf/cudf/_lib/filling.pyx diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index de483b3070d..e69a2672163 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -18,7 +18,6 @@ set(cython_sources column.pyx copying.pyx csv.pyx - filling.pyx groupby.pyx interop.pyx merge.pyx diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py index ee1bd13f2c4..ec32386b2ce 100644 --- a/python/cudf/cudf/_lib/__init__.py +++ b/python/cudf/cudf/_lib/__init__.py @@ -5,7 +5,6 @@ binaryop, copying, csv, - filling, groupby, interop, merge, diff --git a/python/cudf/cudf/_lib/filling.pyx b/python/cudf/cudf/_lib/filling.pyx deleted file mode 100644 index b2f4c620144..00000000000 --- a/python/cudf/cudf/_lib/filling.pyx +++ /dev/null @@ -1,57 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar -from cudf._lib.utils cimport columns_from_pylibcudf_table - -import pylibcudf - -from cudf._lib.scalar import as_device_scalar - - -@acquire_spill_lock() -def fill_in_place(Column destination, int begin, int end, DeviceScalar value): - pylibcudf.filling.fill_in_place( - destination.to_pylibcudf(mode='write'), - begin, - end, - ( as_device_scalar(value, dtype=destination.dtype)).c_value - ) - - -@acquire_spill_lock() -def fill(Column destination, int begin, int end, DeviceScalar value): - return Column.from_pylibcudf( - pylibcudf.filling.fill( - destination.to_pylibcudf(mode='read'), - begin, - end, - ( as_device_scalar(value)).c_value - ) - ) - - -@acquire_spill_lock() -def repeat(list inp, object count): - ctbl = pylibcudf.Table([col.to_pylibcudf(mode="read") for col in inp]) - if isinstance(count, Column): - count = count.to_pylibcudf(mode="read") - return columns_from_pylibcudf_table( - pylibcudf.filling.repeat( - ctbl, - count - ) - ) - - -@acquire_spill_lock() -def sequence(int size, DeviceScalar init, DeviceScalar step): - return Column.from_pylibcudf( - pylibcudf.filling.sequence( - size, - ( as_device_scalar(init)).c_value, - ( as_device_scalar(step)).c_value - ) - ) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index cbbe01f7289..c849a9d3d2b 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -668,13 +668,8 @@ def _fill( return self if inplace else self.copy() fill_code = self._encode(fill_value) - fill_scalar = cudf._lib.scalar.as_device_scalar( - fill_code, self.codes.dtype - ) - result = self if inplace else self.copy() - - libcudf.filling.fill_in_place(result.codes, begin, end, fill_scalar) + result.codes._fill(fill_code, begin, end, inplace=True) return result def slice(self, start: int, stop: int, stride: int | None = None) -> Self: diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index d1938f47d66..cdc3a03f445 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -401,14 +401,19 @@ def _fill( # the scalar is None when calling `is_valid`. 
slr = cudf.Scalar(fill_value, dtype=self.dtype) - if not inplace: - return libcudf.filling.fill(self, begin, end, slr.device_value) - - if is_string_dtype(self.dtype): - return self._mimic_inplace( - libcudf.filling.fill(self, begin, end, slr.device_value), - inplace=True, - ) + if not inplace or is_string_dtype(self.dtype): + with acquire_spill_lock(): + result = type(self).from_pylibcudf( + plc.filling.fill( + self.to_pylibcudf(mode="read"), + begin, + end, + slr.device_value.c_value, + ) + ) + if is_string_dtype(self.dtype): + return self._mimic_inplace(result, inplace=True) + return result # type: ignore[return-value] if not slr.is_valid() and not self.nullable: mask = as_buffer( @@ -418,8 +423,13 @@ def _fill( ) self.set_base_mask(mask) - libcudf.filling.fill_in_place(self, begin, end, slr.device_value) - + with acquire_spill_lock(): + plc.filling.fill_in_place( + self.to_pylibcudf(mode="write"), + begin, + end, + slr.device_value.c_value, + ) return self def shift(self, offset: int, fill_value: ScalarLike) -> ColumnBase: @@ -1813,11 +1823,18 @@ def as_column( * range objects """ if isinstance(arbitrary, (range, pd.RangeIndex, cudf.RangeIndex)): - column = libcudf.filling.sequence( - len(arbitrary), - as_device_scalar(arbitrary.start, dtype=cudf.dtype("int64")), - as_device_scalar(arbitrary.step, dtype=cudf.dtype("int64")), - ) + with acquire_spill_lock(): + column = Column.from_pylibcudf( + plc.filling.sequence( + len(arbitrary), + as_device_scalar( + arbitrary.start, dtype=np.dtype(np.int64) + ).c_value, + as_device_scalar( + arbitrary.step, dtype=np.dtype(np.int64) + ).c_value, + ) + ) if cudf.get_option("default_integer_bitwidth") and dtype is None: dtype = cudf.dtype( f'i{cudf.get_option("default_integer_bitwidth")//8}' diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 70789160cb6..0a7e6fefe6e 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1901,7 +1901,16 @@ def _repeat( if not is_scalar(repeats): repeats = as_column(repeats) - return libcudf.filling.repeat(columns, repeats) + with acquire_spill_lock(): + plc_table = plc.Table( + [col.to_pylibcudf(mode="read") for col in columns] + ) + if isinstance(repeats, ColumnBase): + repeats = repeats.to_pylibcudf(mode="read") + return [ + libcudf.column.Column.from_pylibcudf(col) + for col in plc.filling.repeat(plc_table, repeats).columns() + ] @_performance_tracking @_warn_no_dask_cudf diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index eac04cf36ec..cc3d8448151 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -20,7 +20,6 @@ import cudf from cudf import _lib as libcudf -from cudf._lib.filling import sequence from cudf._lib.types import size_type_dtype from cudf.api.extensions import no_default from cudf.api.types import ( @@ -3402,11 +3401,14 @@ def interval_range( start = start.astype(common_dtype) freq = freq.astype(common_dtype) - bin_edges = sequence( - size=periods + 1, - init=start.device_value, - step=freq.device_value, - ) + with acquire_spill_lock(): + bin_edges = libcudf.column.Column.from_pylibcudf( + plc.filling.sequence( + size=periods + 1, + init=start.device_value.c_value, + step=freq.device_value.c_value, + ) + ) return IntervalIndex.from_breaks(bin_edges, closed=closed, name=name) From b67c0a97d16ec3c9d0abf825ad9755013b24ebab Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 2 Dec 2024 16:04:59 -0800 Subject: [PATCH 07/78] Update MurmurHash3_x64_128 to use the cuco equivalent 
implementation (#17457) This PR modifies MurmurHash3_x64_128 to utilize the cuco equivalent implementation, eliminating duplication. Authors: - Yunsong Wang (https://github.com/PointKernel) - Karthikeyan (https://github.com/karthikeyann) Approvers: - Karthikeyan (https://github.com/karthikeyann) - Nghia Truong (https://github.com/ttnghia) - Bradley Dice (https://github.com/bdice) - MithunR (https://github.com/mythrocks) URL: https://github.com/rapidsai/cudf/pull/17457 --- .../hashing/detail/murmurhash3_x64_128.cuh | 163 +++--------------- .../hashing/detail/murmurhash3_x86_32.cuh | 2 +- cpp/include/cudf/hashing/detail/xxhash_64.cuh | 2 +- cpp/src/hash/murmurhash3_x64_128.cu | 17 +- 4 files changed, 35 insertions(+), 149 deletions(-) diff --git a/cpp/include/cudf/hashing/detail/murmurhash3_x64_128.cuh b/cpp/include/cudf/hashing/detail/murmurhash3_x64_128.cuh index 5e88b905023..31390aa3edf 100644 --- a/cpp/include/cudf/hashing/detail/murmurhash3_x64_128.cuh +++ b/cpp/include/cudf/hashing/detail/murmurhash3_x64_128.cuh @@ -15,177 +15,63 @@ */ #pragma once +#include +#include #include #include -#include +#include +#include +#include namespace cudf::hashing::detail { -// MurmurHash3_x64_128 implementation from -// https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp -//----------------------------------------------------------------------------- -// MurmurHash3 was written by Austin Appleby, and is placed in the public -// domain. The author hereby disclaims copyright to this source code. -// Note - The x86 and x64 versions do _not_ produce the same results, as the -// algorithms are optimized for their respective platforms. You can still -// compile and run any of them on any platform, but your performance with the -// non-native version will be less than optimal. template struct MurmurHash3_x64_128 { - using result_type = thrust::pair; + using result_type = cuda::std::array; - constexpr MurmurHash3_x64_128() = default; - constexpr MurmurHash3_x64_128(uint64_t seed) : m_seed(seed) {} - - __device__ inline uint32_t getblock32(std::byte const* data, cudf::size_type offset) const + CUDF_HOST_DEVICE constexpr MurmurHash3_x64_128(uint64_t seed = cudf::DEFAULT_HASH_SEED) + : _impl{seed} { - // Read a 4-byte value from the data pointer as individual bytes for safe - // unaligned access (very likely for string types). 
- auto block = reinterpret_cast(data + offset); - return block[0] | (block[1] << 8) | (block[2] << 16) | (block[3] << 24); } - __device__ inline uint64_t getblock64(std::byte const* data, cudf::size_type offset) const - { - uint64_t result = getblock32(data, offset + 4); - result = result << 32; - return result | getblock32(data, offset); - } + __device__ constexpr result_type operator()(Key const& key) const { return this->_impl(key); } - __device__ inline uint64_t fmix64(uint64_t k) const + __device__ constexpr result_type compute_bytes(cuda::std::byte const* bytes, + std::uint64_t size) const { - k ^= k >> 33; - k *= 0xff51afd7ed558ccdUL; - k ^= k >> 33; - k *= 0xc4ceb9fe1a85ec53UL; - k ^= k >> 33; - return k; + return this->_impl.compute_hash(bytes, size); } - result_type __device__ inline operator()(Key const& key) const { return compute(key); } - + private: template - result_type __device__ inline compute(T const& key) const - { - return compute_bytes(reinterpret_cast(&key), sizeof(T)); - } - - result_type __device__ inline compute_remaining_bytes(std::byte const* data, - cudf::size_type len, - cudf::size_type tail_offset, - result_type h) const - { - // Process remaining bytes that do not fill a 8-byte chunk. - uint64_t k1 = 0; - uint64_t k2 = 0; - auto const tail = reinterpret_cast(data) + tail_offset; - switch (len & (BLOCK_SIZE - 1)) { - case 15: k2 ^= static_cast(tail[14]) << 48; - case 14: k2 ^= static_cast(tail[13]) << 40; - case 13: k2 ^= static_cast(tail[12]) << 32; - case 12: k2 ^= static_cast(tail[11]) << 24; - case 11: k2 ^= static_cast(tail[10]) << 16; - case 10: k2 ^= static_cast(tail[9]) << 8; - case 9: - k2 ^= static_cast(tail[8]) << 0; - k2 *= c2; - k2 = rotate_bits_left(k2, 33); - k2 *= c1; - h.second ^= k2; - - case 8: k1 ^= static_cast(tail[7]) << 56; - case 7: k1 ^= static_cast(tail[6]) << 48; - case 6: k1 ^= static_cast(tail[5]) << 40; - case 5: k1 ^= static_cast(tail[4]) << 32; - case 4: k1 ^= static_cast(tail[3]) << 24; - case 3: k1 ^= static_cast(tail[2]) << 16; - case 2: k1 ^= static_cast(tail[1]) << 8; - case 1: - k1 ^= static_cast(tail[0]) << 0; - k1 *= c1; - k1 = rotate_bits_left(k1, 31); - k1 *= c2; - h.first ^= k1; - }; - return h; - } - - result_type __device__ compute_bytes(std::byte const* data, cudf::size_type const len) const + __device__ constexpr result_type compute(T const& key) const { - auto const nblocks = len / BLOCK_SIZE; - uint64_t h1 = m_seed; - uint64_t h2 = m_seed; - - // Process all four-byte chunks. - for (cudf::size_type i = 0; i < nblocks; i++) { - uint64_t k1 = getblock64(data, (i * BLOCK_SIZE)); // 1st 8 bytes - uint64_t k2 = getblock64(data, (i * BLOCK_SIZE) + (BLOCK_SIZE / 2)); // 2nd 8 bytes - - k1 *= c1; - k1 = rotate_bits_left(k1, 31); - k1 *= c2; - - h1 ^= k1; - h1 = rotate_bits_left(h1, 27); - h1 += h2; - h1 = h1 * 5 + 0x52dce729; - - k2 *= c2; - k2 = rotate_bits_left(k2, 33); - k2 *= c1; - - h2 ^= k2; - h2 = rotate_bits_left(h2, 31); - h2 += h1; - h2 = h2 * 5 + 0x38495ab5; - } - - thrust::tie(h1, h2) = compute_remaining_bytes(data, len, nblocks * BLOCK_SIZE, {h1, h2}); - - // Finalize hash. 
- h1 ^= len; - h2 ^= len; - - h1 += h2; - h2 += h1; - - h1 = fmix64(h1); - h2 = fmix64(h2); - - h1 += h2; - h2 += h1; - - return {h1, h2}; + return this->compute_bytes(reinterpret_cast(&key), sizeof(T)); } - private: - uint64_t m_seed{}; - static constexpr uint32_t BLOCK_SIZE = 16; // 2 x 64-bit = 16 bytes - - static constexpr uint64_t c1 = 0x87c37b91114253d5UL; - static constexpr uint64_t c2 = 0x4cf5ad432745937fUL; + cuco::murmurhash3_x64_128 _impl; }; template <> MurmurHash3_x64_128::result_type __device__ inline MurmurHash3_x64_128::operator()( bool const& key) const { - return compute(key); + return this->compute(key); } template <> MurmurHash3_x64_128::result_type __device__ inline MurmurHash3_x64_128::operator()( float const& key) const { - return compute(normalize_nans(key)); + return this->compute(normalize_nans(key)); } template <> MurmurHash3_x64_128::result_type __device__ inline MurmurHash3_x64_128::operator()( double const& key) const { - return compute(normalize_nans(key)); + return this->compute(normalize_nans(key)); } template <> @@ -193,9 +79,8 @@ MurmurHash3_x64_128::result_type __device__ inline MurmurHash3_x64_128::operator()( cudf::string_view const& key) const { - auto const data = reinterpret_cast(key.data()); - auto const len = key.size_bytes(); - return compute_bytes(data, len); + return this->compute_bytes(reinterpret_cast(key.data()), + key.size_bytes()); } template <> @@ -203,7 +88,7 @@ MurmurHash3_x64_128::result_type __device__ inline MurmurHash3_x64_128::operator()( numeric::decimal32 const& key) const { - return compute(key.value()); + return this->compute(key.value()); } template <> @@ -211,7 +96,7 @@ MurmurHash3_x64_128::result_type __device__ inline MurmurHash3_x64_128::operator()( numeric::decimal64 const& key) const { - return compute(key.value()); + return this->compute(key.value()); } template <> @@ -219,7 +104,7 @@ MurmurHash3_x64_128::result_type __device__ inline MurmurHash3_x64_128::operator()( numeric::decimal128 const& key) const { - return compute(key.value()); + return this->compute(key.value()); } } // namespace cudf::hashing::detail diff --git a/cpp/include/cudf/hashing/detail/murmurhash3_x86_32.cuh b/cpp/include/cudf/hashing/detail/murmurhash3_x86_32.cuh index 38a7d927b9c..e0c7ce840d7 100644 --- a/cpp/include/cudf/hashing/detail/murmurhash3_x86_32.cuh +++ b/cpp/include/cudf/hashing/detail/murmurhash3_x86_32.cuh @@ -33,7 +33,7 @@ template struct MurmurHash3_x86_32 { using result_type = hash_value_type; - __host__ __device__ constexpr MurmurHash3_x86_32(uint32_t seed = cudf::DEFAULT_HASH_SEED) + CUDF_HOST_DEVICE constexpr MurmurHash3_x86_32(uint32_t seed = cudf::DEFAULT_HASH_SEED) : _impl{seed} { } diff --git a/cpp/include/cudf/hashing/detail/xxhash_64.cuh b/cpp/include/cudf/hashing/detail/xxhash_64.cuh index 7d72349e340..d77d040b365 100644 --- a/cpp/include/cudf/hashing/detail/xxhash_64.cuh +++ b/cpp/include/cudf/hashing/detail/xxhash_64.cuh @@ -31,7 +31,7 @@ template struct XXHash_64 { using result_type = std::uint64_t; - __host__ __device__ constexpr XXHash_64(uint64_t seed = cudf::DEFAULT_HASH_SEED) : _impl{seed} {} + CUDF_HOST_DEVICE constexpr XXHash_64(uint64_t seed = cudf::DEFAULT_HASH_SEED) : _impl{seed} {} __device__ constexpr result_type operator()(Key const& key) const { return this->_impl(key); } diff --git a/cpp/src/hash/murmurhash3_x64_128.cu b/cpp/src/hash/murmurhash3_x64_128.cu index 090bd92af8c..43df7f325ac 100644 --- a/cpp/src/hash/murmurhash3_x64_128.cu +++ b/cpp/src/hash/murmurhash3_x64_128.cu @@ -24,6 +24,7 @@ #include 
#include +#include #include namespace cudf { @@ -31,7 +32,7 @@ namespace hashing { namespace detail { namespace { -using hash_value_type = thrust::pair; +using hash_value_type = cuda::std::array; /** * @brief Computes the hash value of a row in the given table. @@ -58,7 +59,7 @@ class murmur_device_row_hasher { */ __device__ void operator()(size_type row_index) const noexcept { - auto h = cudf::detail::accumulate( + auto const h = cudf::detail::accumulate( _input.begin(), _input.end(), hash_value_type{_seed, 0}, @@ -66,8 +67,8 @@ class murmur_device_row_hasher { return cudf::type_dispatcher( column.type(), element_hasher_adapter{}, column, row_index, nulls, hash); }); - _output1[row_index] = h.first; - _output2[row_index] = h.second; + _output1[row_index] = h[0]; + _output2[row_index] = h[1]; } /** @@ -78,13 +79,13 @@ class murmur_device_row_hasher { template ())> __device__ hash_value_type operator()(column_device_view const& col, size_type row_index, - Nullate const _check_nulls, - hash_value_type const _seed) const noexcept + Nullate const check_nulls, + hash_value_type const seed) const noexcept { - if (_check_nulls && col.is_null(row_index)) { + if (check_nulls && col.is_null(row_index)) { return {std::numeric_limits::max(), std::numeric_limits::max()}; } - auto const hasher = MurmurHash3_x64_128{_seed.first}; + auto const hasher = MurmurHash3_x64_128{seed[0]}; return hasher(col.element(row_index)); }

From 12c77f32eee3b1aa0ba5592d9f25b4664104bd04 Mon Sep 17 00:00:00 2001 From: tequilayu <48981002+tequilayu@users.noreply.github.com> Date: Tue, 3 Dec 2024 08:57:51 +0800 Subject: [PATCH 08/78] add comment to Series.tolist method (#17350) closes #15767 This PR adds a comment to the `Series.tolist` method. It mentions that the method will raise a `TypeError` when it's called and suggests alternatives. Authors: - https://github.com/tequilayu - Michael Wang (https://github.com/isVoid) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/17350 --- python/cudf/cudf/core/series.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 928f3c3d666..58cefc6554e 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -943,6 +943,19 @@ def drop( ) def tolist(self): + """Conversion to host memory lists is currently unsupported + + Raises + ------ + TypeError + If this method is called + + Notes + ----- + cuDF currently does not support implicit conversion from GPU stored series to + host stored lists. A `TypeError` is raised when this method is called. + Consider calling `.to_arrow().to_pylist()` to construct a Python list. + """ raise TypeError( "cuDF does not support conversion to host memory " "via the `tolist()` method. Consider using " From 3785a48eb81be23b44b895624f21acbfc1a828c5 Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Tue, 3 Dec 2024 11:17:04 -0600 Subject: [PATCH 09/78] Add multi-partition `DataFrameScan` support to cuDF-Polars (#17441) Follow-up to https://github.com/rapidsai/cudf/pull/17262 Adds support for parallel `DataFrameScan` operations.
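For illustration (not part of the patch), a rough sketch of how the new executor is engaged, following the tests added below. The lowering splits a `DataFrameScan` into `ceil(nrows / max_rows_per_partition)` partitions and evaluates them as a Dask task graph:

```python
import polars as pl

# "dask-experimental" and "max_rows_per_partition" are the option names
# introduced in this PR; the other values here are arbitrary.
engine = pl.GPUEngine(
    raise_on_fail=True,
    executor="dask-experimental",
    executor_options={"max_rows_per_partition": 1_000_000},
)

ldf = pl.LazyFrame({"x": range(3_000_000)})
# The scan is evaluated as 3 partitions, then concatenated.
result = ldf.collect(engine=engine)
```
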
Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/17441 --- python/cudf_polars/cudf_polars/callback.py | 14 +- python/cudf_polars/cudf_polars/dsl/ir.py | 17 +- .../cudf_polars/cudf_polars/dsl/translate.py | 1 + .../cudf_polars/experimental/base.py | 43 +++ .../cudf_polars/experimental/dispatch.py | 84 ++++++ .../cudf_polars/experimental/io.py | 49 ++++ .../cudf_polars/experimental/parallel.py | 245 +++++++++--------- .../cudf_polars/tests/dsl/test_traversal.py | 12 +- .../tests/experimental/test_dataframescan.py | 53 ++++ python/cudf_polars/tests/test_executors.py | 16 ++ 10 files changed, 411 insertions(+), 123 deletions(-) create mode 100644 python/cudf_polars/cudf_polars/experimental/base.py create mode 100644 python/cudf_polars/cudf_polars/experimental/dispatch.py create mode 100644 python/cudf_polars/cudf_polars/experimental/io.py create mode 100644 python/cudf_polars/tests/experimental/test_dataframescan.py diff --git a/python/cudf_polars/cudf_polars/callback.py b/python/cudf_polars/cudf_polars/callback.py index 95527028aa9..29d3dc4ae79 100644 --- a/python/cudf_polars/cudf_polars/callback.py +++ b/python/cudf_polars/cudf_polars/callback.py @@ -217,7 +217,8 @@ def validate_config_options(config: dict) -> None: If the configuration contains unsupported options. """ if unsupported := ( - config.keys() - {"raise_on_fail", "parquet_options", "executor"} + config.keys() + - {"raise_on_fail", "parquet_options", "executor", "executor_options"} ): raise ValueError( f"Engine configuration contains unsupported settings: {unsupported}" @@ -226,6 +227,17 @@ def validate_config_options(config: dict) -> None: config.get("parquet_options", {}) ) + # Validate executor_options + executor = config.get("executor", "pylibcudf") + if executor == "dask-experimental": + unsupported = config.get("executor_options", {}).keys() - { + "max_rows_per_partition" + } + else: + unsupported = config.get("executor_options", {}).keys() + if unsupported: + raise ValueError(f"Unsupported executor_options for {executor}: {unsupported}") + def execute_with_cudf(nt: NodeTraverser, *, config: GPUEngine) -> None: """ diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index a28b4cf25b2..1faa778ccf6 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -688,14 +688,16 @@ class DataFrameScan(IR): This typically arises from ``q.collect().lazy()`` """ - __slots__ = ("df", "predicate", "projection") - _non_child = ("schema", "df", "projection", "predicate") + __slots__ = ("config_options", "df", "predicate", "projection") + _non_child = ("schema", "df", "projection", "predicate", "config_options") df: Any """Polars LazyFrame object.""" projection: tuple[str, ...] 
| None """List of columns to project out.""" predicate: expr.NamedExpr | None """Mask to apply.""" + config_options: dict[str, Any] + """GPU-specific configuration options""" def __init__( self, @@ -703,11 +705,13 @@ def __init__( df: Any, projection: Sequence[str] | None, predicate: expr.NamedExpr | None, + config_options: dict[str, Any], ): self.schema = schema self.df = df self.projection = tuple(projection) if projection is not None else None self.predicate = predicate + self.config_options = config_options self._non_child_args = (schema, df, self.projection, predicate) self.children = () @@ -719,7 +723,14 @@ def get_hashable(self) -> Hashable: not stable across runs, or repeat instances of the same equal dataframes. """ schema_hash = tuple(self.schema.items()) - return (type(self), schema_hash, id(self.df), self.projection, self.predicate) + return ( + type(self), + schema_hash, + id(self.df), + self.projection, + self.predicate, + json.dumps(self.config_options), + ) @classmethod def do_evaluate( diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index b1e2de63ba6..37cf36dc4dd 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -263,6 +263,7 @@ def _( translate_named_expr(translator, n=node.selection) if node.selection is not None else None, + translator.config.config.copy(), ) diff --git a/python/cudf_polars/cudf_polars/experimental/base.py b/python/cudf_polars/cudf_polars/experimental/base.py new file mode 100644 index 00000000000..8f660632df2 --- /dev/null +++ b/python/cudf_polars/cudf_polars/experimental/base.py @@ -0,0 +1,43 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +"""Multi-partition base classes.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +from cudf_polars.dsl.ir import Union + +if TYPE_CHECKING: + from collections.abc import Iterator, Sequence + + from cudf_polars.containers import DataFrame + from cudf_polars.dsl.nodebase import Node + + +class PartitionInfo: + """ + Partitioning information. + + This class only tracks the partition count (for now). + """ + + __slots__ = ("count",) + + def __init__(self, count: int): + self.count = count + + def keys(self, node: Node) -> Iterator[tuple[str, int]]: + """Return the partitioned keys for a given node.""" + name = get_key_name(node) + yield from ((name, i) for i in range(self.count)) + + +def get_key_name(node: Node) -> str: + """Generate the key name for a Node.""" + return f"{type(node).__name__.lower()}-{hash(node)}" + + +def _concat(dfs: Sequence[DataFrame]) -> DataFrame: + # Concatenate a sequence of DataFrames vertically + return Union.do_evaluate(None, *dfs) diff --git a/python/cudf_polars/cudf_polars/experimental/dispatch.py b/python/cudf_polars/cudf_polars/experimental/dispatch.py new file mode 100644 index 00000000000..79a52ff3cde --- /dev/null +++ b/python/cudf_polars/cudf_polars/experimental/dispatch.py @@ -0,0 +1,84 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 +"""Multi-partition dispatch functions.""" + +from __future__ import annotations + +from functools import singledispatch +from typing import TYPE_CHECKING, Any + +if TYPE_CHECKING: + from collections.abc import MutableMapping + from typing import TypeAlias + + from cudf_polars.dsl.ir import IR + from cudf_polars.experimental.base import PartitionInfo + from cudf_polars.typing import GenericTransformer + + +LowerIRTransformer: TypeAlias = ( + "GenericTransformer[IR, tuple[IR, MutableMapping[IR, PartitionInfo]]]" +) +"""Protocol for Lowering IR nodes.""" + + +@singledispatch +def lower_ir_node( + ir: IR, rec: LowerIRTransformer +) -> tuple[IR, MutableMapping[IR, PartitionInfo]]: + """ + Rewrite an IR node and extract partitioning information. + + Parameters + ---------- + ir + IR node to rewrite. + rec + Recursive LowerIRTransformer callable. + + Returns + ------- + new_ir, partition_info + The rewritten node, and a mapping from unique nodes in + the full IR graph to associated partitioning information. + + Notes + ----- + This function is used by `lower_ir_graph`. + + See Also + -------- + lower_ir_graph + """ + raise AssertionError(f"Unhandled type {type(ir)}") # pragma: no cover + + +@singledispatch +def generate_ir_tasks( + ir: IR, partition_info: MutableMapping[IR, PartitionInfo] +) -> MutableMapping[Any, Any]: + """ + Generate a task graph for evaluation of an IR node. + + Parameters + ---------- + ir + IR node to generate tasks for. + partition_info + Partitioning information, obtained from :func:`lower_ir_graph`. + + Returns + ------- + mapping + A (partial) dask task graph for the evaluation of an ir node. + + Notes + ----- + Task generation should only produce the tasks for the current node, + referring to child tasks by name. + + See Also + -------- + task_graph + """ + raise AssertionError(f"Unhandled type {type(ir)}") # pragma: no cover diff --git a/python/cudf_polars/cudf_polars/experimental/io.py b/python/cudf_polars/cudf_polars/experimental/io.py new file mode 100644 index 00000000000..3a1fec36079 --- /dev/null +++ b/python/cudf_polars/cudf_polars/experimental/io.py @@ -0,0 +1,49 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 +"""Multi-partition IO Logic.""" + +from __future__ import annotations + +import math +from typing import TYPE_CHECKING + +from cudf_polars.dsl.ir import DataFrameScan, Union +from cudf_polars.experimental.base import PartitionInfo +from cudf_polars.experimental.dispatch import lower_ir_node + +if TYPE_CHECKING: + from collections.abc import MutableMapping + + from cudf_polars.dsl.ir import IR + from cudf_polars.experimental.dispatch import LowerIRTransformer + + +@lower_ir_node.register(DataFrameScan) +def _( + ir: DataFrameScan, rec: LowerIRTransformer +) -> tuple[IR, MutableMapping[IR, PartitionInfo]]: + rows_per_partition = ir.config_options.get("executor_options", {}).get( + "max_rows_per_partition", 1_000_000 + ) + + nrows = max(ir.df.shape()[0], 1) + count = math.ceil(nrows / rows_per_partition) + + if count > 1: + length = math.ceil(nrows / count) + slices = [ + DataFrameScan( + ir.schema, + ir.df.slice(offset, length), + ir.projection, + ir.predicate, + ir.config_options, + ) + for offset in range(0, nrows, length) + ] + new_node = Union(ir.schema, None, *slices) + return new_node, {slice: PartitionInfo(count=1) for slice in slices} | { + new_node: PartitionInfo(count=count) + } + + return ir, {ir: PartitionInfo(count=1)} diff --git a/python/cudf_polars/cudf_polars/experimental/parallel.py b/python/cudf_polars/cudf_polars/experimental/parallel.py index 6518dd60c7d..e5884f1c574 100644 --- a/python/cudf_polars/cudf_polars/experimental/parallel.py +++ b/python/cudf_polars/cudf_polars/experimental/parallel.py @@ -1,93 +1,46 @@ # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. # SPDX-License-Identifier: Apache-2.0 -"""Partitioned LogicalPlan nodes.""" +"""Multi-partition Dask execution.""" from __future__ import annotations +import itertools import operator -from functools import reduce, singledispatch +from functools import reduce from typing import TYPE_CHECKING, Any -from cudf_polars.dsl.ir import IR -from cudf_polars.dsl.traversal import traversal +import cudf_polars.experimental.io # noqa: F401 +from cudf_polars.dsl.ir import IR, Cache, Projection, Union +from cudf_polars.dsl.traversal import CachingVisitor, traversal +from cudf_polars.experimental.base import PartitionInfo, _concat, get_key_name +from cudf_polars.experimental.dispatch import ( + generate_ir_tasks, + lower_ir_node, +) if TYPE_CHECKING: from collections.abc import MutableMapping - from typing import TypeAlias from cudf_polars.containers import DataFrame - from cudf_polars.dsl.nodebase import Node - from cudf_polars.typing import GenericTransformer - - -class PartitionInfo: - """ - Partitioning information. - - This class only tracks the partition count (for now). - """ - - __slots__ = ("count",) - - def __init__(self, count: int): - self.count = count - - -LowerIRTransformer: TypeAlias = ( - "GenericTransformer[IR, MutableMapping[IR, PartitionInfo]]" -) -"""Protocol for Lowering IR nodes.""" - - -def get_key_name(node: Node) -> str: - """Generate the key name for a Node.""" - return f"{type(node).__name__.lower()}-{hash(node)}" - - -@singledispatch -def lower_ir_node( - ir: IR, rec: LowerIRTransformer -) -> tuple[IR, MutableMapping[IR, PartitionInfo]]: - """ - Rewrite an IR node and extract partitioning information. - - Parameters - ---------- - ir - IR node to rewrite. - rec - Recursive LowerIRTransformer callable. 
- - Returns - ------- - new_ir, partition_info - The rewritten node, and a mapping from unique nodes in - the full IR graph to associated partitioning information. - - Notes - ----- - This function is used by `lower_ir_graph`. - - See Also - -------- - lower_ir_graph - """ - raise AssertionError(f"Unhandled type {type(ir)}") # pragma: no cover + from cudf_polars.experimental.dispatch import LowerIRTransformer @lower_ir_node.register(IR) def _(ir: IR, rec: LowerIRTransformer) -> tuple[IR, MutableMapping[IR, PartitionInfo]]: + # Default logic - Requires single partition + if len(ir.children) == 0: # Default leaf node has single partition - return ir, {ir: PartitionInfo(count=1)} + return ir, { + ir: PartitionInfo(count=1) + } # pragma: no cover; Missed by pylibcudf executor # Lower children - children, _partition_info = zip(*(rec(c) for c in ir.children), strict=False) + children, _partition_info = zip(*(rec(c) for c in ir.children), strict=True) partition_info = reduce(operator.or_, _partition_info) # Check that child partitioning is supported - count = max(partition_info[c].count for c in children) - if count > 1: + if any(partition_info[c].count > 1 for c in children): raise NotImplementedError( f"Class {type(ir)} does not support multiple partitions." ) # pragma: no cover @@ -123,41 +76,62 @@ def lower_ir_graph(ir: IR) -> tuple[IR, MutableMapping[IR, PartitionInfo]]: -------- lower_ir_node """ - from cudf_polars.dsl.traversal import CachingVisitor - mapper = CachingVisitor(lower_ir_node) return mapper(ir) -@singledispatch -def generate_ir_tasks( +def task_graph( ir: IR, partition_info: MutableMapping[IR, PartitionInfo] -) -> MutableMapping[Any, Any]: +) -> tuple[MutableMapping[Any, Any], str | tuple[str, int]]: """ - Generate a task graph for evaluation of an IR node. + Construct a task graph for evaluation of an IR graph. Parameters ---------- ir - IR node to generate tasks for. + Root of the graph to rewrite. partition_info - Partitioning information, obtained from :func:`lower_ir_graph`. + A mapping from all unique IR nodes to the + associated partitioning information. Returns ------- - mapping - A (partial) dask task graph for the evaluation of an ir node. + graph + A Dask-compatible task graph for the entire + IR graph with root `ir`. Notes ----- - Task generation should only produce the tasks for the current node, - referring to child tasks by name. + This function traverses the unique nodes of the + graph with root `ir`, and extracts the tasks for + each node with :func:`generate_ir_tasks`. See Also -------- - task_graph + generate_ir_tasks """ - raise AssertionError(f"Unhandled type {type(ir)}") # pragma: no cover + graph = reduce( + operator.or_, + (generate_ir_tasks(node, partition_info) for node in traversal(ir)), + ) + + key_name = get_key_name(ir) + partition_count = partition_info[ir].count + if partition_count > 1: + graph[key_name] = (_concat, list(partition_info[ir].keys(ir))) + return graph, key_name + else: + return graph, (key_name, 0) + + +def evaluate_dask(ir: IR) -> DataFrame: + """Evaluate an IR graph with Dask.""" + from dask import get + + ir, partition_info = lower_ir_graph(ir) + + graph, key = task_graph(ir, partition_info) + return get(graph, key) @generate_ir_tasks.register(IR) @@ -189,48 +163,85 @@ def _( } -def task_graph( - ir: IR, partition_info: MutableMapping[IR, PartitionInfo] -) -> tuple[MutableMapping[Any, Any], str | tuple[str, int]]: - """ - Construct a task graph for evaluation of an IR graph. 
+@lower_ir_node.register(Union) +def _( + ir: Union, rec: LowerIRTransformer +) -> tuple[IR, MutableMapping[IR, PartitionInfo]]: + # Lower children + children, _partition_info = zip(*(rec(c) for c in ir.children), strict=True) + partition_info = reduce(operator.or_, _partition_info) - Parameters - ---------- - ir - Root of the graph to rewrite. - partition_info - A mapping from all unique IR nodes to the - associated partitioning information. + # Check zlice + if ir.zlice is not None: # pragma: no cover + if any(p[c].count > 1 for p, c in zip(children, _partition_info, strict=False)): + raise NotImplementedError("zlice is not supported for multiple partitions.") + new_node = ir.reconstruct(children) + partition_info[new_node] = PartitionInfo(count=1) + return new_node, partition_info - Returns - ------- - graph - A Dask-compatible task graph for the entire - IR graph with root `ir`. + # Partition count is the sum of all child partitions + count = sum(partition_info[c].count for c in children) - Notes - ----- - This function traverses the unique nodes of the - graph with root `ir`, and extracts the tasks for - each node with :func:`generate_ir_tasks`. + # Return reconstructed node and partition-info dict + new_node = ir.reconstruct(children) + partition_info[new_node] = PartitionInfo(count=count) + return new_node, partition_info - See Also - -------- - generate_ir_tasks - """ - graph = reduce( - operator.or_, - (generate_ir_tasks(node, partition_info) for node in traversal(ir)), - ) - return graph, (get_key_name(ir), 0) +@generate_ir_tasks.register(Union) +def _( + ir: Union, partition_info: MutableMapping[IR, PartitionInfo] +) -> MutableMapping[Any, Any]: + key_name = get_key_name(ir) + partition = itertools.count() + return { + (key_name, next(partition)): child_key + for child in ir.children + for child_key in partition_info[child].keys(child) + } -def evaluate_dask(ir: IR) -> DataFrame: - """Evaluate an IR graph with Dask.""" - from dask import get - ir, partition_info = lower_ir_graph(ir) +def _lower_ir_pwise( + ir: IR, rec: LowerIRTransformer +) -> tuple[IR, MutableMapping[IR, PartitionInfo]]: + # Lower a partition-wise (i.e. embarrassingly-parallel) IR node - graph, key = task_graph(ir, partition_info) - return get(graph, key) + # Lower children + children, _partition_info = zip(*(rec(c) for c in ir.children), strict=True) + partition_info = reduce(operator.or_, _partition_info) + counts = {partition_info[c].count for c in children} + + # Check that child partitioning is supported + if len(counts) > 1: + raise NotImplementedError( + f"Class {type(ir)} does not support unbalanced partitions." + ) # pragma: no cover + + # Return reconstructed node and partition-info dict + partition = PartitionInfo(count=max(counts)) + new_node = ir.reconstruct(children) + partition_info[new_node] = partition + return new_node, partition_info + + +lower_ir_node.register(Projection, _lower_ir_pwise) +lower_ir_node.register(Cache, _lower_ir_pwise) + + +def _generate_ir_tasks_pwise( + ir: IR, partition_info: MutableMapping[IR, PartitionInfo] +) -> MutableMapping[Any, Any]: + # Generate partition-wise (i.e. 
embarrassingly-parallel) tasks + child_names = [get_key_name(c) for c in ir.children] + return { + key: ( + ir.do_evaluate, + *ir._non_child_args, + *[(child_name, i) for child_name in child_names], + ) + for i, key in enumerate(partition_info[ir].keys(ir)) + } + + +generate_ir_tasks.register(Projection, _generate_ir_tasks_pwise) +generate_ir_tasks.register(Cache, _generate_ir_tasks_pwise) diff --git a/python/cudf_polars/tests/dsl/test_traversal.py b/python/cudf_polars/tests/dsl/test_traversal.py index 2f4df9289f8..9755994c419 100644 --- a/python/cudf_polars/tests/dsl/test_traversal.py +++ b/python/cudf_polars/tests/dsl/test_traversal.py @@ -116,7 +116,11 @@ def test_rewrite_ir_node(): def replace_df(node, rec): if isinstance(node, ir.DataFrameScan): return ir.DataFrameScan( - node.schema, new_df._df, node.projection, node.predicate + node.schema, + new_df._df, + node.projection, + node.predicate, + node.config_options, ) return reuse_if_unchanged(node, rec) @@ -144,7 +148,11 @@ def test_rewrite_scan_node(tmp_path): def replace_scan(node, rec): if isinstance(node, ir.Scan): return ir.DataFrameScan( - node.schema, right._df, node.with_columns, node.predicate + node.schema, + right._df, + node.with_columns, + node.predicate, + node.config_options, ) return reuse_if_unchanged(node, rec) diff --git a/python/cudf_polars/tests/experimental/test_dataframescan.py b/python/cudf_polars/tests/experimental/test_dataframescan.py new file mode 100644 index 00000000000..77c7bf0c503 --- /dev/null +++ b/python/cudf_polars/tests/experimental/test_dataframescan.py @@ -0,0 +1,53 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import pytest + +import polars as pl + +from cudf_polars import Translator +from cudf_polars.experimental.parallel import lower_ir_graph +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +@pytest.fixture(scope="module") +def df(): + return pl.LazyFrame( + { + "x": range(30_000), + "y": ["cat", "dog", "fish"] * 10_000, + "z": [1.0, 2.0, 3.0, 4.0, 5.0] * 6_000, + } + ) + + +@pytest.mark.parametrize("max_rows_per_partition", [1_000, 1_000_000]) +def test_parallel_dataframescan(df, max_rows_per_partition): + total_row_count = len(df.collect()) + engine = pl.GPUEngine( + raise_on_fail=True, + executor="dask-experimental", + executor_options={"max_rows_per_partition": max_rows_per_partition}, + ) + assert_gpu_result_equal(df, engine=engine) + + # Check partitioning + qir = Translator(df._ldf.visit(), engine).translate_ir() + ir, info = lower_ir_graph(qir) + count = info[ir].count + if max_rows_per_partition < total_row_count: + assert count > 1 + else: + assert count == 1 + + +def test_dataframescan_concat(df): + engine = pl.GPUEngine( + raise_on_fail=True, + executor="dask-experimental", + executor_options={"max_rows_per_partition": 1_000}, + ) + df2 = pl.concat([df, df]) + assert_gpu_result_equal(df2, engine=engine) diff --git a/python/cudf_polars/tests/test_executors.py b/python/cudf_polars/tests/test_executors.py index 3eaea2ec9ea..b8c0bb926ab 100644 --- a/python/cudf_polars/tests/test_executors.py +++ b/python/cudf_polars/tests/test_executors.py @@ -66,3 +66,19 @@ def test_unknown_executor(): match="ValueError: Unknown executor 'unknown-executor'", ): assert_gpu_result_equal(df, executor="unknown-executor") + + +@pytest.mark.parametrize("executor", [None, "pylibcudf", "dask-experimental"]) +def test_unknown_executor_options(executor): + df = pl.LazyFrame({}) 
+ + with pytest.raises( + pl.exceptions.ComputeError, + match="Unsupported executor_options", + ): + df.collect( + engine=pl.GPUEngine( + executor=executor, + executor_options={"foo": None}, + ) + ) From 4696bbf91ca37ab6960b606d1f7763487ee03ef6 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Tue, 3 Dec 2024 12:58:35 -0500 Subject: [PATCH 10/78] Revert "Temporarily skip tests due to dask/distributed#8953" (#17492) Reverts rapidsai/cudf#17472 The new dask nightly has resolved https://github.com/dask/distributed/issues/8953 --- .../custreamz/tests/test_dataframes.py | 56 +++---------------- 1 file changed, 7 insertions(+), 49 deletions(-) diff --git a/python/custreamz/custreamz/tests/test_dataframes.py b/python/custreamz/custreamz/tests/test_dataframes.py index 6905044039c..8c0130d2818 100644 --- a/python/custreamz/custreamz/tests/test_dataframes.py +++ b/python/custreamz/custreamz/tests/test_dataframes.py @@ -216,13 +216,7 @@ def test_set_index(): assert_eq(b[0], df.set_index(df.y + 1)) -def test_binary_stream_operators(request, stream): - request.applymarker( - pytest.mark.xfail( - isinstance(stream, DaskStream), - reason="https://github.com/dask/distributed/issues/8953", - ) - ) +def test_binary_stream_operators(stream): df = cudf.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) expected = df.x + df.y @@ -248,13 +242,7 @@ def test_index(stream): assert_eq(L[1], df.index + 5) -def test_pair_arithmetic(request, stream): - request.applymarker( - pytest.mark.xfail( - isinstance(stream, DaskStream), - reason="https://github.com/dask/distributed/issues/8953", - ) - ) +def test_pair_arithmetic(stream): df = cudf.DataFrame({"x": list(range(10)), "y": [1] * 10}) a = DataFrame(example=df.iloc[:0], stream=stream) @@ -267,13 +255,7 @@ def test_pair_arithmetic(request, stream): assert_eq(cudf.concat(L), (df.x + df.y) * 2) -def test_getitem(request, stream): - request.applymarker( - pytest.mark.xfail( - isinstance(stream, DaskStream), - reason="https://github.com/dask/distributed/issues/8953", - ) - ) +def test_getitem(stream): df = cudf.DataFrame({"x": list(range(10)), "y": [1] * 10}) a = DataFrame(example=df.iloc[:0], stream=stream) @@ -350,13 +332,7 @@ def test_repr_html(stream): assert "1" in html -def test_setitem(request, stream): - request.applymarker( - pytest.mark.xfail( - isinstance(stream, DaskStream), - reason="https://github.com/dask/distributed/issues/8953", - ) - ) +def test_setitem(stream): df = cudf.DataFrame({"x": list(range(10)), "y": [1] * 10}) sdf = DataFrame(example=df.iloc[:0], stream=stream) @@ -380,13 +356,7 @@ def test_setitem(request, stream): assert_eq(L[-1], df.mean()) -def test_setitem_overwrites(request, stream): - request.applymarker( - pytest.mark.xfail( - isinstance(stream, DaskStream), - reason="https://github.com/dask/distributed/issues/8953", - ) - ) +def test_setitem_overwrites(stream): df = cudf.DataFrame({"x": list(range(10))}) sdf = DataFrame(example=df.iloc[:0], stream=stream) stream = sdf.stream @@ -443,14 +413,8 @@ def test_setitem_overwrites(request, stream): ], ) def test_rolling_count_aggregations( - request, op, window, m, pre_get, post_get, kwargs, stream + op, window, m, pre_get, post_get, kwargs, stream ): - request.applymarker( - pytest.mark.xfail( - isinstance(stream, DaskStream) and len(kwargs) == 0, - reason="https://github.com/dask/distributed/issues/8953", - ) - ) index = pd.DatetimeIndex( pd.date_range("2000-01-01", "2000-01-03", freq="1h") ) @@ -844,13 +808,7 @@ def 
test_reductions_with_start_state(stream):
     assert output2[0] == 360
 
 
-def test_rolling_aggs_with_start_state(request, stream):
-    request.applymarker(
-        pytest.mark.xfail(
-            isinstance(stream, DaskStream),
-            reason="https://github.com/dask/distributed/issues/8953",
-        )
-    )
+def test_rolling_aggs_with_start_state(stream):
     example = cudf.DataFrame({"name": [], "amount": []}, dtype="float64")
     sdf = DataFrame(stream, example=example)
     output0 = (

From d3e94d458ddeaced5ba34a825ab0af5275b73dbe Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani
Date: Tue, 3 Dec 2024 10:03:29 -0800
Subject: [PATCH 11/78] Apply clang-tidy autofixes from new rules (#17431)

This PR contains all of clang-tidy's autofixes for the rules outlined in
https://github.com/rapidsai/cudf/issues/17410. Along the way, I simplified
the process of running the autofixes locally.

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Vukasin Milovanovic (https://github.com/vuule)
  - David Wendt (https://github.com/davidwendt)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/17431
---
 ci/cpp_linters.sh | 2 +-
 cpp/CMakeLists.txt | 14 +-
 cpp/src/bitmask/is_element_valid.cpp | 4 +-
 cpp/src/column/column_view.cpp | 97 +++---
 cpp/src/copying/copy.cpp | 12 +-
 cpp/src/copying/pack.cpp | 81 +++--
 cpp/src/datetime/timezone.cpp | 2 +-
 cpp/src/groupby/sort/aggregate.cpp | 96 +++---
 cpp/src/interop/dlpack.cpp | 4 +-
 cpp/src/interop/to_arrow_schema.cpp | 4 +-
 cpp/src/io/avro/avro.cpp | 12 +-
 cpp/src/io/comp/comp.cpp | 8 +-
 cpp/src/io/comp/nvcomp_adapter.cpp | 280 +++++++++---------
 cpp/src/io/comp/uncomp.cpp | 40 +--
 cpp/src/io/functions.cpp | 63 ++--
 cpp/src/io/json/parser_features.cpp | 139 ++++-----
 cpp/src/io/parquet/arrow_schema_writer.cpp | 2 +-
 .../io/parquet/compact_protocol_reader.cpp | 131 ++++----
 .../io/parquet/compact_protocol_writer.cpp | 2 +-
 cpp/src/io/parquet/predicate_pushdown.cpp | 5 +-
 cpp/src/io/parquet/reader_impl.cpp | 6 +-
 cpp/src/io/parquet/reader_impl_helpers.cpp | 18 +-
 cpp/src/io/text/bgzip_utils.cpp | 2 +-
 cpp/src/io/utilities/base64_utilities.cpp | 6 +-
 cpp/src/io/utilities/data_sink.cpp | 4 +-
 cpp/src/io/utilities/datasource.cpp | 8 +-
 cpp/src/io/utilities/file_io_utilities.cpp | 41 +--
 cpp/src/jit/cache.cpp | 12 +-
 cpp/src/jit/parser.cpp | 56 ++--
 .../quantiles/tdigest/tdigest_column_view.cpp | 8 +-
 cpp/src/reductions/scan/scan.cpp | 3 +-
 cpp/src/reductions/segmented/reductions.cpp | 3 +
 .../detail/optimized_unbounded_window.cpp | 54 ++--
 cpp/src/strings/regex/regcomp.cpp | 14 +-
 cpp/src/strings/regex/regexec.cpp | 6 +-
 cpp/src/structs/utilities.cpp | 2 +-
 cpp/src/table/table_view.cpp | 33 ++-
 cpp/src/transform/transform.cpp | 7 +-
 cpp/src/utilities/prefetch.cpp | 4 +-
 cpp/src/utilities/stream_pool.cpp | 112 +++----

diff --git a/ci/cpp_linters.sh b/ci/cpp_linters.sh
index 4d5b62ba280..9702b055512 100755
--- a/ci/cpp_linters.sh
+++ b/ci/cpp_linters.sh
@@ -27,7 +27,7 @@ source rapids-configure-sccache
 
 # Run the build via CMake, which will run clang-tidy when CUDF_STATIC_LINTERS is enabled.
iwyu_flag="" -if [[ "${RAPIDS_BUILD_TYPE}" == "nightly" ]]; then +if [[ "${RAPIDS_BUILD_TYPE:-}" == "nightly" ]]; then iwyu_flag="-DCUDF_IWYU=ON" fi cmake -S cpp -B cpp/build -DCMAKE_BUILD_TYPE=Release -DCUDF_CLANG_TIDY=ON ${iwyu_flag} -DBUILD_TESTS=OFF -GNinja diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index f25b46a52cd..12e6826f301 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -93,6 +93,7 @@ option( mark_as_advanced(CUDF_BUILD_STREAMS_TEST_UTIL) option(CUDF_CLANG_TIDY "Enable clang-tidy during compilation" OFF) option(CUDF_IWYU "Enable IWYU during compilation" OFF) +option(CUDF_CLANG_TIDY_AUTOFIX "Enable clang-tidy autofixes" OFF) option( CUDF_KVIKIO_REMOTE_IO @@ -205,9 +206,16 @@ function(enable_static_checkers target) if(_LINT_CLANG_TIDY) # clang will complain about unused link libraries on the compile line unless we specify # -Qunused-arguments. - set_target_properties( - ${target} PROPERTIES CXX_CLANG_TIDY "${CLANG_TIDY_EXE};--extra-arg=-Qunused-arguments" - ) + if(CUDF_CLANG_TIDY_AUTOFIX) + set_target_properties( + ${target} PROPERTIES CXX_CLANG_TIDY + "${CLANG_TIDY_EXE};--extra-arg=-Qunused-arguments;--fix" + ) + else() + set_target_properties( + ${target} PROPERTIES CXX_CLANG_TIDY "${CLANG_TIDY_EXE};--extra-arg=-Qunused-arguments" + ) + endif() endif() if(_LINT_IWYU) # A few extra warnings pop up when building with IWYU. I'm not sure why, but they are not diff --git a/cpp/src/bitmask/is_element_valid.cpp b/cpp/src/bitmask/is_element_valid.cpp index 4806c7a94e8..7eb80c4249e 100644 --- a/cpp/src/bitmask/is_element_valid.cpp +++ b/cpp/src/bitmask/is_element_valid.cpp @@ -30,9 +30,9 @@ bool is_element_valid_sync(column_view const& col_view, CUDF_EXPECTS(element_index >= 0 and element_index < col_view.size(), "invalid index."); if (!col_view.nullable()) { return true; } - bitmask_type word; + bitmask_type word = 0; // null_mask() returns device ptr to bitmask without offset - size_type index = element_index + col_view.offset(); + size_type const index = element_index + col_view.offset(); CUDF_CUDA_TRY(cudaMemcpyAsync(&word, col_view.null_mask() + word_index(index), sizeof(bitmask_type), diff --git a/cpp/src/column/column_view.cpp b/cpp/src/column/column_view.cpp index e831aa9645d..ea940676f6a 100644 --- a/cpp/src/column/column_view.cpp +++ b/cpp/src/column/column_view.cpp @@ -41,7 +41,7 @@ void prefetch_col_data(ColumnView& col, void const* data_ptr, std::string_view k cudf::experimental::prefetch::detail::prefetch_noexcept( key, data_ptr, col.size() * size_of(col.type()), cudf::get_default_stream()); } else if (col.type().id() == type_id::STRING) { - strings_column_view scv{col}; + strings_column_view const scv{col}; if (data_ptr == nullptr) { // Do not call chars_size if the data_ptr is nullptr. 
return; @@ -58,51 +58,6 @@ void prefetch_col_data(ColumnView& col, void const* data_ptr, std::string_view k } } -} // namespace - -column_view_base::column_view_base(data_type type, - size_type size, - void const* data, - bitmask_type const* null_mask, - size_type null_count, - size_type offset) - : _type{type}, - _size{size}, - _data{data}, - _null_mask{null_mask}, - _null_count{null_count}, - _offset{offset} -{ - CUDF_EXPECTS(size >= 0, "Column size cannot be negative."); - - if (type.id() == type_id::EMPTY) { - _null_count = size; - CUDF_EXPECTS(nullptr == data, "EMPTY column should have no data."); - CUDF_EXPECTS(nullptr == null_mask, "EMPTY column should have no null mask."); - } else if (is_compound(type)) { - if (type.id() != type_id::STRING) { - CUDF_EXPECTS(nullptr == data, "Compound (parent) columns cannot have data"); - } - } else if (size > 0) { - CUDF_EXPECTS(nullptr != data, "Null data pointer."); - } - - CUDF_EXPECTS(offset >= 0, "Invalid offset."); - - if ((null_count > 0) and (type.id() != type_id::EMPTY)) { - CUDF_EXPECTS(nullptr != null_mask, "Invalid null mask for non-zero null count."); - } -} - -size_type column_view_base::null_count(size_type begin, size_type end) const -{ - CUDF_EXPECTS((begin >= 0) && (end <= size()) && (begin <= end), "Range is out of bounds."); - return (null_count() == 0) - ? 0 - : cudf::detail::null_count( - null_mask(), offset() + begin, offset() + end, cudf::get_default_stream()); -} - // Struct to use custom hash combine and fold expression struct HashValue { std::size_t hash; @@ -133,8 +88,6 @@ std::size_t shallow_hash_impl(column_view const& c, bool is_parent_empty = false }); } -std::size_t shallow_hash(column_view const& input) { return shallow_hash_impl(input); } - bool shallow_equivalent_impl(column_view const& lhs, column_view const& rhs, bool is_parent_empty = false) @@ -151,11 +104,59 @@ bool shallow_equivalent_impl(column_view const& lhs, return shallow_equivalent_impl(lhs_child, rhs_child, is_empty); }); } + +} // namespace + +column_view_base::column_view_base(data_type type, + size_type size, + void const* data, + bitmask_type const* null_mask, + size_type null_count, + size_type offset) + : _type{type}, + _size{size}, + _data{data}, + _null_mask{null_mask}, + _null_count{null_count}, + _offset{offset} +{ + CUDF_EXPECTS(size >= 0, "Column size cannot be negative."); + + if (type.id() == type_id::EMPTY) { + _null_count = size; + CUDF_EXPECTS(nullptr == data, "EMPTY column should have no data."); + CUDF_EXPECTS(nullptr == null_mask, "EMPTY column should have no null mask."); + } else if (is_compound(type)) { + if (type.id() != type_id::STRING) { + CUDF_EXPECTS(nullptr == data, "Compound (parent) columns cannot have data"); + } + } else if (size > 0) { + CUDF_EXPECTS(nullptr != data, "Null data pointer."); + } + + CUDF_EXPECTS(offset >= 0, "Invalid offset."); + + if ((null_count > 0) and (type.id() != type_id::EMPTY)) { + CUDF_EXPECTS(nullptr != null_mask, "Invalid null mask for non-zero null count."); + } +} + +size_type column_view_base::null_count(size_type begin, size_type end) const +{ + CUDF_EXPECTS((begin >= 0) && (end <= size()) && (begin <= end), "Range is out of bounds."); + return (null_count() == 0) + ? 
0
+           : cudf::detail::null_count(
+               null_mask(), offset() + begin, offset() + end, cudf::get_default_stream());
+}
+
 bool is_shallow_equivalent(column_view const& lhs, column_view const& rhs)
 {
   return shallow_equivalent_impl(lhs, rhs);
 }
 
+std::size_t shallow_hash(column_view const& input) { return shallow_hash_impl(input); }
+
 }  // namespace detail
 
 // Immutable view constructor
diff --git a/cpp/src/copying/copy.cpp b/cpp/src/copying/copy.cpp
index 5e2065ba844..89d8cc3f4aa 100644
--- a/cpp/src/copying/copy.cpp
+++ b/cpp/src/copying/copy.cpp
@@ -62,11 +62,12 @@ struct scalar_empty_like_functor_impl<cudf::list_view> {
   auto ls = static_cast<list_scalar const*>(&input);
 
   // TODO: add a manual constructor for lists_column_view.
-  column_view offsets{cudf::data_type{cudf::type_id::INT32}, 0, nullptr, nullptr, 0};
+  column_view const offsets{cudf::data_type{cudf::type_id::INT32}, 0, nullptr, nullptr, 0};
   std::vector<column_view> children;
   children.push_back(offsets);
   children.push_back(ls->view());
-  column_view lcv{cudf::data_type{cudf::type_id::LIST}, 0, nullptr, nullptr, 0, 0, children};
+  column_view const lcv{
+    cudf::data_type{cudf::type_id::LIST}, 0, nullptr, nullptr, 0, 0, children};
 
   return empty_like(lcv);
 }
@@ -81,8 +82,9 @@ struct scalar_empty_like_functor_impl<cudf::struct_view> {
   // TODO: add a manual constructor for structs_column_view
   // TODO: add cudf::get_element() support for structs
   cudf::table_view tbl = ss->view();
-  std::vector<column_view> children(tbl.begin(), tbl.end());
-  column_view scv{cudf::data_type{cudf::type_id::STRUCT}, 0, nullptr, nullptr, 0, 0, children};
+  std::vector<column_view> const children(tbl.begin(), tbl.end());
+  column_view const scv{
+    cudf::data_type{cudf::type_id::STRUCT}, 0, nullptr, nullptr, 0, 0, children};
 
   return empty_like(scv);
 }
@@ -120,7 +122,7 @@ std::unique_ptr<column> allocate_like(column_view const& input,
   CUDF_FUNC_RANGE();
   CUDF_EXPECTS(
     is_fixed_width(input.type()), "Expects only fixed-width type column", cudf::data_type_error);
-  mask_state allocate_mask = should_allocate_mask(mask_alloc, input.nullable());
+  mask_state const allocate_mask = should_allocate_mask(mask_alloc, input.nullable());
 
   return std::make_unique<column>(input.type(),
                                   size,
diff --git a/cpp/src/copying/pack.cpp b/cpp/src/copying/pack.cpp
index a001807c82b..42ea28f5961 100644
--- a/cpp/src/copying/pack.cpp
+++ b/cpp/src/copying/pack.cpp
@@ -48,20 +48,20 @@ struct serialized_column {
       null_count(_null_count),
       data_offset(_data_offset),
       null_mask_offset(_null_mask_offset),
-      num_children(_num_children),
-      pad(0)
+      num_children(_num_children)
+
   {
   }
 
   data_type type;
-  size_type size;
-  size_type null_count;
-  int64_t data_offset;       // offset into contiguous data buffer, or -1 if column data is null
-  int64_t null_mask_offset;  // offset into contiguous data buffer, or -1 if column data is null
-  size_type num_children;
+  size_type size{};
+  size_type null_count{};
+  int64_t data_offset{};       // offset into contiguous data buffer, or -1 if column data is null
+  int64_t null_mask_offset{};  // offset into contiguous data buffer, or -1 if column data is null
+  size_type num_children{};
   // Explicitly pad to avoid uninitialized padding bits, allowing `serialized_column` to be bit-wise
   // comparable
-  int pad;
+  int pad{};
 };
 
 /**
@@ -137,6 +137,34 @@ void build_column_metadata(metadata_builder& mb,
   });
 }
 
+table_view unpack(uint8_t const* metadata, uint8_t const* gpu_data)
+{
+  // gpu data can be null if everything is empty but the metadata must always be valid
+  CUDF_EXPECTS(metadata != nullptr, "Encountered invalid packed column input");
+  auto serialized_columns = reinterpret_cast<serialized_column const*>(metadata);
+  uint8_t const* base_ptr = gpu_data;
+  // first entry is a stub where size == the total # of top level columns (see pack_metadata above)
+  auto const num_columns = serialized_columns[0].size;
+  size_t current_index   = 1;
+
+  std::function<std::vector<column_view>(size_type)> get_columns;
+  get_columns = [&serialized_columns, &current_index, base_ptr, &get_columns](size_t num_columns) {
+    std::vector<column_view> cols;
+    for (size_t i = 0; i < num_columns; i++) {
+      auto serial_column = serialized_columns[current_index];
+      current_index++;
+
+      std::vector<column_view> const children = get_columns(serial_column.num_children);
+
+      cols.emplace_back(deserialize_column(serial_column, children, base_ptr));
+    }
+
+    return cols;
+  };
+
+  return table_view{get_columns(num_columns)};
+}
+
 }  // anonymous namespace
 
 /**
@@ -198,37 +226,6 @@ class metadata_builder_impl {
   std::vector<serialized_column> metadata;
 };
 
-/**
- * @copydoc cudf::detail::unpack
- */
-table_view unpack(uint8_t const* metadata, uint8_t const* gpu_data)
-{
-  // gpu data can be null if everything is empty but the metadata must always be valid
-  CUDF_EXPECTS(metadata != nullptr, "Encountered invalid packed column input");
-  auto serialized_columns = reinterpret_cast<serialized_column const*>(metadata);
-  uint8_t const* base_ptr = gpu_data;
-  // first entry is a stub where size == the total # of top level columns (see pack_metadata above)
-  auto const num_columns = serialized_columns[0].size;
-  size_t current_index   = 1;
-
-  std::function<std::vector<column_view>(size_type)> get_columns;
-  get_columns = [&serialized_columns, &current_index, base_ptr, &get_columns](size_t num_columns) {
-    std::vector<column_view> cols;
-    for (size_t i = 0; i < num_columns; i++) {
-      auto serial_column = serialized_columns[current_index];
-      current_index++;
-
-      std::vector<column_view> children = get_columns(serial_column.num_children);
-
-      cols.emplace_back(deserialize_column(serial_column, children, base_ptr));
-    }
-
-    return cols;
-  };
-
-  return table_view{get_columns(num_columns)};
-}
-
 metadata_builder::metadata_builder(size_type const num_root_columns)
   : impl(std::make_unique<metadata_builder_impl>(num_root_columns +
                                                  1 /*one more extra metadata entry as below*/))
@@ -280,9 +277,6 @@ std::vector<uint8_t> pack_metadata(table_view const& table,
   return detail::pack_metadata(table, contiguous_buffer, buffer_size, builder);
 }
 
-/**
- * @copydoc cudf::unpack
- */
 table_view unpack(packed_columns const& input)
 {
   CUDF_FUNC_RANGE();
@@ -292,9 +286,6 @@
     reinterpret_cast<uint8_t const*>(input.gpu_data->data()));
 }
 
-/**
- * @copydoc cudf::unpack(uint8_t const*, uint8_t const* )
- */
 table_view unpack(uint8_t const* metadata, uint8_t const* gpu_data)
 {
   CUDF_FUNC_RANGE();
diff --git a/cpp/src/datetime/timezone.cpp b/cpp/src/datetime/timezone.cpp
index f786624680c..78e4198f60c 100644
--- a/cpp/src/datetime/timezone.cpp
+++ b/cpp/src/datetime/timezone.cpp
@@ -62,7 +62,7 @@ struct dst_transition_s {
 #pragma pack(pop)
 
 struct timezone_file {
-  timezone_file_header header;
+  timezone_file_header header{};
   bool is_header_from_64bit = false;
 
   std::vector<int64_t> transition_times;
diff --git a/cpp/src/groupby/sort/aggregate.cpp b/cpp/src/groupby/sort/aggregate.cpp
index 3041e261945..7a8a1883ed4 100644
--- a/cpp/src/groupby/sort/aggregate.cpp
+++ b/cpp/src/groupby/sort/aggregate.cpp
@@ -45,6 +45,42 @@
 namespace cudf {
 namespace groupby {
 namespace detail {
+namespace {
+
+/**
+ * @brief Creates column views with only valid elements in both input column views
+ *
+ * @param column_0 The first column
+ * @param column_1 The second column
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @return tuple with new null mask (if null masks of input differ) and new column views
+ */
+auto column_view_with_common_nulls(column_view const& column_0,
+                                   column_view const& column_1,
+                                   rmm::cuda_stream_view stream)
+{
+  auto [new_nullmask, null_count] = cudf::bitmask_and(table_view{{column_0, column_1}}, stream);
+  if (null_count == 0) { return std::make_tuple(std::move(new_nullmask), column_0, column_1); }
+  auto column_view_with_new_nullmask = [](auto const& col, void* nullmask, auto null_count) {
+    return column_view(col.type(),
+                       col.size(),
+                       col.head(),
+                       static_cast<bitmask_type const*>(nullmask),
+                       null_count,
+                       col.offset(),
+                       std::vector<column_view>(col.child_begin(), col.child_end()));
+  };
+  auto new_column_0 = null_count == column_0.null_count()
+                        ? column_0
+                        : column_view_with_new_nullmask(column_0, new_nullmask.data(), null_count);
+  auto new_column_1 = null_count == column_1.null_count()
+                        ? column_1
+                        : column_view_with_new_nullmask(column_1, new_nullmask.data(), null_count);
+  return std::make_tuple(std::move(new_nullmask), new_column_0, new_column_1);
+}
+
+}  // namespace
+
 /**
  * @brief Functor to dispatch aggregation with
  *
@@ -170,13 +206,13 @@ void aggregate_result_functor::operator()(aggregation const& a
   } else {
     auto argmin_agg = make_argmin_aggregation();
     operator()(*argmin_agg);
-    column_view argmin_result = cache.get_result(values, *argmin_agg);
+    column_view const argmin_result = cache.get_result(values, *argmin_agg);
 
     // We make a view of ARGMIN result without a null mask and gather using
     // this mask. The values in data buffer of ARGMIN result corresponding
    // to null values was initialized to ARGMIN_SENTINEL which is an out of
    // bounds index value and causes the gathered value to be null.
-    column_view null_removed_map(
+    column_view const null_removed_map(
       data_type(type_to_id<size_type>()),
       argmin_result.size(),
       static_cast<void const*>(argmin_result.template data<size_type>()),
@@ -212,13 +248,13 @@ void aggregate_result_functor::operator()(aggregation const& a
   } else {
     auto argmax_agg = make_argmax_aggregation();
     operator()(*argmax_agg);
-    column_view argmax_result = cache.get_result(values, *argmax_agg);
+    column_view const argmax_result = cache.get_result(values, *argmax_agg);
 
     // We make a view of ARGMAX result without a null mask and gather using
     // this mask. The values in data buffer of ARGMAX result corresponding
    // to null values was initialized to ARGMAX_SENTINEL which is an out of
    // bounds index value and causes the gathered value to be null.
-    column_view null_removed_map(
+    column_view const null_removed_map(
       data_type(type_to_id<size_type>()),
       argmax_result.size(),
       static_cast<void const*>(argmax_result.template data<size_type>()),
@@ -248,8 +284,8 @@ void aggregate_result_functor::operator()(aggregation const&
   auto count_agg = make_count_aggregation();
   operator()(*sum_agg);
   operator()(*count_agg);
-  column_view sum_result   = cache.get_result(values, *sum_agg);
-  column_view count_result = cache.get_result(values, *count_agg);
+  column_view const sum_result   = cache.get_result(values, *sum_agg);
+  column_view const count_result = cache.get_result(values, *count_agg);
 
   // TODO (dm): Special case for timestamp. Add target_type_impl for it.
// Blocked until we support operator+ on timestamps @@ -291,8 +327,8 @@ void aggregate_result_functor::operator()(aggregation con auto count_agg = make_count_aggregation(); operator()(*mean_agg); operator()(*count_agg); - column_view mean_result = cache.get_result(values, *mean_agg); - column_view group_sizes = cache.get_result(values, *count_agg); + column_view const mean_result = cache.get_result(values, *mean_agg); + column_view const group_sizes = cache.get_result(values, *count_agg); auto result = detail::group_var(get_grouped_values(), mean_result, @@ -312,7 +348,7 @@ void aggregate_result_functor::operator()(aggregation const& a auto& std_agg = dynamic_cast(agg); auto var_agg = make_variance_aggregation(std_agg._ddof); operator()(*var_agg); - column_view var_result = cache.get_result(values, *var_agg); + column_view const var_result = cache.get_result(values, *var_agg); auto result = cudf::detail::unary_operation(var_result, unary_operator::SQRT, stream, mr); cache.add_result(values, agg, std::move(result)); @@ -325,8 +361,8 @@ void aggregate_result_functor::operator()(aggregation con auto count_agg = make_count_aggregation(); operator()(*count_agg); - column_view group_sizes = cache.get_result(values, *count_agg); - auto& quantile_agg = dynamic_cast(agg); + column_view const group_sizes = cache.get_result(values, *count_agg); + auto& quantile_agg = dynamic_cast(agg); auto result = detail::group_quantiles(get_sorted_values(), group_sizes, @@ -346,7 +382,7 @@ void aggregate_result_functor::operator()(aggregation const auto count_agg = make_count_aggregation(); operator()(*count_agg); - column_view group_sizes = cache.get_result(values, *count_agg); + column_view const group_sizes = cache.get_result(values, *count_agg); auto result = detail::group_quantiles(get_sorted_values(), group_sizes, @@ -391,7 +427,7 @@ void aggregate_result_functor::operator()(aggregation } else { CUDF_FAIL("Wrong count aggregation kind"); } - column_view group_sizes = cache.get_result(values, *count_agg); + column_view const group_sizes = cache.get_result(values, *count_agg); cache.add_result(values, agg, @@ -564,38 +600,6 @@ void aggregate_result_functor::operator()(aggregat get_grouped_values(), helper.group_offsets(stream), helper.num_groups(stream), stream, mr)); } -/** - * @brief Creates column views with only valid elements in both input column views - * - * @param column_0 The first column - * @param column_1 The second column - * @param stream CUDA stream used for device memory operations and kernel launches - * @return tuple with new null mask (if null masks of input differ) and new column views - */ -auto column_view_with_common_nulls(column_view const& column_0, - column_view const& column_1, - rmm::cuda_stream_view stream) -{ - auto [new_nullmask, null_count] = cudf::bitmask_and(table_view{{column_0, column_1}}, stream); - if (null_count == 0) { return std::make_tuple(std::move(new_nullmask), column_0, column_1); } - auto column_view_with_new_nullmask = [](auto const& col, void* nullmask, auto null_count) { - return column_view(col.type(), - col.size(), - col.head(), - static_cast(nullmask), - null_count, - col.offset(), - std::vector(col.child_begin(), col.child_end())); - }; - auto new_column_0 = null_count == column_0.null_count() - ? column_0 - : column_view_with_new_nullmask(column_0, new_nullmask.data(), null_count); - auto new_column_1 = null_count == column_1.null_count() - ? 
column_1 - : column_view_with_new_nullmask(column_1, new_nullmask.data(), null_count); - return std::make_tuple(std::move(new_nullmask), new_column_0, new_column_1); -} - /** * @brief Perform covariance between two child columns of non-nullable struct column. * @@ -734,7 +738,7 @@ void aggregate_result_functor::operator()(aggregation cons auto count_agg = make_count_aggregation(); operator()(*count_agg); - column_view valid_counts = cache.get_result(values, *count_agg); + column_view const valid_counts = cache.get_result(values, *count_agg); cache.add_result(values, agg, diff --git a/cpp/src/interop/dlpack.cpp b/cpp/src/interop/dlpack.cpp index b5cc4cbba0d..fee767255c2 100644 --- a/cpp/src/interop/dlpack.cpp +++ b/cpp/src/interop/dlpack.cpp @@ -115,8 +115,8 @@ DLDataType data_type_to_DLDataType(data_type type) // Context object to own memory allocated for DLManagedTensor struct dltensor_context { - int64_t shape[2]; // NOLINT - int64_t strides[2]; // NOLINT + int64_t shape[2]{}; // NOLINT + int64_t strides[2]{}; // NOLINT rmm::device_buffer buffer; static void deleter(DLManagedTensor* arg) diff --git a/cpp/src/interop/to_arrow_schema.cpp b/cpp/src/interop/to_arrow_schema.cpp index 5afed772656..5dd8d77c261 100644 --- a/cpp/src/interop/to_arrow_schema.cpp +++ b/cpp/src/interop/to_arrow_schema.cpp @@ -44,7 +44,7 @@ struct dispatch_to_arrow_type { template ())> int operator()(column_view input_view, column_metadata const&, ArrowSchema* out) { - cudf::type_id id = input_view.type().id(); + cudf::type_id const id = input_view.type().id(); switch (id) { case cudf::type_id::TIMESTAMP_SECONDS: return ArrowSchemaSetTypeDateTime( @@ -186,7 +186,7 @@ int dispatch_to_arrow_type::operator()(column_view input, column_metadata const& metadata, ArrowSchema* out) { - cudf::dictionary_column_view dview{input}; + cudf::dictionary_column_view const dview{input}; NANOARROW_RETURN_NOT_OK(ArrowSchemaSetType(out, id_to_arrow_type(dview.indices().type().id()))); NANOARROW_RETURN_NOT_OK(ArrowSchemaAllocateDictionary(out)); diff --git a/cpp/src/io/avro/avro.cpp b/cpp/src/io/avro/avro.cpp index b3fcca62314..c3a7f0f3053 100644 --- a/cpp/src/io/avro/avro.cpp +++ b/cpp/src/io/avro/avro.cpp @@ -200,7 +200,7 @@ bool container::parse(file_metadata* md, size_t max_num_rows, size_t first_row) // encountered. If they don't, we have to assume the data is corrupted, // and thus, we terminate processing immediately. 
std::array const sync_marker = {get_raw(), get_raw()}; - bool valid_sync_markers = + bool const valid_sync_markers = ((sync_marker[0] == md->sync_marker[0]) && (sync_marker[1] == md->sync_marker[1])); if (!valid_sync_markers) { return false; } } @@ -218,10 +218,10 @@ bool container::parse(file_metadata* md, size_t max_num_rows, size_t first_row) md->selected_data_size = m_cur - m_start; // Extract columns for (size_t i = 0; i < md->schema.size(); i++) { - type_kind_e kind = md->schema[i].kind; - logicaltype_kind_e logical_kind = md->schema[i].logical_kind; + type_kind_e const kind = md->schema[i].kind; + logicaltype_kind_e const logical_kind = md->schema[i].logical_kind; - bool is_supported_kind = ((kind > type_null) && (kind < type_record)); + bool const is_supported_kind = ((kind > type_null) && (kind < type_record)); if (is_supported_logical_type(logical_kind) || is_supported_kind) { column_desc col; int parent_idx = md->schema[i].parent_idx; @@ -302,7 +302,7 @@ bool schema_parser::parse(std::vector& schema, std::string const& // Empty schema if (json_str == "[]") return true; - std::array depthbuf; + std::array depthbuf{}; int depth = 0, parent_idx = -1, entry_idx = -1; json_state_e state = state_attrname; std::string str; @@ -341,7 +341,7 @@ bool schema_parser::parse(std::vector& schema, std::string const& m_cur = m_base; m_end = m_base + json_str.length(); while (more_data()) { - int c = *m_cur++; + int const c = *m_cur++; switch (c) { case '"': str = get_str(); diff --git a/cpp/src/io/comp/comp.cpp b/cpp/src/io/comp/comp.cpp index b26a6292806..2dda2287e09 100644 --- a/cpp/src/io/comp/comp.cpp +++ b/cpp/src/io/comp/comp.cpp @@ -48,13 +48,13 @@ std::vector compress_gzip(host_span src) zs.avail_out = 0; zs.next_out = nullptr; - int windowbits = 15; - int gzip_encoding = 16; - int ret = deflateInit2( + constexpr int windowbits = 15; + constexpr int gzip_encoding = 16; + int ret = deflateInit2( &zs, Z_DEFAULT_COMPRESSION, Z_DEFLATED, windowbits | gzip_encoding, 8, Z_DEFAULT_STRATEGY); CUDF_EXPECTS(ret == Z_OK, "GZIP DEFLATE compression initialization failed."); - uint32_t estcomplen = deflateBound(&zs, src.size()); + uint32_t const estcomplen = deflateBound(&zs, src.size()); dst.resize(estcomplen); zs.avail_out = estcomplen; zs.next_out = dst.data(); diff --git a/cpp/src/io/comp/nvcomp_adapter.cpp b/cpp/src/io/comp/nvcomp_adapter.cpp index c3187f73a95..b8bf8be6d2d 100644 --- a/cpp/src/io/comp/nvcomp_adapter.cpp +++ b/cpp/src/io/comp/nvcomp_adapter.cpp @@ -31,6 +31,7 @@ #include namespace cudf::io::nvcomp { +namespace { // Dispatcher for nvcompBatchedDecompressGetTempSizeEx template @@ -50,19 +51,6 @@ auto batched_decompress_get_temp_size_ex(compression_type compression, Args&&... 
default: CUDF_FAIL("Unsupported compression type"); } } -size_t batched_decompress_temp_size(compression_type compression, - size_t num_chunks, - size_t max_uncomp_chunk_size, - size_t max_total_uncomp_size) -{ - size_t temp_size = 0; - nvcompStatus_t nvcomp_status = batched_decompress_get_temp_size_ex( - compression, num_chunks, max_uncomp_chunk_size, &temp_size, max_total_uncomp_size); - - CUDF_EXPECTS(nvcomp_status == nvcompStatus_t::nvcompSuccess, - "Unable to get scratch size for decompression"); - return temp_size; -} // Dispatcher for nvcompBatchedDecompressAsync template @@ -94,40 +82,6 @@ std::string compression_type_name(compression_type compression) return "compression_type(" + std::to_string(static_cast(compression)) + ")"; } -void batched_decompress(compression_type compression, - device_span const> inputs, - device_span const> outputs, - device_span results, - size_t max_uncomp_chunk_size, - size_t max_total_uncomp_size, - rmm::cuda_stream_view stream) -{ - auto const num_chunks = inputs.size(); - - // cuDF inflate inputs converted to nvcomp inputs - auto const nvcomp_args = create_batched_nvcomp_args(inputs, outputs, stream); - rmm::device_uvector actual_uncompressed_data_sizes(num_chunks, stream); - rmm::device_uvector nvcomp_statuses(num_chunks, stream); - // Temporary space required for decompression - auto const temp_size = batched_decompress_temp_size( - compression, num_chunks, max_uncomp_chunk_size, max_total_uncomp_size); - rmm::device_buffer scratch(temp_size, stream); - auto const nvcomp_status = batched_decompress_async(compression, - nvcomp_args.input_data_ptrs.data(), - nvcomp_args.input_data_sizes.data(), - nvcomp_args.output_data_sizes.data(), - actual_uncompressed_data_sizes.data(), - num_chunks, - scratch.data(), - scratch.size(), - nvcomp_args.output_data_ptrs.data(), - nvcomp_statuses.data(), - stream.value()); - CUDF_EXPECTS(nvcomp_status == nvcompStatus_t::nvcompSuccess, "unable to perform decompression"); - - update_compression_results(nvcomp_statuses, actual_uncompressed_data_sizes, results, stream); -} - size_t batched_compress_temp_size(compression_type compression, size_t batch_size, size_t max_uncompressed_chunk_bytes, @@ -172,52 +126,17 @@ size_t batched_compress_temp_size(compression_type compression, return temp_size; } -// Wrapper for nvcompBatchedCompressGetMaxOutputChunkSize -size_t compress_max_output_chunk_size(compression_type compression, - uint32_t max_uncompressed_chunk_bytes) -{ - auto const capped_uncomp_bytes = std::min( - compress_max_allowed_chunk_size(compression).value_or(max_uncompressed_chunk_bytes), - max_uncompressed_chunk_bytes); - - size_t max_comp_chunk_size = 0; - nvcompStatus_t status = nvcompStatus_t::nvcompSuccess; - switch (compression) { - case compression_type::SNAPPY: - status = nvcompBatchedSnappyCompressGetMaxOutputChunkSize( - capped_uncomp_bytes, nvcompBatchedSnappyDefaultOpts, &max_comp_chunk_size); - break; - case compression_type::DEFLATE: - status = nvcompBatchedDeflateCompressGetMaxOutputChunkSize( - capped_uncomp_bytes, nvcompBatchedDeflateDefaultOpts, &max_comp_chunk_size); - break; - case compression_type::ZSTD: - status = nvcompBatchedZstdCompressGetMaxOutputChunkSize( - capped_uncomp_bytes, nvcompBatchedZstdDefaultOpts, &max_comp_chunk_size); - break; - case compression_type::LZ4: - status = nvcompBatchedLZ4CompressGetMaxOutputChunkSize( - capped_uncomp_bytes, nvcompBatchedLZ4DefaultOpts, &max_comp_chunk_size); - break; - default: CUDF_FAIL("Unsupported compression type"); - } - - CUDF_EXPECTS(status 
== nvcompStatus_t::nvcompSuccess, - "failed to get max uncompressed chunk size"); - return max_comp_chunk_size; -} - // Dispatcher for nvcompBatchedCompressAsync -static void batched_compress_async(compression_type compression, - void const* const* device_uncompressed_ptrs, - size_t const* device_uncompressed_bytes, - size_t max_uncompressed_chunk_bytes, - size_t batch_size, - void* device_temp_ptr, - size_t temp_bytes, - void* const* device_compressed_ptrs, - size_t* device_compressed_bytes, - rmm::cuda_stream_view stream) +void batched_compress_async(compression_type compression, + void const* const* device_uncompressed_ptrs, + size_t const* device_uncompressed_bytes, + size_t max_uncompressed_chunk_bytes, + size_t batch_size, + void* device_temp_ptr, + size_t temp_bytes, + void* const* device_compressed_ptrs, + size_t* device_compressed_bytes, + rmm::cuda_stream_view stream) { nvcompStatus_t nvcomp_status = nvcompStatus_t::nvcompSuccess; switch (compression) { @@ -279,6 +198,137 @@ bool is_aligned(void const* ptr, std::uintptr_t alignment) noexcept return (reinterpret_cast(ptr) % alignment) == 0; } +std::optional is_compression_disabled_impl(compression_type compression, + feature_status_parameters params) +{ + switch (compression) { + case compression_type::DEFLATE: { + if (not params.are_all_integrations_enabled) { + return "DEFLATE compression is experimental, you can enable it through " + "`LIBCUDF_NVCOMP_POLICY` environment variable."; + } + return std::nullopt; + } + case compression_type::LZ4: + case compression_type::SNAPPY: + case compression_type::ZSTD: + if (not params.are_stable_integrations_enabled) { + return "nvCOMP use is disabled through the `LIBCUDF_NVCOMP_POLICY` environment variable."; + } + return std::nullopt; + default: return "Unsupported compression type"; + } +} + +std::optional is_decompression_disabled_impl(compression_type compression, + feature_status_parameters params) +{ + switch (compression) { + case compression_type::DEFLATE: + case compression_type::GZIP: { + if (not params.are_all_integrations_enabled) { + return "DEFLATE decompression is experimental, you can enable it through " + "`LIBCUDF_NVCOMP_POLICY` environment variable."; + } + return std::nullopt; + } + case compression_type::LZ4: + case compression_type::SNAPPY: + case compression_type::ZSTD: { + if (not params.are_stable_integrations_enabled) { + return "nvCOMP use is disabled through the `LIBCUDF_NVCOMP_POLICY` environment variable."; + } + return std::nullopt; + } + } + return "Unsupported compression type"; +} + +} // namespace + +size_t batched_decompress_temp_size(compression_type compression, + size_t num_chunks, + size_t max_uncomp_chunk_size, + size_t max_total_uncomp_size) +{ + size_t temp_size = 0; + nvcompStatus_t const nvcomp_status = batched_decompress_get_temp_size_ex( + compression, num_chunks, max_uncomp_chunk_size, &temp_size, max_total_uncomp_size); + + CUDF_EXPECTS(nvcomp_status == nvcompStatus_t::nvcompSuccess, + "Unable to get scratch size for decompression"); + return temp_size; +} + +void batched_decompress(compression_type compression, + device_span const> inputs, + device_span const> outputs, + device_span results, + size_t max_uncomp_chunk_size, + size_t max_total_uncomp_size, + rmm::cuda_stream_view stream) +{ + auto const num_chunks = inputs.size(); + + // cuDF inflate inputs converted to nvcomp inputs + auto const nvcomp_args = create_batched_nvcomp_args(inputs, outputs, stream); + rmm::device_uvector actual_uncompressed_data_sizes(num_chunks, stream); + 
rmm::device_uvector nvcomp_statuses(num_chunks, stream); + // Temporary space required for decompression + auto const temp_size = batched_decompress_temp_size( + compression, num_chunks, max_uncomp_chunk_size, max_total_uncomp_size); + rmm::device_buffer scratch(temp_size, stream); + auto const nvcomp_status = batched_decompress_async(compression, + nvcomp_args.input_data_ptrs.data(), + nvcomp_args.input_data_sizes.data(), + nvcomp_args.output_data_sizes.data(), + actual_uncompressed_data_sizes.data(), + num_chunks, + scratch.data(), + scratch.size(), + nvcomp_args.output_data_ptrs.data(), + nvcomp_statuses.data(), + stream.value()); + CUDF_EXPECTS(nvcomp_status == nvcompStatus_t::nvcompSuccess, "unable to perform decompression"); + + update_compression_results(nvcomp_statuses, actual_uncompressed_data_sizes, results, stream); +} + +// Wrapper for nvcompBatchedCompressGetMaxOutputChunkSize +size_t compress_max_output_chunk_size(compression_type compression, + uint32_t max_uncompressed_chunk_bytes) +{ + auto const capped_uncomp_bytes = std::min( + compress_max_allowed_chunk_size(compression).value_or(max_uncompressed_chunk_bytes), + max_uncompressed_chunk_bytes); + + size_t max_comp_chunk_size = 0; + nvcompStatus_t status = nvcompStatus_t::nvcompSuccess; + switch (compression) { + case compression_type::SNAPPY: + status = nvcompBatchedSnappyCompressGetMaxOutputChunkSize( + capped_uncomp_bytes, nvcompBatchedSnappyDefaultOpts, &max_comp_chunk_size); + break; + case compression_type::DEFLATE: + status = nvcompBatchedDeflateCompressGetMaxOutputChunkSize( + capped_uncomp_bytes, nvcompBatchedDeflateDefaultOpts, &max_comp_chunk_size); + break; + case compression_type::ZSTD: + status = nvcompBatchedZstdCompressGetMaxOutputChunkSize( + capped_uncomp_bytes, nvcompBatchedZstdDefaultOpts, &max_comp_chunk_size); + break; + case compression_type::LZ4: + status = nvcompBatchedLZ4CompressGetMaxOutputChunkSize( + capped_uncomp_bytes, nvcompBatchedLZ4DefaultOpts, &max_comp_chunk_size); + break; + default: CUDF_FAIL("Unsupported compression type"); + } + + CUDF_EXPECTS(status == nvcompStatus_t::nvcompSuccess, + "failed to get max uncompressed chunk size"); + return max_comp_chunk_size; +} + void batched_compress(compression_type compression, device_span const> inputs, device_span const> outputs, @@ -347,28 +397,6 @@ struct hash_feature_status_inputs { using feature_status_memo_map = std::unordered_map, hash_feature_status_inputs>; -std::optional is_compression_disabled_impl(compression_type compression, - feature_status_parameters params) -{ - switch (compression) { - case compression_type::DEFLATE: { - if (not params.are_all_integrations_enabled) { - return "DEFLATE compression is experimental, you can enable it through " - "`LIBCUDF_NVCOMP_POLICY` environment variable."; - } - return std::nullopt; - } - case compression_type::LZ4: - case compression_type::SNAPPY: - case compression_type::ZSTD: - if (not params.are_stable_integrations_enabled) { - return "nvCOMP use is disabled through the `LIBCUDF_NVCOMP_POLICY` environment variable."; - } - return std::nullopt; - default: return "Unsupported compression type"; - } -} - std::optional is_compression_disabled(compression_type compression, feature_status_parameters params) { @@ -398,30 +426,6 @@ std::optional is_compression_disabled(compression_type compression, return reason; } -std::optional is_decompression_disabled_impl(compression_type compression, - feature_status_parameters params) -{ - switch (compression) { - case compression_type::DEFLATE: - case 
compression_type::GZIP: { - if (not params.are_all_integrations_enabled) { - return "DEFLATE decompression is experimental, you can enable it through " - "`LIBCUDF_NVCOMP_POLICY` environment variable."; - } - return std::nullopt; - } - case compression_type::LZ4: - case compression_type::SNAPPY: - case compression_type::ZSTD: { - if (not params.are_stable_integrations_enabled) { - return "nvCOMP use is disabled through the `LIBCUDF_NVCOMP_POLICY` environment variable."; - } - return std::nullopt; - } - } - return "Unsupported compression type"; -} - std::optional is_decompression_disabled(compression_type compression, feature_status_parameters params) { diff --git a/cpp/src/io/comp/uncomp.cpp b/cpp/src/io/comp/uncomp.cpp index b3d43fa786a..4ab5174387e 100644 --- a/cpp/src/io/comp/uncomp.cpp +++ b/cpp/src/io/comp/uncomp.cpp @@ -127,7 +127,7 @@ struct zip_archive_s { bool ParseGZArchive(gz_archive_s* dst, uint8_t const* raw, size_t len) { - gz_file_header_s const* fhdr; + gz_file_header_s const* fhdr = nullptr; if (!dst) return false; memset(dst, 0, sizeof(gz_archive_s)); @@ -138,7 +138,7 @@ bool ParseGZArchive(gz_archive_s* dst, uint8_t const* raw, size_t len) raw += sizeof(gz_file_header_s); len -= sizeof(gz_file_header_s); if (fhdr->flags & GZIPHeaderFlag::fextra) { - uint32_t xlen; + uint32_t xlen = 0; if (len < 2) return false; xlen = raw[0] | (raw[1] << 8); @@ -151,8 +151,8 @@ bool ParseGZArchive(gz_archive_s* dst, uint8_t const* raw, size_t len) len -= xlen; } if (fhdr->flags & GZIPHeaderFlag::fname) { - size_t l = 0; - uint8_t c; + size_t l = 0; + uint8_t c = 0; do { if (l >= len) return false; c = raw[l]; @@ -163,8 +163,8 @@ bool ParseGZArchive(gz_archive_s* dst, uint8_t const* raw, size_t len) len -= l; } if (fhdr->flags & GZIPHeaderFlag::fcomment) { - size_t l = 0; - uint8_t c; + size_t l = 0; + uint8_t c = 0; do { if (l >= len) return false; c = raw[l]; @@ -219,7 +219,7 @@ bool OpenZipArchive(zip_archive_s* dst, uint8_t const* raw, size_t len) int cpu_inflate(uint8_t* uncomp_data, size_t* destLen, uint8_t const* comp_data, size_t comp_len) { - int zerr; + int zerr = 0; z_stream strm; memset(&strm, 0, sizeof(strm)); @@ -291,7 +291,7 @@ size_t decompress_zlib(host_span src, host_span dst) */ size_t decompress_gzip(host_span src, host_span dst) { - gz_archive_s gz; + gz_archive_s gz{}; auto const parse_succeeded = ParseGZArchive(&gz, src.data(), src.size()); CUDF_EXPECTS(parse_succeeded, "Failed to parse GZIP header"); return decompress_zlib({gz.comp_data, gz.comp_len}, dst); @@ -303,12 +303,12 @@ size_t decompress_gzip(host_span src, host_span dst) size_t decompress_snappy(host_span src, host_span dst) { CUDF_EXPECTS(not dst.empty() and src.size() >= 1, "invalid Snappy decompress inputs"); - uint32_t uncompressed_size, bytes_left, dst_pos; + uint32_t uncompressed_size = 0, bytes_left = 0, dst_pos = 0; auto cur = src.begin(); auto const end = src.end(); // Read uncompressed length (varint) { - uint32_t l = 0, c; + uint32_t l = 0, c = 0; uncompressed_size = 0; do { c = *cur++; @@ -328,7 +328,7 @@ size_t decompress_snappy(host_span src, host_span dst) if (blen & 3) { // Copy - uint32_t offset; + uint32_t offset = 0; if (blen & 2) { // xxxxxx1x: copy with 6-bit length, 2-byte or 4-byte offset if (cur + 2 > end) break; @@ -441,7 +441,7 @@ source_properties get_source_properties(compression_type compression, host_span< switch (compression) { case compression_type::AUTO: case compression_type::GZIP: { - gz_archive_s gz; + gz_archive_s gz{}; auto const parse_succeeded = ParseGZArchive(&gz, 
src.data(), src.size()); CUDF_EXPECTS(parse_succeeded, "Failed to parse GZIP header while fetching source properties"); compression = compression_type::GZIP; @@ -452,26 +452,28 @@ source_properties get_source_properties(compression_type compression, host_span< [[fallthrough]]; } case compression_type::ZIP: { - zip_archive_s za; + zip_archive_s za{}; if (OpenZipArchive(&za, raw, src.size())) { size_t cdfh_ofs = 0; for (int i = 0; i < za.eocd->num_entries; i++) { auto const* cdfh = reinterpret_cast( reinterpret_cast(za.cdfh) + cdfh_ofs); - int cdfh_len = sizeof(zip_cdfh_s) + cdfh->fname_len + cdfh->extra_len + cdfh->comment_len; + int const cdfh_len = + sizeof(zip_cdfh_s) + cdfh->fname_len + cdfh->extra_len + cdfh->comment_len; if (cdfh_ofs + cdfh_len > za.eocd->cdir_size || cdfh->sig != 0x0201'4b50) { // Bad cdir break; } // For now, only accept with non-zero file sizes and DEFLATE if (cdfh->comp_method == 8 && cdfh->comp_size > 0 && cdfh->uncomp_size > 0) { - size_t lfh_ofs = cdfh->hdr_ofs; - auto const* lfh = reinterpret_cast(raw + lfh_ofs); + size_t const lfh_ofs = cdfh->hdr_ofs; + auto const* lfh = reinterpret_cast(raw + lfh_ofs); if (lfh_ofs + sizeof(zip_lfh_s) <= src.size() && lfh->sig == 0x0403'4b50 && lfh_ofs + sizeof(zip_lfh_s) + lfh->fname_len + lfh->extra_len <= src.size()) { if (lfh->comp_method == 8 && lfh->comp_size > 0 && lfh->uncomp_size > 0) { - size_t file_start = lfh_ofs + sizeof(zip_lfh_s) + lfh->fname_len + lfh->extra_len; - size_t file_end = file_start + lfh->comp_size; + size_t const file_start = + lfh_ofs + sizeof(zip_lfh_s) + lfh->fname_len + lfh->extra_len; + size_t const file_end = file_start + lfh->comp_size; if (file_end <= src.size()) { // Pick the first valid file of non-zero size (only 1 file expected in archive) compression = compression_type::ZIP; @@ -510,7 +512,7 @@ source_properties get_source_properties(compression_type compression, host_span< auto const end = src.end(); // Read uncompressed length (varint) { - uint32_t l = 0, c; + uint32_t l = 0, c = 0; do { c = *cur++; auto const lo7 = c & 0x7f; diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index ceaeb5d8f85..88423122e16 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -39,6 +39,38 @@ #include namespace cudf::io { +namespace { + +compression_type infer_compression_type(compression_type compression, source_info const& info) +{ + if (compression != compression_type::AUTO) { return compression; } + + if (info.type() != io_type::FILEPATH) { return compression_type::NONE; } + + auto filepath = info.filepaths()[0]; + + // Attempt to infer from the file extension + auto const pos = filepath.find_last_of('.'); + + if (pos == std::string::npos) { return {}; } + + auto str_tolower = [](auto const& begin, auto const& end) { + std::string out; + std::transform(begin, end, std::back_inserter(out), ::tolower); + return out; + }; + + auto const ext = str_tolower(filepath.begin() + pos + 1, filepath.end()); + + if (ext == "gz") { return compression_type::GZIP; } + if (ext == "zip") { return compression_type::ZIP; } + if (ext == "bz2") { return compression_type::BZIP2; } + if (ext == "xz") { return compression_type::XZ; } + + return compression_type::NONE; +} + +} // namespace // Returns builder for csv_reader_options csv_reader_options_builder csv_reader_options::builder(source_info src) @@ -170,35 +202,6 @@ table_with_metadata read_avro(avro_reader_options const& options, rmm::device_as return avro::read_avro(std::move(datasources[0]), options, cudf::get_default_stream(), 
mr); } -compression_type infer_compression_type(compression_type compression, source_info const& info) -{ - if (compression != compression_type::AUTO) { return compression; } - - if (info.type() != io_type::FILEPATH) { return compression_type::NONE; } - - auto filepath = info.filepaths()[0]; - - // Attempt to infer from the file extension - auto const pos = filepath.find_last_of('.'); - - if (pos == std::string::npos) { return {}; } - - auto str_tolower = [](auto const& begin, auto const& end) { - std::string out; - std::transform(begin, end, std::back_inserter(out), ::tolower); - return out; - }; - - auto const ext = str_tolower(filepath.begin() + pos + 1, filepath.end()); - - if (ext == "gz") { return compression_type::GZIP; } - if (ext == "zip") { return compression_type::ZIP; } - if (ext == "bz2") { return compression_type::BZIP2; } - if (ext == "xz") { return compression_type::XZ; } - - return compression_type::NONE; -} - table_with_metadata read_json(json_reader_options options, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) @@ -287,7 +290,7 @@ raw_orc_statistics read_raw_orc_statistics(source_info const& src_info, CUDF_FAIL("Unsupported source type"); } - orc::metadata metadata(source.get(), stream); + orc::metadata const metadata(source.get(), stream); // Initialize statistics to return raw_orc_statistics result; diff --git a/cpp/src/io/json/parser_features.cpp b/cpp/src/io/json/parser_features.cpp index e795e8e09d8..ced7acb9cde 100644 --- a/cpp/src/io/json/parser_features.cpp +++ b/cpp/src/io/json/parser_features.cpp @@ -68,6 +68,77 @@ void json_reader_options::set_dtypes(schema_element types) } // namespace cudf::io namespace cudf::io::json::detail { +namespace { + +// example schema and its path. +// "a": int {"a", int} +// "a": [ int ] {"a", list}, {"element", int} +// "a": { "b": int} {"a", struct}, {"b", int} +// "a": [ {"b": int }] {"a", list}, {"element", struct}, {"b", int} +// "a": [ null] {"a", list}, {"element", str} +// back() is root. +// front() is leaf. +/** + * @brief Get the path data type of a column by path if present in input schema + * + * @param path path of the json column + * @param root root of input schema element + * @return data type of the column if present, otherwise std::nullopt + */ +std::optional get_path_data_type( + host_span const> path, schema_element const& root) +{ + if (path.empty() || path.size() == 1) { + return root.type; + } else { + if (path.back().second == NC_STRUCT && root.type.id() == type_id::STRUCT) { + auto const child_name = path.first(path.size() - 1).back().first; + auto const child_schema_it = root.child_types.find(child_name); + return (child_schema_it != std::end(root.child_types)) + ? get_path_data_type(path.first(path.size() - 1), child_schema_it->second) + : std::optional{}; + } else if (path.back().second == NC_LIST && root.type.id() == type_id::LIST) { + auto const child_schema_it = root.child_types.find(list_child_name); + return (child_schema_it != std::end(root.child_types)) + ? get_path_data_type(path.first(path.size() - 1), child_schema_it->second) + : std::optional{}; + } + return std::optional{}; + } +} + +std::optional child_schema_element(std::string const& col_name, + cudf::io::json_reader_options const& options) +{ + return std::visit( + cudf::detail::visitor_overload{ + [col_name](std::vector const& user_dtypes) -> std::optional { + auto column_index = atol(col_name.data()); + return (static_cast(column_index) < user_dtypes.size()) + ? 
std::optional{{user_dtypes[column_index]}} + : std::optional{}; + }, + [col_name]( + std::map const& user_dtypes) -> std::optional { + return (user_dtypes.find(col_name) != std::end(user_dtypes)) + ? std::optional{{user_dtypes.find(col_name)->second}} + : std::optional{}; + }, + [col_name]( + std::map const& user_dtypes) -> std::optional { + return (user_dtypes.find(col_name) != std::end(user_dtypes)) + ? user_dtypes.find(col_name)->second + : std::optional{}; + }, + [col_name](schema_element const& user_dtypes) -> std::optional { + return (user_dtypes.child_types.find(col_name) != std::end(user_dtypes.child_types)) + ? user_dtypes.child_types.find(col_name)->second + : std::optional{}; + }}, + options.get_dtypes()); +} + +} // namespace /// Created an empty column of the specified schema struct empty_column_functor { @@ -211,74 +282,6 @@ column_name_info make_column_name_info(schema_element const& schema, std::string return info; } -std::optional child_schema_element(std::string const& col_name, - cudf::io::json_reader_options const& options) -{ - return std::visit( - cudf::detail::visitor_overload{ - [col_name](std::vector const& user_dtypes) -> std::optional { - auto column_index = atol(col_name.data()); - return (static_cast(column_index) < user_dtypes.size()) - ? std::optional{{user_dtypes[column_index]}} - : std::optional{}; - }, - [col_name]( - std::map const& user_dtypes) -> std::optional { - return (user_dtypes.find(col_name) != std::end(user_dtypes)) - ? std::optional{{user_dtypes.find(col_name)->second}} - : std::optional{}; - }, - [col_name]( - std::map const& user_dtypes) -> std::optional { - return (user_dtypes.find(col_name) != std::end(user_dtypes)) - ? user_dtypes.find(col_name)->second - : std::optional{}; - }, - [col_name](schema_element const& user_dtypes) -> std::optional { - return (user_dtypes.child_types.find(col_name) != std::end(user_dtypes.child_types)) - ? user_dtypes.child_types.find(col_name)->second - : std::optional{}; - }}, - options.get_dtypes()); -} - -// example schema and its path. -// "a": int {"a", int} -// "a": [ int ] {"a", list}, {"element", int} -// "a": { "b": int} {"a", struct}, {"b", int} -// "a": [ {"b": int }] {"a", list}, {"element", struct}, {"b", int} -// "a": [ null] {"a", list}, {"element", str} -// back() is root. -// front() is leaf. -/** - * @brief Get the path data type of a column by path if present in input schema - * - * @param path path of the json column - * @param root root of input schema element - * @return data type of the column if present, otherwise std::nullopt - */ -std::optional get_path_data_type( - host_span const> path, schema_element const& root) -{ - if (path.empty() || path.size() == 1) { - return root.type; - } else { - if (path.back().second == NC_STRUCT && root.type.id() == type_id::STRUCT) { - auto const child_name = path.first(path.size() - 1).back().first; - auto const child_schema_it = root.child_types.find(child_name); - return (child_schema_it != std::end(root.child_types)) - ? get_path_data_type(path.first(path.size() - 1), child_schema_it->second) - : std::optional{}; - } else if (path.back().second == NC_LIST && root.type.id() == type_id::LIST) { - auto const child_schema_it = root.child_types.find(list_child_name); - return (child_schema_it != std::end(root.child_types)) - ? 
get_path_data_type(path.first(path.size() - 1), child_schema_it->second) - : std::optional{}; - } - return std::optional{}; - } -} - std::optional get_path_data_type( host_span const> path, cudf::io::json_reader_options const& options) diff --git a/cpp/src/io/parquet/arrow_schema_writer.cpp b/cpp/src/io/parquet/arrow_schema_writer.cpp index d15435b2553..a4536ac6a3b 100644 --- a/cpp/src/io/parquet/arrow_schema_writer.cpp +++ b/cpp/src/io/parquet/arrow_schema_writer.cpp @@ -336,7 +336,7 @@ std::string construct_arrow_schema_ipc_message(cudf::detail::LinkedColVector con { // Lambda function to convert int32 to a string of uint8 bytes auto const convert_int32_to_byte_string = [&](int32_t const value) { - std::array buffer; + std::array buffer{}; std::memcpy(buffer.data(), &value, sizeof(int32_t)); return std::string(reinterpret_cast(buffer.data()), buffer.size()); }; diff --git a/cpp/src/io/parquet/compact_protocol_reader.cpp b/cpp/src/io/parquet/compact_protocol_reader.cpp index d276e946a51..f1ecf66c29f 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.cpp +++ b/cpp/src/io/parquet/compact_protocol_reader.cpp @@ -27,23 +27,7 @@ #include namespace cudf::io::parquet::detail { - -/** - * @brief Base class for parquet field functors. - * - * Holds the field value used by all of the specialized functors. - */ -class parquet_field { - private: - int _field_val; - - protected: - parquet_field(int f) : _field_val(f) {} - - public: - virtual ~parquet_field() = default; - [[nodiscard]] int field() const { return _field_val; } -}; +namespace { std::string field_type_string(FieldType type) { @@ -79,6 +63,72 @@ void assert_bool_field_type(int type) "expected bool field, got " + field_type_string(field_type) + " field instead"); } +template +struct FunctionSwitchImpl { + template + static inline void run(CompactProtocolReader* cpr, + int field_type, + int const& field, + std::tuple& ops) + { + if (field == std::get(ops).field()) { + std::get(ops)(cpr, field_type); + } else { + FunctionSwitchImpl::run(cpr, field_type, field, ops); + } + } +}; + +template <> +struct FunctionSwitchImpl<0> { + template + static inline void run(CompactProtocolReader* cpr, + int field_type, + int const& field, + std::tuple& ops) + { + if (field == std::get<0>(ops).field()) { + std::get<0>(ops)(cpr, field_type); + } else { + cpr->skip_struct_field(field_type); + } + } +}; + +template +inline void function_builder(CompactProtocolReader* cpr, std::tuple& op) +{ + constexpr int index = std::tuple_size>::value - 1; + int field = 0; + while (true) { + int const current_byte = cpr->getb(); + if (!current_byte) { break; } + int const field_delta = current_byte >> 4; + int const field_type = current_byte & 0xf; + field = field_delta ? field + field_delta : cpr->get_i16(); + FunctionSwitchImpl::run(cpr, field_type, field, op); + } +} + +} // namespace + +/** + * @brief Base class for parquet field functors. + * + * Holds the field value used by all of the specialized functors. + */ +class parquet_field { + private: + int _field_val; + + protected: + parquet_field(int f) : _field_val(f) {} + + public: + virtual ~parquet_field() = default; + [[nodiscard]] int field() const { return _field_val; } +}; + /** * @brief Abstract base class for list functors. 
*/ @@ -494,53 +544,6 @@ void CompactProtocolReader::skip_struct_field(int t, int depth) } } -template -struct FunctionSwitchImpl { - template - static inline void run(CompactProtocolReader* cpr, - int field_type, - int const& field, - std::tuple& ops) - { - if (field == std::get(ops).field()) { - std::get(ops)(cpr, field_type); - } else { - FunctionSwitchImpl::run(cpr, field_type, field, ops); - } - } -}; - -template <> -struct FunctionSwitchImpl<0> { - template - static inline void run(CompactProtocolReader* cpr, - int field_type, - int const& field, - std::tuple& ops) - { - if (field == std::get<0>(ops).field()) { - std::get<0>(ops)(cpr, field_type); - } else { - cpr->skip_struct_field(field_type); - } - } -}; - -template -inline void function_builder(CompactProtocolReader* cpr, std::tuple& op) -{ - constexpr int index = std::tuple_size>::value - 1; - int field = 0; - while (true) { - int const current_byte = cpr->getb(); - if (!current_byte) { break; } - int const field_delta = current_byte >> 4; - int const field_type = current_byte & 0xf; - field = field_delta ? field + field_delta : cpr->get_i16(); - FunctionSwitchImpl::run(cpr, field_type, field, op); - } -} - void CompactProtocolReader::read(FileMetaData* f) { using optional_list_column_order = diff --git a/cpp/src/io/parquet/compact_protocol_writer.cpp b/cpp/src/io/parquet/compact_protocol_writer.cpp index 14c99f728de..bf2db013118 100644 --- a/cpp/src/io/parquet/compact_protocol_writer.cpp +++ b/cpp/src/io/parquet/compact_protocol_writer.cpp @@ -291,7 +291,7 @@ uint32_t CompactProtocolFieldWriter::put_uint(uint64_t v) uint32_t CompactProtocolFieldWriter::put_int(int64_t v) { - int64_t s = (v < 0); + int64_t const s = (v < 0); return put_uint(((v ^ -s) << 1) + s); } diff --git a/cpp/src/io/parquet/predicate_pushdown.cpp b/cpp/src/io/parquet/predicate_pushdown.cpp index cd3dcd2bce4..b0cbabf1c12 100644 --- a/cpp/src/io/parquet/predicate_pushdown.cpp +++ b/cpp/src/io/parquet/predicate_pushdown.cpp @@ -426,7 +426,7 @@ std::optional>> aggregate_reader_metadata::fi // where min(col[i]) = columns[i*2], max(col[i])=columns[i*2+1] // For each column, it contains #sources * #column_chunks_per_src rows. std::vector> columns; - stats_caster stats_col{total_row_groups, per_file_metadata, input_row_group_indices}; + stats_caster const stats_col{total_row_groups, per_file_metadata, input_row_group_indices}; for (size_t col_idx = 0; col_idx < output_dtypes.size(); col_idx++) { auto const schema_idx = output_column_schemas[col_idx]; auto const& dtype = output_dtypes[col_idx]; @@ -447,7 +447,8 @@ std::optional>> aggregate_reader_metadata::fi auto stats_table = cudf::table(std::move(columns)); // Converts AST to StatsAST with reference to min, max columns in above `stats_table`. 
- stats_expression_converter stats_expr{filter.get(), static_cast(output_dtypes.size())}; + stats_expression_converter const stats_expr{filter.get(), + static_cast(output_dtypes.size())}; auto stats_ast = stats_expr.get_stats_expr(); auto predicate_col = cudf::detail::compute_column(stats_table, stats_ast.get(), stream, mr); auto predicate = predicate_col->view(); diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index d74ae83b635..c48ff896e33 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -148,7 +148,7 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num CUDF_EXPECTS(input_col.schema_idx == pass.chunks[c].src_col_schema, "Column/page schema index mismatch"); - size_t max_depth = _metadata->get_output_nesting_depth(pass.chunks[c].src_col_schema); + size_t const max_depth = _metadata->get_output_nesting_depth(pass.chunks[c].src_col_schema); chunk_offsets.push_back(chunk_off); // get a slice of size `nesting depth` from `chunk_nested_valids` to store an array of pointers @@ -203,7 +203,7 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num auto& out_buf = (*cols)[input_col.nesting[idx]]; cols = &out_buf.children; - int owning_schema = out_buf.user_data & PARQUET_COLUMN_BUFFER_SCHEMA_MASK; + int const owning_schema = out_buf.user_data & PARQUET_COLUMN_BUFFER_SCHEMA_MASK; if (owning_schema == 0 || owning_schema == input_col.schema_idx) { valids[idx] = out_buf.null_mask(); data[idx] = out_buf.data(); @@ -435,7 +435,7 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num ColumnChunkDesc* col = &pass.chunks[pi->chunk_idx]; input_column_info const& input_col = _input_columns[col->src_col_index]; - int index = pi->nesting_decode - page_nesting_decode.device_ptr(); + int const index = pi->nesting_decode - page_nesting_decode.device_ptr(); PageNestingDecodeInfo* pndi = &page_nesting_decode[index]; auto* cols = &_output_buffers; diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp index a6562d33de2..bfd0cc992cf 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.cpp +++ b/cpp/src/io/parquet/reader_impl_helpers.cpp @@ -833,7 +833,7 @@ std::optional aggregate_reader_metadata::decode_ipc_message( // Lambda function to read and return 4 bytes as int32_t from the ipc message buffer and update // buffer pointer and size auto read_int32_from_ipc_message = [&]() { - int32_t bytes; + int32_t bytes = 0; std::memcpy(&bytes, message_buf, sizeof(int32_t)); // Offset the message buf and reduce remaining size message_buf += sizeof(int32_t); @@ -991,7 +991,7 @@ std::string aggregate_reader_metadata::get_pandas_index() const // One-liner regex: // "index_columns"\s*:\s*\[\s*((?:"(?:|(?:.*?(?![^\\]")).?)[^\\]?",?\s*)*)\] // Documented below. 
- std::regex index_columns_expr{ + std::regex const index_columns_expr{ R"("index_columns"\s*:\s*\[\s*)" // match preamble, opening square bracket, whitespace R"(()" // Open first capturing group R"((?:")" // Open non-capturing group match opening quote @@ -1013,12 +1013,12 @@ std::vector aggregate_reader_metadata::get_pandas_index_names() con std::vector names; auto str = get_pandas_index(); if (str.length() != 0) { - std::regex index_name_expr{R"(\"((?:\\.|[^\"])*)\")"}; + std::regex const index_name_expr{R"(\"((?:\\.|[^\"])*)\")"}; std::smatch sm; while (std::regex_search(str, sm, index_name_expr)) { if (sm.size() == 2) { // 2 = whole match, first item if (std::find(names.begin(), names.end(), sm[1].str()) == names.end()) { - std::regex esc_quote{R"(\\")"}; + std::regex const esc_quote{R"(\\")"}; names.emplace_back(std::regex_replace(sm[1].str(), esc_quote, R"(")")); } } @@ -1362,8 +1362,8 @@ aggregate_reader_metadata::select_columns( std::vector all_paths; std::function add_path = [&](std::string path_till_now, int schema_idx) { - auto const& schema_elem = get_schema(schema_idx); - std::string curr_path = path_till_now + schema_elem.name; + auto const& schema_elem = get_schema(schema_idx); + std::string const curr_path = path_till_now + schema_elem.name; all_paths.push_back({curr_path, schema_idx}); for (auto const& child_idx : schema_elem.children_idx) { add_path(curr_path + ".", child_idx); @@ -1376,7 +1376,7 @@ aggregate_reader_metadata::select_columns( // Find which of the selected paths are valid and get their schema index std::vector valid_selected_paths; // vector reference pushback (*use_names). If filter names passed. - std::vector const>> column_names{ + std::vector const>> const column_names{ *use_names, *filter_columns_names}; for (auto const& used_column_names : column_names) { for (auto const& selected_path : used_column_names.get()) { @@ -1408,7 +1408,7 @@ aggregate_reader_metadata::select_columns( std::vector selected_columns; if (include_index) { - std::vector index_names = get_pandas_index_names(); + std::vector const index_names = get_pandas_index_names(); std::transform(index_names.cbegin(), index_names.cend(), std::back_inserter(selected_columns), @@ -1457,7 +1457,7 @@ aggregate_reader_metadata::select_columns( } for (auto& col : selected_columns) { auto const& top_level_col_schema_idx = find_schema_child(root, col.name); - bool valid_column = build_column(&col, top_level_col_schema_idx, output_columns, false); + bool const valid_column = build_column(&col, top_level_col_schema_idx, output_columns, false); if (valid_column) { output_column_schemas.push_back(top_level_col_schema_idx); diff --git a/cpp/src/io/text/bgzip_utils.cpp b/cpp/src/io/text/bgzip_utils.cpp index cb412828e2d..77da2a44c7c 100644 --- a/cpp/src/io/text/bgzip_utils.cpp +++ b/cpp/src/io/text/bgzip_utils.cpp @@ -40,7 +40,7 @@ IntType read_int(char* data) template void write_int(std::ostream& output_stream, T val) { - std::array bytes; + std::array bytes{}; // we assume little-endian std::memcpy(&bytes[0], &val, sizeof(T)); output_stream.write(bytes.data(), bytes.size()); diff --git a/cpp/src/io/utilities/base64_utilities.cpp b/cpp/src/io/utilities/base64_utilities.cpp index 856c29599a7..2a2a07afc8d 100644 --- a/cpp/src/io/utilities/base64_utilities.cpp +++ b/cpp/src/io/utilities/base64_utilities.cpp @@ -86,7 +86,7 @@ std::string base64_encode(std::string_view string_to_encode) num_iterations += (input_length % 3) ? 
1 : 0; std::string encoded; - size_t encoded_length = (input_length + 2) / 3 * 4; + size_t const encoded_length = (input_length + 2) / 3 * 4; encoded.reserve(encoded_length); // altered: modify base64 encoder loop using STL and Thrust. @@ -135,7 +135,7 @@ std::string base64_decode(std::string_view encoded_string) return std::string{}; } - size_t input_length = encoded_string.length(); + size_t const input_length = encoded_string.length(); std::string decoded; // altered: compute number of decoding iterations = floor (multiple of 4) @@ -147,7 +147,7 @@ std::string base64_decode(std::string_view encoded_string) // two bytes smaller, depending on the amount of trailing equal signs // in the encoded string. This approximation is needed to reserve // enough space in the string to be returned. - size_t approx_decoded_length = input_length / 4 * 3; + size_t const approx_decoded_length = input_length / 4 * 3; decoded.reserve(approx_decoded_length); // diff --git a/cpp/src/io/utilities/data_sink.cpp b/cpp/src/io/utilities/data_sink.cpp index b37a5ac900a..bed03869b34 100644 --- a/cpp/src/io/utilities/data_sink.cpp +++ b/cpp/src/io/utilities/data_sink.cpp @@ -86,7 +86,7 @@ class file_sink : public data_sink { { if (!supports_device_write()) CUDF_FAIL("Device writes are not supported for this file."); - size_t offset = _bytes_written; + size_t const offset = _bytes_written; _bytes_written += size; if (!_kvikio_file.closed()) { @@ -170,7 +170,7 @@ class void_sink : public data_sink { size_t bytes_written() override { return _bytes_written; } private: - size_t _bytes_written; + size_t _bytes_written{}; }; class user_sink_wrapper : public data_sink { diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 10814eea458..62ef7c7a794 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -128,7 +128,8 @@ class file_source : public datasource { rmm::cuda_stream_view stream) override { rmm::device_buffer out_data(size, stream); - size_t read = device_read(offset, size, reinterpret_cast(out_data.data()), stream); + size_t const read = + device_read(offset, size, reinterpret_cast(out_data.data()), stream); out_data.resize(read, stream); return datasource::buffer::create(std::move(out_data)); } @@ -444,7 +445,8 @@ class remote_file_source : public datasource { rmm::cuda_stream_view stream) override { rmm::device_buffer out_data(size, stream); - size_t read = device_read(offset, size, reinterpret_cast(out_data.data()), stream); + size_t const read = + device_read(offset, size, reinterpret_cast(out_data.data()), stream); out_data.resize(read, stream); return datasource::buffer::create(std::move(out_data)); } @@ -471,7 +473,7 @@ class remote_file_source : public datasource { static bool is_supported_remote_url(std::string const& url) { // Regular expression to match "s3://" - static std::regex pattern{R"(^s3://)", std::regex_constants::icase}; + static std::regex const pattern{R"(^s3://)", std::regex_constants::icase}; return std::regex_search(url, pattern); } diff --git a/cpp/src/io/utilities/file_io_utilities.cpp b/cpp/src/io/utilities/file_io_utilities.cpp index f9750e4a505..9b17e7f6d55 100644 --- a/cpp/src/io/utilities/file_io_utilities.cpp +++ b/cpp/src/io/utilities/file_io_utilities.cpp @@ -33,6 +33,24 @@ namespace cudf { namespace io { namespace detail { +namespace { + +[[nodiscard]] int open_file_checked(std::string const& filepath, int flags, mode_t mode) +{ + auto const fd = open(filepath.c_str(), flags, mode); + if (fd == -1) { 
throw_on_file_open_failure(filepath, flags & O_CREAT); } + + return fd; +} + +[[nodiscard]] size_t get_file_size(int file_descriptor) +{ + struct stat st {}; + CUDF_EXPECTS(fstat(file_descriptor, &st) != -1, "Cannot query file size"); + return static_cast(st.st_size); +} + +} // namespace void force_init_cuda_context() { @@ -55,26 +73,11 @@ void force_init_cuda_context() CUDF_EXPECTS(std::filesystem::exists(path), "Cannot open file; it does not exist"); } - std::array error_msg_buffer; + std::array error_msg_buffer{}; auto const error_msg = strerror_r(err, error_msg_buffer.data(), 1024); CUDF_FAIL("Cannot open file; failed with errno: " + std::string{error_msg}); } -[[nodiscard]] int open_file_checked(std::string const& filepath, int flags, mode_t mode) -{ - auto const fd = open(filepath.c_str(), flags, mode); - if (fd == -1) { throw_on_file_open_failure(filepath, flags & O_CREAT); } - - return fd; -} - -[[nodiscard]] size_t get_file_size(int file_descriptor) -{ - struct stat st; - CUDF_EXPECTS(fstat(file_descriptor, &st) != -1, "Cannot query file size"); - return static_cast(st.st_size); -} - file_wrapper::file_wrapper(std::string const& filepath, int flags, mode_t mode) : fd(open_file_checked(filepath.c_str(), flags, mode)), _size{get_file_size(fd)} { @@ -125,7 +128,7 @@ class cufile_shim { void cufile_shim::modify_cufile_json() const { std::string const json_path_env_var = "CUFILE_ENV_PATH_JSON"; - static temp_directory tmp_config_dir{"cudf_cufile_config"}; + static temp_directory const tmp_config_dir{"cudf_cufile_config"}; // Modify the config file based on the policy auto const config_file_path = getenv_or(json_path_env_var, "/etc/cufile.json"); @@ -253,7 +256,7 @@ std::future cufile_input_impl::read_async(size_t offset, uint8_t* dst, rmm::cuda_stream_view stream) { - int device; + int device = 0; CUDF_CUDA_TRY(cudaGetDevice(&device)); auto read_slice = [device, gds_read = shim->read, file_handle = cf_file.handle()]( @@ -285,7 +288,7 @@ cufile_output_impl::cufile_output_impl(std::string const& filepath) std::future cufile_output_impl::write_async(void const* data, size_t offset, size_t size) { - int device; + int device = 0; CUDF_CUDA_TRY(cudaGetDevice(&device)); auto write_slice = [device, gds_write = shim->write, file_handle = cf_file.handle()]( diff --git a/cpp/src/jit/cache.cpp b/cpp/src/jit/cache.cpp index 34a0bdce124..49f92756e43 100644 --- a/cpp/src/jit/cache.cpp +++ b/cpp/src/jit/cache.cpp @@ -22,6 +22,7 @@ namespace cudf { namespace jit { +namespace { // Get the directory in home to use for storing the cache std::filesystem::path get_user_home_cache_dir() @@ -72,13 +73,13 @@ std::filesystem::path get_cache_dir() // Make per device cache based on compute capability. This is to avoid multiple devices of // different compute capability to access the same kernel cache. - int device; - int cc_major; - int cc_minor; + int device = 0; + int cc_major = 0; + int cc_minor = 0; CUDF_CUDA_TRY(cudaGetDevice(&device)); CUDF_CUDA_TRY(cudaDeviceGetAttribute(&cc_major, cudaDevAttrComputeCapabilityMajor, device)); CUDF_CUDA_TRY(cudaDeviceGetAttribute(&cc_minor, cudaDevAttrComputeCapabilityMinor, device)); - int cc = cc_major * 10 + cc_minor; + int const cc = cc_major * 10 + cc_minor; kernel_cache_path /= std::to_string(cc); @@ -107,13 +108,14 @@ std::size_t try_parse_numeric_env_var(char const* const env_name, std::size_t de auto const value = std::getenv(env_name); return value != nullptr ? 
std::stoull(value) : default_val; } +} // namespace jitify2::ProgramCache<>& get_program_cache(jitify2::PreprocessedProgramData preprog) { static std::mutex caches_mutex{}; static std::unordered_map>> caches{}; - std::lock_guard caches_lock(caches_mutex); + std::lock_guard const caches_lock(caches_mutex); auto existing_cache = caches.find(preprog.name()); diff --git a/cpp/src/jit/parser.cpp b/cpp/src/jit/parser.cpp index 519ac2d1a2e..c79ba4347bf 100644 --- a/cpp/src/jit/parser.cpp +++ b/cpp/src/jit/parser.cpp @@ -26,10 +26,37 @@ namespace cudf { namespace jit { -constexpr char percent_escape[] = "_"; // NOLINT +namespace { inline bool is_white(char const c) { return c == ' ' || c == '\n' || c == '\r' || c == '\t'; } +std::string remove_comments(std::string const& src) +{ + std::string output; + auto f = src.cbegin(); + while (f < src.cend()) { + auto l = std::find(f, src.cend(), '/'); + output.append(f, l); // push chunk instead of 1 char at a time + f = std::next(l); // skip over '/' + if (l < src.cend()) { + char const n = f < src.cend() ? *f : '?'; + if (n == '/') { // found "//" + f = std::find(f, src.cend(), '\n'); // skip to end of line + } else if (n == '*') { // found "/*" + auto term = std::string("*/"); // skip to end of next "*/" + f = std::search(std::next(f), src.cend(), term.cbegin(), term.cend()) + term.size(); + } else { + output.push_back('/'); // lone '/' should be pushed into output + } + } + } + return output; +} + +} // namespace + +constexpr char percent_escape[] = "_"; // NOLINT + std::string ptx_parser::escape_percent(std::string const& src) { // b/c we're transforming into inline ptx we aren't allowed to have register names starting with % @@ -106,7 +133,7 @@ std::string ptx_parser::parse_instruction(std::string const& src) std::string output; std::string suffix; - std::string original_code = "\n /** " + src + " */\n"; + std::string const original_code = "\n /** " + src + " */\n"; int piece_count = 0; @@ -316,33 +343,10 @@ std::string ptx_parser::parse_function_header(std::string const& src) return "\n__device__ __inline__ void " + function_name + "(" + input_arg + "){" + "\n"; } -std::string remove_comments(std::string const& src) -{ - std::string output; - auto f = src.cbegin(); - while (f < src.cend()) { - auto l = std::find(f, src.cend(), '/'); - output.append(f, l); // push chunk instead of 1 char at a time - f = std::next(l); // skip over '/' - if (l < src.cend()) { - char n = f < src.cend() ? 
*f : '?'; - if (n == '/') { // found "//" - f = std::find(f, src.cend(), '\n'); // skip to end of line - } else if (n == '*') { // found "/*" - auto term = std::string("*/"); // skip to end of next "*/" - f = std::search(std::next(f), src.cend(), term.cbegin(), term.cend()) + term.size(); - } else { - output.push_back('/'); // lone '/' should be pushed into output - } - } - } - return output; -} - // The interface std::string ptx_parser::parse() { - std::string no_comments = remove_comments(ptx); + std::string const no_comments = remove_comments(ptx); input_arg_list.clear(); auto const _func = std::string(".func"); // Go directly to the .func mark diff --git a/cpp/src/quantiles/tdigest/tdigest_column_view.cpp b/cpp/src/quantiles/tdigest/tdigest_column_view.cpp index 17844b6bb0a..933ef1bfcbd 100644 --- a/cpp/src/quantiles/tdigest/tdigest_column_view.cpp +++ b/cpp/src/quantiles/tdigest/tdigest_column_view.cpp @@ -29,14 +29,14 @@ tdigest_column_view::tdigest_column_view(column_view const& col) : column_view(c CUDF_EXPECTS(col.offset() == 0, "Encountered a sliced tdigest column"); CUDF_EXPECTS(not col.nullable(), "Encountered nullable tdigest column"); - structs_column_view scv(col); + structs_column_view const scv(col); CUDF_EXPECTS(scv.num_children() == 3, "Encountered invalid tdigest column"); CUDF_EXPECTS(scv.child(min_column_index).type().id() == type_id::FLOAT64, "Encountered invalid tdigest column"); CUDF_EXPECTS(scv.child(max_column_index).type().id() == type_id::FLOAT64, "Encountered invalid tdigest column"); - lists_column_view lcv(scv.child(centroid_column_index)); + lists_column_view const lcv(scv.child(centroid_column_index)); auto data = lcv.child(); CUDF_EXPECTS(data.type().id() == type_id::STRUCT, "Encountered invalid tdigest column"); CUDF_EXPECTS(data.num_children() == 2, @@ -52,14 +52,14 @@ lists_column_view tdigest_column_view::centroids() const { return child(centroid column_view tdigest_column_view::means() const { auto c = centroids(); - structs_column_view inner(c.parent().child(lists_column_view::child_column_index)); + structs_column_view const inner(c.parent().child(lists_column_view::child_column_index)); return inner.child(mean_column_index); } column_view tdigest_column_view::weights() const { auto c = centroids(); - structs_column_view inner(c.parent().child(lists_column_view::child_column_index)); + structs_column_view const inner(c.parent().child(lists_column_view::child_column_index)); return inner.child(weight_column_index); } diff --git a/cpp/src/reductions/scan/scan.cpp b/cpp/src/reductions/scan/scan.cpp index b91ae19b51a..7afd3ba3c00 100644 --- a/cpp/src/reductions/scan/scan.cpp +++ b/cpp/src/reductions/scan/scan.cpp @@ -20,8 +20,8 @@ #include namespace cudf { - namespace detail { +namespace { std::unique_ptr scan(column_view const& input, scan_aggregation const& agg, scan_type inclusive, @@ -50,6 +50,7 @@ std::unique_ptr scan(column_view const& input, : detail::scan_inclusive(input, agg, null_handling, stream, mr); } +} // namespace } // namespace detail std::unique_ptr scan(column_view const& input, diff --git a/cpp/src/reductions/segmented/reductions.cpp b/cpp/src/reductions/segmented/reductions.cpp index c4f6c135dde..dedfc4b0734 100644 --- a/cpp/src/reductions/segmented/reductions.cpp +++ b/cpp/src/reductions/segmented/reductions.cpp @@ -26,6 +26,8 @@ namespace cudf { namespace reduction { namespace detail { +namespace { + struct segmented_reduce_dispatch_functor { column_view const& col; device_span offsets; @@ -126,6 +128,7 @@ std::unique_ptr 
segmented_reduce(column_view const& segmented_values, segmented_values, offsets, output_dtype, null_handling, init, stream, mr}, agg); } +} // namespace } // namespace detail } // namespace reduction diff --git a/cpp/src/rolling/detail/optimized_unbounded_window.cpp b/cpp/src/rolling/detail/optimized_unbounded_window.cpp index 7cad31c0658..9c22c27144d 100644 --- a/cpp/src/rolling/detail/optimized_unbounded_window.cpp +++ b/cpp/src/rolling/detail/optimized_unbounded_window.cpp @@ -25,32 +25,7 @@ #include namespace cudf::detail { - -bool can_optimize_unbounded_window(bool unbounded_preceding, - bool unbounded_following, - size_type min_periods, - rolling_aggregation const& agg) -{ - auto is_supported = [](auto const& agg) { - switch (agg.kind) { - case cudf::aggregation::Kind::COUNT_ALL: [[fallthrough]]; - case cudf::aggregation::Kind::COUNT_VALID: [[fallthrough]]; - case cudf::aggregation::Kind::SUM: [[fallthrough]]; - case cudf::aggregation::Kind::MIN: [[fallthrough]]; - case cudf::aggregation::Kind::MAX: return true; - default: - // COLLECT_LIST and COLLECT_SET can be added at a later date. - // Other aggregations do not fit into the [UNBOUNDED, UNBOUNDED] - // category. For instance: - // 1. Ranking functions (ROW_NUMBER, RANK, DENSE_RANK, PERCENT_RANK) - // use [UNBOUNDED PRECEDING, CURRENT ROW]. - // 2. LEAD/LAG are defined on finite row boundaries. - return false; - } - }; - - return unbounded_preceding && unbounded_following && (min_periods == 1) && is_supported(agg); -} +namespace { /// Converts rolling_aggregation to corresponding reduce/groupby_aggregation. template @@ -145,6 +120,33 @@ std::unique_ptr reduction_based_rolling_window(column_view const& input, // Blow up results into separate column. return cudf::make_column_from_scalar(*reduce_results, input.size(), stream, mr); } +} // namespace + +bool can_optimize_unbounded_window(bool unbounded_preceding, + bool unbounded_following, + size_type min_periods, + rolling_aggregation const& agg) +{ + auto is_supported = [](auto const& agg) { + switch (agg.kind) { + case cudf::aggregation::Kind::COUNT_ALL: [[fallthrough]]; + case cudf::aggregation::Kind::COUNT_VALID: [[fallthrough]]; + case cudf::aggregation::Kind::SUM: [[fallthrough]]; + case cudf::aggregation::Kind::MIN: [[fallthrough]]; + case cudf::aggregation::Kind::MAX: return true; + default: + // COLLECT_LIST and COLLECT_SET can be added at a later date. + // Other aggregations do not fit into the [UNBOUNDED, UNBOUNDED] + // category. For instance: + // 1. Ranking functions (ROW_NUMBER, RANK, DENSE_RANK, PERCENT_RANK) + // use [UNBOUNDED PRECEDING, CURRENT ROW]. + // 2. LEAD/LAG are defined on finite row boundaries. 
+ return false; + } + }; + + return unbounded_preceding && unbounded_following && (min_periods == 1) && is_supported(agg); +} std::unique_ptr optimized_unbounded_window(table_view const& group_keys, column_view const& input, diff --git a/cpp/src/strings/regex/regcomp.cpp b/cpp/src/strings/regex/regcomp.cpp index b923a301f84..b7b1338dd89 100644 --- a/cpp/src/strings/regex/regcomp.cpp +++ b/cpp/src/strings/regex/regcomp.cpp @@ -80,8 +80,8 @@ std::array const escapable_chars{ */ std::vector string_to_char32_vector(std::string_view pattern) { - auto size = static_cast(pattern.size()); - size_type count = std::count_if(pattern.cbegin(), pattern.cend(), [](char ch) { + auto size = static_cast(pattern.size()); + size_type const count = std::count_if(pattern.cbegin(), pattern.cend(), [](char ch) { return is_begin_utf8_char(static_cast(ch)); }); std::vector result(count + 1); @@ -89,7 +89,7 @@ std::vector string_to_char32_vector(std::string_view pattern) char const* input_ptr = pattern.data(); for (size_type idx = 0; idx < size; ++idx) { char_utf8 output_character = 0; - size_type ch_width = to_char_utf8(input_ptr, output_character); + size_type const ch_width = to_char_utf8(input_ptr, output_character); input_ptr += ch_width; idx += ch_width - 1; *output_ptr++ = output_character; @@ -102,7 +102,7 @@ std::vector string_to_char32_vector(std::string_view pattern) int32_t reprog::add_inst(int32_t t) { - reinst inst; + reinst inst{}; inst.type = t; inst.u2.left_id = 0; inst.u1.right_id = 0; @@ -968,7 +968,7 @@ class regex_compiler { } if (token != RBRA) { push_operator(token, subid); } - static std::vector tokens{STAR, STAR_LAZY, QUEST, QUEST_LAZY, PLUS, PLUS_LAZY, RBRA}; + static std::vector const tokens{STAR, STAR_LAZY, QUEST, QUEST_LAZY, PLUS, PLUS_LAZY, RBRA}; _last_was_and = std::any_of(tokens.cbegin(), tokens.cend(), [token](auto t) { return t == token; }); } @@ -1046,7 +1046,7 @@ reprog reprog::create_from(std::string_view pattern, { reprog rtn; auto pattern32 = string_to_char32_vector(pattern); - regex_compiler compiler(pattern32.data(), flags, capture, rtn); + regex_compiler const compiler(pattern32.data(), flags, capture, rtn); // for debugging, it can be helpful to call rtn.print(flags) here to dump // out the instructions that have been created from the given pattern return rtn; @@ -1114,7 +1114,7 @@ void reprog::build_start_ids() std::stack ids; ids.push(_startinst_id); while (!ids.empty()) { - int id = ids.top(); + int const id = ids.top(); ids.pop(); reinst const& inst = _insts[id]; if (inst.type == OR) { diff --git a/cpp/src/strings/regex/regexec.cpp b/cpp/src/strings/regex/regexec.cpp index 60ad714dfec..3d11b641b3f 100644 --- a/cpp/src/strings/regex/regexec.cpp +++ b/cpp/src/strings/regex/regexec.cpp @@ -99,9 +99,9 @@ std::unique_ptr> reprog_devic // place each class and append the variable length data for (int32_t idx = 0; idx < classes_count; ++idx) { auto const& h_class = h_prog.class_at(idx); - reclass_device d_class{h_class.builtins, - static_cast(h_class.literals.size()), - reinterpret_cast(d_end)}; + reclass_device const d_class{h_class.builtins, + static_cast(h_class.literals.size()), + reinterpret_cast(d_end)}; *classes++ = d_class; memcpy(h_end, h_class.literals.data(), h_class.literals.size() * sizeof(reclass_range)); h_end += h_class.literals.size() * sizeof(reclass_range); diff --git a/cpp/src/structs/utilities.cpp b/cpp/src/structs/utilities.cpp index 4012ee3d21c..22328726c0e 100644 --- a/cpp/src/structs/utilities.cpp +++ b/cpp/src/structs/utilities.cpp @@ -47,7 +47,7 @@ 
std::vector> extract_ordered_struct_children( std::vector children; children.reserve(num_cols); for (size_type col_index = 0; col_index < num_cols; col_index++) { - structs_column_view scv(struct_cols[col_index]); + structs_column_view const scv(struct_cols[col_index]); // all inputs must have the same # of children and they must all be of the // same type. diff --git a/cpp/src/table/table_view.cpp b/cpp/src/table/table_view.cpp index 659beb749af..ee7136d8f5e 100644 --- a/cpp/src/table/table_view.cpp +++ b/cpp/src/table/table_view.cpp @@ -25,6 +25,21 @@ namespace cudf { namespace detail { +namespace { + +template +auto concatenate_column_views(std::vector const& views) +{ + using ColumnView = typename ViewType::ColumnView; + std::vector concat_cols; + for (auto& view : views) { + concat_cols.insert(concat_cols.end(), view.begin(), view.end()); + } + return concat_cols; +} + +} // namespace + template table_view_base::table_view_base(std::vector const& cols) : _columns{cols} { @@ -38,17 +53,6 @@ table_view_base::table_view_base(std::vector const& cols } } -template -auto concatenate_column_views(std::vector const& views) -{ - using ColumnView = typename ViewType::ColumnView; - std::vector concat_cols; - for (auto& view : views) { - concat_cols.insert(concat_cols.end(), view.begin(), view.end()); - } - return concat_cols; -} - // Explicit instantiation for a table of `column_view`s template class table_view_base; @@ -65,17 +69,16 @@ table_view table_view::select(std::vector const& column_indices) cons // Convert mutable view to immutable view mutable_table_view::operator table_view() { - std::vector cols{begin(), end()}; - return table_view{cols}; + return table_view{std::vector{begin(), end()}}; } table_view::table_view(std::vector const& views) - : table_view{concatenate_column_views(views)} + : table_view{detail::concatenate_column_views(views)} { } mutable_table_view::mutable_table_view(std::vector const& views) - : mutable_table_view{concatenate_column_views(views)} + : mutable_table_view{detail::concatenate_column_views(views)} { } diff --git a/cpp/src/transform/transform.cpp b/cpp/src/transform/transform.cpp index b919ac16956..4a383bfba47 100644 --- a/cpp/src/transform/transform.cpp +++ b/cpp/src/transform/transform.cpp @@ -33,7 +33,7 @@ namespace cudf { namespace transformation { namespace jit { - +namespace { void unary_operation(mutable_column_view output, column_view input, std::string const& udf, @@ -41,7 +41,7 @@ void unary_operation(mutable_column_view output, bool is_ptx, rmm::cuda_stream_view stream) { - std::string kernel_name = + std::string const kernel_name = jitify2::reflection::Template("cudf::transformation::jit::kernel") // .instantiate(cudf::type_to_name(output.type()), // list of template arguments cudf::type_to_name(input.type())); @@ -62,6 +62,7 @@ void unary_operation(mutable_column_view output, cudf::jit::get_data_ptr(output), cudf::jit::get_data_ptr(input)); } +} // namespace } // namespace jit } // namespace transformation @@ -81,7 +82,7 @@ std::unique_ptr transform(column_view const& input, if (input.is_empty()) { return output; } - mutable_column_view output_view = *output; + mutable_column_view const output_view = *output; // transform transformation::jit::unary_operation(output_view, input, unary_udf, output_type, is_ptx, stream); diff --git a/cpp/src/utilities/prefetch.cpp b/cpp/src/utilities/prefetch.cpp index 000526723c4..6c9f677afb3 100644 --- a/cpp/src/utilities/prefetch.cpp +++ b/cpp/src/utilities/prefetch.cpp @@ -33,14 +33,14 @@ prefetch_config& 
prefetch_config::instance() bool prefetch_config::get(std::string_view key) { - std::shared_lock lock(config_mtx); + std::shared_lock const lock(config_mtx); auto const it = config_values.find(key.data()); return it == config_values.end() ? false : it->second; // default to not prefetching } void prefetch_config::set(std::string_view key, bool value) { - std::lock_guard lock(config_mtx); + std::lock_guard const lock(config_mtx); config_values[key.data()] = value; } diff --git a/cpp/src/utilities/stream_pool.cpp b/cpp/src/utilities/stream_pool.cpp index 7069b59be26..9d1bebd1937 100644 --- a/cpp/src/utilities/stream_pool.cpp +++ b/cpp/src/utilities/stream_pool.cpp @@ -55,6 +55,63 @@ std::size_t constexpr STREAM_POOL_SIZE = 32; } while (0) #endif +/** + * @brief RAII struct to wrap a cuda event and ensure its proper destruction. + */ +struct cuda_event { + cuda_event() { CUDF_CUDA_TRY(cudaEventCreateWithFlags(&e_, cudaEventDisableTiming)); } + virtual ~cuda_event() { CUDF_ASSERT_CUDA_SUCCESS(cudaEventDestroy(e_)); } + + // Moveable but not copyable. + cuda_event(const cuda_event&) = delete; + cuda_event& operator=(const cuda_event&) = delete; + + cuda_event(cuda_event&&) = default; + cuda_event& operator=(cuda_event&&) = default; + + operator cudaEvent_t() { return e_; } + + private: + cudaEvent_t e_{}; +}; + +namespace { + +// FIXME: these will be available in rmm soon +inline int get_num_cuda_devices() +{ + rmm::cuda_device_id::value_type num_dev{}; + CUDF_CUDA_TRY(cudaGetDeviceCount(&num_dev)); + return num_dev; +} + +rmm::cuda_device_id get_current_cuda_device() +{ + int device_id = 0; + CUDF_CUDA_TRY(cudaGetDevice(&device_id)); + return rmm::cuda_device_id{device_id}; +} + +/** + * @brief Returns a cudaEvent_t for the current thread. + * + * The returned event is valid for the current device. + * + * @return A cudaEvent_t unique to the current thread and valid on the current device. + */ +cudaEvent_t event_for_thread() +{ + // The program may crash if this function is called from the main thread and user application + // subsequently calls cudaDeviceReset(). + // As a workaround, here we intentionally disable RAII and leak cudaEvent_t. + thread_local static std::vector thread_events(get_num_cuda_devices()); + auto const device_id = get_current_cuda_device(); + if (not thread_events[device_id.value()]) { thread_events[device_id.value()] = new cuda_event(); } + return *thread_events[device_id.value()]; +} + +} // namespace + /** * @brief Implementation of `cuda_stream_pool` that wraps an `rmm::cuda_stram_pool`. */ @@ -109,59 +166,6 @@ cuda_stream_pool* create_global_cuda_stream_pool() return new rmm_cuda_stream_pool(); } -// FIXME: these will be available in rmm soon -inline int get_num_cuda_devices() -{ - rmm::cuda_device_id::value_type num_dev{}; - CUDF_CUDA_TRY(cudaGetDeviceCount(&num_dev)); - return num_dev; -} - -rmm::cuda_device_id get_current_cuda_device() -{ - int device_id; - CUDF_CUDA_TRY(cudaGetDevice(&device_id)); - return rmm::cuda_device_id{device_id}; -} - -/** - * @brief RAII struct to wrap a cuda event and ensure its proper destruction. - */ -struct cuda_event { - cuda_event() { CUDF_CUDA_TRY(cudaEventCreateWithFlags(&e_, cudaEventDisableTiming)); } - virtual ~cuda_event() { CUDF_ASSERT_CUDA_SUCCESS(cudaEventDestroy(e_)); } - - // Moveable but not copyable. 
- cuda_event(const cuda_event&) = delete;
-  cuda_event& operator=(const cuda_event&) = delete;
-
-  cuda_event(cuda_event&&) = default;
-  cuda_event& operator=(cuda_event&&) = default;
-
-  operator cudaEvent_t() { return e_; }
-
- private:
-  cudaEvent_t e_;
-};
-
-/**
- * @brief Returns a cudaEvent_t for the current thread.
- *
- * The returned event is valid for the current device.
- *
- * @return A cudaEvent_t unique to the current thread and valid on the current device.
- */
-cudaEvent_t event_for_thread()
-{
-  // The program may crash if this function is called from the main thread and user application
-  // subsequently calls cudaDeviceReset().
-  // As a workaround, here we intentionally disable RAII and leak cudaEvent_t.
-  thread_local std::vector thread_events(get_num_cuda_devices());
-  auto const device_id = get_current_cuda_device();
-  if (not thread_events[device_id.value()]) { thread_events[device_id.value()] = new cuda_event(); }
-  return *thread_events[device_id.value()];
-}
-
 /**
  * @brief Returns a reference to the global stream pool for the current device.
  * @return `cuda_stream_pool` valid on the current device.
  */
@@ -174,7 +178,7 @@ cuda_stream_pool& global_cuda_stream_pool()
   static std::mutex mutex;
   auto const device_id = get_current_cuda_device();
-  std::lock_guard lock(mutex);
+  std::lock_guard const lock(mutex);
   if (pools[device_id.value()] == nullptr) {
     pools[device_id.value()] = create_global_cuda_stream_pool();
   }

From beb42960a7fbf2b0c1da17c943bb66050539b39c Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic
Date: Tue, 3 Dec 2024 10:05:24 -0800
Subject: [PATCH 12/78] Workaround for a misaligned access in `read_csv` on some CUDA versions (#17477)

Use a global array instead of a shared memory array in the `gather_row_offsets_gpu` kernel. The impact on the kernel's performance is less than 5%, and this kernel takes up only a small portion of the total `read_csv` execution time, so the impact on overall performance is negligible. Also modified the functions that take this array to take a `device_span` instead of a plain pointer.
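As a rough sketch of the span-based pattern (illustrative names only, not the actual `csv_gpu.cu` code), the benefit is that the buffer size travels with the pointer, so each block can carve out its scratch region with a bounds-aware `subspan` instead of raw pointer arithmetic:

  #include <cudf/utilities/span.hpp>

  // Illustrative kernel: each block slices its own scratch region from a
  // global buffer; the span carries the size that a plain uint64_t* would lose.
  __global__ void example_kernel(cudf::device_span<uint64_t> ctxtree, std::size_t per_block)
  {
    auto const bk_ctxtree = ctxtree.subspan(blockIdx.x * per_block, per_block);
    for (std::size_t i = threadIdx.x; i < bk_ctxtree.size(); i += blockDim.x) {
      bk_ctxtree[i] = 0;  // initialize this block's slice of the scratch space
    }
  }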
Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Bradley Dice (https://github.com/bdice) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/17477 --- cpp/src/io/csv/csv_gpu.cu | 40 +++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/cpp/src/io/csv/csv_gpu.cu b/cpp/src/io/csv/csv_gpu.cu index 273e82edf8b..e2bc75d4bab 100644 --- a/cpp/src/io/csv/csv_gpu.cu +++ b/cpp/src/io/csv/csv_gpu.cu @@ -495,7 +495,7 @@ inline __device__ uint32_t select_rowmap(uint4 ctx_map, uint32_t ctxid) * @param t thread id (leaf node id) */ template -inline __device__ void ctx_merge(uint64_t* ctxtree, packed_rowctx_t* ctxb, uint32_t t) +inline __device__ void ctx_merge(device_span ctxtree, packed_rowctx_t* ctxb, uint32_t t) { uint64_t tmp = shuffle_xor(*ctxb, lanemask); if (!(t & tmask)) { @@ -518,7 +518,7 @@ inline __device__ void ctx_merge(uint64_t* ctxtree, packed_rowctx_t* ctxb, uint3 */ template inline __device__ void ctx_unmerge( - uint32_t base, uint64_t* ctxtree, uint32_t* ctx, uint32_t* brow4, uint32_t t) + uint32_t base, device_span ctxtree, uint32_t* ctx, uint32_t* brow4, uint32_t t) { rowctx32_t ctxb_left, ctxb_right, ctxb_sum; ctxb_sum = get_row_context(ctxtree[base], *ctx); @@ -550,7 +550,7 @@ inline __device__ void ctx_unmerge( * @param[in] ctxb packed row context for the current character block * @param t thread id (leaf node id) */ -static inline __device__ void rowctx_merge_transform(uint64_t ctxtree[1024], +static inline __device__ void rowctx_merge_transform(device_span ctxtree, packed_rowctx_t ctxb, uint32_t t) { @@ -584,8 +584,8 @@ static inline __device__ void rowctx_merge_transform(uint64_t ctxtree[1024], * * @return Final row context and count (row_position*4 + context_id format) */ -static inline __device__ rowctx32_t rowctx_inverse_merge_transform(uint64_t ctxtree[1024], - uint32_t t) +static inline __device__ rowctx32_t +rowctx_inverse_merge_transform(device_span ctxtree, uint32_t t) { uint32_t ctx = ctxtree[0] & 3; // Starting input context rowctx32_t brow4 = 0; // output row in block *4 @@ -603,6 +603,8 @@ static inline __device__ rowctx32_t rowctx_inverse_merge_transform(uint64_t ctxt return brow4 + ctx; } +constexpr auto bk_ctxtree_size = rowofs_block_dim * 2; + /** * @brief Gather row offsets from CSV character data split into 16KB chunks * @@ -634,6 +636,7 @@ static inline __device__ rowctx32_t rowctx_inverse_merge_transform(uint64_t ctxt */ CUDF_KERNEL void __launch_bounds__(rowofs_block_dim) gather_row_offsets_gpu(uint64_t* row_ctx, + device_span ctxtree, device_span offsets_out, device_span const data, size_t chunk_size, @@ -649,12 +652,8 @@ CUDF_KERNEL void __launch_bounds__(rowofs_block_dim) int escapechar, int commentchar) { - auto start = data.begin(); - using block_reduce = typename cub::BlockReduce; - __shared__ union { - typename block_reduce::TempStorage bk_storage; - __align__(8) uint64_t ctxtree[rowofs_block_dim * 2]; - } temp_storage; + auto start = data.begin(); + auto const bk_ctxtree = ctxtree.subspan(blockIdx.x * bk_ctxtree_size, bk_ctxtree_size); char const* end = start + (min(parse_pos + chunk_size, data_size) - start_offset); uint32_t t = threadIdx.x; @@ -723,16 +722,16 @@ CUDF_KERNEL void __launch_bounds__(rowofs_block_dim) // Convert the long-form {rowmap,outctx}[inctx] version into packed version // {rowcount,ouctx}[inctx], then merge the row contexts of the 32-character blocks into // a single 16K-character block context - 
rowctx_merge_transform(temp_storage.ctxtree, pack_rowmaps(ctx_map), t); + rowctx_merge_transform(bk_ctxtree, pack_rowmaps(ctx_map), t); // If this is the second phase, get the block's initial parser state and row counter if (offsets_out.data()) { - if (t == 0) { temp_storage.ctxtree[0] = row_ctx[blockIdx.x]; } + if (t == 0) { bk_ctxtree[0] = row_ctx[blockIdx.x]; } __syncthreads(); // Walk back the transform tree with the known initial parser state - rowctx32_t ctx = rowctx_inverse_merge_transform(temp_storage.ctxtree, t); - uint64_t row = (temp_storage.ctxtree[0] >> 2) + (ctx >> 2); + rowctx32_t ctx = rowctx_inverse_merge_transform(bk_ctxtree, t); + uint64_t row = (bk_ctxtree[0] >> 2) + (ctx >> 2); uint32_t rows_out_of_range = 0; uint32_t rowmap = select_rowmap(ctx_map, ctx & 3); // Output row positions @@ -749,11 +748,14 @@ CUDF_KERNEL void __launch_bounds__(rowofs_block_dim) } __syncthreads(); // Return the number of rows out of range - rows_out_of_range = block_reduce(temp_storage.bk_storage).Sum(rows_out_of_range); + + using block_reduce = typename cub::BlockReduce; + __shared__ typename block_reduce::TempStorage bk_storage; + rows_out_of_range = block_reduce(bk_storage).Sum(rows_out_of_range); if (t == 0) { row_ctx[blockIdx.x] = rows_out_of_range; } } else { // Just store the row counts and output contexts - if (t == 0) { row_ctx[blockIdx.x] = temp_storage.ctxtree[1]; } + if (t == 0) { row_ctx[blockIdx.x] = bk_ctxtree[1]; } } } @@ -829,7 +831,7 @@ void decode_row_column_data(cudf::io::parse_options_view const& options, // Calculate actual block count to use based on records count auto const block_size = csvparse_block_dim; auto const num_rows = row_offsets.size() - 1; - auto const grid_size = (num_rows + block_size - 1) / block_size; + auto const grid_size = cudf::util::div_rounding_up_safe(num_rows, block_size); convert_csv_to_cudf<<>>( options, data, column_flags, row_offsets, dtypes, columns, valids, valid_counts); @@ -849,9 +851,11 @@ uint32_t __host__ gather_row_offsets(parse_options_view const& options, rmm::cuda_stream_view stream) { uint32_t dim_grid = 1 + (chunk_size / rowofs_block_bytes); + auto ctxtree = rmm::device_uvector(dim_grid * bk_ctxtree_size, stream); gather_row_offsets_gpu<<>>( row_ctx, + ctxtree, offsets_out, data, chunk_size, From 7cc9a9fe8f8e1d889ac813cbbf7f7eb2d4897400 Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Tue, 3 Dec 2024 15:41:18 -0600 Subject: [PATCH 13/78] Use exec_policy_nosync in write_json (#17445) Part of #12086 Replaced `rmm::exec_policy` with `rmm::exec_policy_nosync` in write_json Authors: - Karthikeyan (https://github.com/karthikeyann) - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Shruti Shivakumar (https://github.com/shrshi) - Nghia Truong (https://github.com/ttnghia) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/17445 --- cpp/src/io/json/write_json.cu | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/cpp/src/io/json/write_json.cu b/cpp/src/io/json/write_json.cu index a4885d59cc5..1a0c59e365a 100644 --- a/cpp/src/io/json/write_json.cu +++ b/cpp/src/io/json/write_json.cu @@ -327,7 +327,7 @@ std::unique_ptr struct_to_strings(table_view const& strings_columns, -> size_type { return idx / tbl.num_columns(); })); auto validity_iterator = cudf::detail::make_counting_transform_iterator(0, validity_fn{*tbl_device_view}); - thrust::exclusive_scan_by_key(rmm::exec_policy(stream), + 
thrust::exclusive_scan_by_key(rmm::exec_policy_nosync(stream),
                                 row_num,
                                 row_num + total_rows,
                                 validity_iterator,
@@ -335,7 +335,7 @@ std::unique_ptr struct_to_strings(table_view const& strings_columns,
                                 false,
                                 thrust::equal_to{},
                                 thrust::logical_or{});
-  thrust::for_each(rmm::exec_policy(stream),
+  thrust::for_each(rmm::exec_policy_nosync(stream),
                    thrust::make_counting_iterator(0),
                    thrust::make_counting_iterator(total_rows),
                    [write_separator = d_str_separator.begin(),
@@ -362,7 +362,7 @@ std::unique_ptr struct_to_strings(table_view const& strings_columns,
     0,
     cuda::proclaim_return_type([num_strviews_per_row] __device__(size_type const i) {
       return i * num_strviews_per_row; }));
-  thrust::gather(rmm::exec_policy(stream),
+  thrust::gather(rmm::exec_policy_nosync(stream),
                  d_strview_offsets,
                  d_strview_offsets + row_string_offsets.size(),
                  old_offsets.begin(),
@@ -427,7 +427,7 @@ std::unique_ptr join_list_of_strings(lists_column_view const& lists_stri
       auto const length = offsets[idx + 1] - offsets[idx];
       return length == 0 ? 2 : (2 + length + length - 1);
     }));
-  thrust::exclusive_scan(rmm::exec_policy(stream),
+  thrust::exclusive_scan(rmm::exec_policy_nosync(stream),
                          num_strings_per_list,
                          num_strings_per_list + num_offsets,
                          d_strview_offsets.begin());
@@ -436,7 +436,7 @@ std::unique_ptr join_list_of_strings(lists_column_view const& lists_stri
   rmm::device_uvector d_strviews(total_strings, stream);
   // scatter null_list and list_prefix, list_suffix
   auto col_device_view = cudf::column_device_view::create(lists_strings.parent(), stream);
-  thrust::for_each(rmm::exec_policy(stream),
+  thrust::for_each(rmm::exec_policy_nosync(stream),
                    thrust::make_counting_iterator(0),
                    thrust::make_counting_iterator(num_lists),
                    [col = *col_device_view,
@@ -458,7 +458,7 @@ std::unique_ptr join_list_of_strings(lists_column_view const& lists_stri
   auto labels = cudf::lists::detail::generate_labels(
     lists_strings, num_strings, stream, cudf::get_current_device_resource_ref());
   auto d_strings_children = cudf::column_device_view::create(strings_children, stream);
-  thrust::for_each(rmm::exec_policy(stream),
+  thrust::for_each(rmm::exec_policy_nosync(stream),
                    thrust::make_counting_iterator(0),
                    thrust::make_counting_iterator(num_strings),
                    [col = *col_device_view,
@@ -485,7 +485,7 @@ std::unique_ptr join_list_of_strings(lists_column_view const& lists_stri
   // gather from offset and create a new string column
   auto old_offsets = strings_column_view(joined_col->view()).offsets();
   rmm::device_uvector row_string_offsets(num_offsets, stream, mr);
-  thrust::gather(rmm::exec_policy(stream),
+  thrust::gather(rmm::exec_policy_nosync(stream),
                  d_strview_offsets.begin(),
                  d_strview_offsets.end(),
                  old_offsets.begin(),

From 541e7e864c700bedfc667b5199a3415fca1b311d Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Tue, 3 Dec 2024 14:58:20 -0800
Subject: [PATCH 14/78] Make `column_empty` mask buffer creation consistent with libcudf (#16715)

Based on offline discussions, this PR makes `column_empty` consistent with libcudf where

* A size 0 "empty" column should not have a mask buffer
* A size > 0 "empty" (i.e., all null) column should have a mask buffer

Additionally removes `column_empty_like` which can be subsumed by `column_empty` (I didn't find any active usage of this method across RAPIDS https://github.com/search?q=org%3Arapidsai%20column_empty_like&type=code)

`column_empty` will have an unused `masked` argument, but since this method is used across RAPIDS I'll need to adjust those usages before 
removing that keyword here (https://github.com/search?q=org%3Arapidsai%20column_empty&type=code) Authors: - Matthew Roeschke (https://github.com/mroeschke) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16715 --- python/cudf/cudf/core/column/__init__.py | 1 - python/cudf/cudf/core/column/column.py | 66 +++++++++------------- python/cudf/cudf/core/dataframe.py | 14 ++--- python/cudf/cudf/core/reshape.py | 18 +++--- python/cudf/cudf/core/udf/groupby_utils.py | 5 +- python/cudf/cudf/tests/test_list.py | 2 +- python/cudf/cudf/tests/test_parquet.py | 25 ++++++++ python/cudf/cudf/tests/test_string_udfs.py | 4 +- python/cudf/cudf/utils/queryutils.py | 3 +- 9 files changed, 75 insertions(+), 63 deletions(-) diff --git a/python/cudf/cudf/core/column/__init__.py b/python/cudf/cudf/core/column/__init__.py index 0a9d339a6a8..db8d33f013a 100644 --- a/python/cudf/cudf/core/column/__init__.py +++ b/python/cudf/cudf/core/column/__init__.py @@ -6,7 +6,6 @@ as_column, build_column, column_empty, - column_empty_like, concat_columns, deserialize_columns, serialize_columns, diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index cdc3a03f445..c8cd80f45f4 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -882,7 +882,7 @@ def take( """ # Handle zero size if indices.size == 0: - return cast(Self, column_empty_like(self, newsize=0)) + return cast(Self, column_empty(row_count=0, dtype=self.dtype)) # TODO: For performance, the check and conversion of gather map should # be done by the caller. This check will be removed in future release. @@ -1222,7 +1222,6 @@ def __cuda_array_interface__(self) -> abc.Mapping[str, Any]: "data": (self.data_ptr, False), "version": 1, } - if self.nullable and self.has_nulls(): # Create a simple Python object that exposes the # `__cuda_array_interface__` attribute here since we need to modify @@ -1516,37 +1515,6 @@ def _return_sentinel_column(): return codes.fillna(na_sentinel.value) -def column_empty_like( - column: ColumnBase, - dtype: Dtype | None = None, - masked: bool = False, - newsize: int | None = None, -) -> ColumnBase: - """Allocate a new column like the given *column*""" - if dtype is None: - dtype = column.dtype - row_count = len(column) if newsize is None else newsize - - if ( - hasattr(column, "dtype") - and isinstance(column.dtype, cudf.CategoricalDtype) - and dtype == column.dtype - ): - catcolumn = cast("cudf.core.column.CategoricalColumn", column) - codes = column_empty_like( - catcolumn.codes, masked=masked, newsize=newsize - ) - return build_column( - data=None, - dtype=dtype, - mask=codes.base_mask, - children=(codes,), - size=codes.size, - ) - - return column_empty(row_count, dtype, masked) - - def _has_any_nan(arbitrary: pd.Series | np.ndarray) -> bool: """Check if an object dtype Series or array contains NaN.""" return any( @@ -1556,9 +1524,31 @@ def _has_any_nan(arbitrary: pd.Series | np.ndarray) -> bool: def column_empty( - row_count: int, dtype: Dtype = "object", masked: bool = False + row_count: int, + dtype: Dtype = "object", + masked: bool = False, + for_numba: bool = False, ) -> ColumnBase: - """Allocate a new column like the given row_count and dtype.""" + """ + Allocate a new column with the given row_count and dtype. + + * Passing row_count == 0 creates a size 0 column without a mask buffer. + * Passing row_count > 0 creates an all null column with a mask buffer. 
+ + Parameters + ---------- + row_count : int + Number of elements in the column. + + dtype : Dtype + Type of the column. + + masked : bool + Unused. + + for_numba : bool, default False + If True, don't allocate a mask as it's not supported by numba. + """ dtype = cudf.dtype(dtype) children: tuple[ColumnBase, ...] = () @@ -1600,7 +1590,7 @@ def column_empty( else: data = as_buffer(rmm.DeviceBuffer(size=row_count * dtype.itemsize)) - if masked: + if row_count > 0 and not for_numba: mask = as_buffer( plc.null_mask.create_null_mask( row_count, plc.null_mask.MaskState.ALL_NULL @@ -2353,9 +2343,7 @@ def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase: if not is_dtype_equal(obj.dtype, head.dtype): # if all null, cast to appropriate dtype if obj.null_count == len(obj): - objs[i] = column_empty_like( - head, dtype=head.dtype, masked=True, newsize=len(obj) - ) + objs[i] = column_empty(row_count=len(obj), dtype=head.dtype) else: raise ValueError("All columns must be the same type") diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index fa8d517a9ef..656274bca38 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1424,8 +1424,8 @@ def __setitem__(self, arg, value): new_columns = ( value if key == arg - else column.column_empty_like( - col, masked=True, newsize=length + else column.column_empty( + row_count=length, dtype=col.dtype ) for key, col in self._column_labels_and_values ) @@ -3385,10 +3385,8 @@ def _insert(self, loc, name, value, nan_as_null=None, ignore_index=True): if num_cols != 0: ca = self._data._from_columns_like_self( ( - column.column_empty_like( - col_data, masked=True, newsize=length - ) - for col_data in self._columns + column.column_empty(row_count=length, dtype=dtype) + for _, dtype in self._dtypes ), verify=False, ) @@ -6191,8 +6189,8 @@ def quantile( quant_index=False, )._column if len(res) == 0: - res = column.column_empty_like( - qs, dtype=ser.dtype, masked=True, newsize=len(qs) + res = column.column_empty( + row_count=len(qs), dtype=ser.dtype ) result[k] = res result = DataFrame._from_data(result) diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index f37b44b1100..a6815da62c6 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -14,7 +14,7 @@ from cudf.api.extensions import no_default from cudf.api.types import is_scalar from cudf.core._compat import PANDAS_LT_300 -from cudf.core.column import ColumnBase, as_column, column_empty_like +from cudf.core.column import ColumnBase, as_column, column_empty from cudf.core.column_accessor import ColumnAccessor from cudf.utils.dtypes import min_unsigned_type @@ -421,8 +421,8 @@ def concat( # if join is inner and it contains an empty df # we return an empty df, hence creating an empty # column with dtype metadata retained. 
- result_data[name] = cudf.core.column.column_empty_like( - col, newsize=0 + result_data[name] = column_empty( + row_count=0, dtype=col.dtype ) else: result_data[name] = col @@ -458,8 +458,8 @@ def concat( else: col_label = (k, name) if empty_inner: - result_data[col_label] = ( - cudf.core.column.column_empty_like(col, newsize=0) + result_data[col_label] = column_empty( + row_count=0, dtype=col.dtype ) else: result_data[col_label] = col @@ -995,9 +995,7 @@ def as_tuple(x): ] new_size = nrows * len(names) scatter_map = (columns_idx * np.int32(nrows)) + index_idx - target_col = cudf.core.column.column_empty_like( - col, masked=True, newsize=new_size - ) + target_col = column_empty(row_count=new_size, dtype=col.dtype) target_col[scatter_map] = col target = cudf.Index._from_column(target_col) result.update( @@ -1300,7 +1298,9 @@ def _one_hot_encode_column( """ if isinstance(column.dtype, cudf.CategoricalDtype): if column.size == column.null_count: - column = column_empty_like(categories, newsize=column.size) + column = column_empty( + row_count=column.size, dtype=categories.dtype + ) else: column = column._get_decategorized_column() # type: ignore[attr-defined] diff --git a/python/cudf/cudf/core/udf/groupby_utils.py b/python/cudf/cudf/core/udf/groupby_utils.py index 3af662b62ea..814d3e9fc85 100644 --- a/python/cudf/cudf/core/udf/groupby_utils.py +++ b/python/cudf/cudf/core/udf/groupby_utils.py @@ -154,8 +154,9 @@ def jit_groupby_apply(offsets, grouped_values, function, *args): offsets = cp.asarray(offsets) ngroups = len(offsets) - 1 - output = cudf.core.column.column_empty(ngroups, dtype=return_type) - + output = cudf.core.column.column_empty( + ngroups, dtype=return_type, for_numba=True + ) launch_args = [ offsets, output, diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index 7d87fc73621..260b481b933 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -852,7 +852,7 @@ def test_listcol_setitem_retain_dtype(): {"a": cudf.Series([["a", "b"], []]), "b": [1, 2], "c": [123, 321]} ) df1 = df.head(0) - # Performing a setitem on `b` triggers a `column.column_empty_like` call + # Performing a setitem on `b` triggers a `column.column_empty` call # which tries to create an empty ListColumn. 
df1["b"] = df1["c"] # Performing a copy to trigger a copy dtype which is obtained by accessing diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index de3636f7526..13efa71ebae 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -4158,6 +4158,31 @@ def test_parquet_reader_with_mismatched_schemas_error(): ) +def test_parquet_roundtrip_zero_rows_no_column_mask(): + expected = cudf.DataFrame._from_data( + { + "int": cudf.core.column.column_empty(0, "int64"), + "float": cudf.core.column.column_empty(0, "float64"), + "datetime": cudf.core.column.column_empty(0, "datetime64[ns]"), + "timedelta": cudf.core.column.column_empty(0, "timedelta64[ns]"), + "bool": cudf.core.column.column_empty(0, "bool"), + "decimal": cudf.core.column.column_empty( + 0, cudf.Decimal64Dtype(1) + ), + "struct": cudf.core.column.column_empty( + 0, cudf.StructDtype({"a": "int64"}) + ), + "list": cudf.core.column.column_empty( + 0, cudf.ListDtype("float64") + ), + } + ) + with BytesIO() as bio: + expected.to_parquet(bio) + result = cudf.read_parquet(bio) + assert_eq(result, expected) + + def test_parquet_reader_mismatched_nullability(): # Ensure that we can faithfully read the tables with mismatched nullabilities df1 = cudf.DataFrame( diff --git a/python/cudf/cudf/tests/test_string_udfs.py b/python/cudf/cudf/tests/test_string_udfs.py index 69876d97aad..f4841f42e91 100644 --- a/python/cudf/cudf/tests/test_string_udfs.py +++ b/python/cudf/cudf/tests/test_string_udfs.py @@ -82,7 +82,9 @@ def run_udf_test(data, func, dtype): ) else: dtype = np.dtype(dtype) - output = cudf.core.column.column_empty(len(data), dtype=dtype) + output = cudf.core.column.column_empty( + len(data), dtype=dtype, for_numba=True + ) cudf_column = cudf.core.column.as_column(data) str_views = column_to_string_view_array(cudf_column) diff --git a/python/cudf/cudf/utils/queryutils.py b/python/cudf/cudf/utils/queryutils.py index 8966789fee8..4e3d32c8ed0 100644 --- a/python/cudf/cudf/utils/queryutils.py +++ b/python/cudf/cudf/utils/queryutils.py @@ -210,7 +210,6 @@ def query_execute(df, expr, callenv): Contains keys 'local_dict', 'locals' and 'globals' which are all dict. They represent the arg, local and global dictionaries of the caller. """ - # compile compiled = query_compile(expr) columns = compiled["colnames"] @@ -247,7 +246,7 @@ def query_execute(df, expr, callenv): # allocate output buffer nrows = len(df) - out = column_empty(nrows, dtype=np.bool_) + out = column_empty(nrows, dtype=np.bool_, for_numba=True) # run kernel args = [out, *colarrays, *envargs] with _CUDFNumbaConfig(): From 1b01df357a841e4aa29f3a40bc1162f1380269fb Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Wed, 4 Dec 2024 08:22:36 -0500 Subject: [PATCH 15/78] Use grid_1d utilities in copy_range.cuh (#17409) Use the `grid_1d` utilities to manage thread and stride calculations in the `copy_range.cuh` kernels. 
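For readers unfamiliar with the pattern, here is a minimal sketch of what these utilities replace: the hand-computed `threadIdx.x + blockIdx.x * blockDim.x` index and `blockDim.x * gridDim.x` stride expressions removed in the hunks below. The `grid_1d_sketch` struct is a hypothetical stand-in for `cudf::detail::grid_1d`, not its actual implementation.

```cpp
#include <cstdint>

// Hypothetical stand-in for cudf::detail::grid_1d, for illustration only.
// The 64-bit return type avoids overflow when a grid covers more than
// 2^31 elements.
struct grid_1d_sketch {
  static __device__ int64_t global_thread_id()
  {
    return static_cast<int64_t>(threadIdx.x) + static_cast<int64_t>(blockIdx.x) * blockDim.x;
  }
  static __device__ int64_t grid_stride()
  {
    return static_cast<int64_t>(blockDim.x) * gridDim.x;
  }
};

// A grid-stride loop written with the helpers, mirroring the style the
// hunks below adopt in copy_range.cuh and null_mask.cuh: each thread
// handles elements tid, tid + stride, tid + 2*stride, ...
__global__ void scale_kernel(float const* in, float* out, int64_t size, float factor)
{
  auto const stride = grid_1d_sketch::grid_stride();
  for (auto idx = grid_1d_sketch::global_thread_id(); idx < size; idx += stride) {
    out[idx] = in[idx] * factor;
  }
}
```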
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Nghia Truong (https://github.com/ttnghia) - MithunR (https://github.com/mythrocks) URL: https://github.com/rapidsai/cudf/pull/17409 --- cpp/include/cudf/detail/copy_range.cuh | 8 ++++---- cpp/include/cudf/detail/null_mask.cuh | 9 ++++----- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/cpp/include/cudf/detail/copy_range.cuh b/cpp/include/cudf/detail/copy_range.cuh index fcb80fe45f7..022c5c40ea0 100644 --- a/cpp/include/cudf/detail/copy_range.cuh +++ b/cpp/include/cudf/detail/copy_range.cuh @@ -56,15 +56,15 @@ CUDF_KERNEL void copy_range_kernel(SourceValueIterator source_value_begin, constexpr cudf::size_type leader_lane{0}; int const lane_id = threadIdx.x % warp_size; - cudf::size_type const tid = threadIdx.x + blockIdx.x * blockDim.x; - int const warp_id = tid / warp_size; + auto const tid = cudf::detail::grid_1d::global_thread_id(); + auto const warp_id = tid / warp_size; cudf::size_type const offset = target.offset(); cudf::size_type const begin_mask_idx = cudf::word_index(offset + target_begin); cudf::size_type const end_mask_idx = cudf::word_index(offset + target_end); cudf::size_type mask_idx = begin_mask_idx + warp_id; - cudf::size_type const masks_per_grid = gridDim.x * blockDim.x / warp_size; + cudf::size_type const masks_per_grid = cudf::detail::grid_1d::grid_stride() / warp_size; cudf::size_type target_offset = begin_mask_idx * warp_size - (offset + target_begin); cudf::size_type source_idx = tid + target_offset; @@ -92,7 +92,7 @@ CUDF_KERNEL void copy_range_kernel(SourceValueIterator source_value_begin, } } - source_idx += blockDim.x * gridDim.x; + source_idx += cudf::detail::grid_1d::grid_stride(); mask_idx += masks_per_grid; } diff --git a/cpp/include/cudf/detail/null_mask.cuh b/cpp/include/cudf/detail/null_mask.cuh index 025e2ccc3ec..17ecc0f5539 100644 --- a/cpp/include/cudf/detail/null_mask.cuh +++ b/cpp/include/cudf/detail/null_mask.cuh @@ -67,7 +67,7 @@ CUDF_KERNEL void offset_bitmask_binop(Binop op, size_type source_size_bits, size_type* count_ptr) { - auto const tid = threadIdx.x + blockIdx.x * blockDim.x; + auto const tid = cudf::detail::grid_1d::global_thread_id(); auto const last_bit_index = source_size_bits - 1; auto const last_word_index = cudf::word_index(last_bit_index); @@ -75,7 +75,7 @@ CUDF_KERNEL void offset_bitmask_binop(Binop op, size_type thread_count = 0; for (size_type destination_word_index = tid; destination_word_index < destination.size(); - destination_word_index += blockDim.x * gridDim.x) { + destination_word_index += cudf::detail::grid_1d::grid_stride()) { bitmask_type destination_word = detail::get_mask_offset_word(source[0], destination_word_index, @@ -214,8 +214,7 @@ CUDF_KERNEL void subtract_set_bits_range_boundaries_kernel(bitmask_type const* b { constexpr size_type const word_size_in_bits{detail::size_in_bits()}; - size_type const tid = threadIdx.x + blockIdx.x * blockDim.x; - size_type range_id = tid; + auto range_id = cudf::detail::grid_1d::global_thread_id(); while (range_id < num_ranges) { size_type const first_bit_index = *(first_bit_indices + range_id); @@ -243,7 +242,7 @@ CUDF_KERNEL void subtract_set_bits_range_boundaries_kernel(bitmask_type const* b // Update the null count with the computed delta. 
size_type updated_null_count = *(null_counts + range_id) + delta; *(null_counts + range_id) = updated_null_count; - range_id += blockDim.x * gridDim.x; + range_id += cudf::detail::grid_1d::grid_stride(); } } From 439321edb43082fb75f195b6be2049c925279089 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Wed, 4 Dec 2024 09:14:47 -0500 Subject: [PATCH 16/78] Turn off cudf.pandas 3rd party integrations tests for 24.12 (#17500) Removes the third-party integration tests for the 24.12 nightly CI. We need to do this to unblock CI. These tests have not been running properly, and we just noticed that. There are more than a few failures so we will have to resolve this in the next release. Future work is tracked in #17490. --- .github/workflows/test.yaml | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 3be07480b15..d261c370fd0 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -135,18 +135,6 @@ jobs: date: ${{ inputs.date }} sha: ${{ inputs.sha }} script: ci/cudf_pandas_scripts/run_tests.sh - third-party-integration-tests-cudf-pandas: - secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12 - with: - build_type: nightly - branch: ${{ inputs.branch }} - date: ${{ inputs.date }} - sha: ${{ inputs.sha }} - node_type: "gpu-v100-latest-1" - container_image: "rapidsai/ci-conda:latest" - run_script: | - ci/cudf_pandas_scripts/third-party-integration/test.sh python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml wheel-tests-cudf-polars: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12 From 86d833bdd46f0742621e6f1ec39e4e42fe1a695d Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Wed, 4 Dec 2024 09:17:11 -0500 Subject: [PATCH 17/78] Change indices for dictionary column to signed integer type (#17390) Change the indices type for dictionary column from unsigned to signed integer type. 
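As a concrete illustration of the user-visible effect, here is a minimal sketch using the cudf test wrappers that the updated tests below rely on; the exact includes are assumptions, not part of this patch.

```cpp
#include <cudf/dictionary/dictionary_column_view.hpp>
#include <cudf/dictionary/encode.hpp>
#include <cudf_test/column_wrapper.hpp>

void demo()
{
  cudf::test::strings_column_wrapper input({"aa", "bb", "aa", "cc", "bb"});

  // encode() now defaults to INT32 indices instead of UINT32
  auto dictionary = cudf::dictionary::encode(input);
  cudf::dictionary_column_view view(dictionary->view());
  // view.indices().type().id() == cudf::type_id::INT32

  // requesting an unsigned index type now throws cudf::logic_error:
  // cudf::dictionary::encode(input, cudf::data_type{cudf::type_id::UINT16});
}
```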
Closes #17327 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Basit Ayantunde (https://github.com/lamarrr) - Yunsong Wang (https://github.com/PointKernel) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/17390 --- .../cudf/column/column_device_view.cuh | 6 ++-- cpp/include/cudf/dictionary/encode.hpp | 4 +-- cpp/include/cudf/utilities/traits.hpp | 23 ++++++++++++++ cpp/include/cudf_test/column_wrapper.hpp | 8 ++--- cpp/src/column/column_factories.cpp | 2 +- cpp/src/dictionary/add_keys.cu | 6 ++-- cpp/src/dictionary/detail/concatenate.cu | 2 +- cpp/src/dictionary/dictionary_factories.cu | 5 +-- cpp/src/dictionary/encode.cu | 13 +++----- cpp/src/dictionary/remove_keys.cu | 4 +-- cpp/src/dictionary/search.cu | 10 +++--- cpp/src/interop/from_arrow_device.cu | 17 +++------- cpp/src/interop/from_arrow_host.cu | 17 +++------- cpp/src/utilities/traits.cpp | 16 ++++++++++ cpp/tests/copying/get_value_tests.cpp | 6 ++-- cpp/tests/dictionary/add_keys_test.cpp | 4 +-- cpp/tests/dictionary/encode_test.cpp | 8 ++--- cpp/tests/dictionary/factories_test.cpp | 31 +++++++++---------- cpp/tests/dictionary/search_test.cpp | 16 +++++----- cpp/tests/interop/from_arrow_host_test.cpp | 20 ++++++------ cpp/tests/interop/nanoarrow_utils.hpp | 8 +++-- cpp/tests/interop/to_arrow_device_test.cpp | 5 ++- cpp/tests/interop/to_arrow_test.cpp | 7 ++--- cpp/tests/rolling/lead_lag_test.cpp | 4 +-- cpp/tests/streams/dictionary_test.cpp | 16 +++++----- 25 files changed, 139 insertions(+), 119 deletions(-) diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh index 35a39ef9758..db6d5255616 100644 --- a/cpp/include/cudf/column/column_device_view.cuh +++ b/cpp/include/cudf/column/column_device_view.cuh @@ -460,7 +460,7 @@ class alignas(16) column_device_view : public detail::column_device_view_base { */ struct index_element_fn { template () and std::is_unsigned_v)> + CUDF_ENABLE_IF(is_index_type() and std::is_signed_v)> __device__ size_type operator()(column_device_view const& indices, size_type index) { return static_cast(indices.element(index)); @@ -468,10 +468,10 @@ class alignas(16) column_device_view : public detail::column_device_view_base { template () and std::is_unsigned_v))> + CUDF_ENABLE_IF(not(is_index_type() and std::is_signed_v))> __device__ size_type operator()(Args&&... args) { - CUDF_UNREACHABLE("dictionary indices must be an unsigned integral type"); + CUDF_UNREACHABLE("dictionary indices must be a signed integral type"); } }; diff --git a/cpp/include/cudf/dictionary/encode.hpp b/cpp/include/cudf/dictionary/encode.hpp index dc81fd74992..ced6bd2afa4 100644 --- a/cpp/include/cudf/dictionary/encode.hpp +++ b/cpp/include/cudf/dictionary/encode.hpp @@ -41,7 +41,7 @@ namespace dictionary { * * The null mask and null count are copied from the input column to the output column. 
* - * @throw cudf::logic_error if indices type is not an unsigned integer type + * @throw cudf::logic_error if indices type is not a signed integer type * @throw cudf::logic_error if the column to encode is already a DICTIONARY type * * @code{.pseudo} @@ -58,7 +58,7 @@ namespace dictionary { */ std::unique_ptr encode( column_view const& column, - data_type indices_type = data_type{type_id::UINT32}, + data_type indices_type = data_type{type_id::INT32}, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); diff --git a/cpp/include/cudf/utilities/traits.hpp b/cpp/include/cudf/utilities/traits.hpp index 22a67ca049a..dae1cd38832 100644 --- a/cpp/include/cudf/utilities/traits.hpp +++ b/cpp/include/cudf/utilities/traits.hpp @@ -217,6 +217,29 @@ constexpr inline bool is_index_type() */ bool is_index_type(data_type type); +/** + * @brief Indicates whether the type `T` is a signed numeric type. + * + * @tparam T The type to verify + * @return true `T` is signed numeric + */ +template +constexpr inline bool is_signed() +{ + return std::is_signed_v; +} + +/** + * @brief Indicates whether `type` is a signed numeric `data_type`. + * + * "Signed Numeric" types include fundamental integral types such as `INT*` + * but can also be `FLOAT*` types. + * + * @param type The `data_type` to verify + * @return true `type` is signed numeric + */ +bool is_signed(data_type type); + /** * @brief Indicates whether the type `T` is a unsigned numeric type. * diff --git a/cpp/include/cudf_test/column_wrapper.hpp b/cpp/include/cudf_test/column_wrapper.hpp index 6206c1311d2..6300bb87572 100644 --- a/cpp/include/cudf_test/column_wrapper.hpp +++ b/cpp/include/cudf_test/column_wrapper.hpp @@ -974,7 +974,7 @@ class dictionary_column_wrapper : public detail::column_wrapper { { wrapped = cudf::dictionary::encode(fixed_width_column_wrapper(begin, end), - cudf::data_type{type_id::UINT32}, + cudf::data_type{type_id::INT32}, cudf::test::get_default_stream()); } @@ -1009,7 +1009,7 @@ class dictionary_column_wrapper : public detail::column_wrapper { { wrapped = cudf::dictionary::encode( fixed_width_column_wrapper(begin, end, v), - cudf::data_type{type_id::UINT32}, + cudf::data_type{type_id::INT32}, cudf::test::get_default_stream()); } @@ -1173,7 +1173,7 @@ class dictionary_column_wrapper : public detail::column_wrapper { dictionary_column_wrapper(StringsIterator begin, StringsIterator end) : column_wrapper{} { wrapped = cudf::dictionary::encode(strings_column_wrapper(begin, end), - cudf::data_type{type_id::UINT32}, + cudf::data_type{type_id::INT32}, cudf::test::get_default_stream()); } @@ -1210,7 +1210,7 @@ class dictionary_column_wrapper : public detail::column_wrapper { : column_wrapper{} { wrapped = cudf::dictionary::encode(strings_column_wrapper(begin, end, v), - cudf::data_type{type_id::UINT32}, + cudf::data_type{type_id::INT32}, cudf::test::get_default_stream()); } diff --git a/cpp/src/column/column_factories.cpp b/cpp/src/column/column_factories.cpp index 972f97e8668..050c23b0a3d 100644 --- a/cpp/src/column/column_factories.cpp +++ b/cpp/src/column/column_factories.cpp @@ -178,7 +178,7 @@ std::unique_ptr make_dictionary_from_scalar(scalar const& s, CUDF_EXPECTS(s.is_valid(stream), "cannot create a dictionary with a null key"); return make_dictionary_column( make_column_from_scalar(s, 1, stream, mr), - make_column_from_scalar(numeric_scalar(0, true, stream), size, stream, mr), + make_column_from_scalar(numeric_scalar(0, true, stream), size, 
stream, mr),
     rmm::device_buffer{0, stream, mr},
     0);
 }
diff --git a/cpp/src/dictionary/add_keys.cu b/cpp/src/dictionary/add_keys.cu
index 565055009ba..a851fc6069d 100644
--- a/cpp/src/dictionary/add_keys.cu
+++ b/cpp/src/dictionary/add_keys.cu
@@ -106,10 +106,10 @@ std::unique_ptr add_keys(dictionary_column_view const& dictionary_column
   auto indices_column = [&] {
     column_view gather_result = table_indices.front()->view();
     auto const indices_size   = gather_result.size();
-    // we can just use the lower-bound/gather data directly for UINT32 case
-    if (indices_type.id() == type_id::UINT32) {
+    // we can just use the lower-bound/gather data directly for INT32 case
+    if (indices_type.id() == type_id::INT32) {
       auto contents = table_indices.front()->release();
-      return std::make_unique(data_type{type_id::UINT32},
+      return std::make_unique(data_type{type_id::INT32},
                                       indices_size,
                                       std::move(*(contents.data.release())),
                                       rmm::device_buffer{0, stream, mr},
diff --git a/cpp/src/dictionary/detail/concatenate.cu b/cpp/src/dictionary/detail/concatenate.cu
index b3a8bb4cd20..0f17858094b 100644
--- a/cpp/src/dictionary/detail/concatenate.cu
+++ b/cpp/src/dictionary/detail/concatenate.cu
@@ -252,7 +252,7 @@ std::unique_ptr concatenate(host_span columns,
   std::transform(columns.begin(), columns.end(), indices_views.begin(), [](auto cv) {
     auto dict_view = dictionary_column_view(cv);
     if (dict_view.is_empty()) {
-      return column_view{data_type{type_id::UINT32}, 0, nullptr, nullptr, 0};
+      return column_view{data_type{type_id::INT32}, 0, nullptr, nullptr, 0};
     }
     return dict_view.get_indices_annotated();  // nicely includes validity mask and view offset
   });
diff --git a/cpp/src/dictionary/dictionary_factories.cu b/cpp/src/dictionary/dictionary_factories.cu
index 3e0c98d36ea..9f81c852a30 100644
--- a/cpp/src/dictionary/dictionary_factories.cu
+++ b/cpp/src/dictionary/dictionary_factories.cu
@@ -33,7 +33,7 @@ struct dispatch_create_indices {
                                      rmm::cuda_stream_view stream,
                                      rmm::device_async_resource_ref mr)
   {
-    CUDF_EXPECTS(std::is_unsigned(), "indices must be an unsigned type");
+    CUDF_EXPECTS(cudf::is_signed(), "indices must be a signed type");
     column_view indices_view{
       indices.type(), indices.size(), indices.data(), nullptr, 0, indices.offset()};
     return std::make_unique(indices_view, stream, mr);
@@ -83,7 +83,8 @@ std::unique_ptr make_dictionary_column(std::unique_ptr keys_colu
 {
   CUDF_EXPECTS(!keys_column->has_nulls(), "keys column must not have nulls");
   CUDF_EXPECTS(!indices_column->has_nulls(), "indices column must not have nulls");
-  CUDF_EXPECTS(is_unsigned(indices_column->type()), "indices must be type unsigned integer");
+  CUDF_EXPECTS(is_signed(indices_column->type()) && is_index_type(indices_column->type()),
+               "indices must be type signed integer");
   auto count = indices_column->size();

   std::vector> children;
diff --git a/cpp/src/dictionary/encode.cu b/cpp/src/dictionary/encode.cu
index c8ccb511e8f..5935b4f13e8 100644
--- a/cpp/src/dictionary/encode.cu
+++ b/cpp/src/dictionary/encode.cu
@@ -44,7 +44,8 @@ std::unique_ptr encode(column_view const& input_column,
                                rmm::cuda_stream_view stream,
                                rmm::device_async_resource_ref mr)
 {
-  CUDF_EXPECTS(is_unsigned(indices_type), "indices must be type unsigned integer");
+  CUDF_EXPECTS(is_signed(indices_type) && is_index_type(indices_type),
+               "indices must be type signed integer");
   CUDF_EXPECTS(input_column.type().id() != type_id::DICTIONARY32,
                "cannot encode a dictionary from a dictionary");

@@ -63,10 +63,6 @@ std::unique_ptr encode(column_view const& input_column,
keys_column->set_null_mask(rmm::device_buffer{0, stream, mr}, 0); // remove the null-mask } - // the encode() returns INT32 for indices - if (indices_column->type().id() != indices_type.id()) - indices_column = cudf::detail::cast(indices_column->view(), indices_type, stream, mr); - // create column with keys_column and indices_column return make_dictionary_column(std::move(keys_column), std::move(indices_column), @@ -79,9 +76,9 @@ std::unique_ptr encode(column_view const& input_column, */ data_type get_indices_type_for_size(size_type keys_size) { - if (keys_size <= std::numeric_limits::max()) return data_type{type_id::UINT8}; - if (keys_size <= std::numeric_limits::max()) return data_type{type_id::UINT16}; - return data_type{type_id::UINT32}; + if (keys_size <= std::numeric_limits::max()) return data_type{type_id::INT8}; + if (keys_size <= std::numeric_limits::max()) return data_type{type_id::INT16}; + return data_type{type_id::INT32}; } } // namespace detail diff --git a/cpp/src/dictionary/remove_keys.cu b/cpp/src/dictionary/remove_keys.cu index 119f43a4ae9..59c8453cf33 100644 --- a/cpp/src/dictionary/remove_keys.cu +++ b/cpp/src/dictionary/remove_keys.cu @@ -180,11 +180,11 @@ std::unique_ptr remove_unused_keys(dictionary_column_view const& diction // search the indices values with key indices to look for any holes auto const matches = [&] { // build keys index to verify against indices values - rmm::device_uvector keys_positions(keys_size, stream); + rmm::device_uvector keys_positions(keys_size, stream); thrust::sequence(rmm::exec_policy(stream), keys_positions.begin(), keys_positions.end()); // wrap the indices for comparison in contains() column_view keys_positions_view( - data_type{type_id::UINT32}, keys_size, keys_positions.data(), nullptr, 0); + data_type{type_id::INT32}, keys_size, keys_positions.data(), nullptr, 0); return cudf::detail::contains(indices_view, keys_positions_view, stream, mr); }(); auto d_matches = matches->view().data(); diff --git a/cpp/src/dictionary/search.cu b/cpp/src/dictionary/search.cu index 04e2c17635d..286b1a87df2 100644 --- a/cpp/src/dictionary/search.cu +++ b/cpp/src/dictionary/search.cu @@ -158,8 +158,9 @@ std::unique_ptr get_index(dictionary_column_view const& dictionary, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - if (dictionary.is_empty()) - return std::make_unique>(0, false, stream, mr); + if (dictionary.is_empty()) { + return std::make_unique>(0, false, stream, mr); + } return type_dispatcher( dictionary.keys().type(), find_index_fn(), dictionary, key, stream, mr); } @@ -169,8 +170,9 @@ std::unique_ptr get_insert_index(dictionary_column_view const& dictionar rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - if (dictionary.is_empty()) - return std::make_unique>(0, false, stream, mr); + if (dictionary.is_empty()) { + return std::make_unique>(0, false, stream, mr); + } return type_dispatcher( dictionary.keys().type(), find_insert_index_fn(), dictionary, key, stream, mr); } diff --git a/cpp/src/interop/from_arrow_device.cu b/cpp/src/interop/from_arrow_device.cu index 057e563c86e..cb3c4c55a61 100644 --- a/cpp/src/interop/from_arrow_device.cu +++ b/cpp/src/interop/from_arrow_device.cu @@ -194,19 +194,12 @@ dispatch_tuple_t dispatch_from_arrow_device::operator()( get_column(&keys_schema_view, input->dictionary, keys_type, true, stream, mr); auto const dict_indices_type = [&schema]() -> data_type { - // cudf dictionary requires an unsigned type for the indices, - // since it is invalid for an arrow dictionary to 
contain negative - // indices, we can safely use the unsigned equivalent without having - // to modify the buffers. + // cudf dictionary requires a signed type for the indices switch (schema->storage_type) { - case NANOARROW_TYPE_INT8: - case NANOARROW_TYPE_UINT8: return data_type(type_id::UINT8); - case NANOARROW_TYPE_INT16: - case NANOARROW_TYPE_UINT16: return data_type(type_id::UINT16); - case NANOARROW_TYPE_INT32: - case NANOARROW_TYPE_UINT32: return data_type(type_id::UINT32); - case NANOARROW_TYPE_INT64: - case NANOARROW_TYPE_UINT64: return data_type(type_id::UINT64); + case NANOARROW_TYPE_INT8: return data_type(type_id::INT8); + case NANOARROW_TYPE_INT16: return data_type(type_id::INT16); + case NANOARROW_TYPE_INT32: return data_type(type_id::INT32); + case NANOARROW_TYPE_INT64: return data_type(type_id::INT64); default: CUDF_FAIL("Unsupported type_id for dictionary indices", cudf::data_type_error); } }(); diff --git a/cpp/src/interop/from_arrow_host.cu b/cpp/src/interop/from_arrow_host.cu index 2e9504a6726..b5d2427e288 100644 --- a/cpp/src/interop/from_arrow_host.cu +++ b/cpp/src/interop/from_arrow_host.cu @@ -267,19 +267,12 @@ std::unique_ptr dispatch_copy_from_arrow_host::operator()dictionary, keys_type, true, stream, mr); auto const dict_indices_type = [&schema]() -> data_type { - // cudf dictionary requires an unsigned type for the indices, - // since it is invalid for an arrow dictionary to contain negative - // indices, we can safely use the unsigned equivalent without having - // to modify the buffers. + // cudf dictionary requires a signed type for the indices switch (schema->storage_type) { - case NANOARROW_TYPE_INT8: - case NANOARROW_TYPE_UINT8: return data_type(type_id::UINT8); - case NANOARROW_TYPE_INT16: - case NANOARROW_TYPE_UINT16: return data_type(type_id::UINT16); - case NANOARROW_TYPE_INT32: - case NANOARROW_TYPE_UINT32: return data_type(type_id::UINT32); - case NANOARROW_TYPE_INT64: - case NANOARROW_TYPE_UINT64: return data_type(type_id::UINT64); + case NANOARROW_TYPE_INT8: return data_type(type_id::INT8); + case NANOARROW_TYPE_INT16: return data_type(type_id::INT16); + case NANOARROW_TYPE_INT32: return data_type(type_id::INT32); + case NANOARROW_TYPE_INT64: return data_type(type_id::INT64); default: CUDF_FAIL("Unsupported type_id for dictionary indices", cudf::data_type_error); } }(); diff --git a/cpp/src/utilities/traits.cpp b/cpp/src/utilities/traits.cpp index 41ee4e960b6..86b4db02f54 100644 --- a/cpp/src/utilities/traits.cpp +++ b/cpp/src/utilities/traits.cpp @@ -127,6 +127,22 @@ struct is_index_type_impl { */ bool is_index_type(data_type type) { return cudf::type_dispatcher(type, is_index_type_impl{}); } +struct is_signed_impl { + template + constexpr bool operator()() + { + return is_signed(); + } +}; + +/** + * @brief Indicates whether `type` is a signed numeric `data_type`. 
+ * + * @param type The `data_type` to verify + * @return true `type` is signed numeric + */ +bool is_signed(data_type type) { return cudf::type_dispatcher(type, is_signed_impl{}); } + struct is_unsigned_impl { template constexpr bool operator()() diff --git a/cpp/tests/copying/get_value_tests.cpp b/cpp/tests/copying/get_value_tests.cpp index b2d64dac7c8..9e8525cd96b 100644 --- a/cpp/tests/copying/get_value_tests.cpp +++ b/cpp/tests/copying/get_value_tests.cpp @@ -132,7 +132,7 @@ TYPED_TEST_SUITE(DictionaryGetValueTest, cudf::test::FixedWidthTypesWithoutFixed TYPED_TEST(DictionaryGetValueTest, BasicGet) { cudf::test::fixed_width_column_wrapper keys({6, 7, 8, 9}); - cudf::test::fixed_width_column_wrapper indices{0, 0, 1, 2, 1, 3, 3, 2}; + cudf::test::fixed_width_column_wrapper indices{0, 0, 1, 2, 1, 3, 3, 2}; auto col = cudf::make_dictionary_column(keys, indices); auto s = cudf::get_element(*col, 2); @@ -147,7 +147,7 @@ TYPED_TEST(DictionaryGetValueTest, BasicGet) TYPED_TEST(DictionaryGetValueTest, GetFromNullable) { cudf::test::fixed_width_column_wrapper keys({6, 7, 8, 9}); - cudf::test::fixed_width_column_wrapper indices( + cudf::test::fixed_width_column_wrapper indices( {0, 0, 1, 2, 1, 3, 3, 2}, {false, true, false, true, true, true, false, false}); auto col = cudf::make_dictionary_column(keys, indices); @@ -163,7 +163,7 @@ TYPED_TEST(DictionaryGetValueTest, GetFromNullable) TYPED_TEST(DictionaryGetValueTest, GetNull) { cudf::test::fixed_width_column_wrapper keys({6, 7, 8, 9}); - cudf::test::fixed_width_column_wrapper indices( + cudf::test::fixed_width_column_wrapper indices( {0, 0, 1, 2, 1, 3, 3, 2}, {false, true, false, true, true, true, false, false}); auto col = cudf::make_dictionary_column(keys, indices); diff --git a/cpp/tests/dictionary/add_keys_test.cpp b/cpp/tests/dictionary/add_keys_test.cpp index ebc8c11e86c..da8231fb8be 100644 --- a/cpp/tests/dictionary/add_keys_test.cpp +++ b/cpp/tests/dictionary/add_keys_test.cpp @@ -41,7 +41,7 @@ TEST_F(DictionaryAddKeysTest, StringsColumn) cudf::test::strings_column_wrapper keys_expected({"aaa", "bbb", "ccc", "ddd", "eee", "fff"}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.keys(), keys_expected); - cudf::test::fixed_width_column_wrapper indices_expected({5, 0, 3, 1, 2, 2, 2, 5, 0}); + cudf::test::fixed_width_column_wrapper indices_expected({5, 0, 3, 1, 2, 2, 2, 5, 0}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.indices(), indices_expected); } @@ -58,7 +58,7 @@ TEST_F(DictionaryAddKeysTest, FloatColumn) cudf::test::fixed_width_column_wrapper keys_expected{-11.75, 0.5, 4.25, 5.0, 7.125}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.keys(), keys_expected); - cudf::test::fixed_width_column_wrapper expected{2, 4, 1, 0, 4, 1}; + cudf::test::fixed_width_column_wrapper expected{2, 4, 1, 0, 4, 1}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.indices(), expected); } diff --git a/cpp/tests/dictionary/encode_test.cpp b/cpp/tests/dictionary/encode_test.cpp index dfa3ede5d46..46319bb376d 100644 --- a/cpp/tests/dictionary/encode_test.cpp +++ b/cpp/tests/dictionary/encode_test.cpp @@ -34,7 +34,7 @@ TEST_F(DictionaryEncodeTest, EncodeStringColumn) cudf::test::strings_column_wrapper keys_expected({"aaa", "bbb", "ccc", "ddd", "eee"}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.keys(), keys_expected); - cudf::test::fixed_width_column_wrapper indices_expected({4, 0, 3, 1, 2, 2, 2, 4, 0}); + cudf::test::fixed_width_column_wrapper indices_expected({4, 0, 3, 1, 2, 2, 2, 4, 0}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.indices(), indices_expected); } @@ -48,7 +48,7 @@ TEST_F(DictionaryEncodeTest, 
EncodeFloat) cudf::test::fixed_width_column_wrapper keys_expected{-11.75, 0.5, 4.25, 7.125}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.keys(), keys_expected); - cudf::test::fixed_width_column_wrapper expected{2, 3, 1, 0, 3, 1}; + cudf::test::fixed_width_column_wrapper expected{2, 3, 1, 0, 3, 1}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.indices(), expected); } @@ -64,7 +64,7 @@ TEST_F(DictionaryEncodeTest, EncodeWithNull) cudf::test::fixed_width_column_wrapper keys_expected{0, 111, 222, 333, 444}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.keys(), keys_expected); - cudf::test::fixed_width_column_wrapper expected{4, 0, 3, 1, 2, 5, 2, 4, 0}; + cudf::test::fixed_width_column_wrapper expected{4, 0, 3, 1, 2, 5, 2, 4, 0}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.indices(), expected); } @@ -72,6 +72,6 @@ TEST_F(DictionaryEncodeTest, InvalidEncode) { cudf::test::fixed_width_column_wrapper input{0, 1, 2, 3, -1, -2, -3}; - EXPECT_THROW(cudf::dictionary::encode(input, cudf::data_type{cudf::type_id::INT16}), + EXPECT_THROW(cudf::dictionary::encode(input, cudf::data_type{cudf::type_id::UINT16}), cudf::logic_error); } diff --git a/cpp/tests/dictionary/factories_test.cpp b/cpp/tests/dictionary/factories_test.cpp index 051ea45aed6..30e3984d66d 100644 --- a/cpp/tests/dictionary/factories_test.cpp +++ b/cpp/tests/dictionary/factories_test.cpp @@ -29,7 +29,7 @@ struct DictionaryFactoriesTest : public cudf::test::BaseFixture {}; TEST_F(DictionaryFactoriesTest, CreateFromColumnViews) { cudf::test::strings_column_wrapper keys({"aaa", "ccc", "ddd", "www"}); - cudf::test::fixed_width_column_wrapper values{2, 0, 3, 1, 2, 2, 2, 3, 0}; + cudf::test::fixed_width_column_wrapper values{2, 0, 3, 1, 2, 2, 2, 3, 0}; auto dictionary = cudf::make_dictionary_column(keys, values); cudf::dictionary_column_view view(dictionary->view()); @@ -41,8 +41,8 @@ TEST_F(DictionaryFactoriesTest, CreateFromColumnViews) TEST_F(DictionaryFactoriesTest, ColumnViewsWithNulls) { cudf::test::fixed_width_column_wrapper keys{-11.75, 4.25, 7.125, 0.5, 12.0}; - std::vector h_values{1, 3, 2, 0, 1, 4, 1}; - cudf::test::fixed_width_column_wrapper indices( + std::vector h_values{1, 3, 2, 0, 1, 4, 1}; + cudf::test::fixed_width_column_wrapper indices( h_values.begin(), h_values.end(), thrust::make_transform_iterator(h_values.begin(), [](auto v) { return v > 0; })); @@ -50,8 +50,7 @@ TEST_F(DictionaryFactoriesTest, ColumnViewsWithNulls) cudf::dictionary_column_view view(dictionary->view()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.keys(), keys); - cudf::test::fixed_width_column_wrapper values_expected(h_values.begin(), - h_values.end()); + cudf::test::fixed_width_column_wrapper values_expected(h_values.begin(), h_values.end()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.indices(), values_expected); } @@ -59,16 +58,15 @@ TEST_F(DictionaryFactoriesTest, CreateFromColumns) { std::vector h_keys{"pear", "apple", "fruit", "macintosh"}; cudf::test::strings_column_wrapper keys(h_keys.begin(), h_keys.end()); - std::vector h_values{1, 2, 3, 1, 2, 3, 0}; - cudf::test::fixed_width_column_wrapper values(h_values.begin(), h_values.end()); + std::vector h_values{1, 2, 3, 1, 2, 3, 0}; + cudf::test::fixed_width_column_wrapper values(h_values.begin(), h_values.end()); auto dictionary = cudf::make_dictionary_column(keys.release(), values.release(), rmm::device_buffer{}, 0); cudf::dictionary_column_view view(dictionary->view()); cudf::test::strings_column_wrapper keys_expected(h_keys.begin(), h_keys.end()); - cudf::test::fixed_width_column_wrapper values_expected(h_values.begin(), - h_values.end()); + 
cudf::test::fixed_width_column_wrapper values_expected(h_values.begin(), h_values.end()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.keys(), keys_expected); CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.indices(), values_expected); } @@ -77,8 +75,8 @@ TEST_F(DictionaryFactoriesTest, ColumnsWithNulls) { std::vector h_keys{-1234567890, -987654321, 0, 19283714}; cudf::test::fixed_width_column_wrapper keys(h_keys.begin(), h_keys.end()); - std::vector h_values{1, 2, 3, 1, 2, 3, 0}; - cudf::test::fixed_width_column_wrapper values(h_values.begin(), h_values.end()); + std::vector h_values{1, 2, 3, 1, 2, 3, 0}; + cudf::test::fixed_width_column_wrapper values(h_values.begin(), h_values.end()); auto size = static_cast(h_values.size()); rmm::device_buffer null_mask = create_null_mask(size, cudf::mask_state::ALL_NULL); auto dictionary = @@ -88,8 +86,7 @@ TEST_F(DictionaryFactoriesTest, ColumnsWithNulls) EXPECT_EQ(size, view.null_count()); cudf::test::fixed_width_column_wrapper keys_expected(h_keys.begin(), h_keys.end()); - cudf::test::fixed_width_column_wrapper values_expected(h_values.begin(), - h_values.end()); + cudf::test::fixed_width_column_wrapper values_expected(h_values.begin(), h_values.end()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.keys(), keys_expected); CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.indices(), values_expected); } @@ -98,15 +95,15 @@ TEST_F(DictionaryFactoriesTest, KeysWithNulls) { cudf::test::fixed_width_column_wrapper keys{{0, 1, 2, 3, 4}, {true, true, true, false, true}}; - cudf::test::fixed_width_column_wrapper indices{5, 4, 3, 2, 1, 0}; + cudf::test::fixed_width_column_wrapper indices{5, 4, 3, 2, 1, 0}; EXPECT_THROW(cudf::make_dictionary_column(keys, indices), cudf::logic_error); } TEST_F(DictionaryFactoriesTest, IndicesWithNulls) { cudf::test::fixed_width_column_wrapper keys{0, 1, 2, 3, 4}; - cudf::test::fixed_width_column_wrapper indices{{5, 4, 3, 2, 1, 0}, - {true, true, true, false, true, false}}; + cudf::test::fixed_width_column_wrapper indices{{5, 4, 3, 2, 1, 0}, + {true, true, true, false, true, false}}; EXPECT_THROW( cudf::make_dictionary_column(keys.release(), indices.release(), rmm::device_buffer{}, 0), cudf::logic_error); @@ -115,7 +112,7 @@ TEST_F(DictionaryFactoriesTest, IndicesWithNulls) TEST_F(DictionaryFactoriesTest, InvalidIndices) { cudf::test::fixed_width_column_wrapper keys{0, 1, 2, 3, 4}; - cudf::test::fixed_width_column_wrapper indices{5, 4, 3, 2, 1, 0}; + cudf::test::fixed_width_column_wrapper indices{5, 4, 3, 2, 1, 0}; EXPECT_THROW(cudf::make_dictionary_column(keys, indices), cudf::logic_error); EXPECT_THROW( cudf::make_dictionary_column(keys.release(), indices.release(), rmm::device_buffer{}, 0), diff --git a/cpp/tests/dictionary/search_test.cpp b/cpp/tests/dictionary/search_test.cpp index 2774173b80a..d5877f12184 100644 --- a/cpp/tests/dictionary/search_test.cpp +++ b/cpp/tests/dictionary/search_test.cpp @@ -31,8 +31,8 @@ TEST_F(DictionarySearchTest, StringsColumn) auto result = cudf::dictionary::get_index(dictionary, cudf::string_scalar("ccc")); EXPECT_TRUE(result->is_valid()); - auto n_result = dynamic_cast*>(result.get()); - EXPECT_EQ(uint32_t{3}, n_result->value()); + auto n_result = dynamic_cast*>(result.get()); + EXPECT_EQ(int32_t{3}, n_result->value()); result = cudf::dictionary::get_index(dictionary, cudf::string_scalar("eee")); EXPECT_FALSE(result->is_valid()); @@ -40,8 +40,8 @@ TEST_F(DictionarySearchTest, StringsColumn) cudf::string_scalar("eee"), cudf::get_default_stream(), cudf::get_current_device_resource_ref()); - n_result = dynamic_cast*>(result.get()); - 
EXPECT_EQ(uint32_t{5}, n_result->value()); + n_result = dynamic_cast*>(result.get()); + EXPECT_EQ(int32_t{5}, n_result->value()); } TEST_F(DictionarySearchTest, WithNulls) @@ -51,8 +51,8 @@ TEST_F(DictionarySearchTest, WithNulls) auto result = cudf::dictionary::get_index(dictionary, cudf::numeric_scalar(4)); EXPECT_TRUE(result->is_valid()); - auto n_result = dynamic_cast*>(result.get()); - EXPECT_EQ(uint32_t{0}, n_result->value()); + auto n_result = dynamic_cast*>(result.get()); + EXPECT_EQ(int32_t{0}, n_result->value()); result = cudf::dictionary::get_index(dictionary, cudf::numeric_scalar(5)); EXPECT_FALSE(result->is_valid()); @@ -60,8 +60,8 @@ TEST_F(DictionarySearchTest, WithNulls) cudf::numeric_scalar(5), cudf::get_default_stream(), cudf::get_current_device_resource_ref()); - n_result = dynamic_cast*>(result.get()); - EXPECT_EQ(uint32_t{1}, n_result->value()); + n_result = dynamic_cast*>(result.get()); + EXPECT_EQ(int32_t{1}, n_result->value()); } TEST_F(DictionarySearchTest, EmptyColumn) diff --git a/cpp/tests/interop/from_arrow_host_test.cpp b/cpp/tests/interop/from_arrow_host_test.cpp index d93ef28aab8..1ab11b374b6 100644 --- a/cpp/tests/interop/from_arrow_host_test.cpp +++ b/cpp/tests/interop/from_arrow_host_test.cpp @@ -460,19 +460,17 @@ TEST_F(FromArrowHostDeviceTest, DictionaryIndicesType) // test dictionary arrays with different index types // cudf asserts that the index type must be unsigned auto array1 = - get_nanoarrow_dict_array({1, 2, 5, 7}, {0, 1, 2, 1, 3}, {1, 0, 1, 1, 1}); + get_nanoarrow_dict_array({1, 2, 5, 7}, {0, 1, 2, 1, 3}, {1, 0, 1, 1, 1}); auto array2 = - get_nanoarrow_dict_array({1, 2, 5, 7}, {0, 1, 2, 1, 3}, {1, 0, 1, 1, 1}); + get_nanoarrow_dict_array({1, 2, 5, 7}, {0, 1, 2, 1, 3}, {1, 0, 1, 1, 1}); auto array3 = - get_nanoarrow_dict_array({1, 2, 5, 7}, {0, 1, 2, 1, 3}, {1, 0, 1, 1, 1}); + get_nanoarrow_dict_array({1, 2, 5, 7}, {0, 1, 2, 1, 3}, {1, 0, 1, 1, 1}); // create equivalent cudf dictionary columns auto keys_col = cudf::test::fixed_width_column_wrapper({1, 2, 5, 7}); - auto ind1_col = cudf::test::fixed_width_column_wrapper({0, 1, 2, 1, 3}, {1, 0, 1, 1, 1}); - auto ind2_col = - cudf::test::fixed_width_column_wrapper({0, 1, 2, 1, 3}, {1, 0, 1, 1, 1}); - auto ind3_col = - cudf::test::fixed_width_column_wrapper({0, 1, 2, 1, 3}, {1, 0, 1, 1, 1}); + auto ind1_col = cudf::test::fixed_width_column_wrapper({0, 1, 2, 1, 3}, {1, 0, 1, 1, 1}); + auto ind2_col = cudf::test::fixed_width_column_wrapper({0, 1, 2, 1, 3}, {1, 0, 1, 1, 1}); + auto ind3_col = cudf::test::fixed_width_column_wrapper({0, 1, 2, 1, 3}, {1, 0, 1, 1, 1}); vector_of_columns columns; columns.emplace_back(cudf::make_dictionary_column(keys_col, ind1_col)); @@ -485,19 +483,19 @@ TEST_F(FromArrowHostDeviceTest, DictionaryIndicesType) ArrowSchemaInit(input_schema.get()); NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(input_schema.get(), 3)); - NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(input_schema->children[0], NANOARROW_TYPE_UINT8)); + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(input_schema->children[0], NANOARROW_TYPE_INT8)); NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(input_schema->children[0], "a")); NANOARROW_THROW_NOT_OK(ArrowSchemaAllocateDictionary(input_schema->children[0])); NANOARROW_THROW_NOT_OK( ArrowSchemaInitFromType(input_schema->children[0]->dictionary, NANOARROW_TYPE_INT64)); - NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(input_schema->children[1], NANOARROW_TYPE_UINT16)); + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(input_schema->children[1], NANOARROW_TYPE_INT16)); 
NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(input_schema->children[1], "b")); NANOARROW_THROW_NOT_OK(ArrowSchemaAllocateDictionary(input_schema->children[1])); NANOARROW_THROW_NOT_OK( ArrowSchemaInitFromType(input_schema->children[1]->dictionary, NANOARROW_TYPE_INT64)); - NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(input_schema->children[2], NANOARROW_TYPE_UINT64)); + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(input_schema->children[2], NANOARROW_TYPE_INT64)); NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(input_schema->children[2], "c")); NANOARROW_THROW_NOT_OK(ArrowSchemaAllocateDictionary(input_schema->children[2])); NANOARROW_THROW_NOT_OK( diff --git a/cpp/tests/interop/nanoarrow_utils.hpp b/cpp/tests/interop/nanoarrow_utils.hpp index 8be7e087b6d..b7b8202a3c2 100644 --- a/cpp/tests/interop/nanoarrow_utils.hpp +++ b/cpp/tests/interop/nanoarrow_utils.hpp @@ -200,17 +200,19 @@ struct nanoarrow_storage_type {}; static constexpr ArrowType type = NANOARROW_TYPE_##NanoType; \ } -DEFINE_NANOARROW_STORAGE(bool, BOOL); +DEFINE_NANOARROW_STORAGE(int8_t, INT8); +DEFINE_NANOARROW_STORAGE(int16_t, INT16); +DEFINE_NANOARROW_STORAGE(int32_t, INT32); DEFINE_NANOARROW_STORAGE(int64_t, INT64); +DEFINE_NANOARROW_STORAGE(uint8_t, UINT8); DEFINE_NANOARROW_STORAGE(uint16_t, UINT16); DEFINE_NANOARROW_STORAGE(uint64_t, UINT64); +DEFINE_NANOARROW_STORAGE(bool, BOOL); DEFINE_NANOARROW_STORAGE(cudf::duration_D, INT32); DEFINE_NANOARROW_STORAGE(cudf::duration_s, INT64); DEFINE_NANOARROW_STORAGE(cudf::duration_ms, INT64); DEFINE_NANOARROW_STORAGE(cudf::duration_us, INT64); DEFINE_NANOARROW_STORAGE(cudf::duration_ns, INT64); -DEFINE_NANOARROW_STORAGE(uint8_t, UINT8); -DEFINE_NANOARROW_STORAGE(int32_t, INT32); DEFINE_NANOARROW_STORAGE(__int128_t, DECIMAL128); #undef DEFINE_NANOARROW_STORAGE diff --git a/cpp/tests/interop/to_arrow_device_test.cpp b/cpp/tests/interop/to_arrow_device_test.cpp index 29aa928c277..112b3e1d8e2 100644 --- a/cpp/tests/interop/to_arrow_device_test.cpp +++ b/cpp/tests/interop/to_arrow_device_test.cpp @@ -48,7 +48,6 @@ get_nanoarrow_cudf_table(cudf::size_type length) .release()); auto col4 = cudf::test::fixed_width_column_wrapper( test_data.int64_data.begin(), test_data.int64_data.end(), test_data.validity.begin()); - auto dict_col = cudf::dictionary::encode(col4); columns.emplace_back(cudf::dictionary::encode(col4)); columns.emplace_back(cudf::test::fixed_width_column_wrapper(test_data.bool_data.begin(), test_data.bool_data.end(), @@ -103,7 +102,7 @@ get_nanoarrow_cudf_table(cudf::size_type length) schema->children[1]->flags = 0; } - NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(schema->children[2], NANOARROW_TYPE_UINT32)); + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(schema->children[2], NANOARROW_TYPE_INT32)); NANOARROW_THROW_NOT_OK(ArrowSchemaAllocateDictionary(schema->children[2])); NANOARROW_THROW_NOT_OK( ArrowSchemaInitFromType(schema->children[2]->dictionary, NANOARROW_TYPE_INT64)); @@ -181,7 +180,7 @@ get_nanoarrow_tables(cudf::size_type length) populate_from_col(arrow->children[0], table->get_column(0).view()); populate_from_col(arrow->children[1], table->get_column(1).view()); - populate_dict_from_col( + populate_dict_from_col( arrow->children[2], cudf::dictionary_column_view(table->get_column(2).view())); populate_from_col(arrow->children[3], table->get_column(3).view()); diff --git a/cpp/tests/interop/to_arrow_test.cpp b/cpp/tests/interop/to_arrow_test.cpp index faa07ba3311..28a80502f08 100644 --- a/cpp/tests/interop/to_arrow_test.cpp +++ b/cpp/tests/interop/to_arrow_test.cpp 
@@ -63,7 +63,6 @@ std::pair, std::shared_ptr> get_table auto validity_generator = []() { return rand() % 7 != 0; }; std::generate( list_int64_data_validity.begin(), list_int64_data_validity.end(), validity_generator); - // cudf::size_type n = 0; std::generate( list_offsets.begin(), list_offsets.end(), [length_of_individual_list, n = 0]() mutable { return (n++) * length_of_individual_list; @@ -87,7 +86,6 @@ std::pair, std::shared_ptr> get_table .release()); auto col4 = cudf::test::fixed_width_column_wrapper( int64_data.begin(), int64_data.end(), validity.begin()); - auto dict_col = cudf::dictionary::encode(col4); columns.emplace_back(cudf::dictionary::encode(col4)); columns.emplace_back(cudf::test::fixed_width_column_wrapper( bool_data.begin(), bool_data.end(), bool_validity.begin()) @@ -120,11 +118,12 @@ std::pair, std::shared_ptr> get_table auto int64array = get_arrow_array(int64_data, validity); auto string_array = get_arrow_array(string_data, validity); + auto dict_col = cudf::dictionary::encode(col4); cudf::dictionary_column_view view(dict_col->view()); auto keys = cudf::test::to_host(view.keys()).first; - auto indices = cudf::test::to_host(view.indices()).first; + auto indices = cudf::test::to_host(view.indices()).first; auto dict_array = get_arrow_dict_array(std::vector(keys.begin(), keys.end()), - std::vector(indices.begin(), indices.end()), + std::vector(indices.begin(), indices.end()), validity); auto boolarray = get_arrow_array(bool_data, bool_validity); auto list_array = get_arrow_list_array( diff --git a/cpp/tests/rolling/lead_lag_test.cpp b/cpp/tests/rolling/lead_lag_test.cpp index 6519b0ed4ee..d82f512329f 100644 --- a/cpp/tests/rolling/lead_lag_test.cpp +++ b/cpp/tests/rolling/lead_lag_test.cpp @@ -1098,7 +1098,7 @@ TEST_F(LeadLagNonFixedWidthTest, Dictionary) auto expected_keys = cudf::test::strings_column_wrapper{input_strings}.release(); auto expected_values = - cudf::test::fixed_width_column_wrapper{ + cudf::test::fixed_width_column_wrapper{ {2, 3, 4, 5, 0, 0, 7, 8, 9, 10, 0, 0}, cudf::test::iterators::nulls_at(std::vector{4, 5, 10, 11})} .release(); @@ -1118,7 +1118,7 @@ TEST_F(LeadLagNonFixedWidthTest, Dictionary) auto expected_keys = cudf::test::strings_column_wrapper{input_strings}.release(); auto expected_values = - cudf::test::fixed_width_column_wrapper{ + cudf::test::fixed_width_column_wrapper{ {0, 0, 1, 2, 3, 4, 0, 6, 0, 7, 8, 9}, cudf::test::iterators::nulls_at(std::vector{0, 6})} .release(); auto expected_output = diff --git a/cpp/tests/streams/dictionary_test.cpp b/cpp/tests/streams/dictionary_test.cpp index 03e4cf47470..498504ef212 100644 --- a/cpp/tests/streams/dictionary_test.cpp +++ b/cpp/tests/streams/dictionary_test.cpp @@ -29,7 +29,7 @@ class DictionaryTest : public cudf::test::BaseFixture {}; TEST_F(DictionaryTest, FactoryColumnViews) { cudf::test::strings_column_wrapper keys({"aaa", "ccc", "ddd", "www"}); - cudf::test::fixed_width_column_wrapper values{2, 0, 3, 1, 2, 2, 2, 3, 0}; + cudf::test::fixed_width_column_wrapper values{2, 0, 3, 1, 2, 2, 2, 3, 0}; auto dictionary = cudf::make_dictionary_column(keys, values, cudf::test::get_default_stream()); cudf::dictionary_column_view view(dictionary->view()); @@ -42,15 +42,15 @@ TEST_F(DictionaryTest, FactoryColumns) { std::vector h_keys{"aaa", "ccc", "ddd", "www"}; cudf::test::strings_column_wrapper keys(h_keys.begin(), h_keys.end()); - std::vector h_values{2, 0, 3, 1, 2, 2, 2, 3, 0}; - cudf::test::fixed_width_column_wrapper values(h_values.begin(), h_values.end()); + std::vector h_values{2, 0, 3, 1, 2, 2, 
2, 3, 0}; + cudf::test::fixed_width_column_wrapper values(h_values.begin(), h_values.end()); auto dictionary = cudf::make_dictionary_column( keys.release(), values.release(), cudf::test::get_default_stream()); cudf::dictionary_column_view view(dictionary->view()); cudf::test::strings_column_wrapper keys_expected(h_keys.begin(), h_keys.end()); - cudf::test::fixed_width_column_wrapper values_expected(h_values.begin(), h_values.end()); + cudf::test::fixed_width_column_wrapper values_expected(h_values.begin(), h_values.end()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.keys(), keys_expected); CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.indices(), values_expected); } @@ -59,15 +59,15 @@ TEST_F(DictionaryTest, FactoryColumnsNullMaskCount) { std::vector h_keys{"aaa", "ccc", "ddd", "www"}; cudf::test::strings_column_wrapper keys(h_keys.begin(), h_keys.end()); - std::vector h_values{2, 0, 3, 1, 2, 2, 2, 3, 0}; - cudf::test::fixed_width_column_wrapper values(h_values.begin(), h_values.end()); + std::vector h_values{2, 0, 3, 1, 2, 2, 2, 3, 0}; + cudf::test::fixed_width_column_wrapper values(h_values.begin(), h_values.end()); auto dictionary = cudf::make_dictionary_column( keys.release(), values.release(), rmm::device_buffer{}, 0, cudf::test::get_default_stream()); cudf::dictionary_column_view view(dictionary->view()); cudf::test::strings_column_wrapper keys_expected(h_keys.begin(), h_keys.end()); - cudf::test::fixed_width_column_wrapper values_expected(h_values.begin(), h_values.end()); + cudf::test::fixed_width_column_wrapper values_expected(h_values.begin(), h_values.end()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.keys(), keys_expected); CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.indices(), values_expected); } @@ -75,7 +75,7 @@ TEST_F(DictionaryTest, FactoryColumnsNullMaskCount) TEST_F(DictionaryTest, Encode) { cudf::test::fixed_width_column_wrapper col({1, 2, 3, 4, 5}); - cudf::data_type int32_type(cudf::type_id::UINT32); + cudf::data_type int32_type(cudf::type_id::INT32); cudf::column_view col_view = col; cudf::dictionary::encode(col_view, int32_type, cudf::test::get_default_stream()); } From 6440207ccea4bed0a0654186276de1e589acb0d9 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 4 Dec 2024 10:46:04 -0600 Subject: [PATCH 18/78] Remove upper bounds on cuda-python to allow 12.6.2 and 11.8.5 (#17326) Now that some upstream bugs have been fixed, we can allow cuda-python 12.6.2 and 11.8.5. See https://github.com/NVIDIA/cuda-python/issues/226#issuecomment-2472355738 for more information. 
Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Kyle Edwards (https://github.com/KyleFromNVIDIA) URL: https://github.com/rapidsai/cudf/pull/17326 --- conda/environments/all_cuda-118_arch-x86_64.yaml | 2 +- conda/environments/all_cuda-125_arch-x86_64.yaml | 2 +- conda/recipes/cudf/meta.yaml | 4 ++-- conda/recipes/pylibcudf/meta.yaml | 4 ++-- dependencies.yaml | 8 ++++---- python/cudf/pyproject.toml | 2 +- python/pylibcudf/pyproject.toml | 2 +- 7 files changed, 12 insertions(+), 12 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 2be64b7cd70..87c40421be0 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -19,7 +19,7 @@ dependencies: - cramjam - cubinlinker - cuda-nvtx=11.8 -- cuda-python>=11.7.1,<12.0a0,<=11.8.3 +- cuda-python>=11.7.1,<12.0a0 - cuda-sanitizer-api=11.8.86 - cuda-version=11.8 - cudatoolkit diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index 6b5ca04c015..0935de96d19 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -21,7 +21,7 @@ dependencies: - cuda-nvcc - cuda-nvrtc-dev - cuda-nvtx-dev -- cuda-python>=12.0,<13.0a0,<=12.6.0 +- cuda-python>=12.0,<13.0a0 - cuda-sanitizer-api - cuda-version=12.5 - cupy>=12.0.0 diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index 04904e95630..e52b8c5f2a0 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -91,7 +91,7 @@ requirements: - cudatoolkit - ptxcompiler >=0.7.0 - cubinlinker # CUDA enhanced compatibility. - - cuda-python >=11.7.1,<12.0a0,<=11.8.3 + - cuda-python >=11.7.1,<12.0a0 {% else %} - cuda-cudart - libcufile # [linux64] @@ -100,7 +100,7 @@ requirements: # TODO: Add nvjitlink here # xref: https://github.com/rapidsai/cudf/issues/12822 - cuda-nvrtc - - cuda-python >=12.0,<13.0a0,<=12.6.0 + - cuda-python >=12.0,<13.0a0 - pynvjitlink {% endif %} - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} diff --git a/conda/recipes/pylibcudf/meta.yaml b/conda/recipes/pylibcudf/meta.yaml index ec3fcd59c62..3d965f30986 100644 --- a/conda/recipes/pylibcudf/meta.yaml +++ b/conda/recipes/pylibcudf/meta.yaml @@ -83,9 +83,9 @@ requirements: - {{ pin_compatible('rmm', max_pin='x.x') }} - fsspec >=0.6.0 {% if cuda_major == "11" %} - - cuda-python >=11.7.1,<12.0a0,<=11.8.3 + - cuda-python >=11.7.1,<12.0a0 {% else %} - - cuda-python >=12.0,<13.0a0,<=12.6.0 + - cuda-python >=12.0,<13.0a0 {% endif %} - nvtx >=0.2.1 - packaging diff --git a/dependencies.yaml b/dependencies.yaml index 259d41b59fe..044c7d187b3 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -679,10 +679,10 @@ dependencies: matrices: - matrix: {cuda: "12.*"} packages: - - cuda-python>=12.0,<13.0a0,<=12.6.0 + - cuda-python>=12.0,<13.0a0 - matrix: {cuda: "11.*"} packages: &run_pylibcudf_packages_all_cu11 - - cuda-python>=11.7.1,<12.0a0,<=11.8.3 + - cuda-python>=11.7.1,<12.0a0 - {matrix: null, packages: *run_pylibcudf_packages_all_cu11} run_cudf: common: @@ -705,10 +705,10 @@ dependencies: matrices: - matrix: {cuda: "12.*"} packages: - - cuda-python>=12.0,<13.0a0,<=12.6.0 + - cuda-python>=12.0,<13.0a0 - matrix: {cuda: "11.*"} packages: &run_cudf_packages_all_cu11 - - cuda-python>=11.7.1,<12.0a0,<=11.8.3 + - cuda-python>=11.7.1,<12.0a0 - {matrix: null, packages: *run_cudf_packages_all_cu11} - output_types: conda matrices: 
diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index df3e6b87991..80de9056a0a 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -20,7 +20,7 @@ requires-python = ">=3.10" dependencies = [ "cachetools", "cubinlinker", - "cuda-python>=11.7.1,<12.0a0,<=11.8.3", + "cuda-python>=11.7.1,<12.0a0", "cupy-cuda11x>=12.0.0", "fsspec>=0.6.0", "libcudf==25.2.*,>=0.0.0a0", diff --git a/python/pylibcudf/pyproject.toml b/python/pylibcudf/pyproject.toml index dc82eb363d0..a5e5704b8ed 100644 --- a/python/pylibcudf/pyproject.toml +++ b/python/pylibcudf/pyproject.toml @@ -18,7 +18,7 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.10" dependencies = [ - "cuda-python>=11.7.1,<12.0a0,<=11.8.3", + "cuda-python>=11.7.1,<12.0a0", "libcudf==25.2.*,>=0.0.0a0", "nvtx>=0.2.1", "packaging", From 38820ff0e8cd7cd54793fd5c49fb1566a24686b1 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 4 Dec 2024 11:43:37 -0600 Subject: [PATCH 19/78] Update to CCCL 2.7.0-rc2. (#17233) This PR updates to CCCL 2.7.0-rc2. Do not merge until all of RAPIDS is ready to update. Depends on https://github.com/rapidsai/rapids-cmake/pull/710 and should be admin-merged immediately after that PR. Part of https://github.com/rapidsai/build-planning/issues/115. --------- Co-authored-by: Michael Schellenberger Costa --- .../thirdparty/patches/cccl_override.json | 5 -- .../patches/cccl_symbol_visibility.diff | 27 -------- .../thrust_disable_64bit_dispatching.diff | 66 ++++++++++++++----- .../thrust_faster_sort_compile_times.diff | 12 ++-- 4 files changed, 56 insertions(+), 54 deletions(-) delete mode 100644 cpp/cmake/thirdparty/patches/cccl_symbol_visibility.diff diff --git a/cpp/cmake/thirdparty/patches/cccl_override.json b/cpp/cmake/thirdparty/patches/cccl_override.json index dcf9c1139f9..2f29578f7ae 100644 --- a/cpp/cmake/thirdparty/patches/cccl_override.json +++ b/cpp/cmake/thirdparty/patches/cccl_override.json @@ -3,11 +3,6 @@ "packages" : { "CCCL" : { "patches" : [ - { - "file" : "${current_json_dir}/cccl_symbol_visibility.diff", - "issue" : "Correct symbol visibility issues in libcudacxx [https://github.com/NVIDIA/cccl/pull/1832/]", - "fixed_in" : "2.6" - }, { "file" : "${current_json_dir}/thrust_disable_64bit_dispatching.diff", "issue" : "Remove 64bit dispatching as not needed by libcudf and results in compiling twice as many kernels [https://github.com/rapidsai/cudf/pull/11437]", diff --git a/cpp/cmake/thirdparty/patches/cccl_symbol_visibility.diff b/cpp/cmake/thirdparty/patches/cccl_symbol_visibility.diff deleted file mode 100644 index f745d5fa314..00000000000 --- a/cpp/cmake/thirdparty/patches/cccl_symbol_visibility.diff +++ /dev/null @@ -1,27 +0,0 @@ -diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__config b/libcudacxx/include/cuda/std/detail/libcxx/include/__config -index e7c62c031b..5db861853a 100644 ---- a/libcudacxx/include/cuda/std/detail/libcxx/include/__config -+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__config -@@ -1049,7 +1049,6 @@ typedef __char32_t char32_t; - # define _LIBCUDACXX_EXPORTED_FROM_ABI __declspec(dllimport) - # endif - --# define _LIBCUDACXX_TYPE_VIS _LIBCUDACXX_DLL_VIS - # define _LIBCUDACXX_FUNC_VIS _LIBCUDACXX_DLL_VIS - # define _LIBCUDACXX_EXCEPTION_ABI _LIBCUDACXX_DLL_VIS - # define _LIBCUDACXX_HIDDEN -@@ -1448,14 +1447,6 @@ __sanitizer_annotate_contiguous_container(const void*, const void*, const void*, - # define _LIBCUDACXX_WEAK __attribute__((__weak__)) - # endif - --// Redefine some macros for internal use 
--# if defined(__cuda_std__) --# undef _LIBCUDACXX_FUNC_VIS --# define _LIBCUDACXX_FUNC_VIS _LIBCUDACXX_INLINE_VISIBILITY --# undef _LIBCUDACXX_TYPE_VIS --# define _LIBCUDACXX_TYPE_VIS --# endif // __cuda_std__ -- - // Thread API - # ifndef _LIBCUDACXX_HAS_THREAD_API_EXTERNAL - # if defined(_CCCL_COMPILER_NVRTC) || defined(__EMSCRIPTEN__) diff --git a/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff b/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff index 6ae1e1c917b..291eabe25fd 100644 --- a/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff +++ b/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff @@ -1,25 +1,59 @@ diff --git a/thrust/thrust/system/cuda/detail/dispatch.h b/thrust/thrust/system/cuda/detail/dispatch.h -index 2a3cc4e33..8fb337b26 100644 +index 971b93d62..0d6b25b07 100644 --- a/thrust/thrust/system/cuda/detail/dispatch.h +++ b/thrust/thrust/system/cuda/detail/dispatch.h -@@ -44,8 +44,7 @@ - } \ - else \ - { \ -- auto THRUST_PP_CAT2(count, _fixed) = static_cast(count); \ -- status = call arguments; \ +@@ -36,16 +36,15 @@ + * that callables for both branches consist of the same tokens, and is intended to be used with Thrust-style dispatch + * interfaces, that always deduce the size type from the arguments. + */ +-#define THRUST_INDEX_TYPE_DISPATCH(status, call, count, arguments) \ +- if (count <= thrust::detail::integer_traits::const_max) \ +- { \ +- auto THRUST_PP_CAT2(count, _fixed) = static_cast(count); \ +- status = call arguments; \ +- } \ +- else \ +- { \ +- auto THRUST_PP_CAT2(count, _fixed) = static_cast(count); \ +- status = call arguments; \ ++#define THRUST_INDEX_TYPE_DISPATCH(status, call, count, arguments) \ ++ if (count <= thrust::detail::integer_traits::const_max) \ ++ { \ ++ auto THRUST_PP_CAT2(count, _fixed) = static_cast(count); \ ++ status = call arguments; \ ++ } \ ++ else \ ++ { \ + throw std::runtime_error("THRUST_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \ } - + /** -@@ -66,9 +65,7 @@ - } \ - else \ - { \ -- auto THRUST_PP_CAT2(count1, _fixed) = static_cast(count1); \ -- auto THRUST_PP_CAT2(count2, _fixed) = static_cast(count2); \ -- status = call arguments; \ +@@ -55,18 +54,16 @@ + * + * This version of the macro supports providing two count variables, which is necessary for set algorithms. 
+ */ +-#define THRUST_DOUBLE_INDEX_TYPE_DISPATCH(status, call, count1, count2, arguments) \ +- if (count1 + count2 <= thrust::detail::integer_traits::const_max) \ +- { \ +- auto THRUST_PP_CAT2(count1, _fixed) = static_cast(count1); \ +- auto THRUST_PP_CAT2(count2, _fixed) = static_cast(count2); \ +- status = call arguments; \ +- } \ +- else \ +- { \ +- auto THRUST_PP_CAT2(count1, _fixed) = static_cast(count1); \ +- auto THRUST_PP_CAT2(count2, _fixed) = static_cast(count2); \ +- status = call arguments; \ ++#define THRUST_DOUBLE_INDEX_TYPE_DISPATCH(status, call, count1, count2, arguments) \ ++ if (count1 + count2 <= thrust::detail::integer_traits::const_max) \ ++ { \ ++ auto THRUST_PP_CAT2(count1, _fixed) = static_cast(count1); \ ++ auto THRUST_PP_CAT2(count2, _fixed) = static_cast(count2); \ ++ status = call arguments; \ ++ } \ ++ else \ ++ { \ + throw std::runtime_error("THRUST_DOUBLE_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \ } + /** - * Dispatch between 32-bit and 64-bit index based versions of the same algorithm diff --git a/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times.diff b/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times.diff index cb0cc55f4d2..5f1981e9806 100644 --- a/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times.diff +++ b/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times.diff @@ -1,20 +1,20 @@ diff --git a/cub/cub/block/block_merge_sort.cuh b/cub/cub/block/block_merge_sort.cuh -index eb76ebb0b..c6c529a50 100644 +index 29510db5e..cf57e5786 100644 --- a/cub/cub/block/block_merge_sort.cuh +++ b/cub/cub/block/block_merge_sort.cuh @@ -95,7 +95,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void SerialMerge( KeyT key1 = keys_shared[keys1_beg]; KeyT key2 = keys_shared[keys2_beg]; - + -#pragma unroll +#pragma unroll 1 for (int item = 0; item < ITEMS_PER_THREAD; ++item) { - bool p = (keys2_beg < keys2_end) && ((keys1_beg >= keys1_end) || compare_op(key2, key1)); -@@ -376,7 +376,7 @@ public: + const bool p = (keys2_beg < keys2_end) && ((keys1_beg >= keys1_end) || compare_op(key2, key1)); +@@ -374,7 +374,7 @@ public: // KeyT max_key = oob_default; - + -#pragma unroll +#pragma unroll 1 for (int item = 1; item < ITEMS_PER_THREAD; ++item) @@ -27,7 +27,7 @@ index 7d9e8622f..da5627306 100644 @@ -87,10 +87,10 @@ StableOddEvenSort(KeyT (&keys)[ITEMS_PER_THREAD], ValueT (&items)[ITEMS_PER_THRE { constexpr bool KEYS_ONLY = ::cuda::std::is_same::value; - + -#pragma unroll +#pragma unroll 1 for (int i = 0; i < ITEMS_PER_THREAD; ++i) From 43fac3b64ee69427073adf76b4d6b11a3873fc10 Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Wed, 4 Dec 2024 13:48:48 -0500 Subject: [PATCH 20/78] Expose stream-ordering in nvtext API (#17446) Adds stream parameter to ``` cudf::nvtext::byte_pair_encoding ``` Added stream gtests to verify correct stream forwarding. 
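As a usage illustration only (a minimal sketch, not part of this change; `input` and `mpt` are assumed pre-existing `std::unique_ptr<cudf::column>` strings columns):
```
#include <cudf/scalar/scalar.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <nvtext/byte_pair_encoding.hpp>
#include <rmm/cuda_stream.hpp>

// Sketch: run BPE on an explicit, user-owned stream instead of the default.
rmm::cuda_stream stream;
auto merge_pairs = nvtext::load_merge_pairs(
  cudf::strings_column_view(mpt->view()), stream.view());
auto encoded = nvtext::byte_pair_encoding(
  cudf::strings_column_view(input->view()),
  *merge_pairs,
  cudf::string_scalar(" ", true, stream.view()),  // separator built on the same stream
  stream.view());
```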
Reference: https://github.com/rapidsai/cudf/issues/13744 Authors: - Shruti Shivakumar (https://github.com/shrshi) - David Wendt (https://github.com/davidwendt) Approvers: - Yunsong Wang (https://github.com/PointKernel) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/17446 --- cpp/include/nvtext/byte_pair_encoding.hpp | 2 + cpp/src/text/bpe/byte_pair_encoding.cu | 3 +- cpp/src/text/bpe/load_merge_pairs.cu | 3 +- cpp/tests/CMakeLists.txt | 1 + cpp/tests/streams/text/bpe_test.cpp | 59 +++++++++++++++++++++++ 5 files changed, 66 insertions(+), 2 deletions(-) create mode 100644 cpp/tests/streams/text/bpe_test.cpp diff --git a/cpp/include/nvtext/byte_pair_encoding.hpp b/cpp/include/nvtext/byte_pair_encoding.hpp index ab862df044d..71b68565e77 100644 --- a/cpp/include/nvtext/byte_pair_encoding.hpp +++ b/cpp/include/nvtext/byte_pair_encoding.hpp @@ -122,6 +122,7 @@ std::unique_ptr load_merge_pairs( * @param merges_pairs Created by a call to @ref nvtext::load_merge_pairs. * @param separator String used to build the output after encoding. * Default is a space. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Memory resource to allocate any returned objects. * @return An encoded column of strings. */ @@ -129,6 +130,7 @@ std::unique_ptr byte_pair_encoding( cudf::strings_column_view const& input, bpe_merge_pairs const& merges_pairs, cudf::string_scalar const& separator = cudf::string_scalar(" "), + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @} */ // end of group diff --git a/cpp/src/text/bpe/byte_pair_encoding.cu b/cpp/src/text/bpe/byte_pair_encoding.cu index f46f49ddc0e..0aacfd16f67 100644 --- a/cpp/src/text/bpe/byte_pair_encoding.cu +++ b/cpp/src/text/bpe/byte_pair_encoding.cu @@ -459,10 +459,11 @@ std::unique_ptr byte_pair_encoding(cudf::strings_column_view const std::unique_ptr byte_pair_encoding(cudf::strings_column_view const& input, bpe_merge_pairs const& merges_table, cudf::string_scalar const& separator, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::byte_pair_encoding(input, merges_table, separator, cudf::get_default_stream(), mr); + return detail::byte_pair_encoding(input, merges_table, separator, stream, mr); } } // namespace nvtext diff --git a/cpp/src/text/bpe/load_merge_pairs.cu b/cpp/src/text/bpe/load_merge_pairs.cu index cd68566bdec..a13a435a271 100644 --- a/cpp/src/text/bpe/load_merge_pairs.cu +++ b/cpp/src/text/bpe/load_merge_pairs.cu @@ -103,7 +103,8 @@ std::unique_ptr create_bpe_merge_pairs_im rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - auto pairs = cudf::strings::split_record(input, cudf::string_scalar(" "), 1, stream, mr); + auto pairs = + cudf::strings::split_record(input, cudf::string_scalar(" ", true, stream, mr), 1, stream, mr); auto content = pairs->release(); return create_bpe_merge_pairs_impl(std::move(content.children.back()), stream); } diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 8928d27a871..adf512811cc 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -742,6 +742,7 @@ ConfigureTest( ) ConfigureTest( STREAM_TEXT_TEST + streams/text/bpe_test.cpp streams/text/edit_distance_test.cpp streams/text/ngrams_test.cpp streams/text/replace_test.cpp diff --git a/cpp/tests/streams/text/bpe_test.cpp b/cpp/tests/streams/text/bpe_test.cpp new file mode 100644 index 
00000000000..0510edc122a --- /dev/null +++ b/cpp/tests/streams/text/bpe_test.cpp @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include + +#include + +struct TextBytePairEncoding : public cudf::test::BaseFixture {}; + +TEST_F(TextBytePairEncoding, BytePairEncoding) +{ + auto stream = cudf::test::get_default_stream(); + // partial table based on values from https://huggingface.co/gpt2/raw/main/merges.txt + auto mpt = cudf::test::strings_column_wrapper({ + "e n", // 14 + "i t", // 16 + "i s", // 17 + "e s", // 20 + "en t", // 44 + "c e", // 90 + "es t", // 141 + "en ce", // 340 + "t h", // 146 + "h i", // 5049 + "th is", // 5407 + "t est", // 9034 + "s i", // 13142 + "s ent" // 33832 + }); + + auto merge_pairs = nvtext::load_merge_pairs(cudf::strings_column_view(mpt), stream); + + auto validity = cudf::test::iterators::null_at(4); + cudf::test::strings_column_wrapper input( + {"thisisit", "thisis test-sentence-1", "thisistestsentence-2", "this-istestsentence 3", "", ""}, + validity); + auto sv = cudf::strings_column_view(input); + + auto results = + nvtext::byte_pair_encoding(sv, *merge_pairs, cudf::string_scalar(" ", true, stream), stream); +} From 4505c5399a7aea119e07dded7b54084be713e985 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Wed, 4 Dec 2024 13:52:38 -0500 Subject: [PATCH 21/78] Return empty result for segmented_reduce if input and offsets are both empty (#17437) Changes the behavior of `cudf::segmented_reduce` to return an empty column if both the input and the offsets parameter are empty. Closes #17433 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Karthikeyan (https://github.com/karthikeyann) - Bradley Dice (https://github.com/bdice) - Basit Ayantunde (https://github.com/lamarrr) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/17437 --- cpp/src/reductions/segmented/reductions.cpp | 6 ++++++ .../reductions/segmented_reduction_tests.cpp | 20 +++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/cpp/src/reductions/segmented/reductions.cpp b/cpp/src/reductions/segmented/reductions.cpp index dedfc4b0734..1c3a2b0c0f3 100644 --- a/cpp/src/reductions/segmented/reductions.cpp +++ b/cpp/src/reductions/segmented/reductions.cpp @@ -13,6 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ +#include #include #include #include @@ -120,6 +121,11 @@ std::unique_ptr segmented_reduce(column_view const& segmented_values, CUDF_FAIL( "Initial value is only supported for SUM, PRODUCT, MIN, MAX, ANY, and ALL aggregation types"); } + + if (segmented_values.is_empty() && offsets.empty()) { + return cudf::make_empty_column(output_dtype); + } + CUDF_EXPECTS(offsets.size() > 0, "`offsets` should have at least 1 element."); return cudf::detail::aggregation_dispatcher( diff --git a/cpp/tests/reductions/segmented_reduction_tests.cpp b/cpp/tests/reductions/segmented_reduction_tests.cpp index bc0321bd40a..2281a517aa6 100644 --- a/cpp/tests/reductions/segmented_reduction_tests.cpp +++ b/cpp/tests/reductions/segmented_reduction_tests.cpp @@ -1122,6 +1122,26 @@ TEST_F(SegmentedReductionTestUntyped, EmptyInputWithOffsets) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, expect_bool); } +TEST_F(SegmentedReductionTestUntyped, EmptyInputEmptyOffsets) +{ + auto const str_empty = cudf::test::strings_column_wrapper{}; + auto const int_empty = cudf::test::fixed_width_column_wrapper{}; + auto result = + cudf::segmented_reduce(str_empty, + cudf::column_view{int_empty}, + *cudf::make_max_aggregation(), + cudf::data_type{cudf::type_id::STRING}, + cudf::null_policy::EXCLUDE); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, str_empty); + + result = cudf::segmented_reduce(int_empty, + cudf::column_view{int_empty}, + *cudf::make_min_aggregation(), + cudf::data_type{cudf::type_id::INT32}, + cudf::null_policy::INCLUDE); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, int_empty); +} + template struct SegmentedReductionFixedPointTest : public cudf::test::BaseFixture {}; From 351ece53a3f1b5269c0b15f7254e67cd06535740 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 4 Dec 2024 11:11:37 -0800 Subject: [PATCH 22/78] Remove cudf._lib.binops in favor of inlining pylibcudf (#17468) Contributes to https://github.com/rapidsai/cudf/issues/17317 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/17468 --- python/cudf/cudf/_lib/CMakeLists.txt | 1 - python/cudf/cudf/_lib/__init__.py | 1 - python/cudf/cudf/_lib/binaryop.pyx | 61 -------------------- python/cudf/cudf/core/_internals/binaryop.py | 60 +++++++++++++++++++ python/cudf/cudf/core/column/column.py | 2 +- python/cudf/cudf/core/column/datetime.py | 16 ++--- python/cudf/cudf/core/column/decimal.py | 13 +++-- python/cudf/cudf/core/column/lists.py | 4 +- python/cudf/cudf/core/column/numerical.py | 10 ++-- python/cudf/cudf/core/column/string.py | 10 ++-- python/cudf/cudf/core/column/timedelta.py | 13 ++--- python/cudf/cudf/utils/applyutils.py | 4 +- 12 files changed, 94 insertions(+), 101 deletions(-) delete mode 100644 python/cudf/cudf/_lib/binaryop.pyx create mode 100644 python/cudf/cudf/core/_internals/binaryop.py diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index e69a2672163..dd27aae7133 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -14,7 +14,6 @@ set(cython_sources aggregation.pyx - binaryop.pyx column.pyx copying.pyx csv.pyx diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py index ec32386b2ce..cdf7cbe13c4 100644 --- a/python/cudf/cudf/_lib/__init__.py +++ b/python/cudf/cudf/_lib/__init__.py @@ -2,7 +2,6 @@ import numpy as np from . 
import ( - binaryop, copying, csv, groupby, diff --git a/python/cudf/cudf/_lib/binaryop.pyx b/python/cudf/cudf/_lib/binaryop.pyx deleted file mode 100644 index e2547476849..00000000000 --- a/python/cudf/cudf/_lib/binaryop.pyx +++ /dev/null @@ -1,61 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar -from cudf._lib.types cimport dtype_to_pylibcudf_type - -import pylibcudf - -from cudf._lib.scalar import as_device_scalar -from cudf.core.buffer import acquire_spill_lock - -# Map pandas operation names to pylibcudf operation names. -_op_map = { - "TRUEDIV": "TRUE_DIV", - "FLOORDIV": "FLOOR_DIV", - "MOD": "PYMOD", - "EQ": "EQUAL", - "NE": "NOT_EQUAL", - "LT": "LESS", - "GT": "GREATER", - "LE": "LESS_EQUAL", - "GE": "GREATER_EQUAL", - "AND": "BITWISE_AND", - "OR": "BITWISE_OR", - "XOR": "BITWISE_XOR", - "L_AND": "LOGICAL_AND", - "L_OR": "LOGICAL_OR", -} - - -@acquire_spill_lock() -def binaryop(lhs, rhs, op, dtype): - """ - Dispatches a binary op call to the appropriate libcudf function: - """ - # TODO: Shouldn't have to keep special-casing. We need to define a separate - # pipeline for libcudf binops that don't map to Python binops. - if op not in {"INT_POW", "NULL_EQUALS", "NULL_NOT_EQUALS"}: - op = op[2:-2] - op = op.upper() - op = _op_map.get(op, op) - - return Column.from_pylibcudf( - # Check if the dtype args are desirable here. - pylibcudf.binaryop.binary_operation( - lhs.to_pylibcudf(mode="read") if isinstance(lhs, Column) - else ( - as_device_scalar( - lhs, dtype=rhs.dtype if lhs is None else None - ) - ).c_value, - rhs.to_pylibcudf(mode="read") if isinstance(rhs, Column) - else ( - as_device_scalar( - rhs, dtype=lhs.dtype if rhs is None else None - ) - ).c_value, - pylibcudf.binaryop.BinaryOperator[op], - dtype_to_pylibcudf_type(dtype), - ) - ) diff --git a/python/cudf/cudf/core/_internals/binaryop.py b/python/cudf/cudf/core/_internals/binaryop.py new file mode 100644 index 00000000000..212150f505e --- /dev/null +++ b/python/cudf/cudf/core/_internals/binaryop.py @@ -0,0 +1,60 @@ +# Copyright (c) 2020-2024, NVIDIA CORPORATION. +from __future__ import annotations + +from typing import TYPE_CHECKING + +import pylibcudf as plc + +from cudf._lib.column import Column +from cudf._lib.types import dtype_to_pylibcudf_type +from cudf.core.buffer import acquire_spill_lock + +if TYPE_CHECKING: + from cudf._typing import Dtype + from cudf.core.column import ColumnBase + from cudf.core.scalar import Scalar + + +@acquire_spill_lock() +def binaryop( + lhs: ColumnBase | Scalar, rhs: ColumnBase | Scalar, op: str, dtype: Dtype +) -> ColumnBase: + """ + Dispatches a binary op call to the appropriate libcudf function: + """ + # TODO: Shouldn't have to keep special-casing. We need to define a separate + # pipeline for libcudf binops that don't map to Python binops. + if op not in {"INT_POW", "NULL_EQUALS", "NULL_NOT_EQUALS"}: + op = op[2:-2] + # Map pandas operation names to pylibcudf operation names. 
+ _op_map = { + "TRUEDIV": "TRUE_DIV", + "FLOORDIV": "FLOOR_DIV", + "MOD": "PYMOD", + "EQ": "EQUAL", + "NE": "NOT_EQUAL", + "LT": "LESS", + "GT": "GREATER", + "LE": "LESS_EQUAL", + "GE": "GREATER_EQUAL", + "AND": "BITWISE_AND", + "OR": "BITWISE_OR", + "XOR": "BITWISE_XOR", + "L_AND": "LOGICAL_AND", + "L_OR": "LOGICAL_OR", + } + op = op.upper() + op = _op_map.get(op, op) + + return Column.from_pylibcudf( + plc.binaryop.binary_operation( + lhs.to_pylibcudf(mode="read") + if isinstance(lhs, Column) + else lhs.device_value.c_value, + rhs.to_pylibcudf(mode="read") + if isinstance(rhs, Column) + else rhs.device_value.c_value, + plc.binaryop.BinaryOperator[op], + dtype_to_pylibcudf_type(dtype), + ) + ) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index c8cd80f45f4..1ddc79e8970 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1366,7 +1366,7 @@ def nans_to_nulls(self: Self) -> Self: def normalize_binop_value( self, other: ScalarLike - ) -> ColumnBase | ScalarLike: + ) -> ColumnBase | cudf.Scalar: raise NotImplementedError def _reduce( diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index c9be3f239f9..b526a6efa51 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -22,7 +22,7 @@ import cudf.core.column.string as string from cudf import _lib as libcudf from cudf.core._compat import PANDAS_GE_220 -from cudf.core._internals import unary +from cudf.core._internals import binaryop, unary from cudf.core._internals.search import search_sorted from cudf.core._internals.timezones import ( check_ambiguous_and_nonexistent, @@ -509,7 +509,9 @@ def isocalendar(self) -> dict[str, ColumnBase]: ) } - def normalize_binop_value(self, other: DatetimeLikeScalar) -> ScalarLike: + def normalize_binop_value( # type: ignore[override] + self, other: DatetimeLikeScalar + ) -> cudf.Scalar | cudf.DateOffset | ColumnBase: if isinstance(other, (cudf.Scalar, ColumnBase, cudf.DateOffset)): return other @@ -789,12 +791,12 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: if out_dtype is None: return NotImplemented - result_col = libcudf.binaryop.binaryop(lhs, rhs, op, out_dtype) - if out_dtype != cudf.dtype(np.bool_) and op == "__add__": + result_col = binaryop.binaryop(lhs, rhs, op, out_dtype) + if out_dtype.kind != "b" and op == "__add__": return result_col - elif cudf.get_option( - "mode.pandas_compatible" - ) and out_dtype == cudf.dtype(np.bool_): + elif ( + cudf.get_option("mode.pandas_compatible") and out_dtype.kind == "b" + ): return result_col.fillna(op == "__ne__") else: return result_col diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index ac9a2caad50..2c22724d3d7 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -11,12 +11,11 @@ import pyarrow as pa import cudf -from cudf import _lib as libcudf from cudf._lib.strings.convert.convert_fixed_point import ( from_decimal as cpp_from_decimal, ) from cudf.api.types import is_scalar -from cudf.core._internals import unary +from cudf.core._internals import binaryop, unary from cudf.core.buffer import as_buffer from cudf.core.column.column import ColumnBase from cudf.core.column.numerical_base import NumericalBaseColumn @@ -30,6 +29,8 @@ from cudf.utils.utils import pa_mask_buffer_to_mask if TYPE_CHECKING: + from typing_extensions import Self + from 
cudf._typing import ColumnBinaryOperand, ColumnLike, Dtype, ScalarLike from cudf.core.buffer import Buffer @@ -141,7 +142,7 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str): rhs = rhs.astype( type(output_type)(rhs.dtype.precision, rhs.dtype.scale) ) - result = libcudf.binaryop.binaryop(lhs, rhs, op, output_type) + result = binaryop.binaryop(lhs, rhs, op, output_type) # libcudf doesn't support precision, so result.dtype doesn't # maintain output_type.precision result.dtype.precision = output_type.precision @@ -153,7 +154,7 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str): "__le__", "__ge__", }: - result = libcudf.binaryop.binaryop(lhs, rhs, op, bool) + result = binaryop.binaryop(lhs, rhs, op, bool) else: raise TypeError( f"{op} not supported for the following dtypes: " @@ -177,7 +178,7 @@ def _validate_fillna_value( "integer values" ) - def normalize_binop_value(self, other): + def normalize_binop_value(self, other) -> Self | cudf.Scalar: if isinstance(other, ColumnBase): if isinstance(other, cudf.core.column.NumericalColumn): if other.dtype.kind not in "iu": @@ -209,7 +210,7 @@ def normalize_binop_value(self, other): other = Decimal(other) metadata = other.as_tuple() precision = max(len(metadata.digits), metadata.exponent) - scale = -metadata.exponent + scale = -cast(int, metadata.exponent) return cudf.Scalar( other, dtype=self.dtype.__class__(precision, scale) ) diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 789c4a7f3cb..ea384888388 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -188,8 +188,8 @@ def __cuda_array_interface__(self): "Lists are not yet supported via `__cuda_array_interface__`" ) - def normalize_binop_value(self, other): - if not isinstance(other, ListColumn): + def normalize_binop_value(self, other) -> Self: + if not isinstance(other, type(self)): return NotImplemented return other diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 8ca42debb72..9514aaeab50 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -16,7 +16,7 @@ import cudf.core.column.string as string from cudf import _lib as libcudf from cudf.api.types import is_integer, is_scalar -from cudf.core._internals import unary +from cudf.core._internals import binaryop, unary from cudf.core.column.column import ColumnBase, as_column from cudf.core.column.numerical_base import NumericalBaseColumn from cudf.core.dtypes import CategoricalDtype @@ -292,7 +292,7 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: lhs, rhs = (other, self) if reflect else (self, other) - return libcudf.binaryop.binaryop(lhs, rhs, op, out_dtype) + return binaryop.binaryop(lhs, rhs, op, out_dtype) def nans_to_nulls(self: Self) -> Self: # Only floats can contain nan. 
@@ -301,11 +301,9 @@ def nans_to_nulls(self: Self) -> Self: newmask = libcudf.transform.nans_to_nulls(self) return self.set_mask(newmask) - def normalize_binop_value( - self, other: ScalarLike - ) -> ColumnBase | cudf.Scalar: + def normalize_binop_value(self, other: ScalarLike) -> Self | cudf.Scalar: if isinstance(other, ColumnBase): - if not isinstance(other, NumericalColumn): + if not isinstance(other, type(self)): return NotImplemented return other if isinstance(other, cudf.Scalar): diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 76d67585609..6b45828568c 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -19,11 +19,11 @@ import cudf.api.types import cudf.core.column.column as column import cudf.core.column.datetime as datetime -from cudf import _lib as libcudf from cudf._lib import string_casting as str_cast, strings as libstrings from cudf._lib.column import Column from cudf._lib.types import size_type_dtype from cudf.api.types import is_integer, is_scalar, is_string_dtype +from cudf.core._internals import binaryop from cudf.core.buffer import acquire_spill_lock from cudf.core.column.column import ColumnBase from cudf.core.column.methods import ColumnMethods @@ -6200,7 +6200,7 @@ def normalize_binop_value(self, other) -> column.ColumnBase | cudf.Scalar: def _binaryop( self, other: ColumnBinaryOperand, op: str - ) -> "column.ColumnBase": + ) -> column.ColumnBase: reflect, op = self._check_reflected_op(op) # Due to https://github.com/pandas-dev/pandas/issues/46332 we need to # support binary operations between empty or all null string columns @@ -6229,7 +6229,7 @@ def _binaryop( if other is NotImplemented: return NotImplemented - if isinstance(other, (StringColumn, str, cudf.Scalar)): + if isinstance(other, (StringColumn, cudf.Scalar)): if isinstance(other, cudf.Scalar) and other.dtype != "O": if op in { "__eq__", @@ -6279,9 +6279,7 @@ def _binaryop( "NULL_NOT_EQUALS", }: lhs, rhs = (other, self) if reflect else (self, other) - return libcudf.binaryop.binaryop( - lhs=lhs, rhs=rhs, op=op, dtype="bool" - ) + return binaryop.binaryop(lhs=lhs, rhs=rhs, op=op, dtype="bool") return NotImplemented @copy_docstring(column.ColumnBase.view) diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index ccc9ef2b3f6..f3a7916aa35 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -13,9 +13,8 @@ import cudf import cudf.core.column.column as column import cudf.core.column.string as string -from cudf import _lib as libcudf from cudf.api.types import is_scalar -from cudf.core._internals import unary +from cudf.core._internals import binaryop, unary from cudf.core.buffer import Buffer, acquire_spill_lock from cudf.core.column.column import ColumnBase from cudf.utils.dtypes import np_to_pa_dtype @@ -188,8 +187,8 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: this = self.astype(common_dtype).astype(out_dtype) if isinstance(other, cudf.Scalar): if other.is_valid(): - other = other.value.astype(common_dtype).astype( - out_dtype + other = cudf.Scalar( + other.value.astype(common_dtype).astype(out_dtype) ) else: other = cudf.Scalar(None, out_dtype) @@ -219,10 +218,8 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: lhs, rhs = (other, this) if reflect else (this, other) - result = libcudf.binaryop.binaryop(lhs, rhs, op, out_dtype) - if cudf.get_option( - 
"mode.pandas_compatible" - ) and out_dtype == cudf.dtype(np.bool_): + result = binaryop.binaryop(lhs, rhs, op, out_dtype) + if cudf.get_option("mode.pandas_compatible") and out_dtype.kind == "b": result = result.fillna(op == "__ne__") return result diff --git a/python/cudf/cudf/utils/applyutils.py b/python/cudf/cudf/utils/applyutils.py index cd7fe5ee023..4d6f4ea73a8 100644 --- a/python/cudf/cudf/utils/applyutils.py +++ b/python/cudf/cudf/utils/applyutils.py @@ -9,7 +9,7 @@ from numba.core.utils import pysignature import cudf -from cudf import _lib as libcudf +from cudf.core._internals import binaryop from cudf.core.buffer import acquire_spill_lock from cudf.core.column import column from cudf.utils import utils @@ -121,7 +121,7 @@ def make_aggregate_nullmask(df, columns=None, op="__and__"): nullmask.copy(), dtype=utils.mask_dtype ) else: - out_mask = libcudf.binaryop.binaryop( + out_mask = binaryop.binaryop( nullmask, out_mask, op, out_mask.dtype ) From cd3e352be06795b825828156da10ba83e1e8939f Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Wed, 4 Dec 2024 14:38:35 -0500 Subject: [PATCH 23/78] Migrate `cudf::io::merge_row_group_metadata` to pylibcudf (#17491) Apart of #15162 Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/17491 --- python/cudf/cudf/_lib/parquet.pyx | 22 ++++---------- python/pylibcudf/pylibcudf/io/parquet.pxd | 2 ++ python/pylibcudf/pylibcudf/io/parquet.pyi | 1 + python/pylibcudf/pylibcudf/io/parquet.pyx | 36 +++++++++++++++++++++-- 4 files changed, 41 insertions(+), 20 deletions(-) diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index d4bd0cd306c..6c80120ad6e 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -14,8 +14,6 @@ except ImportError: import numpy as np -from cython.operator cimport dereference - from cudf.api.types import is_list_like from cudf._lib.utils cimport _data_from_columns, data_from_pylibcudf_io @@ -25,7 +23,7 @@ from cudf._lib.utils import _index_level_name, generate_pandas_metadata from libc.stdint cimport int64_t, uint8_t from libcpp cimport bool from libcpp.map cimport map -from libcpp.memory cimport make_unique, unique_ptr +from libcpp.memory cimport unique_ptr from libcpp.string cimport string from libcpp.utility cimport move from libcpp.vector cimport vector @@ -35,7 +33,6 @@ from pylibcudf.io.parquet cimport ChunkedParquetReader from pylibcudf.libcudf.io.data_sink cimport data_sink from pylibcudf.libcudf.io.parquet cimport ( chunked_parquet_writer_options, - merge_row_group_metadata as parquet_merge_metadata, parquet_chunked_writer as cpp_parquet_chunked_writer, parquet_writer_options, write_parquet as parquet_writer, @@ -64,6 +61,7 @@ import pylibcudf as plc from pylibcudf cimport Table from cudf.utils.ioutils import _ROW_GROUP_SIZE_BYTES_DEFAULT +from cython.operator cimport dereference cdef class BufferArrayFromVector: @@ -808,19 +806,9 @@ cpdef merge_filemetadata(object filemetadata_list): -------- cudf.io.parquet.merge_row_group_metadata """ - cdef vector[unique_ptr[vector[uint8_t]]] list_c - cdef vector[uint8_t] blob_c - cdef unique_ptr[vector[uint8_t]] output_c - - for blob_py in filemetadata_list: - blob_c = blob_py - list_c.push_back(move(make_unique[vector[uint8_t]](blob_c))) - - with nogil: - output_c = move(parquet_merge_metadata(list_c)) - - out_metadata_py = 
BufferArrayFromVector.from_unique_ptr(move(output_c)) - return np.asarray(out_metadata_py) + return np.asarray( + plc.io.parquet.merge_row_group_metadata(filemetadata_list).obj + ) cdef statistics_freq _get_stat_freq(str statistics): diff --git a/python/pylibcudf/pylibcudf/io/parquet.pxd b/python/pylibcudf/pylibcudf/io/parquet.pxd index 1a61c20d783..79080fa7243 100644 --- a/python/pylibcudf/pylibcudf/io/parquet.pxd +++ b/python/pylibcudf/pylibcudf/io/parquet.pxd @@ -91,3 +91,5 @@ cdef class ParquetWriterOptionsBuilder: cpdef ParquetWriterOptions build(self) cpdef memoryview write_parquet(ParquetWriterOptions options) + +cpdef memoryview merge_row_group_metadata(list metdata_list) diff --git a/python/pylibcudf/pylibcudf/io/parquet.pyi b/python/pylibcudf/pylibcudf/io/parquet.pyi index eb2ca68109b..3eb3d7c3a92 100644 --- a/python/pylibcudf/pylibcudf/io/parquet.pyi +++ b/python/pylibcudf/pylibcudf/io/parquet.pyi @@ -78,3 +78,4 @@ class ParquetWriterOptionsBuilder: def build(self) -> ParquetWriterOptions: ... def write_parquet(options: ParquetWriterOptions) -> memoryview: ... +def merge_row_group_metadata(metdata_list: list) -> memoryview: ... diff --git a/python/pylibcudf/pylibcudf/io/parquet.pyx b/python/pylibcudf/pylibcudf/io/parquet.pyx index b95b1f39de1..93843c932ad 100644 --- a/python/pylibcudf/pylibcudf/io/parquet.pyx +++ b/python/pylibcudf/pylibcudf/io/parquet.pyx @@ -2,7 +2,7 @@ from cython.operator cimport dereference from libc.stdint cimport int64_t, uint8_t from libcpp cimport bool -from libcpp.memory cimport unique_ptr +from libcpp.memory cimport unique_ptr, make_unique from libcpp.string cimport string from libcpp.utility cimport move from libcpp.vector cimport vector @@ -22,6 +22,7 @@ from pylibcudf.libcudf.io.parquet cimport ( read_parquet as cpp_read_parquet, write_parquet as cpp_write_parquet, parquet_writer_options, + merge_row_group_metadata as cpp_merge_row_group_metadata, ) from pylibcudf.libcudf.io.types cimport ( compression_type, @@ -38,10 +39,10 @@ __all__ = [ "ParquetWriterOptions", "ParquetWriterOptionsBuilder", "read_parquet", - "write_parquet" + "write_parquet", + "merge_row_group_metadata", ] - cdef parquet_reader_options _setup_parquet_reader_options( SourceInfo source_info, list columns = None, @@ -577,3 +578,32 @@ cpdef memoryview write_parquet(ParquetWriterOptions options): c_result = cpp_write_parquet(c_options) return memoryview(HostBuffer.from_unique_ptr(move(c_result))) + + +cpdef memoryview merge_row_group_metadata(list metdata_list): + """ + Merges multiple raw metadata blobs that were previously + created by write_parquet into a single metadata blob. + + For details, see :cpp:func:`merge_row_group_metadata`. 
+ + Parameters + ---------- + metdata_list : list + List of input file metadata + + Returns + ------- + memoryview + A parquet-compatible blob that contains the data for all row groups in the list + """ + cdef vector[unique_ptr[vector[uint8_t]]] list_c + cdef unique_ptr[vector[uint8_t]] output_c + + for blob in metdata_list: + list_c.push_back(move(make_unique[vector[uint8_t]]( blob))) + + with nogil: + output_c = move(cpp_merge_row_group_metadata(list_c)) + + return memoryview(HostBuffer.from_unique_ptr(move(output_c))) From 47e49d04281da3f488bc0d954b366b272c08d316 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 4 Dec 2024 12:55:18 -0800 Subject: [PATCH 24/78] Fix groupby(as_index=False).size not resetting index (#17499) closes #17478 Also fixes a bug where the `Series.name` attribute wasn't preserved with `size` Authors: - Matthew Roeschke (https://github.com/mroeschke) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) - Matthew Murray (https://github.com/Matt711) - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/17499 --- python/cudf/cudf/core/groupby/groupby.py | 7 +++++-- python/cudf/cudf/tests/test_groupby.py | 14 ++++++++++++++ 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 29ab3b60d9d..0f12f266a95 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -497,11 +497,14 @@ def size(self): col = cudf.core.column.column_empty( len(self.obj), "int8", masked=False ) - return ( - cudf.Series._from_column(col) + result = ( + cudf.Series._from_column(col, name=getattr(self.obj, "name", None)) .groupby(self.grouping, sort=self._sort, dropna=self._dropna) .agg("size") ) + if not self._as_index: + result = result.rename("size").reset_index() + return result @_performance_tracking def cumcount(self, ascending: bool = True): diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index eae0fd23ef8..d8a2528230e 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -4074,3 +4074,17 @@ def test_get_group_list_like(): with pytest.raises(KeyError): df.groupby(["a"]).get_group([1]) + + +def test_size_as_index_false(): + df = pd.DataFrame({"a": [1, 2, 1], "b": [1, 2, 3]}, columns=["a", "b"]) + expected = df.groupby("a", as_index=False).size() + result = cudf.from_pandas(df).groupby("a", as_index=False).size() + assert_eq(result, expected) + + +def test_size_series_with_name(): + ser = pd.Series(range(3), name="foo") + expected = ser.groupby(ser).size() + result = cudf.from_pandas(ser).groupby(ser).size() + assert_eq(result, expected) From 1b82963df736f3ad71b003443a4de1414f3ce2e5 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Wed, 4 Dec 2024 16:33:24 -0500 Subject: [PATCH 25/78] Fix libcudf compile error when logging is disabled (#17512) Adds `[[maybe_unused]]` to the `compression_type_name` function to prevent the warning/error.
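The failure mode, sketched with illustrative names (this is not the actual libcudf code): a helper referenced only from logging calls becomes unreferenced when logging is compiled out, and the unused-function warning then fails the build under -Werror unless the helper is annotated.
```
#include <cstdio>
#include <string>

// Sketch: when LOGGING is not defined, the only call site of name_of() is
// compiled out, so -Wunused-function (an error under -Werror) would fire
// without the [[maybe_unused]] annotation.
[[maybe_unused]] static std::string name_of(int code)
{
  return "compression-" + std::to_string(code);
}

void report(int code)
{
#ifdef LOGGING
  std::fprintf(stderr, "%s\n", name_of(code).c_str());
#else
  (void)code;  // parameter is likewise unused when logging is off
#endif
}
```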
Error/warning introduced in #17431 Closes #17510 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Muhammad Haseeb (https://github.com/mhaseeb123) - MithunR (https://github.com/mythrocks) - Vukasin Milovanovic (https://github.com/vuule) - Jason Lowe (https://github.com/jlowe) URL: https://github.com/rapidsai/cudf/pull/17512 --- cpp/src/io/comp/nvcomp_adapter.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/comp/nvcomp_adapter.cpp b/cpp/src/io/comp/nvcomp_adapter.cpp index b8bf8be6d2d..9d3cf75a13f 100644 --- a/cpp/src/io/comp/nvcomp_adapter.cpp +++ b/cpp/src/io/comp/nvcomp_adapter.cpp @@ -70,7 +70,7 @@ auto batched_decompress_async(compression_type compression, Args&&... args) } } -std::string compression_type_name(compression_type compression) +[[maybe_unused]] std::string compression_type_name(compression_type compression) { switch (compression) { case compression_type::SNAPPY: return "Snappy"; From fbc32563809f509c0186081e6012f72a0e83ebcd Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 5 Dec 2024 12:08:37 -0600 Subject: [PATCH 26/78] Force Thrust to use 32-bit offset type. (#17523) This fixes the patch we use for Thrust to always get a 32-bit offset type. The net effect of this patch is that we are behaving as if `THRUST_FORCE_32_BIT_OFFSET_TYPE` is set. This replaces a previous patch which I mistakenly did not update between CCCL 2.6.x testing and 2.7.0-rc2 testing. In the future we hope to configure this with CMake and drop the patches, but that will require us to use features from https://github.com/NVIDIA/cccl/pull/2844 (which is not available in 2.7.0-rc2). Authors: - Bradley Dice (https://github.com/bdice) Approvers: - David Wendt (https://github.com/davidwendt) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/17523 --- .../thrust_disable_64bit_dispatching.diff | 75 +++++-------------- 1 file changed, 19 insertions(+), 56 deletions(-) diff --git a/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff b/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff index 291eabe25fd..9f68d85e7db 100644 --- a/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff +++ b/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff @@ -1,59 +1,22 @@ diff --git a/thrust/thrust/system/cuda/detail/dispatch.h b/thrust/thrust/system/cuda/detail/dispatch.h -index 971b93d62..0d6b25b07 100644 +index 3d004aa55..71ce86bea 100644 --- a/thrust/thrust/system/cuda/detail/dispatch.h +++ b/thrust/thrust/system/cuda/detail/dispatch.h -@@ -36,16 +36,15 @@ - * that callables for both branches consist of the same tokens, and is intended to be used with Thrust-style dispatch - * interfaces, that always deduce the size type from the arguments. 
- */ --#define THRUST_INDEX_TYPE_DISPATCH(status, call, count, arguments) \ -- if (count <= thrust::detail::integer_traits::const_max) \ -- { \ -- auto THRUST_PP_CAT2(count, _fixed) = static_cast(count); \ -- status = call arguments; \ -- } \ -- else \ -- { \ -- auto THRUST_PP_CAT2(count, _fixed) = static_cast(count); \ -- status = call arguments; \ -+#define THRUST_INDEX_TYPE_DISPATCH(status, call, count, arguments) \ -+ if (count <= thrust::detail::integer_traits::const_max) \ -+ { \ -+ auto THRUST_PP_CAT2(count, _fixed) = static_cast(count); \ -+ status = call arguments; \ -+ } \ -+ else \ -+ { \ -+ throw std::runtime_error("THRUST_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \ - } - - /** -@@ -55,18 +54,16 @@ - * - * This version of the macro supports providing two count variables, which is necessary for set algorithms. - */ --#define THRUST_DOUBLE_INDEX_TYPE_DISPATCH(status, call, count1, count2, arguments) \ -- if (count1 + count2 <= thrust::detail::integer_traits::const_max) \ -- { \ -- auto THRUST_PP_CAT2(count1, _fixed) = static_cast(count1); \ -- auto THRUST_PP_CAT2(count2, _fixed) = static_cast(count2); \ -- status = call arguments; \ -- } \ -- else \ -- { \ -- auto THRUST_PP_CAT2(count1, _fixed) = static_cast(count1); \ -- auto THRUST_PP_CAT2(count2, _fixed) = static_cast(count2); \ -- status = call arguments; \ -+#define THRUST_DOUBLE_INDEX_TYPE_DISPATCH(status, call, count1, count2, arguments) \ -+ if (count1 + count2 <= thrust::detail::integer_traits::const_max) \ -+ { \ -+ auto THRUST_PP_CAT2(count1, _fixed) = static_cast(count1); \ -+ auto THRUST_PP_CAT2(count2, _fixed) = static_cast(count2); \ -+ status = call arguments; \ -+ } \ -+ else \ -+ { \ -+ throw std::runtime_error("THRUST_DOUBLE_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \ - } - - /** +@@ -63,7 +63,7 @@ + _THRUST_INDEX_TYPE_DISPATCH_GUARD_UNDERFLOW(count1) \ + _THRUST_INDEX_TYPE_DISPATCH_GUARD_UNDERFLOW(count2) + +-#if defined(THRUST_FORCE_64_BIT_OFFSET_TYPE) ++#if 0 + //! @brief Always dispatches to 64 bit offset version of an algorithm + # define THRUST_INDEX_TYPE_DISPATCH(status, call, count, arguments) \ + _THRUST_INDEX_TYPE_DISPATCH_GUARD_UNDERFLOW(count) \ +@@ -89,7 +89,7 @@ + _THRUST_INDEX_TYPE_DISPATCH_GUARD_UNDERFLOW(count) \ + _THRUST_INDEX_TYPE_DISPATCH(std::uint64_t, status, call_64, count, arguments) + +-#elif defined(THRUST_FORCE_32_BIT_OFFSET_TYPE) ++#elif 1 + + //! 
@brief Ensures that the size of the input does not overflow the offset type + # define _THRUST_INDEX_TYPE_DISPATCH_GUARD_OVERFLOW(index_type, count) \ From 06e937b7be83c69e94e27e1dc50e98755d341d2c Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 5 Dec 2024 15:43:17 -0800 Subject: [PATCH 27/78] Remove cudf._lib.merge in favor of inlining pylibcudf (#17370) Contributes to https://github.com/rapidsai/cudf/issues/17317 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/17370 --- python/cudf/cudf/_lib/CMakeLists.txt | 1 - python/cudf/cudf/_lib/__init__.py | 1 - python/cudf/cudf/_lib/merge.pyx | 47 -------------------------- python/cudf/cudf/core/reshape.py | 50 ++++++++++++++++++++++------ 4 files changed, 39 insertions(+), 60 deletions(-) delete mode 100644 python/cudf/cudf/_lib/merge.pyx diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index dd27aae7133..e3d9a48e2ba 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -19,7 +19,6 @@ set(cython_sources csv.pyx groupby.pyx interop.pyx - merge.pyx orc.pyx parquet.pyx reduce.pyx diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py index cdf7cbe13c4..cb2d0501fea 100644 --- a/python/cudf/cudf/_lib/__init__.py +++ b/python/cudf/cudf/_lib/__init__.py @@ -6,7 +6,6 @@ csv, groupby, interop, - merge, nvtext, orc, parquet, diff --git a/python/cudf/cudf/_lib/merge.pyx b/python/cudf/cudf/_lib/merge.pyx deleted file mode 100644 index 9372acdab44..00000000000 --- a/python/cudf/cudf/_lib/merge.pyx +++ /dev/null @@ -1,47 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from libcpp cimport bool - -from cudf._lib.utils cimport columns_from_pylibcudf_table - -import pylibcudf - - -def merge_sorted( - list input_columns, - list key_columns_indices, - bool ascending=True, - str na_position="last", -): - """Merge multiple lists of lexicographically sorted columns into one list - of sorted columns. `input_columns` is a list of lists of columns to be - merged. 
- """ - c_input_tables = [ - pylibcudf.Table( - [c.to_pylibcudf(mode="read") for c in source_columns] - ) for source_columns in input_columns - ] - - num_keys = len(key_columns_indices) - - column_order = ( - pylibcudf.types.Order.ASCENDING if ascending - else pylibcudf.types.Order.DESCENDING - ) - - if not ascending: - na_position = "last" if na_position == "first" else "first" - null_precedence = ( - pylibcudf.types.NullOrder.BEFORE if na_position == "first" - else pylibcudf.types.NullOrder.AFTER - ) - - return columns_from_pylibcudf_table( - pylibcudf.merge.merge( - c_input_tables, - key_columns_indices, - [column_order] * num_keys, - [null_precedence] * num_keys, - ) - ) diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index a6815da62c6..84c653c5b3f 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -8,7 +8,10 @@ import numpy as np import pandas as pd +import pylibcudf as plc + import cudf +from cudf._lib.column import Column from cudf._lib.transform import one_hot_encode from cudf._lib.types import size_type_dtype from cudf.api.extensions import no_default @@ -941,21 +944,46 @@ def _merge_sorted( idx + objs[0].index.nlevels for idx in key_columns_indices ] - columns = [ - [ - *(obj.index._columns if not ignore_index else ()), - *obj._columns, - ] + columns = ( + itertools.chain(obj.index._columns, obj._columns) + if not ignore_index + else obj._columns for obj in objs + ) + + input_tables = [ + plc.Table([col.to_pylibcudf(mode="read") for col in source_columns]) + for source_columns in columns + ] + + num_keys = len(key_columns_indices) + + column_order = ( + plc.types.Order.ASCENDING if ascending else plc.types.Order.DESCENDING + ) + + if not ascending: + na_position = "last" if na_position == "first" else "first" + + null_precedence = ( + plc.types.NullOrder.BEFORE + if na_position == "first" + else plc.types.NullOrder.AFTER + ) + + plc_table = plc.merge.merge( + input_tables, + key_columns_indices, + [column_order] * num_keys, + [null_precedence] * num_keys, + ) + + result_columns = [ + Column.from_pylibcudf(col) for col in plc_table.columns() ] return objs[0]._from_columns_like_self( - cudf._lib.merge.merge_sorted( - input_columns=columns, - key_columns_indices=key_columns_indices, - ascending=ascending, - na_position=na_position, - ), + result_columns, column_names=objs[0]._column_names, index_names=None if ignore_index else objs[0]._index_names, ) From c0a4c6ca47515ac368b62582ecd2a7af241b0238 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 5 Dec 2024 16:26:59 -0800 Subject: [PATCH 28/78] Move cudf._lib.aggregation to cudf.core._internals (#17516) Contributes to https://github.com/rapidsai/cudf/issues/17317 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/17516 --- python/cudf/cudf/_lib/CMakeLists.txt | 1 - python/cudf/cudf/_lib/aggregation.pyx | 245 --------------- python/cudf/cudf/_lib/groupby.pyx | 2 +- python/cudf/cudf/_lib/reduce.pyx | 2 +- .../cudf/cudf/core/_internals/aggregation.py | 288 ++++++++++++++++++ python/cudf/cudf/core/window/rolling.py | 2 +- 6 files changed, 291 insertions(+), 249 deletions(-) delete mode 100644 python/cudf/cudf/_lib/aggregation.pyx create mode 100644 python/cudf/cudf/core/_internals/aggregation.py diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index 
e3d9a48e2ba..2f05101e8e3 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -13,7 +13,6 @@ # ============================================================================= set(cython_sources - aggregation.pyx column.pyx copying.pyx csv.pyx diff --git a/python/cudf/cudf/_lib/aggregation.pyx b/python/cudf/cudf/_lib/aggregation.pyx deleted file mode 100644 index 3c96b90f0a1..00000000000 --- a/python/cudf/cudf/_lib/aggregation.pyx +++ /dev/null @@ -1,245 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -import pandas as pd -from numba.np import numpy_support - -import pylibcudf - -import cudf -from cudf._lib.types import SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES -from cudf.utils import cudautils - -_agg_name_map = { - "COUNT_VALID": "COUNT", - "COUNT_ALL": "SIZE", - "VARIANCE": "VAR", - "NTH_ELEMENT": "NTH", - "COLLECT_LIST": "COLLECT", - "COLLECT_SET": "UNIQUE", -} - - -class Aggregation: - def __init__(self, agg): - self.c_obj = agg - - @property - def kind(self): - name = self.c_obj.kind().name - return _agg_name_map.get(name, name) - - @classmethod - def sum(cls): - return cls(pylibcudf.aggregation.sum()) - - @classmethod - def min(cls): - return cls(pylibcudf.aggregation.min()) - - @classmethod - def max(cls): - return cls(pylibcudf.aggregation.max()) - - @classmethod - def idxmin(cls): - return cls(pylibcudf.aggregation.argmin()) - - @classmethod - def idxmax(cls): - return cls(pylibcudf.aggregation.argmax()) - - @classmethod - def mean(cls): - return cls(pylibcudf.aggregation.mean()) - - @classmethod - def count(cls, dropna=True): - return cls(pylibcudf.aggregation.count( - pylibcudf.types.NullPolicy.EXCLUDE - if dropna else pylibcudf.types.NullPolicy.INCLUDE - )) - - @classmethod - def ewma(cls, com=1.0, adjust=True): - return cls(pylibcudf.aggregation.ewma( - com, - pylibcudf.aggregation.EWMHistory.INFINITE - if adjust else pylibcudf.aggregation.EWMHistory.FINITE - )) - - @classmethod - def size(cls): - return cls(pylibcudf.aggregation.count(pylibcudf.types.NullPolicy.INCLUDE)) - - @classmethod - def collect(cls): - return cls( - pylibcudf.aggregation.collect_list(pylibcudf.types.NullPolicy.INCLUDE) - ) - - @classmethod - def nunique(cls, dropna=True): - return cls(pylibcudf.aggregation.nunique( - pylibcudf.types.NullPolicy.EXCLUDE - if dropna else pylibcudf.types.NullPolicy.INCLUDE - )) - - @classmethod - def nth(cls, size): - return cls(pylibcudf.aggregation.nth_element(size)) - - @classmethod - def product(cls): - return cls(pylibcudf.aggregation.product()) - prod = product - - @classmethod - def sum_of_squares(cls): - return cls(pylibcudf.aggregation.sum_of_squares()) - - @classmethod - def var(cls, ddof=1): - return cls(pylibcudf.aggregation.variance(ddof)) - - @classmethod - def std(cls, ddof=1): - return cls(pylibcudf.aggregation.std(ddof)) - - @classmethod - def median(cls): - return cls(pylibcudf.aggregation.median()) - - @classmethod - def quantile(cls, q=0.5, interpolation="linear"): - if not pd.api.types.is_list_like(q): - q = [q] - - return cls(pylibcudf.aggregation.quantile( - q, pylibcudf.types.Interpolation[interpolation.upper()] - )) - - @classmethod - def unique(cls): - return cls(pylibcudf.aggregation.collect_set( - pylibcudf.types.NullPolicy.INCLUDE, - pylibcudf.types.NullEquality.EQUAL, - pylibcudf.types.NanEquality.ALL_EQUAL, - - )) - - @classmethod - def first(cls): - return cls( - pylibcudf.aggregation.nth_element(0, pylibcudf.types.NullPolicy.EXCLUDE) - ) - - @classmethod - def last(cls): - return cls( - 
pylibcudf.aggregation.nth_element(-1, pylibcudf.types.NullPolicy.EXCLUDE) - ) - - @classmethod - def corr(cls, method, min_periods): - return cls(pylibcudf.aggregation.correlation( - pylibcudf.aggregation.CorrelationType[method.upper()], - min_periods - - )) - - @classmethod - def cov(cls, min_periods, ddof=1): - return cls(pylibcudf.aggregation.covariance( - min_periods, - ddof - )) - - # scan aggregations - @classmethod - def cumcount(cls): - return cls.count(False) - - cumsum = sum - cummin = min - cummax = max - cumprod = product - - @classmethod - def rank(cls, method, ascending, na_option, pct): - return cls(pylibcudf.aggregation.rank( - pylibcudf.aggregation.RankMethod[method.upper()], - (pylibcudf.types.Order.ASCENDING if ascending else - pylibcudf.types.Order.DESCENDING), - (pylibcudf.types.NullPolicy.EXCLUDE if na_option == "keep" else - pylibcudf.types.NullPolicy.INCLUDE), - (pylibcudf.types.NullOrder.BEFORE - if (na_option == "top") == ascending else - pylibcudf.types.NullOrder.AFTER), - (pylibcudf.aggregation.RankPercentage.ZERO_NORMALIZED - if pct else - pylibcudf.aggregation.RankPercentage.NONE) - - )) - - # Reduce aggregations - @classmethod - def any(cls): - return cls(pylibcudf.aggregation.any()) - - @classmethod - def all(cls): - return cls(pylibcudf.aggregation.all()) - - # Rolling aggregations - @classmethod - def from_udf(cls, op, *args, **kwargs): - # Handling UDF type - nb_type = numpy_support.from_dtype(kwargs['dtype']) - type_signature = (nb_type[:],) - ptx_code, output_dtype = cudautils.compile_udf(op, type_signature) - output_np_dtype = cudf.dtype(output_dtype) - if output_np_dtype not in SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES: - raise TypeError(f"Result of window function has unsupported dtype {op[1]}") - - return cls( - pylibcudf.aggregation.udf( - ptx_code, - pylibcudf.DataType(SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES[output_np_dtype]), - ) - ) - - -def make_aggregation(op, kwargs=None): - r""" - Parameters - ---------- - op : str or callable - If callable, must meet one of the following requirements: - - * Is of the form lambda x: x.agg(*args, **kwargs), where - `agg` is the name of a supported aggregation. Used to - to specify aggregations that take arguments, e.g., - `lambda x: x.quantile(0.5)`. - * Is a user defined aggregation function that operates on - group values. In this case, the output dtype must be - specified in the `kwargs` dictionary. - \*\*kwargs : dict, optional - Any keyword arguments to be passed to the op. - - Returns - ------- - Aggregation - """ - if kwargs is None: - kwargs = {} - - if isinstance(op, str): - return getattr(Aggregation, op)(**kwargs) - elif callable(op): - if op is list: - return Aggregation.collect() - elif "dtype" in kwargs: - return Aggregation.from_udf(op, **kwargs) - else: - return op(Aggregation) - raise TypeError(f"Unknown aggregation {op}") diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index 4e712be6738..80a77ef2267 100644 --- a/python/cudf/cudf/_lib/groupby.pyx +++ b/python/cudf/cudf/_lib/groupby.pyx @@ -20,7 +20,7 @@ from cudf._lib.scalar import as_device_scalar import pylibcudf -from cudf._lib.aggregation import make_aggregation +from cudf.core._internals.aggregation import make_aggregation # The sets below define the possible aggregations that can be performed on # different dtypes. These strings must be elements of the AggregationKind enum. 
diff --git a/python/cudf/cudf/_lib/reduce.pyx b/python/cudf/cudf/_lib/reduce.pyx index 944753d28b8..2850cab93a1 100644 --- a/python/cudf/cudf/_lib/reduce.pyx +++ b/python/cudf/cudf/_lib/reduce.pyx @@ -10,7 +10,7 @@ from cudf._lib.types cimport dtype_to_pylibcudf_type, is_decimal_type_id import pylibcudf -from cudf._lib.aggregation import make_aggregation +from cudf.core._internals.aggregation import make_aggregation @acquire_spill_lock() diff --git a/python/cudf/cudf/core/_internals/aggregation.py b/python/cudf/cudf/core/_internals/aggregation.py new file mode 100644 index 00000000000..fe8ea5a947a --- /dev/null +++ b/python/cudf/cudf/core/_internals/aggregation.py @@ -0,0 +1,288 @@ +# Copyright (c) 2020-2024, NVIDIA CORPORATION. +from __future__ import annotations + +from typing import TYPE_CHECKING, Literal + +from numba.np import numpy_support + +import pylibcudf as plc + +import cudf +from cudf._lib.types import SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES +from cudf.api.types import is_scalar +from cudf.utils import cudautils + +if TYPE_CHECKING: + from collections.abc import Callable + + from typing_extensions import Self + +_agg_name_map = { + "COUNT_VALID": "COUNT", + "COUNT_ALL": "SIZE", + "VARIANCE": "VAR", + "NTH_ELEMENT": "NTH", + "COLLECT_LIST": "COLLECT", + "COLLECT_SET": "UNIQUE", +} + + +class Aggregation: + def __init__(self, agg: plc.aggregation.Aggregation) -> None: + self.c_obj = agg + + @property + def kind(self) -> str: + name = self.c_obj.kind().name + return _agg_name_map.get(name, name) + + @classmethod + def sum(cls) -> Self: + return cls(plc.aggregation.sum()) + + @classmethod + def min(cls) -> Self: + return cls(plc.aggregation.min()) + + @classmethod + def max(cls) -> Self: + return cls(plc.aggregation.max()) + + @classmethod + def idxmin(cls) -> Self: + return cls(plc.aggregation.argmin()) + + @classmethod + def idxmax(cls) -> Self: + return cls(plc.aggregation.argmax()) + + @classmethod + def mean(cls) -> Self: + return cls(plc.aggregation.mean()) + + @classmethod + def count(cls, dropna: bool = True) -> Self: + return cls( + plc.aggregation.count( + plc.types.NullPolicy.EXCLUDE + if dropna + else plc.types.NullPolicy.INCLUDE + ) + ) + + @classmethod + def ewma(cls, com: float = 1.0, adjust: bool = True) -> Self: + return cls( + plc.aggregation.ewma( + com, + plc.aggregation.EWMHistory.INFINITE + if adjust + else plc.aggregation.EWMHistory.FINITE, + ) + ) + + @classmethod + def size(cls) -> Self: + return cls(plc.aggregation.count(plc.types.NullPolicy.INCLUDE)) + + @classmethod + def collect(cls) -> Self: + return cls(plc.aggregation.collect_list(plc.types.NullPolicy.INCLUDE)) + + @classmethod + def nunique(cls, dropna: bool = True) -> Self: + return cls( + plc.aggregation.nunique( + plc.types.NullPolicy.EXCLUDE + if dropna + else plc.types.NullPolicy.INCLUDE + ) + ) + + @classmethod + def nth(cls, size: int) -> Self: + return cls(plc.aggregation.nth_element(size)) + + @classmethod + def product(cls) -> Self: + return cls(plc.aggregation.product()) + + prod = product + + @classmethod + def sum_of_squares(cls) -> Self: + return cls(plc.aggregation.sum_of_squares()) + + @classmethod + def var(cls, ddof: int = 1) -> Self: + return cls(plc.aggregation.variance(ddof)) + + @classmethod + def std(cls, ddof: int = 1) -> Self: + return cls(plc.aggregation.std(ddof)) + + @classmethod + def median(cls) -> Self: + return cls(plc.aggregation.median()) + + @classmethod + def quantile( + cls, + q: float | list[float] = 0.5, + interpolation: Literal[ + "linear", "lower", "higher", 
"midpoint", "nearest" + ] = "linear", + ) -> Self: + return cls( + plc.aggregation.quantile( + [q] if is_scalar(q) else q, + plc.types.Interpolation[interpolation.upper()], + ) + ) + + @classmethod + def unique(cls) -> Self: + return cls( + plc.aggregation.collect_set( + plc.types.NullPolicy.INCLUDE, + plc.types.NullEquality.EQUAL, + plc.types.NanEquality.ALL_EQUAL, + ) + ) + + @classmethod + def first(cls) -> Self: + return cls( + plc.aggregation.nth_element(0, plc.types.NullPolicy.EXCLUDE) + ) + + @classmethod + def last(cls) -> Self: + return cls( + plc.aggregation.nth_element(-1, plc.types.NullPolicy.EXCLUDE) + ) + + @classmethod + def corr(cls, method, min_periods) -> Self: + return cls( + plc.aggregation.correlation( + plc.aggregation.CorrelationType[method.upper()], min_periods + ) + ) + + @classmethod + def cov(cls, min_periods: int, ddof: int = 1) -> Self: + return cls(plc.aggregation.covariance(min_periods, ddof)) + + # scan aggregations + @classmethod + def cumcount(cls) -> Self: + return cls.count(False) + + cumsum = sum + cummin = min + cummax = max + cumprod = product + + @classmethod + def rank( + cls, + method: Literal["first", "average", "min", "max", "dense"], + ascending: bool, + na_option: Literal["keep", "top", "bottom"], + pct: bool, + ) -> Self: + return cls( + plc.aggregation.rank( + plc.aggregation.RankMethod[method.upper()], + ( + plc.types.Order.ASCENDING + if ascending + else plc.types.Order.DESCENDING + ), + ( + plc.types.NullPolicy.EXCLUDE + if na_option == "keep" + else plc.types.NullPolicy.INCLUDE + ), + ( + plc.types.NullOrder.BEFORE + if (na_option == "top") == ascending + else plc.types.NullOrder.AFTER + ), + ( + plc.aggregation.RankPercentage.ZERO_NORMALIZED + if pct + else plc.aggregation.RankPercentage.NONE + ), + ) + ) + + # Reduce aggregations + @classmethod + def any(cls) -> Self: + return cls(plc.aggregation.any()) + + @classmethod + def all(cls) -> Self: + return cls(plc.aggregation.all()) + + # Rolling aggregations + @classmethod + def from_udf(cls, op, *args, **kwargs) -> Self: + # Handling UDF type + nb_type = numpy_support.from_dtype(kwargs["dtype"]) + type_signature = (nb_type[:],) + ptx_code, output_dtype = cudautils.compile_udf(op, type_signature) + output_np_dtype = cudf.dtype(output_dtype) + if output_np_dtype not in SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES: + raise TypeError( + f"Result of window function has unsupported dtype {op[1]}" + ) + + return cls( + plc.aggregation.udf( + ptx_code, + plc.DataType( + SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES[output_np_dtype] + ), + ) + ) + + +def make_aggregation( + op: str | Callable, kwargs: dict | None = None +) -> Aggregation: + r""" + Parameters + ---------- + op : str or callable + If callable, must meet one of the following requirements: + + * Is of the form lambda x: x.agg(*args, **kwargs), where + `agg` is the name of a supported aggregation. Used to + to specify aggregations that take arguments, e.g., + `lambda x: x.quantile(0.5)`. + * Is a user defined aggregation function that operates on + group values. In this case, the output dtype must be + specified in the `kwargs` dictionary. + \*\*kwargs : dict, optional + Any keyword arguments to be passed to the op. 
+ + Returns + ------- + Aggregation + """ + if kwargs is None: + kwargs = {} + + if isinstance(op, str): + return getattr(Aggregation, op)(**kwargs) + elif callable(op): + if op is list: + return Aggregation.collect() + elif "dtype" in kwargs: + return Aggregation.from_udf(op, **kwargs) + else: + return op(Aggregation) + raise TypeError(f"Unknown aggregation {op}") diff --git a/python/cudf/cudf/core/window/rolling.py b/python/cudf/cudf/core/window/rolling.py index d2cb5e8c190..a580c35ccbf 100644 --- a/python/cudf/cudf/core/window/rolling.py +++ b/python/cudf/cudf/core/window/rolling.py @@ -12,8 +12,8 @@ import cudf from cudf import _lib as libcudf -from cudf._lib.aggregation import make_aggregation from cudf.api.types import is_integer, is_number +from cudf.core._internals.aggregation import make_aggregation from cudf.core.buffer import acquire_spill_lock from cudf.core.column.column import as_column from cudf.core.mixins import Reducible From 84690b5fe5f995937214552826d3541041cb37ab Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Thu, 5 Dec 2024 19:35:53 -0500 Subject: [PATCH 29/78] Migrate copy_column and Column.from_scalar to pylibcudf (#17513) Apart of #15162 Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/17513 --- python/cudf/cudf/_lib/column.pyx | 14 ++++++-------- python/cudf/cudf/_lib/copying.pyx | 15 +++------------ 2 files changed, 9 insertions(+), 20 deletions(-) diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 9cbe11d61ac..245a5d03981 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -42,8 +42,7 @@ cimport pylibcudf.libcudf.types as libcudf_types cimport pylibcudf.libcudf.unary as libcudf_unary from pylibcudf.libcudf.column.column cimport column, column_contents from pylibcudf.libcudf.column.column_factories cimport ( - make_column_from_scalar as cpp_make_column_from_scalar, - make_numeric_column, + make_numeric_column ) from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.null_mask cimport null_count as cpp_null_count @@ -840,9 +839,8 @@ cdef class Column: @staticmethod def from_scalar(py_val, size_type size): - cdef DeviceScalar val = py_val.device_value - cdef const scalar* c_val = val.get_raw_ptr() - cdef unique_ptr[column] c_result - with nogil: - c_result = move(cpp_make_column_from_scalar(c_val[0], size)) - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf( + pylibcudf.Column.from_scalar( + py_val.device_value.c_value, size + ) + ) diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx index 4dfb12d8ab3..1f3f03f4be1 100644 --- a/python/cudf/cudf/_lib/copying.pyx +++ b/python/cudf/cudf/_lib/copying.pyx @@ -3,8 +3,6 @@ import pickle from libcpp cimport bool -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move import pylibcudf import cudf @@ -18,10 +16,6 @@ from cudf._lib.scalar cimport DeviceScalar from cudf._lib.reduce import minmax -from libcpp.memory cimport make_unique - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.types cimport size_type from cudf._lib.utils cimport columns_from_pylibcudf_table, data_from_pylibcudf_table @@ -59,12 +53,9 @@ def copy_column(Column input_column): ------- Deep copied column """ - cdef unique_ptr[column] c_result 
- cdef column_view input_column_view = input_column.view() - with nogil: - c_result = move(make_unique[column](input_column_view)) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf( + input_column.to_pylibcudf(mode="read").copy() + ) @acquire_spill_lock() From 169a45a751862cccaf9898d6d83eb695c4d7b9bf Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Fri, 6 Dec 2024 07:38:31 -0500 Subject: [PATCH 30/78] Plumb pylibcudf.io.parquet options classes through cudf python (#17506) Apart of #15162 Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/17506 --- python/cudf/cudf/_lib/parquet.pyx | 266 ++++++++------------ python/pylibcudf/pylibcudf/io/parquet.pxd | 47 ++++ python/pylibcudf/pylibcudf/io/parquet.pyi | 30 +++ python/pylibcudf/pylibcudf/io/parquet.pyx | 289 +++++++++++++++++++++- 4 files changed, 464 insertions(+), 168 deletions(-) diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index 6c80120ad6e..c77c9875342 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -3,7 +3,7 @@ import io import pyarrow as pa - +import itertools import cudf from cudf.core.buffer import acquire_spill_lock @@ -22,45 +22,31 @@ from cudf._lib.utils import _index_level_name, generate_pandas_metadata from libc.stdint cimport int64_t, uint8_t from libcpp cimport bool -from libcpp.map cimport map from libcpp.memory cimport unique_ptr -from libcpp.string cimport string from libcpp.utility cimport move from libcpp.vector cimport vector from pylibcudf.expressions cimport Expression from pylibcudf.io.parquet cimport ChunkedParquetReader -from pylibcudf.libcudf.io.data_sink cimport data_sink -from pylibcudf.libcudf.io.parquet cimport ( - chunked_parquet_writer_options, - parquet_chunked_writer as cpp_parquet_chunked_writer, - parquet_writer_options, - write_parquet as parquet_writer, -) from pylibcudf.libcudf.io.types cimport ( - sink_info, - column_in_metadata, - table_input_metadata, - partition_info, statistics_freq, compression_type, dictionary_policy, ) -from pylibcudf.libcudf.table.table_view cimport table_view from pylibcudf.libcudf.types cimport size_type from cudf._lib.column cimport Column from cudf._lib.io.utils cimport ( add_df_col_struct_names, - make_sinks_info, ) -from cudf._lib.utils cimport table_view_from_table import pylibcudf as plc from pylibcudf cimport Table from cudf.utils.ioutils import _ROW_GROUP_SIZE_BYTES_DEFAULT +from pylibcudf.io.types cimport TableInputMetadata, SinkInfo, ColumnInMetadata +from pylibcudf.io.parquet cimport ParquetChunkedWriter from cython.operator cimport dereference @@ -440,44 +426,34 @@ def write_parquet( -------- cudf.io.parquet.write_parquet """ - - # Create the write options - cdef table_input_metadata tbl_meta - - cdef vector[map[string, string]] user_data - cdef table_view tv - cdef vector[unique_ptr[data_sink]] _data_sinks - cdef sink_info sink = make_sinks_info( - filepaths_or_buffers, _data_sinks - ) - if index is True or ( index is None and not isinstance(table._index, cudf.RangeIndex) ): - tv = table_view_from_table(table) - tbl_meta = table_input_metadata(tv) + columns = [*table.index._columns, *table._columns] + plc_table = plc.Table([col.to_pylibcudf(mode="read") for col in columns]) + tbl_meta = TableInputMetadata(plc_table) for level, idx_name in enumerate(table._index.names): 
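+            # Record each pandas index level's name in the parquet column metadata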
tbl_meta.column_metadata[level].set_name( - str.encode( - _index_level_name(idx_name, level, table._column_names) - ) + _index_level_name(idx_name, level, table._column_names) ) num_index_cols_meta = len(table._index.names) else: - tv = table_view_from_table(table, ignore_index=True) - tbl_meta = table_input_metadata(tv) + plc_table = plc.Table( + [col.to_pylibcudf(mode="read") for col in table._columns] + ) + tbl_meta = TableInputMetadata(plc_table) num_index_cols_meta = 0 for i, name in enumerate(table._column_names, num_index_cols_meta): if not isinstance(name, str): if cudf.get_option("mode.pandas_compatible"): - tbl_meta.column_metadata[i].set_name(str(name).encode()) + tbl_meta.column_metadata[i].set_name(str(name)) else: raise ValueError( "Writing a Parquet file requires string column names" ) else: - tbl_meta.column_metadata[i].set_name(name.encode()) + tbl_meta.column_metadata[i].set_name(name) _set_col_metadata( table[name]._column, @@ -489,21 +465,16 @@ def write_parquet( column_type_length, output_as_binary ) - - cdef map[string, string] tmp_user_data if partitions_info is not None: - for start_row, num_row in partitions_info: - partitioned_df = table.iloc[start_row: start_row + num_row].copy( - deep=False - ) - pandas_metadata = generate_pandas_metadata(partitioned_df, index) - tmp_user_data[str.encode("pandas")] = str.encode(pandas_metadata) - user_data.push_back(tmp_user_data) - tmp_user_data.clear() + user_data = [ + {"pandas": generate_pandas_metadata( + table.iloc[start_row:start_row + num_row].copy(deep=False), + index + )} + for start_row, num_row in partitions_info + ] else: - pandas_metadata = generate_pandas_metadata(table, index) - tmp_user_data[str.encode("pandas")] = str.encode(pandas_metadata) - user_data.push_back(tmp_user_data) + user_data = [{"pandas": generate_pandas_metadata(table, index)}] if header_version not in ("1.0", "2.0"): raise ValueError( @@ -519,20 +490,15 @@ def write_parquet( comp_type = _get_comp_type(compression) stat_freq = _get_stat_freq(statistics) - - cdef unique_ptr[vector[uint8_t]] out_metadata_c - cdef vector[string] c_column_chunks_file_paths - cdef bool _int96_timestamps = int96_timestamps - cdef vector[partition_info] partitions - - # Perform write - cdef parquet_writer_options args = move( - parquet_writer_options.builder(sink, tv) + options = ( + plc.io.parquet.ParquetWriterOptions.builder( + plc.io.SinkInfo(filepaths_or_buffers), plc_table + ) .metadata(tbl_meta) - .key_value_metadata(move(user_data)) + .key_value_metadata(user_data) .compression(comp_type) .stats_level(stat_freq) - .int96_timestamps(_int96_timestamps) + .int96_timestamps(int96_timestamps) .write_v2_headers(header_version == "2.0") .dictionary_policy(dict_policy) .utc_timestamps(False) @@ -540,40 +506,27 @@ def write_parquet( .build() ) if partitions_info is not None: - partitions.reserve(len(partitions_info)) - for part in partitions_info: - partitions.push_back( - partition_info(part[0], part[1]) - ) - args.set_partitions(move(partitions)) + options.set_partitions( + [plc.io.types.PartitionInfo(part[0], part[1]) for part in partitions_info] + ) if metadata_file_path is not None: if is_list_like(metadata_file_path): - for path in metadata_file_path: - c_column_chunks_file_paths.push_back(str.encode(path)) + options.set_column_chunks_file_paths(metadata_file_path) else: - c_column_chunks_file_paths.push_back( - str.encode(metadata_file_path) - ) - args.set_column_chunks_file_paths(move(c_column_chunks_file_paths)) + 
options.set_column_chunks_file_paths([metadata_file_path]) if row_group_size_bytes is not None: - args.set_row_group_size_bytes(row_group_size_bytes) + options.set_row_group_size_bytes(row_group_size_bytes) if row_group_size_rows is not None: - args.set_row_group_size_rows(row_group_size_rows) + options.set_row_group_size_rows(row_group_size_rows) if max_page_size_bytes is not None: - args.set_max_page_size_bytes(max_page_size_bytes) + options.set_max_page_size_bytes(max_page_size_bytes) if max_page_size_rows is not None: - args.set_max_page_size_rows(max_page_size_rows) + options.set_max_page_size_rows(max_page_size_rows) if max_dictionary_size is not None: - args.set_max_dictionary_size(max_dictionary_size) - - with nogil: - out_metadata_c = move(parquet_writer(args)) - + options.set_max_dictionary_size(max_dictionary_size) + blob = plc.io.parquet.write_parquet(options) if metadata_file_path is not None: - out_metadata_py = BufferArrayFromVector.from_unique_ptr( - move(out_metadata_c) - ) - return np.asarray(out_metadata_py) + return np.asarray(blob.obj) else: return None @@ -624,10 +577,9 @@ cdef class ParquetWriter: cudf.io.parquet.write_parquet """ cdef bool initialized - cdef unique_ptr[cpp_parquet_chunked_writer] writer - cdef table_input_metadata tbl_meta - cdef sink_info sink - cdef vector[unique_ptr[data_sink]] _data_sink + cdef ParquetChunkedWriter writer + cdef SinkInfo sink + cdef TableInputMetadata tbl_meta cdef str statistics cdef object compression cdef object index @@ -653,7 +605,7 @@ cdef class ParquetWriter: if is_list_like(filepath_or_buffer) else [filepath_or_buffer] ) - self.sink = make_sinks_info(filepaths_or_buffers, self._data_sink) + self.sink = plc.io.SinkInfo(filepaths_or_buffers) self.statistics = statistics self.compression = compression self.index = index @@ -673,52 +625,29 @@ cdef class ParquetWriter: table, num_partitions=len(partitions_info) if partitions_info else 1 ) - - cdef table_view tv if self.index is not False and ( table._index.name is not None or isinstance(table._index, cudf.core.multiindex.MultiIndex)): - tv = table_view_from_table(table) + columns = [*table.index._columns, *table._columns] + plc_table = plc.Table([col.to_pylibcudf(mode="read") for col in columns]) else: - tv = table_view_from_table(table, ignore_index=True) - - cdef vector[partition_info] partitions - if partitions_info is not None: - for part in partitions_info: - partitions.push_back( - partition_info(part[0], part[1]) - ) - - with nogil: - self.writer.get()[0].write(tv, partitions) + plc_table = plc.Table( + [col.to_pylibcudf(mode="read") for col in table._columns] + ) + self.writer.write(plc_table, partitions_info) def close(self, object metadata_file_path=None): - cdef unique_ptr[vector[uint8_t]] out_metadata_c - cdef vector[string] column_chunks_file_paths - if not self.initialized: return None - - # Update metadata-collection options + column_chunks_file_paths=[] if metadata_file_path is not None: if is_list_like(metadata_file_path): - for path in metadata_file_path: - column_chunks_file_paths.push_back(str.encode(path)) + column_chunks_file_paths = list(metadata_file_path) else: - column_chunks_file_paths.push_back( - str.encode(metadata_file_path) - ) - - with nogil: - out_metadata_c = move( - self.writer.get()[0].close(column_chunks_file_paths) - ) - + column_chunks_file_paths = [metadata_file_path] + blob = self.writer.close(column_chunks_file_paths) if metadata_file_path is not None: - out_metadata_py = BufferArrayFromVector.from_unique_ptr( - move(out_metadata_c) 
- ) - return np.asarray(out_metadata_py) + return np.asarray(blob.obj) return None def __enter__(self): @@ -730,32 +659,44 @@ cdef class ParquetWriter: def _initialize_chunked_state(self, table, num_partitions=1): """ Prepares all the values required to build the chunked_parquet_writer_options and creates a writer""" - cdef table_view tv # Set the table_metadata num_index_cols_meta = 0 - self.tbl_meta = table_input_metadata( - table_view_from_table(table, ignore_index=True)) + plc_table = plc.Table( + [ + col.to_pylibcudf(mode="read") + for col in table._columns + ] + ) + self.tbl_meta = TableInputMetadata(plc_table) if self.index is not False: if isinstance(table._index, cudf.core.multiindex.MultiIndex): - tv = table_view_from_table(table) - self.tbl_meta = table_input_metadata(tv) + plc_table = plc.Table( + [ + col.to_pylibcudf(mode="read") + for col in itertools.chain(table.index._columns, table._columns) + ] + ) + self.tbl_meta = TableInputMetadata(plc_table) for level, idx_name in enumerate(table._index.names): - self.tbl_meta.column_metadata[level].set_name( - (str.encode(idx_name)) - ) + self.tbl_meta.column_metadata[level].set_name(idx_name) num_index_cols_meta = len(table._index.names) else: if table._index.name is not None: - tv = table_view_from_table(table) - self.tbl_meta = table_input_metadata(tv) - self.tbl_meta.column_metadata[0].set_name( - str.encode(table._index.name) + plc_table = plc.Table( + [ + col.to_pylibcudf(mode="read") + for col in itertools.chain( + table.index._columns, table._columns + ) + ] ) + self.tbl_meta = TableInputMetadata(plc_table) + self.tbl_meta.column_metadata[0].set_name(table._index.name) num_index_cols_meta = 1 for i, name in enumerate(table._column_names, num_index_cols_meta): - self.tbl_meta.column_metadata[i].set_name(name.encode()) + self.tbl_meta.column_metadata[i].set_name(name) _set_col_metadata( table[name]._column, self.tbl_meta.column_metadata[i], @@ -764,13 +705,7 @@ cdef class ParquetWriter: index = ( False if isinstance(table._index, cudf.RangeIndex) else self.index ) - pandas_metadata = generate_pandas_metadata(table, index) - cdef map[string, string] tmp_user_data - tmp_user_data[str.encode("pandas")] = str.encode(pandas_metadata) - cdef vector[map[string, string]] user_data - user_data = vector[map[string, string]](num_partitions, tmp_user_data) - - cdef chunked_parquet_writer_options args + user_data = [{"pandas" : generate_pandas_metadata(table, index)}]*num_partitions cdef compression_type comp_type = _get_comp_type(self.compression) cdef statistics_freq stat_freq = _get_stat_freq(self.statistics) cdef dictionary_policy dict_policy = ( @@ -778,23 +713,22 @@ cdef class ParquetWriter: if self.use_dictionary else plc.io.types.DictionaryPolicy.NEVER ) - with nogil: - args = move( - chunked_parquet_writer_options.builder(self.sink) - .metadata(self.tbl_meta) - .key_value_metadata(move(user_data)) - .compression(comp_type) - .stats_level(stat_freq) - .row_group_size_bytes(self.row_group_size_bytes) - .row_group_size_rows(self.row_group_size_rows) - .max_page_size_bytes(self.max_page_size_bytes) - .max_page_size_rows(self.max_page_size_rows) - .max_dictionary_size(self.max_dictionary_size) - .write_arrow_schema(self.write_arrow_schema) - .build() - ) - args.set_dictionary_policy(dict_policy) - self.writer.reset(new cpp_parquet_chunked_writer(args)) + options = ( + plc.io.parquet.ChunkedParquetWriterOptions.builder(self.sink) + .metadata(self.tbl_meta) + .key_value_metadata(user_data) + .compression(comp_type) + 
.stats_level(stat_freq) + .row_group_size_bytes(self.row_group_size_bytes) + .row_group_size_rows(self.row_group_size_rows) + .max_page_size_bytes(self.max_page_size_bytes) + .max_page_size_rows(self.max_page_size_rows) + .max_dictionary_size(self.max_dictionary_size) + .write_arrow_schema(self.write_arrow_schema) + .build() + ) + options.set_dictionary_policy(dict_policy) + self.writer = plc.io.parquet.ParquetChunkedWriter.from_options(options) self.initialized = True @@ -837,7 +771,7 @@ cdef compression_type _get_comp_type(object compression): cdef _set_col_metadata( Column col, - column_in_metadata& col_meta, + ColumnInMetadata col_meta, bool force_nullable_schema=False, str path=None, object skip_compression=None, @@ -847,7 +781,7 @@ cdef _set_col_metadata( ): need_path = (skip_compression is not None or column_encoding is not None or column_type_length is not None or output_as_binary is not None) - name = col_meta.get_name().decode('UTF-8') if need_path else None + name = col_meta.get_name() if need_path else None full_path = path + "." + name if path is not None else name if force_nullable_schema: @@ -880,7 +814,7 @@ cdef _set_col_metadata( for i, (child_col, name) in enumerate( zip(col.children, list(col.dtype.fields)) ): - col_meta.child(i).set_name(name.encode()) + col_meta.child(i).set_name(name) _set_col_metadata( child_col, col_meta.child(i), @@ -894,7 +828,7 @@ cdef _set_col_metadata( elif isinstance(col.dtype, cudf.ListDtype): if full_path is not None: full_path = full_path + ".list" - col_meta.child(1).set_name("element".encode()) + col_meta.child(1).set_name("element") _set_col_metadata( col.children[1], col_meta.child(1), diff --git a/python/pylibcudf/pylibcudf/io/parquet.pxd b/python/pylibcudf/pylibcudf/io/parquet.pxd index 79080fa7243..7bd6ba91ca9 100644 --- a/python/pylibcudf/pylibcudf/io/parquet.pxd +++ b/python/pylibcudf/pylibcudf/io/parquet.pxd @@ -15,9 +15,12 @@ from pylibcudf.io.types cimport ( TableWithMetadata, ) from pylibcudf.libcudf.io.parquet cimport ( + parquet_chunked_writer as cpp_parquet_chunked_writer, chunked_parquet_reader as cpp_chunked_parquet_reader, parquet_writer_options, parquet_writer_options_builder, + chunked_parquet_writer_options, + chunked_parquet_writer_options_builder, ) from pylibcudf.libcudf.types cimport size_type from pylibcudf.table cimport Table @@ -46,6 +49,50 @@ cpdef read_parquet( # DataType timestamp_type = * ) + +cdef class ParquetChunkedWriter: + cdef unique_ptr[cpp_parquet_chunked_writer] c_obj + cpdef memoryview close(self, list column_chunks_file_paths) + cpdef void write(self, Table table, object partitions_info=*) + + +cdef class ChunkedParquetWriterOptions: + cdef chunked_parquet_writer_options c_obj + cdef SinkInfo sink + + cpdef void set_dictionary_policy(self, dictionary_policy policy) + + +cdef class ChunkedParquetWriterOptionsBuilder: + cdef chunked_parquet_writer_options_builder c_obj + cdef SinkInfo sink + + cpdef ChunkedParquetWriterOptionsBuilder metadata(self, TableInputMetadata metadata) + + cpdef ChunkedParquetWriterOptionsBuilder key_value_metadata(self, list metadata) + + cpdef ChunkedParquetWriterOptionsBuilder compression( + self, + compression_type compression + ) + + cpdef ChunkedParquetWriterOptionsBuilder stats_level(self, statistics_freq sf) + + cpdef ChunkedParquetWriterOptionsBuilder row_group_size_bytes(self, size_t val) + + cpdef ChunkedParquetWriterOptionsBuilder row_group_size_rows(self, size_type val) + + cpdef ChunkedParquetWriterOptionsBuilder max_page_size_bytes(self, size_t val) + + cpdef 
ChunkedParquetWriterOptionsBuilder max_page_size_rows(self, size_type val)
+
+    cpdef ChunkedParquetWriterOptionsBuilder max_dictionary_size(self, size_t val)
+
+    cpdef ChunkedParquetWriterOptionsBuilder write_arrow_schema(self, bool enabled)
+
+    cpdef ChunkedParquetWriterOptions build(self)
+
+
 cdef class ParquetWriterOptions:
     cdef parquet_writer_options c_obj
     cdef Table table_ref
diff --git a/python/pylibcudf/pylibcudf/io/parquet.pyi b/python/pylibcudf/pylibcudf/io/parquet.pyi
index 3eb3d7c3a92..22bea1abd8e 100644
--- a/python/pylibcudf/pylibcudf/io/parquet.pyi
+++ b/python/pylibcudf/pylibcudf/io/parquet.pyi
@@ -78,4 +78,34 @@ class ParquetWriterOptionsBuilder:
     def build(self) -> ParquetWriterOptions: ...

 def write_parquet(options: ParquetWriterOptions) -> memoryview: ...
+
+class ParquetChunkedWriter:
+    def __init__(self): ...
+    def close(self, metadata_file_path: list) -> memoryview: ...
+    def write(self, table: Table) -> None: ...
+    @staticmethod
+    def from_options(options: ChunkedParquetWriterOptions) -> Self: ...
+
+class ChunkedParquetWriterOptions:
+    def __init__(self): ...
+    def set_dictionary_policy(self, policy: DictionaryPolicy) -> None: ...
+    @staticmethod
+    def builder(sink: SinkInfo) -> ChunkedParquetWriterOptionsBuilder: ...
+
+class ChunkedParquetWriterOptionsBuilder:
+    def __init__(self): ...
+    def metadata(self, metadata: TableInputMetadata) -> Self: ...
+    def key_value_metadata(
+        self, metadata: list[Mapping[str, str]]
+    ) -> Self: ...
+    def compression(self, compression: CompressionType) -> Self: ...
+    def stats_level(self, sf: StatisticsFreq) -> Self: ...
+    def row_group_size_bytes(self, val: int) -> Self: ...
+    def row_group_size_rows(self, val: int) -> Self: ...
+    def max_page_size_bytes(self, val: int) -> Self: ...
+    def max_page_size_rows(self, val: int) -> Self: ...
+    def max_dictionary_size(self, val: int) -> Self: ...
+    def write_arrow_schema(self, enabled: bool) -> Self: ...
+    def build(self) -> ChunkedParquetWriterOptions: ...
+
 def merge_row_group_metadata(metadata_list: list) -> memoryview: ...
diff --git a/python/pylibcudf/pylibcudf/io/parquet.pyx b/python/pylibcudf/pylibcudf/io/parquet.pyx
index 93843c932ad..9bdf849a30c 100644
--- a/python/pylibcudf/pylibcudf/io/parquet.pyx
+++ b/python/pylibcudf/pylibcudf/io/parquet.pyx
@@ -22,6 +22,8 @@ from pylibcudf.libcudf.io.parquet cimport (
     read_parquet as cpp_read_parquet,
     write_parquet as cpp_write_parquet,
     parquet_writer_options,
+    parquet_chunked_writer as cpp_parquet_chunked_writer,
+    chunked_parquet_writer_options,
     merge_row_group_metadata as cpp_merge_row_group_metadata,
 )
 from pylibcudf.libcudf.io.types cimport (
@@ -40,6 +42,8 @@ __all__ = [
     "ParquetWriterOptionsBuilder",
     "read_parquet",
     "write_parquet",
+    "ChunkedParquetWriterOptions",
+    "ChunkedParquetWriterOptionsBuilder",
     "merge_row_group_metadata",
 ]

@@ -247,6 +251,288 @@ cpdef read_parquet(
     return TableWithMetadata.from_libcudf(c_result)


+cdef class ParquetChunkedWriter:
+    cpdef memoryview close(self, list metadata_file_path):
+        """
+        Closes the chunked Parquet writer.
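+
+        Call this once, after the final ``write``; the writer accepts no
+        further tables afterwards.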
+ + Parameters + ---------- + metadata_file_path: list + Column chunks file path to be set in the raw output metadata + + Returns + ------- + None + """ + cdef vector[string] column_chunks_file_paths + cdef unique_ptr[vector[uint8_t]] out_metadata_c + if metadata_file_path: + for path in metadata_file_path: + column_chunks_file_paths.push_back(path.encode()) + with nogil: + out_metadata_c = move(self.c_obj.get()[0].close(column_chunks_file_paths)) + return memoryview(HostBuffer.from_unique_ptr(move(out_metadata_c))) + + cpdef void write(self, Table table, object partitions_info=None): + """ + Writes table to output. + + Parameters + ---------- + table: Table + Table that needs to be written + partitions_info: object, default None + Optional partitions to divide the table into. + If specified, must be same size as number of sinks. + + Returns + ------- + None + """ + if partitions_info is None: + with nogil: + self.c_obj.get()[0].write(table.view()) + return + cdef vector[partition_info] partitions + for part in partitions_info: + partitions.push_back( + partition_info(part[0], part[1]) + ) + with nogil: + self.c_obj.get()[0].write(table.view(), partitions) + + @staticmethod + def from_options(ChunkedParquetWriterOptions options): + """ + Creates a chunked Parquet writer from options + + Parameters + ---------- + options: ChunkedParquetWriterOptions + Settings for controlling writing behavior + + Returns + ------- + ParquetChunkedWriter + """ + cdef ParquetChunkedWriter parquet_writer = ParquetChunkedWriter.__new__( + ParquetChunkedWriter + ) + parquet_writer.c_obj.reset(new cpp_parquet_chunked_writer(options.c_obj)) + return parquet_writer + + +cdef class ChunkedParquetWriterOptions: + @staticmethod + def builder(SinkInfo sink): + """ + Create builder to create ChunkedParquetWriterOptions. + + Parameters + ---------- + sink: SinkInfo + The sink used for writer output + + Returns + ------- + ChunkedParquetWriterOptionsBuilder + """ + cdef ChunkedParquetWriterOptionsBuilder parquet_builder = ( + ChunkedParquetWriterOptionsBuilder.__new__( + ChunkedParquetWriterOptionsBuilder + ) + ) + parquet_builder.c_obj = chunked_parquet_writer_options.builder(sink.c_obj) + parquet_builder.sink = sink + return parquet_builder + + cpdef void set_dictionary_policy(self, dictionary_policy_t policy): + """ + Sets the policy for dictionary use. + + Parameters + ---------- + policy : DictionaryPolicy + Policy for dictionary use + + Returns + ------- + None + """ + self.c_obj.set_dictionary_policy(policy) + + +cdef class ChunkedParquetWriterOptionsBuilder: + cpdef ChunkedParquetWriterOptionsBuilder metadata( + self, + TableInputMetadata metadata + ): + self.c_obj.metadata(metadata.c_obj) + return self + + cpdef ChunkedParquetWriterOptionsBuilder key_value_metadata(self, list metadata): + """ + Sets Key-Value footer metadata. + + Parameters + ---------- + metadata : list[dict[str, str]] + Key-Value footer metadata + + Returns + ------- + Self + """ + self.c_obj.key_value_metadata( + [ + {key.encode(): value.encode() for key, value in mapping.items()} + for mapping in metadata + ] + ) + return self + + cpdef ChunkedParquetWriterOptionsBuilder compression( + self, + compression_type compression + ): + """ + Sets compression type. 
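+
+        Typically chained on the builder, e.g.
+        ``.compression(CompressionType.SNAPPY)`` (illustrative; any
+        supported ``CompressionType`` value works).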
+ + Parameters + ---------- + compression : CompressionType + The compression type to use + + Returns + ------- + Self + """ + self.c_obj.compression(compression) + return self + + cpdef ChunkedParquetWriterOptionsBuilder stats_level(self, statistics_freq sf): + """ + Sets the level of statistics. + + Parameters + ---------- + sf : StatisticsFreq + Level of statistics requested in the output file + + Returns + ------- + Self + """ + self.c_obj.stats_level(sf) + return self + + cpdef ChunkedParquetWriterOptionsBuilder row_group_size_bytes(self, size_t val): + """ + Sets the maximum row group size, in bytes. + + Parameters + ---------- + val : size_t + Maximum row group size, in bytes to set + + Returns + ------- + Self + """ + self.c_obj.row_group_size_bytes(val) + return self + + cpdef ChunkedParquetWriterOptionsBuilder row_group_size_rows(self, size_type val): + """ + Sets the maximum row group size, in rows. + + Parameters + ---------- + val : size_type + Maximum row group size, in rows to set + + Returns + ------- + Self + """ + self.c_obj.row_group_size_rows(val) + return self + + cpdef ChunkedParquetWriterOptionsBuilder max_page_size_bytes(self, size_t val): + """ + Sets the maximum uncompressed page size, in bytes. + + Parameters + ---------- + val : size_t + Maximum uncompressed page size, in bytes to set + + Returns + ------- + Self + """ + self.c_obj.max_page_size_bytes(val) + return self + + cpdef ChunkedParquetWriterOptionsBuilder max_page_size_rows(self, size_type val): + """ + Sets the maximum page size, in rows. + + Parameters + ---------- + val : size_type + Maximum page size, in rows to set. + + Returns + ------- + Self + """ + self.c_obj.max_page_size_rows(val) + return self + + cpdef ChunkedParquetWriterOptionsBuilder max_dictionary_size(self, size_t val): + """ + Sets the maximum dictionary size, in bytes. + + Parameters + ---------- + val : size_t + Sets the maximum dictionary size, in bytes. + + Returns + ------- + Self + """ + self.c_obj.max_dictionary_size(val) + return self + + cpdef ChunkedParquetWriterOptionsBuilder write_arrow_schema(self, bool enabled): + """ + Set to true if arrow schema is to be written. + + Parameters + ---------- + enabled : bool + Boolean value to enable/disable writing of arrow schema. + + Returns + ------- + Self + """ + self.c_obj.write_arrow_schema(enabled) + return self + + cpdef ChunkedParquetWriterOptions build(self): + """Create a ChunkedParquetWriterOptions object""" + cdef ChunkedParquetWriterOptions parquet_options = ( + ChunkedParquetWriterOptions.__new__(ChunkedParquetWriterOptions) + ) + parquet_options.c_obj = move(self.c_obj.build()) + parquet_options.sink = self.sink + return parquet_options + + cdef class ParquetWriterOptions: @staticmethod @@ -571,11 +857,10 @@ cpdef memoryview write_parquet(ParquetWriterOptions options): (parquet FileMetadata thrift message) if requested in parquet_writer_options (empty blob otherwise). 
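+
+    Examples
+    --------
+    A minimal sketch, assuming an in-memory sink and an existing
+    pylibcudf ``Table`` named ``tbl``:
+
+    >>> import io
+    >>> import pylibcudf as plc
+    >>> sink = plc.io.SinkInfo([io.BytesIO()])
+    >>> options = plc.io.parquet.ParquetWriterOptions.builder(sink, tbl).build()
+    >>> footer = plc.io.parquet.write_parquet(options)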
""" - cdef parquet_writer_options c_options = options.c_obj cdef unique_ptr[vector[uint8_t]] c_result with nogil: - c_result = cpp_write_parquet(c_options) + c_result = cpp_write_parquet(move(options.c_obj)) return memoryview(HostBuffer.from_unique_ptr(move(c_result))) From 38261f8509245f88bdeab193a1357d9c73d765f0 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Fri, 6 Dec 2024 08:32:41 -0500 Subject: [PATCH 31/78] Improve strings contains/find performance for smaller strings (#17330) Replaces usage of `cudf::string_view::find()` with loop and call to `cudf::string_view::compare()` where possible. This showed significant performance improvement. This was also slightly faster than a KMP prototype implementation. Also updates the find/contains benchmarks to remove the 2GB limit and include column versions of the find APIs. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Basit Ayantunde (https://github.com/lamarrr) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/17330 --- cpp/benchmarks/string/find.cpp | 59 ++++++++++++++++-------- cpp/include/cudf/strings/string_view.cuh | 17 ++++--- cpp/src/strings/search/find.cu | 24 ++++++---- 3 files changed, 61 insertions(+), 39 deletions(-) diff --git a/cpp/benchmarks/string/find.cpp b/cpp/benchmarks/string/find.cpp index 3ea3ff13a2f..2ba793e998e 100644 --- a/cpp/benchmarks/string/find.cpp +++ b/cpp/benchmarks/string/find.cpp @@ -28,21 +28,19 @@ static void bench_find_string(nvbench::state& state) { - auto const n_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const max_width = static_cast(state.get_int64("max_width")); auto const hit_rate = static_cast(state.get_int64("hit_rate")); auto const api = state.get_string("api"); - - if (static_cast(n_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } + auto const tgt_type = state.get_string("target"); auto const stream = cudf::get_default_stream(); - auto const col = create_string_column(n_rows, row_width, hit_rate); + auto const col = create_string_column(num_rows, max_width, hit_rate); auto const input = cudf::strings_column_view(col->view()); - cudf::string_scalar target("0987 5W43"); + auto target = cudf::string_scalar("0987 5W43"); + auto targets_col = cudf::make_column_from_scalar(target, num_rows); + auto const targets = cudf::strings_column_view(targets_col->view()); state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); auto const chars_size = input.chars_size(stream); @@ -55,23 +53,44 @@ static void bench_find_string(nvbench::state& state) } if (api == "find") { - state.exec(nvbench::exec_tag::sync, - [&](nvbench::launch& launch) { cudf::strings::find(input, target); }); + if (tgt_type == "scalar") { + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { cudf::strings::find(input, target); }); + } else if (tgt_type == "column") { + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { cudf::strings::find(input, targets); }); + } } else if (api == "contains") { - state.exec(nvbench::exec_tag::sync, - [&](nvbench::launch& launch) { cudf::strings::contains(input, target); }); + if (tgt_type == "scalar") { + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { cudf::strings::contains(input, target); }); + 
} else if (tgt_type == "column") { + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { cudf::strings::contains(input, targets); }); + } } else if (api == "starts_with") { - state.exec(nvbench::exec_tag::sync, - [&](nvbench::launch& launch) { cudf::strings::starts_with(input, target); }); + if (tgt_type == "scalar") { + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { cudf::strings::starts_with(input, target); }); + } else if (tgt_type == "column") { + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { cudf::strings::starts_with(input, targets); }); + } } else if (api == "ends_with") { - state.exec(nvbench::exec_tag::sync, - [&](nvbench::launch& launch) { cudf::strings::ends_with(input, target); }); + if (tgt_type == "scalar") { + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { cudf::strings::ends_with(input, target); }); + } else if (tgt_type == "column") { + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { cudf::strings::ends_with(input, targets); }); + } } } NVBENCH_BENCH(bench_find_string) .set_name("find_string") + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) + .add_int64_axis("hit_rate", {20, 80}) // percentage .add_string_axis("api", {"find", "contains", "starts_with", "ends_with"}) - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024}) - .add_int64_axis("num_rows", {260'000, 1'953'000, 16'777'216}) - .add_int64_axis("hit_rate", {20, 80}); // percentage + .add_string_axis("target", {"scalar", "column"}); diff --git a/cpp/include/cudf/strings/string_view.cuh b/cpp/include/cudf/strings/string_view.cuh index 34ed3c5618e..1ae4c3703b2 100644 --- a/cpp/include/cudf/strings/string_view.cuh +++ b/cpp/include/cudf/strings/string_view.cuh @@ -373,24 +373,23 @@ __device__ inline size_type string_view::find_impl(char const* str, size_type pos, size_type count) const { - auto const nchars = length(); - if (!str || pos < 0 || pos > nchars) return npos; - if (count < 0) count = nchars; + if (!str || pos < 0) { return npos; } + if (pos > 0 && pos > length()) { return npos; } // use iterator to help reduce character/byte counting - auto itr = begin() + pos; + auto const itr = begin() + pos; auto const spos = itr.byte_offset(); - auto const epos = ((pos + count) < nchars) ? (itr + count).byte_offset() : size_bytes(); + auto const epos = + (count >= 0) && ((pos + count) < length()) ? (itr + count).byte_offset() : size_bytes(); auto const find_length = (epos - spos) - bytes + 1; + auto const d_target = string_view{str, bytes}; auto ptr = data() + (forward ? spos : (epos - bytes)); for (size_type idx = 0; idx < find_length; ++idx) { - bool match = true; - for (size_type jdx = 0; match && (jdx < bytes); ++jdx) { - match = (ptr[jdx] == str[jdx]); + if (d_target.compare(ptr, bytes) == 0) { + return forward ? pos : character_offset(epos - bytes - idx); } - if (match) { return forward ? pos : character_offset(epos - bytes - idx); } // use pos to record the current find position pos += strings::detail::is_begin_utf8_char(*ptr); forward ? 
++ptr : --ptr; diff --git a/cpp/src/strings/search/find.cu b/cpp/src/strings/search/find.cu index 3cf4970d36e..0f33fcb6fe1 100644 --- a/cpp/src/strings/search/find.cu +++ b/cpp/src/strings/search/find.cu @@ -70,13 +70,11 @@ struct finder_fn { if (d_strings.is_null(idx)) { return -1; } auto const d_str = d_strings.element(idx); if (d_str.empty() && (start > 0)) { return -1; } + if (stop >= 0 && start > stop) { return -1; } auto const d_target = d_targets[idx]; - auto const length = d_str.length(); - auto const begin = (start > length) ? length : start; - auto const end = (stop < 0) || (stop > length) ? length : stop; - return forward ? d_str.find(d_target, begin, end - begin) - : d_str.rfind(d_target, begin, end - begin); + auto const count = (stop < 0) ? stop : (stop - start); + return forward ? d_str.find(d_target, start, count) : d_str.rfind(d_target, start, count); } }; @@ -367,7 +365,7 @@ CUDF_KERNEL void contains_warp_parallel_fn(column_device_view const d_strings, i += cudf::detail::warp_size * bytes_per_warp) { // check the target matches this part of the d_str data // this is definitely faster for very long strings > 128B - for (auto j = 0; j < bytes_per_warp; j++) { + for (auto j = 0; !found && (j < bytes_per_warp); j++) { if (((i + j + d_target.size_bytes()) <= d_str.size_bytes()) && d_target.compare(d_str.data() + i + j, d_target.size_bytes()) == 0) { found = true; @@ -531,7 +529,6 @@ std::unique_ptr contains_fn(strings_column_view const& strings, results->set_null_count(strings.null_count()); return results; } - } // namespace std::unique_ptr contains(strings_column_view const& input, @@ -541,13 +538,17 @@ std::unique_ptr contains(strings_column_view const& input, { // use warp parallel when the average string width is greater than the threshold if ((input.null_count() < input.size()) && - ((input.chars_size(stream) / input.size()) > AVG_CHAR_BYTES_THRESHOLD)) { + ((input.chars_size(stream) / (input.size() - input.null_count())) > + AVG_CHAR_BYTES_THRESHOLD)) { return contains_warp_parallel(input, target, stream, mr); } // benchmark measurements showed this to be faster for smaller strings auto pfn = [] __device__(string_view d_string, string_view d_target) { - return d_string.find(d_target) != string_view::npos; + for (size_type i = 0; i <= (d_string.size_bytes() - d_target.size_bytes()); ++i) { + if (d_target.compare(d_string.data() + i, d_target.size_bytes()) == 0) { return true; } + } + return false; }; return contains_fn(input, target, pfn, stream, mr); } @@ -558,7 +559,10 @@ std::unique_ptr contains(strings_column_view const& strings, rmm::device_async_resource_ref mr) { auto pfn = [] __device__(string_view d_string, string_view d_target) { - return d_string.find(d_target) != string_view::npos; + for (size_type i = 0; i <= (d_string.size_bytes() - d_target.size_bytes()); ++i) { + if (d_target.compare(d_string.data() + i, d_target.size_bytes()) == 0) { return true; } + } + return false; }; return contains_fn(strings, targets, pfn, stream, mr); } From c791f8044d0d11f55042afd7a66698d8ce2e1973 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 6 Dec 2024 10:20:06 -0800 Subject: [PATCH 32/78] Remove cudf._lib.text in favor of inlining pylibcudf (#17408) Contributes to https://github.com/rapidsai/cudf/issues/17317 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/17408 --- python/cudf/cudf/_lib/CMakeLists.txt | 1 - 
python/cudf/cudf/_lib/__init__.py | 1 - python/cudf/cudf/_lib/text.pyx | 53 ---------------------------- python/cudf/cudf/io/text.py | 45 +++++++++++++++++------ 4 files changed, 34 insertions(+), 66 deletions(-) delete mode 100644 python/cudf/cudf/_lib/text.pyx diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index 2f05101e8e3..4e1bf860872 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -27,7 +27,6 @@ set(cython_sources stream_compaction.pyx string_casting.pyx strings_udf.pyx - text.pyx transform.pyx types.pyx utils.pyx diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py index cb2d0501fea..c79d5100622 100644 --- a/python/cudf/cudf/_lib/__init__.py +++ b/python/cudf/cudf/_lib/__init__.py @@ -16,7 +16,6 @@ string_casting, strings, strings_udf, - text, ) MAX_COLUMN_SIZE = np.iinfo(np.int32).max diff --git a/python/cudf/cudf/_lib/text.pyx b/python/cudf/cudf/_lib/text.pyx deleted file mode 100644 index 7942d067c2b..00000000000 --- a/python/cudf/cudf/_lib/text.pyx +++ /dev/null @@ -1,53 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from libcpp cimport bool - -from io import TextIOBase - -import pylibcudf as plc - -from cudf._lib.column cimport Column - - -def read_text(object filepaths_or_buffers, - str delimiter, - object byte_range, - bool strip_delimiters, - object compression, - object compression_offsets): - """ - Cython function to call into libcudf API, see `multibyte_split`. - - See Also - -------- - cudf.io.text.read_text - """ - if compression is None: - if isinstance(filepaths_or_buffers, TextIOBase): - datasource = plc.io.text.make_source(filepaths_or_buffers.read()) - else: - datasource = plc.io.text.make_source_from_file(filepaths_or_buffers) - elif compression == "bgzip": - if isinstance(filepaths_or_buffers, TextIOBase): - raise ValueError("bgzip compression requires a file path") - if compression_offsets is not None: - if len(compression_offsets) != 2: - raise ValueError( - "compression offsets need to consist of two elements") - datasource = plc.io.text.make_source_from_bgzip_file( - filepaths_or_buffers, - compression_offsets[0], - compression_offsets[1] - ) - else: - datasource = plc.io.text.make_source_from_bgzip_file( - filepaths_or_buffers, - ) - else: - raise ValueError("Only bgzip compression is supported at the moment") - - options = plc.io.text.ParseOptions( - byte_range=byte_range, strip_delimiters=strip_delimiters - ) - plc_column = plc.io.text.multibyte_split(datasource, delimiter, options) - return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/io/text.py b/python/cudf/cudf/io/text.py index 5ce738cae0e..5e266c5ff55 100644 --- a/python/cudf/cudf/io/text.py +++ b/python/cudf/cudf/io/text.py @@ -1,9 +1,10 @@ # Copyright (c) 2018-2024, NVIDIA CORPORATION. 
-from io import BytesIO, StringIO +from io import BytesIO, StringIO, TextIOBase + +import pylibcudf as plc import cudf -from cudf._lib import text as libtext from cudf.utils import ioutils from cudf.utils.performance_tracking import _performance_tracking @@ -33,13 +34,35 @@ def read_text( filepath_or_buffer, "read_text" ) - return cudf.Series._from_column( - libtext.read_text( - filepath_or_buffer, - delimiter=delimiter, - byte_range=byte_range, - strip_delimiters=strip_delimiters, - compression=compression, - compression_offsets=compression_offsets, - ) + if compression is None: + if isinstance(filepath_or_buffer, TextIOBase): + datasource = plc.io.text.make_source(filepath_or_buffer.read()) + else: + datasource = plc.io.text.make_source_from_file(filepath_or_buffer) + elif compression == "bgzip": + if isinstance(filepath_or_buffer, TextIOBase): + raise ValueError("bgzip compression requires a file path") + if compression_offsets is not None: + if len(compression_offsets) != 2: + raise ValueError( + "Compression offsets need to consist of two elements" + ) + datasource = plc.io.text.make_source_from_bgzip_file( + filepath_or_buffer, + compression_offsets[0], + compression_offsets[1], + ) + else: + datasource = plc.io.text.make_source_from_bgzip_file( + filepath_or_buffer, + ) + else: + raise ValueError("Only bgzip compression is supported at the moment") + + options = plc.io.text.ParseOptions( + byte_range=byte_range, strip_delimiters=strip_delimiters ) + plc_column = plc.io.text.multibyte_split(datasource, delimiter, options) + result = cudf._lib.column.Column.from_pylibcudf(plc_column) + + return cudf.Series._from_column(result) From 467cf7a7c0a248bdba34e48fc8932acff5797016 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Fri, 6 Dec 2024 14:06:08 -0500 Subject: [PATCH 33/78] Replaces uses of `cudf._lib.Column.from_unique_ptr` with `pylibcudf.Column.from_libcudf` (#17517) Apart of #15162. In a follow-up PR we'll deprecate the cudf python column APIs and others that are used outside cudf. Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/17517 --- python/cudf/cudf/_lib/strings_udf.pyx | 8 ++++---- python/cudf/cudf/_lib/utils.pyx | 11 ++++++----- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/_lib/strings_udf.pyx b/python/cudf/cudf/_lib/strings_udf.pyx index dd2fafbe07f..83f0cb850a5 100644 --- a/python/cudf/cudf/_lib/strings_udf.pyx +++ b/python/cudf/cudf/_lib/strings_udf.pyx @@ -1,7 +1,6 @@ # Copyright (c) 2022-2024, NVIDIA CORPORATION. 
from libc.stdint cimport uint8_t, uint16_t, uintptr_t - from pylibcudf.libcudf.strings_udf cimport ( get_character_cases_table as cpp_get_character_cases_table, get_character_flags_table as cpp_get_character_flags_table, @@ -27,6 +26,7 @@ from rmm.librmm.device_buffer cimport device_buffer from rmm.pylibrmm.device_buffer cimport DeviceBuffer from cudf._lib.column cimport Column +from pylibcudf cimport Column as plc_Column def get_cuda_build_version(): @@ -52,9 +52,9 @@ def column_from_udf_string_array(DeviceBuffer d_buffer): c_result = move(cpp_column_from_udf_string_array(data, size)) cpp_free_udf_string_array(data, size) - result = Column.from_unique_ptr(move(c_result)) - - return result + return Column.from_pylibcudf( + plc_Column.from_libcudf(move(c_result)) + ) def get_character_flags_table_ptr(): diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx index 6b3f10e1806..ff032656f80 100644 --- a/python/cudf/cudf/_lib/utils.pyx +++ b/python/cudf/cudf/_lib/utils.pyx @@ -16,7 +16,7 @@ from pylibcudf.libcudf.table.table_view cimport table_view from pylibcudf.libcudf.types cimport size_type from cudf._lib.column cimport Column - +from pylibcudf cimport Column as plc_Column try: import ujson as json except ImportError: @@ -223,10 +223,11 @@ cdef columns_from_unique_ptr( cdef size_t i - columns = [Column.from_unique_ptr(move(dereference(it+i))) - for i in range(c_columns.size())] - - return columns + return [ + Column.from_pylibcudf( + plc_Column.from_libcudf(move(dereference(it+i))) + ) for i in range(c_columns.size()) + ] cpdef columns_from_pylibcudf_table(tbl): From 1a62b46938b76abd00711337d03ff4864845257c Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 6 Dec 2024 11:17:11 -0800 Subject: [PATCH 34/78] Remove cudf._lib.round in favor of inlining pylibcudf (#17430) Contributes to https://github.com/rapidsai/cudf/issues/17317 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/17430 --- python/cudf/cudf/_lib/CMakeLists.txt | 1 - python/cudf/cudf/_lib/__init__.py | 1 - python/cudf/cudf/_lib/round.pyx | 39 ------------------- .../cudf/cudf/core/column/numerical_base.py | 19 ++++++--- 4 files changed, 14 insertions(+), 46 deletions(-) delete mode 100644 python/cudf/cudf/_lib/round.pyx diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index 4e1bf860872..cff25f5752c 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -21,7 +21,6 @@ set(cython_sources orc.pyx parquet.pyx reduce.pyx - round.pyx scalar.pyx sort.pyx stream_compaction.pyx diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py index c79d5100622..05310d8d232 100644 --- a/python/cudf/cudf/_lib/__init__.py +++ b/python/cudf/cudf/_lib/__init__.py @@ -10,7 +10,6 @@ orc, parquet, reduce, - round, sort, stream_compaction, string_casting, diff --git a/python/cudf/cudf/_lib/round.pyx b/python/cudf/cudf/_lib/round.pyx deleted file mode 100644 index f961c09e6f6..00000000000 --- a/python/cudf/cudf/_lib/round.pyx +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. 
- -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - -import pylibcudf as plc -from pylibcudf.round import RoundingMethod - - -@acquire_spill_lock() -def round(Column input_col, int decimal_places=0, how="half_even"): - """ - Round column values to the given number of decimal places - - Parameters - ---------- - input_col : Column whose values will be rounded - decimal_places : The number or decimal places to round to - - Returns - ------- - A Column with values rounded to the given number of decimal places - """ - if how not in {"half_even", "half_up"}: - raise ValueError("'how' must be either 'half_even' or 'half_up'") - - how = ( - RoundingMethod.HALF_EVEN if how == "half_even" - else RoundingMethod.HALF_UP - ) - - return Column.from_pylibcudf( - plc.round.round( - input_col.to_pylibcudf(mode="read"), - decimal_places, - how - ) - ) diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py index ea242e34edb..3f9abdabc2f 100644 --- a/python/cudf/cudf/core/column/numerical_base.py +++ b/python/cudf/cudf/core/column/numerical_base.py @@ -3,7 +3,7 @@ from __future__ import annotations -from typing import TYPE_CHECKING, cast +from typing import TYPE_CHECKING, Literal, cast import numpy as np @@ -246,12 +246,21 @@ def corr(self, other: NumericalBaseColumn) -> float: return cov / lhs_std / rhs_std def round( - self, decimals: int = 0, how: str = "half_even" + self, + decimals: int = 0, + how: Literal["half_even", "half_up"] = "half_even", ) -> NumericalBaseColumn: if not cudf.api.types.is_integer(decimals): - raise TypeError("Values in decimals must be integers") - """Round the values in the Column to the given number of decimals.""" - return libcudf.round.round(self, decimal_places=decimals, how=how) + raise TypeError("Argument 'decimals' must an integer") + if how not in {"half_even", "half_up"}: + raise ValueError(f"{how=} must be either 'half_even' or 'half_up'") + plc_how = plc.round.RoundingMethod[how.upper()] + with acquire_spill_lock(): + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.round.round( + self.to_pylibcudf(mode="read"), decimals, plc_how + ) + ) def _scan(self, op: str) -> ColumnBase: return libcudf.reduce.scan( From b6f7e6ea33d8f516033508224cd89bbd09a791ee Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 6 Dec 2024 12:55:22 -0800 Subject: [PATCH 35/78] Remove cudf._lib.orc in favor of inlining pylibcudf (#17466) Contributes to https://github.com/rapidsai/cudf/issues/17317 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/17466 --- python/cudf/cudf/_lib/CMakeLists.txt | 1 - python/cudf/cudf/_lib/__init__.py | 1 - python/cudf/cudf/_lib/orc.pyx | 466 ------------------ python/cudf/cudf/io/orc.py | 613 +++++++++++++++++------- python/cudf/cudf/utils/ioutils.py | 161 ++++++- python/pylibcudf/pylibcudf/io/types.pxd | 1 - python/pylibcudf/pylibcudf/io/types.pyi | 2 + python/pylibcudf/pylibcudf/io/types.pyx | 6 +- 8 files changed, 603 insertions(+), 648 deletions(-) delete mode 100644 python/cudf/cudf/_lib/orc.pyx diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index cff25f5752c..e98cf283bbb 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -18,7 +18,6 @@ set(cython_sources csv.pyx groupby.pyx interop.pyx - orc.pyx 
parquet.pyx reduce.pyx scalar.pyx diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py index 05310d8d232..4758a933898 100644 --- a/python/cudf/cudf/_lib/__init__.py +++ b/python/cudf/cudf/_lib/__init__.py @@ -7,7 +7,6 @@ groupby, interop, nvtext, - orc, parquet, reduce, sort, diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx deleted file mode 100644 index c829cac6409..00000000000 --- a/python/cudf/cudf/_lib/orc.pyx +++ /dev/null @@ -1,466 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from libc.stdint cimport int64_t -from libcpp cimport bool, int -from libcpp.map cimport map -from libcpp.string cimport string -from libcpp.vector cimport vector -import itertools -from collections import OrderedDict - -try: - import ujson as json -except ImportError: - import json - -cimport pylibcudf.libcudf.lists.lists_column_view as cpp_lists_column_view - -from cudf._lib.column cimport Column -from cudf._lib.io.utils cimport update_col_struct_field_names -from cudf._lib.utils cimport data_from_pylibcudf_io - -import pylibcudf as plc - -import cudf -from cudf._lib.types import SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES -from cudf._lib.utils import _index_level_name, generate_pandas_metadata -from cudf.core.buffer import acquire_spill_lock -from pylibcudf.io.types cimport TableInputMetadata, SinkInfo, ColumnInMetadata -from pylibcudf.io.orc cimport OrcChunkedWriter - -# TODO: Consider inlining this function since it seems to only be used in one place. -cpdef read_parsed_orc_statistics(filepath_or_buffer): - """ - Cython function to call into libcudf API, see `read_parsed_orc_statistics`. - - See Also - -------- - cudf.io.orc.read_orc_statistics - """ - - parsed = ( - plc.io.orc.read_parsed_orc_statistics( - plc.io.SourceInfo([filepath_or_buffer]) - ) - ) - - return parsed.column_names, parsed.file_stats, parsed.stripes_stats - - -cpdef read_orc(object filepaths_or_buffers, - object columns=None, - object stripes=None, - object skip_rows=None, - object num_rows=None, - bool use_index=True, - object timestamp_type=None): - """ - Cython function to call into libcudf API, see `read_orc`. - - See Also - -------- - cudf.read_orc - - Notes - ----- - Currently this function only considers the metadata of the first file in the list of - filepaths_or_buffers. - """ - - if columns is not None: - columns = [str(col) for col in columns] - - tbl_w_meta = plc.io.orc.read_orc( - plc.io.SourceInfo(filepaths_or_buffers), - columns, - stripes, - get_skiprows_arg(skip_rows), - get_num_rows_arg(num_rows), - use_index, - plc.types.DataType( - SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES[ - cudf.dtype(timestamp_type) - ] - ) - ) - - names = tbl_w_meta.column_names(include_children=False) - - actual_index_names, col_names, is_range_index, reset_index_name, \ - range_idx = _get_index_from_metadata(tbl_w_meta.per_file_user_data, - names, - skip_rows, - num_rows) - - if columns is not None and (isinstance(columns, list) and len(columns) == 0): - # When `columns=[]`, index needs to be - # established, but not the columns. 
- nrows = tbl_w_meta.tbl.num_rows() - return {}, cudf.RangeIndex(nrows) - - data, index = data_from_pylibcudf_io( - tbl_w_meta, - col_names if columns is None else names, - actual_index_names - ) - - if is_range_index: - index = range_idx - elif reset_index_name: - index.names = [None] * len(index.names) - - child_name_values = tbl_w_meta.child_names.values() - - data = { - name: update_col_struct_field_names( - col, child_names - ) - for (name, col), child_names in zip(data.items(), child_name_values) - } - - return data, index - - -def _get_comp_type(object compression): - if compression is None or compression is False: - return plc.io.types.CompressionType.NONE - - compression = str(compression).upper() - if compression == "SNAPPY": - return plc.io.types.CompressionType.SNAPPY - elif compression == "ZLIB": - return plc.io.types.CompressionType.ZLIB - elif compression == "ZSTD": - return plc.io.types.CompressionType.ZSTD - elif compression == "LZ4": - return plc.io.types.CompressionType.LZ4 - else: - raise ValueError(f"Unsupported `compression` type {compression}") - - -cdef tuple _get_index_from_metadata( - vector[map[string, string]] user_data, - object names, - object skip_rows, - object num_rows): - - meta = None - index_col = None - is_range_index = False - reset_index_name = False - range_idx = None - - if user_data.size() > 0: - json_str = user_data[0][b'pandas'].decode('utf-8') - if json_str != "": - meta = json.loads(json_str) - if 'index_columns' in meta and len(meta['index_columns']) > 0: - index_col = meta['index_columns'] - if isinstance(index_col[0], dict) and \ - index_col[0]['kind'] == 'range': - is_range_index = True - else: - index_col_names = OrderedDict() - for idx_col in index_col: - for c in meta['columns']: - if c['field_name'] == idx_col: - index_col_names[idx_col] = \ - c['name'] or c['field_name'] - if c['name'] is None: - reset_index_name = True - - actual_index_names = None - if index_col is not None and len(index_col) > 0: - if is_range_index: - range_index_meta = index_col[0] - range_idx = cudf.RangeIndex( - start=range_index_meta['start'], - stop=range_index_meta['stop'], - step=range_index_meta['step'], - name=range_index_meta['name'] - ) - if skip_rows is not None: - range_idx = range_idx[skip_rows:] - if num_rows is not None: - range_idx = range_idx[:num_rows] - else: - actual_index_names = list(index_col_names.values()) - names = names[len(actual_index_names):] - - return ( - actual_index_names, - names, - is_range_index, - reset_index_name, - range_idx - ) - - -def _get_orc_stat_freq(str statistics): - """ - Convert ORC statistics terms to CUDF convention: - - ORC "STRIPE" == CUDF "ROWGROUP" - - ORC "ROWGROUP" == CUDF "PAGE" - """ - statistics = str(statistics).upper() - if statistics == "NONE": - return plc.io.types.StatisticsFreq.STATISTICS_NONE - elif statistics == "STRIPE": - return plc.io.types.StatisticsFreq.STATISTICS_ROWGROUP - elif statistics == "ROWGROUP": - return plc.io.types.StatisticsFreq.STATISTICS_PAGE - else: - raise ValueError(f"Unsupported `statistics_freq` type {statistics}") - - -@acquire_spill_lock() -def write_orc( - table, - object path_or_buf, - object compression="snappy", - str statistics="ROWGROUP", - object stripe_size_bytes=None, - object stripe_size_rows=None, - object row_index_stride=None, - object cols_as_map_type=None, - object index=None -): - """ - Cython function to call into libcudf API, see `cudf::io::write_orc`. 
- - See Also - -------- - cudf.read_orc - """ - user_data = {} - user_data["pandas"] = generate_pandas_metadata(table, index) - if index is True or ( - index is None and not isinstance(table._index, cudf.RangeIndex) - ): - columns = table._columns if table._index is None else [ - *table.index._columns, *table._columns - ] - plc_table = plc.Table([col.to_pylibcudf(mode="read") for col in columns]) - tbl_meta = TableInputMetadata(plc_table) - for level, idx_name in enumerate(table._index.names): - tbl_meta.column_metadata[level].set_name( - _index_level_name(idx_name, level, table._column_names) - ) - num_index_cols_meta = len(table._index.names) - else: - plc_table = plc.Table( - [col.to_pylibcudf(mode="read") for col in table._columns] - ) - tbl_meta = TableInputMetadata(plc_table) - num_index_cols_meta = 0 - - if cols_as_map_type is not None: - cols_as_map_type = set(cols_as_map_type) - - for i, name in enumerate(table._column_names, num_index_cols_meta): - tbl_meta.column_metadata[i].set_name(name) - _set_col_children_metadata( - table[name]._column, - tbl_meta.column_metadata[i], - (cols_as_map_type is not None) - and (name in cols_as_map_type), - ) - - options = ( - plc.io.orc.OrcWriterOptions.builder( - plc.io.SinkInfo([path_or_buf]), plc_table - ) - .metadata(tbl_meta) - .key_value_metadata(user_data) - .compression(_get_comp_type(compression)) - .enable_statistics(_get_orc_stat_freq(statistics)) - .build() - ) - if stripe_size_bytes is not None: - options.set_stripe_size_bytes(stripe_size_bytes) - if stripe_size_rows is not None: - options.set_stripe_size_rows(stripe_size_rows) - if row_index_stride is not None: - options.set_row_index_stride(row_index_stride) - - plc.io.orc.write_orc(options) - - -cdef int64_t get_skiprows_arg(object arg) except*: - arg = 0 if arg is None else arg - if not isinstance(arg, int) or arg < 0: - raise TypeError("skiprows must be an int >= 0") - return arg - -cdef int64_t get_num_rows_arg(object arg) except*: - arg = -1 if arg is None else arg - if not isinstance(arg, int) or arg < -1: - raise TypeError("num_rows must be an int >= -1") - return arg - - -cdef class ORCWriter: - """ - ORCWriter lets you you incrementally write out a ORC file from a series - of cudf tables - - See Also - -------- - cudf.io.orc.to_orc - """ - cdef bool initialized - cdef OrcChunkedWriter writer - cdef SinkInfo sink - cdef str statistics - cdef object compression - cdef object index - cdef TableInputMetadata tbl_meta - cdef object cols_as_map_type - cdef object stripe_size_bytes - cdef object stripe_size_rows - cdef object row_index_stride - - def __cinit__(self, - object path, - object index=None, - object compression="snappy", - str statistics="ROWGROUP", - object cols_as_map_type=None, - object stripe_size_bytes=None, - object stripe_size_rows=None, - object row_index_stride=None): - self.sink = plc.io.SinkInfo([path]) - self.statistics = statistics - self.compression = compression - self.index = index - self.cols_as_map_type = cols_as_map_type \ - if cols_as_map_type is None else set(cols_as_map_type) - self.stripe_size_bytes = stripe_size_bytes - self.stripe_size_rows = stripe_size_rows - self.row_index_stride = row_index_stride - self.initialized = False - - def write_table(self, table): - """ Writes a single table to the file """ - if not self.initialized: - self._initialize_chunked_state(table) - - keep_index = self.index is not False and ( - table._index.name is not None or - isinstance(table._index, cudf.core.multiindex.MultiIndex) - ) - if keep_index: - columns = [ 
- col.to_pylibcudf(mode="read") - for col in itertools.chain(table.index._columns, table._columns) - ] - else: - columns = [col.to_pylibcudf(mode="read") for col in table._columns] - - self.writer.write(plc.Table(columns)) - - def close(self): - if not self.initialized: - return - - self.writer.close() - - def __dealloc__(self): - self.close() - - def _initialize_chunked_state(self, table): - """ - Prepare all the values required to build the - chunked_orc_writer_options anb creates a writer""" - - num_index_cols_meta = 0 - plc_table = plc.Table( - [ - col.to_pylibcudf(mode="read") - for col in table._columns - ] - ) - self.tbl_meta = TableInputMetadata(plc_table) - if self.index is not False: - if isinstance(table._index, cudf.core.multiindex.MultiIndex): - plc_table = plc.Table( - [ - col.to_pylibcudf(mode="read") - for col in itertools.chain(table.index._columns, table._columns) - ] - ) - self.tbl_meta = TableInputMetadata(plc_table) - for level, idx_name in enumerate(table._index.names): - self.tbl_meta.column_metadata[level].set_name( - idx_name - ) - num_index_cols_meta = len(table._index.names) - else: - if table._index.name is not None: - plc_table = plc.Table( - [ - col.to_pylibcudf(mode="read") - for col in itertools.chain( - table.index._columns, table._columns - ) - ] - ) - self.tbl_meta = TableInputMetadata(plc_table) - self.tbl_meta.column_metadata[0].set_name( - table._index.name - ) - num_index_cols_meta = 1 - - for i, name in enumerate(table._column_names, num_index_cols_meta): - self.tbl_meta.column_metadata[i].set_name(name) - _set_col_children_metadata( - table[name]._column, - self.tbl_meta.column_metadata[i], - (self.cols_as_map_type is not None) - and (name in self.cols_as_map_type), - ) - - user_data = {} - pandas_metadata = generate_pandas_metadata(table, self.index) - user_data["pandas"] = pandas_metadata - - options = ( - plc.io.orc.ChunkedOrcWriterOptions.builder(self.sink) - .metadata(self.tbl_meta) - .key_value_metadata(user_data) - .compression(_get_comp_type(self.compression)) - .enable_statistics(_get_orc_stat_freq(self.statistics)) - .build() - ) - if self.stripe_size_bytes is not None: - options.set_stripe_size_bytes(self.stripe_size_bytes) - if self.stripe_size_rows is not None: - options.set_stripe_size_rows(self.stripe_size_rows) - if self.row_index_stride is not None: - options.set_row_index_stride(self.row_index_stride) - - self.writer = plc.io.orc.OrcChunkedWriter.from_options(options) - - self.initialized = True - -cdef _set_col_children_metadata(Column col, - ColumnInMetadata col_meta, - list_column_as_map=False): - if isinstance(col.dtype, cudf.StructDtype): - for i, (child_col, name) in enumerate( - zip(col.children, list(col.dtype.fields)) - ): - col_meta.child(i).set_name(name) - _set_col_children_metadata( - child_col, col_meta.child(i), list_column_as_map - ) - elif isinstance(col.dtype, cudf.ListDtype): - if list_column_as_map: - col_meta.set_list_column_as_map() - _set_col_children_metadata( - col.children[cpp_lists_column_view.child_column_index], - col_meta.child(cpp_lists_column_view.child_column_index), - list_column_as_map - ) - else: - return diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py index 68b60809bb9..5616413b7e4 100644 --- a/python/cudf/cudf/io/orc.py +++ b/python/cudf/cudf/io/orc.py @@ -1,147 +1,28 @@ # Copyright (c) 2019-2024, NVIDIA CORPORATION. 
+from __future__ import annotations -import datetime +import itertools import warnings +from typing import TYPE_CHECKING, Literal import pyarrow as pa +import pylibcudf as plc + import cudf -from cudf._lib import orc as liborc +from cudf._lib.types import dtype_to_pylibcudf_type +from cudf._lib.utils import data_from_pylibcudf_io from cudf.api.types import is_list_like +from cudf.core.buffer import acquire_spill_lock from cudf.utils import ioutils +try: + import ujson as json # type: ignore[import-untyped] +except ImportError: + import json -def _make_empty_df(filepath_or_buffer, columns): - from pyarrow import orc - - orc_file = orc.ORCFile(filepath_or_buffer) - schema = orc_file.schema - col_names = schema.names if columns is None else columns - return cudf.DataFrame._from_data( - data={ - col_name: cudf.core.column.column_empty( - row_count=0, - dtype=schema.field(col_name).type.to_pandas_dtype(), - ) - for col_name in col_names - } - ) - - -def _parse_column_statistics(cs, column_statistics_blob): - # Initialize stats to return and parse stats blob - column_statistics = {} - cs.ParseFromString(column_statistics_blob) - - # Load from parsed stats blob into stats to return - if cs.HasField("numberOfValues"): - column_statistics["number_of_values"] = cs.numberOfValues - if cs.HasField("hasNull"): - column_statistics["has_null"] = cs.hasNull - - if cs.HasField("intStatistics"): - column_statistics["minimum"] = ( - cs.intStatistics.minimum - if cs.intStatistics.HasField("minimum") - else None - ) - column_statistics["maximum"] = ( - cs.intStatistics.maximum - if cs.intStatistics.HasField("maximum") - else None - ) - column_statistics["sum"] = ( - cs.intStatistics.sum if cs.intStatistics.HasField("sum") else None - ) - - elif cs.HasField("doubleStatistics"): - column_statistics["minimum"] = ( - cs.doubleStatistics.minimum - if cs.doubleStatistics.HasField("minimum") - else None - ) - column_statistics["maximum"] = ( - cs.doubleStatistics.maximum - if cs.doubleStatistics.HasField("maximum") - else None - ) - column_statistics["sum"] = ( - cs.doubleStatistics.sum - if cs.doubleStatistics.HasField("sum") - else None - ) - - elif cs.HasField("stringStatistics"): - column_statistics["minimum"] = ( - cs.stringStatistics.minimum - if cs.stringStatistics.HasField("minimum") - else None - ) - column_statistics["maximum"] = ( - cs.stringStatistics.maximum - if cs.stringStatistics.HasField("maximum") - else None - ) - column_statistics["sum"] = cs.stringStatistics.sum - - elif cs.HasField("bucketStatistics"): - column_statistics["true_count"] = cs.bucketStatistics.count[0] - column_statistics["false_count"] = ( - column_statistics["number_of_values"] - - column_statistics["true_count"] - ) - - elif cs.HasField("decimalStatistics"): - column_statistics["minimum"] = ( - cs.decimalStatistics.minimum - if cs.decimalStatistics.HasField("minimum") - else None - ) - column_statistics["maximum"] = ( - cs.decimalStatistics.maximum - if cs.decimalStatistics.HasField("maximum") - else None - ) - column_statistics["sum"] = cs.decimalStatistics.sum - - elif cs.HasField("dateStatistics"): - column_statistics["minimum"] = ( - datetime.datetime.fromtimestamp( - datetime.timedelta(cs.dateStatistics.minimum).total_seconds(), - datetime.timezone.utc, - ) - if cs.dateStatistics.HasField("minimum") - else None - ) - column_statistics["maximum"] = ( - datetime.datetime.fromtimestamp( - datetime.timedelta(cs.dateStatistics.maximum).total_seconds(), - datetime.timezone.utc, - ) - if cs.dateStatistics.HasField("maximum") - 
else None - ) - - elif cs.HasField("timestampStatistics"): - # Before ORC-135, the local timezone offset was included and they were - # stored as minimum and maximum. After ORC-135, the timestamp is - # adjusted to UTC before being converted to milliseconds and stored - # in minimumUtc and maximumUtc. - # TODO: Support minimum and maximum by reading writer's local timezone - if cs.timestampStatistics.HasField( - "minimumUtc" - ) and cs.timestampStatistics.HasField("maximumUtc"): - column_statistics["minimum"] = datetime.datetime.fromtimestamp( - cs.timestampStatistics.minimumUtc / 1000, datetime.timezone.utc - ) - column_statistics["maximum"] = datetime.datetime.fromtimestamp( - cs.timestampStatistics.maximumUtc / 1000, datetime.timezone.utc - ) - - elif cs.HasField("binaryStatistics"): - column_statistics["sum"] = cs.binaryStatistics.sum - - return column_statistics +if TYPE_CHECKING: + from cudf.core.column import ColumnBase @ioutils.doc_read_orc_metadata() @@ -175,11 +56,12 @@ def read_orc_statistics( path_or_buf = ioutils._select_single_source( path_or_buf, "read_orc_statistics" ) - ( - column_names, - parsed_file_statistics, - parsed_stripes_statistics, - ) = liborc.read_parsed_orc_statistics(path_or_buf) + parsed = plc.io.orc.read_parsed_orc_statistics( + plc.io.SourceInfo([path_or_buf]) + ) + column_names = parsed.column_names + parsed_file_statistics = parsed.file_stats + parsed_stripes_statistics = parsed.stripes_stats # Parse file statistics file_statistics = { @@ -273,16 +155,14 @@ def read_orc( columns=None, filters=None, stripes=None, - skiprows=None, - num_rows=None, - use_index=True, + skiprows: int | None = None, + num_rows: int | None = None, + use_index: bool = True, timestamp_type=None, storage_options=None, bytes_per_thread=None, ): """{docstring}""" - from cudf import DataFrame - if skiprows is not None: # Do not remove until cuIO team approves its removal. warnings.warn( @@ -329,31 +209,132 @@ def read_orc( # Return empty if everything was filtered if len(selected_stripes) == 0: - return _make_empty_df(filepaths_or_buffers[0], columns) + from pyarrow import orc + + orc_file = orc.ORCFile(filepaths_or_buffers[0]) + schema = orc_file.schema + col_names = schema.names if columns is None else columns + return cudf.DataFrame._from_data( + data={ + col_name: cudf.core.column.column_empty( + row_count=0, + dtype=schema.field(col_name).type.to_pandas_dtype(), + ) + for col_name in col_names + } + ) else: stripes = selected_stripes if engine == "cudf": - return DataFrame._from_data( - *liborc.read_orc( - filepaths_or_buffers, - columns, - stripes, - skiprows, - num_rows, - use_index, - timestamp_type, - ) + if columns is not None: + columns = [str(col) for col in columns] + + if skiprows is None: + skiprows = 0 + elif not isinstance(skiprows, int) or skiprows < 0: + raise TypeError("skiprows must be an int >= 0") + + if num_rows is None: + num_rows = -1 + elif not isinstance(num_rows, int) or num_rows < -1: + raise TypeError("num_rows must be an int >= -1") + + tbl_w_meta = plc.io.orc.read_orc( + plc.io.SourceInfo(filepaths_or_buffers), + columns, + stripes, + skiprows, + num_rows, + use_index, + dtype_to_pylibcudf_type(cudf.dtype(timestamp_type)), ) + + if isinstance(columns, list) and len(columns) == 0: + # When `columns=[]`, index needs to be + # established, but not the columns. 
+ nrows = tbl_w_meta.tbl.num_rows() + data = {} + index = cudf.RangeIndex(nrows) + else: + names = tbl_w_meta.column_names(include_children=False) + index_col = None + is_range_index = False + reset_index_name = False + range_idx = None + + if len(tbl_w_meta.per_file_user_data) > 0: + json_str = ( + tbl_w_meta.per_file_user_data[0] + .get(b"pandas", b"") + .decode("utf-8") + ) + if json_str != "": + meta = json.loads(json_str) + if ( + "index_columns" in meta + and len(meta["index_columns"]) > 0 + ): + index_col = meta["index_columns"] + if ( + isinstance(index_col[0], dict) + and index_col[0]["kind"] == "range" + ): + is_range_index = True + else: + index_col_names = {} + for idx_col in index_col: + for c in meta["columns"]: + if c["field_name"] == idx_col: + index_col_names[idx_col] = ( + c["name"] or c["field_name"] + ) + if c["name"] is None: + reset_index_name = True + + actual_index_names = None + col_names = names + if index_col is not None and len(index_col) > 0: + if is_range_index: + range_index_meta = index_col[0] + range_idx = cudf.RangeIndex( + start=range_index_meta["start"], + stop=range_index_meta["stop"], + step=range_index_meta["step"], + name=range_index_meta["name"], + ) + if skiprows != 0: + range_idx = range_idx[skiprows:] + if num_rows != -1: + range_idx = range_idx[:num_rows] + else: + actual_index_names = list(index_col_names.values()) + col_names = names[len(actual_index_names) :] + + data, index = data_from_pylibcudf_io( + tbl_w_meta, + col_names if columns is None else names, + actual_index_names, + ) + + if is_range_index: + index = range_idx + elif reset_index_name: + index.names = [None] * len(index.names) + + child_name_values = tbl_w_meta.child_names.values() + + data = { + name: ioutils._update_col_struct_field_names(col, child_names) + for (name, col), child_names in zip( + data.items(), child_name_values + ) + } + + return cudf.DataFrame._from_data(data, index=index) else: from pyarrow import orc - def read_orc_stripe(orc_file, stripe, columns): - pa_table = orc_file.read_stripe(stripe, columns) - if isinstance(pa_table, pa.RecordBatch): - pa_table = pa.Table.from_batches([pa_table]) - return pa_table - warnings.warn("Using CPU via PyArrow to read ORC dataset.") if len(filepath_or_buffer) > 1: raise NotImplementedError( @@ -364,11 +345,18 @@ def read_orc_stripe(orc_file, stripe, columns): orc_file = orc.ORCFile(filepath_or_buffer[0]) if stripes is not None and len(stripes) > 0: for stripe_source_file in stripes: - pa_tables = [ - read_orc_stripe(orc_file, i, columns) + pa_tables = ( + orc_file.read_stripe(i, columns) for i in stripe_source_file - ] - pa_table = pa.concat_tables(pa_tables) + ) + pa_table = pa.concat_tables( + [ + pa.Table.from_batches([table]) + if isinstance(table, pa.RecordBatch) + else table + for table in pa_tables + ] + ) else: pa_table = orc_file.read(columns=columns) df = cudf.DataFrame.from_arrow(pa_table) @@ -378,16 +366,18 @@ def read_orc_stripe(orc_file, stripe, columns): @ioutils.doc_to_orc() def to_orc( - df, + df: cudf.DataFrame, fname, - compression="snappy", - statistics="ROWGROUP", - stripe_size_bytes=None, - stripe_size_rows=None, - row_index_stride=None, + compression: Literal[ + False, None, "SNAPPY", "ZLIB", "ZSTD", "LZ4" + ] = "SNAPPY", + statistics: Literal["NONE", "STRIPE", "ROWGROUP"] = "ROWGROUP", + stripe_size_bytes: int | None = None, + stripe_size_rows: int | None = None, + row_index_stride: int | None = None, cols_as_map_type=None, storage_options=None, - index=None, + index: bool | None = None, ): 
"""{docstring}""" @@ -413,7 +403,7 @@ def to_orc( if ioutils.is_fsspec_open_file(path_or_buf): with path_or_buf as file_obj: file_obj = ioutils.get_IOBase_writer(file_obj) - liborc.write_orc( + _plc_write_orc( df, file_obj, compression, @@ -425,7 +415,7 @@ def to_orc( index, ) else: - liborc.write_orc( + _plc_write_orc( df, path_or_buf, compression, @@ -438,4 +428,279 @@ def to_orc( ) -ORCWriter = liborc.ORCWriter +@acquire_spill_lock() +def _plc_write_orc( + table: cudf.DataFrame, + path_or_buf, + compression: Literal[ + False, None, "SNAPPY", "ZLIB", "ZSTD", "LZ4" + ] = "SNAPPY", + statistics: Literal["NONE", "STRIPE", "ROWGROUP"] = "ROWGROUP", + stripe_size_bytes: int | None = None, + stripe_size_rows: int | None = None, + row_index_stride: int | None = None, + cols_as_map_type=None, + index: bool | None = None, +) -> None: + """ + See `cudf::io::write_orc`. + + See Also + -------- + cudf.read_orc + """ + user_data = {"pandas": ioutils.generate_pandas_metadata(table, index)} + if index is True or ( + index is None and not isinstance(table.index, cudf.RangeIndex) + ): + columns = ( + table._columns + if table.index is None + else itertools.chain(table.index._columns, table._columns) + ) + plc_table = plc.Table( + [col.to_pylibcudf(mode="read") for col in columns] + ) + tbl_meta = plc.io.types.TableInputMetadata(plc_table) + for level, idx_name in enumerate(table._index.names): + tbl_meta.column_metadata[level].set_name( + ioutils._index_level_name(idx_name, level, table._column_names) # type: ignore[arg-type] + ) + num_index_cols_meta = len(table.index.names) + else: + plc_table = plc.Table( + [col.to_pylibcudf(mode="read") for col in table._columns] + ) + tbl_meta = plc.io.types.TableInputMetadata(plc_table) + num_index_cols_meta = 0 + + has_map_type = False + if cols_as_map_type is not None: + cols_as_map_type = set(cols_as_map_type) + has_map_type = True + + for i, (name, col) in enumerate( + table._column_labels_and_values, start=num_index_cols_meta + ): + tbl_meta.column_metadata[i].set_name(name) + _set_col_children_metadata( + col, + tbl_meta.column_metadata[i], + has_map_type and name in cols_as_map_type, + ) + + options = ( + plc.io.orc.OrcWriterOptions.builder( + plc.io.SinkInfo([path_or_buf]), plc_table + ) + .metadata(tbl_meta) + .key_value_metadata(user_data) + .compression(_get_comp_type(compression)) + .enable_statistics(_get_orc_stat_freq(statistics)) + .build() + ) + if stripe_size_bytes is not None: + options.set_stripe_size_bytes(stripe_size_bytes) + if stripe_size_rows is not None: + options.set_stripe_size_rows(stripe_size_rows) + if row_index_stride is not None: + options.set_row_index_stride(row_index_stride) + + plc.io.orc.write_orc(options) + + +class ORCWriter: + """ + ORCWriter lets you you incrementally write out a ORC file from a series + of cudf tables + + See Also + -------- + cudf.io.orc.to_orc + """ + + def __init__( + self, + path, + index: bool | None = None, + compression: Literal[ + False, None, "SNAPPY", "ZLIB", "ZSTD", "LZ4" + ] = "SNAPPY", + statistics: Literal["NONE", "STRIPE", "ROWGROUP"] = "ROWGROUP", + cols_as_map_type=None, + stripe_size_bytes: int | None = None, + stripe_size_rows: int | None = None, + row_index_stride: int | None = None, + ): + self.sink = plc.io.SinkInfo([path]) + self.statistics = statistics + self.compression = compression + self.index = index + self.cols_as_map_type = ( + cols_as_map_type + if cols_as_map_type is None + else set(cols_as_map_type) + ) + self.stripe_size_bytes = stripe_size_bytes + self.stripe_size_rows = 
stripe_size_rows + self.row_index_stride = row_index_stride + self.initialized = False + + def write_table(self, table): + """Writes a single table to the file""" + if not self.initialized: + self._initialize_chunked_state(table) + + keep_index = self.index is not False and ( + table.index.name is not None + or isinstance(table.index, cudf.MultiIndex) + ) + if keep_index: + cols_to_write = itertools.chain( + table.index._columns, table._columns + ) + else: + cols_to_write = table._columns + + self.writer.write( + plc.Table([col.to_pylibcudf(mode="read") for col in cols_to_write]) + ) + + def close(self): + if not self.initialized: + return + self.writer.close() + + def _initialize_chunked_state(self, table): + """ + Prepare all the values required to build the + chunked_orc_writer_options anb creates a writer + """ + + num_index_cols_meta = 0 + plc_table = plc.Table( + [col.to_pylibcudf(mode="read") for col in table._columns] + ) + self.tbl_meta = plc.io.types.TableInputMetadata(plc_table) + if self.index is not False: + if isinstance(table.index, cudf.MultiIndex): + plc_table = plc.Table( + [ + col.to_pylibcudf(mode="read") + for col in itertools.chain( + table.index._columns, table._columns + ) + ] + ) + self.tbl_meta = plc.io.types.TableInputMetadata(plc_table) + for level, idx_name in enumerate(table.index.names): + self.tbl_meta.column_metadata[level].set_name(idx_name) + num_index_cols_meta = len(table.index.names) + else: + if table.index.name is not None: + plc_table = plc.Table( + [ + col.to_pylibcudf(mode="read") + for col in itertools.chain( + table.index._columns, table._columns + ) + ] + ) + self.tbl_meta = plc.io.types.TableInputMetadata(plc_table) + self.tbl_meta.column_metadata[0].set_name(table.index.name) + num_index_cols_meta = 1 + + has_map_type = self.cols_as_map_type is not None + for i, (name, col) in enumerate( + table._column_labels_and_values, start=num_index_cols_meta + ): + self.tbl_meta.column_metadata[i].set_name(name) + _set_col_children_metadata( + col, + self.tbl_meta.column_metadata[i], + has_map_type and name in self.cols_as_map_type, + ) + + user_data = { + "pandas": ioutils.generate_pandas_metadata(table, self.index) + } + + options = ( + plc.io.orc.ChunkedOrcWriterOptions.builder(self.sink) + .metadata(self.tbl_meta) + .key_value_metadata(user_data) + .compression(_get_comp_type(self.compression)) + .enable_statistics(_get_orc_stat_freq(self.statistics)) + .build() + ) + if self.stripe_size_bytes is not None: + options.set_stripe_size_bytes(self.stripe_size_bytes) + if self.stripe_size_rows is not None: + options.set_stripe_size_rows(self.stripe_size_rows) + if self.row_index_stride is not None: + options.set_row_index_stride(self.row_index_stride) + + self.writer = plc.io.orc.OrcChunkedWriter.from_options(options) + + self.initialized = True + + +def _get_comp_type( + compression: Literal[False, None, "SNAPPY", "ZLIB", "ZSTD", "LZ4"], +) -> plc.io.types.CompressionType: + if compression is None or compression is False: + return plc.io.types.CompressionType.NONE + + normed_compression = compression.upper() + if normed_compression == "SNAPPY": + return plc.io.types.CompressionType.SNAPPY + elif normed_compression == "ZLIB": + return plc.io.types.CompressionType.ZLIB + elif normed_compression == "ZSTD": + return plc.io.types.CompressionType.ZSTD + elif normed_compression == "LZ4": + return plc.io.types.CompressionType.LZ4 + else: + raise ValueError(f"Unsupported `compression` type {compression}") + + +def _get_orc_stat_freq( + statistics: Literal["NONE", 
"STRIPE", "ROWGROUP"], +) -> plc.io.types.StatisticsFreq: + """ + Convert ORC statistics terms to CUDF convention: + - ORC "STRIPE" == CUDF "ROWGROUP" + - ORC "ROWGROUP" == CUDF "PAGE" + """ + normed_statistics = statistics.upper() + if normed_statistics == "NONE": + return plc.io.types.StatisticsFreq.STATISTICS_NONE + elif normed_statistics == "STRIPE": + return plc.io.types.StatisticsFreq.STATISTICS_ROWGROUP + elif normed_statistics == "ROWGROUP": + return plc.io.types.StatisticsFreq.STATISTICS_PAGE + else: + raise ValueError(f"Unsupported `statistics_freq` type {statistics}") + + +def _set_col_children_metadata( + col: ColumnBase, + col_meta: plc.io.types.ColumnInMetadata, + list_column_as_map: bool = False, +) -> None: + if isinstance(col.dtype, cudf.StructDtype): + for i, (child_col, name) in enumerate( + zip(col.children, list(col.dtype.fields)) + ): + col_meta.child(i).set_name(name) + _set_col_children_metadata( + child_col, col_meta.child(i), list_column_as_map + ) + elif isinstance(col.dtype, cudf.ListDtype): + if list_column_as_map: + col_meta.set_list_column_as_map() + _set_col_children_metadata( + col.children[1], col_meta.child(1), list_column_as_map + ) + else: + return diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 5681601d2be..d9a3da6666d 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -3,37 +3,45 @@ import datetime import functools +import json import operator import os import urllib import warnings from io import BufferedWriter, BytesIO, IOBase, TextIOWrapper from threading import Thread -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any import fsspec import fsspec.implementations.local import numpy as np import pandas as pd +import pyarrow as pa from fsspec.core import expand_paths_if_needed, get_fs_token_paths import cudf from cudf.api.types import is_list_like from cudf.core._compat import PANDAS_LT_300 from cudf.utils.docutils import docfmt_partial +from cudf.utils.dtypes import np_dtypes_to_pandas_dtypes, np_to_pa_dtype try: import fsspec.parquet as fsspec_parquet - except ImportError: fsspec_parquet = None + if TYPE_CHECKING: - from collections.abc import Callable + from collections.abc import Callable, Hashable from cudf.core.column import ColumnBase +PARQUET_META_TYPE_MAP = { + str(cudf_dtype): str(pandas_dtype) + for cudf_dtype, pandas_dtype in np_dtypes_to_pandas_dtypes.items() +} + _BYTES_PER_THREAD_DEFAULT = 256 * 1024 * 1024 _ROW_GROUP_SIZE_BYTES_DEFAULT = np.iinfo(np.uint64).max @@ -1487,6 +1495,153 @@ ) +def _index_level_name( + index_name: Hashable, level: int, column_names: list[Hashable] +) -> Hashable: + """ + Return the name of an index level or a default name + if `index_name` is None or is already a column name. + + Parameters + ---------- + index_name : name of an Index object + level : level of the Index object + + Returns + ------- + name : str + """ + if index_name is not None and index_name not in column_names: + return index_name + else: + return f"__index_level_{level}__" + + +def generate_pandas_metadata(table: cudf.DataFrame, index: bool | None) -> str: + col_names: list[Hashable] = [] + types = [] + index_levels = [] + index_descriptors = [] + columns_to_convert = list(table._columns) + # Columns + for name, col in table._column_labels_and_values: + if cudf.get_option("mode.pandas_compatible"): + # in pandas-compat mode, non-string column names are stringified. 
+            col_names.append(str(name))
+        else:
+            col_names.append(name)
+
+        if isinstance(col.dtype, cudf.CategoricalDtype):
+            raise ValueError(
+                "'category' column dtypes are currently not "
+                + "supported by the gpu accelerated parquet writer"
+            )
+        elif isinstance(
+            col.dtype,
+            (cudf.ListDtype, cudf.StructDtype, cudf.core.dtypes.DecimalDtype),
+        ):
+            types.append(col.dtype.to_arrow())
+        else:
+            # A boolean element takes 8 bits in cudf and 1 bit in
+            # pyarrow. To make sure the cudf format is interoperable
+            # with arrow, we use `int8` type when converting from a
+            # cudf boolean array.
+            if col.dtype.type == np.bool_:
+                types.append(pa.int8())
+            else:
+                types.append(np_to_pa_dtype(col.dtype))
+
+    # Indexes
+    materialize_index = False
+    if index is not False:
+        for level, name in enumerate(table.index.names):
+            if isinstance(table.index, cudf.MultiIndex):
+                idx = table.index.get_level_values(level)
+            else:
+                idx = table.index
+
+            if isinstance(idx, cudf.RangeIndex):
+                if index is None:
+                    descr: dict[str, Any] | Hashable = {
+                        "kind": "range",
+                        "name": table.index.name,
+                        "start": table.index.start,
+                        "stop": table.index.stop,
+                        "step": table.index.step,
+                    }
+                else:
+                    materialize_index = True
+                    # When `index=True`, RangeIndex needs to be materialized.
+                    materialized_idx = idx._as_int_index()
+                    descr = _index_level_name(
+                        index_name=materialized_idx.name,
+                        level=level,
+                        column_names=col_names,
+                    )
+                    index_levels.append(materialized_idx)
+                    columns_to_convert.append(materialized_idx._values)
+                    col_names.append(descr)
+                    types.append(np_to_pa_dtype(materialized_idx.dtype))
+            else:
+                descr = _index_level_name(
+                    index_name=idx.name, level=level, column_names=col_names
+                )
+                columns_to_convert.append(idx._values)
+                col_names.append(descr)
+                if isinstance(idx.dtype, cudf.CategoricalDtype):
+                    raise ValueError(
+                        "'category' column dtypes are currently not "
+                        + "supported by the gpu accelerated parquet writer"
+                    )
+                elif isinstance(idx.dtype, cudf.ListDtype):
+                    types.append(col.dtype.to_arrow())
+                else:
+                    # A boolean element takes 8 bits in cudf and 1 bit in
+                    # pyarrow. To make sure the cudf format is interoperable
+                    # with arrow, we use `int8` type when converting from a
+                    # cudf boolean array.
+                    if idx.dtype.type == np.bool_:
+                        types.append(pa.int8())
+                    else:
+                        types.append(np_to_pa_dtype(idx.dtype))
+
+            index_levels.append(idx)
+            index_descriptors.append(descr)
+
+    df_meta = table.head(0)
+    if materialize_index:
+        df_meta.index = df_meta.index._as_int_index()
+    metadata = pa.pandas_compat.construct_metadata(
+        columns_to_convert=columns_to_convert,
+        # It is OKAY to do `.head(0).to_pandas()` because
+        # this method will extract `.columns` metadata only
+        df=df_meta.to_pandas(),
+        column_names=col_names,
+        index_levels=index_levels,
+        index_descriptors=index_descriptors,
+        preserve_index=index,
+        types=types,
+    )
+
+    md_dict = json.loads(metadata[b"pandas"])
+
+    # correct metadata for list and struct and nullable numeric types
+    for col_meta in md_dict["columns"]:
+        if (
+            col_meta["name"] in table._column_names
+            and table._data[col_meta["name"]].nullable
+            and col_meta["numpy_type"] in PARQUET_META_TYPE_MAP
+            and col_meta["pandas_type"] != "decimal"
+        ):
+            col_meta["numpy_type"] = PARQUET_META_TYPE_MAP[
+                col_meta["numpy_type"]
+            ]
+        if col_meta["numpy_type"] in ("list", "struct"):
+            col_meta["numpy_type"] = "object"
+
+    return json.dumps(md_dict)
+
+
 def is_url(url):
     """Check if a string is a valid URL to a network location.
diff --git a/python/pylibcudf/pylibcudf/io/types.pxd b/python/pylibcudf/pylibcudf/io/types.pxd index a1f3b17936c..61fe33d6805 100644 --- a/python/pylibcudf/pylibcudf/io/types.pxd +++ b/python/pylibcudf/pylibcudf/io/types.pxd @@ -65,7 +65,6 @@ cdef class ColumnInMetadata: cdef class TableInputMetadata: cdef table_input_metadata c_obj - cdef list column_metadata cdef class TableWithMetadata: cdef public Table tbl diff --git a/python/pylibcudf/pylibcudf/io/types.pyi b/python/pylibcudf/pylibcudf/io/types.pyi index a3a559219ff..63fa9d1ff79 100644 --- a/python/pylibcudf/pylibcudf/io/types.pyi +++ b/python/pylibcudf/pylibcudf/io/types.pyi @@ -64,6 +64,8 @@ class PartitionInfo: class TableInputMetadata: def __init__(self, table: Table): ... + @property + def column_metadata(self) -> list[ColumnInMetadata]: ... class ColumnInMetadata: def set_name(self, name: str) -> Self: ... diff --git a/python/pylibcudf/pylibcudf/io/types.pyx b/python/pylibcudf/pylibcudf/io/types.pyx index a2155829f2c..458595ca0e0 100644 --- a/python/pylibcudf/pylibcudf/io/types.pyx +++ b/python/pylibcudf/pylibcudf/io/types.pyx @@ -288,12 +288,14 @@ cdef class TableInputMetadata: """ def __init__(self, Table table): self.c_obj = table_input_metadata(table.view()) - self.column_metadata = [ + + @property + def column_metadata(self): + return [ ColumnInMetadata.from_libcudf(&self.c_obj.column_metadata[i], self) for i in range(self.c_obj.column_metadata.size()) ] - cdef class TableWithMetadata: """A container holding a table and its associated metadata (e.g. column names) From cbeefd8f4e4e67f52331131039533ef1f0ea0a65 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Fri, 6 Dec 2024 16:54:54 -0500 Subject: [PATCH 36/78] Add Parquet Reader options classes to pylibcudf (#17464) Follow up of #17263, this PR adds the parquet reader options classes to pylibcudf and plumbs the changes through cudf python. Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Matthew Roeschke (https://github.com/mroeschke) - Yunsong Wang (https://github.com/PointKernel) - Nghia Truong (https://github.com/ttnghia) - MithunR (https://github.com/mythrocks) URL: https://github.com/rapidsai/cudf/pull/17464 --- cpp/include/cudf/io/parquet.hpp | 1 + python/cudf/cudf/_lib/parquet.pyx | 58 +-- python/cudf_polars/cudf_polars/dsl/ir.py | 44 ++- python/pylibcudf/pylibcudf/io/parquet.pxd | 36 +- python/pylibcudf/pylibcudf/io/parquet.pyi | 21 +- python/pylibcudf/pylibcudf/io/parquet.pyx | 339 +++++++++++------- .../pylibcudf/tests/io/test_parquet.py | 28 +- 7 files changed, 333 insertions(+), 194 deletions(-) diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index bfe76d5690c..b561d0989e9 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -410,6 +410,7 @@ class parquet_reader_options_builder { * * @param val Boolean value whether to read matching projected and filter columns from mismatched * Parquet sources. + * * @return this for chaining. 
*/ parquet_reader_options_builder& allow_mismatched_pq_schemas(bool val) diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index c77c9875342..1b4c18d13a7 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -205,7 +205,7 @@ cdef object _process_metadata(object df, else: start = range_index_meta["start"] + skip_rows stop = range_index_meta["stop"] - if nrows != -1: + if nrows > -1: stop = start + nrows idx = cudf.RangeIndex( start=start, @@ -256,16 +256,27 @@ def read_parquet_chunked( # (see read_parquet) allow_range_index = columns is not None and len(columns) != 0 + options = ( + plc.io.parquet.ParquetReaderOptions.builder( + plc.io.SourceInfo(filepaths_or_buffers) + ) + .use_pandas_metadata(use_pandas_metadata) + .allow_mismatched_pq_schemas(allow_mismatched_pq_schemas) + .build() + ) + if row_groups is not None: + options.set_row_groups(row_groups) + if nrows > -1: + options.set_num_rows(nrows) + if skip_rows != 0: + options.set_skip_rows(skip_rows) + if columns is not None: + options.set_columns(columns) + reader = ChunkedParquetReader( - plc.io.SourceInfo(filepaths_or_buffers), - columns, - row_groups, - use_pandas_metadata=use_pandas_metadata, + options, chunk_read_limit=chunk_read_limit, pass_read_limit=pass_read_limit, - skip_rows=skip_rows, - nrows=nrows, - allow_mismatched_pq_schemas=allow_mismatched_pq_schemas, ) tbl_w_meta = reader.read_chunk() @@ -325,19 +336,26 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, if columns is not None and len(columns) == 0 or filters: allow_range_index = False - # Read Parquet - - tbl_w_meta = plc.io.parquet.read_parquet( - plc.io.SourceInfo(filepaths_or_buffers), - columns, - row_groups, - filters, - convert_strings_to_categories = False, - use_pandas_metadata = use_pandas_metadata, - skip_rows = skip_rows, - nrows = nrows, - allow_mismatched_pq_schemas=allow_mismatched_pq_schemas, + options = ( + plc.io.parquet.ParquetReaderOptions.builder( + plc.io.SourceInfo(filepaths_or_buffers) + ) + .use_pandas_metadata(use_pandas_metadata) + .allow_mismatched_pq_schemas(allow_mismatched_pq_schemas) + .build() ) + if row_groups is not None: + options.set_row_groups(row_groups) + if nrows > -1: + options.set_num_rows(nrows) + if skip_rows != 0: + options.set_skip_rows(skip_rows) + if columns is not None: + options.set_columns(columns) + if filters is not None: + options.set_filter(filters) + + tbl_w_meta = plc.io.parquet.read_parquet(options) df = cudf.DataFrame._from_data( *data_from_pylibcudf_io(tbl_w_meta) diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 1faa778ccf6..b5af3bb80bf 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -517,17 +517,22 @@ def do_evaluate( elif typ == "parquet": parquet_options = config_options.get("parquet_options", {}) if parquet_options.get("chunked", True): + options = plc.io.parquet.ParquetReaderOptions.builder( + plc.io.SourceInfo(paths) + ).build() + # We handle skip_rows != 0 by reading from the + # up to n_rows + skip_rows and slicing off the + # first skip_rows entries. 
+ # TODO: Remove this workaround once + # https://github.com/rapidsai/cudf/issues/16186 + # is fixed + nrows = n_rows + skip_rows + if nrows > -1: + options.set_num_rows(nrows) + if with_columns is not None: + options.set_columns(with_columns) reader = plc.io.parquet.ChunkedParquetReader( - plc.io.SourceInfo(paths), - columns=with_columns, - # We handle skip_rows != 0 by reading from the - # up to n_rows + skip_rows and slicing off the - # first skip_rows entries. - # TODO: Remove this workaround once - # https://github.com/rapidsai/cudf/issues/16186 - # is fixed - nrows=n_rows + skip_rows, - skip_rows=0, + options, chunk_read_limit=parquet_options.get( "chunk_read_limit", cls.PARQUET_DEFAULT_CHUNK_SIZE ), @@ -573,13 +578,18 @@ def slice_skip(tbl: plc.Table): if predicate is not None and row_index is None: # Can't apply filters during read if we have a row index. filters = to_parquet_filter(predicate.value) - tbl_w_meta = plc.io.parquet.read_parquet( - plc.io.SourceInfo(paths), - columns=with_columns, - filters=filters, - nrows=n_rows, - skip_rows=skip_rows, - ) + options = plc.io.parquet.ParquetReaderOptions.builder( + plc.io.SourceInfo(paths) + ).build() + if n_rows != -1: + options.set_num_rows(n_rows) + if skip_rows != 0: + options.set_skip_rows(skip_rows) + if with_columns is not None: + options.set_columns(with_columns) + if filters is not None: + options.set_filter(filters) + tbl_w_meta = plc.io.parquet.read_parquet(options) df = DataFrame.from_table( tbl_w_meta.tbl, # TODO: consider nested column names? diff --git a/python/pylibcudf/pylibcudf/io/parquet.pxd b/python/pylibcudf/pylibcudf/io/parquet.pxd index 7bd6ba91ca9..84f47cf5305 100644 --- a/python/pylibcudf/pylibcudf/io/parquet.pxd +++ b/python/pylibcudf/pylibcudf/io/parquet.pxd @@ -19,6 +19,8 @@ from pylibcudf.libcudf.io.parquet cimport ( chunked_parquet_reader as cpp_chunked_parquet_reader, parquet_writer_options, parquet_writer_options_builder, + parquet_reader_options, + parquet_reader_options_builder, chunked_parquet_writer_options, chunked_parquet_writer_options_builder, ) @@ -27,6 +29,25 @@ from pylibcudf.table cimport Table from pylibcudf.types cimport DataType +cdef class ParquetReaderOptions: + cdef parquet_reader_options c_obj + cdef SourceInfo source + cpdef void set_row_groups(self, list row_groups) + cpdef void set_num_rows(self, size_type nrows) + cpdef void set_skip_rows(self, int64_t skip_rows) + cpdef void set_columns(self, list col_names) + cpdef void set_filter(self, Expression filter) + +cdef class ParquetReaderOptionsBuilder: + cdef parquet_reader_options_builder c_obj + cdef SourceInfo source + cpdef ParquetReaderOptionsBuilder convert_strings_to_categories(self, bool val) + cpdef ParquetReaderOptionsBuilder use_pandas_metadata(self, bool val) + cpdef ParquetReaderOptionsBuilder allow_mismatched_pq_schemas(self, bool val) + cpdef ParquetReaderOptionsBuilder use_arrow_schema(self, bool val) + cpdef build(self) + + cdef class ChunkedParquetReader: cdef unique_ptr[cpp_chunked_parquet_reader] reader @@ -34,20 +55,7 @@ cdef class ChunkedParquetReader: cpdef TableWithMetadata read_chunk(self) -cpdef read_parquet( - SourceInfo source_info, - list columns = *, - list row_groups = *, - Expression filters = *, - bool convert_strings_to_categories = *, - bool use_pandas_metadata = *, - int64_t skip_rows = *, - size_type nrows = *, - bool allow_mismatched_pq_schemas = *, - # disabled see comment in parquet.pyx for more - # ReaderColumnSchema reader_column_schema = *, - # DataType timestamp_type = * -) +cpdef 
read_parquet(ParquetReaderOptions options)


cdef class ParquetChunkedWriter:
diff --git a/python/pylibcudf/pylibcudf/io/parquet.pyi b/python/pylibcudf/pylibcudf/io/parquet.pyi
index 22bea1abd8e..2d8d12c1a45 100644
--- a/python/pylibcudf/pylibcudf/io/parquet.pyi
+++ b/python/pylibcudf/pylibcudf/io/parquet.pyi
@@ -1,7 +1,8 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.

 from collections.abc import Mapping
-from typing import Self
+
+from typing_extensions import Self

 from pylibcudf.expressions import Expression
 from pylibcudf.io.types import (
@@ -16,6 +17,24 @@ from pylibcudf.io.types import (
 )
 from pylibcudf.table import Table

+class ParquetReaderOptions:
+    def __init__(self): ...
+    def set_row_groups(self, row_groups: list[list[int]]): ...
+    def set_num_rows(self, nrows: int): ...
+    def set_skip_rows(self, skip_rows: int): ...
+    def set_columns(self, col_names: list[str]): ...
+    def set_filter(self, filter: Expression): ...
+    @staticmethod
+    def builder(source: SourceInfo) -> ParquetReaderOptionsBuilder: ...
+
+class ParquetReaderOptionsBuilder:
+    def __init__(self): ...
+    def convert_strings_to_categories(self, val: bool) -> Self: ...
+    def use_pandas_metadata(self, val: bool) -> Self: ...
+    def allow_mismatched_pq_schemas(self, val: bool) -> Self: ...
+    def use_arrow_schema(self, val: bool) -> Self: ...
+    def build(self) -> ParquetReaderOptions: ...
+
 class ChunkedParquetReader:
     def __init__(
         self,
diff --git a/python/pylibcudf/pylibcudf/io/parquet.pyx b/python/pylibcudf/pylibcudf/io/parquet.pyx
index 9bdf849a30c..672fe2be847 100644
--- a/python/pylibcudf/pylibcudf/io/parquet.pyx
+++ b/python/pylibcudf/pylibcudf/io/parquet.pyx
@@ -42,47 +42,204 @@ __all__ = [
     "ParquetWriterOptionsBuilder",
     "read_parquet",
     "write_parquet",
+    "ParquetReaderOptions",
+    "ParquetReaderOptionsBuilder",
     "ChunkedParquetWriterOptions",
     "ChunkedParquetWriterOptionsBuilder"
     "merge_row_group_metadata",
 ]

-cdef parquet_reader_options _setup_parquet_reader_options(
-    SourceInfo source_info,
-    list columns = None,
-    list row_groups = None,
-    Expression filters = None,
-    bool convert_strings_to_categories = False,
-    bool use_pandas_metadata = True,
-    int64_t skip_rows = 0,
-    size_type nrows = -1,
-    bool allow_mismatched_pq_schemas=False,
-    # ReaderColumnSchema reader_column_schema = None,
-    # DataType timestamp_type = DataType(type_id.EMPTY)
-):
-    cdef vector[string] col_vec
-    cdef parquet_reader_options opts = (
-        parquet_reader_options.builder(source_info.c_obj)
-        .convert_strings_to_categories(convert_strings_to_categories)
-        .use_pandas_metadata(use_pandas_metadata)
-        .allow_mismatched_pq_schemas(allow_mismatched_pq_schemas)
-        .use_arrow_schema(True)
-        .build()
-    )
-    if row_groups is not None:
-        opts.set_row_groups(row_groups)
-    if nrows != -1:
-        opts.set_num_rows(nrows)
-    if skip_rows != 0:
-        opts.set_skip_rows(skip_rows)
-    if columns is not None:
-        col_vec.reserve(len(columns))
-        for col in columns:
-            col_vec.push_back(str(col).encode())
-        opts.set_columns(col_vec)
-    if filters is not None:
-        opts.set_filter(dereference(filters.c_obj.get()))
-    return opts
+
+cdef class ParquetReaderOptions:
+    """The settings to use for ``read_parquet``
+    For details, see :cpp:class:`cudf::io::parquet_reader_options`
+    """
+    @staticmethod
+    def builder(SourceInfo source):
+        """
+        Create a ParquetReaderOptionsBuilder object
+
+        For details, see :cpp:func:`cudf::io::parquet_reader_options::builder`
+
+        Parameters
+        ----------
+        source : SourceInfo
+            The source to read the Parquet file from.
+ + Returns + ------- + ParquetReaderOptionsBuilder + Builder to build ParquetReaderOptions + """ + cdef ParquetReaderOptionsBuilder parquet_builder = ( + ParquetReaderOptionsBuilder.__new__(ParquetReaderOptionsBuilder) + ) + parquet_builder.c_obj = parquet_reader_options.builder(source.c_obj) + parquet_builder.source = source + return parquet_builder + + cpdef void set_row_groups(self, list row_groups): + """ + Sets list of individual row groups to read. + + Parameters + ---------- + row_groups : list + List of row groups to read + + Returns + ------- + None + """ + cdef vector[vector[size_type]] outer + cdef vector[size_type] inner + for row_group in row_groups: + for x in row_group: + inner.push_back(x) + outer.push_back(inner) + inner.clear() + + self.c_obj.set_row_groups(outer) + + cpdef void set_num_rows(self, size_type nrows): + """ + Sets number of rows to read. + + Parameters + ---------- + nrows : size_type + Number of rows to read after skip + + Returns + ------- + None + """ + self.c_obj.set_num_rows(nrows) + + cpdef void set_skip_rows(self, int64_t skip_rows): + """ + Sets number of rows to skip. + + Parameters + ---------- + skip_rows : int64_t + Number of rows to skip from start + + Returns + ------- + None + """ + self.c_obj.set_skip_rows(skip_rows) + + cpdef void set_columns(self, list col_names): + """ + Sets names of the columns to be read. + + Parameters + ---------- + col_names : list + List of column names + + Returns + ------- + None + """ + cdef vector[string] vec + for name in col_names: + vec.push_back(str(name).encode()) + self.c_obj.set_columns(vec) + + cpdef void set_filter(self, Expression filter): + """ + Sets AST based filter for predicate pushdown. + + Parameters + ---------- + filter : Expression + AST expression to use as filter + + Returns + ------- + None + """ + self.c_obj.set_filter(dereference(filter.c_obj.get())) + + +cdef class ParquetReaderOptionsBuilder: + cpdef ParquetReaderOptionsBuilder convert_strings_to_categories(self, bool val): + """ + Sets enable/disable conversion of strings to categories. + + Parameters + ---------- + val : bool + Boolean value to enable/disable conversion of string columns to categories + + Returns + ------- + ParquetReaderOptionsBuilder + """ + self.c_obj.convert_strings_to_categories(val) + return self + + cpdef ParquetReaderOptionsBuilder use_pandas_metadata(self, bool val): + """ + Sets to enable/disable use of pandas metadata to read. + + Parameters + ---------- + val : bool + Boolean value whether to use pandas metadata + + Returns + ------- + ParquetReaderOptionsBuilder + """ + self.c_obj.use_pandas_metadata(val) + return self + + cpdef ParquetReaderOptionsBuilder allow_mismatched_pq_schemas(self, bool val): + """ + Sets to enable/disable reading of matching projected and filter + columns from mismatched Parquet sources. + + Parameters + ---------- + val : bool + Boolean value whether to read matching projected and filter + columns from mismatched Parquet sources. + + Returns + ------- + ParquetReaderOptionsBuilder + """ + self.c_obj.allow_mismatched_pq_schemas(val) + return self + + cpdef ParquetReaderOptionsBuilder use_arrow_schema(self, bool val): + """ + Sets to enable/disable use of arrow schema to read. 
+ + Parameters + ---------- + val : bool + Boolean value whether to use arrow schema + + Returns + ------- + ParquetReaderOptionsBuilder + """ + self.c_obj.use_arrow_schema(val) + return self + + cpdef build(self): + """Create a ParquetReaderOptions object""" + cdef ParquetReaderOptions parquet_options = ParquetReaderOptions.__new__( + ParquetReaderOptions + ) + parquet_options.c_obj = move(self.c_obj.build()) + parquet_options.source = self.source + return parquet_options cdef class ChunkedParquetReader: @@ -93,63 +250,27 @@ cdef class ChunkedParquetReader: Parameters ---------- - source_info : SourceInfo - The SourceInfo object to read the Parquet file from. - columns : list, default None - The names of the columns to be read - row_groups : list[list[size_type]], default None - List of row groups to be read. - use_pandas_metadata : bool, default True - If True, return metadata about the index column in - the per-file user metadata of the ``TableWithMetadata`` - convert_strings_to_categories : bool, default False - Whether to convert string columns to the category type - skip_rows : int64_t, default 0 - The number of rows to skip from the start of the file. - nrows : size_type, default -1 - The number of rows to read. By default, read the entire file. + options : ParquetReaderOptions + Settings for controlling reading behavior chunk_read_limit : size_t, default 0 Limit on total number of bytes to be returned per read, or 0 if there is no limit. pass_read_limit : size_t, default 1024000000 Limit on the amount of memory used for reading and decompressing data or 0 if there is no limit. - allow_mismatched_pq_schemas : bool, default False - Whether to read (matching) columns specified in `columns` from - the input files with otherwise mismatched schemas. """ def __init__( self, - SourceInfo source_info, - list columns=None, - list row_groups=None, - bool use_pandas_metadata=True, - bool convert_strings_to_categories=False, - int64_t skip_rows = 0, - size_type nrows = -1, + ParquetReaderOptions options, size_t chunk_read_limit=0, size_t pass_read_limit=1024000000, - bool allow_mismatched_pq_schemas=False ): - - cdef parquet_reader_options opts = _setup_parquet_reader_options( - source_info, - columns, - row_groups, - filters=None, - convert_strings_to_categories=convert_strings_to_categories, - use_pandas_metadata=use_pandas_metadata, - skip_rows=skip_rows, - nrows=nrows, - allow_mismatched_pq_schemas=allow_mismatched_pq_schemas, - ) - with nogil: self.reader.reset( new cpp_chunked_parquet_reader( chunk_read_limit, pass_read_limit, - opts + options.c_obj, ) ) @@ -184,69 +305,23 @@ cdef class ChunkedParquetReader: return TableWithMetadata.from_libcudf(c_result) -cpdef read_parquet( - SourceInfo source_info, - list columns = None, - list row_groups = None, - Expression filters = None, - bool convert_strings_to_categories = False, - bool use_pandas_metadata = True, - int64_t skip_rows = 0, - size_type nrows = -1, - bool allow_mismatched_pq_schemas = False, - # Disabled, these aren't used by cudf-python - # we should only add them back in if there's user demand - # ReaderColumnSchema reader_column_schema = None, - # DataType timestamp_type = DataType(type_id.EMPTY) -): - """Reads an Parquet file into a :py:class:`~.types.TableWithMetadata`. + +cpdef read_parquet(ParquetReaderOptions options): + """ + Read from Parquet format. + + The source to read from and options are encapsulated + by the `options` object. For details, see :cpp:func:`read_parquet`. 
Parameters ---------- - source_info : SourceInfo - The SourceInfo object to read the Parquet file from. - columns : list, default None - The string names of the columns to be read. - row_groups : list[list[size_type]], default None - List of row groups to be read. - filters : Expression, default None - An AST :py:class:`pylibcudf.expressions.Expression` - to use for predicate pushdown. - convert_strings_to_categories : bool, default False - Whether to convert string columns to the category type - use_pandas_metadata : bool, default True - If True, return metadata about the index column in - the per-file user metadata of the ``TableWithMetadata`` - skip_rows : int64_t, default 0 - The number of rows to skip from the start of the file. - nrows : size_type, default -1 - The number of rows to read. By default, read the entire file. - allow_mismatched_pq_schemas : bool, default False - If True, enable reading (matching) columns specified in `columns` - from the input files with otherwise mismatched schemas. - - Returns - ------- - TableWithMetadata - The Table and its corresponding metadata (column names) that were read in. + options: ParquetReaderOptions + Settings for controlling reading behavior """ - cdef table_with_metadata c_result - cdef parquet_reader_options opts = _setup_parquet_reader_options( - source_info, - columns, - row_groups, - filters, - convert_strings_to_categories, - use_pandas_metadata, - skip_rows, - nrows, - allow_mismatched_pq_schemas, - ) - with nogil: - c_result = move(cpp_read_parquet(opts)) + c_result = move(cpp_read_parquet(options.c_obj)) return TableWithMetadata.from_libcudf(c_result) diff --git a/python/pylibcudf/pylibcudf/tests/io/test_parquet.py b/python/pylibcudf/pylibcudf/tests/io/test_parquet.py index 94524acbcc8..da535809745 100644 --- a/python/pylibcudf/pylibcudf/tests/io/test_parquet.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_parquet.py @@ -31,19 +31,24 @@ def test_read_parquet_basic( binary_source_or_sink, pa_table, **_COMMON_PARQUET_SOURCE_KWARGS ) - res = plc.io.parquet.read_parquet( - plc.io.SourceInfo([source]), - nrows=nrows, - skip_rows=skiprows, - columns=columns, - ) + options = plc.io.parquet.ParquetReaderOptions.builder( + plc.io.SourceInfo([source]) + ).build() + if nrows > -1: + options.set_num_rows(nrows) + if skiprows != 0: + options.set_skip_rows(skiprows) + if columns is not None: + options.set_columns(columns) + + res = plc.io.parquet.read_parquet(options) if columns is not None: pa_table = pa_table.select(columns) # Adapt to nrows/skiprows pa_table = pa_table.slice( - offset=skiprows, length=nrows if nrows != -1 else None + offset=skiprows, length=nrows if nrows > -1 else None ) assert_table_and_meta_eq(pa_table, res, check_field_nullability=False) @@ -95,9 +100,12 @@ def test_read_parquet_filters( binary_source_or_sink, pa_table, **_COMMON_PARQUET_SOURCE_KWARGS ) - plc_table_w_meta = plc.io.parquet.read_parquet( - plc.io.SourceInfo([source]), filters=plc_filters - ) + options = plc.io.parquet.ParquetReaderOptions.builder( + plc.io.SourceInfo([source]) + ).build() + options.set_filter(plc_filters) + + plc_table_w_meta = plc.io.parquet.read_parquet(options) exp = read_table(source, filters=pa_filters) assert_table_and_meta_eq( exp, plc_table_w_meta, check_field_nullability=False From 14b4321b5172104c5d9801e196e607e3bb0c4c39 Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Fri, 6 Dec 2024 16:27:03 -0600 Subject: [PATCH 37/78] Fix all null list column with missing child column in 
JSON reader (#17348) Authors: - Karthikeyan (https://github.com/karthikeyann) Approvers: - Nghia Truong (https://github.com/ttnghia) - Basit Ayantunde (https://github.com/lamarrr) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/17348 --- cpp/src/io/json/host_tree_algorithms.cu | 126 ++++++++++++++------- cpp/src/io/json/json_column.cu | 67 ++++++------ cpp/src/io/json/nested_json.hpp | 12 ++ cpp/src/io/json/parser_features.cpp | 58 +++++++--- cpp/tests/io/json/json_test.cpp | 140 ++++++++++++++++++++++++ 5 files changed, 317 insertions(+), 86 deletions(-) diff --git a/cpp/src/io/json/host_tree_algorithms.cu b/cpp/src/io/json/host_tree_algorithms.cu index 7fafa885c66..7b9fc25d1cc 100644 --- a/cpp/src/io/json/host_tree_algorithms.cu +++ b/cpp/src/io/json/host_tree_algorithms.cu @@ -222,18 +222,19 @@ struct json_column_data { using hashmap_of_device_columns = std::unordered_map>; -std::pair, hashmap_of_device_columns> build_tree( - device_json_column& root, - host_span is_str_column_all_nulls, - tree_meta_t& d_column_tree, - device_span d_unique_col_ids, - device_span d_max_row_offsets, - std::vector const& column_names, - NodeIndexT row_array_parent_col_id, - bool is_array_of_arrays, - cudf::io::json_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr); +std:: + tuple, cudf::detail::host_vector, hashmap_of_device_columns> + build_tree(device_json_column& root, + host_span is_str_column_all_nulls, + tree_meta_t& d_column_tree, + device_span d_unique_col_ids, + device_span d_max_row_offsets, + std::vector const& column_names, + NodeIndexT row_array_parent_col_id, + bool is_array_of_arrays, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); void scatter_offsets(tree_meta_t const& tree, device_span col_ids, @@ -242,6 +243,7 @@ void scatter_offsets(tree_meta_t const& tree, device_span sorted_col_ids, // Reuse this for parent_col_ids tree_meta_t const& d_column_tree, host_span ignore_vals, + host_span is_mixed, hashmap_of_device_columns const& columns, rmm::cuda_stream_view stream); @@ -363,17 +365,17 @@ void make_device_json_column(device_span input, } return std::vector(); }(); - auto const [ignore_vals, columns] = build_tree(root, - is_str_column_all_nulls, - d_column_tree, - d_unique_col_ids, - d_max_row_offsets, - column_names, - row_array_parent_col_id, - is_array_of_arrays, - options, - stream, - mr); + auto const [ignore_vals, is_mixed_pruned, columns] = build_tree(root, + is_str_column_all_nulls, + d_column_tree, + d_unique_col_ids, + d_max_row_offsets, + column_names, + row_array_parent_col_id, + is_array_of_arrays, + options, + stream, + mr); if (ignore_vals.empty()) return; scatter_offsets(tree, col_ids, @@ -382,22 +384,24 @@ void make_device_json_column(device_span input, sorted_col_ids, d_column_tree, ignore_vals, + is_mixed_pruned, columns, stream); } -std::pair, hashmap_of_device_columns> build_tree( - device_json_column& root, - host_span is_str_column_all_nulls, - tree_meta_t& d_column_tree, - device_span d_unique_col_ids, - device_span d_max_row_offsets, - std::vector const& column_names, - NodeIndexT row_array_parent_col_id, - bool is_array_of_arrays, - cudf::io::json_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) +std:: + tuple, cudf::detail::host_vector, hashmap_of_device_columns> + build_tree(device_json_column& root, + host_span is_str_column_all_nulls, + tree_meta_t& 
d_column_tree, + device_span d_unique_col_ids, + device_span d_max_row_offsets, + std::vector const& column_names, + NodeIndexT row_array_parent_col_id, + bool is_array_of_arrays, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { bool const is_enabled_lines = options.is_enabled_lines(); bool const is_enabled_mixed_types_as_string = options.is_enabled_mixed_types_as_string(); @@ -488,7 +492,9 @@ std::pair, hashmap_of_device_columns> build_tree // NoPruning: iterate through schema and enforce type. if (adj[parent_node_sentinel].empty()) - return {cudf::detail::make_host_vector(0, stream), {}}; // for empty file + return {cudf::detail::make_host_vector(0, stream), + cudf::detail::make_host_vector(0, stream), + {}}; // for empty file CUDF_EXPECTS(adj[parent_node_sentinel].size() == 1, "Should be 1"); auto expected_types = cudf::detail::make_host_vector(num_columns, stream); std::fill_n(expected_types.begin(), num_columns, NUM_NODE_CLASSES); @@ -551,11 +557,14 @@ std::pair, hashmap_of_device_columns> build_tree auto list_child = schema.child_types.at(this_list_child_name); for (auto const& child_id : child_ids) mark_is_pruned(child_id, list_child); + // TODO: Store null map of non-target types for list children to mark list entry as null. } }; if (is_array_of_arrays) { if (adj[adj[parent_node_sentinel][0]].empty()) - return {cudf::detail::make_host_vector(0, stream), {}}; + return {cudf::detail::make_host_vector(0, stream), + cudf::detail::make_host_vector(0, stream), + {}}; auto root_list_col_id = is_enabled_lines ? adj[parent_node_sentinel][0] : adj[adj[parent_node_sentinel][0]][0]; // mark root and row array col_id as not pruned. @@ -647,8 +656,12 @@ std::pair, hashmap_of_device_columns> build_tree ? adj[parent_node_sentinel][0] : (adj[adj[parent_node_sentinel][0]].empty() ? -1 : adj[adj[parent_node_sentinel][0]][0]); + // List children which are pruned mixed types, nullify parent list row. + auto is_mixed_pruned = cudf::detail::make_host_vector(num_columns, stream); + std::fill_n(is_mixed_pruned.begin(), num_columns, false); auto handle_mixed_types = [&column_categories, &is_str_column_all_nulls, + &is_mixed_pruned, &is_pruned, &expected_types, &is_enabled_mixed_types_as_string, @@ -794,6 +807,14 @@ std::pair, hashmap_of_device_columns> build_tree "list child column insertion failed, duplicate column name in the parent"); ref.get().column_order.emplace_back(list_child_name); auto this_ref = std::ref(ref.get().child_columns.at(list_child_name)); + if (options.is_enabled_experimental()) { + for (auto const& child_id : child_ids) { + if (is_pruned[child_id]) { + // store this child_id for mixed_type nullify parent list_id. + is_mixed_pruned[child_id] = is_pruned[child_id]; + } + } + } // Mixed type handling handle_mixed_types(child_ids); if (child_ids.empty()) { @@ -829,7 +850,7 @@ std::pair, hashmap_of_device_columns> build_tree [](auto exp, auto cat) { return exp == NUM_NODE_CLASSES ? 
cat : exp; }); cudf::detail::cuda_memcpy_async(d_column_tree.node_categories, expected_types, stream); - return {is_pruned, columns}; + return {is_pruned, is_mixed_pruned, columns}; } void scatter_offsets(tree_meta_t const& tree, @@ -839,6 +860,7 @@ void scatter_offsets(tree_meta_t const& tree, device_span sorted_col_ids, // Reuse this for parent_col_ids tree_meta_t const& d_column_tree, host_span ignore_vals, + host_span is_mixed_pruned, hashmap_of_device_columns const& columns, rmm::cuda_stream_view stream) { @@ -857,6 +879,8 @@ void scatter_offsets(tree_meta_t const& tree, auto d_ignore_vals = cudf::detail::make_device_uvector_async( ignore_vals, stream, cudf::get_current_device_resource_ref()); + auto d_is_mixed_pruned = cudf::detail::make_device_uvector_async( + is_mixed_pruned, stream, cudf::get_current_device_resource_ref()); auto d_columns_data = cudf::detail::make_device_uvector_async( columns_data, stream, cudf::get_current_device_resource_ref()); @@ -921,9 +945,31 @@ void scatter_offsets(tree_meta_t const& tree, column_categories[col_ids[parent_node_id]] == NC_LIST and (!d_ignore_vals[col_ids[parent_node_id]]); }); + // For children of list and in ignore_vals, find it's parent node id, and set corresponding + // parent's null mask to null. Setting mixed type list rows to null. + auto const num_list_children = thrust::distance( + thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin()), list_children_end); + thrust::for_each_n( + rmm::exec_policy_nosync(stream), + thrust::make_counting_iterator(0), + num_list_children, + [node_ids = node_ids.begin(), + parent_node_ids = tree.parent_node_ids.begin(), + column_categories = d_column_tree.node_categories.begin(), + col_ids = col_ids.begin(), + row_offsets = row_offsets.begin(), + d_is_mixed_pruned = d_is_mixed_pruned.begin(), + d_ignore_vals = d_ignore_vals.begin(), + d_columns_data = d_columns_data.begin()] __device__(size_type i) { + auto const node_id = node_ids[i]; + auto const parent_node_id = parent_node_ids[node_id]; + if (parent_node_id == parent_node_sentinel or d_ignore_vals[col_ids[parent_node_id]]) return; + if (column_categories[col_ids[parent_node_id]] == NC_LIST and + d_is_mixed_pruned[col_ids[node_id]]) { + clear_bit(d_columns_data[col_ids[parent_node_id]].validity, row_offsets[parent_node_id]); + } + }); - auto const num_list_children = - list_children_end - thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin()); thrust::stable_sort_by_key(rmm::exec_policy_nosync(stream), parent_col_ids.begin(), parent_col_ids.begin() + num_list_children, diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index 30a154fdda2..1fe58a0449f 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -464,46 +464,49 @@ std::pair, std::vector> device_json_co column_names.emplace_back( json_col.child_columns.empty() ? list_child_name : json_col.child_columns.begin()->first); - // Note: json_col modified here, reuse the memory + // If child is not present, set the null mask correctly, but offsets are zero, and children + // are empty. Note: json_col modified here, reuse the memory auto offsets_column = std::make_unique(data_type{type_id::INT32}, num_rows + 1, json_col.child_offsets.release(), rmm::device_buffer{}, 0); // Create children column - auto child_schema_element = - json_col.child_columns.empty() ? 
std::optional{} : get_list_child_schema(); - auto [child_column, names] = - json_col.child_columns.empty() or (prune_columns and !child_schema_element.has_value()) - ? std::pair, - // EMPTY type could not used because gather throws exception on EMPTY type. - std::vector>{std::make_unique( - data_type{type_id::INT8}, - 0, - rmm::device_buffer{}, - rmm::device_buffer{}, - 0), - std::vector{}} - : device_json_column_to_cudf_column(json_col.child_columns.begin()->second, - d_input, - options, - prune_columns, - child_schema_element, - stream, - mr); + auto child_schema_element = get_list_child_schema(); + auto [child_column, names] = [&]() { + if (json_col.child_columns.empty()) { + // EMPTY type could not used because gather throws exception on EMPTY type. + auto empty_col = make_empty_column( + child_schema_element.value_or(schema_element{data_type{type_id::INT8}}), stream, mr); + auto children_metadata = std::vector{ + make_column_name_info( + child_schema_element.value_or(schema_element{data_type{type_id::INT8}}), + list_child_name) + .children}; + + return std::pair, std::vector>{ + std::move(empty_col), children_metadata}; + } + return device_json_column_to_cudf_column(json_col.child_columns.begin()->second, + d_input, + options, + prune_columns, + child_schema_element, + stream, + mr); + }(); column_names.back().children = names; auto [result_bitmask, null_count] = make_validity(json_col); - auto ret_col = make_lists_column(num_rows, - std::move(offsets_column), - std::move(child_column), - 0, - rmm::device_buffer{0, stream, mr}, - stream, - mr); - // The null_mask is set after creation of list column is to skip the purge_nonempty_nulls and - // null validation applied in make_lists_column factory, which is not needed for json - // parent column cannot be null when its children is non-empty in JSON - if (null_count != 0) { ret_col->set_null_mask(std::move(result_bitmask), null_count); } + auto ret_col = make_lists_column( + num_rows, + std::move(offsets_column), + std::move(child_column), + null_count, + null_count == 0 ? 
rmm::device_buffer{0, stream, mr} : std::move(result_bitmask), + stream, + mr); + // Since some rows in child column may need to be nullified due to mixed types, we can not + // skip the purge_nonempty_nulls call in make_lists_column factory return {std::move(ret_col), std::move(column_names)}; } default: CUDF_FAIL("Unsupported column type"); break; diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index 4989fff4b30..2f6942fe139 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -429,6 +429,18 @@ table_with_metadata device_parse_nested_json(device_span input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); +/** + * @brief Create empty column of a given nested schema + * + * @param schema The schema of the column to create + * @param stream The CUDA stream to which kernels are dispatched + * @param mr resource with which to allocate + * @return The empty column + */ +std::unique_ptr make_empty_column(schema_element const& schema, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + /** * @brief Create all null column of a given nested schema * diff --git a/cpp/src/io/json/parser_features.cpp b/cpp/src/io/json/parser_features.cpp index ced7acb9cde..2da320b2af3 100644 --- a/cpp/src/io/json/parser_features.cpp +++ b/cpp/src/io/json/parser_features.cpp @@ -159,7 +159,17 @@ struct empty_column_functor { std::unique_ptr child = cudf::type_dispatcher( schema.child_types.at(child_name).type, *this, schema.child_types.at(child_name)); auto offsets = make_empty_column(data_type(type_to_id())); - return make_lists_column(0, std::move(offsets), std::move(child), 0, {}, stream, mr); + std::vector> child_columns; + child_columns.push_back(std::move(offsets)); + child_columns.push_back(std::move(child)); + // Do not use `cudf::make_lists_column` since we do not need to call `purge_nonempty_nulls` on + // the child column as it does not have non-empty nulls. 
Look issue #17356 + return std::make_unique(cudf::data_type{type_id::LIST}, + 0, + rmm::device_buffer{}, + rmm::device_buffer{}, + 0, + std::move(child_columns)); } template )> @@ -174,6 +184,13 @@ struct empty_column_functor { } }; +std::unique_ptr make_empty_column(schema_element const& schema, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + return cudf::type_dispatcher(schema.type, empty_column_functor{stream, mr}, schema); +} + /// Created all null column of the specified schema struct allnull_column_functor { rmm::cuda_stream_view stream; @@ -198,10 +215,9 @@ struct allnull_column_functor { std::unique_ptr operator()(schema_element const& schema, size_type size) const { CUDF_EXPECTS(schema.child_types.size() == 1, "Dictionary column should have only one child"); - auto const& child_name = schema.child_types.begin()->first; - std::unique_ptr child = cudf::type_dispatcher(schema.child_types.at(child_name).type, - empty_column_functor{stream, mr}, - schema.child_types.at(child_name)); + auto const& child_name = schema.child_types.begin()->first; + std::unique_ptr child = + make_empty_column(schema.child_types.at(child_name), stream, mr); return make_fixed_width_column(schema.type, size, mask_state::ALL_NULL, stream, mr); auto indices = make_zeroed_offsets(size - 1); auto null_mask = cudf::detail::create_null_mask(size, mask_state::ALL_NULL, stream, mr); @@ -221,14 +237,22 @@ struct allnull_column_functor { std::unique_ptr operator()(schema_element const& schema, size_type size) const { CUDF_EXPECTS(schema.child_types.size() == 1, "List column should have only one child"); - auto const& child_name = schema.child_types.begin()->first; - std::unique_ptr child = cudf::type_dispatcher(schema.child_types.at(child_name).type, - empty_column_functor{stream, mr}, - schema.child_types.at(child_name)); - auto offsets = make_zeroed_offsets(size); + auto const& child_name = schema.child_types.begin()->first; + std::unique_ptr child = + make_empty_column(schema.child_types.at(child_name), stream, mr); + auto offsets = make_zeroed_offsets(size); auto null_mask = cudf::detail::create_null_mask(size, mask_state::ALL_NULL, stream, mr); - return make_lists_column( - size, std::move(offsets), std::move(child), size, std::move(null_mask), stream, mr); + std::vector> child_columns; + child_columns.push_back(std::move(offsets)); + child_columns.push_back(std::move(child)); + // Do not use `cudf::make_lists_column` since we do not need to call `purge_nonempty_nulls` on + // the child column as it does not have non-empty nulls. Look issue #17356 + return std::make_unique(cudf::data_type{type_id::LIST}, + size, + rmm::device_buffer{}, + std::move(null_mask), + size, + std::move(child_columns)); } template )> @@ -240,8 +264,14 @@ struct allnull_column_functor { schema.child_types.at(child_name).type, *this, schema.child_types.at(child_name), size)); } auto null_mask = cudf::detail::create_null_mask(size, mask_state::ALL_NULL, stream, mr); - return make_structs_column( - size, std::move(child_columns), size, std::move(null_mask), stream, mr); + // Do not use `cudf::make_structs_column` since we do not need to call `superimpose_nulls` on + // the children columns. 
Look issue #17356 + return std::make_unique(cudf::data_type{type_id::STRUCT}, + size, + rmm::device_buffer{}, + std::move(null_mask), + size, + std::move(child_columns)); } }; diff --git a/cpp/tests/io/json/json_test.cpp b/cpp/tests/io/json/json_test.cpp index 3c8db99c3c7..37a750330fa 100644 --- a/cpp/tests/io/json/json_test.cpp +++ b/cpp/tests/io/json/json_test.cpp @@ -56,6 +56,8 @@ using int16_wrapper = wrapper; using int64_wrapper = wrapper; using timestamp_ms_wrapper = wrapper; using bool_wrapper = wrapper; +using size_type_wrapper = wrapper; +using strings_wrapper = cudf::test::strings_column_wrapper; using cudf::data_type; using cudf::type_id; @@ -3253,6 +3255,144 @@ TEST_F(JsonReaderTest, JsonNestedDtypeFilterWithOrder) CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(2), *wrapped); } } + + // test list (all-null) of struct (empty) of string (empty) + { + std::string json_stringl = R"( + {"a" : [1], "c2": [1, 2]} + {} + )"; + auto lines = true; + cudf::io::json_reader_options in_options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{json_stringl.data(), json_stringl.size()}) + .prune_columns(true) + .experimental(true) + .lines(lines); + + cudf::io::schema_element dtype_schema{ + data_type{cudf::type_id::STRUCT}, + { + {"a", {data_type{cudf::type_id::LIST}, {{"element", {dtype()}}}}}, + {"c2", + {data_type{cudf::type_id::LIST}, + {{"element", + {data_type{cudf::type_id::STRUCT}, + { + {"d", {data_type{cudf::type_id::STRING}}}, + }, + {{"d"}}}}}}}, + }, + {{"a", "c2"}}}; + in_options.set_dtypes(dtype_schema); + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + // Make sure we have column "a":[int64_t] + ASSERT_EQ(result.tbl->num_columns(), 2); + ASSERT_EQ(result.metadata.schema_info.size(), 2); + EXPECT_EQ(result.metadata.schema_info[0].name, "a"); + ASSERT_EQ(result.metadata.schema_info[0].children.size(), 2); + EXPECT_EQ(result.metadata.schema_info[0].children[0].name, "offsets"); + EXPECT_EQ(result.metadata.schema_info[0].children[1].name, "element"); + // Make sure we have all null list "c2": [{"d": ""}] + EXPECT_EQ(result.metadata.schema_info[1].name, "c2"); + ASSERT_EQ(result.metadata.schema_info[1].children.size(), 2); + EXPECT_EQ(result.metadata.schema_info[1].children[0].name, "offsets"); + EXPECT_EQ(result.metadata.schema_info[1].children[1].name, "element"); + ASSERT_EQ(result.metadata.schema_info[1].children[1].children.size(), 1); + EXPECT_EQ(result.metadata.schema_info[1].children[1].children[0].name, "d"); + + auto const expected0 = [&] { + auto const valids = std::vector{1, 0}; + auto [null_mask, null_count] = + cudf::test::detail::make_null_mask(valids.begin(), valids.end()); + return cudf::make_lists_column(2, + size_type_wrapper{0, 1, 1}.release(), + int64_wrapper{1}.release(), + null_count, + std::move(null_mask)); + }(); + + auto const expected1 = [&] { + auto const get_structs = [] { + auto child = cudf::test::strings_column_wrapper{}; + return cudf::test::structs_column_wrapper{{child}}; + }; + auto const valids = std::vector{0, 0}; + auto [null_mask, null_count] = + cudf::test::detail::make_null_mask(valids.begin(), valids.end()); + return cudf::make_lists_column(2, + size_type_wrapper{0, 0, 0}.release(), + get_structs().release(), + null_count, + std::move(null_mask)); + }(); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected0, result.tbl->get_column(0).view()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected1, result.tbl->get_column(1).view()); + } +} + +TEST_F(JsonReaderTest, NullifyMixedList) +{ + using namespace 
cudf::test::iterators; + // test list + std::string json_stringl = R"( + {"c2": []} + {"c2": [{}]} + {"c2": [[]]} + {"c2": [{}, [], {}]} + {"c2": [[123], {"b": "1"}]} + {"c2": [{"x": "y"}, {"b": "1"}]} + {} + )"; + // [], [{null, null}], null, null, null, [{null, null}, {1, null}], null + // valid 1 1 0 0 0 1 0 + // ofset 0, 0, 1, 1, 1, 1, 3, 3 + // child {null, null}, {null, null}, {1, null} + cudf::io::json_reader_options in_options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{json_stringl.data(), json_stringl.size()}) + .prune_columns(true) + .experimental(true) + .lines(true); + + // struct>> eg. {"c2": [{"b": "1", "c": "2"}]} + cudf::io::schema_element dtype_schema{data_type{cudf::type_id::STRUCT}, + { + {"c2", + {data_type{cudf::type_id::LIST}, + {{"element", + {data_type{cudf::type_id::STRUCT}, + { + {"b", {data_type{cudf::type_id::STRING}}}, + {"c", {data_type{cudf::type_id::STRING}}}, + }, + {{"b", "c"}}}}}}}, + }, + {{"c2"}}}; + in_options.set_dtypes(dtype_schema); + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + ASSERT_EQ(result.tbl->num_columns(), 1); + ASSERT_EQ(result.metadata.schema_info.size(), 1); + + // Expected: A list of struct of 2-string columns + // [], [{null, null}], null, null, null, [{null, null}, {1, null}], null + auto get_structs = [] { + strings_wrapper child0{{"", "", "1"}, nulls_at({0, 0, 1})}; + strings_wrapper child1{{"", "", ""}, all_nulls()}; + // purge non-empty nulls in list seems to retain nullmask in struct child column + return cudf::test::structs_column_wrapper{{child0, child1}, no_nulls()}.release(); + }; + std::vector const list_nulls{1, 1, 0, 0, 0, 1, 0}; + auto [null_mask, null_count] = + cudf::test::detail::make_null_mask(list_nulls.cbegin(), list_nulls.cend()); + auto const expected = cudf::make_lists_column( + 7, + cudf::test::fixed_width_column_wrapper{0, 0, 1, 1, 1, 1, 3, 3}.release(), + get_structs(), + null_count, + std::move(null_mask)); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected, result.tbl->get_column(0).view()); } struct JsonCompressedIOTest : public cudf::test::BaseFixture, From 80fc629aab1cc459b9ff8f0e9fee379a82219815 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Sat, 7 Dec 2024 01:41:33 -0600 Subject: [PATCH 38/78] Update cuda-python lower bounds to 12.6.2 / 11.8.5 (#17547) We require a newer cuda-python lower bound for new features and to use the new layout. This will fix a number of errors observed when the runtime version of cuda-python is older than the version used to build packages using Cython features from cuda-python. See https://github.com/rapidsai/build-planning/issues/117#issuecomment-2524250915 for details. 
Authors: - Bradley Dice (https://github.com/bdice) Approvers: - James Lamb (https://github.com/jameslamb) URL: https://github.com/rapidsai/cudf/pull/17547 --- conda/environments/all_cuda-118_arch-x86_64.yaml | 2 +- conda/environments/all_cuda-125_arch-x86_64.yaml | 2 +- conda/recipes/cudf/meta.yaml | 4 ++-- conda/recipes/pylibcudf/meta.yaml | 4 ++-- dependencies.yaml | 8 ++++---- python/cudf/pyproject.toml | 2 +- python/pylibcudf/pyproject.toml | 2 +- 7 files changed, 12 insertions(+), 12 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 87c40421be0..bad508154aa 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -19,7 +19,7 @@ dependencies: - cramjam - cubinlinker - cuda-nvtx=11.8 -- cuda-python>=11.7.1,<12.0a0 +- cuda-python>=11.8.5,<12.0a0 - cuda-sanitizer-api=11.8.86 - cuda-version=11.8 - cudatoolkit diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index 0935de96d19..969124a29ad 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -21,7 +21,7 @@ dependencies: - cuda-nvcc - cuda-nvrtc-dev - cuda-nvtx-dev -- cuda-python>=12.0,<13.0a0 +- cuda-python>=12.6.2,<13.0a0 - cuda-sanitizer-api - cuda-version=12.5 - cupy>=12.0.0 diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index e52b8c5f2a0..2c16deeed82 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -91,7 +91,7 @@ requirements: - cudatoolkit - ptxcompiler >=0.7.0 - cubinlinker # CUDA enhanced compatibility. - - cuda-python >=11.7.1,<12.0a0 + - cuda-python >=11.8.5,<12.0a0 {% else %} - cuda-cudart - libcufile # [linux64] @@ -100,7 +100,7 @@ requirements: # TODO: Add nvjitlink here # xref: https://github.com/rapidsai/cudf/issues/12822 - cuda-nvrtc - - cuda-python >=12.0,<13.0a0 + - cuda-python >=12.6.2,<13.0a0 - pynvjitlink {% endif %} - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} diff --git a/conda/recipes/pylibcudf/meta.yaml b/conda/recipes/pylibcudf/meta.yaml index 3d965f30986..08eab363af0 100644 --- a/conda/recipes/pylibcudf/meta.yaml +++ b/conda/recipes/pylibcudf/meta.yaml @@ -83,9 +83,9 @@ requirements: - {{ pin_compatible('rmm', max_pin='x.x') }} - fsspec >=0.6.0 {% if cuda_major == "11" %} - - cuda-python >=11.7.1,<12.0a0 + - cuda-python >=11.8.5,<12.0a0 {% else %} - - cuda-python >=12.0,<13.0a0 + - cuda-python >=12.6.2,<13.0a0 {% endif %} - nvtx >=0.2.1 - packaging diff --git a/dependencies.yaml b/dependencies.yaml index 044c7d187b3..3c55ce2c614 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -679,10 +679,10 @@ dependencies: matrices: - matrix: {cuda: "12.*"} packages: - - cuda-python>=12.0,<13.0a0 + - cuda-python>=12.6.2,<13.0a0 - matrix: {cuda: "11.*"} packages: &run_pylibcudf_packages_all_cu11 - - cuda-python>=11.7.1,<12.0a0 + - cuda-python>=11.8.5,<12.0a0 - {matrix: null, packages: *run_pylibcudf_packages_all_cu11} run_cudf: common: @@ -705,10 +705,10 @@ dependencies: matrices: - matrix: {cuda: "12.*"} packages: - - cuda-python>=12.0,<13.0a0 + - cuda-python>=12.6.2,<13.0a0 - matrix: {cuda: "11.*"} packages: &run_cudf_packages_all_cu11 - - cuda-python>=11.7.1,<12.0a0 + - cuda-python>=11.8.5,<12.0a0 - {matrix: null, packages: *run_cudf_packages_all_cu11} - output_types: conda matrices: diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 
80de9056a0a..21c18ef0174 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -20,7 +20,7 @@ requires-python = ">=3.10" dependencies = [ "cachetools", "cubinlinker", - "cuda-python>=11.7.1,<12.0a0", + "cuda-python>=11.8.5,<12.0a0", "cupy-cuda11x>=12.0.0", "fsspec>=0.6.0", "libcudf==25.2.*,>=0.0.0a0", diff --git a/python/pylibcudf/pyproject.toml b/python/pylibcudf/pyproject.toml index a5e5704b8ed..53ee3e2b56e 100644 --- a/python/pylibcudf/pyproject.toml +++ b/python/pylibcudf/pyproject.toml @@ -18,7 +18,7 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.10" dependencies = [ - "cuda-python>=11.7.1,<12.0a0", + "cuda-python>=11.8.5,<12.0a0", "libcudf==25.2.*,>=0.0.0a0", "nvtx>=0.2.1", "packaging", From a0fc6a89a596ebae7df436be25aed70ec908f83e Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Mon, 9 Dec 2024 09:33:08 -0500 Subject: [PATCH 39/78] Use cooperative-groups instead of cub warp-reduce for strings contains (#17540) Replaces the `cub::WarpReduce` usage in `cudf::strings::contains` with cooperative-groups `any()`. The change is only for the `contains_warp_parallel` kernel which is used for wider strings. Using cooperative-groups generates more efficient code for the same results and gives an additional 11-14% performance improvement. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Yunsong Wang (https://github.com/PointKernel) - Nghia Truong (https://github.com/ttnghia) - Shruti Shivakumar (https://github.com/shrshi) URL: https://github.com/rapidsai/cudf/pull/17540 --- cpp/src/strings/search/find.cu | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/cpp/src/strings/search/find.cu b/cpp/src/strings/search/find.cu index 0f33fcb6fe1..94bc81ec933 100644 --- a/cpp/src/strings/search/find.cu +++ b/cpp/src/strings/search/find.cu @@ -32,6 +32,7 @@ #include #include +#include #include #include #include @@ -347,13 +348,15 @@ CUDF_KERNEL void contains_warp_parallel_fn(column_device_view const d_strings, string_view const d_target, bool* d_results) { - auto const idx = cudf::detail::grid_1d::global_thread_id(); - using warp_reduce = cub::WarpReduce; - __shared__ typename warp_reduce::TempStorage temp_storage; + auto const idx = cudf::detail::grid_1d::global_thread_id(); auto const str_idx = idx / cudf::detail::warp_size; if (str_idx >= d_strings.size()) { return; } - auto const lane_idx = idx % cudf::detail::warp_size; + + namespace cg = cooperative_groups; + auto const warp = cg::tiled_partition(cg::this_thread_block()); + auto const lane_idx = warp.thread_rank(); + if (d_strings.is_null(str_idx)) { return; } // get the string for this warp auto const d_str = d_strings.element(str_idx); @@ -373,7 +376,7 @@ CUDF_KERNEL void contains_warp_parallel_fn(column_device_view const d_strings, } } - auto const result = warp_reduce(temp_storage).Reduce(found, cub::Max()); + auto const result = warp.any(found); if (lane_idx == 0) { d_results[str_idx] = result; } } From 0f5d4b9514b92f69465f4d76b1f9db1c5a37f33a Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Mon, 9 Dec 2024 10:41:26 -0500 Subject: [PATCH 40/78] Remove unused IO utilities from cudf python (#17374) Removes unused IO utilities from cuDF Python. 
Depends on #17163 #16042 #17252 #17263 Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/17374 --- python/cudf/cudf/_lib/io/utils.pxd | 6 +-- python/cudf/cudf/_lib/io/utils.pyx | 87 ++---------------------------- 2 files changed, 5 insertions(+), 88 deletions(-) diff --git a/python/cudf/cudf/_lib/io/utils.pxd b/python/cudf/cudf/_lib/io/utils.pxd index 96504ebdd66..9b8bab012e2 100644 --- a/python/cudf/cudf/_lib/io/utils.pxd +++ b/python/cudf/cudf/_lib/io/utils.pxd @@ -13,9 +13,6 @@ from pylibcudf.libcudf.io.types cimport ( from cudf._lib.column cimport Column -cdef sink_info make_sinks_info( - list src, vector[unique_ptr[data_sink]] & data) except* -cdef sink_info make_sink_info(src, unique_ptr[data_sink] & data) except* cdef add_df_col_struct_names( df, child_names_dict @@ -26,7 +23,8 @@ cdef update_col_struct_field_names( ) cdef update_struct_field_names( table, - vector[column_name_info]& schema_info) + vector[column_name_info]& schema_info +) cdef Column update_column_struct_field_names( Column col, column_name_info& info diff --git a/python/cudf/cudf/_lib/io/utils.pyx b/python/cudf/cudf/_lib/io/utils.pyx index f23980b387a..df4675be599 100644 --- a/python/cudf/cudf/_lib/io/utils.pyx +++ b/python/cudf/cudf/_lib/io/utils.pyx @@ -1,97 +1,16 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. -from cpython.buffer cimport PyBUF_READ -from cpython.memoryview cimport PyMemoryView_FromMemory -from libcpp.memory cimport unique_ptr + from libcpp.string cimport string -from libcpp.utility cimport move + from libcpp.vector cimport vector -from pylibcudf.libcudf.io.data_sink cimport data_sink -from pylibcudf.libcudf.io.types cimport ( - column_name_info, - sink_info, -) +from pylibcudf.libcudf.io.types cimport column_name_info from cudf._lib.column cimport Column -import codecs -import io -import os - from cudf.core.dtypes import StructDtype -# Converts the Python sink input to libcudf IO sink_info. -cdef sink_info make_sinks_info( - list src, vector[unique_ptr[data_sink]] & sink -) except*: - cdef vector[data_sink *] data_sinks - cdef vector[string] paths - if isinstance(src[0], io.StringIO): - data_sinks.reserve(len(src)) - for s in src: - sink.push_back(unique_ptr[data_sink](new iobase_data_sink(s))) - data_sinks.push_back(sink.back().get()) - return sink_info(data_sinks) - elif isinstance(src[0], io.TextIOBase): - data_sinks.reserve(len(src)) - for s in src: - # Files opened in text mode expect writes to be str rather than - # bytes, which requires conversion from utf-8. If the underlying - # buffer is utf-8, we can bypass this conversion by writing - # directly to it. 
- if codecs.lookup(s.encoding).name not in {"utf-8", "ascii"}: - raise NotImplementedError(f"Unsupported encoding {s.encoding}") - sink.push_back( - unique_ptr[data_sink](new iobase_data_sink(s.buffer)) - ) - data_sinks.push_back(sink.back().get()) - return sink_info(data_sinks) - elif isinstance(src[0], io.IOBase): - data_sinks.reserve(len(src)) - for s in src: - sink.push_back(unique_ptr[data_sink](new iobase_data_sink(s))) - data_sinks.push_back(sink.back().get()) - return sink_info(data_sinks) - elif isinstance(src[0], (basestring, os.PathLike)): - paths.reserve(len(src)) - for s in src: - paths.push_back( os.path.expanduser(s).encode()) - return sink_info(move(paths)) - else: - raise TypeError("Unrecognized input type: {}".format(type(src))) - - -cdef sink_info make_sink_info(src, unique_ptr[data_sink] & sink) except*: - cdef vector[unique_ptr[data_sink]] datasinks - cdef sink_info info = make_sinks_info([src], datasinks) - if not datasinks.empty(): - sink.swap(datasinks[0]) - return info - - -# Adapts a python io.IOBase object as a libcudf IO data_sink. This lets you -# write from cudf to any python file-like object (File/BytesIO/SocketIO etc) -cdef cppclass iobase_data_sink(data_sink): - object buf - - iobase_data_sink(object buf_): - this.buf = buf_ - - void host_write(const void * data, size_t size) with gil: - if isinstance(buf, io.StringIO): - buf.write(PyMemoryView_FromMemory(data, size, PyBUF_READ) - .tobytes().decode()) - else: - buf.write(PyMemoryView_FromMemory(data, size, PyBUF_READ)) - - void flush() with gil: - buf.flush() - - size_t bytes_written() with gil: - return buf.tell() - - cdef add_df_col_struct_names(df, child_names_dict): for name, child_names in child_names_dict.items(): col = df._data[name] From ba3ed5773171a545d43d9e0f598c6c2eb37ec122 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Mon, 9 Dec 2024 10:08:13 -0800 Subject: [PATCH 41/78] Fix nvcc-imposed UB in `constexpr` functions (#17534) nvcc does not support `constexpr` functions that are not well-defined to call from the device. This is UB even when the function is not called from the device. Throwing an exception is one such operation. This PR cleans up error handling for functions that are called from device, and removes `constexpr` from the ones that are not actually used from the device, or in the constexpr context. Authors: - Vukasin Milovanovic (https://github.com/vuule) - MithunR (https://github.com/mythrocks) Approvers: - Karthikeyan (https://github.com/karthikeyann) - MithunR (https://github.com/mythrocks) - David Wendt (https://github.com/davidwendt) - Vyas Ramasubramani (https://github.com/vyasr) - Bradley Dice (https://github.com/bdice) - Mike Wilson (https://github.com/hyperbolic2346) - Yunsong Wang (https://github.com/PointKernel) - Muhammad Haseeb (https://github.com/mhaseeb123) URL: https://github.com/rapidsai/cudf/pull/17534 --- .../cudf/detail/utilities/device_operators.cuh | 18 +++++++++++++++++- cpp/include/cudf/utilities/span.hpp | 2 ++ cpp/src/io/orc/writer_impl.cu | 2 +- cpp/src/io/utilities/time_utils.cuh | 6 +++--- 4 files changed, 23 insertions(+), 5 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/device_operators.cuh b/cpp/include/cudf/detail/utilities/device_operators.cuh index 46f424e051b..d16be5e22dd 100644 --- a/cpp/include/cudf/detail/utilities/device_operators.cuh +++ b/cpp/include/cudf/detail/utilities/device_operators.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -83,7 +83,11 @@ struct DeviceSum { template ()>* = nullptr> static constexpr T identity() { +#ifndef __CUDA_ARCH__ CUDF_FAIL("fixed_point does not yet support device operator identity"); +#else + CUDF_UNREACHABLE("fixed_point does not yet support device operator identity"); +#endif return T{}; } }; @@ -141,7 +145,11 @@ struct DeviceMin { template ()>* = nullptr> static constexpr T identity() { +#ifndef __CUDA_ARCH__ CUDF_FAIL("fixed_point does not yet support DeviceMin identity"); +#else + CUDF_UNREACHABLE("fixed_point does not yet support DeviceMin identity"); +#endif return cuda::std::numeric_limits::max(); } @@ -189,7 +197,11 @@ struct DeviceMax { template ()>* = nullptr> static constexpr T identity() { +#ifndef __CUDA_ARCH__ CUDF_FAIL("fixed_point does not yet support DeviceMax identity"); +#else + CUDF_UNREACHABLE("fixed_point does not yet support DeviceMax identity"); +#endif return cuda::std::numeric_limits::lowest(); } @@ -225,7 +237,11 @@ struct DeviceProduct { template ()>* = nullptr> static constexpr T identity() { +#ifndef __CUDA_ARCH__ CUDF_FAIL("fixed_point does not yet support DeviceProduct identity"); +#else + CUDF_UNREACHABLE("fixed_point does not yet support DeviceProduct identity"); +#endif return T{1, numeric::scale_type{0}}; } }; diff --git a/cpp/include/cudf/utilities/span.hpp b/cpp/include/cudf/utilities/span.hpp index 21ee4fa9e9b..2273a89892b 100644 --- a/cpp/include/cudf/utilities/span.hpp +++ b/cpp/include/cudf/utilities/span.hpp @@ -417,7 +417,9 @@ class base_2dspan { constexpr base_2dspan(RowType flat_view, size_t columns) : _flat{flat_view}, _size{columns == 0 ? 0 : flat_view.size() / columns, columns} { +#ifndef __CUDA_ARCH__ CUDF_EXPECTS(_size.first * _size.second == flat_view.size(), "Invalid 2D span size"); +#endif } /** diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index d432deb8e79..76e5369ffd0 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -506,7 +506,7 @@ size_t max_varint_size() return cudf::util::div_rounding_up_unsafe(sizeof(T) * 8, 7); } -constexpr size_t RLE_stream_size(TypeKind kind, size_t count) +size_t RLE_stream_size(TypeKind kind, size_t count) { using cudf::util::div_rounding_up_unsafe; constexpr auto byte_rle_max_len = 128; diff --git a/cpp/src/io/utilities/time_utils.cuh b/cpp/src/io/utilities/time_utils.cuh index 687766c1bcc..ff1b9f58e6c 100644 --- a/cpp/src/io/utilities/time_utils.cuh +++ b/cpp/src/io/utilities/time_utils.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -32,7 +32,7 @@ static const __device__ __constant__ int32_t powers_of_ten[10] = { struct get_period { template - constexpr int32_t operator()() + int32_t operator()() { if constexpr (is_chrono()) { return T::period::den; } CUDF_FAIL("Invalid, non chrono type"); @@ -42,7 +42,7 @@ struct get_period { /** * @brief Function that translates cuDF time unit to clock frequency */ -constexpr int32_t to_clockrate(type_id timestamp_type_id) +inline int32_t to_clockrate(type_id timestamp_type_id) { return timestamp_type_id == type_id::EMPTY ? 
0

From ed2892c8a4f00ad376e7b020d09371902fbf6b68 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Mon, 9 Dec 2024 13:47:22 -0500
Subject: [PATCH 42/78] Document undefined behavior in div_rounding_up_safe
 (#17542)

Adds more description to the `div_rounding_up_safe` utility, identifying its
undefined behavior.
Closes #17539

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Paul Mattione (https://github.com/pmattione-nvidia)
  - Lawrence Mitchell (https://github.com/wence-)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/17542
---
 .../cudf/detail/utilities/integer_utils.hpp   | 20 +++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/cpp/include/cudf/detail/utilities/integer_utils.hpp b/cpp/include/cudf/detail/utilities/integer_utils.hpp
index 8b709f2a8f8..957b6b70fe2 100644
--- a/cpp/include/cudf/detail/utilities/integer_utils.hpp
+++ b/cpp/include/cudf/detail/utilities/integer_utils.hpp
@@ -1,7 +1,7 @@
 /*
  * Copyright 2019 BlazingDB, Inc.
  *     Copyright 2019 Eyal Rozenberg
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -134,16 +134,20 @@ constexpr I div_rounding_up_safe(std::integral_constant, I dividend,
 } // namespace detail

 /**
- * Divides the left-hand-side by the right-hand-side, rounding up
+ * @brief Divides the left-hand-side by the right-hand-side, rounding up
  * to an integral multiple of the right-hand-side, e.g. (9,5) -> 2 , (10,5) -> 2, (11,5) -> 3.
 *
- * @param dividend the number to divide
- * @param divisor the number of by which to divide
- * @return The least integer multiple of {@link divisor} which is greater than or equal to
- * the non-integral division dividend/divisor.
+ * The result is undefined if `divisor == 0` or
+ * if `divisor == -1` and `dividend == min()`.
+ *
+ * Will not overflow, and may _or may not_ be slower than the intuitive
+ * approach of using `(dividend + divisor - 1) / divisor`.
 *
- * @note will not overflow, and may _or may not_ be slower than the intuitive
- * approach of using (dividend + divisor - 1) / divisor
+ * @tparam I Integer type for `dividend`, `divisor`, and the return type
+ * @param dividend The number to divide
+ * @param divisor The number by which to divide
+ * @return The least integer multiple of `divisor` which is greater than or equal to
+ * the non-integral division `dividend/divisor`
 */
 template
 constexpr I div_rounding_up_safe(I dividend, I divisor) noexcept

From a79077cf67ff2154c2e0cd8b40891a8ec6d1712c Mon Sep 17 00:00:00 2001
From: Alessandro Bellina
Date: Mon, 9 Dec 2024 13:06:53 -0600
Subject: [PATCH 43/78] [JNI] Enables fabric handles for CUDA async memory pools (#17526)

This PR adds a `CUDA_ASYNC_FABRIC` allocation mode to `RmmAllocationMode` and pipes the corresponding options through to RMM's `cuda_async_memory_resource`, requesting `fabric` as the handle type and `read_write` as the memory protection mode (that is the only protection mode supported by the pools, and it is required for IPC). If `CUDA_ASYNC` is used, fabric handles are not requested and the memory protection is `none`.
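A minimal usage sketch (not part of this PR's diff) of how a JVM client might opt into the new mode. The API names below are taken from the diff; the pool sizes and the wrapper class are illustrative assumptions only:

```java
import ai.rapids.cudf.Rmm;
import ai.rapids.cudf.RmmAllocationMode;
import ai.rapids.cudf.RmmCudaAsyncMemoryResource;
import ai.rapids.cudf.RmmDeviceMemoryResource;

public class FabricPoolExample {
  public static void main(String[] args) {
    // Illustrative pool size only; pick sizes appropriate for your device.
    long poolSize = 1024L * 1024L * 1024L; // 1 GiB

    // Option 1: one-shot initialization with the new mode; a null LogConf
    // disables RMM logging (per the initialize() javadoc). Normally you
    // would use only one of the two options shown here.
    Rmm.initialize(RmmAllocationMode.CUDA_ASYNC_FABRIC, null, poolSize);

    // Option 2: construct the resource directly, requesting peer
    // read+write accessible fabric handles for the pool via the new
    // three-argument constructor added by this change.
    RmmDeviceMemoryResource mr =
        new RmmCudaAsyncMemoryResource(poolSize, poolSize, /*fabric=*/ true);
  }
}
```

Note that, per the diff, `initialize` additionally wraps the fabric pool in an `RmmLimitingResourceAdaptor`, while direct construction of `RmmCudaAsyncMemoryResource` does not.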
Authors: - Alessandro Bellina (https://github.com/abellina) Approvers: - Nghia Truong (https://github.com/ttnghia) - Jason Lowe (https://github.com/jlowe) URL: https://github.com/rapidsai/cudf/pull/17526 --- java/src/main/java/ai/rapids/cudf/Rmm.java | 11 ++++++---- .../ai/rapids/cudf/RmmAllocationMode.java | 7 ++++++- .../cudf/RmmCudaAsyncMemoryResource.java | 15 ++++++++++++-- java/src/main/native/src/RmmJni.cpp | 20 ++++++++++++++----- 4 files changed, 41 insertions(+), 12 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/Rmm.java b/java/src/main/java/ai/rapids/cudf/Rmm.java index ed029c918e4..d1cc0cc96fe 100755 --- a/java/src/main/java/ai/rapids/cudf/Rmm.java +++ b/java/src/main/java/ai/rapids/cudf/Rmm.java @@ -206,7 +206,8 @@ private static void setGlobalValsFromResource(RmmDeviceMemoryResource resource) * {@link RmmAllocationMode#CUDA_DEFAULT}, * {@link RmmAllocationMode#POOL}, * {@link RmmAllocationMode#ARENA}, - * {@link RmmAllocationMode#CUDA_ASYNC} and + * {@link RmmAllocationMode#CUDA_ASYNC}, + * {@link RmmAllocationMode#CUDA_ASYNC_FABRIC} and * {@link RmmAllocationMode#CUDA_MANAGED_MEMORY} * @param logConf How to do logging or null if you don't want to * @param poolSize The initial pool size in bytes @@ -221,6 +222,7 @@ public static synchronized void initialize(int allocationMode, LogConf logConf, boolean isPool = (allocationMode & RmmAllocationMode.POOL) != 0; boolean isArena = (allocationMode & RmmAllocationMode.ARENA) != 0; boolean isAsync = (allocationMode & RmmAllocationMode.CUDA_ASYNC) != 0; + boolean isAsyncFabric = (allocationMode & RmmAllocationMode.CUDA_ASYNC_FABRIC) != 0; boolean isManaged = (allocationMode & RmmAllocationMode.CUDA_MANAGED_MEMORY) != 0; if (isAsync && isManaged) { @@ -246,6 +248,9 @@ public static synchronized void initialize(int allocationMode, LogConf logConf, } else if (isAsync) { resource = new RmmLimitingResourceAdaptor<>( new RmmCudaAsyncMemoryResource(poolSize, poolSize), poolSize, 512); + } else if (isAsyncFabric) { + resource = new RmmLimitingResourceAdaptor<>( + new RmmCudaAsyncMemoryResource(poolSize, poolSize, true), poolSize, 512); } else if (isManaged) { resource = new RmmManagedMemoryResource(); } else { @@ -521,7 +526,6 @@ public static DeviceMemoryBuffer alloc(long size, Cuda.Stream stream) { private static native long allocInternal(long size, long stream) throws RmmException; - static native void free(long ptr, long length, long stream) throws RmmException; /** @@ -562,7 +566,7 @@ static native long newArenaMemoryResource(long childHandle, static native void releaseArenaMemoryResource(long handle); - static native long newCudaAsyncMemoryResource(long size, long release) throws RmmException; + static native long newCudaAsyncMemoryResource(long size, long release, boolean fabric) throws RmmException; static native void releaseCudaAsyncMemoryResource(long handle); @@ -575,7 +579,6 @@ static native long newLoggingResourceAdaptor(long handle, int type, String path, static native void releaseLoggingResourceAdaptor(long handle); - static native long newTrackingResourceAdaptor(long handle, long alignment) throws RmmException; static native void releaseTrackingResourceAdaptor(long handle); diff --git a/java/src/main/java/ai/rapids/cudf/RmmAllocationMode.java b/java/src/main/java/ai/rapids/cudf/RmmAllocationMode.java index 966c21bee22..3f7bc1fae76 100644 --- a/java/src/main/java/ai/rapids/cudf/RmmAllocationMode.java +++ b/java/src/main/java/ai/rapids/cudf/RmmAllocationMode.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA 
CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -36,4 +36,9 @@ public class RmmAllocationMode { * Use CUDA async suballocation strategy */ public static final int CUDA_ASYNC = 0x00000008; + /** + * Use CUDA async suballocation strategy with fabric handles that are + * peer accessible with read-write access + */ + public static final int CUDA_ASYNC_FABRIC = 0x00000010; } diff --git a/java/src/main/java/ai/rapids/cudf/RmmCudaAsyncMemoryResource.java b/java/src/main/java/ai/rapids/cudf/RmmCudaAsyncMemoryResource.java index fa1f13cb7ed..cf4936e2e24 100644 --- a/java/src/main/java/ai/rapids/cudf/RmmCudaAsyncMemoryResource.java +++ b/java/src/main/java/ai/rapids/cudf/RmmCudaAsyncMemoryResource.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -30,9 +30,20 @@ public class RmmCudaAsyncMemoryResource implements RmmDeviceMemoryResource { * @param releaseThreshold size in bytes for when memory is released back to cuda */ public RmmCudaAsyncMemoryResource(long size, long releaseThreshold) { + this(size, releaseThreshold, false); + } + + /** + * Create a new async memory resource + * @param size the initial size of the pool + * @param releaseThreshold size in bytes for when memory is released back to cuda + * @param fabric if true request peer read+write accessible fabric handles when + * creating the pool + */ + public RmmCudaAsyncMemoryResource(long size, long releaseThreshold, boolean fabric) { this.size = size; this.releaseThreshold = releaseThreshold; - handle = Rmm.newCudaAsyncMemoryResource(size, releaseThreshold); + handle = Rmm.newCudaAsyncMemoryResource(size, releaseThreshold, fabric); } @Override diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp index 23c7b7fb243..0f424761bfe 100644 --- a/java/src/main/native/src/RmmJni.cpp +++ b/java/src/main/native/src/RmmJni.cpp @@ -772,14 +772,24 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_releaseArenaMemoryResource(JNIEnv CATCH_STD(env, ) } -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_newCudaAsyncMemoryResource(JNIEnv* env, - jclass clazz, - jlong init, - jlong release) +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_newCudaAsyncMemoryResource( + JNIEnv* env, jclass clazz, jlong init, jlong release, jboolean fabric) { try { cudf::jni::auto_set_device(env); - auto ret = new rmm::mr::cuda_async_memory_resource(init, release); + + // When we are using fabric, we need to set the memory access to be + // read_write, in order for peer GPUs to have access to this memory. + // Otherwise, choose default parameters (optional set to nullopt). + auto [handle_type, prot_flag] = + fabric + ? 
std::pair{std::optional{ + rmm::mr::cuda_async_memory_resource::allocation_handle_type::fabric}, + std::optional{rmm::mr::cuda_async_memory_resource::access_flags::read_write}} + : std::pair{std::nullopt, std::nullopt}; + + auto ret = new rmm::mr::cuda_async_memory_resource(init, release, handle_type, prot_flag); + return reinterpret_cast(ret); } CATCH_STD(env, 0) From f5955929b06e2a4609b9fca0e3f949afb9b1dadd Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 9 Dec 2024 14:22:04 -0800 Subject: [PATCH 44/78] Remove cudf._lib.string.convert/split in favor of inlining pylibcudf (#17496) Contributes to https://github.com/rapidsai/cudf/issues/17317 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/17496 --- python/cudf/cudf/_lib/CMakeLists.txt | 1 - python/cudf/cudf/_lib/__init__.py | 1 - python/cudf/cudf/_lib/strings/CMakeLists.txt | 15 -- python/cudf/cudf/_lib/strings/__init__.py | 15 -- .../cudf/_lib/strings/convert/CMakeLists.txt | 24 -- .../cudf/_lib/strings/convert/__init__.pxd | 0 .../cudf/_lib/strings/convert/__init__.py | 0 .../strings/convert/convert_fixed_point.pyx | 76 ------ .../_lib/strings/convert/convert_floats.pyx | 19 -- .../_lib/strings/convert/convert_integers.pyx | 20 -- .../_lib/strings/convert/convert_lists.pyx | 32 --- .../_lib/strings/convert/convert_urls.pyx | 48 ---- .../cudf/_lib/strings/split/CMakeLists.txt | 22 -- .../cudf/cudf/_lib/strings/split/__init__.pxd | 0 .../cudf/cudf/_lib/strings/split/__init__.py | 0 .../cudf/_lib/strings/split/partition.pyx | 35 --- python/cudf/cudf/_lib/strings/split/split.pyx | 155 ----------- python/cudf/cudf/core/column/decimal.py | 15 +- python/cudf/cudf/core/column/lists.py | 10 +- python/cudf/cudf/core/column/string.py | 246 +++++++++++++++--- python/cudf/cudf/core/tools/datetimes.py | 5 +- python/cudf/cudf/core/tools/numeric.py | 66 ++--- 22 files changed, 262 insertions(+), 543 deletions(-) delete mode 100644 python/cudf/cudf/_lib/strings/CMakeLists.txt delete mode 100644 python/cudf/cudf/_lib/strings/convert/CMakeLists.txt delete mode 100644 python/cudf/cudf/_lib/strings/convert/__init__.pxd delete mode 100644 python/cudf/cudf/_lib/strings/convert/__init__.py delete mode 100644 python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx delete mode 100644 python/cudf/cudf/_lib/strings/convert/convert_floats.pyx delete mode 100644 python/cudf/cudf/_lib/strings/convert/convert_integers.pyx delete mode 100644 python/cudf/cudf/_lib/strings/convert/convert_lists.pyx delete mode 100644 python/cudf/cudf/_lib/strings/convert/convert_urls.pyx delete mode 100644 python/cudf/cudf/_lib/strings/split/CMakeLists.txt delete mode 100644 python/cudf/cudf/_lib/strings/split/__init__.pxd delete mode 100644 python/cudf/cudf/_lib/strings/split/__init__.py delete mode 100644 python/cudf/cudf/_lib/strings/split/partition.pyx delete mode 100644 python/cudf/cudf/_lib/strings/split/split.pyx diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index e98cf283bbb..f9ac3a16940 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -46,4 +46,3 @@ target_link_libraries(interop PUBLIC nanoarrow) add_subdirectory(io) add_subdirectory(nvtext) -add_subdirectory(strings) diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py index 4758a933898..52e9b89da7b 100644 --- 
a/python/cudf/cudf/_lib/__init__.py +++ b/python/cudf/cudf/_lib/__init__.py @@ -12,7 +12,6 @@ sort, stream_compaction, string_casting, - strings, strings_udf, ) diff --git a/python/cudf/cudf/_lib/strings/CMakeLists.txt b/python/cudf/cudf/_lib/strings/CMakeLists.txt deleted file mode 100644 index dca9c4cc3fc..00000000000 --- a/python/cudf/cudf/_lib/strings/CMakeLists.txt +++ /dev/null @@ -1,15 +0,0 @@ -# ============================================================================= -# Copyright (c) 2022-2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. -# ============================================================================= -add_subdirectory(convert) -add_subdirectory(split) diff --git a/python/cudf/cudf/_lib/strings/__init__.py b/python/cudf/cudf/_lib/strings/__init__.py index b795c54c112..341ba6d11c3 100644 --- a/python/cudf/cudf/_lib/strings/__init__.py +++ b/python/cudf/cudf/_lib/strings/__init__.py @@ -32,18 +32,3 @@ detokenize, tokenize_with_vocabulary, ) -from cudf._lib.strings.convert.convert_fixed_point import to_decimal -from cudf._lib.strings.convert.convert_floats import is_float -from cudf._lib.strings.convert.convert_integers import is_integer -from cudf._lib.strings.convert.convert_urls import url_decode, url_encode -from cudf._lib.strings.split.partition import partition, rpartition -from cudf._lib.strings.split.split import ( - rsplit, - rsplit_re, - rsplit_record, - rsplit_record_re, - split, - split_re, - split_record, - split_record_re, -) diff --git a/python/cudf/cudf/_lib/strings/convert/CMakeLists.txt b/python/cudf/cudf/_lib/strings/convert/CMakeLists.txt deleted file mode 100644 index e8a76b476a8..00000000000 --- a/python/cudf/cudf/_lib/strings/convert/CMakeLists.txt +++ /dev/null @@ -1,24 +0,0 @@ -# ============================================================================= -# Copyright (c) 2022-2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. 
-# ============================================================================= - -set(cython_sources convert_fixed_point.pyx convert_floats.pyx convert_integers.pyx - convert_lists.pyx convert_urls.pyx -) - -set(linked_libraries cudf::cudf) -rapids_cython_create_modules( - CXX - SOURCE_FILES "${cython_sources}" - LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX strings_ ASSOCIATED_TARGETS cudf -) diff --git a/python/cudf/cudf/_lib/strings/convert/__init__.pxd b/python/cudf/cudf/_lib/strings/convert/__init__.pxd deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/python/cudf/cudf/_lib/strings/convert/__init__.py b/python/cudf/cudf/_lib/strings/convert/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx b/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx deleted file mode 100644 index 96dcd021c3b..00000000000 --- a/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx +++ /dev/null @@ -1,76 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column -from cudf._lib.types cimport dtype_to_pylibcudf_type - -import pylibcudf as plc - - -@acquire_spill_lock() -def from_decimal(Column input_col): - """ - Converts a `Decimal64Column` to a `StringColumn`. - - Parameters - ---------- - input_col : input column of type decimal - - Returns - ------- - A column of strings representing the input decimal values. - """ - plc_column = plc.strings.convert.convert_fixed_point.from_fixed_point( - input_col.to_pylibcudf(mode="read"), - ) - return Column.from_pylibcudf(plc_column) - - -@acquire_spill_lock() -def to_decimal(Column input_col, object out_type): - """ - Returns a `Decimal64Column` from the provided `StringColumn` - using the scale in the `out_type`. - - Parameters - ---------- - input_col : input column of type string - out_type : The type and scale of the decimal column expected - - Returns - ------- - A column of decimals parsed from the string values. - """ - plc_column = plc.strings.convert.convert_fixed_point.to_fixed_point( - input_col.to_pylibcudf(mode="read"), - dtype_to_pylibcudf_type(out_type), - ) - result = Column.from_pylibcudf(plc_column) - result.dtype.precision = out_type.precision - return result - - -@acquire_spill_lock() -def is_fixed_point(Column input_col, object dtype): - """ - Returns a Column of boolean values with True for `input_col` - that have fixed-point characters. The output row also has a - False value if the corresponding string would cause an integer - overflow. The scale of the `dtype` is used to determine overflow - in the output row. - - Parameters - ---------- - input_col : input column of type string - dtype : The type and scale of a decimal column - - Returns - ------- - A Column of booleans indicating valid decimal conversion. - """ - plc_column = plc.strings.convert.convert_fixed_point.is_fixed_point( - input_col.to_pylibcudf(mode="read"), - dtype_to_pylibcudf_type(dtype), - ) - return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx b/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx deleted file mode 100644 index 5da6e3f10cc..00000000000 --- a/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. 
- -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - -import pylibcudf as plc - - -@acquire_spill_lock() -def is_float(Column source_strings): - """ - Returns a Column of boolean values with True for `source_strings` - that have floats. - """ - plc_column = plc.strings.convert.convert_floats.is_float( - source_strings.to_pylibcudf(mode="read") - ) - return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/strings/convert/convert_integers.pyx b/python/cudf/cudf/_lib/strings/convert/convert_integers.pyx deleted file mode 100644 index 50113347ccb..00000000000 --- a/python/cudf/cudf/_lib/strings/convert/convert_integers.pyx +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. - -from cudf.core.buffer import acquire_spill_lock - -import pylibcudf as plc - -from cudf._lib.column cimport Column - - -@acquire_spill_lock() -def is_integer(Column source_strings): - """ - Returns a Column of boolean values with True for `source_strings` - that have integers. - """ - return Column.from_pylibcudf( - plc.strings.convert.convert_integers.is_integer( - source_strings.to_pylibcudf(mode="read") - ) - ) diff --git a/python/cudf/cudf/_lib/strings/convert/convert_lists.pyx b/python/cudf/cudf/_lib/strings/convert/convert_lists.pyx deleted file mode 100644 index 3a2cb4bd5c7..00000000000 --- a/python/cudf/cudf/_lib/strings/convert/convert_lists.pyx +++ /dev/null @@ -1,32 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. - -import pylibcudf as plc - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - -from cudf._lib.scalar import as_device_scalar - - -@acquire_spill_lock() -def format_list_column(Column source_list, Column separators): - """ - Format a list column of strings into a strings column. - - Parameters - ---------- - input_col : input column of type list with strings child. - - separators: strings used for formatting (', ', '[', ']') - - Returns - ------- - Formatted strings column - """ - plc_column = plc.strings.convert.convert_lists.format_list_column( - source_list.to_pylibcudf(mode="read"), - as_device_scalar("None").c_value, - separators.to_pylibcudf(mode="read"), - ) - return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/strings/convert/convert_urls.pyx b/python/cudf/cudf/_lib/strings/convert/convert_urls.pyx deleted file mode 100644 index d5c2f771970..00000000000 --- a/python/cudf/cudf/_lib/strings/convert/convert_urls.pyx +++ /dev/null @@ -1,48 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -import pylibcudf as plc - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - - -@acquire_spill_lock() -def url_decode(Column source_strings): - """ - Decode each string in column. No format checking is performed. - - Parameters - ---------- - input_col : input column of type string - - Returns - ------- - URL decoded string column - """ - plc_column = plc.strings.convert.convert_urls.url_decode( - source_strings.to_pylibcudf(mode="read") - ) - return Column.from_pylibcudf(plc_column) - - -@acquire_spill_lock() -def url_encode(Column source_strings): - """ - Encode each string in column. No format checking is performed. - All characters are encoded except for ASCII letters, digits, - and these characters: '.','_','-','~'. Encoding converts to - hex using UTF-8 encoded bytes. 
- - Parameters - ---------- - input_col : input column of type string - - Returns - ------- - URL encoded string column - """ - plc_column = plc.strings.convert.convert_urls.url_encode( - source_strings.to_pylibcudf(mode="read") - ) - return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/strings/split/CMakeLists.txt b/python/cudf/cudf/_lib/strings/split/CMakeLists.txt deleted file mode 100644 index 4ede0a2fac5..00000000000 --- a/python/cudf/cudf/_lib/strings/split/CMakeLists.txt +++ /dev/null @@ -1,22 +0,0 @@ -# ============================================================================= -# Copyright (c) 2022-2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. -# ============================================================================= - -set(cython_sources partition.pyx split.pyx) - -set(linked_libraries cudf::cudf) -rapids_cython_create_modules( - CXX - SOURCE_FILES "${cython_sources}" - LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX strings_ ASSOCIATED_TARGETS cudf -) diff --git a/python/cudf/cudf/_lib/strings/split/__init__.pxd b/python/cudf/cudf/_lib/strings/split/__init__.pxd deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/python/cudf/cudf/_lib/strings/split/__init__.py b/python/cudf/cudf/_lib/strings/split/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/python/cudf/cudf/_lib/strings/split/partition.pyx b/python/cudf/cudf/_lib/strings/split/partition.pyx deleted file mode 100644 index 5319addc41c..00000000000 --- a/python/cudf/cudf/_lib/strings/split/partition.pyx +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - -import pylibcudf as plc - - -@acquire_spill_lock() -def partition(Column source_strings, - object py_delimiter): - """ - Returns data by splitting the `source_strings` - column at the first occurrence of the specified `py_delimiter`. - """ - plc_table = plc.strings.split.partition.partition( - source_strings.to_pylibcudf(mode="read"), - py_delimiter.device_value.c_value - ) - return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns())) - - -@acquire_spill_lock() -def rpartition(Column source_strings, - object py_delimiter): - """ - Returns a Column by splitting the `source_strings` - column at the last occurrence of the specified `py_delimiter`. - """ - plc_table = plc.strings.split.partition.rpartition( - source_strings.to_pylibcudf(mode="read"), - py_delimiter.device_value.c_value - ) - return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns())) diff --git a/python/cudf/cudf/_lib/strings/split/split.pyx b/python/cudf/cudf/_lib/strings/split/split.pyx deleted file mode 100644 index 4ec6c7073d8..00000000000 --- a/python/cudf/cudf/_lib/strings/split/split.pyx +++ /dev/null @@ -1,155 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- -from cudf.core.buffer import acquire_spill_lock - -from pylibcudf.libcudf.types cimport size_type - -from cudf._lib.column cimport Column - -import pylibcudf as plc - - -@acquire_spill_lock() -def split(Column source_strings, - object py_delimiter, - size_type maxsplit): - """ - Returns data by splitting the `source_strings` - column around the specified `py_delimiter`. - The split happens from beginning. - """ - plc_table = plc.strings.split.split.split( - source_strings.to_pylibcudf(mode="read"), - py_delimiter.device_value.c_value, - maxsplit, - ) - return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns())) - - -@acquire_spill_lock() -def split_record(Column source_strings, - object py_delimiter, - size_type maxsplit): - """ - Returns a Column by splitting the `source_strings` - column around the specified `py_delimiter`. - The split happens from beginning. - """ - plc_column = plc.strings.split.split.split_record( - source_strings.to_pylibcudf(mode="read"), - py_delimiter.device_value.c_value, - maxsplit, - ) - return Column.from_pylibcudf(plc_column) - - -@acquire_spill_lock() -def rsplit(Column source_strings, - object py_delimiter, - size_type maxsplit): - """ - Returns data by splitting the `source_strings` - column around the specified `py_delimiter`. - The split happens from the end. - """ - plc_table = plc.strings.split.split.rsplit( - source_strings.to_pylibcudf(mode="read"), - py_delimiter.device_value.c_value, - maxsplit, - ) - return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns())) - - -@acquire_spill_lock() -def rsplit_record(Column source_strings, - object py_delimiter, - size_type maxsplit): - """ - Returns a Column by splitting the `source_strings` - column around the specified `py_delimiter`. - The split happens from the end. - """ - plc_column = plc.strings.split.split.rsplit_record( - source_strings.to_pylibcudf(mode="read"), - py_delimiter.device_value.c_value, - maxsplit, - ) - return Column.from_pylibcudf(plc_column) - - -@acquire_spill_lock() -def split_re(Column source_strings, - object pattern, - size_type maxsplit): - """ - Returns data by splitting the `source_strings` - column around the delimiters identified by `pattern`. - """ - plc_table = plc.strings.split.split.split_re( - source_strings.to_pylibcudf(mode="read"), - plc.strings.regex_program.RegexProgram.create( - str(pattern), - plc.strings.regex_flags.RegexFlags.DEFAULT, - ), - maxsplit, - ) - return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns())) - - -@acquire_spill_lock() -def rsplit_re(Column source_strings, - object pattern, - size_type maxsplit): - """ - Returns data by splitting the `source_strings` - column around the delimiters identified by `pattern`. - The delimiters are searched starting from the end of each string. - """ - plc_table = plc.strings.split.split.rsplit_re( - source_strings.to_pylibcudf(mode="read"), - plc.strings.regex_program.RegexProgram.create( - str(pattern), - plc.strings.regex_flags.RegexFlags.DEFAULT, - ), - maxsplit, - ) - return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns())) - - -@acquire_spill_lock() -def split_record_re(Column source_strings, - object pattern, - size_type maxsplit): - """ - Returns a Column by splitting the `source_strings` - column around the delimiters identified by `pattern`. 
- """ - plc_column = plc.strings.split.split.split_record_re( - source_strings.to_pylibcudf(mode="read"), - plc.strings.regex_program.RegexProgram.create( - str(pattern), - plc.strings.regex_flags.RegexFlags.DEFAULT, - ), - maxsplit, - ) - return Column.from_pylibcudf(plc_column) - - -@acquire_spill_lock() -def rsplit_record_re(Column source_strings, - object pattern, - size_type maxsplit): - """ - Returns a Column by splitting the `source_strings` - column around the delimiters identified by `pattern`. - The delimiters are searched starting from the end of each string. - """ - plc_column = plc.strings.split.split.rsplit_record_re( - source_strings.to_pylibcudf(mode="read"), - plc.strings.regex_program.RegexProgram.create( - str(pattern), - plc.strings.regex_flags.RegexFlags.DEFAULT, - ), - maxsplit, - ) - return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index 2c22724d3d7..9e6a73f1a9c 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -10,13 +10,12 @@ import numpy as np import pyarrow as pa +import pylibcudf as plc + import cudf -from cudf._lib.strings.convert.convert_fixed_point import ( - from_decimal as cpp_from_decimal, -) from cudf.api.types import is_scalar from cudf.core._internals import binaryop, unary -from cudf.core.buffer import as_buffer +from cudf.core.buffer import acquire_spill_lock, as_buffer from cudf.core.column.column import ColumnBase from cudf.core.column.numerical_base import NumericalBaseColumn from cudf.core.dtypes import ( @@ -89,7 +88,13 @@ def as_decimal_column( def as_string_column(self) -> cudf.core.column.StringColumn: if len(self) > 0: - return cpp_from_decimal(self) + with acquire_spill_lock(): + plc_column = ( + plc.strings.convert.convert_fixed_point.from_fixed_point( + self.to_pylibcudf(mode="read"), + ) + ) + return type(self).from_pylibcudf(plc_column) # type: ignore[return-value] else: return cast( cudf.core.column.StringColumn, diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index ea384888388..b95fb0a0d39 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -14,7 +14,6 @@ import cudf import cudf.core.column.column as column -from cudf._lib.strings.convert.convert_lists import format_list_column from cudf._lib.types import size_type_dtype from cudf.api.types import _is_non_decimal_numeric_dtype, is_scalar from cudf.core.buffer import acquire_spill_lock @@ -272,8 +271,13 @@ def as_string_column(self) -> cudf.core.column.StringColumn: # Separator strings to match the Python format separators = as_column([", ", "[", "]"]) - # Call libcudf to format the list column - return format_list_column(lc, separators) + with acquire_spill_lock(): + plc_column = plc.strings.convert.convert_lists.format_list_column( + lc.to_pylibcudf(mode="read"), + cudf.Scalar("None").device_value.c_value, + separators.to_pylibcudf(mode="read"), + ) + return type(self).from_pylibcudf(plc_column) # type: ignore[return-value] def _transform_leaves(self, func, *args, **kwargs) -> Self: # return a new list column with the same nested structure diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 6b45828568c..4a2483a80e3 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -19,6 +19,7 @@ import cudf.api.types import cudf.core.column.column as column import 
cudf.core.column.datetime as datetime +from cudf import _lib as libcudf from cudf._lib import string_casting as str_cast, strings as libstrings from cudf._lib.column import Column from cudf._lib.types import size_type_dtype @@ -44,6 +45,7 @@ SeriesOrIndex, ) from cudf.core.buffer import Buffer + from cudf.core.column.numerical import NumericalColumn def str_to_boolean(column: StringColumn): @@ -1336,7 +1338,7 @@ def isinteger(self) -> SeriesOrIndex: 2 False dtype: bool """ - return self._return_or_inplace(libstrings.is_integer(self._column)) + return self._return_or_inplace(self._column.is_integer()) def ishex(self) -> SeriesOrIndex: """ @@ -1468,7 +1470,7 @@ def isfloat(self) -> SeriesOrIndex: 3 False dtype: bool """ - return self._return_or_inplace(libstrings.is_float(self._column)) + return self._return_or_inplace(self._column.is_float()) def isdecimal(self) -> SeriesOrIndex: """ @@ -2710,26 +2712,25 @@ def split( if len(str(pat)) <= 1: regex = False + result_table: StringColumn | dict[int, StringColumn] if expand: if self._column.null_count == len(self._column): result_table = {0: self._column.copy()} else: if regex is True: - data = libstrings.split_re(self._column, pat, n) + data = self._column.split_re(pat, n) else: - data = libstrings.split( - self._column, cudf.Scalar(pat, "str"), n - ) + data = self._column.split(cudf.Scalar(pat, "str"), n) if len(data) == 1 and data[0].null_count == len(self._column): result_table = {} else: result_table = data else: if regex is True: - result_table = libstrings.split_record_re(self._column, pat, n) + result_table = self._column.split_record_re(pat, n) else: - result_table = libstrings.split_record( - self._column, cudf.Scalar(pat, "str"), n + result_table = self._column.split_record( + cudf.Scalar(pat, "str"), n ) return self._return_or_inplace(result_table, expand=expand) @@ -2883,28 +2884,25 @@ def rsplit( if regex and isinstance(pat, re.Pattern): pat = pat.pattern + result_table: StringColumn | dict[int, StringColumn] if expand: if self._column.null_count == len(self._column): result_table = {0: self._column.copy()} else: if regex is True: - data = libstrings.rsplit_re(self._column, pat, n) + data = self._column.rsplit_re(pat, n) else: - data = libstrings.rsplit( - self._column, cudf.Scalar(pat, "str"), n - ) + data = self._column.rsplit(cudf.Scalar(pat, "str"), n) if len(data) == 1 and data[0].null_count == len(self._column): result_table = {} else: result_table = data else: if regex is True: - result_table = libstrings.rsplit_record_re( - self._column, pat, n - ) + result_table = self._column.rsplit_record_re(pat, n) else: - result_table = libstrings.rsplit_record( - self._column, cudf.Scalar(pat, "str"), n + result_table = self._column.rsplit_record( + cudf.Scalar(pat, "str"), n ) return self._return_or_inplace(result_table, expand=expand) @@ -2989,7 +2987,7 @@ def partition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex: sep = " " return self._return_or_inplace( - libstrings.partition(self._column, cudf.Scalar(sep, "str")), + self._column.partition(cudf.Scalar(sep, "str")), expand=expand, ) @@ -3054,7 +3052,7 @@ def rpartition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex: sep = " " return self._return_or_inplace( - libstrings.rpartition(self._column, cudf.Scalar(sep, "str")), + self._column.rpartition(cudf.Scalar(sep, "str")), expand=expand, ) @@ -4499,8 +4497,7 @@ def url_decode(self) -> SeriesOrIndex: 1 https://medium.com/rapids-ai dtype: object """ - - return 
self._return_or_inplace(libstrings.url_decode(self._column)) + return self._return_or_inplace(self._column.url_decode()) def url_encode(self) -> SeriesOrIndex: """ @@ -4531,7 +4528,7 @@ def url_encode(self) -> SeriesOrIndex: 1 https%3A%2F%2Fmedium.com%2Frapids-ai dtype: object """ - return self._return_or_inplace(libstrings.url_encode(self._column)) + return self._return_or_inplace(self._column.url_encode()) def code_points(self) -> SeriesOrIndex: """ @@ -6015,13 +6012,13 @@ def as_numerical_column( out_dtype = cudf.api.types.dtype(dtype) string_col = self if out_dtype.kind in {"i", "u"}: - if not libstrings.is_integer(string_col).all(): + if not string_col.is_integer().all(): raise ValueError( "Could not convert strings to integer " "type due to presence of non-integer values." ) elif out_dtype.kind == "f": - if not libstrings.is_float(string_col).all(): + if not string_col.is_float().all(): raise ValueError( "Could not convert strings to float " "type due to presence of non-floating values." @@ -6099,10 +6096,17 @@ def as_timedelta_column( ) -> cudf.core.column.TimeDeltaColumn: return self.strptime(dtype, "%D days %H:%M:%S") # type: ignore[return-value] + @acquire_spill_lock() def as_decimal_column( self, dtype: Dtype - ) -> "cudf.core.column.DecimalBaseColumn": - return libstrings.to_decimal(self, dtype) + ) -> cudf.core.column.DecimalBaseColumn: + plc_column = plc.strings.convert.convert_fixed_point.to_fixed_point( + self.to_pylibcudf(mode="read"), + libcudf.types.dtype_to_pylibcudf_type(dtype), + ) + result = Column.from_pylibcudf(plc_column) + result.dtype.precision = dtype.precision # type: ignore[union-attr] + return result # type: ignore[return-value] def as_string_column(self) -> StringColumn: return self @@ -6138,12 +6142,9 @@ def can_cast_safely(self, to_dtype: Dtype) -> bool: if self.dtype == to_dtype: return True - elif ( - to_dtype.kind in {"i", "u"} - and not libstrings.is_integer(self).all() - ): + elif to_dtype.kind in {"i", "u"} and not self.is_integer().all(): return False - elif to_dtype.kind == "f" and not libstrings.is_float(self).all(): + elif to_dtype.kind == "f" and not self.is_float().all(): return False else: return True @@ -6333,11 +6334,180 @@ def title(self) -> Self: def is_title(self) -> Self: return self._modify_characters(plc.strings.capitalize.is_title) + @acquire_spill_lock() def replace_multiple(self, pattern: Self, replacements: Self) -> Self: - with acquire_spill_lock(): - plc_result = plc.strings.replace.replace_multiple( - self.to_pylibcudf(mode="read"), - pattern.to_pylibcudf(mode="read"), - replacements.to_pylibcudf(mode="read"), + plc_result = plc.strings.replace.replace_multiple( + self.to_pylibcudf(mode="read"), + pattern.to_pylibcudf(mode="read"), + replacements.to_pylibcudf(mode="read"), + ) + return cast(Self, Column.from_pylibcudf(plc_result)) + + @acquire_spill_lock() + def _split_record_re( + self, + pattern: str, + maxsplit: int, + method: Callable[ + [plc.Column, plc.strings.regex_program.RegexProgram, int], + plc.Column, + ], + ) -> Self: + plc_column = method( + self.to_pylibcudf(mode="read"), + plc.strings.regex_program.RegexProgram.create( + pattern, + plc.strings.regex_flags.RegexFlags.DEFAULT, + ), + maxsplit, + ) + return cast(Self, Column.from_pylibcudf(plc_column)) + + def split_record_re(self, pattern: str, maxsplit: int) -> Self: + return self._split_record_re( + pattern, maxsplit, plc.strings.split.split.split_record_re + ) + + def rsplit_record_re(self, pattern: str, maxsplit: int) -> Self: + return self._split_record_re( + 
pattern, maxsplit, plc.strings.split.split.rsplit_record_re + ) + + @acquire_spill_lock() + def _split_re( + self, + pattern: str, + maxsplit: int, + method: Callable[ + [plc.Column, plc.strings.regex_program.RegexProgram, int], + plc.Table, + ], + ) -> dict[int, Self]: + plc_table = method( + self.to_pylibcudf(mode="read"), + plc.strings.regex_program.RegexProgram.create( + pattern, + plc.strings.regex_flags.RegexFlags.DEFAULT, + ), + maxsplit, + ) + return dict( + enumerate( + Column.from_pylibcudf(col) # type: ignore[misc] + for col in plc_table.columns() ) - return cast(Self, Column.from_pylibcudf(plc_result)) + ) + + def split_re(self, pattern: str, maxsplit: int) -> dict[int, Self]: + return self._split_re( + pattern, maxsplit, plc.strings.split.split.split_re + ) + + def rsplit_re(self, pattern: str, maxsplit: int) -> dict[int, Self]: + return self._split_re( + pattern, maxsplit, plc.strings.split.split.rsplit_re + ) + + @acquire_spill_lock() + def _split_record( + self, + delimiter: cudf.Scalar, + maxsplit: int, + method: Callable[[plc.Column, plc.Scalar, int], plc.Column], + ) -> Self: + plc_column = method( + self.to_pylibcudf(mode="read"), + delimiter.device_value.c_value, + maxsplit, + ) + return type(self).from_pylibcudf(plc_column) # type: ignore[return-value] + + def split_record(self, delimiter: cudf.Scalar, maxsplit: int) -> Self: + return self._split_record( + delimiter, maxsplit, plc.strings.split.split.split_record + ) + + def rsplit_record(self, delimiter: cudf.Scalar, maxsplit: int) -> Self: + return self._split_record( + delimiter, maxsplit, plc.strings.split.split.rsplit_record + ) + + @acquire_spill_lock() + def _split( + self, + delimiter: cudf.Scalar, + maxsplit: int, + method: Callable[[plc.Column, plc.Scalar, int], plc.Column], + ) -> dict[int, Self]: + plc_table = method( + self.to_pylibcudf(mode="read"), + delimiter.device_value.c_value, + maxsplit, + ) + return dict( + enumerate( + Column.from_pylibcudf(col) # type: ignore[misc] + for col in plc_table.columns() + ) + ) + + def split(self, delimiter: cudf.Scalar, maxsplit: int) -> dict[int, Self]: + return self._split(delimiter, maxsplit, plc.strings.split.split.split) + + def rsplit(self, delimiter: cudf.Scalar, maxsplit: int) -> dict[int, Self]: + return self._split(delimiter, maxsplit, plc.strings.split.split.rsplit) + + @acquire_spill_lock() + def _partition( + self, + delimiter: cudf.Scalar, + method: Callable[[plc.Column, plc.Scalar], plc.Column], + ) -> dict[int, Self]: + plc_table = method( + self.to_pylibcudf(mode="read"), + delimiter.device_value.c_value, + ) + return dict( + enumerate( + Column.from_pylibcudf(col) # type: ignore[misc] + for col in plc_table.columns() + ) + ) + + def partition(self, delimiter: cudf.Scalar) -> dict[int, Self]: + return self._partition( + delimiter, plc.strings.split.partition.partition + ) + + def rpartition(self, delimiter: cudf.Scalar) -> dict[int, Self]: + return self._partition( + delimiter, plc.strings.split.partition.rpartition + ) + + @acquire_spill_lock() + def url_decode(self) -> Self: + plc_column = plc.strings.convert.convert_urls.url_decode( + self.to_pylibcudf(mode="read") + ) + return type(self).from_pylibcudf(plc_column) # type: ignore[return-value] + + @acquire_spill_lock() + def url_encode(self) -> Self: + plc_column = plc.strings.convert.convert_urls.url_encode( + self.to_pylibcudf(mode="read") + ) + return type(self).from_pylibcudf(plc_column) # type: ignore[return-value] + + @acquire_spill_lock() + def is_integer(self) -> NumericalColumn: + 
plc_column = plc.strings.convert.convert_integers.is_integer( + self.to_pylibcudf(mode="read") + ) + return type(self).from_pylibcudf(plc_column) # type: ignore[return-value] + + @acquire_spill_lock() + def is_float(self) -> NumericalColumn: + plc_column = plc.strings.convert.convert_floats.is_float( + self.to_pylibcudf(mode="read") + ) + return type(self).from_pylibcudf(plc_column) # type: ignore[return-value] diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 80ee078917a..8be336021b1 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -15,9 +15,6 @@ import cudf from cudf import _lib as libcudf -from cudf._lib.strings.convert.convert_integers import ( - is_integer as cpp_is_integer, -) from cudf.api.types import is_integer, is_scalar from cudf.core import column from cudf.core.buffer import acquire_spill_lock @@ -232,7 +229,7 @@ def to_datetime( ) break elif arg_col.dtype.kind == "O": - if not cpp_is_integer(arg_col).all(): + if not arg_col.is_integer().all(): col = new_series._column.strptime( cudf.dtype("datetime64[ns]"), format=format ) diff --git a/python/cudf/cudf/core/tools/numeric.py b/python/cudf/cudf/core/tools/numeric.py index 91f23490031..40348461f8c 100644 --- a/python/cudf/cudf/core/tools/numeric.py +++ b/python/cudf/cudf/core/tools/numeric.py @@ -2,14 +2,13 @@ from __future__ import annotations import warnings -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Literal import numpy as np import pandas as pd import cudf from cudf import _lib as libcudf -from cudf._lib import strings as libstrings from cudf.api.types import _is_non_decimal_numeric_dtype, is_string_dtype from cudf.core._internals import unary from cudf.core.column import as_column @@ -18,10 +17,16 @@ from cudf.utils.dtypes import can_convert_to_column if TYPE_CHECKING: - from cudf.core.column import ColumnBase + from cudf.core.column.numerical import NumericalColumn + from cudf.core.column.string import StringColumn -def to_numeric(arg, errors="raise", downcast=None, dtype_backend=None): +def to_numeric( + arg, + errors: Literal["raise", "coerce", "ignore"] = "raise", + downcast: Literal["integer", "signed", "unsigned", "float", None] = None, + dtype_backend=None, +): """ Convert argument into numerical types. 
@@ -130,7 +135,9 @@ def to_numeric(arg, errors="raise", downcast=None, dtype_backend=None): else: try: col = _convert_str_col( - col._get_decategorized_column(), errors, downcast + col._get_decategorized_column(), # type: ignore[attr-defined] + errors, + downcast, ) except ValueError as e: if errors == "ignore": @@ -139,7 +146,7 @@ def to_numeric(arg, errors="raise", downcast=None, dtype_backend=None): raise e elif is_string_dtype(dtype): try: - col = _convert_str_col(col, errors, downcast) + col = _convert_str_col(col, errors, downcast) # type: ignore[arg-type] except ValueError as e: if errors == "ignore": return arg @@ -186,7 +193,11 @@ def to_numeric(arg, errors="raise", downcast=None, dtype_backend=None): return col.values -def _convert_str_col(col, errors, _downcast=None): +def _convert_str_col( + col: StringColumn, + errors: Literal["raise", "coerce", "ignore"], + _downcast: Literal["integer", "signed", "unsigned", "float", None] = None, +) -> NumericalColumn: """ Converts a string column to numeric column @@ -212,13 +223,21 @@ def _convert_str_col(col, errors, _downcast=None): if not is_string_dtype(col): raise TypeError("col must be string dtype.") - is_integer = libstrings.is_integer(col) - if is_integer.all(): - return col.astype(dtype=cudf.dtype("i8")) + if col.is_integer().all(): + return col.astype(dtype=cudf.dtype("i8")) # type: ignore[return-value] - col = _proc_inf_empty_strings(col) + # TODO: This can be handled by libcudf in + # future see StringColumn.as_numerical_column + converted_col = ( + col.to_lower() + .find_and_replace(as_column([""]), as_column(["NaN"])) + .replace_multiple( + as_column(["+", "inf", "inity"]), # type: ignore[arg-type] + as_column(["", "Inf", ""]), # type: ignore[arg-type] + ) + ) - is_float = libstrings.is_float(col) + is_float = converted_col.is_float() if is_float.all(): if _downcast in {"unsigned", "signed", "integer"}: warnings.warn( @@ -227,27 +246,14 @@ def _convert_str_col(col, errors, _downcast=None): "limited by float32 precision." ) ) - return col.astype(dtype=cudf.dtype("float32")) + return converted_col.astype(dtype=cudf.dtype("float32")) # type: ignore[return-value] else: - return col.astype(dtype=cudf.dtype("float64")) + return converted_col.astype(dtype=cudf.dtype("float64")) # type: ignore[return-value] else: if errors == "coerce": - col = libcudf.string_casting.stod(col) + converted_col = libcudf.string_casting.stod(converted_col) non_numerics = is_float.unary_operator("not") - col[non_numerics] = None - return col + converted_col[non_numerics] = None + return converted_col # type: ignore[return-value] else: raise ValueError("Unable to convert some strings to numerics.") - - -def _proc_inf_empty_strings(col: ColumnBase) -> ColumnBase: - """Handles empty and infinity strings""" - col = col.to_lower() # type: ignore[attr-defined] - col = col.find_and_replace(as_column([""]), as_column(["NaN"])) - # TODO: This can be handled by libcudf in - # future see StringColumn.as_numerical_column - col = col.replace_multiple( # type: ignore[attr-defined] - as_column(["+", "inf", "inity"]), - as_column(["", "Inf", ""]), - ) - return col From 5b412dc14d047959d1a2b70bf27ffea139769f7a Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Mon, 9 Dec 2024 16:54:59 -0600 Subject: [PATCH 45/78] [JNI] remove rmm argument to set rw access for fabric handles (#17553) This is a follow up from https://github.com/rapidsai/cudf/pull/17526, where fabric handles can be enabled from RMM. 
That PR also sets the memory access protection flag (`cudaMemPoolSetAccess`), but I have learned that this second flag is not needed from the owner device. In fact, it causes confusion because the owning device fails to call this function with some of the flags (access none). `cudaMemPoolSetAccess` is meant to only be called from peer processes that have imported the pool's handle. In our case, UCX handles this from the peer's side and it does not need to be anywhere in RMM or cuDF. Sorry for the noise. I'd like to get this fix in, and then I am going to fix RMM by removing that API. Authors: - Alessandro Bellina (https://github.com/abellina) Approvers: - Bradley Dice (https://github.com/bdice) - Nghia Truong (https://github.com/ttnghia) - Jason Lowe (https://github.com/jlowe) URL: https://github.com/rapidsai/cudf/pull/17553 --- java/src/main/native/src/RmmJni.cpp | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp index 0f424761bfe..8c733018fa7 100644 --- a/java/src/main/native/src/RmmJni.cpp +++ b/java/src/main/native/src/RmmJni.cpp @@ -778,17 +778,11 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_newCudaAsyncMemoryResource( try { cudf::jni::auto_set_device(env); - // When we are using fabric, we need to set the memory access to be - // read_write, in order for peer GPUs to have access to this memory. - // Otherwise, choose default parameters (optional set to nullopt). - auto [handle_type, prot_flag] = - fabric - ? std::pair{std::optional{ - rmm::mr::cuda_async_memory_resource::allocation_handle_type::fabric}, - std::optional{rmm::mr::cuda_async_memory_resource::access_flags::read_write}} - : std::pair{std::nullopt, std::nullopt}; - - auto ret = new rmm::mr::cuda_async_memory_resource(init, release, handle_type, prot_flag); + auto handle_type = + fabric ? 
std::optional{rmm::mr::cuda_async_memory_resource::allocation_handle_type::fabric} + : std::nullopt; + + auto ret = new rmm::mr::cuda_async_memory_resource(init, release, handle_type); return reinterpret_cast(ret); } From 9df95d1c5fd41b1b87976fd3680a1d06f2d26310 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 9 Dec 2024 14:55:16 -0800 Subject: [PATCH 46/78] Remove cudf._lib.transform in favor of inlining pylibcudf (#17505) Contributes to https://github.com/rapidsai/cudf/issues/17317 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/17505 --- python/cudf/cudf/_lib/CMakeLists.txt | 1 - python/cudf/cudf/_lib/transform.pyx | 113 -------------------- python/cudf/cudf/core/column/categorical.py | 4 +- python/cudf/cudf/core/column/column.py | 34 ++++-- python/cudf/cudf/core/column/lists.py | 2 +- python/cudf/cudf/core/column/numerical.py | 30 +++++- python/cudf/cudf/core/dataframe.py | 30 +++--- python/cudf/cudf/core/df_protocol.py | 3 +- python/cudf/cudf/core/frame.py | 9 +- python/cudf/cudf/core/indexed_frame.py | 2 +- python/cudf/cudf/core/reshape.py | 7 +- python/cudf/cudf/core/series.py | 3 +- python/cudf/cudf/datasets.py | 3 +- python/cudf/cudf/tests/test_column.py | 9 +- 14 files changed, 85 insertions(+), 165 deletions(-) delete mode 100644 python/cudf/cudf/_lib/transform.pyx diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index f9ac3a16940..084fc19a61e 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -25,7 +25,6 @@ set(cython_sources stream_compaction.pyx string_casting.pyx strings_udf.pyx - transform.pyx types.pyx utils.pyx ) diff --git a/python/cudf/cudf/_lib/transform.pyx b/python/cudf/cudf/_lib/transform.pyx deleted file mode 100644 index a163bb07888..00000000000 --- a/python/cudf/cudf/_lib/transform.pyx +++ /dev/null @@ -1,113 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- -from numba.np import numpy_support - -import cudf -from cudf.core.buffer import acquire_spill_lock, as_buffer -from cudf.utils import cudautils - -from pylibcudf cimport transform as plc_transform -from pylibcudf.libcudf.types cimport size_type - -from cudf._lib.column cimport Column - -import pylibcudf as plc - - -@acquire_spill_lock() -def bools_to_mask(Column col): - """ - Given an int8 (boolean) column, compress the data from booleans to bits and - return a Buffer - """ - mask, _ = plc_transform.bools_to_mask(col.to_pylibcudf(mode="read")) - return as_buffer(mask) - - -@acquire_spill_lock() -def mask_to_bools(object mask_buffer, size_type begin_bit, size_type end_bit): - """ - Given a mask buffer, returns a boolean column representng bit 0 -> False - and 1 -> True within range of [begin_bit, end_bit), - """ - if not isinstance(mask_buffer, cudf.core.buffer.Buffer): - raise TypeError("mask_buffer is not an instance of " - "cudf.core.buffer.Buffer") - plc_column = plc_transform.mask_to_bools( - mask_buffer.get_ptr(mode="read"), begin_bit, end_bit - ) - return Column.from_pylibcudf(plc_column) - - -@acquire_spill_lock() -def nans_to_nulls(Column input): - mask, _ = plc_transform.nans_to_nulls( - input.to_pylibcudf(mode="read") - ) - return as_buffer(mask) - - -@acquire_spill_lock() -def transform(Column input, op): - nb_type = numpy_support.from_dtype(input.dtype) - nb_signature = (nb_type,) - compiled_op = cudautils.compile_udf(op, nb_signature) - np_dtype = cudf.dtype(compiled_op[1]) - - plc_column = plc_transform.transform( - input.to_pylibcudf(mode="read"), - compiled_op[0], - plc.column._datatype_from_dtype_desc(np_dtype.str[1:]), - True - ) - return Column.from_pylibcudf(plc_column) - - -def table_encode(list source_columns): - plc_table, plc_column = plc_transform.encode( - plc.Table([col.to_pylibcudf(mode="read") for col in source_columns]) - ) - - return ( - [Column.from_pylibcudf(col) for col in plc_table.columns()], - Column.from_pylibcudf(plc_column) - ) - - -def one_hot_encode(Column input_column, Column categories): - plc_table = plc_transform.one_hot_encode( - input_column.to_pylibcudf(mode="read"), - categories.to_pylibcudf(mode="read"), - ) - result_columns = [ - Column.from_pylibcudf(col, data_ptr_exposed=True) - for col in plc_table.columns() - ] - result_labels = [ - x if x is not None else '' - for x in categories.to_arrow().to_pylist() - ] - return dict(zip(result_labels, result_columns)) - - -@acquire_spill_lock() -def compute_column(list columns, tuple column_names, str expr): - """Compute a new column by evaluating an expression on a set of columns. - - Parameters - ---------- - columns : list - The set of columns forming the table to evaluate the expression on. - column_names : tuple[str] - The names associated with each column. These names are necessary to map - column names in the expression to indices in the provided list of - columns, which are what will be used by libcudf to evaluate the - expression on the table. - expr : str - The expression to evaluate. 
- """ - result = plc_transform.compute_column( - plc.Table([col.to_pylibcudf(mode="read") for col in columns]), - plc.expressions.to_expression(expr, column_names), - ) - return Column.from_pylibcudf(result) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index c849a9d3d2b..71ec11e75af 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -13,7 +13,6 @@ import cudf from cudf import _lib as libcudf -from cudf._lib.transform import bools_to_mask from cudf.core._internals import unary from cudf.core.column import column from cudf.core.column.methods import ColumnMethods @@ -775,12 +774,11 @@ def to_pandas( raise NotImplementedError(f"{arrow_type=} is not implemented.") if self.categories.dtype.kind == "f": - new_mask = bools_to_mask(self.notnull()) col = type(self)( data=self.data, # type: ignore[arg-type] size=self.size, dtype=self.dtype, - mask=new_mask, + mask=self.notnull().fillna(False).as_mask(), children=self.children, ) else: diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 1ddc79e8970..b317858077f 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -32,7 +32,6 @@ drop_duplicates, drop_nulls, ) -from cudf._lib.transform import bools_to_mask from cudf._lib.types import size_type_dtype from cudf.api.types import ( _is_non_decimal_numeric_dtype, @@ -373,10 +372,14 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase: return result._with_type_metadata(cudf_dtype_from_pa_type(array.type)) + @acquire_spill_lock() def _get_mask_as_column(self) -> ColumnBase: - return libcudf.transform.mask_to_bools( - self.base_mask, self.offset, self.offset + len(self) + plc_column = plc.transform.mask_to_bools( + self.base_mask.get_ptr(mode="read"), # type: ignore[union-attr] + self.offset, + self.offset + len(self), ) + return type(self).from_pylibcudf(plc_column) @cached_property def memory_usage(self) -> int: @@ -981,11 +984,14 @@ def as_mask(self) -> Buffer: ------- Buffer """ - if self.has_nulls(): raise ValueError("Column must have no nulls.") - return bools_to_mask(self) + with acquire_spill_lock(): + mask, _ = plc.transform.bools_to_mask( + self.to_pylibcudf(mode="read") + ) + return as_buffer(mask) @property def is_unique(self) -> bool: @@ -1514,6 +1520,18 @@ def _return_sentinel_column(): ) return codes.fillna(na_sentinel.value) + def one_hot_encode( + self, categories: ColumnBase + ) -> abc.Generator[ColumnBase]: + plc_table = plc.transform.one_hot_encode( + self.to_pylibcudf(mode="read"), + categories.to_pylibcudf(mode="read"), + ) + return ( + type(self).from_pylibcudf(col, data_ptr_exposed=True) + for col in plc_table.columns() + ) + def _has_any_nan(arbitrary: pd.Series | np.ndarray) -> bool: """Check if an object dtype Series or array contains NaN.""" @@ -2093,8 +2111,7 @@ def as_column( ) # Consider NaT as NA in the mask # but maintain NaT as a value - bool_mask = as_column(~is_nat) - mask = as_buffer(bools_to_mask(bool_mask)) + mask = as_column(~is_nat).as_mask() buffer = as_buffer(arbitrary.view("|u1")) col = build_column(data=buffer, mask=mask, dtype=arbitrary.dtype) if dtype: @@ -2264,8 +2281,7 @@ def _mask_from_cuda_array_interface_desc(obj, cai_mask) -> Buffer: ) return as_buffer(data=desc["data"][0], size=mask_size, owner=obj) elif typecode == "b": - col = as_column(cai_mask) - return bools_to_mask(col) + return as_column(cai_mask).as_mask() else: raise 
NotImplementedError(f"Cannot infer mask from typestr {typestr}") diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index b95fb0a0d39..ba98e28f6a2 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -255,7 +255,7 @@ def from_sequences( data=None, size=len(arbitrary), dtype=cudf.ListDtype(data_col.dtype), - mask=cudf._lib.transform.bools_to_mask(as_column(mask_col)), + mask=as_column(mask_col).as_mask(), offset=0, null_count=0, children=(offset_col, data_col), diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 9514aaeab50..790cd6ea9bb 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -7,9 +7,10 @@ import numpy as np import pandas as pd +from numba.np import numpy_support from typing_extensions import Self -import pylibcudf +import pylibcudf as plc import cudf import cudf.core.column.column as column @@ -17,11 +18,13 @@ from cudf import _lib as libcudf from cudf.api.types import is_integer, is_scalar from cudf.core._internals import binaryop, unary +from cudf.core.buffer import acquire_spill_lock, as_buffer from cudf.core.column.column import ColumnBase, as_column from cudf.core.column.numerical_base import NumericalBaseColumn from cudf.core.dtypes import CategoricalDtype from cudf.core.mixins import BinaryOperand from cudf.errors import MixedTypeError +from cudf.utils import cudautils from cudf.utils.dtypes import ( find_common_type, min_column_type, @@ -179,13 +182,27 @@ def __setitem__(self, key: Any, value: Any): if out: self._mimic_inplace(out, inplace=True) + @acquire_spill_lock() + def transform(self, compiled_op, np_dtype: np.dtype) -> ColumnBase: + plc_column = plc.transform.transform( + self.to_pylibcudf(mode="read"), + compiled_op[0], + plc.column._datatype_from_dtype_desc(np_dtype.str[1:]), + True, + ) + return type(self).from_pylibcudf(plc_column) + def unary_operator(self, unaryop: str | Callable) -> ColumnBase: if callable(unaryop): - return libcudf.transform.transform(self, unaryop) + nb_type = numpy_support.from_dtype(self.dtype) + nb_signature = (nb_type,) + compiled_op = cudautils.compile_udf(unaryop, nb_signature) + np_dtype = np.dtype(compiled_op[1]) + return self.transform(compiled_op, np_dtype) unaryop = unaryop.upper() unaryop = _unaryop_map.get(unaryop, unaryop) - unaryop = pylibcudf.unary.UnaryOperator[unaryop] + unaryop = plc.unary.UnaryOperator[unaryop] return unary.unary_operation(self, unaryop) def __invert__(self): @@ -298,8 +315,11 @@ def nans_to_nulls(self: Self) -> Self: # Only floats can contain nan. 
if self.dtype.kind != "f" or self.nan_count == 0: return self - newmask = libcudf.transform.nans_to_nulls(self) - return self.set_mask(newmask) + with acquire_spill_lock(): + mask, _ = plc.transform.nans_to_nulls( + self.to_pylibcudf(mode="read") + ) + return self.set_mask(as_buffer(mask)) def normalize_binop_value(self, other: ScalarLike) -> Self | cudf.Scalar: if isinstance(other, ColumnBase): diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 656274bca38..325601e5311 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6772,9 +6772,7 @@ def _apply_cupy_method_axis_1(self, method, *args, **kwargs): ) result = column.as_column(result, dtype=result_dtype) if mask is not None: - result = result.set_mask( - cudf._lib.transform.bools_to_mask(mask._column) - ) + result = result.set_mask(mask._column.as_mask()) return Series._from_column(result, index=self.index) else: result_df = DataFrame(result, index=self.index) @@ -7883,6 +7881,16 @@ def interleave_columns(self): ) return self._constructor_sliced._from_column(result_col) + @acquire_spill_lock() + def _compute_columns(self, expr: str) -> ColumnBase: + plc_column = plc.transform.compute_column( + plc.Table( + [col.to_pylibcudf(mode="read") for col in self._columns] + ), + plc.expressions.to_expression(expr, self._column_names), + ) + return libcudf.column.Column.from_pylibcudf(plc_column) + @_performance_tracking def eval(self, expr: str, inplace: bool = False, **kwargs): """Evaluate a string describing operations on DataFrame columns. @@ -8010,11 +8018,7 @@ def eval(self, expr: str, inplace: bool = False, **kwargs): raise ValueError( "Cannot operate inplace if there is no assignment" ) - return Series._from_column( - libcudf.transform.compute_column( - [*self._columns], self._column_names, statements[0] - ) - ) + return Series._from_column(self._compute_columns(statements[0])) targets = [] exprs = [] @@ -8030,15 +8034,9 @@ def eval(self, expr: str, inplace: bool = False, **kwargs): targets.append(t.strip()) exprs.append(e.strip()) - cols = ( - libcudf.transform.compute_column( - [*self._columns], self._column_names, e - ) - for e in exprs - ) ret = self if inplace else self.copy(deep=False) - for name, col in zip(targets, cols): - ret._data[name] = col + for name, expr in zip(targets, exprs): + ret._data[name] = self._compute_columns(expr) if not inplace: return ret diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index aa601a2b322..a798041699e 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -799,8 +799,7 @@ def _set_missing_values( valid_mask = _ensure_gpu_buffer( valid_mask[0], valid_mask[1], allow_copy ) - boolmask = as_column(valid_mask._buf, dtype="bool") - bitmask = cudf._lib.transform.bools_to_mask(boolmask) + bitmask = as_column(valid_mask._buf, dtype="bool").as_mask() return cudf_col.set_mask(bitmask) elif null == _MaskKind.BITMASK: valid_mask = _ensure_gpu_buffer( diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 0a7e6fefe6e..84a3caf905f 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1457,7 +1457,14 @@ def _split(self, splits): @_performance_tracking def _encode(self): - columns, indices = libcudf.transform.table_encode(list(self._columns)) + plc_table, plc_column = plc.transform.encode( + plc.Table([col.to_pylibcudf(mode="read") for col in self._columns]) + ) + columns = [ + 
libcudf.column.Column.from_pylibcudf(col) + for col in plc_table.columns() + ] + indices = libcudf.column.Column.from_pylibcudf(plc_column) keys = self._from_columns_like_self(columns) return keys, indices diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 21ac009e7ff..95f3d4d01d5 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -3507,7 +3507,7 @@ def _apply(self, func, kernel_getter, *args, **kwargs): col = _post_process_output_col(ans_col, retty) - col.set_base_mask(libcudf.transform.bools_to_mask(ans_mask)) + col.set_base_mask(ans_mask.as_mask()) result = cudf.Series._from_column(col, index=self.index) return result diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 84c653c5b3f..59a3e9dbf3b 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -12,7 +12,6 @@ import cudf from cudf._lib.column import Column -from cudf._lib.transform import one_hot_encode from cudf._lib.types import size_type_dtype from cudf.api.extensions import no_default from cudf.api.types import is_scalar @@ -1338,7 +1337,11 @@ def _one_hot_encode_column( f"np.iinfo({size_type_dtype}).max. Consider reducing " "size of category" ) - data = one_hot_encode(column, categories) + result_labels = ( + x if x is not None else "" + for x in categories.to_arrow().to_pylist() + ) + data = dict(zip(result_labels, column.one_hot_encode(categories))) if drop_first and len(data): data.pop(next(iter(data))) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 58cefc6554e..be74b0f867a 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -17,7 +17,6 @@ from typing_extensions import Self, assert_never import cudf -from cudf import _lib as libcudf from cudf.api.extensions import no_default from cudf.api.types import ( _is_non_decimal_numeric_dtype, @@ -526,7 +525,7 @@ def from_categorical(cls, categorical, codes=None): mask = None if not valid_codes.all(): - mask = libcudf.transform.bools_to_mask(valid_codes) + mask = valid_codes.as_mask() col = CategoricalColumn( data=col.data, size=codes.size, diff --git a/python/cudf/cudf/datasets.py b/python/cudf/cudf/datasets.py index e8d634598f4..a91a4951306 100644 --- a/python/cudf/cudf/datasets.py +++ b/python/cudf/cudf/datasets.py @@ -4,7 +4,6 @@ import pandas as pd import cudf -from cudf._lib.transform import bools_to_mask __all__ = ["randomdata", "timeseries"] @@ -70,7 +69,7 @@ def timeseries( size=len(index), p=[1 - nulls_frequency, nulls_frequency], ) - mask_buf = bools_to_mask(cudf.core.column.as_column(mask)) + mask_buf = cudf.core.column.as_column(mask).as_mask() masked_col = gdf[col]._column.set_mask(mask_buf) gdf[col] = cudf.Series._from_column(masked_col, index=gdf.index) diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py index 65947efc2df..c3c9a1c5338 100644 --- a/python/cudf/cudf/tests/test_column.py +++ b/python/cudf/cudf/tests/test_column.py @@ -7,7 +7,6 @@ import pytest import cudf -from cudf._lib.transform import mask_to_bools from cudf.core.column.column import as_column from cudf.testing import assert_eq from cudf.testing._utils import assert_exceptions_equal @@ -489,9 +488,7 @@ def test_build_df_from_nullable_pandas_dtype(pd_dtype, expect_dtype): # check mask expect_mask = [x is not pd.NA for x in pd_data["a"]] - got_mask = mask_to_bools( - gd_data["a"]._column.base_mask, 0, len(gd_data) - ).values_host + 
got_mask = gd_data["a"]._column._get_mask_as_column().values_host np.testing.assert_array_equal(expect_mask, got_mask) @@ -527,9 +524,7 @@ def test_build_series_from_nullable_pandas_dtype(pd_dtype, expect_dtype): # check mask expect_mask = [x is not pd.NA for x in pd_data] - got_mask = mask_to_bools( - gd_data._column.base_mask, 0, len(gd_data) - ).values_host + got_mask = gd_data._column._get_mask_as_column().values_host np.testing.assert_array_equal(expect_mask, got_mask) From ebad043967e8bb6a2a56ecfcb0b0612ea2894fa2 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Mon, 9 Dec 2024 18:37:41 -0500 Subject: [PATCH 47/78] Remove unused `BufferArrayFromVector` (#17549) Follow up to #17506. This PR removes an unused buffer class. Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/17549 --- python/cudf/cudf/_lib/parquet.pyx | 46 +------------------------------ 1 file changed, 1 insertion(+), 45 deletions(-) diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index 1b4c18d13a7..00c434ae374 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -20,11 +20,8 @@ from cudf._lib.utils cimport _data_from_columns, data_from_pylibcudf_io from cudf._lib.utils import _index_level_name, generate_pandas_metadata -from libc.stdint cimport int64_t, uint8_t +from libc.stdint cimport int64_t from libcpp cimport bool -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move -from libcpp.vector cimport vector from pylibcudf.expressions cimport Expression from pylibcudf.io.parquet cimport ChunkedParquetReader @@ -47,47 +44,6 @@ from pylibcudf cimport Table from cudf.utils.ioutils import _ROW_GROUP_SIZE_BYTES_DEFAULT from pylibcudf.io.types cimport TableInputMetadata, SinkInfo, ColumnInMetadata from pylibcudf.io.parquet cimport ParquetChunkedWriter -from cython.operator cimport dereference - - -cdef class BufferArrayFromVector: - cdef Py_ssize_t length - cdef unique_ptr[vector[uint8_t]] in_vec - - # these two things declare part of the buffer interface - cdef Py_ssize_t shape[1] - cdef Py_ssize_t strides[1] - - @staticmethod - cdef BufferArrayFromVector from_unique_ptr( - unique_ptr[vector[uint8_t]] in_vec - ): - cdef BufferArrayFromVector buf = BufferArrayFromVector() - buf.in_vec = move(in_vec) - buf.length = dereference(buf.in_vec).size() - return buf - - def __getbuffer__(self, Py_buffer *buffer, int flags): - cdef Py_ssize_t itemsize = sizeof(uint8_t) - - self.shape[0] = self.length - self.strides[0] = 1 - - buffer.buf = dereference(self.in_vec).data() - - buffer.format = NULL # byte - buffer.internal = NULL - buffer.itemsize = itemsize - buffer.len = self.length * itemsize # product(shape) * itemsize - buffer.ndim = 1 - buffer.obj = self - buffer.readonly = 0 - buffer.shape = self.shape - buffer.strides = self.strides - buffer.suboffsets = NULL - - def __releasebuffer__(self, Py_buffer *buffer): - pass def _parse_metadata(meta): From 47643959aaa7331523d79178bf37ea5106a01c05 Mon Sep 17 00:00:00 2001 From: Hirota Akio <33370421+a-hirota@users.noreply.github.com> Date: Tue, 10 Dec 2024 11:13:02 +0900 Subject: [PATCH 48/78] Enable rounding for Decimal32 and Decimal64 in cuDF (#17332) Authors: - Hirota Akio (https://github.com/a-hirota) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: 
https://github.com/rapidsai/cudf/pull/17332
---
 python/cudf/cudf/core/indexed_frame.py |  8 +++-
 python/cudf/cudf/tests/test_series.py  | 63 ++++++++++++++++++++++++++
 2 files changed, 70 insertions(+), 1 deletion(-)

diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index 95f3d4d01d5..0e6a5e03ea6 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -3970,7 +3970,13 @@ def round(self, decimals=0, how="half_even"):

         cols = (
             col.round(decimals[name], how=how)
-            if name in decimals and col.dtype.kind in "fiu"
+            if name in decimals
+            and (
+                col.dtype.kind in "fiu"
+                or isinstance(
+                    col.dtype, (cudf.Decimal32Dtype, cudf.Decimal64Dtype)
+                )
+            )
             else col.copy(deep=True)
             for name, col in self._column_labels_and_values
         )
diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py
index 99bd9adb034..f8697c5c6b8 100644
--- a/python/cudf/cudf/tests/test_series.py
+++ b/python/cudf/cudf/tests/test_series.py
@@ -772,6 +772,69 @@ def test_round_nan_as_null_false(series, decimal):
     assert_eq(result, expected, atol=1e-10)


+@pytest.mark.parametrize(
+    "data, dtype, decimals, expected_half_up, expected_half_even",
+    [
+        (
+            [1.234, 2.345, 3.456],
+            cudf.Decimal32Dtype(precision=5, scale=3),
+            2,
+            [1.23, 2.35, 3.46],
+            [1.23, 2.34, 3.46],
+        ),
+        (
+            [1.234, 2.345, 3.456],
+            cudf.Decimal32Dtype(precision=5, scale=3),
+            0,
+            [1.0, 2.0, 3.0],
+            [1.0, 2.0, 3.0],
+        ),
+        (
+            [1.234, 2.345, 3.456],
+            cudf.Decimal32Dtype(precision=5, scale=3),
+            3,
+            [1.234, 2.345, 3.456],
+            [1.234, 2.345, 3.456],
+        ),
+        (
+            [1.234567, 2.345678, 3.456789],
+            cudf.Decimal64Dtype(precision=10, scale=6),
+            4,
+            [1.2346, 2.3457, 3.4568],
+            [1.2346, 2.3457, 3.4568],
+        ),
+        (
+            [1.234567, 2.345678, 3.456789],
+            cudf.Decimal64Dtype(precision=10, scale=6),
+            2,
+            [1.23, 2.35, 3.46],
+            [1.23, 2.35, 3.46],
+        ),
+        (
+            [1.234567, 2.345678, 3.456789],
+            cudf.Decimal64Dtype(precision=10, scale=6),
+            6,
+            [1.234567, 2.345678, 3.456789],
+            [1.234567, 2.345678, 3.456789],
+        ),
+    ],
+)
+def test_series_round_decimal(
+    data, dtype, decimals, expected_half_up, expected_half_even
+):
+    ser = cudf.Series(data).astype(dtype)
+
+    result_half_up = ser.round(decimals=decimals, how="half_up").astype(dtype)
+    expected_ser_half_up = cudf.Series(expected_half_up).astype(dtype)
+    assert_eq(result_half_up, expected_ser_half_up)
+
+    result_half_even = ser.round(decimals=decimals, how="half_even").astype(
+        dtype
+    )
+    expected_ser_half_even = cudf.Series(expected_half_even).astype(dtype)
+    assert_eq(result_half_even, expected_ser_half_even)
+
+
 @pytest.mark.parametrize("ps", _series_na_data())
 @pytest.mark.parametrize("nan_as_null", [True, False, None])
 def test_series_isnull_isna(ps, nan_as_null):

From c53ace8f381af7c9e9dce161dcc756d07f8f147c Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon, 9 Dec 2024 18:35:11 -0800
Subject: [PATCH 49/78] Fix CMake format in cudf/_lib/CMakeLists.txt (#17559)

Two of my cudf._lib refactoring PRs landed and affected the formatting of
`cudf/_lib/CMakeLists.txt`; this fixes the CMake format.

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/17559
---
 python/cudf/cudf/_lib/CMakeLists.txt | 16 ++--------------
 1 file changed, 2 insertions(+), 14 deletions(-)

diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt
index 084fc19a61e..efe96ff6c3e 
100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -13,20 +13,8 @@ # ============================================================================= set(cython_sources - column.pyx - copying.pyx - csv.pyx - groupby.pyx - interop.pyx - parquet.pyx - reduce.pyx - scalar.pyx - sort.pyx - stream_compaction.pyx - string_casting.pyx - strings_udf.pyx - types.pyx - utils.pyx + column.pyx copying.pyx csv.pyx groupby.pyx interop.pyx parquet.pyx reduce.pyx scalar.pyx + sort.pyx stream_compaction.pyx string_casting.pyx strings_udf.pyx types.pyx utils.pyx ) set(linked_libraries cudf::cudf) From e16b3a3c499bda40082c1990f94ef0aa3bb23b35 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Tue, 10 Dec 2024 01:16:54 -0600 Subject: [PATCH 50/78] Remove Thrust patch in favor of CMake definition for Thrust 32-bit offset types. (#17527) Follow-up for #17523 to use `target_compile_definitions` and drop the Thrust patch. Authors: - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/17527 --- cpp/CMakeLists.txt | 4 ++++ .../thirdparty/patches/cccl_override.json | 5 ----- .../thrust_disable_64bit_dispatching.diff | 22 ------------------- 3 files changed, 4 insertions(+), 27 deletions(-) delete mode 100644 cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 12e6826f301..e54c71de4fa 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -928,6 +928,10 @@ if(TARGET CUDA::cuFile${_cufile_suffix}) target_compile_definitions(cudf PRIVATE CUDF_CUFILE_FOUND) endif() +# Remove this after upgrading to a CCCL that has a proper CMake option. See +# https://github.com/NVIDIA/cccl/pull/2844 +target_compile_definitions(cudf PRIVATE THRUST_FORCE_32_BIT_OFFSET_TYPE=1) + # Compile stringified JIT sources first add_dependencies(cudf jitify_preprocess_run) diff --git a/cpp/cmake/thirdparty/patches/cccl_override.json b/cpp/cmake/thirdparty/patches/cccl_override.json index 2f29578f7ae..d5cadce40c2 100644 --- a/cpp/cmake/thirdparty/patches/cccl_override.json +++ b/cpp/cmake/thirdparty/patches/cccl_override.json @@ -3,11 +3,6 @@ "packages" : { "CCCL" : { "patches" : [ - { - "file" : "${current_json_dir}/thrust_disable_64bit_dispatching.diff", - "issue" : "Remove 64bit dispatching as not needed by libcudf and results in compiling twice as many kernels [https://github.com/rapidsai/cudf/pull/11437]", - "fixed_in" : "" - }, { "file" : "${current_json_dir}/thrust_faster_sort_compile_times.diff", "issue" : "Improve Thrust sort compile times by not unrolling loops for inlined comparators [https://github.com/rapidsai/cudf/pull/10577]", diff --git a/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff b/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff deleted file mode 100644 index 9f68d85e7db..00000000000 --- a/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff +++ /dev/null @@ -1,22 +0,0 @@ -diff --git a/thrust/thrust/system/cuda/detail/dispatch.h b/thrust/thrust/system/cuda/detail/dispatch.h -index 3d004aa55..71ce86bea 100644 ---- a/thrust/thrust/system/cuda/detail/dispatch.h -+++ b/thrust/thrust/system/cuda/detail/dispatch.h -@@ -63,7 +63,7 @@ - _THRUST_INDEX_TYPE_DISPATCH_GUARD_UNDERFLOW(count1) \ - _THRUST_INDEX_TYPE_DISPATCH_GUARD_UNDERFLOW(count2) - --#if defined(THRUST_FORCE_64_BIT_OFFSET_TYPE) -+#if 0 - //! 
@brief Always dispatches to 64 bit offset version of an algorithm - # define THRUST_INDEX_TYPE_DISPATCH(status, call, count, arguments) \ - _THRUST_INDEX_TYPE_DISPATCH_GUARD_UNDERFLOW(count) \ -@@ -89,7 +89,7 @@ - _THRUST_INDEX_TYPE_DISPATCH_GUARD_UNDERFLOW(count) \ - _THRUST_INDEX_TYPE_DISPATCH(std::uint64_t, status, call_64, count, arguments) - --#elif defined(THRUST_FORCE_32_BIT_OFFSET_TYPE) -+#elif 1 - - //! @brief Ensures that the size of the input does not overflow the offset type - # define _THRUST_INDEX_TYPE_DISPATCH_GUARD_OVERFLOW(index_type, count) \ From 13e983eafecad5a3d4053157febd714e40a410c3 Mon Sep 17 00:00:00 2001 From: Mike Sarahan Date: Tue, 10 Dec 2024 09:15:37 -0600 Subject: [PATCH 51/78] gate telemetry dispatch calls on TELEMETRY_ENABLED env var (#17551) Because of the switch away from certificates/mTLS, we are having to rework a few things. In the meantime, telemetry jobs are failing. This PR adds a switch to turn all of the telemetry stuff off - to skip it instead. It is meant to be controlled by an org-wide environment variable, which can be applied to individual repos by ops. At the time of submitting this PR, the environment variable is 'false' and no telemetry is being reported. Authors: - Mike Sarahan (https://github.com/msarahan) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/17551 --- .github/workflows/pr.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 7c0bd6d52e2..49ca5ca0fb9 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -52,6 +52,7 @@ jobs: OTEL_SERVICE_NAME: 'pr-cudf' steps: - name: Telemetry setup + if: ${{ vars.TELEMETRY_ENABLED == 'true' }} uses: rapidsai/shared-actions/telemetry-dispatch-stash-base-env-vars@main changed-files: secrets: inherit @@ -329,7 +330,7 @@ jobs: telemetry-summarize: runs-on: ubuntu-latest needs: pr-builder - if: always() + if: ${{ vars.TELEMETRY_ENABLED == 'true' && !cancelled() }} continue-on-error: true steps: - name: Load stashed telemetry env vars From 3468e9259960b4f16cd849e8497be4f5bee0839b Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 10 Dec 2024 12:32:10 -0500 Subject: [PATCH 52/78] Replace cudf::detail::copy_if logic with thrust::copy_if and gather (#17520) Replaces the custom kernels for `cudf::detail::copy_if` with a call to `thrust::copy_if` to build indices to call `cudf::detail::gather`. This is easier to maintain and faster for some cases but slower in others. 
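For illustration, the replacement boils down to the standard two-step select-then-gather idiom. Below is a minimal, self-contained Thrust sketch of that idiom, with a plain `device_vector` and a hypothetical `is_even` predicate standing in for libcudf's `Filter` functor; it is a sketch of the pattern, not the library code itself:

```cpp
// Compile with: nvcc -o select_gather select_gather.cu
#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/gather.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/sequence.h>

#include <cstdio>

// Hypothetical stand-in for the Filter functor: keep rows at even positions.
struct is_even {
  __host__ __device__ bool operator()(int i) const { return i % 2 == 0; }
};

int main()
{
  int const num_rows = 8;
  thrust::device_vector<float> input(num_rows);
  thrust::sequence(input.begin(), input.end());  // 0, 1, 2, ..., 7

  // Step 1: copy_if over a counting iterator materializes the passing row indices.
  thrust::device_vector<int> indices(num_rows);
  auto const begin = thrust::counting_iterator<int>(0);
  auto const indices_end =
    thrust::copy_if(begin, begin + num_rows, indices.begin(), is_even{});
  auto const output_size = static_cast<int>(indices_end - indices.begin());

  // Step 2: gather the selected rows through those indices.
  thrust::device_vector<float> output(output_size);
  thrust::gather(indices.begin(), indices_end, input.begin(), output.begin());

  for (int i = 0; i < output_size; ++i) {
    std::printf("%g\n", static_cast<float>(output[i]));  // prints 0 2 4 6
  }
  return 0;
}
```

In the patch itself, step 2 goes through `cudf::detail::gather` rather than raw `thrust::gather`, since the former also rebuilds null masks and handles non-fixed-width columns, which is what makes the per-type scatter kernels deleted below unnecessary.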
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Yunsong Wang (https://github.com/PointKernel) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/17520 --- .../stream_compaction/apply_boolean_mask.cpp | 8 +- cpp/include/cudf/detail/copy_if.cuh | 363 +----------------- cpp/src/dictionary/remove_keys.cu | 1 + 3 files changed, 23 insertions(+), 349 deletions(-) diff --git a/cpp/benchmarks/stream_compaction/apply_boolean_mask.cpp b/cpp/benchmarks/stream_compaction/apply_boolean_mask.cpp index fa017ca9e29..267aa3a93f3 100644 --- a/cpp/benchmarks/stream_compaction/apply_boolean_mask.cpp +++ b/cpp/benchmarks/stream_compaction/apply_boolean_mask.cpp @@ -63,8 +63,8 @@ void apply_boolean_mask_benchmark(nvbench::state& state, nvbench::type_list -#include #include -#include #include #include -#include -#include -#include -#include #include #include #include #include -#include #include -#include -#include #include -#include #include #include -#include -#include #include #include -#include - namespace cudf { namespace detail { -// Compute the count of elements that pass the mask within each block -template -CUDF_KERNEL void compute_block_counts(cudf::size_type* __restrict__ block_counts, - cudf::size_type size, - cudf::size_type per_thread, - Filter filter) -{ - int tid = threadIdx.x + per_thread * block_size * blockIdx.x; - int count = 0; - - for (int i = 0; i < per_thread; i++) { - bool mask_true = (tid < size) && filter(tid); - count += __syncthreads_count(mask_true); - tid += block_size; - } - - if (threadIdx.x == 0) block_counts[blockIdx.x] = count; -} - -// Compute the exclusive prefix sum of each thread's mask value within each block -template -__device__ cudf::size_type block_scan_mask(bool mask_true, cudf::size_type& block_sum) -{ - int offset = 0; - - using BlockScan = cub::BlockScan; - __shared__ typename BlockScan::TempStorage temp_storage; - BlockScan(temp_storage).ExclusiveSum(mask_true, offset, block_sum); - - return offset; -} - -// This kernel scatters data and validity mask of a column based on the -// scan of the boolean mask. The block offsets for the scan are already computed. -// Just compute the scan of the mask in each block and add it to the block's -// output offset. This is the output index of each element. Scattering -// the valid mask is not as easy, because each thread is only responsible for -// one bit. Warp-level processing (ballot) makes this simpler. -// To make scattering efficient, we "coalesce" the block's scattered data and -// valids in shared memory, and then write from shared memory to global memory -// in a contiguous manner. -// The has_validity template parameter specializes this kernel for the -// non-nullable case for performance without writing another kernel. 
-// -// Note: `filter` is not run on indices larger than the input column size -template -__launch_bounds__(block_size) CUDF_KERNEL - void scatter_kernel(cudf::mutable_column_device_view output_view, - cudf::size_type* output_null_count, - cudf::column_device_view input_view, - cudf::size_type const* __restrict__ block_offsets, - cudf::size_type size, - cudf::size_type per_thread, - Filter filter) -{ - T* __restrict__ output_data = output_view.data(); - cudf::bitmask_type* __restrict__ output_valid = output_view.null_mask(); - static_assert(block_size <= 1024, "Maximum thread block size exceeded"); - - int tid = threadIdx.x + per_thread * block_size * blockIdx.x; - cudf::size_type block_offset = block_offsets[blockIdx.x]; - - // one extra warp worth in case the block is not aligned - __shared__ bool temp_valids[has_validity ? block_size + cudf::detail::warp_size : 1]; - __shared__ T temp_data[block_size]; - - cudf::size_type warp_valid_counts{0}; // total valid sum over the `per_thread` loop below - cudf::size_type block_sum = 0; // count passing filter over the `per_thread` loop below - - // Note that since the maximum gridDim.x on all supported GPUs is as big as - // cudf::size_type, this loop is sufficient to cover our maximum column size - // regardless of the value of block_size and per_thread. - for (int i = 0; i < per_thread; i++) { - bool mask_true = (tid < size) && filter(tid); - - cudf::size_type tmp_block_sum = 0; - // get output location using a scan of the mask result - cudf::size_type const local_index = block_scan_mask(mask_true, tmp_block_sum); - block_sum += tmp_block_sum; - - if (has_validity) { - temp_valids[threadIdx.x] = false; // init shared memory - if (threadIdx.x < cudf::detail::warp_size) temp_valids[block_size + threadIdx.x] = false; - __syncthreads(); // wait for init - } - - if (mask_true) { - temp_data[local_index] = input_view.data()[tid]; // scatter data to shared - - // scatter validity mask to shared memory - if (has_validity and input_view.is_valid(tid)) { - // determine aligned offset for this warp's output - cudf::size_type const aligned_offset = block_offset % cudf::detail::warp_size; - temp_valids[local_index + aligned_offset] = true; - } - } - - __syncthreads(); // wait for shared data and validity mask to be complete - - // Copy output data coalesced from shared to global - if (threadIdx.x < tmp_block_sum) - output_data[block_offset + threadIdx.x] = temp_data[threadIdx.x]; - - if (has_validity) { - // Since the valid bools are contiguous in shared memory now, we can use - // __popc to combine them into a single mask element. - // Then, most mask elements can be directly copied from shared to global - // memory. Only the first and last 32-bit mask elements of each block must - // use an atomicOr, because these are where other blocks may overlap. 
- - constexpr int num_warps = block_size / cudf::detail::warp_size; - // account for partial blocks with non-warp-aligned offsets - int const last_index = tmp_block_sum + (block_offset % cudf::detail::warp_size) - 1; - int const last_warp = min(num_warps, last_index / cudf::detail::warp_size); - int const wid = threadIdx.x / cudf::detail::warp_size; - int const lane = threadIdx.x % cudf::detail::warp_size; - - cudf::size_type tmp_warp_valid_counts{0}; - - if (tmp_block_sum > 0 && wid <= last_warp) { - int valid_index = (block_offset / cudf::detail::warp_size) + wid; - - // compute the valid mask for this warp - uint32_t valid_warp = __ballot_sync(0xffff'ffffu, temp_valids[threadIdx.x]); - - // Note the atomicOr's below assume that output_valid has been set to - // all zero before the kernel - if (lane == 0 && valid_warp != 0) { - tmp_warp_valid_counts = __popc(valid_warp); - if (wid > 0 && wid < last_warp) - output_valid[valid_index] = valid_warp; - else { - cuda::atomic_ref ref{ - output_valid[valid_index]}; - ref.fetch_or(valid_warp, cuda::std::memory_order_relaxed); - } - } - - // if the block is full and not aligned then we have one more warp to cover - if ((wid == 0) && (last_warp == num_warps)) { - uint32_t valid_warp = __ballot_sync(0xffff'ffffu, temp_valids[block_size + threadIdx.x]); - if (lane == 0 && valid_warp != 0) { - tmp_warp_valid_counts += __popc(valid_warp); - cuda::atomic_ref ref{ - output_valid[valid_index + num_warps]}; - ref.fetch_or(valid_warp, cuda::std::memory_order_relaxed); - } - } - } - warp_valid_counts += tmp_warp_valid_counts; - } - - block_offset += tmp_block_sum; - tid += block_size; - } - // Compute total null_count for this block and add it to global count - constexpr cudf::size_type leader_lane{0}; - cudf::size_type block_valid_count = - cudf::detail::single_lane_block_sum_reduce(warp_valid_counts); - - if (threadIdx.x == 0) { // one thread computes and adds to null count - cuda::atomic_ref ref{*output_null_count}; - ref.fetch_add(block_sum - block_valid_count, cuda::std::memory_order_relaxed); - } -} - -template -struct DeviceType { - using type = T; -}; - -template -struct DeviceType()>> { - using type = typename T::rep; -}; - -template -struct DeviceType()>> { - using type = typename cudf::device_storage_type_t; -}; - -// Dispatch functor which performs the scatter for fixed column types and gather for other -template -struct scatter_gather_functor { - template ()>* = nullptr> - std::unique_ptr operator()(cudf::column_view const& input, - cudf::size_type const& output_size, - cudf::size_type const* block_offsets, - Filter filter, - cudf::size_type per_thread, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) - { - auto output_column = - cudf::allocate_like(input, output_size, cudf::mask_allocation_policy::RETAIN, stream, mr); - auto output = output_column->mutable_view(); - - bool has_valid = input.nullable(); - - using Type = typename DeviceType::type; - - auto scatter = (has_valid) ? scatter_kernel - : scatter_kernel; - - cudf::detail::grid_1d grid{input.size(), block_size, per_thread}; - - cudf::detail::device_scalar null_count{0, stream}; - if (output.nullable()) { - // Have to initialize the output mask to all zeros because we may update - // it with atomicOr(). 
- CUDF_CUDA_TRY(cudaMemsetAsync(static_cast(output.null_mask()), - 0, - cudf::bitmask_allocation_size_bytes(output.size()), - stream.value())); - } - - auto output_device_view = cudf::mutable_column_device_view::create(output, stream); - auto input_device_view = cudf::column_device_view::create(input, stream); - scatter<<>>(*output_device_view, - null_count.data(), - *input_device_view, - block_offsets, - input.size(), - per_thread, - filter); - - if (has_valid) { output_column->set_null_count(null_count.value(stream)); } - return output_column; - } - - template () and !cudf::is_fixed_point()>* = nullptr> - std::unique_ptr operator()(cudf::column_view const& input, - cudf::size_type const& output_size, - cudf::size_type const*, - Filter filter, - cudf::size_type, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) - { - rmm::device_uvector indices(output_size, stream); - - thrust::copy_if(rmm::exec_policy(stream), - thrust::counting_iterator(0), - thrust::counting_iterator(input.size()), - indices.begin(), - filter); - - auto output_table = cudf::detail::gather(cudf::table_view{{input}}, - indices, - cudf::out_of_bounds_policy::DONT_CHECK, - cudf::detail::negative_index_policy::NOT_ALLOWED, - stream, - mr); - - // There will be only one column - return std::make_unique(std::move(output_table->get_column(0))); - } -}; - /** * @brief Filters `input` using a Filter function object * @@ -319,9 +44,11 @@ struct scatter_gather_functor { * false otherwise. * * @tparam Filter the filter functor type - * @param[in] input The table_view to filter - * @param[in] filter A function object that takes an index and returns a bool - * @return unique_ptr The table generated from filtered `input`. + * @param input The table_view to filter + * @param filter A function object that takes an index and returns a bool + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used for allocating the returned memory + * @return The table generated from filtered `input` */ template std::unique_ptr
<table> copy_if(table_view const& input, @@ -333,76 +60,22 @@ std::unique_ptr<table>
copy_if(table_view const& input, if (0 == input.num_rows() || 0 == input.num_columns()) { return empty_like(input); } - constexpr int block_size = 256; - cudf::size_type per_thread = - elements_per_thread(compute_block_counts, input.num_rows(), block_size); - cudf::detail::grid_1d grid{input.num_rows(), block_size, per_thread}; - - // temp storage for block counts and offsets - rmm::device_uvector block_counts(grid.num_blocks, stream); - rmm::device_uvector block_offsets(grid.num_blocks + 1, stream); - - // 1. Find the count of elements in each block that "pass" the mask - compute_block_counts<<>>( - block_counts.begin(), input.num_rows(), per_thread, filter); - - // initialize just the first element of block_offsets to 0 since the InclusiveSum below - // starts at the second element. - CUDF_CUDA_TRY(cudaMemsetAsync(block_offsets.begin(), 0, sizeof(cudf::size_type), stream.value())); - - // 2. Find the offset for each block's output using a scan of block counts - if (grid.num_blocks > 1) { - // Determine and allocate temporary device storage - size_t temp_storage_bytes = 0; - cub::DeviceScan::InclusiveSum(nullptr, - temp_storage_bytes, - block_counts.begin(), - block_offsets.begin() + 1, - grid.num_blocks, - stream.value()); - rmm::device_buffer d_temp_storage(temp_storage_bytes, stream); - - // Run exclusive prefix sum - cub::DeviceScan::InclusiveSum(d_temp_storage.data(), - temp_storage_bytes, - block_counts.begin(), - block_offsets.begin() + 1, - grid.num_blocks, - stream.value()); - } - - // As it is InclusiveSum, last value in block_offsets will be output_size - // unless num_blocks == 1, in which case output_size is just block_counts[0] - cudf::size_type output_size{0}; - CUDF_CUDA_TRY(cudaMemcpyAsync( - &output_size, - grid.num_blocks > 1 ? block_offsets.begin() + grid.num_blocks : block_counts.begin(), - sizeof(cudf::size_type), - cudaMemcpyDefault, - stream.value())); + auto indices = rmm::device_uvector(input.num_rows(), stream); + auto const begin = thrust::counting_iterator(0); + auto const end = begin + input.num_rows(); + auto const indices_end = + thrust::copy_if(rmm::exec_policy(stream), begin, end, indices.begin(), filter); - stream.synchronize(); + auto const output_size = static_cast(thrust::distance(indices.begin(), indices_end)); - if (output_size == input.num_rows()) { - return std::make_unique
<table>(input, stream, mr); - } else if (output_size > 0) { - std::vector<std::unique_ptr<column>> out_columns(input.num_columns()); - std::transform(input.begin(), input.end(), out_columns.begin(), [&](auto col_view) { - return cudf::type_dispatcher(col_view.type(), - scatter_gather_functor<Filter, block_size>{}, - col_view, - output_size, - block_offsets.begin(), - filter, - per_thread, - stream, - mr); - }); + // nothing selected + if (output_size == 0) { return empty_like(input); } + // everything selected + if (output_size == input.num_rows()) { return std::make_unique
<table>(input, stream, mr); } - return std::make_unique<table>
(std::move(out_columns)); - } else { - return empty_like(input); - } + auto const map = device_span(indices.data(), output_size); + return cudf::detail::gather( + input, map, out_of_bounds_policy::DONT_CHECK, negative_index_policy::NOT_ALLOWED, stream, mr); } } // namespace detail diff --git a/cpp/src/dictionary/remove_keys.cu b/cpp/src/dictionary/remove_keys.cu index 59c8453cf33..4715931a7a9 100644 --- a/cpp/src/dictionary/remove_keys.cu +++ b/cpp/src/dictionary/remove_keys.cu @@ -14,6 +14,7 @@ * limitations under the License. */ +#include #include #include #include From 2f5bf7659e40cd27bb35f10785e233aad5481bbd Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 10 Dec 2024 09:37:46 -0800 Subject: [PATCH 53/78] Simplify serialization protocols (#17552) This rewrites all serialization protocols in cudf to remove the need for pickling intermediates. --- python/cudf/cudf/_lib/copying.pyx | 11 +-- python/cudf/cudf/core/_base_index.py | 8 -- python/cudf/cudf/core/abc.py | 16 ++-- python/cudf/cudf/core/buffer/buffer.py | 8 +- .../cudf/cudf/core/buffer/spillable_buffer.py | 4 +- python/cudf/cudf/core/column/column.py | 23 +++--- python/cudf/cudf/core/dataframe.py | 9 +- python/cudf/cudf/core/dtypes.py | 77 +++++++----------- python/cudf/cudf/core/frame.py | 73 +++++++++++++---- python/cudf/cudf/core/groupby/groupby.py | 13 ++- python/cudf/cudf/core/index.py | 13 +-- python/cudf/cudf/core/multiindex.py | 7 +- python/cudf/cudf/core/resample.py | 12 +-- python/cudf/cudf/core/series.py | 9 +- .../stringColumnWithRangeIndex_cudf_23.12.pkl | Bin 1394 -> 1108 bytes python/cudf/cudf/tests/test_serialize.py | 19 ++++- python/cudf/cudf/tests/test_struct.py | 2 +- .../dask_cudf/tests/test_distributed.py | 16 +++- 18 files changed, 179 insertions(+), 141 deletions(-) diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx index 4dfb12d8ab3..c478cd1a990 100644 --- a/python/cudf/cudf/_lib/copying.pyx +++ b/python/cudf/cudf/_lib/copying.pyx @@ -1,7 +1,5 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
-import pickle - from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.utility cimport move @@ -367,14 +365,13 @@ class PackedColumns(Serializable): header["index-names"] = self.index_names header["metadata"] = self._metadata.tobytes() for name, dtype in self.column_dtypes.items(): - dtype_header, dtype_frames = dtype.serialize() + dtype_header, dtype_frames = dtype.device_serialize() self.column_dtypes[name] = ( dtype_header, (len(frames), len(frames) + len(dtype_frames)), ) frames.extend(dtype_frames) header["column-dtypes"] = self.column_dtypes - header["type-serialized"] = pickle.dumps(type(self)) return header, frames @classmethod @@ -382,9 +379,9 @@ class PackedColumns(Serializable): column_dtypes = {} for name, dtype in header["column-dtypes"].items(): dtype_header, (start, stop) = dtype - column_dtypes[name] = pickle.loads( - dtype_header["type-serialized"] - ).deserialize(dtype_header, frames[start:stop]) + column_dtypes[name] = Serializable.device_deserialize( + dtype_header, frames[start:stop] + ) return cls( plc.contiguous_split.pack( plc.contiguous_split.unpack_from_memoryviews( diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index a6abd63d042..950ce5f1236 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -2,7 +2,6 @@ from __future__ import annotations -import pickle import warnings from functools import cached_property from typing import TYPE_CHECKING, Any, Literal @@ -330,13 +329,6 @@ def get_level_values(self, level): else: raise KeyError(f"Requested level with name {level} " "not found") - @classmethod - def deserialize(cls, header, frames): - # Dispatch deserialization to the appropriate index type in case - # deserialization is ever attempted with the base class directly. - idx_type = pickle.loads(header["type-serialized"]) - return idx_type.deserialize(header, frames) - @property def names(self): """ diff --git a/python/cudf/cudf/core/abc.py b/python/cudf/cudf/core/abc.py index ce6bb83bc77..c8ea03b04fe 100644 --- a/python/cudf/cudf/core/abc.py +++ b/python/cudf/cudf/core/abc.py @@ -1,8 +1,6 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. """Common abstract base classes for cudf.""" -import pickle - import numpy import cudf @@ -22,6 +20,14 @@ class Serializable: latter converts back from that representation into an equivalent object. """ + # A mapping from class names to the classes themselves. This is used to + # reconstruct the correct class when deserializing an object. + _name_type_map: dict = {} + + def __init_subclass__(cls, /, **kwargs): + super().__init_subclass__(**kwargs) + cls._name_type_map[cls.__name__] = cls + def serialize(self): """Generate an equivalent serializable representation of an object. 
@@ -98,7 +104,7 @@ def device_serialize(self): ) for f in frames ) - header["type-serialized"] = pickle.dumps(type(self)) + header["type-serialized-name"] = type(self).__name__ header["is-cuda"] = [ hasattr(f, "__cuda_array_interface__") for f in frames ] @@ -128,10 +134,10 @@ def device_deserialize(cls, header, frames): :meta private: """ - typ = pickle.loads(header["type-serialized"]) + typ = cls._name_type_map[header["type-serialized-name"]] frames = [ cudf.core.buffer.as_buffer(f) if c else memoryview(f) - for c, f in zip(header["is-cuda"], frames) + for c, f in zip(header["is-cuda"], frames, strict=True) ] return typ.deserialize(header, frames) diff --git a/python/cudf/cudf/core/buffer/buffer.py b/python/cudf/cudf/core/buffer/buffer.py index ffa306bf93f..625938ca168 100644 --- a/python/cudf/cudf/core/buffer/buffer.py +++ b/python/cudf/cudf/core/buffer/buffer.py @@ -3,7 +3,6 @@ from __future__ import annotations import math -import pickle import weakref from types import SimpleNamespace from typing import TYPE_CHECKING, Any, Literal @@ -432,8 +431,7 @@ def serialize(self) -> tuple[dict, list]: second element is a list containing single frame. """ header: dict[str, Any] = {} - header["type-serialized"] = pickle.dumps(type(self)) - header["owner-type-serialized"] = pickle.dumps(type(self._owner)) + header["owner-type-serialized-name"] = type(self._owner).__name__ header["frame_count"] = 1 frames = [self] return header, frames @@ -460,7 +458,9 @@ def deserialize(cls, header: dict, frames: list) -> Self: if isinstance(frame, cls): return frame # The frame is already deserialized - owner_type: BufferOwner = pickle.loads(header["owner-type-serialized"]) + owner_type: BufferOwner = Serializable._name_type_map[ + header["owner-type-serialized-name"] + ] if hasattr(frame, "__cuda_array_interface__"): owner = owner_type.from_device_memory(frame, exposed=False) else: diff --git a/python/cudf/cudf/core/buffer/spillable_buffer.py b/python/cudf/cudf/core/buffer/spillable_buffer.py index b40c56c9a6b..66f8be4ddc5 100644 --- a/python/cudf/cudf/core/buffer/spillable_buffer.py +++ b/python/cudf/cudf/core/buffer/spillable_buffer.py @@ -3,7 +3,6 @@ from __future__ import annotations import collections.abc -import pickle import time import weakref from threading import RLock @@ -415,8 +414,7 @@ def serialize(self) -> tuple[dict, list]: header: dict[str, Any] = {} frames: list[Buffer | memoryview] with self._owner.lock: - header["type-serialized"] = pickle.dumps(self.__class__) - header["owner-type-serialized"] = pickle.dumps(type(self._owner)) + header["owner-type-serialized-name"] = type(self._owner).__name__ header["frame_count"] = 1 if self.is_spilled: frames = [self.memoryview()] diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index f6eaea4b783..4b1e9c1129e 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2,7 +2,6 @@ from __future__ import annotations -import pickle from collections import abc from collections.abc import MutableSequence, Sequence from functools import cached_property @@ -1224,28 +1223,27 @@ def serialize(self) -> tuple[dict, list]: header: dict[Any, Any] = {} frames = [] - header["type-serialized"] = pickle.dumps(type(self)) try: - dtype, dtype_frames = self.dtype.serialize() + dtype, dtype_frames = self.dtype.device_serialize() header["dtype"] = dtype frames.extend(dtype_frames) header["dtype-is-cudf-serialized"] = True except AttributeError: - header["dtype"] = pickle.dumps(self.dtype) + 
header["dtype"] = self.dtype.str header["dtype-is-cudf-serialized"] = False if self.data is not None: - data_header, data_frames = self.data.serialize() + data_header, data_frames = self.data.device_serialize() header["data"] = data_header frames.extend(data_frames) if self.mask is not None: - mask_header, mask_frames = self.mask.serialize() + mask_header, mask_frames = self.mask.device_serialize() header["mask"] = mask_header frames.extend(mask_frames) if self.children: child_headers, child_frames = zip( - *(c.serialize() for c in self.children) + *(c.device_serialize() for c in self.children) ) header["subheaders"] = list(child_headers) frames.extend(chain(*child_frames)) @@ -1257,8 +1255,7 @@ def serialize(self) -> tuple[dict, list]: def deserialize(cls, header: dict, frames: list) -> ColumnBase: def unpack(header, frames) -> tuple[Any, list]: count = header["frame_count"] - klass = pickle.loads(header["type-serialized"]) - obj = klass.deserialize(header, frames[:count]) + obj = cls.device_deserialize(header, frames[:count]) return obj, frames[count:] assert header["frame_count"] == len(frames), ( @@ -1268,7 +1265,7 @@ def unpack(header, frames) -> tuple[Any, list]: if header["dtype-is-cudf-serialized"]: dtype, frames = unpack(header["dtype"], frames) else: - dtype = pickle.loads(header["dtype"]) + dtype = np.dtype(header["dtype"]) if "data" in header: data, frames = unpack(header["data"], frames) else: @@ -2219,7 +2216,9 @@ def serialize_columns(columns: list[ColumnBase]) -> tuple[list[dict], list]: frames = [] if len(columns) > 0: - header_columns = [c.serialize() for c in columns] + header_columns: list[tuple[dict, list]] = [ + c.device_serialize() for c in columns + ] headers, column_frames = zip(*header_columns) for f in column_frames: frames.extend(f) @@ -2236,7 +2235,7 @@ def deserialize_columns(headers: list[dict], frames: list) -> list[ColumnBase]: for meta in headers: col_frame_count = meta["frame_count"] - col_typ = pickle.loads(meta["type-serialized"]) + col_typ = Serializable._name_type_map[meta["type-serialized-name"]] colobj = col_typ.deserialize(meta, frames[:col_frame_count]) columns.append(colobj) # Advance frames diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index bd78d5dd9f1..fd68a40324e 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -7,7 +7,6 @@ import itertools import numbers import os -import pickle import re import sys import textwrap @@ -44,7 +43,6 @@ ) from cudf.core import column, df_protocol, indexing_utils, reshape from cudf.core._compat import PANDAS_LT_300 -from cudf.core.abc import Serializable from cudf.core.buffer import acquire_spill_lock from cudf.core.column import ( CategoricalColumn, @@ -582,7 +580,7 @@ class _DataFrameiAtIndexer(_DataFrameIlocIndexer): pass -class DataFrame(IndexedFrame, Serializable, GetAttrGetItemMixin): +class DataFrame(IndexedFrame, GetAttrGetItemMixin): """ A GPU Dataframe object. @@ -1184,7 +1182,7 @@ def _constructor_expanddim(self): def serialize(self): header, frames = super().serialize() - header["index"], index_frames = self.index.serialize() + header["index"], index_frames = self.index.device_serialize() header["index_frame_count"] = len(index_frames) # For backwards compatibility with older versions of cuDF, index # columns are placed before data columns. 
@@ -1199,8 +1197,7 @@ def deserialize(cls, header, frames): header, frames[header["index_frame_count"] :] ) - idx_typ = pickle.loads(header["index"]["type-serialized"]) - index = idx_typ.deserialize(header["index"], frames[:index_nframes]) + index = cls.device_deserialize(header["index"], frames[:index_nframes]) obj.index = index return obj diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 2110e610c37..8765a27a165 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -3,7 +3,6 @@ import decimal import operator -import pickle import textwrap import warnings from functools import cached_property @@ -91,13 +90,13 @@ def dtype(arbitrary): raise TypeError(f"Cannot interpret {arbitrary} as a valid cuDF dtype") -def _decode_type( +def _check_type( cls: type, header: dict, frames: list, is_valid_class: Callable[[type, type], bool] = operator.is_, -) -> tuple[dict, list, type]: - """Decode metadata-encoded type and check validity +) -> None: + """Perform metadata-encoded type and check validity Parameters ---------- @@ -112,12 +111,6 @@ class performing deserialization serialization by `cls` (default is to check type equality), called as `is_valid_class(decoded_class, cls)`. - Returns - ------- - tuple - Tuple of validated headers, frames, and the decoded class - constructor. - Raises ------ AssertionError @@ -128,11 +121,10 @@ class performing deserialization f"Deserialization expected {header['frame_count']} frames, " f"but received {len(frames)}." ) - klass = pickle.loads(header["type-serialized"]) assert is_valid_class( - klass, cls + klass := Serializable._name_type_map[header["type-serialized-name"]], + cls, ), f"Header-encoded {klass=} does not match decoding {cls=}." - return header, frames, klass class _BaseDtype(ExtensionDtype, Serializable): @@ -305,13 +297,14 @@ def construct_from_string(self): def serialize(self): header = {} - header["type-serialized"] = pickle.dumps(type(self)) header["ordered"] = self.ordered frames = [] if self.categories is not None: - categories_header, categories_frames = self.categories.serialize() + categories_header, categories_frames = ( + self.categories.device_serialize() + ) header["categories"] = categories_header frames.extend(categories_frames) header["frame_count"] = len(frames) @@ -319,15 +312,14 @@ def serialize(self): @classmethod def deserialize(cls, header, frames): - header, frames, klass = _decode_type(cls, header, frames) + _check_type(cls, header, frames) ordered = header["ordered"] categories_header = header["categories"] categories_frames = frames - categories_type = pickle.loads(categories_header["type-serialized"]) - categories = categories_type.deserialize( + categories = Serializable.device_deserialize( categories_header, categories_frames ) - return klass(categories=categories, ordered=ordered) + return cls(categories=categories, ordered=ordered) def __repr__(self): return self.to_pandas().__repr__() @@ -495,12 +487,13 @@ def __hash__(self): def serialize(self) -> tuple[dict, list]: header: dict[str, Dtype] = {} - header["type-serialized"] = pickle.dumps(type(self)) frames = [] if isinstance(self.element_type, _BaseDtype): - header["element-type"], frames = self.element_type.serialize() + header["element-type"], frames = ( + self.element_type.device_serialize() + ) else: header["element-type"] = getattr( self.element_type, "name", self.element_type @@ -510,14 +503,14 @@ def serialize(self) -> tuple[dict, list]: @classmethod def deserialize(cls, header: dict, frames: 
list): - header, frames, klass = _decode_type(cls, header, frames) + _check_type(cls, header, frames) if isinstance(header["element-type"], dict): - element_type = pickle.loads( - header["element-type"]["type-serialized"] - ).deserialize(header["element-type"], frames) + element_type = Serializable.device_deserialize( + header["element-type"], frames + ) else: element_type = header["element-type"] - return klass(element_type=element_type) + return cls(element_type=element_type) @cached_property def itemsize(self): @@ -641,7 +634,6 @@ def __hash__(self): def serialize(self) -> tuple[dict, list]: header: dict[str, Any] = {} - header["type-serialized"] = pickle.dumps(type(self)) frames: list[Buffer] = [] @@ -649,33 +641,31 @@ def serialize(self) -> tuple[dict, list]: for k, dtype in self.fields.items(): if isinstance(dtype, _BaseDtype): - dtype_header, dtype_frames = dtype.serialize() + dtype_header, dtype_frames = dtype.device_serialize() fields[k] = ( dtype_header, (len(frames), len(frames) + len(dtype_frames)), ) frames.extend(dtype_frames) else: - fields[k] = pickle.dumps(dtype) + fields[k] = dtype.str header["fields"] = fields header["frame_count"] = len(frames) return header, frames @classmethod def deserialize(cls, header: dict, frames: list): - header, frames, klass = _decode_type(cls, header, frames) + _check_type(cls, header, frames) fields = {} for k, dtype in header["fields"].items(): if isinstance(dtype, tuple): dtype_header, (start, stop) = dtype - fields[k] = pickle.loads( - dtype_header["type-serialized"] - ).deserialize( + fields[k] = Serializable.device_deserialize( dtype_header, frames[start:stop], ) else: - fields[k] = pickle.loads(dtype) + fields[k] = np.dtype(dtype) return cls(fields) @cached_property @@ -838,7 +828,6 @@ def _from_decimal(cls, decimal): def serialize(self) -> tuple[dict, list]: return ( { - "type-serialized": pickle.dumps(type(self)), "precision": self.precision, "scale": self.scale, "frame_count": 0, @@ -848,11 +837,8 @@ def serialize(self) -> tuple[dict, list]: @classmethod def deserialize(cls, header: dict, frames: list): - header, frames, klass = _decode_type( - cls, header, frames, is_valid_class=issubclass - ) - klass = pickle.loads(header["type-serialized"]) - return klass(header["precision"], header["scale"]) + _check_type(cls, header, frames, is_valid_class=issubclass) + return cls(header["precision"], header["scale"]) def __eq__(self, other: Dtype) -> bool: if other is self: @@ -960,18 +946,17 @@ def __hash__(self): def serialize(self) -> tuple[dict, list]: header = { - "type-serialized": pickle.dumps(type(self)), - "fields": pickle.dumps((self.subtype, self.closed)), + "fields": (self.subtype.str, self.closed), "frame_count": 0, } return header, [] @classmethod def deserialize(cls, header: dict, frames: list): - header, frames, klass = _decode_type(cls, header, frames) - klass = pickle.loads(header["type-serialized"]) - subtype, closed = pickle.loads(header["fields"]) - return klass(subtype, closed=closed) + _check_type(cls, header, frames) + subtype, closed = header["fields"] + subtype = np.dtype(subtype) + return cls(subtype, closed=closed) def _is_categorical_dtype(obj): diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 30868924bcd..f7af374ca8d 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -3,7 +3,6 @@ from __future__ import annotations import operator -import pickle import warnings from collections import abc from typing import TYPE_CHECKING, Any, Literal @@ -22,6 +21,7 
@@ from cudf import _lib as libcudf from cudf.api.types import is_dtype_equal, is_scalar from cudf.core._compat import PANDAS_LT_300 +from cudf.core.abc import Serializable from cudf.core.buffer import acquire_spill_lock from cudf.core.column import ( ColumnBase, @@ -45,7 +45,7 @@ # TODO: It looks like Frame is missing a declaration of `copy`, need to add -class Frame(BinaryOperand, Scannable): +class Frame(BinaryOperand, Scannable, Serializable): """A collection of Column objects with an optional index. Parameters @@ -95,37 +95,80 @@ def ndim(self) -> int: @_performance_tracking def serialize(self): # TODO: See if self._data can be serialized outright + frames = [] header = { - "type-serialized": pickle.dumps(type(self)), - "column_names": pickle.dumps(self._column_names), - "column_rangeindex": pickle.dumps(self._data.rangeindex), - "column_multiindex": pickle.dumps(self._data.multiindex), - "column_label_dtype": pickle.dumps(self._data.label_dtype), - "column_level_names": pickle.dumps(self._data._level_names), + "column_label_dtype": None, + "dtype-is-cudf-serialized": False, } - header["columns"], frames = serialize_columns(self._columns) + if (label_dtype := self._data.label_dtype) is not None: + try: + header["column_label_dtype"], frames = ( + label_dtype.device_serialize() + ) + header["dtype-is-cudf-serialized"] = True + except AttributeError: + header["column_label_dtype"] = label_dtype.str + + header["columns"], column_frames = serialize_columns(self._columns) + column_names, column_names_numpy_type = ( + zip( + *[ + (cname.item(), type(cname).__name__) + if isinstance(cname, np.generic) + else (cname, "") + for cname in self._column_names + ] + ) + if self._column_names + else ((), ()) + ) + header |= { + "column_names": column_names, + "column_names_numpy_type": column_names_numpy_type, + "column_rangeindex": self._data.rangeindex, + "column_multiindex": self._data.multiindex, + "column_level_names": self._data._level_names, + } + frames.extend(column_frames) + return header, frames @classmethod @_performance_tracking def deserialize(cls, header, frames): - cls_deserialize = pickle.loads(header["type-serialized"]) - column_names = pickle.loads(header["column_names"]) - columns = deserialize_columns(header["columns"], frames) kwargs = {} + dtype_header = header["column_label_dtype"] + if header["dtype-is-cudf-serialized"]: + count = dtype_header["frame_count"] + kwargs["label_dtype"] = cls.device_deserialize( + header, frames[:count] + ) + frames = frames[count:] + else: + kwargs["label_dtype"] = ( + np.dtype(dtype_header) if dtype_header is not None else None + ) + + columns = deserialize_columns(header["columns"], frames) for metadata in [ "rangeindex", "multiindex", - "label_dtype", "level_names", ]: key = f"column_{metadata}" if key in header: - kwargs[metadata] = pickle.loads(header[key]) + kwargs[metadata] = header[key] + + column_names = [ + getattr(np, cntype)(cname) if cntype != "" else cname + for cname, cntype in zip( + header["column_names"], header["column_names_numpy_type"] + ) + ] col_accessor = ColumnAccessor( data=dict(zip(column_names, columns)), **kwargs ) - return cls_deserialize._from_data(col_accessor) + return cls._from_data(col_accessor) @classmethod @_performance_tracking diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index e59b948aba9..a7ced1b833a 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -3,7 +3,6 @@ import copy import itertools -import 
pickle import textwrap import warnings from collections import abc @@ -1265,7 +1264,7 @@ def serialize(self): obj_header, obj_frames = self.obj.serialize() header["obj"] = obj_header - header["obj_type"] = pickle.dumps(type(self.obj)) + header["obj_type_name"] = type(self.obj).__name__ header["num_obj_frames"] = len(obj_frames) frames.extend(obj_frames) @@ -1280,7 +1279,7 @@ def serialize(self): def deserialize(cls, header, frames): kwargs = header["kwargs"] - obj_type = pickle.loads(header["obj_type"]) + obj_type = Serializable._name_type_map[header["obj_type_name"]] obj = obj_type.deserialize( header["obj"], frames[: header["num_obj_frames"]] ) @@ -3304,8 +3303,8 @@ def _handle_misc(self, by): def serialize(self): header = {} frames = [] - header["names"] = pickle.dumps(self.names) - header["_named_columns"] = pickle.dumps(self._named_columns) + header["names"] = self.names + header["_named_columns"] = self._named_columns column_header, column_frames = cudf.core.column.serialize_columns( self._key_columns ) @@ -3315,8 +3314,8 @@ def serialize(self): @classmethod def deserialize(cls, header, frames): - names = pickle.loads(header["names"]) - _named_columns = pickle.loads(header["_named_columns"]) + names = header["names"] + _named_columns = header["_named_columns"] key_columns = cudf.core.column.deserialize_columns( header["columns"], frames ) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 1b90e9f9df0..244bd877c1a 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -3,7 +3,6 @@ from __future__ import annotations import operator -import pickle import warnings from collections.abc import Hashable, MutableMapping from functools import cache, cached_property @@ -495,9 +494,8 @@ def serialize(self): header["index_column"]["step"] = self.step frames = [] - header["name"] = pickle.dumps(self.name) - header["dtype"] = pickle.dumps(self.dtype) - header["type-serialized"] = pickle.dumps(type(self)) + header["name"] = self.name + header["dtype"] = self.dtype.str header["frame_count"] = 0 return header, frames @@ -505,11 +503,14 @@ def serialize(self): @_performance_tracking def deserialize(cls, header, frames): h = header["index_column"] - name = pickle.loads(header["name"]) + name = header["name"] start = h["start"] stop = h["stop"] step = h.get("step", 1) - return RangeIndex(start=start, stop=stop, step=step, name=name) + dtype = np.dtype(header["dtype"]) + return RangeIndex( + start=start, stop=stop, step=step, dtype=dtype, name=name + ) @property # type: ignore @_performance_tracking diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index bfff62f0a89..a878b072860 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -5,7 +5,6 @@ import itertools import numbers import operator -import pickle import warnings from functools import cached_property from typing import TYPE_CHECKING, Any @@ -918,15 +917,15 @@ def take(self, indices) -> Self: def serialize(self): header, frames = super().serialize() # Overwrite the names in _data with the true names. - header["column_names"] = pickle.dumps(self.names) + header["column_names"] = self.names return header, frames @classmethod @_performance_tracking def deserialize(cls, header, frames): # Spoof the column names to construct the frame, then set manually. 
- column_names = pickle.loads(header["column_names"]) - header["column_names"] = pickle.dumps(range(0, len(column_names))) + column_names = header["column_names"] + header["column_names"] = range(0, len(column_names)) obj = super().deserialize(header, frames) return obj._set_names(column_names) diff --git a/python/cudf/cudf/core/resample.py b/python/cudf/cudf/core/resample.py index d95d252559f..391ee31f125 100644 --- a/python/cudf/cudf/core/resample.py +++ b/python/cudf/cudf/core/resample.py @@ -15,7 +15,6 @@ # limitations under the License. from __future__ import annotations -import pickle import warnings from typing import TYPE_CHECKING @@ -26,6 +25,7 @@ import cudf from cudf._lib.column import Column +from cudf.core.abc import Serializable from cudf.core.buffer import acquire_spill_lock from cudf.core.groupby.groupby import ( DataFrameGroupBy, @@ -97,21 +97,21 @@ def serialize(self): header, frames = super().serialize() grouping_head, grouping_frames = self.grouping.serialize() header["grouping"] = grouping_head - header["resampler_type"] = pickle.dumps(type(self)) + header["resampler_type"] = type(self).__name__ header["grouping_frames_count"] = len(grouping_frames) frames.extend(grouping_frames) return header, frames @classmethod def deserialize(cls, header, frames): - obj_type = pickle.loads(header["obj_type"]) + obj_type = Serializable._name_type_map[header["obj_type_name"]] obj = obj_type.deserialize( header["obj"], frames[: header["num_obj_frames"]] ) grouping = _ResampleGrouping.deserialize( header["grouping"], frames[header["num_obj_frames"] :] ) - resampler_cls = pickle.loads(header["resampler_type"]) + resampler_cls = Serializable._name_type_map[header["resampler_type"]] out = resampler_cls.__new__(resampler_cls) out.grouping = grouping super().__init__(out, obj, by=grouping) @@ -163,8 +163,8 @@ def serialize(self): @classmethod def deserialize(cls, header, frames): - names = pickle.loads(header["names"]) - _named_columns = pickle.loads(header["_named_columns"]) + names = header["names"] + _named_columns = header["_named_columns"] key_columns = cudf.core.column.deserialize_columns( header["columns"], frames[: -header["__bin_labels_count"]] ) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 9b60424c924..778db5973bf 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -4,7 +4,6 @@ import functools import inspect -import pickle import textwrap import warnings from collections import abc @@ -28,7 +27,6 @@ ) from cudf.core import indexing_utils from cudf.core._compat import PANDAS_LT_300 -from cudf.core.abc import Serializable from cudf.core.buffer import acquire_spill_lock from cudf.core.column import ( ColumnBase, @@ -415,7 +413,7 @@ def _loc_to_iloc(self, arg): return indices -class Series(SingleColumnFrame, IndexedFrame, Serializable): +class Series(SingleColumnFrame, IndexedFrame): """ One-dimensional GPU array (including time series). @@ -900,7 +898,7 @@ def hasnans(self): def serialize(self): header, frames = super().serialize() - header["index"], index_frames = self.index.serialize() + header["index"], index_frames = self.index.device_serialize() header["index_frame_count"] = len(index_frames) # For backwards compatibility with older versions of cuDF, index # columns are placed before data columns. 
@@ -916,8 +914,7 @@ def deserialize(cls, header, frames): header, frames[header["index_frame_count"] :] ) - idx_typ = pickle.loads(header["index"]["type-serialized"]) - index = idx_typ.deserialize(header["index"], frames[:index_nframes]) + index = cls.device_deserialize(header["index"], frames[:index_nframes]) obj.index = index return obj diff --git a/python/cudf/cudf/tests/data/pkl/stringColumnWithRangeIndex_cudf_23.12.pkl b/python/cudf/cudf/tests/data/pkl/stringColumnWithRangeIndex_cudf_23.12.pkl index 1ec077d10f77f4b3b3a8cc1bbfee559707b60dfc..64e06f0631d1475e9b6ab62434227253ad95a28b 100644 GIT binary patch literal 1108 zcmbVLO>fgc5RDs$(AI*6hP3J*AeTf!;=qAJL=aLjB%+)+AZuf9vKEdVt#=7QMdA{g zNPFw>dtqibse&jRuw-kz``(-HZ~l*SzhWPAccjxyrFjXaKH-WfCE*&(ajcVZH!dXa zCQPxhWK#}i{{`AFt&Nx?QIsl5c*$kTvh)jw?{EQMp=}<-MW&~Dl(7-dqC_ob90ump z8lAN4ka*{YmcZK79iz1Lnq!!~%OU)e@tDgYLBJf^ zWTLpxxq{F$&D%+L90|+f0%q_5R?O5ho==o0@h=RRHvW{AA1MTJESlnB=!up%5vPO| zXNc(`=AhIg!CAs3(Fl9bRG+0!Kpd?_6cX4u;!%A{ehlhxnq{~ZHZyW6~dW# z6#59Qik1o9DVgSz9b9|0T5*c19R^`9Y;wH>6Kr}#rNSZb7~tW_?qQVc>1+|%E}9Bm zH#XOjHZvwATVUBD$>Sm~mDI7Kjj35Aj!T|6$Tg<0guXX|D_o0q=!L-& z#2d4jZlvt#$FN?x+p6&{$?vP5_}EWaQ7~Hf1Ca`zWyQRZSps+@UW*|qi?>_d9{#*v e_j`4>)BpXwUA^hlnLQ{BD$Hjox5;g9SelgkgE>*?OI-{1 zZD)V4o_n)w-Ry8`Aj!$iz27RrL8m8S8T?_E*h)^vM$^<5Wd#d>!agEn4JoBtLvvENwKjjI@<8*+YK3T)J z{*XNaS0Y0J{5?e2DiA8E8jg<6f1(6SA78<2dW^K2!LlxYgHEUO;U`}95$sCnA=rHj z)`~^TgfOqmOgl#2Yr@||=T8ggbK&y3FumRxbeu-?i%pMfO%&D?rCq5ANjQpJ(vFbX z2m;3#6b&Q0DO>QJvD5M3CR{HS(qgU)`l^LVcvK{zc9DcUZoRrs(gA&MNtn7uUL~dP z=2R|KN=aAq`Xrd(=5#uxx|+~*AeqT{GjM};(4czdkjIjeUP4VajzQr+y>9y=pB6)f z(}ZwNuut4Br(u?2o2gKmsZumhHI4EuC#c>8{BjTS9x4a!1lSI%o8D5J^Xb3ZTPFQ8 z-(@kQNs=9AJc$68*f!fWnCx|d*v5}{GrwI7k2AIY`n4Fnk)t;Z+!Ef#i+gsP6V%K^ F?-%(Z>0$r? diff --git a/python/cudf/cudf/tests/test_serialize.py b/python/cudf/cudf/tests/test_serialize.py index 68f2aaf9cab..b50ed04427f 100644 --- a/python/cudf/cudf/tests/test_serialize.py +++ b/python/cudf/cudf/tests/test_serialize.py @@ -7,6 +7,7 @@ import numpy as np import pandas as pd import pytest +from packaging import version import cudf from cudf.testing import _utils as utils, assert_eq @@ -149,13 +150,19 @@ def test_serialize(df, to_host): def test_serialize_dtype_error_checking(): dtype = cudf.IntervalDtype("float", "right") - header, frames = dtype.serialize() - with pytest.raises(AssertionError): - # Invalid number of frames - type(dtype).deserialize(header, [None] * (header["frame_count"] + 1)) + # Must call device_serialize (not serialize) to ensure that the type metadata is + # encoded in the header. 
+ header, frames = dtype.device_serialize() with pytest.raises(AssertionError): # mismatching class cudf.StructDtype.deserialize(header, frames) + # The is-cuda flag list length must match the number of frames + header["is-cuda"] = [False] + with pytest.raises(AssertionError): + # Invalid number of frames + type(dtype).deserialize( + header, [np.zeros(1)] * (header["frame_count"] + 1) + ) def test_serialize_dataframe(): @@ -382,6 +389,10 @@ def test_serialize_string_check_buffer_sizes(): assert expect == got +@pytest.mark.skipif( + version.parse(np.__version__) < version.parse("2.0.0"), + reason="The serialization of numpy 2.0 types is incompatible with numpy 1.x", +) def test_deserialize_cudf_23_12(datadir): fname = datadir / "pkl" / "stringColumnWithRangeIndex_cudf_23.12.pkl" diff --git a/python/cudf/cudf/tests/test_struct.py b/python/cudf/cudf/tests/test_struct.py index 899d78c999b..b85943626a6 100644 --- a/python/cudf/cudf/tests/test_struct.py +++ b/python/cudf/cudf/tests/test_struct.py @@ -79,7 +79,7 @@ def test_series_construction_with_nulls(): ) def test_serialize_struct_dtype(fields): dtype = cudf.StructDtype(fields) - recreated = dtype.__class__.deserialize(*dtype.serialize()) + recreated = dtype.__class__.device_deserialize(*dtype.device_serialize()) assert recreated == dtype diff --git a/python/dask_cudf/dask_cudf/tests/test_distributed.py b/python/dask_cudf/dask_cudf/tests/test_distributed.py index d03180852eb..c28b7e49207 100644 --- a/python/dask_cudf/dask_cudf/tests/test_distributed.py +++ b/python/dask_cudf/dask_cudf/tests/test_distributed.py @@ -4,7 +4,7 @@ import pytest import dask -from dask import dataframe as dd +from dask import array as da, dataframe as dd from dask.distributed import Client from distributed.utils_test import cleanup, loop, loop_in_thread # noqa: F401 @@ -121,3 +121,17 @@ def test_unique(): ddf.x.unique().compute(), check_index=False, ) + + +def test_serialization_of_numpy_types(): + # Dask uses numpy integers as column names, which can break cudf serialization + with dask_cuda.LocalCUDACluster(n_workers=1) as cluster: + with Client(cluster): + with dask.config.set( + {"dataframe.backend": "cudf", "array.backend": "cupy"} + ): + rng = da.random.default_rng() + X_arr = rng.random((100, 10), chunks=(50, 10)) + X = dd.from_dask_array(X_arr) + X = X[X.columns[0]] + X.compute() From 5306eca611c7926fa59c581351c3cf7f0abf464d Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 10 Dec 2024 09:50:48 -0800 Subject: [PATCH 54/78] Use rapids-logger to generate the cudf logger (#17307) This PR replaces cudf's logger implementation with one generated using https://github.com/rapidsai/rapids-logger. This approach allows us to centralize the logger definition across different RAPIDS projects while allowing each project to vendor its own copy with a suitable set of macros and default logger objects. The common logger also takes care of handling the more complex packaging problems around ensuring that we fully isolate our spdlog dependency and do not leak any of its symbols, allowing our libraries to be safely installed in a much broader set of environments. Contributes to https://github.com/rapidsai/build-planning/issues/104. 
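As a rough usage sketch of what this change means for code that configures libcudf logging: the macro surface stays the same while the runtime level is set on the generated default logger. The header path and the `level_enum` spelling below are taken from the developer-guide text in this patch and should be treated as assumptions rather than confirmed API:

```cpp
// Hypothetical example; the generated header name is an assumption.
#include <cudf/logger.hpp>

void quiet_cudf_logging()
{
  // Runtime filter: exclude anything below error severity.
  cudf::default_logger().set_level(cudf::level_enum::err);
}

void report_failure()
{
  // Same macro names as before; calls below the compile-time
  // CUDF_LOG_ACTIVE_LEVEL are excluded at build time.
  CUDF_LOG_ERROR("an unrecoverable error occurred");
}
```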
Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Nghia Truong (https://github.com/ttnghia) - James Lamb (https://github.com/jameslamb) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/17307 --- .../all_cuda-118_arch-x86_64.yaml | 1 - .../all_cuda-125_arch-x86_64.yaml | 1 - conda/recipes/libcudf/conda_build_config.yaml | 3 - conda/recipes/libcudf/meta.yaml | 1 - cpp/CMakeLists.txt | 22 ++--- cpp/benchmarks/io/cuio_common.cpp | 2 +- cpp/cmake/thirdparty/get_spdlog.cmake | 27 ------ .../developer_guide/DEVELOPER_GUIDE.md | 6 +- cpp/include/cudf/detail/utilities/logger.hpp | 27 ------ cpp/include/cudf/utilities/logger.hpp | 54 ------------ cpp/src/io/comp/nvcomp_adapter.cpp | 2 +- cpp/src/io/csv/reader_impl.cu | 2 +- cpp/src/io/orc/reader_impl_chunking.cu | 2 +- cpp/src/io/orc/stripe_enc.cu | 2 +- cpp/src/io/orc/writer_impl.cu | 2 +- cpp/src/io/parquet/reader_impl_helpers.cpp | 2 +- cpp/src/io/parquet/writer_impl.cu | 2 +- cpp/src/io/utilities/base64_utilities.cpp | 2 +- cpp/src/io/utilities/data_sink.cpp | 2 +- cpp/src/io/utilities/datasource.cpp | 2 +- cpp/src/io/utilities/file_io_utilities.cpp | 3 +- cpp/src/io/utilities/getenv_or.hpp | 2 +- cpp/src/utilities/host_memory.cpp | 2 +- cpp/src/utilities/logger.cpp | 83 ------------------- cpp/src/utilities/stream_pool.cpp | 2 +- cpp/tests/utilities_tests/logger_tests.cpp | 48 +++++------ dependencies.yaml | 1 - 27 files changed, 53 insertions(+), 252 deletions(-) delete mode 100644 cpp/cmake/thirdparty/get_spdlog.cmake delete mode 100644 cpp/include/cudf/detail/utilities/logger.hpp delete mode 100644 cpp/include/cudf/utilities/logger.hpp delete mode 100644 cpp/src/utilities/logger.cpp diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index bad508154aa..33fc2f651c6 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -87,7 +87,6 @@ dependencies: - s3fs>=2022.3.0 - scikit-build-core>=0.10.0 - scipy -- spdlog>=1.14.1,<1.15 - sphinx - sphinx-autobuild - sphinx-copybutton diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index 969124a29ad..c290a83a37f 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -86,7 +86,6 @@ dependencies: - s3fs>=2022.3.0 - scikit-build-core>=0.10.0 - scipy -- spdlog>=1.14.1,<1.15 - sphinx - sphinx-autobuild - sphinx-copybutton diff --git a/conda/recipes/libcudf/conda_build_config.yaml b/conda/recipes/libcudf/conda_build_config.yaml index c78ca326005..00020fdf6b8 100644 --- a/conda/recipes/libcudf/conda_build_config.yaml +++ b/conda/recipes/libcudf/conda_build_config.yaml @@ -31,9 +31,6 @@ fmt_version: flatbuffers_version: - "=24.3.25" -spdlog_version: - - ">=1.14.1,<1.15" - nvcomp_version: - "=4.1.0.6" diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 1c2e9e8dd98..b585aafc397 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -68,7 +68,6 @@ requirements: - librdkafka {{ librdkafka_version }} - fmt {{ fmt_version }} - flatbuffers {{ flatbuffers_version }} - - spdlog {{ spdlog_version }} - zlib {{ zlib_version }} outputs: diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index e54c71de4fa..3d77307ccde 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -273,6 +273,14 @@ endif() # add third party dependencies using CPM 
rapids_cpm_init() + +# Not using rapids-cmake since we never want to find, always download. +CPMAddPackage( + NAME rapids_logger GITHUB_REPOSITORY rapidsai/rapids-logger GIT_SHALLOW TRUE GIT_TAG + 14bb233d2420f7187a690f0bb528ec0420c70d48 +) +rapids_make_logger(cudf EXPORT_SET cudf-exports) + # find jitify include(cmake/thirdparty/get_jitify.cmake) # find NVTX @@ -299,8 +307,6 @@ include(cmake/Modules/JitifyPreprocessKernels.cmake) include(cmake/thirdparty/get_kvikio.cmake) # find fmt include(cmake/thirdparty/get_fmt.cmake) -# find spdlog -include(cmake/thirdparty/get_spdlog.cmake) # find nanoarrow include(cmake/thirdparty/get_nanoarrow.cmake) # find thread_pool @@ -772,7 +778,6 @@ add_library( src/utilities/default_stream.cpp src/utilities/host_memory.cpp src/utilities/linked_column.cpp - src/utilities/logger.cpp src/utilities/prefetch.cpp src/utilities/stacktrace.cpp src/utilities/stream_pool.cpp @@ -910,11 +915,8 @@ if(CUDF_LARGE_STRINGS_DISABLED) target_compile_definitions(cudf PRIVATE CUDF_LARGE_STRINGS_DISABLED) endif() -# Define RMM logging level -target_compile_definitions(cudf PRIVATE "RMM_LOGGING_LEVEL=LIBCUDF_LOGGING_LEVEL") - -# Define spdlog level -target_compile_definitions(cudf PUBLIC "SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_${LIBCUDF_LOGGING_LEVEL}") +# Define logging level +target_compile_definitions(cudf PRIVATE "CUDF_LOG_ACTIVE_LEVEL=${LIBCUDF_LOGGING_LEVEL}") # Enable remote IO through KvikIO target_compile_definitions(cudf PRIVATE $<$:CUDF_KVIKIO_REMOTE_IO>) @@ -938,8 +940,7 @@ add_dependencies(cudf jitify_preprocess_run) # Specify the target module library dependencies target_link_libraries( cudf - PUBLIC CCCL::CCCL rmm::rmm rmm::rmm_logger $ - spdlog::spdlog_header_only + PUBLIC CCCL::CCCL rmm::rmm rmm::rmm_logger $ cudf_logger PRIVATE $ cuco::cuco ZLIB::ZLIB @@ -948,6 +949,7 @@ target_link_libraries( $ nanoarrow rmm::rmm_logger_impl + cudf_logger_impl ) # Add Conda library, and include paths if specified diff --git a/cpp/benchmarks/io/cuio_common.cpp b/cpp/benchmarks/io/cuio_common.cpp index 45b46005c47..38a21961735 100644 --- a/cpp/benchmarks/io/cuio_common.cpp +++ b/cpp/benchmarks/io/cuio_common.cpp @@ -17,7 +17,7 @@ #include #include -#include +#include #include #include diff --git a/cpp/cmake/thirdparty/get_spdlog.cmake b/cpp/cmake/thirdparty/get_spdlog.cmake deleted file mode 100644 index 90b0f4d8a8e..00000000000 --- a/cpp/cmake/thirdparty/get_spdlog.cmake +++ /dev/null @@ -1,27 +0,0 @@ -# ============================================================================= -# Copyright (c) 2023-2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. 
-# ============================================================================= - -# Use CPM to find or clone speedlog -function(find_and_configure_spdlog) - - include(${rapids-cmake-dir}/cpm/spdlog.cmake) - rapids_cpm_spdlog( - FMT_OPTION "EXTERNAL_FMT_HO" - INSTALL_EXPORT_SET cudf-exports - BUILD_EXPORT_SET cudf-exports - ) - -endfunction() - -find_and_configure_spdlog() diff --git a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md index 1c1052487f2..5032a073b58 100644 --- a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md +++ b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md @@ -1082,15 +1082,15 @@ initialization. If this setting is higher than the compile-time CMake variable, in between the two settings will be excluded from the written log. The available levels are the same as for the CMake variable. * Global logger object exposed via `cudf::logger()` - sets the minimum logging level at runtime. -For example, calling `cudf::logger().set_level(spdlog::level::err)`, will exclude any messages that +For example, calling `cudf::default_logger().set_level(level_enum::err)`, will exclude any messages that are not errors or critical errors. This API should not be used within libcudf to manipulate logging, its purpose is to allow upstream users to configure libcudf logging to fit their application. By default, logging messages are output to stderr. Setting the environment variable `LIBCUDF_DEBUG_LOG_FILE` redirects the log to a file with the specified path (can be relative to the current directory). -Upstream users can also manipulate `cudf::logger().sinks()` to add sinks or divert the log to -standard output or even a custom spdlog sink. +Upstream users can also manipulate `cudf::default_logger().sinks()` to add sinks or divert the log to +standard output. # Data Types diff --git a/cpp/include/cudf/detail/utilities/logger.hpp b/cpp/include/cudf/detail/utilities/logger.hpp deleted file mode 100644 index e7643eb44bd..00000000000 --- a/cpp/include/cudf/detail/utilities/logger.hpp +++ /dev/null @@ -1,27 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// Log messages that require computation should only be used at level TRACE and DEBUG -#define CUDF_LOG_TRACE(...) SPDLOG_LOGGER_TRACE(&cudf::detail::logger(), __VA_ARGS__) -#define CUDF_LOG_DEBUG(...) SPDLOG_LOGGER_DEBUG(&cudf::detail::logger(), __VA_ARGS__) -#define CUDF_LOG_INFO(...) SPDLOG_LOGGER_INFO(&cudf::detail::logger(), __VA_ARGS__) -#define CUDF_LOG_WARN(...) SPDLOG_LOGGER_WARN(&cudf::detail::logger(), __VA_ARGS__) -#define CUDF_LOG_ERROR(...) SPDLOG_LOGGER_ERROR(&cudf::detail::logger(), __VA_ARGS__) -#define CUDF_LOG_CRITICAL(...) 
SPDLOG_LOGGER_CRITICAL(&cudf::detail::logger(), __VA_ARGS__) diff --git a/cpp/include/cudf/utilities/logger.hpp b/cpp/include/cudf/utilities/logger.hpp deleted file mode 100644 index 982554a23f5..00000000000 --- a/cpp/include/cudf/utilities/logger.hpp +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -#include - -namespace CUDF_EXPORT cudf { - -namespace detail { -spdlog::logger& logger(); -} - -/** - * @brief Returns the global logger. - * - * This is a global instance of a spdlog logger. It can be used to configure logging behavior in - * libcudf. - * - * Examples: - * @code{.cpp} - * // Turn off logging at runtime - * cudf::logger().set_level(spdlog::level::off); - * // Add a stdout sink to the logger - * cudf::logger().sinks().push_back(std::make_shared()); - * // Replace the default sink - * cudf::logger().sinks() ={std::make_shared()}; - * @endcode - * - * Note: Changes to the sinks are not thread safe and should only be done during global - * initialization. - * - * @return spdlog::logger& The logger. - */ -[[deprecated( - "Support for direct access to spdlog loggers in cudf is planned for removal")]] spdlog::logger& -logger(); - -} // namespace CUDF_EXPORT cudf diff --git a/cpp/src/io/comp/nvcomp_adapter.cpp b/cpp/src/io/comp/nvcomp_adapter.cpp index 9d3cf75a13f..d45c02f374f 100644 --- a/cpp/src/io/comp/nvcomp_adapter.cpp +++ b/cpp/src/io/comp/nvcomp_adapter.cpp @@ -18,8 +18,8 @@ #include "nvcomp_adapter.cuh" -#include #include +#include #include #include diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index 6c84b53db46..7f0b5e07b09 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -28,13 +28,13 @@ #include "io/utilities/parsing_utils.cuh" #include -#include #include #include #include #include #include #include +#include #include #include #include diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index fcaee9c548e..726c79bd004 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -22,7 +22,7 @@ #include #include -#include +#include #include #include diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index ed0b6969154..07172b6b7f7 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -23,10 +23,10 @@ #include #include #include -#include #include #include #include +#include #include #include diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index 76e5369ffd0..0906017ee61 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -29,9 +29,9 @@ #include #include #include -#include #include #include +#include #include #include #include diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp index bfd0cc992cf..0dd1aff41e9 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.cpp +++ 
b/cpp/src/io/parquet/reader_impl_helpers.cpp @@ -23,7 +23,7 @@ #include "ipc/Message_generated.h" #include "ipc/Schema_generated.h" -#include +#include #include #include diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index f865c9a7643..188e6a8c0d8 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -38,10 +38,10 @@ #include #include #include -#include #include #include #include +#include #include #include diff --git a/cpp/src/io/utilities/base64_utilities.cpp b/cpp/src/io/utilities/base64_utilities.cpp index 2a2a07afc8d..00fc54f9883 100644 --- a/cpp/src/io/utilities/base64_utilities.cpp +++ b/cpp/src/io/utilities/base64_utilities.cpp @@ -60,7 +60,7 @@ #include "base64_utilities.hpp" -#include +#include #include diff --git a/cpp/src/io/utilities/data_sink.cpp b/cpp/src/io/utilities/data_sink.cpp index bed03869b34..dfa5d46cf48 100644 --- a/cpp/src/io/utilities/data_sink.cpp +++ b/cpp/src/io/utilities/data_sink.cpp @@ -16,9 +16,9 @@ #include "file_io_utilities.hpp" -#include #include #include +#include #include #include diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 62ef7c7a794..38dedcc2627 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -17,11 +17,11 @@ #include "file_io_utilities.hpp" #include "getenv_or.hpp" -#include #include #include #include #include +#include #include #include diff --git a/cpp/src/io/utilities/file_io_utilities.cpp b/cpp/src/io/utilities/file_io_utilities.cpp index 9b17e7f6d55..28367c95430 100644 --- a/cpp/src/io/utilities/file_io_utilities.cpp +++ b/cpp/src/io/utilities/file_io_utilities.cpp @@ -19,10 +19,11 @@ #include "getenv_or.hpp" #include -#include #include +#include #include +#include #include #include diff --git a/cpp/src/io/utilities/getenv_or.hpp b/cpp/src/io/utilities/getenv_or.hpp index 3fd97a00b61..b9613428418 100644 --- a/cpp/src/io/utilities/getenv_or.hpp +++ b/cpp/src/io/utilities/getenv_or.hpp @@ -16,7 +16,7 @@ #pragma once -#include +#include #include #include diff --git a/cpp/src/utilities/host_memory.cpp b/cpp/src/utilities/host_memory.cpp index e30806a5011..4196523d211 100644 --- a/cpp/src/utilities/host_memory.cpp +++ b/cpp/src/utilities/host_memory.cpp @@ -14,8 +14,8 @@ * limitations under the License. */ -#include #include +#include #include #include #include diff --git a/cpp/src/utilities/logger.cpp b/cpp/src/utilities/logger.cpp deleted file mode 100644 index e52fffbd8c6..00000000000 --- a/cpp/src/utilities/logger.cpp +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include - -#include -#include - -#include - -namespace { - -/** - * @brief Creates a sink for libcudf logging. - * - * Returns a file sink if the file name has been specified, otherwise returns a stderr sink. 
- */ -[[nodiscard]] spdlog::sink_ptr make_libcudf_sink() -{ - if (auto filename = std::getenv("LIBCUDF_DEBUG_LOG_FILE"); filename != nullptr) { - return std::make_shared(filename, true); - } else { - return std::make_shared(); - } -} - -/** - * @brief Converts the level name into the `spdlog` level enum. - */ -[[nodiscard]] spdlog::level::level_enum libcudf_log_level() -{ - auto const env_level = std::getenv("LIBCUDF_LOGGING_LEVEL"); - if (env_level == nullptr) { return spdlog::level::warn; } - - auto const env_lvl_str = std::string(env_level); - if (env_lvl_str == "TRACE") return spdlog::level::trace; - if (env_lvl_str == "DEBUG") return spdlog::level::debug; - if (env_lvl_str == "INFO") return spdlog::level::info; - if (env_lvl_str == "WARN") return spdlog::level::warn; - if (env_lvl_str == "ERROR") return spdlog::level::err; - if (env_lvl_str == "CRITICAL") return spdlog::level::critical; - if (env_lvl_str == "OFF") return spdlog::level::off; - - CUDF_FAIL("Invalid value for LIBCUDF_LOGGING_LEVEL environment variable"); -} - -/** - * @brief Simple wrapper around a spdlog::logger that performs cuDF-specific initialization. - */ -struct logger_wrapper { - spdlog::logger logger_; - - logger_wrapper() : logger_{"CUDF", make_libcudf_sink()} - { - logger_.set_pattern("[%6t][%H:%M:%S:%f][%-6l] %v"); - logger_.set_level(libcudf_log_level()); - logger_.flush_on(spdlog::level::warn); - } -}; - -} // namespace - -spdlog::logger& cudf::detail::logger() -{ - static logger_wrapper wrapped{}; - return wrapped.logger_; -} - -spdlog::logger& cudf::logger() { return cudf::detail::logger(); } diff --git a/cpp/src/utilities/stream_pool.cpp b/cpp/src/utilities/stream_pool.cpp index 9d1bebd1937..b0f2d8c0637 100644 --- a/cpp/src/utilities/stream_pool.cpp +++ b/cpp/src/utilities/stream_pool.cpp @@ -14,8 +14,8 @@ * limitations under the License. 
*/ -#include #include +#include #include #include diff --git a/cpp/tests/utilities_tests/logger_tests.cpp b/cpp/tests/utilities_tests/logger_tests.cpp index cfab570833b..58396115a54 100644 --- a/cpp/tests/utilities_tests/logger_tests.cpp +++ b/cpp/tests/utilities_tests/logger_tests.cpp @@ -16,29 +16,25 @@ #include -#include - -#include +#include #include class LoggerTest : public cudf::test::BaseFixture { std::ostringstream oss; - spdlog::level::level_enum prev_level; - std::vector prev_sinks; + cudf::level_enum prev_level; public: - LoggerTest() - : prev_level{cudf::detail::logger().level()}, prev_sinks{cudf::detail::logger().sinks()} + LoggerTest() : prev_level{cudf::default_logger().level()} { - cudf::detail::logger().sinks() = {std::make_shared(oss)}; - cudf::detail::logger().set_formatter( - std::unique_ptr(new spdlog::pattern_formatter("%v"))); + cudf::default_logger().sinks().push_back(std::make_shared(oss)); + cudf::default_logger().set_pattern("%v"); } ~LoggerTest() override { - cudf::detail::logger().set_level(prev_level); - cudf::detail::logger().sinks() = prev_sinks; + cudf::default_logger().set_pattern("[%6t][%H:%M:%S:%f][%-6l] %v"); + cudf::default_logger().set_level(prev_level); + cudf::default_logger().sinks().pop_back(); } void clear_sink() { oss.str(""); } @@ -47,32 +43,32 @@ class LoggerTest : public cudf::test::BaseFixture { TEST_F(LoggerTest, Basic) { - cudf::detail::logger().critical("crit msg"); + cudf::default_logger().critical("crit msg"); ASSERT_EQ(this->sink_content(), "crit msg\n"); } TEST_F(LoggerTest, DefaultLevel) { - cudf::detail::logger().trace("trace"); - cudf::detail::logger().debug("debug"); - cudf::detail::logger().info("info"); - cudf::detail::logger().warn("warn"); - cudf::detail::logger().error("error"); - cudf::detail::logger().critical("critical"); - ASSERT_EQ(this->sink_content(), "warn\nerror\ncritical\n"); + cudf::default_logger().trace("trace"); + cudf::default_logger().debug("debug"); + cudf::default_logger().info("info"); + cudf::default_logger().warn("warn"); + cudf::default_logger().error("error"); + cudf::default_logger().critical("critical"); + ASSERT_EQ(this->sink_content(), "info\nwarn\nerror\ncritical\n"); } TEST_F(LoggerTest, CustomLevel) { - cudf::detail::logger().set_level(spdlog::level::warn); - cudf::detail::logger().info("info"); - cudf::detail::logger().warn("warn"); + cudf::default_logger().set_level(cudf::level_enum::warn); + cudf::default_logger().info("info"); + cudf::default_logger().warn("warn"); ASSERT_EQ(this->sink_content(), "warn\n"); this->clear_sink(); - cudf::detail::logger().set_level(spdlog::level::debug); - cudf::detail::logger().trace("trace"); - cudf::detail::logger().debug("debug"); + cudf::default_logger().set_level(cudf::level_enum::debug); + cudf::default_logger().trace("trace"); + cudf::default_logger().debug("debug"); ASSERT_EQ(this->sink_content(), "debug\n"); } diff --git a/dependencies.yaml b/dependencies.yaml index 3c55ce2c614..44767f1e9d3 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -413,7 +413,6 @@ dependencies: - fmt>=11.0.2,<12 - flatbuffers==24.3.25 - librdkafka>=2.5.0,<2.6.0a0 - - spdlog>=1.14.1,<1.15 depends_on_nvcomp: common: - output_types: conda From 657f50bae866d97a231d565f34a1941efd49c721 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 10 Dec 2024 10:16:11 -0800 Subject: [PATCH 55/78] Fix typos, rename types, and add null_probability benchmark axis for distinct (#17546) This PR addresses several minor issues discovered while working on #17467: - Corrected a typo where 
`RowHasher` should have been `RowEqual` - Renamed `hash_set_type` to `distinct_set_t` - Added a `null_probability` benchmark axis for the distinct benchmark, similar to other stream compaction benchmarks Authors: - Yunsong Wang (https://github.com/PointKernel) Approvers: - Muhammad Haseeb (https://github.com/mhaseeb123) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/17546 --- cpp/benchmarks/stream_compaction/distinct.cpp | 4 +++- cpp/src/stream_compaction/distinct.cu | 4 ++-- cpp/src/stream_compaction/distinct_helpers.cu | 12 ++++++------ cpp/src/stream_compaction/distinct_helpers.hpp | 12 +++++++----- 4 files changed, 18 insertions(+), 14 deletions(-) diff --git a/cpp/benchmarks/stream_compaction/distinct.cpp b/cpp/benchmarks/stream_compaction/distinct.cpp index d7deebca89a..75d04bb4e8e 100644 --- a/cpp/benchmarks/stream_compaction/distinct.cpp +++ b/cpp/benchmarks/stream_compaction/distinct.cpp @@ -34,6 +34,7 @@ void nvbench_distinct(nvbench::state& state, nvbench::type_list) cudf::size_type const num_rows = state.get_int64("NumRows"); auto const keep = get_keep(state.get_string("keep")); cudf::size_type const cardinality = state.get_int64("cardinality"); + auto const null_probability = state.get_float64("null_probability"); if (cardinality > num_rows) { state.skip("cardinality > num_rows"); @@ -42,7 +43,7 @@ void nvbench_distinct(nvbench::state& state, nvbench::type_list) data_profile profile = data_profile_builder() .cardinality(cardinality) - .null_probability(0.01) + .null_probability(null_probability) .distribution(cudf::type_to_id(), distribution_id::UNIFORM, static_cast(0), @@ -65,6 +66,7 @@ using data_type = nvbench::type_list; NVBENCH_BENCH_TYPES(nvbench_distinct, NVBENCH_TYPE_AXES(data_type)) .set_name("distinct") .set_type_axes_names({"Type"}) + .add_float64_axis("null_probability", {0.01}) .add_string_axis("keep", {"any", "first", "last", "none"}) .add_int64_axis("cardinality", {100, 100'000, 10'000'000, 1'000'000'000}) .add_int64_axis("NumRows", {100, 100'000, 10'000'000, 1'000'000'000}); diff --git a/cpp/src/stream_compaction/distinct.cu b/cpp/src/stream_compaction/distinct.cu index 7d11b02d3e1..9ab8ed5938a 100644 --- a/cpp/src/stream_compaction/distinct.cu +++ b/cpp/src/stream_compaction/distinct.cu @@ -95,8 +95,8 @@ rmm::device_uvector distinct_indices(table_view const& input, auto const row_equal = cudf::experimental::row::equality::self_comparator(preprocessed_input); auto const helper_func = [&](auto const& d_equal) { - using RowHasher = std::decay_t; - auto set = hash_set_type{ + using RowEqual = std::decay_t; + auto set = distinct_set_t{ num_rows, 0.5, // desired load factor cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL}, diff --git a/cpp/src/stream_compaction/distinct_helpers.cu b/cpp/src/stream_compaction/distinct_helpers.cu index c3a004b7f28..aadb438b019 100644 --- a/cpp/src/stream_compaction/distinct_helpers.cu +++ b/cpp/src/stream_compaction/distinct_helpers.cu @@ -21,8 +21,8 @@ namespace cudf::detail { -template -rmm::device_uvector reduce_by_row(hash_set_type& set, +template +rmm::device_uvector reduce_by_row(distinct_set_t& set, size_type num_rows, duplicate_keep_option keep, rmm::cuda_stream_view stream, @@ -100,7 +100,7 @@ rmm::device_uvector reduce_by_row(hash_set_type& set, } template rmm::device_uvector reduce_by_row( - hash_set_type>& set, @@ -110,7 +110,7 @@ template rmm::device_uvector reduce_by_row( rmm::device_async_resource_ref mr); template rmm::device_uvector reduce_by_row( - hash_set_type>& 
set, @@ -120,7 +120,7 @@ template rmm::device_uvector reduce_by_row( rmm::device_async_resource_ref mr); template rmm::device_uvector reduce_by_row( - hash_set_type>& set, @@ -130,7 +130,7 @@ template rmm::device_uvector reduce_by_row( rmm::device_async_resource_ref mr); template rmm::device_uvector reduce_by_row( - hash_set_type>& set, diff --git a/cpp/src/stream_compaction/distinct_helpers.hpp b/cpp/src/stream_compaction/distinct_helpers.hpp index f15807c2434..4ca1cab937a 100644 --- a/cpp/src/stream_compaction/distinct_helpers.hpp +++ b/cpp/src/stream_compaction/distinct_helpers.hpp @@ -47,12 +47,12 @@ auto constexpr reduction_init_value(duplicate_keep_option keep) } } -template -using hash_set_type = +template +using distinct_set_t = cuco::static_set, cuda::thread_scope_device, - RowHasher, + RowEqual, cuco::linear_probing<1, cudf::experimental::row::hash::device_row_hasher< cudf::hashing::detail::default_hash, @@ -79,6 +79,8 @@ using hash_set_type = * the `reduction_init_value()` function. Then, the reduction result for each row group is written * into the output array at the index of an unspecified row in the group. * + * @tparam RowEqual The type of row equality comparator + * * @param set The auxiliary set to perform reduction * @param set_size The number of elements in set * @param num_rows The number of all input rows @@ -87,8 +89,8 @@ using hash_set_type = * @param mr Device memory resource used to allocate the returned vector * @return A device_uvector containing the output indices */ -template -rmm::device_uvector reduce_by_row(hash_set_type& set, +template +rmm::device_uvector reduce_by_row(distinct_set_t& set, size_type num_rows, duplicate_keep_option keep, rmm::cuda_stream_view stream, From be62ea60440a8357702eb292e19e69dd6be001e0 Mon Sep 17 00:00:00 2001 From: Jake Awe <50372925+AyodeAwe@users.noreply.github.com> Date: Tue, 10 Dec 2024 13:21:39 -0600 Subject: [PATCH 56/78] Update version references in workflow (#17568) Update version references in breaking-change trigger workflow --- .github/workflows/trigger-breaking-change-alert.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/trigger-breaking-change-alert.yaml b/.github/workflows/trigger-breaking-change-alert.yaml index 3b972f31ca4..01dd2436beb 100644 --- a/.github/workflows/trigger-breaking-change-alert.yaml +++ b/.github/workflows/trigger-breaking-change-alert.yaml @@ -12,7 +12,7 @@ jobs: trigger-notifier: if: contains(github.event.pull_request.labels.*.name, 'breaking') secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@branch-24.12 + uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@branch-25.02 with: sender_login: ${{ github.event.sender.login }} sender_avatar: ${{ github.event.sender.avatar_url }} From 1e95864f6631a1dc90d78fc9418281c256fa9f59 Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Tue, 10 Dec 2024 13:47:42 -0600 Subject: [PATCH 57/78] Fix Dask-cuDF `clip` APIs (#17509) Closes https://github.com/rapidsai/cudf/issues/17502 **Background Info**: The cudf and pandas `axis` defaults are different, and the upstream dask-expr `clip` APIs are consistent with the behavior of Pandas (not cudf). 
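In other words, cudf's `clip` defaults to `axis=1` (one bound per column, applied element-wise), while the inherited dask-expr methods follow the pandas default, so the overrides below pin `axis=1` and raise `NotImplementedError` for any other axis. A small usage sketch mirroring the new test:

```python
import cudf
import dask.dataframe as dd

df = cudf.DataFrame({"id": ["a", "b", "c", "d"], "score": [-1, 1, 4, 6]})
ddf = dd.from_pandas(df, npartitions=2)

# One (lower, upper) bound per column, applied along axis=1 as in cudf
result = ddf.clip(lower=["b", 1], upper=["d", 5], axis=1).compute()
```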
Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) - Matthew Murray (https://github.com/Matt711) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/17509 --- .../dask_cudf/dask_cudf/_expr/collection.py | 10 +++++++ python/dask_cudf/dask_cudf/tests/test_core.py | 26 +++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/python/dask_cudf/dask_cudf/_expr/collection.py b/python/dask_cudf/dask_cudf/_expr/collection.py index 2dc4031b876..5192e6b8171 100644 --- a/python/dask_cudf/dask_cudf/_expr/collection.py +++ b/python/dask_cudf/dask_cudf/_expr/collection.py @@ -163,6 +163,11 @@ def read_text(*args, **kwargs): return legacy_read_text(*args, **kwargs) + def clip(self, lower=None, upper=None, axis=1): + if axis not in (None, 1): + raise NotImplementedError("axis not yet supported in clip.") + return new_collection(self.expr.clip(lower, upper, 1)) + class Series(DXSeries, CudfFrameBase): def groupby(self, by, **kwargs): @@ -182,6 +187,11 @@ def struct(self): return StructMethods(self) + def clip(self, lower=None, upper=None, axis=1): + if axis not in (None, 1): + raise NotImplementedError("axis not yet supported in clip.") + return new_collection(self.expr.clip(lower, upper, 1)) + class Index(DXIndex, CudfFrameBase): pass # Same as pandas (for now) diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py index cda7e2d134d..7101fb7e00a 100644 --- a/python/dask_cudf/dask_cudf/tests/test_core.py +++ b/python/dask_cudf/dask_cudf/tests/test_core.py @@ -1019,3 +1019,29 @@ def test_rename_axis_after_join(): result = ddf1.join(ddf2, how="outer") expected = df1.join(df2, how="outer") dd.assert_eq(result, expected, check_index=False) + + +def test_clip_dataframe(): + df = cudf.DataFrame( + { + "id": ["a", "b", "c", "d"], + "score": [-1, 1, 4, 6], + } + ) + expect = df.clip(lower=["b", 1], upper=["d", 5], axis=1) + got = dd.from_pandas(df, npartitions=2).clip( + lower=["b", 1], upper=["d", 5], axis=1 + ) + dd.assert_eq(expect, got) + + +def test_clip_series(): + ser = cudf.Series([-0.5, 0.5, 4.5, 5.5]) + expect = ser.clip(lower=0, upper=5).round().astype(int) + got = ( + dd.from_pandas(ser, npartitions=2) + .clip(lower=0, upper=5) + .round() + .astype(int) + ) + dd.assert_eq(expect, got) From 0c5bd6627159fe44a49e56020f0c0842696bc397 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 10 Dec 2024 17:10:45 -0500 Subject: [PATCH 58/78] Rework minhash APIs for deprecation cycle (#17421) Renames `minhash_permuted()` to `minhash()` and deprecates `minhash_permuted` Also removes the `word_minhash` APIs deprecated in 24.12. 
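A minimal sketch of the surviving C++ entry point, mirroring the updated tests below (the deprecated `minhash_permuted()` spelling now calls the same implementation):

```cpp
#include <nvtext/minhash.hpp>

#include <cudf_test/column_wrapper.hpp>

#include <cudf/column/column_view.hpp>
#include <cudf/strings/strings_column_view.hpp>

void minhash_example()
{
  // example input strings (placeholder values for illustration)
  auto const input = cudf::test::strings_column_wrapper({"doc one", "doc two"});
  auto const view  = cudf::strings_column_view(input);

  // one (a, b) permutation parameter pair per output hash value
  auto const params = cudf::test::fixed_width_column_wrapper<uint32_t>({10, 11, 12});

  // returns a LIST column: one row per input string, one UINT32 minhash per
  // parameter pair, computed over 4-character substrings
  auto const result = nvtext::minhash(
    view, 0 /*seed*/, cudf::column_view(params), cudf::column_view(params), 4 /*width*/);
}
```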
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Matthew Murray (https://github.com/Matt711) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/17421 --- cpp/benchmarks/text/minhash.cpp | 5 +- cpp/include/nvtext/minhash.hpp | 194 +--------- cpp/src/text/minhash.cu | 341 +----------------- cpp/tests/text/minhash_tests.cpp | 79 ++-- docs/cudf/source/conf.py | 2 + python/cudf/cudf/_lib/nvtext/minhash.pyx | 46 +-- python/cudf/cudf/_lib/strings/__init__.py | 4 - python/cudf/cudf/core/column/string.py | 166 +-------- .../cudf/cudf/tests/text/test_text_methods.py | 72 +--- .../pylibcudf/libcudf/nvtext/minhash.pxd | 34 -- python/pylibcudf/pylibcudf/nvtext/minhash.pxd | 12 +- python/pylibcudf/pylibcudf/nvtext/minhash.pyi | 7 +- python/pylibcudf/pylibcudf/nvtext/minhash.pyx | 168 +-------- .../pylibcudf/tests/test_nvtext_minhash.py | 30 +- 14 files changed, 100 insertions(+), 1060 deletions(-) diff --git a/cpp/benchmarks/text/minhash.cpp b/cpp/benchmarks/text/minhash.cpp index a80d0dcbdb8..8c86e8d4366 100644 --- a/cpp/benchmarks/text/minhash.cpp +++ b/cpp/benchmarks/text/minhash.cpp @@ -54,9 +54,8 @@ static void bench_minhash(nvbench::state& state) state.add_global_memory_writes(num_rows); // output are hashes state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - auto result = base64 - ? nvtext::minhash64_permuted(input, 0, parameters_a, parameters_b, hash_width) - : nvtext::minhash_permuted(input, 0, parameters_a, parameters_b, hash_width); + auto result = base64 ? nvtext::minhash64(input, 0, parameters_a, parameters_b, hash_width) + : nvtext::minhash(input, 0, parameters_a, parameters_b, hash_width); }); } diff --git a/cpp/include/nvtext/minhash.hpp b/cpp/include/nvtext/minhash.hpp index b2c1a23f57e..f0d5d9ecb5d 100644 --- a/cpp/include/nvtext/minhash.hpp +++ b/cpp/include/nvtext/minhash.hpp @@ -31,69 +31,6 @@ namespace CUDF_EXPORT nvtext { * @file */ -/** - * @brief Returns the minhash value for each string - * - * Hash values are computed from substrings of each string and the - * minimum hash value is returned for each string. - * - * Any null row entries result in corresponding null output rows. - * - * This function uses MurmurHash3_x86_32 for the hash algorithm. - * - * @deprecated Deprecated in 24.12 - * - * @throw std::invalid_argument if the width < 2 - * - * @param input Strings column to compute minhash - * @param seed Seed value used for the hash algorithm - * @param width The character width used for apply substrings; - * Default is 4 characters. - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocate the returned column's device memory - * @return Minhash values for each string in input - */ -[[deprecated]] std::unique_ptr minhash( - cudf::strings_column_view const& input, - cudf::numeric_scalar seed = 0, - cudf::size_type width = 4, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); - -/** - * @brief Returns the minhash values for each string per seed - * - * Hash values are computed from substrings of each string and the - * minimum hash value is returned for each string for each seed. - * Each row of the list column are seed results for the corresponding - * string. The order of the elements in each row match the order of - * the seeds provided in the `seeds` parameter. 
- * - * This function uses MurmurHash3_x86_32 for the hash algorithm. - * - * Any null row entries result in corresponding null output rows. - * - * @deprecated Deprecated in 24.12 - to be replaced in a future release - * - * @throw std::invalid_argument if the width < 2 - * @throw std::invalid_argument if seeds is empty - * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit - * - * @param input Strings column to compute minhash - * @param seeds Seed values used for the hash algorithm - * @param width The character width used for apply substrings; - * Default is 4 characters. - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocate the returned column's device memory - * @return List column of minhash values for each string per seed - */ -[[deprecated]] std::unique_ptr minhash( - cudf::strings_column_view const& input, - cudf::device_span seeds, - cudf::size_type width = 4, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); - /** * @brief Returns the minhash values for each string * @@ -132,7 +69,7 @@ namespace CUDF_EXPORT nvtext { * @param mr Device memory resource used to allocate the returned column's device memory * @return List column of minhash values for each string per seed */ -std::unique_ptr minhash_permuted( +std::unique_ptr minhash( cudf::strings_column_view const& input, uint32_t seed, cudf::device_span parameter_a, @@ -142,67 +79,16 @@ std::unique_ptr minhash_permuted( rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** - * @brief Returns the minhash value for each string - * - * Hash values are computed from substrings of each string and the - * minimum hash value is returned for each string. - * - * Any null row entries result in corresponding null output rows. - * - * This function uses MurmurHash3_x64_128 for the hash algorithm. - * The hash function returns 2 uint64 values but only the first value - * is used with the minhash calculation. - * - * @deprecated Deprecated in 24.12 - * - * @throw std::invalid_argument if the width < 2 - * - * @param input Strings column to compute minhash - * @param seed Seed value used for the hash algorithm - * @param width The character width used for apply substrings; - * Default is 4 characters. - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocate the returned column's device memory - * @return Minhash values as UINT64 for each string in input - */ -[[deprecated]] std::unique_ptr minhash64( - cudf::strings_column_view const& input, - cudf::numeric_scalar seed = 0, - cudf::size_type width = 4, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); - -/** - * @brief Returns the minhash values for each string per seed - * - * Hash values are computed from substrings of each string and the - * minimum hash value is returned for each string for each seed. - * Each row of the list column are seed results for the corresponding - * string. The order of the elements in each row match the order of - * the seeds provided in the `seeds` parameter. - * - * This function uses MurmurHash3_x64_128 for the hash algorithm. + * @copydoc nvtext::minhash * - * Any null row entries result in corresponding null output rows. 
- * - * @deprecated Deprecated in 24.12 - to be replaced in a future release - * - * @throw std::invalid_argument if the width < 2 - * @throw std::invalid_argument if seeds is empty - * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit - * - * @param input Strings column to compute minhash - * @param seeds Seed values used for the hash algorithm - * @param width The character width used for apply substrings; - * Default is 4 characters. - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocate the returned column's device memory - * @return List column of minhash values for each string per seed + * @deprecated Use nvtext::minhash() */ -[[deprecated]] std::unique_ptr minhash64( +[[deprecated]] std::unique_ptr minhash_permuted( cudf::strings_column_view const& input, - cudf::device_span seeds, - cudf::size_type width = 4, + uint32_t seed, + cudf::device_span parameter_a, + cudf::device_span parameter_b, + cudf::size_type width, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); @@ -244,7 +130,7 @@ std::unique_ptr minhash_permuted( * @param mr Device memory resource used to allocate the returned column's device memory * @return List column of minhash values for each string per seed */ -std::unique_ptr minhash64_permuted( +std::unique_ptr minhash64( cudf::strings_column_view const& input, uint64_t seed, cudf::device_span parameter_a, @@ -254,64 +140,18 @@ std::unique_ptr minhash64_permuted( rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** - * @brief Returns the minhash values for each row of strings per seed - * - * Hash values are computed from each string in each row and the - * minimum hash value is returned for each row for each seed. - * Each row of the output list column are seed results for the corresponding - * input row. The order of the elements in each row match the order of - * the seeds provided in the `seeds` parameter. - * - * This function uses MurmurHash3_x86_32 for the hash algorithm. - * - * Any null row entries result in corresponding null output rows. + * @copydoc nvtext::minhash64 * - * @deprecated Deprecated in 24.12 - * - * @throw std::invalid_argument if seeds is empty - * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit - * - * @param input Lists column of strings to compute minhash - * @param seeds Seed values used for the hash algorithm - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocate the returned column's device memory - * @return List column of minhash values for each string per seed + * @deprecated Use nvtext::minhash64() */ -[[deprecated]] std::unique_ptr word_minhash( - cudf::lists_column_view const& input, - cudf::device_span seeds, +[[deprecated]] std::unique_ptr minhash64_permuted( + cudf::strings_column_view const& input, + uint64_t seed, + cudf::device_span parameter_a, + cudf::device_span parameter_b, + cudf::size_type width, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); -/** - * @brief Returns the minhash values for each row of strings per seed - * - * Hash values are computed from each string in each row and the - * minimum hash value is returned for each row for each seed. 
- * Each row of the output list column are seed results for the corresponding - * input row. The order of the elements in each row match the order of - * the seeds provided in the `seeds` parameter. - * - * This function uses MurmurHash3_x64_128 for the hash algorithm though - * only the first 64-bits of the hash are used in computing the output. - * - * Any null row entries result in corresponding null output rows. - * - * @deprecated Deprecated in 24.12 - * - * @throw std::invalid_argument if seeds is empty - * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit - * - * @param input Lists column of strings to compute minhash - * @param seeds Seed values used for the hash algorithm - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocate the returned column's device memory - * @return List column of minhash values for each string per seed - */ -[[deprecated]] std::unique_ptr word_minhash64( - cudf::lists_column_view const& input, - cudf::device_span seeds, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @} */ // end of group } // namespace CUDF_EXPORT nvtext diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu index b7a719a2041..9a44d9477ab 100644 --- a/cpp/src/text/minhash.cu +++ b/cpp/src/text/minhash.cu @@ -52,118 +52,6 @@ namespace nvtext { namespace detail { namespace { -/** - * @brief Compute the minhash of each string for each seed - * - * This is a warp-per-string algorithm where parallel threads within a warp - * work on substrings of a single string row. - * - * @tparam HashFunction hash function to use on each substring - * - * @param d_strings Strings column to process - * @param seeds Seeds for hashing each string - * @param width Substring window size in characters - * @param d_hashes Minhash output values for each string - */ -template < - typename HashFunction, - typename hash_value_type = std:: - conditional_t, uint32_t, uint64_t>> -CUDF_KERNEL void minhash_kernel(cudf::column_device_view const d_strings, - cudf::device_span seeds, - cudf::size_type width, - hash_value_type* d_hashes) -{ - auto const idx = cudf::detail::grid_1d::global_thread_id(); - - auto const str_idx = static_cast(idx / cudf::detail::warp_size); - if (str_idx >= d_strings.size()) { return; } - auto const lane_idx = static_cast(idx % cudf::detail::warp_size); - - if (d_strings.is_null(str_idx)) { return; } - - auto const d_str = d_strings.element(str_idx); - auto const d_output = d_hashes + (str_idx * seeds.size()); - - // initialize hashes output for this string - if (lane_idx == 0) { - auto const init = d_str.empty() ? 
0 : std::numeric_limits::max(); - thrust::fill(thrust::seq, d_output, d_output + seeds.size(), init); - } - __syncwarp(); - - auto const begin = d_str.data() + lane_idx; - auto const end = d_str.data() + d_str.size_bytes(); - - // each lane hashes 'width' substrings of d_str - for (auto itr = begin; itr < end; itr += cudf::detail::warp_size) { - if (cudf::strings::detail::is_utf8_continuation_char(*itr)) { continue; } - auto const check_str = // used for counting 'width' characters - cudf::string_view(itr, static_cast(thrust::distance(itr, end))); - auto const [bytes, left] = cudf::strings::detail::bytes_to_character_position(check_str, width); - if ((itr != d_str.data()) && (left > 0)) { continue; } // true if past the end of the string - - auto const hash_str = cudf::string_view(itr, bytes); - // hashing with each seed on the same section of the string is 10x faster than - // computing the substrings for each seed - for (std::size_t seed_idx = 0; seed_idx < seeds.size(); ++seed_idx) { - auto const hasher = HashFunction(seeds[seed_idx]); - // hash substring and store the min value - if constexpr (std::is_same_v) { - auto const hvalue = hasher(hash_str); - cuda::atomic_ref ref{*(d_output + seed_idx)}; - ref.fetch_min(hvalue, cuda::std::memory_order_relaxed); - } else { - // This code path assumes the use of MurmurHash3_x64_128 which produces 2 uint64 values - // but only uses the first uint64 value as requested by the LLM team. - auto const hvalue = thrust::get<0>(hasher(hash_str)); - cuda::atomic_ref ref{*(d_output + seed_idx)}; - ref.fetch_min(hvalue, cuda::std::memory_order_relaxed); - } - } - } -} - -template < - typename HashFunction, - typename hash_value_type = std:: - conditional_t, uint32_t, uint64_t>> -std::unique_ptr minhash_fn(cudf::strings_column_view const& input, - cudf::device_span seeds, - cudf::size_type width, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - CUDF_EXPECTS(!seeds.empty(), "Parameter seeds cannot be empty", std::invalid_argument); - CUDF_EXPECTS(width >= 2, - "Parameter width should be an integer value of 2 or greater", - std::invalid_argument); - CUDF_EXPECTS((static_cast(input.size()) * seeds.size()) < - static_cast(std::numeric_limits::max()), - "The number of seeds times the number of input rows exceeds the column size limit", - std::overflow_error); - - auto const output_type = cudf::data_type{cudf::type_to_id()}; - if (input.is_empty()) { return cudf::make_empty_column(output_type); } - - auto const d_strings = cudf::column_device_view::create(input.parent(), stream); - - auto hashes = cudf::make_numeric_column(output_type, - input.size() * static_cast(seeds.size()), - cudf::mask_state::UNALLOCATED, - stream, - mr); - auto d_hashes = hashes->mutable_view().data(); - - constexpr cudf::thread_index_type block_size = 256; - cudf::detail::grid_1d grid{ - static_cast(input.size()) * cudf::detail::warp_size, block_size}; - minhash_kernel<<>>( - *d_strings, seeds, width, d_hashes); - - return hashes; -} - constexpr cudf::thread_index_type block_size = 256; // for potentially tuning minhash_seed_kernel independently from block_size constexpr cudf::thread_index_type tile_size = block_size; @@ -297,13 +185,13 @@ CUDF_KERNEL void minhash_seed_kernel(cudf::column_device_view const d_strings, * @param d_results Final results vector of calculate values */ template -CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_strings, - cudf::device_span indices, - cudf::device_span parameter_a, - cudf::device_span parameter_b, 
- cudf::size_type width, - hash_value_type const* d_hashes, - hash_value_type* d_results) +CUDF_KERNEL void minhash_kernel(cudf::column_device_view const d_strings, + cudf::device_span indices, + cudf::device_span parameter_a, + cudf::device_span parameter_b, + cudf::size_type width, + hash_value_type const* d_hashes, + hash_value_type* d_results) { auto const tid = cudf::detail::grid_1d::global_thread_id(); auto const idx = (tid / blocks_per_string) / block_size; @@ -478,7 +366,7 @@ std::unique_ptr minhash_fn(cudf::strings_column_view const& input, auto d_indices = cudf::device_span(indices.data(), threshold_index); cudf::detail::grid_1d grid{static_cast(d_indices.size()) * block_size, block_size}; - minhash_permuted_kernel + minhash_kernel <<>>( *d_strings, d_indices, parameter_a, parameter_b, width, d_hashes.data(), d_results); } @@ -489,7 +377,7 @@ std::unique_ptr minhash_fn(cudf::strings_column_view const& input, auto d_indices = cudf::device_span(indices.data() + threshold_index, count); cudf::detail::grid_1d grid{count * block_size * blocks_per_string, block_size}; - minhash_permuted_kernel + minhash_kernel <<>>( *d_strings, d_indices, parameter_a, parameter_b, width, d_hashes.data(), d_results); } @@ -497,101 +385,6 @@ std::unique_ptr minhash_fn(cudf::strings_column_view const& input, return results; } -/** - * @brief Compute the minhash of each list row of strings for each seed - * - * This is a warp-per-row algorithm where parallel threads within a warp - * work on strings in a single list row. - * - * @tparam HashFunction hash function to use on each string - * - * @param d_input List of strings to process - * @param seeds Seeds for hashing each string - * @param d_hashes Minhash output values (one per row) - */ -template < - typename HashFunction, - typename hash_value_type = std:: - conditional_t, uint32_t, uint64_t>> -CUDF_KERNEL void minhash_word_kernel(cudf::detail::lists_column_device_view const d_input, - cudf::device_span seeds, - hash_value_type* d_hashes) -{ - auto const idx = cudf::detail::grid_1d::global_thread_id(); - auto const row_idx = idx / cudf::detail::warp_size; - - if (row_idx >= d_input.size()) { return; } - if (d_input.is_null(row_idx)) { return; } - - auto const d_row = cudf::list_device_view(d_input, row_idx); - auto const d_output = d_hashes + (row_idx * seeds.size()); - - // initialize hashes output for this row - auto const lane_idx = static_cast(idx % cudf::detail::warp_size); - if (lane_idx == 0) { - auto const init = d_row.size() == 0 ? 0 : std::numeric_limits::max(); - thrust::fill(thrust::seq, d_output, d_output + seeds.size(), init); - } - __syncwarp(); - - // each lane hashes a string from the input row - for (auto str_idx = lane_idx; str_idx < d_row.size(); str_idx += cudf::detail::warp_size) { - auto const hash_str = - d_row.is_null(str_idx) ? cudf::string_view{} : d_row.element(str_idx); - for (std::size_t seed_idx = 0; seed_idx < seeds.size(); ++seed_idx) { - auto const hasher = HashFunction(seeds[seed_idx]); - // hash string and store the min value - hash_value_type hv; - if constexpr (std::is_same_v) { - hv = hasher(hash_str); - } else { - // This code path assumes the use of MurmurHash3_x64_128 which produces 2 uint64 values - // but only uses the first uint64 value as requested by the LLM team. 
- hv = thrust::get<0>(hasher(hash_str)); - } - cuda::atomic_ref ref{*(d_output + seed_idx)}; - ref.fetch_min(hv, cuda::std::memory_order_relaxed); - } - } -} - -template < - typename HashFunction, - typename hash_value_type = std:: - conditional_t, uint32_t, uint64_t>> -std::unique_ptr word_minhash_fn(cudf::lists_column_view const& input, - cudf::device_span seeds, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - CUDF_EXPECTS(!seeds.empty(), "Parameter seeds cannot be empty", std::invalid_argument); - CUDF_EXPECTS((static_cast(input.size()) * seeds.size()) < - static_cast(std::numeric_limits::max()), - "The number of seeds times the number of input rows exceeds the column size limit", - std::overflow_error); - - auto const output_type = cudf::data_type{cudf::type_to_id()}; - if (input.is_empty()) { return cudf::make_empty_column(output_type); } - - auto const d_input = cudf::column_device_view::create(input.parent(), stream); - - auto hashes = cudf::make_numeric_column(output_type, - input.size() * static_cast(seeds.size()), - cudf::mask_state::UNALLOCATED, - stream, - mr); - auto d_hashes = hashes->mutable_view().data(); - auto lcdv = cudf::detail::lists_column_device_view(*d_input); - - constexpr cudf::thread_index_type block_size = 256; - cudf::detail::grid_1d grid{ - static_cast(input.size()) * cudf::detail::warp_size, block_size}; - minhash_word_kernel - <<>>(lcdv, seeds, d_hashes); - - return hashes; -} - std::unique_ptr build_list_result(cudf::column_view const& input, std::unique_ptr&& hashes, cudf::size_type seeds_size, @@ -620,30 +413,6 @@ std::unique_ptr build_list_result(cudf::column_view const& input, } } // namespace -std::unique_ptr minhash(cudf::strings_column_view const& input, - cudf::numeric_scalar const& seed, - cudf::size_type width, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - using HashFunction = cudf::hashing::detail::MurmurHash3_x86_32; - auto const seeds = cudf::device_span{seed.data(), 1}; - auto hashes = detail::minhash_fn(input, seeds, width, stream, mr); - hashes->set_null_mask(cudf::detail::copy_bitmask(input.parent(), stream, mr), input.null_count()); - return hashes; -} - -std::unique_ptr minhash(cudf::strings_column_view const& input, - cudf::device_span seeds, - cudf::size_type width, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - using HashFunction = cudf::hashing::detail::MurmurHash3_x86_32; - auto hashes = detail::minhash_fn(input, seeds, width, stream, mr); - return build_list_result(input.parent(), std::move(hashes), seeds.size(), stream, mr); -} - std::unique_ptr minhash(cudf::strings_column_view const& input, uint32_t seed, cudf::device_span parameter_a, @@ -658,30 +427,6 @@ std::unique_ptr minhash(cudf::strings_column_view const& input, return build_list_result(input.parent(), std::move(hashes), parameter_a.size(), stream, mr); } -std::unique_ptr minhash64(cudf::strings_column_view const& input, - cudf::numeric_scalar const& seed, - cudf::size_type width, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - using HashFunction = cudf::hashing::detail::MurmurHash3_x64_128; - auto const seeds = cudf::device_span{seed.data(), 1}; - auto hashes = detail::minhash_fn(input, seeds, width, stream, mr); - hashes->set_null_mask(cudf::detail::copy_bitmask(input.parent(), stream, mr), input.null_count()); - return hashes; -} - -std::unique_ptr minhash64(cudf::strings_column_view const& input, - cudf::device_span seeds, - cudf::size_type width, - 
rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - using HashFunction = cudf::hashing::detail::MurmurHash3_x64_128; - auto hashes = detail::minhash_fn(input, seeds, width, stream, mr); - return build_list_result(input.parent(), std::move(hashes), seeds.size(), stream, mr); -} - std::unique_ptr minhash64(cudf::strings_column_view const& input, uint64_t seed, cudf::device_span parameter_a, @@ -696,45 +441,18 @@ std::unique_ptr minhash64(cudf::strings_column_view const& input, return build_list_result(input.parent(), std::move(hashes), parameter_a.size(), stream, mr); } -std::unique_ptr word_minhash(cudf::lists_column_view const& input, - cudf::device_span seeds, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - using HashFunction = cudf::hashing::detail::MurmurHash3_x86_32; - auto hashes = detail::word_minhash_fn(input, seeds, stream, mr); - return build_list_result(input.parent(), std::move(hashes), seeds.size(), stream, mr); -} - -std::unique_ptr word_minhash64(cudf::lists_column_view const& input, - cudf::device_span seeds, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - using HashFunction = cudf::hashing::detail::MurmurHash3_x64_128; - auto hashes = detail::word_minhash_fn(input, seeds, stream, mr); - return build_list_result(input.parent(), std::move(hashes), seeds.size(), stream, mr); -} } // namespace detail std::unique_ptr minhash(cudf::strings_column_view const& input, - cudf::numeric_scalar seed, - cudf::size_type width, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - CUDF_FUNC_RANGE(); - return detail::minhash(input, seed, width, stream, mr); -} - -std::unique_ptr minhash(cudf::strings_column_view const& input, - cudf::device_span seeds, + uint32_t seed, + cudf::device_span parameter_a, + cudf::device_span parameter_b, cudf::size_type width, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::minhash(input, seeds, width, stream, mr); + return detail::minhash(input, seed, parameter_a, parameter_b, width, stream, mr); } std::unique_ptr minhash_permuted(cudf::strings_column_view const& input, @@ -750,23 +468,15 @@ std::unique_ptr minhash_permuted(cudf::strings_column_view const& } std::unique_ptr minhash64(cudf::strings_column_view const& input, - cudf::numeric_scalar seed, - cudf::size_type width, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - CUDF_FUNC_RANGE(); - return detail::minhash64(input, seed, width, stream, mr); -} - -std::unique_ptr minhash64(cudf::strings_column_view const& input, - cudf::device_span seeds, + uint64_t seed, + cudf::device_span parameter_a, + cudf::device_span parameter_b, cudf::size_type width, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::minhash64(input, seeds, width, stream, mr); + return detail::minhash64(input, seed, parameter_a, parameter_b, width, stream, mr); } std::unique_ptr minhash64_permuted(cudf::strings_column_view const& input, @@ -781,21 +491,4 @@ std::unique_ptr minhash64_permuted(cudf::strings_column_view const return detail::minhash64(input, seed, parameter_a, parameter_b, width, stream, mr); } -std::unique_ptr word_minhash(cudf::lists_column_view const& input, - cudf::device_span seeds, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - CUDF_FUNC_RANGE(); - return detail::word_minhash(input, seeds, stream, mr); -} - -std::unique_ptr word_minhash64(cudf::lists_column_view const& input, 
- cudf::device_span seeds, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - CUDF_FUNC_RANGE(); - return detail::word_minhash64(input, seeds, stream, mr); -} } // namespace nvtext diff --git a/cpp/tests/text/minhash_tests.cpp b/cpp/tests/text/minhash_tests.cpp index 042ac44621e..8bfb17e0efd 100644 --- a/cpp/tests/text/minhash_tests.cpp +++ b/cpp/tests/text/minhash_tests.cpp @@ -44,10 +44,9 @@ TEST_F(MinHashTest, Permuted) auto view = cudf::strings_column_view(input); - auto first = thrust::counting_iterator(10); - auto params = cudf::test::fixed_width_column_wrapper(first, first + 3); - auto results = - nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(params), 4); + auto first = thrust::counting_iterator(10); + auto params = cudf::test::fixed_width_column_wrapper(first, first + 3); + auto results = nvtext::minhash(view, 0, cudf::column_view(params), cudf::column_view(params), 4); using LCW32 = cudf::test::lists_column_wrapper; // clang-format off @@ -66,9 +65,9 @@ TEST_F(MinHashTest, Permuted) // clang-format on CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - auto params64 = cudf::test::fixed_width_column_wrapper(first, first + 3); - auto results64 = nvtext::minhash64_permuted( - view, 0, cudf::column_view(params64), cudf::column_view(params64), 4); + auto params64 = cudf::test::fixed_width_column_wrapper(first, first + 3); + auto results64 = + nvtext::minhash64(view, 0, cudf::column_view(params64), cudf::column_view(params64), 4); using LCW64 = cudf::test::lists_column_wrapper; // clang-format off @@ -95,10 +94,9 @@ TEST_F(MinHashTest, PermutedWide) auto input = cudf::test::strings_column_wrapper({small, wide}); auto view = cudf::strings_column_view(input); - auto first = thrust::counting_iterator(20); - auto params = cudf::test::fixed_width_column_wrapper(first, first + 3); - auto results = - nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(params), 4); + auto first = thrust::counting_iterator(20); + auto params = cudf::test::fixed_width_column_wrapper(first, first + 3); + auto results = nvtext::minhash(view, 0, cudf::column_view(params), cudf::column_view(params), 4); using LCW32 = cudf::test::lists_column_wrapper; // clang-format off @@ -109,9 +107,9 @@ TEST_F(MinHashTest, PermutedWide) // clang-format on CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - auto params64 = cudf::test::fixed_width_column_wrapper(first, first + 3); - auto results64 = nvtext::minhash64_permuted( - view, 0, cudf::column_view(params64), cudf::column_view(params64), 4); + auto params64 = cudf::test::fixed_width_column_wrapper(first, first + 3); + auto results64 = + nvtext::minhash64(view, 0, cudf::column_view(params64), cudf::column_view(params64), 4); using LCW64 = cudf::test::lists_column_wrapper; // clang-format off @@ -132,9 +130,8 @@ TEST_F(MinHashTest, PermutedManyParameters) auto first = thrust::counting_iterator(20); // more than params_per_thread - auto params = cudf::test::fixed_width_column_wrapper(first, first + 31); - auto results = - nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(params), 4); + auto params = cudf::test::fixed_width_column_wrapper(first, first + 31); + auto results = nvtext::minhash(view, 0, cudf::column_view(params), cudf::column_view(params), 4); using LCW32 = cudf::test::lists_column_wrapper; // clang-format off @@ -152,9 +149,9 @@ TEST_F(MinHashTest, PermutedManyParameters) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); // more than 
params_per_thread - auto params64 = cudf::test::fixed_width_column_wrapper(first, first + 31); - auto results64 = nvtext::minhash64_permuted( - view, 0, cudf::column_view(params64), cudf::column_view(params64), 4); + auto params64 = cudf::test::fixed_width_column_wrapper(first, first + 31); + auto results64 = + nvtext::minhash64(view, 0, cudf::column_view(params64), cudf::column_view(params64), 4); using LCW64 = cudf::test::lists_column_wrapper; // clang-format off @@ -182,15 +179,13 @@ TEST_F(MinHashTest, PermutedManyParameters) TEST_F(MinHashTest, EmptyTest) { - auto input = cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); - auto view = cudf::strings_column_view(input->view()); - auto params = cudf::test::fixed_width_column_wrapper({1, 2, 3}); - auto results = - nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(params), 4); + auto input = cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); + auto view = cudf::strings_column_view(input->view()); + auto params = cudf::test::fixed_width_column_wrapper({1, 2, 3}); + auto results = nvtext::minhash(view, 0, cudf::column_view(params), cudf::column_view(params), 4); EXPECT_EQ(results->size(), 0); auto params64 = cudf::test::fixed_width_column_wrapper({1, 2, 3}); - results = nvtext::minhash64_permuted( - view, 0, cudf::column_view(params64), cudf::column_view(params64), 4); + results = nvtext::minhash64(view, 0, cudf::column_view(params64), cudf::column_view(params64), 4); EXPECT_EQ(results->size(), 0); } @@ -199,18 +194,16 @@ TEST_F(MinHashTest, ErrorsTest) auto input = cudf::test::strings_column_wrapper({"this string intentionally left blank"}); auto view = cudf::strings_column_view(input); auto empty = cudf::test::fixed_width_column_wrapper(); - EXPECT_THROW( - nvtext::minhash_permuted(view, 0, cudf::column_view(empty), cudf::column_view(empty), 0), - std::invalid_argument); + EXPECT_THROW(nvtext::minhash(view, 0, cudf::column_view(empty), cudf::column_view(empty), 0), + std::invalid_argument); auto empty64 = cudf::test::fixed_width_column_wrapper(); EXPECT_THROW( - nvtext::minhash64_permuted(view, 0, cudf::column_view(empty64), cudf::column_view(empty64), 0), + nvtext::minhash64(view, 0, cudf::column_view(empty64), cudf::column_view(empty64), 0), std::invalid_argument); + EXPECT_THROW(nvtext::minhash(view, 0, cudf::column_view(empty), cudf::column_view(empty), 4), + std::invalid_argument); EXPECT_THROW( - nvtext::minhash_permuted(view, 0, cudf::column_view(empty), cudf::column_view(empty), 4), - std::invalid_argument); - EXPECT_THROW( - nvtext::minhash64_permuted(view, 0, cudf::column_view(empty64), cudf::column_view(empty64), 4), + nvtext::minhash64(view, 0, cudf::column_view(empty64), cudf::column_view(empty64), 4), std::invalid_argument); std::vector h_input(50000, ""); @@ -219,18 +212,16 @@ TEST_F(MinHashTest, ErrorsTest) auto const zeroes = thrust::constant_iterator(0); auto params = cudf::test::fixed_width_column_wrapper(zeroes, zeroes + 50000); + EXPECT_THROW(nvtext::minhash(view, 0, cudf::column_view(params), cudf::column_view(params), 4), + std::overflow_error); + auto params64 = cudf::test::fixed_width_column_wrapper(zeroes, zeroes + 50000); EXPECT_THROW( - nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(params), 4), + nvtext::minhash64(view, 0, cudf::column_view(params64), cudf::column_view(params64), 4), std::overflow_error); - auto params64 = cudf::test::fixed_width_column_wrapper(zeroes, zeroes + 50000); - 
EXPECT_THROW(nvtext::minhash64_permuted( - view, 0, cudf::column_view(params64), cudf::column_view(params64), 4), - std::overflow_error); + EXPECT_THROW(nvtext::minhash(view, 0, cudf::column_view(params), cudf::column_view(empty), 4), + std::invalid_argument); EXPECT_THROW( - nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(empty), 4), - std::invalid_argument); - EXPECT_THROW( - nvtext::minhash64_permuted(view, 0, cudf::column_view(params64), cudf::column_view(empty64), 4), + nvtext::minhash64(view, 0, cudf::column_view(params64), cudf::column_view(empty64), 4), std::invalid_argument); } diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index fbb9ca4b128..7aa8f9f4a1c 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -594,6 +594,8 @@ def on_missing_reference(app, env, node, contnode): # TODO: Remove this when we figure out why typing_extensions doesn't seem # to map types correctly for intersphinx ("py:class", "typing_extensions.Self"), + ("py:class", "np.uint32"), + ("py:class", "np.uint64"), ] diff --git a/python/cudf/cudf/_lib/nvtext/minhash.pyx b/python/cudf/cudf/_lib/nvtext/minhash.pyx index 25cfcf99ca6..9f2b3f92502 100644 --- a/python/cudf/cudf/_lib/nvtext/minhash.pyx +++ b/python/cudf/cudf/_lib/nvtext/minhash.pyx @@ -10,19 +10,9 @@ from pylibcudf import nvtext @acquire_spill_lock() -def minhash(Column input, Column seeds, int width=4): - result = nvtext.minhash.minhash( - input.to_pylibcudf(mode="read"), - seeds.to_pylibcudf(mode="read"), - width, - ) - return Column.from_pylibcudf(result) - - -@acquire_spill_lock() -def minhash_permuted(Column input, uint32_t seed, Column a, Column b, int width): +def minhash(Column input, uint32_t seed, Column a, Column b, int width): return Column.from_pylibcudf( - nvtext.minhash.minhash_permuted( + nvtext.minhash.minhash( input.to_pylibcudf(mode="read"), seed, a.to_pylibcudf(mode="read"), @@ -33,19 +23,9 @@ def minhash_permuted(Column input, uint32_t seed, Column a, Column b, int width) @acquire_spill_lock() -def minhash64(Column input, Column seeds, int width=4): - result = nvtext.minhash.minhash64( - input.to_pylibcudf(mode="read"), - seeds.to_pylibcudf(mode="read"), - width, - ) - return Column.from_pylibcudf(result) - - -@acquire_spill_lock() -def minhash64_permuted(Column input, uint64_t seed, Column a, Column b, int width): +def minhash64(Column input, uint64_t seed, Column a, Column b, int width): return Column.from_pylibcudf( - nvtext.minhash.minhash64_permuted( + nvtext.minhash.minhash64( input.to_pylibcudf(mode="read"), seed, a.to_pylibcudf(mode="read"), @@ -53,21 +33,3 @@ def minhash64_permuted(Column input, uint64_t seed, Column a, Column b, int widt width, ) ) - - -@acquire_spill_lock() -def word_minhash(Column input, Column seeds): - result = nvtext.minhash.word_minhash( - input.to_pylibcudf(mode="read"), - seeds.to_pylibcudf(mode="read"), - ) - return Column.from_pylibcudf(result) - - -@acquire_spill_lock() -def word_minhash64(Column input, Column seeds): - result = nvtext.minhash.word_minhash64( - input.to_pylibcudf(mode="read"), - seeds.to_pylibcudf(mode="read"), - ) - return Column.from_pylibcudf(result) diff --git a/python/cudf/cudf/_lib/strings/__init__.py b/python/cudf/cudf/_lib/strings/__init__.py index 341ba6d11c3..b9095a22a42 100644 --- a/python/cudf/cudf/_lib/strings/__init__.py +++ b/python/cudf/cudf/_lib/strings/__init__.py @@ -9,10 +9,6 @@ from cudf._lib.nvtext.minhash import ( minhash, minhash64, - minhash64_permuted, - minhash_permuted, - 
word_minhash, - word_minhash64, ) from cudf._lib.nvtext.ngrams_tokenize import ngrams_tokenize from cudf._lib.nvtext.normalize import normalize_characters, normalize_spaces diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 4a2483a80e3..06196717ce3 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5461,49 +5461,6 @@ def edit_distance_matrix(self) -> SeriesOrIndex: ) def minhash( - self, seeds: ColumnLike | None = None, width: int = 4 - ) -> SeriesOrIndex: - """ - Compute the minhash of a strings column. - This uses the MurmurHash3_x86_32 algorithm for the hash function. - - Parameters - ---------- - seeds : ColumnLike - The seeds used for the hash algorithm. - Must be of type uint32. - width : int - The width of the substring to hash. - Default is 4 characters. - - Examples - -------- - >>> import cudf - >>> str_series = cudf.Series(['this is my', 'favorite book']) - >>> seeds = cudf.Series([0], dtype=np.uint32) - >>> str_series.str.minhash(seeds) - 0 [21141582] - 1 [962346254] - dtype: list - >>> seeds = cudf.Series([0, 1, 2], dtype=np.uint32) - >>> str_series.str.minhash(seeds) - 0 [21141582, 403093213, 1258052021] - 1 [962346254, 677440381, 122618762] - dtype: list - """ - if seeds is None: - seeds_column = column.as_column(0, dtype=np.uint32, length=1) - else: - seeds_column = column.as_column(seeds) - if seeds_column.dtype != np.uint32: - raise ValueError( - f"Expecting a Series with dtype uint32, got {type(seeds)}" - ) - return self._return_or_inplace( - libstrings.minhash(self._column, seeds_column, width) - ) - - def minhash_permuted( self, seed: np.uint32, a: ColumnLike, b: ColumnLike, width: int ) -> SeriesOrIndex: """ @@ -5535,7 +5492,7 @@ def minhash_permuted( >>> s = cudf.Series(['this is my', 'favorite book']) >>> a = cudf.Series([1, 2, 3], dtype=np.uint32) >>> b = cudf.Series([4, 5, 6], dtype=np.uint32) - >>> s.str.minhash_permuted(0, a=a, b=b, width=5) + >>> s.str.minhash(0, a=a, b=b, width=5) 0 [1305480171, 462824409, 74608232] 1 [32665388, 65330773, 97996158] dtype: list @@ -5551,53 +5508,10 @@ def minhash_permuted( f"Expecting a Series with dtype uint32, got {type(b)}" ) return self._return_or_inplace( - libstrings.minhash_permuted( - self._column, seed, a_column, b_column, width - ) + libstrings.minhash(self._column, seed, a_column, b_column, width) ) def minhash64( - self, seeds: ColumnLike | None = None, width: int = 4 - ) -> SeriesOrIndex: - """ - Compute the minhash of a strings column. - - This uses the MurmurHash3_x64_128 algorithm for the hash function. - This function generates 2 uint64 values but only the first - uint64 value is used. - - Parameters - ---------- - seeds : ColumnLike - The seeds used for the hash algorithm. - Must be of type uint64. - width : int - The width of the substring to hash. - Default is 4 characters. 
- - Examples - -------- - >>> import cudf - >>> str_series = cudf.Series(['this is my', 'favorite book']) - >>> seeds = cudf.Series([0, 1, 2], dtype=np.uint64) - >>> str_series.str.minhash64(seeds) - 0 [3232308021562742685, 4445611509348165860, 586435843695903598] - 1 [23008204270530356, 1281229757012344693, 153762819128779913] - dtype: list - """ - if seeds is None: - seeds_column = column.as_column(0, dtype=np.uint64, length=1) - else: - seeds_column = column.as_column(seeds) - if seeds_column.dtype != np.uint64: - raise ValueError( - f"Expecting a Series with dtype uint64, got {type(seeds)}" - ) - return self._return_or_inplace( - libstrings.minhash64(self._column, seeds_column, width) - ) - - def minhash64_permuted( self, seed: np.uint64, a: ColumnLike, b: ColumnLike, width: int ) -> SeriesOrIndex: """ @@ -5628,7 +5542,7 @@ def minhash64_permuted( >>> s = cudf.Series(['this is my', 'favorite book', 'to read']) >>> a = cudf.Series([2, 3], dtype=np.uint64) >>> b = cudf.Series([5, 6], dtype=np.uint64) - >>> s.str.minhash64_permuted(0, a=a, b=b, width=5) + >>> s.str.minhash64(0, a=a, b=b, width=5) 0 [172452388517576012, 316595762085180527] 1 [71427536958126239, 58787297728258215] 2 [423885828176437114, 1140588505926961370] @@ -5645,79 +5559,7 @@ def minhash64_permuted( f"Expecting a Series with dtype uint64, got {type(b)}" ) return self._return_or_inplace( - libstrings.minhash64_permuted( - self._column, seed, a_column, b_column, width - ) - ) - - def word_minhash(self, seeds: ColumnLike | None = None) -> SeriesOrIndex: - """ - Compute the minhash of a list column of strings. - This uses the MurmurHash3_x86_32 algorithm for the hash function. - - Parameters - ---------- - seeds : ColumnLike - The seeds used for the hash algorithm. - Must be of type uint32. - - Examples - -------- - >>> import cudf - >>> import numpy as np - >>> ls = cudf.Series([["this", "is", "my"], ["favorite", "book"]]) - >>> seeds = cudf.Series([0, 1, 2], dtype=np.uint32) - >>> ls.str.word_minhash(seeds=seeds) - 0 [21141582, 1232889953, 1268336794] - 1 [962346254, 2321233602, 1354839212] - dtype: list - """ - if seeds is None: - seeds_column = column.as_column(0, dtype=np.uint32, length=1) - else: - seeds_column = column.as_column(seeds) - if seeds_column.dtype != np.uint32: - raise ValueError( - f"Expecting a Series with dtype uint32, got {type(seeds)}" - ) - return self._return_or_inplace( - libstrings.word_minhash(self._column, seeds_column) - ) - - def word_minhash64(self, seeds: ColumnLike | None = None) -> SeriesOrIndex: - """ - Compute the minhash of a list column of strings. - This uses the MurmurHash3_x64_128 algorithm for the hash function. - This function generates 2 uint64 values but only the first - uint64 value is used. - - Parameters - ---------- - seeds : ColumnLike - The seeds used for the hash algorithm. - Must be of type uint64. 
- - Examples - -------- - >>> import cudf - >>> import numpy as np - >>> ls = cudf.Series([["this", "is", "my"], ["favorite", "book"]]) - >>> seeds = cudf.Series([0, 1, 2], dtype=np.uint64) - >>> ls.str.word_minhash64(seeds) - 0 [2603139454418834912, 8644371945174847701, 5541030711534384340] - 1 [5240044617220523711, 5847101123925041457, 153762819128779913] - dtype: list - """ - if seeds is None: - seeds_column = column.as_column(0, dtype=np.uint64, length=1) - else: - seeds_column = column.as_column(seeds) - if seeds_column.dtype != np.uint64: - raise ValueError( - f"Expecting a Series with dtype uint64, got {type(seeds)}" - ) - return self._return_or_inplace( - libstrings.word_minhash64(self._column, seeds_column) + libstrings.minhash64(self._column, seed, a_column, b_column, width) ) def jaccard_index(self, input: cudf.Series, width: int) -> SeriesOrIndex: diff --git a/python/cudf/cudf/tests/text/test_text_methods.py b/python/cudf/cudf/tests/text/test_text_methods.py index 3637ef075f2..9a62285403f 100644 --- a/python/cudf/cudf/tests/text/test_text_methods.py +++ b/python/cudf/cudf/tests/text/test_text_methods.py @@ -882,7 +882,7 @@ def test_is_vowel_consonant(): assert_eq(expected, actual) -def test_minhash_permuted(): +def test_minhash(): strings = cudf.Series(["this is my", "favorite book", None, ""]) params = cudf.Series([1, 2, 3], dtype=np.uint32) @@ -894,7 +894,7 @@ def test_minhash_permuted(): cudf.Series([0, 0, 0], dtype=np.uint32), ] ) - actual = strings.str.minhash_permuted(0, a=params, b=params, width=5) + actual = strings.str.minhash(0, a=params, b=params, width=5) assert_eq(expected, actual) params = cudf.Series([1, 2, 3], dtype=np.uint64) @@ -912,78 +912,18 @@ def test_minhash_permuted(): cudf.Series([0, 0, 0], dtype=np.uint64), ] ) - actual = strings.str.minhash64_permuted(0, a=params, b=params, width=5) + actual = strings.str.minhash64(0, a=params, b=params, width=5) assert_eq(expected, actual) # test wrong seed types with pytest.raises(ValueError): - strings.str.minhash_permuted(1, a="a", b="b", width=7) + strings.str.minhash(1, a="a", b="b", width=7) with pytest.raises(ValueError): params = cudf.Series([0, 1, 2], dtype=np.int32) - strings.str.minhash_permuted(1, a=params, b=params, width=6) + strings.str.minhash(1, a=params, b=params, width=6) with pytest.raises(ValueError): params = cudf.Series([0, 1, 2], dtype=np.uint32) - strings.str.minhash64_permuted(1, a=params, b=params, width=8) - - -def test_word_minhash(): - ls = cudf.Series([["this", "is", "my"], ["favorite", "book"]]) - - expected = cudf.Series( - [ - cudf.Series([21141582], dtype=np.uint32), - cudf.Series([962346254], dtype=np.uint32), - ] - ) - actual = ls.str.word_minhash() - assert_eq(expected, actual) - seeds = cudf.Series([0, 1, 2], dtype=np.uint32) - expected = cudf.Series( - [ - cudf.Series([21141582, 1232889953, 1268336794], dtype=np.uint32), - cudf.Series([962346254, 2321233602, 1354839212], dtype=np.uint32), - ] - ) - actual = ls.str.word_minhash(seeds=seeds) - assert_eq(expected, actual) - - expected = cudf.Series( - [ - cudf.Series([2603139454418834912], dtype=np.uint64), - cudf.Series([5240044617220523711], dtype=np.uint64), - ] - ) - actual = ls.str.word_minhash64() - assert_eq(expected, actual) - seeds = cudf.Series([0, 1, 2], dtype=np.uint64) - expected = cudf.Series( - [ - cudf.Series( - [ - 2603139454418834912, - 8644371945174847701, - 5541030711534384340, - ], - dtype=np.uint64, - ), - cudf.Series( - [5240044617220523711, 5847101123925041457, 153762819128779913], - dtype=np.uint64, - ), - 
] - ) - actual = ls.str.word_minhash64(seeds=seeds) - assert_eq(expected, actual) - - # test wrong seed types - with pytest.raises(ValueError): - ls.str.word_minhash(seeds="a") - with pytest.raises(ValueError): - seeds = cudf.Series([0, 1, 2], dtype=np.int32) - ls.str.word_minhash(seeds=seeds) - with pytest.raises(ValueError): - seeds = cudf.Series([0, 1, 2], dtype=np.uint32) - ls.str.word_minhash64(seeds=seeds) + strings.str.minhash64(1, a=params, b=params, width=8) def test_jaccard_index(): diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd index 8570531dfde..9d1e8cba425 100644 --- a/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd @@ -11,18 +11,6 @@ from pylibcudf.libcudf.types cimport size_type cdef extern from "nvtext/minhash.hpp" namespace "nvtext" nogil: cdef unique_ptr[column] minhash( - const column_view &strings, - const numeric_scalar[uint32_t] seed, - const size_type width, - ) except +libcudf_exception_handler - - cdef unique_ptr[column] minhash( - const column_view &strings, - const column_view &seeds, - const size_type width, - ) except +libcudf_exception_handler - - cdef unique_ptr[column] minhash_permuted( const column_view &strings, const uint32_t seed, const column_view &a, @@ -31,31 +19,9 @@ cdef extern from "nvtext/minhash.hpp" namespace "nvtext" nogil: ) except + cdef unique_ptr[column] minhash64( - const column_view &strings, - const column_view &seeds, - const size_type width, - ) except +libcudf_exception_handler - - cdef unique_ptr[column] minhash64( - const column_view &strings, - const numeric_scalar[uint64_t] seed, - const size_type width, - ) except +libcudf_exception_handler - - cdef unique_ptr[column] minhash64_permuted( const column_view &strings, const uint64_t seed, const column_view &a, const column_view &b, const size_type width, ) except + - - cdef unique_ptr[column] word_minhash( - const column_view &input, - const column_view &seeds - ) except +libcudf_exception_handler - - cdef unique_ptr[column] word_minhash64( - const column_view &input, - const column_view &seeds - ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pxd b/python/pylibcudf/pylibcudf/nvtext/minhash.pxd index 6b544282f44..0af53748cdc 100644 --- a/python/pylibcudf/pylibcudf/nvtext/minhash.pxd +++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pxd @@ -9,9 +9,7 @@ ctypedef fused ColumnOrScalar: Column Scalar -cpdef Column minhash(Column input, ColumnOrScalar seeds, size_type width=*) - -cpdef Column minhash_permuted( +cpdef Column minhash( Column input, uint32_t seed, Column a, @@ -19,16 +17,10 @@ cpdef Column minhash_permuted( size_type width ) -cpdef Column minhash64(Column input, ColumnOrScalar seeds, size_type width=*) - -cpdef Column minhash64_permuted( +cpdef Column minhash64( Column input, uint64_t seed, Column a, Column b, size_type width ) - -cpdef Column word_minhash(Column input, Column seeds) - -cpdef Column word_minhash64(Column input, Column seeds) diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pyi b/python/pylibcudf/pylibcudf/nvtext/minhash.pyi index a2d9b6364f7..5d88cfbbea0 100644 --- a/python/pylibcudf/pylibcudf/nvtext/minhash.pyi +++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pyi @@ -1,13 +1,10 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
from pylibcudf.column import Column -from pylibcudf.scalar import Scalar def minhash( - input: Column, seeds: Column | Scalar, width: int = 4 + input: Column, seed: int, a: Column, b: Column, width: int ) -> Column: ... def minhash64( - input: Column, seeds: Column | Scalar, width: int = 4 + input: Column, seed: int, a: Column, b: Column, width: int ) -> Column: ... -def word_minhash(input: Column, seeds: Column) -> Column: ... -def word_minhash64(input: Column, seeds: Column) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pyx b/python/pylibcudf/pylibcudf/nvtext/minhash.pyx index 5448cc6de9b..84811cda867 100644 --- a/python/pylibcudf/pylibcudf/nvtext/minhash.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pyx @@ -8,69 +8,15 @@ from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.nvtext.minhash cimport ( minhash as cpp_minhash, minhash64 as cpp_minhash64, - minhash64_permuted as cpp_minhash64_permuted, - minhash_permuted as cpp_minhash_permuted, - word_minhash as cpp_word_minhash, - word_minhash64 as cpp_word_minhash64, ) -from pylibcudf.libcudf.scalar.scalar cimport numeric_scalar from pylibcudf.libcudf.types cimport size_type -from pylibcudf.scalar cimport Scalar - -from cython.operator import dereference -import warnings __all__ = [ "minhash", "minhash64", - "word_minhash", - "word_minhash64", ] -cpdef Column minhash(Column input, ColumnOrScalar seeds, size_type width=4): - """ - Returns the minhash values for each string per seed. - This function uses MurmurHash3_x86_32 for the hash algorithm. - - For details, see :cpp:func:`minhash`. - - Parameters - ---------- - input : Column - Strings column to compute minhash - seeds : Column or Scalar - Seed value(s) used for the hash algorithm. - width : size_type - Character width used for apply substrings; - Default is 4 characters. - - Returns - ------- - Column - List column of minhash values for each string per seed - """ - warnings.warn( - "Starting in version 25.02, the signature of this function will " - "be changed to match pylibcudf.nvtext.minhash_permuted.", - FutureWarning - ) - - cdef unique_ptr[column] c_result - - if not isinstance(seeds, (Column, Scalar)): - raise TypeError("Must pass a Column or Scalar") - - with nogil: - c_result = cpp_minhash( - input.view(), - seeds.view() if ColumnOrScalar is Column else - dereference(seeds.c_obj.get()), - width - ) - - return Column.from_libcudf(move(c_result)) - -cpdef Column minhash_permuted( +cpdef Column minhash( Column input, uint32_t seed, Column a, @@ -81,7 +27,7 @@ cpdef Column minhash_permuted( Returns the minhash values for each string. This function uses MurmurHash3_x86_32 for the hash algorithm. - For details, see :cpp:func:`minhash_permuted`. + For details, see :cpp:func:`minhash`. Parameters ---------- @@ -104,7 +50,7 @@ cpdef Column minhash_permuted( cdef unique_ptr[column] c_result with nogil: - c_result = cpp_minhash_permuted( + c_result = cpp_minhash( input.view(), seed, a.view(), @@ -114,50 +60,7 @@ cpdef Column minhash_permuted( return Column.from_libcudf(move(c_result)) -cpdef Column minhash64(Column input, ColumnOrScalar seeds, size_type width=4): - """ - Returns the minhash values for each string per seed. - This function uses MurmurHash3_x64_128 for the hash algorithm. - - For details, see :cpp:func:`minhash64`. - - Parameters - ---------- - input : Column - Strings column to compute minhash - seeds : Column or Scalar - Seed value(s) used for the hash algorithm. 
- width : size_type - Character width used for apply substrings; - Default is 4 characters. - - Returns - ------- - Column - List column of minhash values for each string per seed - """ - warnings.warn( - "Starting in version 25.02, the signature of this function will " - "be changed to match pylibcudf.nvtext.minhash64_permuted.", - FutureWarning - ) - - cdef unique_ptr[column] c_result - - if not isinstance(seeds, (Column, Scalar)): - raise TypeError("Must pass a Column or Scalar") - - with nogil: - c_result = cpp_minhash64( - input.view(), - seeds.view() if ColumnOrScalar is Column else - dereference(seeds.c_obj.get()), - width - ) - - return Column.from_libcudf(move(c_result)) - -cpdef Column minhash64_permuted( +cpdef Column minhash64( Column input, uint64_t seed, Column a, @@ -168,7 +71,7 @@ cpdef Column minhash64_permuted( Returns the minhash values for each string. This function uses MurmurHash3_x64_128 for the hash algorithm. - For details, see :cpp:func:`minhash64_permuted`. + For details, see :cpp:func:`minhash64`. Parameters ---------- @@ -191,7 +94,7 @@ cpdef Column minhash64_permuted( cdef unique_ptr[column] c_result with nogil: - c_result = cpp_minhash64_permuted( + c_result = cpp_minhash64( input.view(), seed, a.view(), @@ -200,62 +103,3 @@ cpdef Column minhash64_permuted( ) return Column.from_libcudf(move(c_result)) - -cpdef Column word_minhash(Column input, Column seeds): - """ - Returns the minhash values for each row of strings per seed. - This function uses MurmurHash3_x86_32 for the hash algorithm. - - For details, see :cpp:func:`word_minhash`. - - Parameters - ---------- - input : Column - Lists column of strings to compute minhash - seeds : Column or Scalar - Seed values used for the hash algorithm. - - Returns - ------- - Column - List column of minhash values for each string per seed - """ - cdef unique_ptr[column] c_result - - with nogil: - c_result = cpp_word_minhash( - input.view(), - seeds.view() - ) - - return Column.from_libcudf(move(c_result)) - -cpdef Column word_minhash64(Column input, Column seeds): - """ - Returns the minhash values for each row of strings per seed. - This function uses MurmurHash3_x64_128 for the hash algorithm though - only the first 64-bits of the hash are used in computing the output. - - For details, see :cpp:func:`word_minhash64`. - - Parameters - ---------- - input : Column - Lists column of strings to compute minhash - seeds : Column or Scalar - Seed values used for the hash algorithm. 
- - Returns - ------- - Column - List column of minhash values for each string per seed - """ - cdef unique_ptr[column] c_result - - with nogil: - c_result = cpp_word_minhash64( - input.view(), - seeds.view() - ) - - return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py index ec533e64307..ad7a6f7a762 100644 --- a/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py @@ -13,20 +13,13 @@ def minhash_input_data(request): return input_arr, seeds, request.param -@pytest.fixture(scope="module", params=[pa.uint32(), pa.uint64()]) -def word_minhash_input_data(request): - input_arr = pa.array([["foo", "bar"], ["foo foo", "bar bar"]]) - seeds = pa.array([2, 3, 4, 5], request.param) - return input_arr, seeds, request.param - - @pytest.mark.parametrize("width", [5, 12]) -def test_minhash_permuted(minhash_input_data, width): +def test_minhash(minhash_input_data, width): input_arr, seeds, seed_type = minhash_input_data minhash_func = ( - plc.nvtext.minhash.minhash_permuted + plc.nvtext.minhash.minhash if seed_type == pa.uint32() - else plc.nvtext.minhash.minhash64_permuted + else plc.nvtext.minhash.minhash64 ) result = minhash_func( plc.interop.from_arrow(input_arr), @@ -40,20 +33,3 @@ def test_minhash_permuted(minhash_input_data, width): assert pa_result.type == pa.list_( pa.field("element", seed_type, nullable=False) ) - - -def test_word_minhash(word_minhash_input_data): - input_arr, seeds, seed_type = word_minhash_input_data - word_minhash_func = ( - plc.nvtext.minhash.word_minhash - if seed_type == pa.uint32() - else plc.nvtext.minhash.word_minhash64 - ) - result = word_minhash_func( - plc.interop.from_arrow(input_arr), plc.interop.from_arrow(seeds) - ) - pa_result = plc.interop.to_arrow(result) - assert all(len(got) == len(seeds) for got, s in zip(pa_result, input_arr)) - assert pa_result.type == pa.list_( - pa.field("element", seed_type, nullable=False) - ) From cd3a79bfa71be68c8e95ff8dd60a41eb641f8d5a Mon Sep 17 00:00:00 2001 From: Jason Lowe Date: Wed, 11 Dec 2024 11:12:32 -0600 Subject: [PATCH 59/78] Specify a version for rapids_logger dependency (#17573) ## Description #17307 broke builds that use the rapids-cmake pinned dependencies feature since no version was specified for the rapids_logger dependency. This adds a version string equal to the git tag so the dependency has a stated version. ## Checklist - [X] I am familiar with the [Contributing Guidelines](https://github.com/rapidsai/cudf/blob/HEAD/CONTRIBUTING.md). - [ ] New or existing tests cover these changes. - [X] The documentation is up to date with these changes. --------- Co-authored-by: Nghia Truong <7416935+ttnghia@users.noreply.github.com> Co-authored-by: Vyas Ramasubramani Co-authored-by: Bradley Dice --- cpp/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 3d77307ccde..2f17b57b0a4 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -277,7 +277,7 @@ rapids_cpm_init() # Not using rapids-cmake since we never want to find, always download. 
CPMAddPackage( NAME rapids_logger GITHUB_REPOSITORY rapidsai/rapids-logger GIT_SHALLOW TRUE GIT_TAG - 14bb233d2420f7187a690f0bb528ec0420c70d48 + c510947ae9d3a67530cfe3e5eaccb5a3b8ea0e55 VERSION c510947ae9d3a67530cfe3e5eaccb5a3b8ea0e55 ) rapids_make_logger(cudf EXPORT_SET cudf-exports) From 3801e7496914dec453f0d3cb49aef7c60ab636aa Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Wed, 11 Dec 2024 12:18:49 -0800 Subject: [PATCH 60/78] Replace direct `cudaMemcpyAsync` calls with utility functions (within `/include`) (#17557) Replaced the calls to `cudaMemcpyAsync` with the new `cuda_memcpy`/`cuda_memcpy_async` utility, which optionally avoids using the copy engine. Also took the opportunity to use `cudf::detail::host_vector` and its factories to enable wider pinned memory use. Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - David Wendt (https://github.com/davidwendt) - https://github.com/nvdbaranec - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/17557 --- cpp/include/cudf/detail/get_value.cuh | 11 +++++------ cpp/include/cudf/table/table_device_view.cuh | 10 +++++++--- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/cpp/include/cudf/detail/get_value.cuh b/cpp/include/cudf/detail/get_value.cuh index 5ea0d06039f..1bfb40e5916 100644 --- a/cpp/include/cudf/detail/get_value.cuh +++ b/cpp/include/cudf/detail/get_value.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ #pragma once #include +#include #include #include #include @@ -48,11 +49,9 @@ T get_value(column_view const& col_view, size_type element_index, rmm::cuda_stre CUDF_EXPECTS(data_type(type_to_id()) == col_view.type(), "get_value data type mismatch"); CUDF_EXPECTS(element_index >= 0 && element_index < col_view.size(), "invalid element_index value"); - T result; - CUDF_CUDA_TRY(cudaMemcpyAsync( - &result, col_view.data() + element_index, sizeof(T), cudaMemcpyDefault, stream.value())); - stream.synchronize(); - return result; + return cudf::detail::make_host_vector_sync( + device_span{col_view.data() + element_index, 1}, stream) + .front(); } } // namespace detail diff --git a/cpp/include/cudf/table/table_device_view.cuh b/cpp/include/cudf/table/table_device_view.cuh index 16d532ea2b8..4f6238b5fe7 100644 --- a/cpp/include/cudf/table/table_device_view.cuh +++ b/cpp/include/cudf/table/table_device_view.cuh @@ -16,6 +16,8 @@ #pragma once #include +#include +#include #include #include #include @@ -251,7 +253,7 @@ auto contiguous_copy_column_device_views(HostTableView source_view, rmm::cuda_st // A buffer of CPU memory is allocated to hold the ColumnDeviceView // objects. Once filled, the CPU memory is then copied to device memory // and the pointer is set in the d_columns member. - std::vector h_buffer(padded_views_size_bytes); + auto h_buffer = cudf::detail::make_host_vector(padded_views_size_bytes, stream); // Each ColumnDeviceView instance may have child objects which may // require setting some internal device pointers before being copied // from CPU to device. 
@@ -266,8 +268,10 @@ auto contiguous_copy_column_device_views(HostTableView source_view, rmm::cuda_st
   auto d_columns = detail::child_columns_to_device_array(
     source_view.begin(), source_view.end(), h_ptr, d_ptr);
 
-  CUDF_CUDA_TRY(cudaMemcpyAsync(d_ptr, h_ptr, views_size_bytes, cudaMemcpyDefault, stream.value()));
-  stream.synchronize();
+  auto const h_span = host_span{h_buffer}.subspan(
+    static_cast(h_ptr) - h_buffer.data(), views_size_bytes);
+  auto const d_span = device_span{static_cast(d_ptr), views_size_bytes};
+  cudf::detail::cuda_memcpy(d_span, h_span, stream);
 
   return std::make_tuple(std::move(descendant_storage), d_columns);
 }
 
From 63c5a384f29050ee50e4a2ab0681fceeab5cd3ec Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Wed, 11 Dec 2024 17:32:29 -0500
Subject: [PATCH 61/78] Fix some possible thread-id overflow calculations (#17473)

Fixes some thread-id calculations and usages that may overflow the
`int32` or `size_type` types.
Reference #10368

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Vukasin Milovanovic (https://github.com/vuule)
  - Shruti Shivakumar (https://github.com/shrshi)

URL: https://github.com/rapidsai/cudf/pull/17473
---
 cpp/include/cudf/detail/copy_if_else.cuh         | 11 ++++++-----
 cpp/src/partitioning/partitioning.cu             |  7 ++++---
 cpp/src/quantiles/tdigest/tdigest_aggregation.cu |  2 +-
 cpp/src/transform/jit/kernel.cu                  |  5 +++--
 cpp/src/transform/row_bit_count.cu               |  2 +-
 5 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/cpp/include/cudf/detail/copy_if_else.cuh b/cpp/include/cudf/detail/copy_if_else.cuh
index 5dc75b1a3fb..a7efb4e6e93 100644
--- a/cpp/include/cudf/detail/copy_if_else.cuh
+++ b/cpp/include/cudf/detail/copy_if_else.cuh
@@ -44,10 +44,11 @@ __launch_bounds__(block_size) CUDF_KERNEL
   mutable_column_device_view out,
   size_type* __restrict__ const valid_count)
 {
-  auto tidx = cudf::detail::grid_1d::global_thread_id();
-  auto const stride = cudf::detail::grid_1d::grid_stride();
-  int const warp_id = tidx / cudf::detail::warp_size;
-  size_type const warps_per_grid = gridDim.x * block_size / cudf::detail::warp_size;
+  auto tidx = cudf::detail::grid_1d::global_thread_id();
+
+  auto const stride = cudf::detail::grid_1d::grid_stride();
+  auto const warp_id = tidx / cudf::detail::warp_size;
+  auto const warps_per_grid = stride / cudf::detail::warp_size;
 
   // begin/end indices for the column data
   size_type const begin = 0;
@@ -60,7 +61,7 @@ __launch_bounds__(block_size) CUDF_KERNEL
 
   // lane id within the current warp
   constexpr size_type leader_lane{0};
-  int const lane_id = threadIdx.x % cudf::detail::warp_size;
+  auto const lane_id = threadIdx.x % cudf::detail::warp_size;
 
   size_type warp_valid_count{0};
 
diff --git a/cpp/src/partitioning/partitioning.cu b/cpp/src/partitioning/partitioning.cu
index ebab3beb08f..d6b85db3f0f 100644
--- a/cpp/src/partitioning/partitioning.cu
+++ b/cpp/src/partitioning/partitioning.cu
@@ -138,7 +138,7 @@ CUDF_KERNEL void compute_row_partition_numbers(row_hasher_t the_hasher,
   auto const stride = cudf::detail::grid_1d::grid_stride();
 
   // Initialize local histogram
-  size_type partition_number = threadIdx.x;
+  thread_index_type partition_number = threadIdx.x;
   while (partition_number < num_partitions) {
     shared_partition_sizes[partition_number] = 0;
     partition_number += blockDim.x;
@@ -207,7 +207,7 @@ CUDF_KERNEL void compute_row_output_locations(size_type* __restrict__ row_partit
   extern __shared__ size_type shared_partition_offsets[];
 
   // Initialize array of this
blocks offsets from global array
-  size_type partition_number = threadIdx.x;
+  thread_index_type partition_number = threadIdx.x;
   while (partition_number < num_partitions) {
     shared_partition_offsets[partition_number] =
       block_partition_offsets[partition_number * gridDim.x + blockIdx.x];
@@ -303,7 +303,8 @@ CUDF_KERNEL void copy_block_partitions(InputIter input_iter,
   // Fetch the offset in the output buffer of each partition in this thread
   // block
-  for (size_type ipartition = threadIdx.x; ipartition < num_partitions; ipartition += blockDim.x) {
+  for (thread_index_type ipartition = threadIdx.x; ipartition < num_partitions;
+       ipartition += blockDim.x) {
     partition_offset_global[ipartition] =
       scanned_block_partition_sizes[ipartition * gridDim.x + blockIdx.x];
   }
diff --git a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu
index d27420658d6..2128bacff80 100644
--- a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu
+++ b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu
@@ -385,7 +385,7 @@ CUDF_KERNEL void generate_cluster_limits_kernel(int delta,
                                                 size_type const* group_cluster_offsets,
                                                 bool has_nulls)
 {
-  int const tid = threadIdx.x + blockIdx.x * blockDim.x;
+  auto const tid = cudf::detail::grid_1d::global_thread_id();
 
   auto const group_index = tid;
   if (group_index >= num_groups) { return; }
diff --git a/cpp/src/transform/jit/kernel.cu b/cpp/src/transform/jit/kernel.cu
index 4fd0369c26b..9d96c11c3f2 100644
--- a/cpp/src/transform/jit/kernel.cu
+++ b/cpp/src/transform/jit/kernel.cu
@@ -38,8 +38,9 @@ CUDF_KERNEL void kernel(cudf::size_type size, TypeOut* out_data, TypeIn* in_data
 {
   // cannot use global_thread_id utility due to a JIT build issue by including
   // the `cudf/detail/utilities/cuda.cuh` header
-  thread_index_type const start = threadIdx.x + blockIdx.x * blockDim.x;
-  thread_index_type const stride = blockDim.x * gridDim.x;
+  auto const block_size = static_cast(blockDim.x);
+  thread_index_type const start = threadIdx.x + blockIdx.x * block_size;
+  thread_index_type const stride = block_size * gridDim.x;
 
   for (auto i = start; i < static_cast(size); i += stride) {
     GENERIC_UNARY_OP(&out_data[i], in_data[i]);
diff --git a/cpp/src/transform/row_bit_count.cu b/cpp/src/transform/row_bit_count.cu
index 66bbe532e46..39c11295fbd 100644
--- a/cpp/src/transform/row_bit_count.cu
+++ b/cpp/src/transform/row_bit_count.cu
@@ -413,7 +413,7 @@ CUDF_KERNEL void compute_segment_sizes(device_span col
                                        size_type max_branch_depth)
 {
   extern __shared__ row_span thread_branch_stacks[];
-  int const tid = threadIdx.x + blockIdx.x * blockDim.x;
+  auto const tid = static_cast(cudf::detail::grid_1d::global_thread_id());
 
   auto const num_segments = static_cast(output.size());
   if (tid >= num_segments) { return; }

From 32548b074bc0350186906c223980acac142ba5a2 Mon Sep 17 00:00:00 2001
From: Nghia Truong <7416935+ttnghia@users.noreply.github.com>
Date: Wed, 11 Dec 2024 16:34:28 -0800
Subject: [PATCH 62/78] Expose Scalar's constructor and `Scalar#getScalarHandle()` to public (#17580)

This exposes the constructor and the `getScalarHandle()` method in
`Scalar.java` to the public, allowing them to be called from outside the
package. Without access to these methods, wrapping or unwrapping a native
scalar handle from downstream code was very inconvenient. A workaround has
been implemented
([spark-rapids-jni/CudfAccessor.java](https://github.com/NVIDIA/spark-rapids-jni/blob/5231d4d82603d488b95ea259874a26f9f4354005/src/main/java/ai/rapids/cudf/CudfAccessor.java#L21))
to overcome this, but it is better to address the issue at its root.
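With these members public, downstream JNI code can wrap and unwrap native
scalars directly. A minimal sketch of the intended usage; the native producer
function `nativeComputeScalar()` here is hypothetical and stands in for
whatever native code allocates the scalar:

```java
import ai.rapids.cudf.DType;
import ai.rapids.cudf.Scalar;

class ScalarHandleExample {
  // Hypothetical JNI entry point: returns the address of a freshly
  // allocated libcudf scalar as a long.
  private static native long nativeComputeScalar();

  static void demo() {
    long handle = nativeComputeScalar();
    // Wrap the raw handle; the Scalar takes ownership and releases the
    // native memory when closed.
    try (Scalar s = new Scalar(DType.INT32, handle)) {
      long addr = s.getScalarHandle(); // pass back into other native calls
      System.out.println("scalar handle: " + addr);
    }
  }
}
```

The try-with-resources pattern matches how `Scalar` is used elsewhere in the
Java bindings, since the constructor registers the handle with `MemoryCleaner`
and `close()` decrements the reference count.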
Partially contributes to https://github.com/NVIDIA/spark-rapids-jni/issues/1307. Authors: - Nghia Truong (https://github.com/ttnghia) Approvers: - Jason Lowe (https://github.com/jlowe) URL: https://github.com/rapidsai/cudf/pull/17580 --- java/src/main/java/ai/rapids/cudf/Scalar.java | 21 ++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/Scalar.java b/java/src/main/java/ai/rapids/cudf/Scalar.java index 286b5c208c9..f3155bc5860 100644 --- a/java/src/main/java/ai/rapids/cudf/Scalar.java +++ b/java/src/main/java/ai/rapids/cudf/Scalar.java @@ -521,13 +521,28 @@ private static ColumnVector buildNullColumnVector(HostColumnVector.DataType host private static native long makeStructScalar(long[] viewHandles, boolean isValid); private static native long repeatString(long scalarHandle, int repeatTimes); - Scalar(DType type, long scalarHandle) { + /** + * Constructor to create a scalar from a native handle and a type. + * + * @param type The type of the scalar + * @param scalarHandle The native handle (pointer address) to the scalar data + */ + public Scalar(DType type, long scalarHandle) { this.type = type; this.offHeap = new OffHeapState(scalarHandle); MemoryCleaner.register(this, offHeap); incRefCount(); } + /** + * Get the native handle (native pointer address) for the scalar. + * + * @return The native handle + */ + public long getScalarHandle() { + return offHeap.scalarHandle; + } + /** * Increment the reference count for this scalar. You need to call close on this * to decrement the reference count again. @@ -542,10 +557,6 @@ public synchronized Scalar incRefCount() { return this; } - long getScalarHandle() { - return offHeap.scalarHandle; - } - /** * Free the memory associated with a scalar. */ From 78e5c0d6c5a5c876421d1ab2308b14f8c7ecb9f7 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Wed, 11 Dec 2024 17:53:36 -0800 Subject: [PATCH 63/78] Use batched memcpy when writing ORC statistics (#17572) This PR replaces a set of per-column, per-rowgroup D2D memcopies with a single call to the `batched_memcpy_async` utility. Should improve performance when writing wide tables. Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Muhammad Haseeb (https://github.com/mhaseeb123) - Yunsong Wang (https://github.com/PointKernel) URL: https://github.com/rapidsai/cudf/pull/17572 --- cpp/src/io/orc/writer_impl.cu | 36 ++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index 0906017ee61..8e532b01788 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -28,6 +28,7 @@ #include #include +#include #include #include #include @@ -1386,29 +1387,34 @@ encoded_footer_statistics finish_statistic_blobs(Footer const& footer, // we know the size of each array. The number of stripes per column in a chunk array can // be calculated by dividing the number of chunks by the number of columns. // That many chunks need to be copied at a time to the proper destination. 
- size_t num_entries_seen = 0; + size_t num_entries_seen = 0; + auto const num_buffers_to_copy = per_chunk_stats.stripe_stat_chunks.size() * num_columns * 2; + auto h_srcs = cudf::detail::make_empty_host_vector(num_buffers_to_copy, stream); + auto h_dsts = cudf::detail::make_empty_host_vector(num_buffers_to_copy, stream); + auto h_lens = cudf::detail::make_empty_host_vector(num_buffers_to_copy, stream); + for (size_t i = 0; i < per_chunk_stats.stripe_stat_chunks.size(); ++i) { auto const stripes_per_col = per_chunk_stats.stripe_stat_chunks[i].size() / num_columns; - auto const chunk_bytes = stripes_per_col * sizeof(statistics_chunk); - auto const merge_bytes = stripes_per_col * sizeof(statistics_merge_group); for (size_t col = 0; col < num_columns; ++col) { - CUDF_CUDA_TRY( - cudaMemcpyAsync(stat_chunks.data() + (num_stripes * col) + num_entries_seen, - per_chunk_stats.stripe_stat_chunks[i].data() + col * stripes_per_col, - chunk_bytes, - cudaMemcpyDefault, - stream.value())); - CUDF_CUDA_TRY( - cudaMemcpyAsync(stats_merge.device_ptr() + (num_stripes * col) + num_entries_seen, - per_chunk_stats.stripe_stat_merge[i].device_ptr() + col * stripes_per_col, - merge_bytes, - cudaMemcpyDefault, - stream.value())); + h_srcs.push_back(per_chunk_stats.stripe_stat_chunks[i].data() + col * stripes_per_col); + h_dsts.push_back(stat_chunks.data() + (num_stripes * col) + num_entries_seen); + h_lens.push_back(stripes_per_col * sizeof(statistics_chunk)); + + h_srcs.push_back(per_chunk_stats.stripe_stat_merge[i].device_ptr() + col * stripes_per_col); + h_dsts.push_back(stats_merge.device_ptr() + (num_stripes * col) + num_entries_seen); + h_lens.push_back(stripes_per_col * sizeof(statistics_merge_group)); } num_entries_seen += stripes_per_col; } + auto const& mr = cudf::get_current_device_resource_ref(); + auto const d_srcs = cudf::detail::make_device_uvector_async(h_srcs, stream, mr); + auto const d_dsts = cudf::detail::make_device_uvector_async(h_dsts, stream, mr); + auto const d_lens = cudf::detail::make_device_uvector_async(h_lens, stream, mr); + cudf::detail::batched_memcpy_async( + d_srcs.begin(), d_dsts.begin(), d_lens.begin(), d_srcs.size(), stream); + auto file_stats_merge = cudf::detail::make_host_vector(num_file_blobs, stream); for (auto i = 0u; i < num_file_blobs; ++i) { From 00ed1f27df491d11c82d4990c979b0c2783c5881 Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Wed, 11 Dec 2024 23:41:11 -0600 Subject: [PATCH 64/78] Remove unused code of json schema in JSON reader (#17581) Remove dead code in json reader Authors: - Karthikeyan (https://github.com/karthikeyann) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Muhammad Haseeb (https://github.com/mhaseeb123) URL: https://github.com/rapidsai/cudf/pull/17581 --- cpp/src/io/json/nested_json.hpp | 11 --- cpp/src/io/json/parser_features.cpp | 116 ---------------------------- 2 files changed, 127 deletions(-) diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index 2f6942fe139..cc5f256ea80 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -464,17 +464,6 @@ std::unique_ptr make_all_nulls_column(schema_element const& schema, */ column_name_info make_column_name_info(schema_element const& schema, std::string const& col_name); -/** - * @brief Get the path data type of a column by path if present in input schema - * - * @param path path of the column - * @param options json reader options which holds schema - * @return data type of the 
column if present - */ -std::optional get_path_data_type( - host_span const> path, - cudf::io::json_reader_options const& options); - /** * @brief Helper class to get path of a column by column id from reduced column tree * diff --git a/cpp/src/io/json/parser_features.cpp b/cpp/src/io/json/parser_features.cpp index 2da320b2af3..4b4827ca8d9 100644 --- a/cpp/src/io/json/parser_features.cpp +++ b/cpp/src/io/json/parser_features.cpp @@ -68,78 +68,6 @@ void json_reader_options::set_dtypes(schema_element types) } // namespace cudf::io namespace cudf::io::json::detail { -namespace { - -// example schema and its path. -// "a": int {"a", int} -// "a": [ int ] {"a", list}, {"element", int} -// "a": { "b": int} {"a", struct}, {"b", int} -// "a": [ {"b": int }] {"a", list}, {"element", struct}, {"b", int} -// "a": [ null] {"a", list}, {"element", str} -// back() is root. -// front() is leaf. -/** - * @brief Get the path data type of a column by path if present in input schema - * - * @param path path of the json column - * @param root root of input schema element - * @return data type of the column if present, otherwise std::nullopt - */ -std::optional get_path_data_type( - host_span const> path, schema_element const& root) -{ - if (path.empty() || path.size() == 1) { - return root.type; - } else { - if (path.back().second == NC_STRUCT && root.type.id() == type_id::STRUCT) { - auto const child_name = path.first(path.size() - 1).back().first; - auto const child_schema_it = root.child_types.find(child_name); - return (child_schema_it != std::end(root.child_types)) - ? get_path_data_type(path.first(path.size() - 1), child_schema_it->second) - : std::optional{}; - } else if (path.back().second == NC_LIST && root.type.id() == type_id::LIST) { - auto const child_schema_it = root.child_types.find(list_child_name); - return (child_schema_it != std::end(root.child_types)) - ? get_path_data_type(path.first(path.size() - 1), child_schema_it->second) - : std::optional{}; - } - return std::optional{}; - } -} - -std::optional child_schema_element(std::string const& col_name, - cudf::io::json_reader_options const& options) -{ - return std::visit( - cudf::detail::visitor_overload{ - [col_name](std::vector const& user_dtypes) -> std::optional { - auto column_index = atol(col_name.data()); - return (static_cast(column_index) < user_dtypes.size()) - ? std::optional{{user_dtypes[column_index]}} - : std::optional{}; - }, - [col_name]( - std::map const& user_dtypes) -> std::optional { - return (user_dtypes.find(col_name) != std::end(user_dtypes)) - ? std::optional{{user_dtypes.find(col_name)->second}} - : std::optional{}; - }, - [col_name]( - std::map const& user_dtypes) -> std::optional { - return (user_dtypes.find(col_name) != std::end(user_dtypes)) - ? user_dtypes.find(col_name)->second - : std::optional{}; - }, - [col_name](schema_element const& user_dtypes) -> std::optional { - return (user_dtypes.child_types.find(col_name) != std::end(user_dtypes.child_types)) - ? 
user_dtypes.child_types.find(col_name)->second
-               : std::optional{};
-      }},
-    options.get_dtypes());
-}
-
-} // namespace
-
 /// Created an empty column of the specified schema
 struct empty_column_functor {
   rmm::cuda_stream_view stream;
@@ -311,48 +239,4 @@ column_name_info make_column_name_info(schema_element const& schema, std::string
   }
   return info;
 }
-
-std::optional get_path_data_type(
-  host_span const> path,
-  cudf::io::json_reader_options const& options)
-{
-  if (path.empty()) return {};
-  std::optional col_schema = child_schema_element(path.back().first, options);
-  // check if it has value, then do recursive call and return.
-  if (col_schema.has_value()) {
-    return get_path_data_type(path, col_schema.value());
-  } else {
-    return {};
-  }
-}
-
-// idea: write a memoizer using template and lambda?, then call recursively.
-std::vector path_from_tree::get_path(NodeIndexT this_col_id)
-{
-  std::vector path;
-  // stops at root.
-  while (this_col_id != parent_node_sentinel) {
-    auto type = column_categories[this_col_id];
-    std::string name = "";
-    // code same as name_and_parent_index lambda.
-    auto parent_col_id = column_parent_ids[this_col_id];
-    if (parent_col_id == parent_node_sentinel || column_categories[parent_col_id] == NC_LIST) {
-      if (is_array_of_arrays && parent_col_id == row_array_parent_col_id) {
-        name = column_names[this_col_id];
-      } else {
-        name = list_child_name;
-      }
-    } else if (column_categories[parent_col_id] == NC_FN) {
-      auto field_name_col_id = parent_col_id;
-      parent_col_id = column_parent_ids[parent_col_id];
-      name = column_names[field_name_col_id];
-    }
-    // "name": type/schema
-    path.emplace_back(name, type);
-    this_col_id = parent_col_id;
-    if (this_col_id == row_array_parent_col_id) return path;
-  }
-  return {};
-}
-
 } // namespace cudf::io::json::detail

From 98d98560ff3f5cdf3c6e72243d914ef87fcd4753 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Thu, 12 Dec 2024 08:34:47 -0500
Subject: [PATCH 65/78] Add anonymous namespace to libcudf test source (#17529)

Uses anonymous namespace declarations on internal-only functions and
structures in the libcudf gtest source.
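As a minimal sketch of the pattern (mirroring the hunks below, e.g. the
`odds_valid` functor in `valid_if_tests.cu`), a file-local test helper moves
from namespace scope into an unnamed namespace:

```cpp
#include <cudf/types.hpp>

// Before: the functor had external linkage, so an identically named helper
// in another test translation unit could clash when device code is compiled
// and linked:
//
// struct odds_valid {
//   __host__ __device__ bool operator()(cudf::size_type i) { return i % 2; }
// };

// After: the unnamed namespace gives the type internal linkage, keeping the
// name private to this translation unit.
namespace {
struct odds_valid {
  __host__ __device__ bool operator()(cudf::size_type i) { return i % 2; }
};
}  // namespace
```
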
This helps prevent odd nvcc compile errors like the one described in #17432 Closes #17432 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/17529 --- cpp/tests/bitmask/set_nullmask_tests.cu | 9 ++------- cpp/tests/bitmask/valid_if_tests.cu | 2 ++ cpp/tests/column/bit_cast_test.cpp | 4 ++++ cpp/tests/column/compound_test.cu | 2 ++ cpp/tests/device_atomics/device_atomics_test.cu | 2 ++ cpp/tests/fixed_point/fixed_point_tests.cpp | 2 ++ cpp/tests/fixed_point/fixed_point_tests.cu | 2 ++ cpp/tests/groupby/tdigest_tests.cu | 4 ++++ cpp/tests/interop/dlpack_test.cpp | 2 ++ cpp/tests/io/json/json_tree.cpp | 4 ++-- cpp/tests/io/json/json_tree_csr.cu | 3 +++ cpp/tests/io/parquet_chunked_reader_test.cu | 10 ++++++++++ cpp/tests/iterator/optional_iterator_test_numeric.cu | 12 ++---------- cpp/tests/iterator/pair_iterator_test_numeric.cu | 12 ++---------- cpp/tests/quantiles/percentile_approx_test.cpp | 2 ++ cpp/tests/reductions/tdigest_tests.cu | 4 +++- cpp/tests/streams/interop_test.cpp | 2 ++ cpp/tests/transform/row_bit_count_test.cu | 2 ++ cpp/tests/wrappers/timestamps_test.cu | 4 ++++ 19 files changed, 54 insertions(+), 30 deletions(-) diff --git a/cpp/tests/bitmask/set_nullmask_tests.cu b/cpp/tests/bitmask/set_nullmask_tests.cu index e95c9fb41c6..9f8d22ea94d 100644 --- a/cpp/tests/bitmask/set_nullmask_tests.cu +++ b/cpp/tests/bitmask/set_nullmask_tests.cu @@ -31,6 +31,7 @@ #include #include +namespace { struct valid_bit_functor { cudf::bitmask_type const* _null_mask; __device__ bool operator()(cudf::size_type element_index) const noexcept @@ -38,13 +39,7 @@ struct valid_bit_functor { return cudf::bit_is_set(_null_mask, element_index); } }; - -std::ostream& operator<<(std::ostream& stream, thrust::host_vector const& bits) -{ - for (auto _bit : bits) - stream << int(_bit); - return stream; -} +} // namespace struct SetBitmaskTest : public cudf::test::BaseFixture { void expect_bitmask_equal(cudf::bitmask_type const* bitmask, // Device Ptr diff --git a/cpp/tests/bitmask/valid_if_tests.cu b/cpp/tests/bitmask/valid_if_tests.cu index 96f122f21a8..8ffcc552ecb 100644 --- a/cpp/tests/bitmask/valid_if_tests.cu +++ b/cpp/tests/bitmask/valid_if_tests.cu @@ -28,6 +28,7 @@ struct ValidIfTest : public cudf::test::BaseFixture {}; +namespace { struct odds_valid { __host__ __device__ bool operator()(cudf::size_type i) { return i % 2; } }; @@ -37,6 +38,7 @@ struct all_valid { struct all_null { __host__ __device__ bool operator()(cudf::size_type i) { return false; } }; +} // namespace TEST_F(ValidIfTest, EmptyRange) { diff --git a/cpp/tests/column/bit_cast_test.cpp b/cpp/tests/column/bit_cast_test.cpp index 5570a7d498c..1f29ea9e5fc 100644 --- a/cpp/tests/column/bit_cast_test.cpp +++ b/cpp/tests/column/bit_cast_test.cpp @@ -25,6 +25,7 @@ #include +namespace { template struct rep_type_impl { using type = void; @@ -47,12 +48,14 @@ struct rep_type_impl()>> { template using rep_type_t = typename rep_type_impl::type; +} // namespace template struct ColumnViewAllTypesTests : public cudf::test::BaseFixture {}; TYPED_TEST_SUITE(ColumnViewAllTypesTests, cudf::test::FixedWidthTypes); +namespace { template void do_bit_cast(cudf::column_view const& column_view, Iterator begin, Iterator end) { @@ -102,6 +105,7 @@ void do_bit_cast(cudf::column_view const& column_view, Iterator begin, Iterator } } } +} // namespace TYPED_TEST(ColumnViewAllTypesTests, BitCast) { diff --git 
a/cpp/tests/column/compound_test.cu b/cpp/tests/column/compound_test.cu index d7e93fb22a3..fff3282fdd5 100644 --- a/cpp/tests/column/compound_test.cu +++ b/cpp/tests/column/compound_test.cu @@ -34,6 +34,7 @@ struct CompoundColumnTest : public cudf::test::BaseFixture {}; +namespace { template struct checker_for_level1 { ColumnDeviceView d_column; @@ -62,6 +63,7 @@ struct checker_for_level2 { return bcheck; } }; +} // namespace TEST_F(CompoundColumnTest, ChildrenLevel1) { diff --git a/cpp/tests/device_atomics/device_atomics_test.cu b/cpp/tests/device_atomics/device_atomics_test.cu index b81f8196d89..2fb24f6b31e 100644 --- a/cpp/tests/device_atomics/device_atomics_test.cu +++ b/cpp/tests/device_atomics/device_atomics_test.cu @@ -31,6 +31,7 @@ #include +namespace { template CUDF_KERNEL void gpu_atomic_test(T* result, T* data, size_t size) { @@ -109,6 +110,7 @@ std::enable_if_t(), T> accumulate(cudf::host_span xs.begin(), xs.end(), ys.begin(), [](T const& ts) { return ts.time_since_epoch().count(); }); return T{typename T::duration{std::accumulate(ys.begin(), ys.end(), 0)}}; } +} // namespace template struct AtomicsTest : public cudf::test::BaseFixture { diff --git a/cpp/tests/fixed_point/fixed_point_tests.cpp b/cpp/tests/fixed_point/fixed_point_tests.cpp index b96c6909e55..f8f8d525043 100644 --- a/cpp/tests/fixed_point/fixed_point_tests.cpp +++ b/cpp/tests/fixed_point/fixed_point_tests.cpp @@ -577,10 +577,12 @@ TEST_F(FixedPointTest, Decimal32FloatVector) float_vector_test(0.15, 20, -2, std::multiplies<>()); } +namespace { struct cast_to_int32_fn { using decimal32 = fixed_point; int32_t __host__ __device__ operator()(decimal32 fp) { return static_cast(fp); } }; +} // namespace TYPED_TEST(FixedPointTestAllReps, FixedPointColumnWrapper) { diff --git a/cpp/tests/fixed_point/fixed_point_tests.cu b/cpp/tests/fixed_point/fixed_point_tests.cu index f34760341d8..ddc48c97012 100644 --- a/cpp/tests/fixed_point/fixed_point_tests.cu +++ b/cpp/tests/fixed_point/fixed_point_tests.cu @@ -72,10 +72,12 @@ TYPED_TEST(FixedPointTestAllReps, DecimalXXThrust) EXPECT_EQ(vec2, vec3); } +namespace { struct cast_to_int32_fn { using decimal32 = fixed_point; int32_t __host__ __device__ operator()(decimal32 fp) { return static_cast(fp); } }; +} // namespace TEST_F(FixedPointTest, DecimalXXThrustOnDevice) { diff --git a/cpp/tests/groupby/tdigest_tests.cu b/cpp/tests/groupby/tdigest_tests.cu index 4ae5d06b214..883a5093bd1 100644 --- a/cpp/tests/groupby/tdigest_tests.cu +++ b/cpp/tests/groupby/tdigest_tests.cu @@ -30,6 +30,7 @@ #include #include +namespace { /** * @brief Functor to generate a tdigest by key. 
* @@ -116,6 +117,7 @@ struct tdigest_groupby_simple_merge_op { return std::move(result.second[0].results[0]); } }; +} // namespace template struct TDigestAllTypes : public cudf::test::BaseFixture {}; @@ -508,6 +510,7 @@ TEST_F(TDigestMergeTest, EmptyGroups) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected, *result.second[0].results[0]); } +namespace { std::unique_ptr do_agg( cudf::column_view key, cudf::column_view val, @@ -537,6 +540,7 @@ std::unique_ptr do_agg( return std::make_unique(std::move(result_columns)); } +} // namespace TEST_F(TDigestMergeTest, AllValuesAreNull) { diff --git a/cpp/tests/interop/dlpack_test.cpp b/cpp/tests/interop/dlpack_test.cpp index ef4b9dd9b8a..b7106e823dd 100644 --- a/cpp/tests/interop/dlpack_test.cpp +++ b/cpp/tests/interop/dlpack_test.cpp @@ -26,6 +26,7 @@ #include +namespace { struct dlpack_deleter { void operator()(DLManagedTensor* tensor) { tensor->deleter(tensor); } }; @@ -60,6 +61,7 @@ void validate_dtype(DLDataType const& dtype) EXPECT_EQ(1, dtype.lanes); EXPECT_EQ(sizeof(T) * 8, dtype.bits); } +} // namespace class DLPackUntypedTests : public cudf::test::BaseFixture {}; diff --git a/cpp/tests/io/json/json_tree.cpp b/cpp/tests/io/json/json_tree.cpp index 887d4fa783f..5201a46ba7d 100644 --- a/cpp/tests/io/json/json_tree.cpp +++ b/cpp/tests/io/json/json_tree.cpp @@ -34,6 +34,8 @@ namespace cuio_json = cudf::io::json; +namespace { + // Host copy of tree_meta_t struct tree_meta_t2 { std::vector node_categories; @@ -43,8 +45,6 @@ struct tree_meta_t2 { std::vector node_range_end; }; -namespace { - tree_meta_t2 to_cpu_tree(cuio_json::tree_meta_t const& d_value, rmm::cuda_stream_view stream) { return {cudf::detail::make_std_vector_async(d_value.node_categories, stream), diff --git a/cpp/tests/io/json/json_tree_csr.cu b/cpp/tests/io/json/json_tree_csr.cu index f988ae24b38..a67830a7864 100644 --- a/cpp/tests/io/json/json_tree_csr.cu +++ b/cpp/tests/io/json/json_tree_csr.cu @@ -36,6 +36,8 @@ namespace cuio_json = cudf::io::json; +namespace { + struct h_tree_meta_t { std::vector node_categories; std::vector parent_node_ids; @@ -222,6 +224,7 @@ void run_test(std::string const& input, bool enable_lines = true) // assert equality between csr and meta formats ASSERT_TRUE(iseq); } +} // namespace struct JsonColumnTreeTests : public cudf::test::BaseFixture {}; diff --git a/cpp/tests/io/parquet_chunked_reader_test.cu b/cpp/tests/io/parquet_chunked_reader_test.cu index 153a8a0c5aa..369376b6c95 100644 --- a/cpp/tests/io/parquet_chunked_reader_test.cu +++ b/cpp/tests/io/parquet_chunked_reader_test.cu @@ -1074,6 +1074,7 @@ TEST_F(ParquetChunkedReaderTest, TestChunkedReadNullCount) } while (reader.has_next()); } +namespace { constexpr size_t input_limit_expected_file_count = 4; std::vector input_limit_get_test_names(std::string const& base_filename) @@ -1133,6 +1134,7 @@ void input_limit_test_read(std::vector const& test_filenames, CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result.first, t); } } +} // namespace struct ParquetChunkedReaderInputLimitConstrainedTest : public cudf::test::BaseFixture {}; @@ -1189,6 +1191,7 @@ TEST_F(ParquetChunkedReaderInputLimitConstrainedTest, MixedColumns) struct ParquetChunkedReaderInputLimitTest : public cudf::test::BaseFixture {}; +namespace { struct offset_gen { int const group_size; __device__ int operator()(int i) { return i * group_size; } @@ -1198,6 +1201,8 @@ template struct value_gen { __device__ T operator()(int i) { return i % 1024; } }; +} // namespace + TEST_F(ParquetChunkedReaderInputLimitTest, List) { auto base_path = 
temp_env->get_temp_filepath("list"); @@ -1263,6 +1268,7 @@ TEST_F(ParquetChunkedReaderInputLimitTest, List) input_limit_test_read(test_filenames, tbl, 32 * 1024 * 1024, 64 * 1024 * 1024, expected_c); } +namespace { void tiny_list_rowgroup_test(bool just_list_col) { auto iter = thrust::make_counting_iterator(0); @@ -1320,6 +1326,7 @@ void tiny_list_rowgroup_test(bool just_list_col) CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *(result.first)); } +} // namespace TEST_F(ParquetChunkedReaderInputLimitTest, TinyListRowGroupsSingle) { @@ -1333,6 +1340,7 @@ TEST_F(ParquetChunkedReaderInputLimitTest, TinyListRowGroupsMixed) tiny_list_rowgroup_test(false); } +namespace { struct char_values { __device__ int8_t operator()(int i) { @@ -1341,6 +1349,8 @@ struct char_values { return index == 0 ? 'a' : (index == 1 ? 'b' : 'c'); } }; +} // namespace + TEST_F(ParquetChunkedReaderInputLimitTest, Mixed) { auto base_path = temp_env->get_temp_filepath("mixed_types"); diff --git a/cpp/tests/iterator/optional_iterator_test_numeric.cu b/cpp/tests/iterator/optional_iterator_test_numeric.cu index 257c0979017..8377060b6ec 100644 --- a/cpp/tests/iterator/optional_iterator_test_numeric.cu +++ b/cpp/tests/iterator/optional_iterator_test_numeric.cu @@ -26,16 +26,6 @@ using TestingTypes = cudf::test::NumericTypes; -namespace cudf { -// To print meanvar for debug. -// Needs to be in the cudf namespace for ADL -template -std::ostream& operator<<(std::ostream& os, cudf::meanvar const& rhs) -{ - return os << "[" << rhs.value << ", " << rhs.value_squared << ", " << rhs.count << "] "; -}; -} // namespace cudf - template struct NumericOptionalIteratorTest : public IteratorTest {}; @@ -46,6 +36,7 @@ TYPED_TEST(NumericOptionalIteratorTest, nonull_optional_iterator) } TYPED_TEST(NumericOptionalIteratorTest, null_optional_iterator) { null_optional_iterator(*this); } +namespace { // Transformers and Operators for optional_iterator test template struct transformer_optional_meanvar { @@ -65,6 +56,7 @@ template struct optional_to_meanvar { CUDF_HOST_DEVICE inline T operator()(cuda::std::optional const& v) { return v.value_or(T{0}); } }; +} // namespace // TODO: enable this test also at __CUDACC_DEBUG__ // This test causes fatal compilation error only at device debug mode. diff --git a/cpp/tests/iterator/pair_iterator_test_numeric.cu b/cpp/tests/iterator/pair_iterator_test_numeric.cu index 3447aa0dde6..5f707232953 100644 --- a/cpp/tests/iterator/pair_iterator_test_numeric.cu +++ b/cpp/tests/iterator/pair_iterator_test_numeric.cu @@ -24,16 +24,6 @@ using TestingTypes = cudf::test::NumericTypes; -namespace cudf { -// To print meanvar for debug. -// Needs to be in the cudf namespace for ADL -template -std::ostream& operator<<(std::ostream& os, cudf::meanvar const& rhs) -{ - return os << "[" << rhs.value << ", " << rhs.value_squared << ", " << rhs.count << "] "; -}; -} // namespace cudf - template struct NumericPairIteratorTest : public IteratorTest {}; @@ -53,6 +43,7 @@ struct transformer_pair_meanvar { }; }; +namespace { struct sum_if_not_null { template CUDF_HOST_DEVICE inline thrust::pair operator()(thrust::pair const& lhs, @@ -66,6 +57,7 @@ struct sum_if_not_null { return {rhs}; } }; +} // namespace // TODO: enable this test also at __CUDACC_DEBUG__ // This test causes fatal compilation error only at device debug mode. 
diff --git a/cpp/tests/quantiles/percentile_approx_test.cpp b/cpp/tests/quantiles/percentile_approx_test.cpp index 37414eb3fba..c146fd2ea4e 100644 --- a/cpp/tests/quantiles/percentile_approx_test.cpp +++ b/cpp/tests/quantiles/percentile_approx_test.cpp @@ -33,6 +33,7 @@ #include +namespace { std::unique_ptr arrow_percentile_approx(cudf::column_view const& _values, int delta, std::vector const& percentages) @@ -315,6 +316,7 @@ cudf::data_type get_appropriate_type() if constexpr (cudf::is_fixed_point()) { return cudf::data_type{cudf::type_to_id(), -7}; } return cudf::data_type{cudf::type_to_id()}; } +} // namespace using PercentileApproxTypes = cudf::test::Concat; diff --git a/cpp/tests/reductions/tdigest_tests.cu b/cpp/tests/reductions/tdigest_tests.cu index c8fec51e1c9..184725e17e0 100644 --- a/cpp/tests/reductions/tdigest_tests.cu +++ b/cpp/tests/reductions/tdigest_tests.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,6 +25,7 @@ template struct ReductionTDigestAllTypes : public cudf::test::BaseFixture {}; TYPED_TEST_SUITE(ReductionTDigestAllTypes, cudf::test::NumericTypes); +namespace { struct reduce_op { std::unique_ptr operator()(cudf::column_view const& values, int delta) const { @@ -60,6 +61,7 @@ struct reduce_merge_op { return cudf::make_structs_column(tbl.num_rows(), std::move(cols), 0, rmm::device_buffer()); } }; +} // namespace TYPED_TEST(ReductionTDigestAllTypes, Simple) { diff --git a/cpp/tests/streams/interop_test.cpp b/cpp/tests/streams/interop_test.cpp index 7133baf6df1..79ea6b7d6d4 100644 --- a/cpp/tests/streams/interop_test.cpp +++ b/cpp/tests/streams/interop_test.cpp @@ -23,9 +23,11 @@ #include +namespace { struct dlpack_deleter { void operator()(DLManagedTensor* tensor) { tensor->deleter(tensor); } }; +} // namespace struct DLPackTest : public cudf::test::BaseFixture {}; diff --git a/cpp/tests/transform/row_bit_count_test.cu b/cpp/tests/transform/row_bit_count_test.cu index 01a042130d6..7e203086fca 100644 --- a/cpp/tests/transform/row_bit_count_test.cu +++ b/cpp/tests/transform/row_bit_count_test.cu @@ -590,6 +590,7 @@ TEST_F(RowBitCount, EmptyChildColumnInListOfLists) cudf::test::fixed_width_column_wrapper{32, 32, 32, 32}); } +namespace { struct sum_functor { cudf::size_type const* s0; cudf::size_type const* s1; @@ -597,6 +598,7 @@ struct sum_functor { cudf::size_type operator() __device__(int i) { return s0[i] + s1[i] + s2[i]; } }; +} // namespace TEST_F(RowBitCount, Table) { diff --git a/cpp/tests/wrappers/timestamps_test.cu b/cpp/tests/wrappers/timestamps_test.cu index 4086c5a91bb..8e5129dfbd2 100644 --- a/cpp/tests/wrappers/timestamps_test.cu +++ b/cpp/tests/wrappers/timestamps_test.cu @@ -37,6 +37,7 @@ #include #include +namespace { template struct ChronoColumnTest : public cudf::test::BaseFixture { cudf::size_type size() { return cudf::size_type(100); } @@ -72,6 +73,7 @@ struct compare_chrono_elements_to_primitive_representation { return primitive == dur.count(); } }; +} // namespace TYPED_TEST_SUITE(ChronoColumnTest, cudf::test::ChronoTypes); @@ -103,6 +105,7 @@ TYPED_TEST(ChronoColumnTest, ChronoDurationsMatchPrimitiveRepresentation) *cudf::column_device_view::create(chrono_col)})); } +namespace { template struct compare_chrono_elements { cudf::binary_operator comp; @@ -129,6 +132,7 @@ struct compare_chrono_elements { } } }; +} // namespace 
TYPED_TEST(ChronoColumnTest, ChronosCanBeComparedInDeviceCode) { From 92652be87839e4a4e49216c49bd36860674bff6a Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 12 Dec 2024 13:17:28 -0800 Subject: [PATCH 66/78] Remove cudf._lib.parquet in favor of inlining pylibcudf (#17562) Contributes to https://github.com/rapidsai/cudf/issues/17317 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/17562 --- python/cudf/cudf/_lib/CMakeLists.txt | 5 +- python/cudf/cudf/_lib/__init__.py | 1 - python/cudf/cudf/_lib/io/CMakeLists.txt | 21 - python/cudf/cudf/_lib/io/__init__.pxd | 0 python/cudf/cudf/_lib/io/__init__.py | 0 python/cudf/cudf/_lib/io/utils.pxd | 31 - python/cudf/cudf/_lib/io/utils.pyx | 74 -- python/cudf/cudf/_lib/parquet.pyx | 817 ------------------- python/cudf/cudf/io/parquet.py | 992 +++++++++++++++++++++--- python/cudf/cudf/tests/test_parquet.py | 72 +- python/cudf/cudf/utils/ioutils.py | 1 - 11 files changed, 941 insertions(+), 1073 deletions(-) delete mode 100644 python/cudf/cudf/_lib/io/CMakeLists.txt delete mode 100644 python/cudf/cudf/_lib/io/__init__.pxd delete mode 100644 python/cudf/cudf/_lib/io/__init__.py delete mode 100644 python/cudf/cudf/_lib/io/utils.pxd delete mode 100644 python/cudf/cudf/_lib/io/utils.pyx delete mode 100644 python/cudf/cudf/_lib/parquet.pyx diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index efe96ff6c3e..f422635d22a 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -13,8 +13,8 @@ # ============================================================================= set(cython_sources - column.pyx copying.pyx csv.pyx groupby.pyx interop.pyx parquet.pyx reduce.pyx scalar.pyx - sort.pyx stream_compaction.pyx string_casting.pyx strings_udf.pyx types.pyx utils.pyx + column.pyx copying.pyx csv.pyx groupby.pyx interop.pyx reduce.pyx scalar.pyx sort.pyx + stream_compaction.pyx string_casting.pyx strings_udf.pyx types.pyx utils.pyx ) set(linked_libraries cudf::cudf) @@ -31,5 +31,4 @@ include(${rapids-cmake-dir}/export/find_package_root.cmake) include(../../../../cpp/cmake/thirdparty/get_nanoarrow.cmake) target_link_libraries(interop PUBLIC nanoarrow) -add_subdirectory(io) add_subdirectory(nvtext) diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py index 52e9b89da7b..cfdcec4cd3b 100644 --- a/python/cudf/cudf/_lib/__init__.py +++ b/python/cudf/cudf/_lib/__init__.py @@ -7,7 +7,6 @@ groupby, interop, nvtext, - parquet, reduce, sort, stream_compaction, diff --git a/python/cudf/cudf/_lib/io/CMakeLists.txt b/python/cudf/cudf/_lib/io/CMakeLists.txt deleted file mode 100644 index e7408cf2852..00000000000 --- a/python/cudf/cudf/_lib/io/CMakeLists.txt +++ /dev/null @@ -1,21 +0,0 @@ -# ============================================================================= -# Copyright (c) 2022-2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. 
See the License for the specific language governing permissions and limitations under -# the License. -# ============================================================================= - -set(cython_sources utils.pyx) -set(linked_libraries cudf::cudf) -rapids_cython_create_modules( - CXX - SOURCE_FILES "${cython_sources}" - LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX io_ ASSOCIATED_TARGETS cudf -) diff --git a/python/cudf/cudf/_lib/io/__init__.pxd b/python/cudf/cudf/_lib/io/__init__.pxd deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/python/cudf/cudf/_lib/io/__init__.py b/python/cudf/cudf/_lib/io/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/python/cudf/cudf/_lib/io/utils.pxd b/python/cudf/cudf/_lib/io/utils.pxd deleted file mode 100644 index 9b8bab012e2..00000000000 --- a/python/cudf/cudf/_lib/io/utils.pxd +++ /dev/null @@ -1,31 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from libcpp.memory cimport unique_ptr -from libcpp.vector cimport vector - -from pylibcudf.libcudf.io.data_sink cimport data_sink -from pylibcudf.libcudf.io.types cimport ( - column_name_info, - sink_info, - source_info, -) - -from cudf._lib.column cimport Column - - -cdef add_df_col_struct_names( - df, - child_names_dict -) -cdef update_col_struct_field_names( - Column col, - child_names -) -cdef update_struct_field_names( - table, - vector[column_name_info]& schema_info -) -cdef Column update_column_struct_field_names( - Column col, - column_name_info& info -) diff --git a/python/cudf/cudf/_lib/io/utils.pyx b/python/cudf/cudf/_lib/io/utils.pyx deleted file mode 100644 index df4675be599..00000000000 --- a/python/cudf/cudf/_lib/io/utils.pyx +++ /dev/null @@ -1,74 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- - -from libcpp.string cimport string - -from libcpp.vector cimport vector - -from pylibcudf.libcudf.io.types cimport column_name_info - -from cudf._lib.column cimport Column - -from cudf.core.dtypes import StructDtype - -cdef add_df_col_struct_names(df, child_names_dict): - for name, child_names in child_names_dict.items(): - col = df._data[name] - - df._data[name] = update_col_struct_field_names(col, child_names) - - -cdef update_col_struct_field_names(Column col, child_names): - if col.children: - children = list(col.children) - for i, (child, names) in enumerate(zip(children, child_names.values())): - children[i] = update_col_struct_field_names( - child, - names - ) - col.set_base_children(tuple(children)) - - if isinstance(col.dtype, StructDtype): - col = col._rename_fields( - child_names.keys() - ) - - return col - - -cdef update_struct_field_names( - table, - vector[column_name_info]& schema_info -): - # Deprecated, remove in favor of add_col_struct_names - # when a reader is ported to pylibcudf - for i, (name, col) in enumerate(table._column_labels_and_values): - table._data[name] = update_column_struct_field_names( - col, schema_info[i] - ) - - -cdef Column update_column_struct_field_names( - Column col, - column_name_info& info -): - cdef vector[string] field_names - - if col.children: - children = list(col.children) - for i, child in enumerate(children): - children[i] = update_column_struct_field_names( - child, - info.children[i] - ) - col.set_base_children(tuple(children)) - - if isinstance(col.dtype, StructDtype): - field_names.reserve(len(col.base_children)) - for i in range(info.children.size()): - field_names.push_back(info.children[i].name) - col = col._rename_fields( - field_names - ) - - return col diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx deleted file mode 100644 index 00c434ae374..00000000000 --- a/python/cudf/cudf/_lib/parquet.pyx +++ /dev/null @@ -1,817 +0,0 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. 
- -import io - -import pyarrow as pa -import itertools -import cudf -from cudf.core.buffer import acquire_spill_lock - -try: - import ujson as json -except ImportError: - import json - -import numpy as np - -from cudf.api.types import is_list_like - -from cudf._lib.utils cimport _data_from_columns, data_from_pylibcudf_io - -from cudf._lib.utils import _index_level_name, generate_pandas_metadata - -from libc.stdint cimport int64_t -from libcpp cimport bool - -from pylibcudf.expressions cimport Expression -from pylibcudf.io.parquet cimport ChunkedParquetReader -from pylibcudf.libcudf.io.types cimport ( - statistics_freq, - compression_type, - dictionary_policy, -) -from pylibcudf.libcudf.types cimport size_type - -from cudf._lib.column cimport Column -from cudf._lib.io.utils cimport ( - add_df_col_struct_names, -) - -import pylibcudf as plc - -from pylibcudf cimport Table - -from cudf.utils.ioutils import _ROW_GROUP_SIZE_BYTES_DEFAULT -from pylibcudf.io.types cimport TableInputMetadata, SinkInfo, ColumnInMetadata -from pylibcudf.io.parquet cimport ParquetChunkedWriter - - -def _parse_metadata(meta): - file_is_range_index = False - file_index_cols = None - file_column_dtype = None - - if 'index_columns' in meta and len(meta['index_columns']) > 0: - file_index_cols = meta['index_columns'] - - if isinstance(file_index_cols[0], dict) and \ - file_index_cols[0]['kind'] == 'range': - file_is_range_index = True - if 'column_indexes' in meta and len(meta['column_indexes']) == 1: - file_column_dtype = meta['column_indexes'][0]["numpy_type"] - return file_is_range_index, file_index_cols, file_column_dtype - - -cdef object _process_metadata(object df, - list names, - dict child_names, - list per_file_user_data, - object row_groups, - object filepaths_or_buffers, - bool allow_range_index, - bool use_pandas_metadata, - size_type nrows=-1, - int64_t skip_rows=0, - ): - - add_df_col_struct_names(df, child_names) - index_col = None - is_range_index = True - column_index_type = None - index_col_names = None - meta = None - for single_file in per_file_user_data: - if b'pandas' not in single_file: - continue - json_str = single_file[b'pandas'].decode('utf-8') - meta = json.loads(json_str) - file_is_range_index, index_col, column_index_type = _parse_metadata(meta) - is_range_index &= file_is_range_index - - if not file_is_range_index and index_col is not None \ - and index_col_names is None: - index_col_names = {} - for idx_col in index_col: - for c in meta['columns']: - if c['field_name'] == idx_col: - index_col_names[idx_col] = c['name'] - - if meta is not None: - # Book keep each column metadata as the order - # of `meta["columns"]` and `column_names` are not - # guaranteed to be deterministic and same always. 
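The `_parse_metadata` helper being deleted above (and re-added in pure Python later in this patch) inspects the "pandas" key-value metadata that pandas and cudf write into parquet footers. A toy sketch of the shape it expects, with illustrative field values only:

    # Roughly the "pandas" footer metadata for a frame with a RangeIndex
    # and one column named "a" (all values here are illustrative).
    meta = {
        "index_columns": [
            {"kind": "range", "name": None, "start": 0, "stop": 3, "step": 1}
        ],
        "column_indexes": [{"numpy_type": "object"}],
        "columns": [{"name": "a", "field_name": "a"}],
    }
    # _parse_metadata (defined in this patch) reports
    # (is_range_index, index_columns, column_dtype):
    assert _parse_metadata(meta) == (True, meta["index_columns"], "object")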
- meta_data_per_column = { - col_meta['name']: col_meta for col_meta in meta["columns"] - } - - # update the decimal precision of each column - for col in names: - if isinstance(df._data[col].dtype, cudf.core.dtypes.DecimalDtype): - df._data[col].dtype.precision = ( - meta_data_per_column[col]["metadata"]["precision"] - ) - - # Set the index column - if index_col is not None and len(index_col) > 0: - if is_range_index: - if not allow_range_index: - return df - - if len(per_file_user_data) > 1: - range_index_meta = { - "kind": "range", - "name": None, - "start": 0, - "stop": len(df), - "step": 1 - } - else: - range_index_meta = index_col[0] - - if row_groups is not None: - per_file_metadata = [ - pa.parquet.read_metadata( - # Pyarrow cannot read directly from bytes - io.BytesIO(s) if isinstance(s, bytes) else s - ) for s in filepaths_or_buffers - ] - - filtered_idx = [] - for i, file_meta in enumerate(per_file_metadata): - row_groups_i = [] - start = 0 - for row_group in range(file_meta.num_row_groups): - stop = start + file_meta.row_group(row_group).num_rows - row_groups_i.append((start, stop)) - start = stop - - for rg in row_groups[i]: - filtered_idx.append( - cudf.RangeIndex( - start=row_groups_i[rg][0], - stop=row_groups_i[rg][1], - step=range_index_meta['step'] - ) - ) - - if len(filtered_idx) > 0: - idx = cudf.concat(filtered_idx) - else: - idx = cudf.Index._from_column(cudf.core.column.column_empty(0)) - else: - start = range_index_meta["start"] + skip_rows - stop = range_index_meta["stop"] - if nrows > -1: - stop = start + nrows - idx = cudf.RangeIndex( - start=start, - stop=stop, - step=range_index_meta['step'], - name=range_index_meta['name'] - ) - - df._index = idx - elif set(index_col).issubset(names): - index_data = df[index_col] - actual_index_names = iter(index_col_names.values()) - if index_data._num_columns == 1: - idx = cudf.Index._from_column( - index_data._columns[0], - name=next(actual_index_names) - ) - else: - idx = cudf.MultiIndex.from_frame( - index_data, - names=list(actual_index_names) - ) - df.drop(columns=index_col, inplace=True) - df._index = idx - else: - if use_pandas_metadata: - df.index.names = index_col - - if df._num_columns == 0 and column_index_type is not None: - df._data.label_dtype = cudf.dtype(column_index_type) - - return df - - -def read_parquet_chunked( - filepaths_or_buffers, - columns=None, - row_groups=None, - use_pandas_metadata=True, - size_t chunk_read_limit=0, - size_t pass_read_limit=1024000000, - size_type nrows=-1, - int64_t skip_rows=0, - allow_mismatched_pq_schemas=False -): - # Note: If this function ever takes accepts filters - # allow_range_index needs to be False when a filter is passed - # (see read_parquet) - allow_range_index = columns is not None and len(columns) != 0 - - options = ( - plc.io.parquet.ParquetReaderOptions.builder( - plc.io.SourceInfo(filepaths_or_buffers) - ) - .use_pandas_metadata(use_pandas_metadata) - .allow_mismatched_pq_schemas(allow_mismatched_pq_schemas) - .build() - ) - if row_groups is not None: - options.set_row_groups(row_groups) - if nrows > -1: - options.set_num_rows(nrows) - if skip_rows != 0: - options.set_skip_rows(skip_rows) - if columns is not None: - options.set_columns(columns) - - reader = ChunkedParquetReader( - options, - chunk_read_limit=chunk_read_limit, - pass_read_limit=pass_read_limit, - ) - - tbl_w_meta = reader.read_chunk() - column_names = tbl_w_meta.column_names(include_children=False) - child_names = tbl_w_meta.child_names - per_file_user_data = tbl_w_meta.per_file_user_data - 
concatenated_columns = tbl_w_meta.tbl.columns() - - # save memory - del tbl_w_meta - - cdef Table tbl - while reader.has_next(): - tbl = reader.read_chunk().tbl - - for i in range(tbl.num_columns()): - concatenated_columns[i] = plc.concatenate.concatenate( - [concatenated_columns[i], tbl._columns[i]] - ) - # Drop residual columns to save memory - tbl._columns[i] = None - - df = cudf.DataFrame._from_data( - *_data_from_columns( - columns=[Column.from_pylibcudf(plc) for plc in concatenated_columns], - column_names=column_names, - index_names=None - ) - ) - df = _process_metadata(df, column_names, child_names, - per_file_user_data, row_groups, - filepaths_or_buffers, - allow_range_index, use_pandas_metadata, - nrows=nrows, skip_rows=skip_rows) - return df - - -cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, - use_pandas_metadata=True, - Expression filters=None, - size_type nrows=-1, - int64_t skip_rows=0, - allow_mismatched_pq_schemas=False): - """ - Cython function to call into libcudf API, see `read_parquet`. - - filters, if not None, should be an Expression that evaluates to a - boolean predicate as a function of columns being read. - - See Also - -------- - cudf.io.parquet.read_parquet - cudf.io.parquet.to_parquet - """ - - allow_range_index = True - if columns is not None and len(columns) == 0 or filters: - allow_range_index = False - - options = ( - plc.io.parquet.ParquetReaderOptions.builder( - plc.io.SourceInfo(filepaths_or_buffers) - ) - .use_pandas_metadata(use_pandas_metadata) - .allow_mismatched_pq_schemas(allow_mismatched_pq_schemas) - .build() - ) - if row_groups is not None: - options.set_row_groups(row_groups) - if nrows > -1: - options.set_num_rows(nrows) - if skip_rows != 0: - options.set_skip_rows(skip_rows) - if columns is not None: - options.set_columns(columns) - if filters is not None: - options.set_filter(filters) - - tbl_w_meta = plc.io.parquet.read_parquet(options) - - df = cudf.DataFrame._from_data( - *data_from_pylibcudf_io(tbl_w_meta) - ) - - df = _process_metadata(df, tbl_w_meta.column_names(include_children=False), - tbl_w_meta.child_names, tbl_w_meta.per_file_user_data, - row_groups, filepaths_or_buffers, - allow_range_index, use_pandas_metadata, - nrows=nrows, skip_rows=skip_rows) - return df - -cpdef read_parquet_metadata(list filepaths_or_buffers): - """ - Cython function to call into libcudf API, see `read_parquet_metadata`. 
- - See Also - -------- - cudf.io.parquet.read_parquet - cudf.io.parquet.to_parquet - """ - parquet_metadata = plc.io.parquet_metadata.read_parquet_metadata( - plc.io.SourceInfo(filepaths_or_buffers) - ) - - # read all column names including index column, if any - col_names = [info.name() for info in parquet_metadata.schema().root().children()] - - index_col_names = set() - json_str = parquet_metadata.metadata()['pandas'] - if json_str != "": - meta = json.loads(json_str) - file_is_range_index, index_col, _ = _parse_metadata(meta) - if ( - not file_is_range_index - and index_col is not None - ): - columns = meta['columns'] - for idx_col in index_col: - for c in columns: - if c['field_name'] == idx_col: - index_col_names.add(idx_col) - - # remove the index column from the list of column names - # only if index_col_names is not None - if len(index_col_names) >= 0: - col_names = [name for name in col_names if name not in index_col_names] - - return ( - parquet_metadata.num_rows(), - parquet_metadata.num_rowgroups(), - col_names, - len(col_names), - parquet_metadata.rowgroup_metadata() - ) - - -@acquire_spill_lock() -def write_parquet( - table, - object filepaths_or_buffers, - object index=None, - object compression="snappy", - object statistics="ROWGROUP", - object metadata_file_path=None, - object int96_timestamps=False, - object row_group_size_bytes=None, - object row_group_size_rows=None, - object max_page_size_bytes=None, - object max_page_size_rows=None, - object max_dictionary_size=None, - object partitions_info=None, - object force_nullable_schema=False, - header_version="1.0", - use_dictionary=True, - object skip_compression=None, - object column_encoding=None, - object column_type_length=None, - object output_as_binary=None, - write_arrow_schema=False, -): - """ - Cython function to call into libcudf API, see `write_parquet`. - - See Also - -------- - cudf.io.parquet.write_parquet - """ - if index is True or ( - index is None and not isinstance(table._index, cudf.RangeIndex) - ): - columns = [*table.index._columns, *table._columns] - plc_table = plc.Table([col.to_pylibcudf(mode="read") for col in columns]) - tbl_meta = TableInputMetadata(plc_table) - for level, idx_name in enumerate(table._index.names): - tbl_meta.column_metadata[level].set_name( - _index_level_name(idx_name, level, table._column_names) - ) - num_index_cols_meta = len(table._index.names) - else: - plc_table = plc.Table( - [col.to_pylibcudf(mode="read") for col in table._columns] - ) - tbl_meta = TableInputMetadata(plc_table) - num_index_cols_meta = 0 - - for i, name in enumerate(table._column_names, num_index_cols_meta): - if not isinstance(name, str): - if cudf.get_option("mode.pandas_compatible"): - tbl_meta.column_metadata[i].set_name(str(name)) - else: - raise ValueError( - "Writing a Parquet file requires string column names" - ) - else: - tbl_meta.column_metadata[i].set_name(name) - - _set_col_metadata( - table[name]._column, - tbl_meta.column_metadata[i], - force_nullable_schema, - None, - skip_compression, - column_encoding, - column_type_length, - output_as_binary - ) - if partitions_info is not None: - user_data = [ - {"pandas": generate_pandas_metadata( - table.iloc[start_row:start_row + num_row].copy(deep=False), - index - )} - for start_row, num_row in partitions_info - ] - else: - user_data = [{"pandas": generate_pandas_metadata(table, index)}] - - if header_version not in ("1.0", "2.0"): - raise ValueError( - f"Invalid parquet header version: {header_version}. 
" - "Valid values are '1.0' and '2.0'" - ) - - dict_policy = ( - plc.io.types.DictionaryPolicy.ADAPTIVE - if use_dictionary - else plc.io.types.DictionaryPolicy.NEVER - ) - - comp_type = _get_comp_type(compression) - stat_freq = _get_stat_freq(statistics) - options = ( - plc.io.parquet.ParquetWriterOptions.builder( - plc.io.SinkInfo(filepaths_or_buffers), plc_table - ) - .metadata(tbl_meta) - .key_value_metadata(user_data) - .compression(comp_type) - .stats_level(stat_freq) - .int96_timestamps(int96_timestamps) - .write_v2_headers(header_version == "2.0") - .dictionary_policy(dict_policy) - .utc_timestamps(False) - .write_arrow_schema(write_arrow_schema) - .build() - ) - if partitions_info is not None: - options.set_partitions( - [plc.io.types.PartitionInfo(part[0], part[1]) for part in partitions_info] - ) - if metadata_file_path is not None: - if is_list_like(metadata_file_path): - options.set_column_chunks_file_paths(metadata_file_path) - else: - options.set_column_chunks_file_paths([metadata_file_path]) - if row_group_size_bytes is not None: - options.set_row_group_size_bytes(row_group_size_bytes) - if row_group_size_rows is not None: - options.set_row_group_size_rows(row_group_size_rows) - if max_page_size_bytes is not None: - options.set_max_page_size_bytes(max_page_size_bytes) - if max_page_size_rows is not None: - options.set_max_page_size_rows(max_page_size_rows) - if max_dictionary_size is not None: - options.set_max_dictionary_size(max_dictionary_size) - blob = plc.io.parquet.write_parquet(options) - if metadata_file_path is not None: - return np.asarray(blob.obj) - else: - return None - - -cdef class ParquetWriter: - """ - ParquetWriter lets you incrementally write out a Parquet file from a series - of cudf tables - - Parameters - ---------- - filepath_or_buffer : str, io.IOBase, os.PathLike, or list - File path or buffer to write to. The argument may also correspond - to a list of file paths or buffers. - index : bool or None, default None - If ``True``, include a dataframe's index(es) in the file output. - If ``False``, they will not be written to the file. If ``None``, - index(es) other than RangeIndex will be saved as columns. - compression : {'snappy', None}, default 'snappy' - Name of the compression to use. Use ``None`` for no compression. - statistics : {'ROWGROUP', 'PAGE', 'COLUMN', 'NONE'}, default 'ROWGROUP' - Level at which column statistics should be included in file. - row_group_size_bytes: int, default ``uint64 max`` - Maximum size of each stripe of the output. - By default, a virtually infinite size equal to ``uint64 max`` will be used. - row_group_size_rows: int, default 1000000 - Maximum number of rows of each stripe of the output. - By default, 1000000 (10^6 rows) will be used. - max_page_size_bytes: int, default 524288 - Maximum uncompressed size of each page of the output. - By default, 524288 (512KB) will be used. - max_page_size_rows: int, default 20000 - Maximum number of rows of each page of the output. - By default, 20000 will be used. - max_dictionary_size: int, default 1048576 - Maximum size of the dictionary page for each output column chunk. Dictionary - encoding for column chunks that exceeds this limit will be disabled. - By default, 1048576 (1MB) will be used. - use_dictionary : bool, default True - If ``True``, enable dictionary encoding for Parquet page data - subject to ``max_dictionary_size`` constraints. - If ``False``, disable dictionary encoding for Parquet page data. 
- store_schema : bool, default False - If ``True``, enable computing and writing arrow schema to Parquet - file footer's key-value metadata section for faithful round-tripping. - See Also - -------- - cudf.io.parquet.write_parquet - """ - cdef bool initialized - cdef ParquetChunkedWriter writer - cdef SinkInfo sink - cdef TableInputMetadata tbl_meta - cdef str statistics - cdef object compression - cdef object index - cdef size_t row_group_size_bytes - cdef size_type row_group_size_rows - cdef size_t max_page_size_bytes - cdef size_type max_page_size_rows - cdef size_t max_dictionary_size - cdef bool use_dictionary - cdef bool write_arrow_schema - - def __cinit__(self, object filepath_or_buffer, object index=None, - object compression="snappy", str statistics="ROWGROUP", - size_t row_group_size_bytes=_ROW_GROUP_SIZE_BYTES_DEFAULT, - size_type row_group_size_rows=1000000, - size_t max_page_size_bytes=524288, - size_type max_page_size_rows=20000, - size_t max_dictionary_size=1048576, - bool use_dictionary=True, - bool store_schema=False): - filepaths_or_buffers = ( - list(filepath_or_buffer) - if is_list_like(filepath_or_buffer) - else [filepath_or_buffer] - ) - self.sink = plc.io.SinkInfo(filepaths_or_buffers) - self.statistics = statistics - self.compression = compression - self.index = index - self.initialized = False - self.row_group_size_bytes = row_group_size_bytes - self.row_group_size_rows = row_group_size_rows - self.max_page_size_bytes = max_page_size_bytes - self.max_page_size_rows = max_page_size_rows - self.max_dictionary_size = max_dictionary_size - self.use_dictionary = use_dictionary - self.write_arrow_schema = store_schema - - def write_table(self, table, object partitions_info=None): - """ Writes a single table to the file """ - if not self.initialized: - self._initialize_chunked_state( - table, - num_partitions=len(partitions_info) if partitions_info else 1 - ) - if self.index is not False and ( - table._index.name is not None or - isinstance(table._index, cudf.core.multiindex.MultiIndex)): - columns = [*table.index._columns, *table._columns] - plc_table = plc.Table([col.to_pylibcudf(mode="read") for col in columns]) - else: - plc_table = plc.Table( - [col.to_pylibcudf(mode="read") for col in table._columns] - ) - self.writer.write(plc_table, partitions_info) - - def close(self, object metadata_file_path=None): - if not self.initialized: - return None - column_chunks_file_paths=[] - if metadata_file_path is not None: - if is_list_like(metadata_file_path): - column_chunks_file_paths = list(metadata_file_path) - else: - column_chunks_file_paths = [metadata_file_path] - blob = self.writer.close(column_chunks_file_paths) - if metadata_file_path is not None: - return np.asarray(blob.obj) - return None - - def __enter__(self): - return self - - def __exit__(self, *args): - self.close() - - def _initialize_chunked_state(self, table, num_partitions=1): - """ Prepares all the values required to build the - chunked_parquet_writer_options and creates a writer""" - - # Set the table_metadata - num_index_cols_meta = 0 - plc_table = plc.Table( - [ - col.to_pylibcudf(mode="read") - for col in table._columns - ] - ) - self.tbl_meta = TableInputMetadata(plc_table) - if self.index is not False: - if isinstance(table._index, cudf.core.multiindex.MultiIndex): - plc_table = plc.Table( - [ - col.to_pylibcudf(mode="read") - for col in itertools.chain(table.index._columns, table._columns) - ] - ) - self.tbl_meta = TableInputMetadata(plc_table) - for level, idx_name in 
enumerate(table._index.names): - self.tbl_meta.column_metadata[level].set_name(idx_name) - num_index_cols_meta = len(table._index.names) - else: - if table._index.name is not None: - plc_table = plc.Table( - [ - col.to_pylibcudf(mode="read") - for col in itertools.chain( - table.index._columns, table._columns - ) - ] - ) - self.tbl_meta = TableInputMetadata(plc_table) - self.tbl_meta.column_metadata[0].set_name(table._index.name) - num_index_cols_meta = 1 - - for i, name in enumerate(table._column_names, num_index_cols_meta): - self.tbl_meta.column_metadata[i].set_name(name) - _set_col_metadata( - table[name]._column, - self.tbl_meta.column_metadata[i], - ) - - index = ( - False if isinstance(table._index, cudf.RangeIndex) else self.index - ) - user_data = [{"pandas" : generate_pandas_metadata(table, index)}]*num_partitions - cdef compression_type comp_type = _get_comp_type(self.compression) - cdef statistics_freq stat_freq = _get_stat_freq(self.statistics) - cdef dictionary_policy dict_policy = ( - plc.io.types.DictionaryPolicy.ADAPTIVE - if self.use_dictionary - else plc.io.types.DictionaryPolicy.NEVER - ) - options = ( - plc.io.parquet.ChunkedParquetWriterOptions.builder(self.sink) - .metadata(self.tbl_meta) - .key_value_metadata(user_data) - .compression(comp_type) - .stats_level(stat_freq) - .row_group_size_bytes(self.row_group_size_bytes) - .row_group_size_rows(self.row_group_size_rows) - .max_page_size_bytes(self.max_page_size_bytes) - .max_page_size_rows(self.max_page_size_rows) - .max_dictionary_size(self.max_dictionary_size) - .write_arrow_schema(self.write_arrow_schema) - .build() - ) - options.set_dictionary_policy(dict_policy) - self.writer = plc.io.parquet.ParquetChunkedWriter.from_options(options) - self.initialized = True - - -cpdef merge_filemetadata(object filemetadata_list): - """ - Cython function to call into libcudf API, see `merge_row_group_metadata`. - - See Also - -------- - cudf.io.parquet.merge_row_group_metadata - """ - return np.asarray( - plc.io.parquet.merge_row_group_metadata(filemetadata_list).obj - ) - - -cdef statistics_freq _get_stat_freq(str statistics): - result = getattr( - plc.io.types.StatisticsFreq, - f"STATISTICS_{statistics.upper()}", - None - ) - if result is None: - raise ValueError("Unsupported `statistics_freq` type") - return result - - -cdef compression_type _get_comp_type(object compression): - if compression is None: - return plc.io.types.CompressionType.NONE - result = getattr( - plc.io.types.CompressionType, - str(compression).upper(), - None - ) - if result is None: - raise ValueError("Unsupported `compression` type") - return result - - -cdef _set_col_metadata( - Column col, - ColumnInMetadata col_meta, - bool force_nullable_schema=False, - str path=None, - object skip_compression=None, - object column_encoding=None, - object column_type_length=None, - object output_as_binary=None, -): - need_path = (skip_compression is not None or column_encoding is not None or - column_type_length is not None or output_as_binary is not None) - name = col_meta.get_name() if need_path else None - full_path = path + "." + name if path is not None else name - - if force_nullable_schema: - # Only set nullability if `force_nullable_schema` - # is true. 
- col_meta.set_nullability(True) - - if skip_compression is not None and full_path in skip_compression: - col_meta.set_skip_compression(True) - - if column_encoding is not None and full_path in column_encoding: - encoding = column_encoding[full_path] - if encoding is None: - c_encoding = plc.io.types.ColumnEncoding.USE_DEFAULT - else: - enc = str(encoding).upper() - c_encoding = getattr(plc.io.types.ColumnEncoding, enc, None) - if c_encoding is None: - raise ValueError("Unsupported `column_encoding` type") - col_meta.set_encoding(c_encoding) - - if column_type_length is not None and full_path in column_type_length: - col_meta.set_output_as_binary(True) - col_meta.set_type_length(column_type_length[full_path]) - - if output_as_binary is not None and full_path in output_as_binary: - col_meta.set_output_as_binary(True) - - if isinstance(col.dtype, cudf.StructDtype): - for i, (child_col, name) in enumerate( - zip(col.children, list(col.dtype.fields)) - ): - col_meta.child(i).set_name(name) - _set_col_metadata( - child_col, - col_meta.child(i), - force_nullable_schema, - full_path, - skip_compression, - column_encoding, - column_type_length, - output_as_binary - ) - elif isinstance(col.dtype, cudf.ListDtype): - if full_path is not None: - full_path = full_path + ".list" - col_meta.child(1).set_name("element") - _set_col_metadata( - col.children[1], - col_meta.child(1), - force_nullable_schema, - full_path, - skip_compression, - column_encoding, - column_type_length, - output_as_binary - ) - elif isinstance(col.dtype, cudf.core.dtypes.DecimalDtype): - col_meta.set_decimal_precision(col.dtype.precision) diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index 2382e9f12ed..66095d4a155 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -1,6 +1,7 @@ # Copyright (c) 2019-2024, NVIDIA CORPORATION. 
from __future__ import annotations +import io import itertools import math import operator @@ -10,23 +11,42 @@ from collections import defaultdict from contextlib import ExitStack from functools import partial, reduce -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any, Literal from uuid import uuid4 import numpy as np import pandas as pd +import pyarrow as pa from pyarrow import dataset as ds +import pylibcudf as plc + import cudf -from cudf._lib import parquet as libparquet +from cudf._lib.column import Column +from cudf._lib.utils import ( + _data_from_columns, + _index_level_name, + data_from_pylibcudf_io, + generate_pandas_metadata, +) from cudf.api.types import is_list_like +from cudf.core.buffer import acquire_spill_lock from cudf.core.column import as_column, column_empty from cudf.core.column.categorical import CategoricalColumn, as_unsigned_codes from cudf.utils import ioutils from cudf.utils.performance_tracking import _performance_tracking +try: + import ujson as json # type: ignore[import-untyped] +except ImportError: + import json + if TYPE_CHECKING: - from collections.abc import Callable + from collections.abc import Callable, Hashable + + from typing_extensions import Self + + from cudf.core.column import ColumnBase BYTE_SIZES = { @@ -55,31 +75,200 @@ } +@acquire_spill_lock() +def _plc_write_parquet( + table, + filepaths_or_buffers, + index: bool | None = None, + compression: Literal["snappy", "ZSTD", "ZLIB", "LZ4", None] = "snappy", + statistics: Literal["ROWGROUP", "PAGE", "COLUMN", "NONE"] = "ROWGROUP", + metadata_file_path: str | None = None, + int96_timestamps: bool = False, + row_group_size_bytes: int | None = None, + row_group_size_rows: int | None = None, + max_page_size_bytes: int | None = None, + max_page_size_rows: int | None = None, + max_dictionary_size: int | None = None, + partitions_info=None, + force_nullable_schema: bool = False, + header_version: Literal["1.0", "2.0"] = "1.0", + use_dictionary: bool = True, + skip_compression: set[Hashable] | None = None, + column_encoding: dict[ + Hashable, + Literal[ + "PLAIN", + "DICTIONARY", + "DELTA_BINARY_PACKED", + "DELTA_LENGTH_BYTE_ARRAY", + "DELTA_BYTE_ARRAY", + "BYTE_STREAM_SPLIT", + "USE_DEFAULT", + ], + ] + | None = None, + column_type_length: dict | None = None, + output_as_binary: set[Hashable] | None = None, + write_arrow_schema: bool = False, +) -> np.ndarray | None: + """ + Cython function to call into libcudf API, see `write_parquet`. 
+ + See Also + -------- + cudf.io.parquet.write_parquet + """ + if index is True or ( + index is None and not isinstance(table.index, cudf.RangeIndex) + ): + columns = itertools.chain(table.index._columns, table._columns) + plc_table = plc.Table( + [col.to_pylibcudf(mode="read") for col in columns] + ) + tbl_meta = plc.io.types.TableInputMetadata(plc_table) + for level, idx_name in enumerate(table.index.names): + tbl_meta.column_metadata[level].set_name( + _index_level_name(idx_name, level, table._column_names) + ) + num_index_cols_meta = len(table.index.names) + else: + plc_table = plc.Table( + [col.to_pylibcudf(mode="read") for col in table._columns] + ) + tbl_meta = plc.io.types.TableInputMetadata(plc_table) + num_index_cols_meta = 0 + + for i, name in enumerate(table._column_names, num_index_cols_meta): + if not isinstance(name, str): + if cudf.get_option("mode.pandas_compatible"): + tbl_meta.column_metadata[i].set_name(str(name)) + else: + raise ValueError( + "Writing a Parquet file requires string column names" + ) + else: + tbl_meta.column_metadata[i].set_name(name) + + _set_col_metadata( + table[name]._column, + tbl_meta.column_metadata[i], + force_nullable_schema, + None, + skip_compression, + column_encoding, + column_type_length, + output_as_binary, + ) + if partitions_info is not None: + user_data = [ + { + "pandas": generate_pandas_metadata( + table.iloc[start_row : start_row + num_row].copy( + deep=False + ), + index, + ) + } + for start_row, num_row in partitions_info + ] + else: + user_data = [{"pandas": generate_pandas_metadata(table, index)}] + + if header_version not in ("1.0", "2.0"): + raise ValueError( + f"Invalid parquet header version: {header_version}. " + "Valid values are '1.0' and '2.0'" + ) + + dict_policy = ( + plc.io.types.DictionaryPolicy.ADAPTIVE + if use_dictionary + else plc.io.types.DictionaryPolicy.NEVER + ) + + comp_type = _get_comp_type(compression) + stat_freq = _get_stat_freq(statistics) + options = ( + plc.io.parquet.ParquetWriterOptions.builder( + plc.io.SinkInfo(filepaths_or_buffers), plc_table + ) + .metadata(tbl_meta) + .key_value_metadata(user_data) + .compression(comp_type) + .stats_level(stat_freq) + .int96_timestamps(int96_timestamps) + .write_v2_headers(header_version == "2.0") + .dictionary_policy(dict_policy) + .utc_timestamps(False) + .write_arrow_schema(write_arrow_schema) + .build() + ) + if partitions_info is not None: + options.set_partitions( + [ + plc.io.types.PartitionInfo(part[0], part[1]) + for part in partitions_info + ] + ) + if metadata_file_path is not None: + if is_list_like(metadata_file_path): + options.set_column_chunks_file_paths(metadata_file_path) + else: + options.set_column_chunks_file_paths([metadata_file_path]) + if row_group_size_bytes is not None: + options.set_row_group_size_bytes(row_group_size_bytes) + if row_group_size_rows is not None: + options.set_row_group_size_rows(row_group_size_rows) + if max_page_size_bytes is not None: + options.set_max_page_size_bytes(max_page_size_bytes) + if max_page_size_rows is not None: + options.set_max_page_size_rows(max_page_size_rows) + if max_dictionary_size is not None: + options.set_max_dictionary_size(max_dictionary_size) + blob = plc.io.parquet.write_parquet(options) + if metadata_file_path is not None: + return np.asarray(blob.obj) + else: + return None + + @_performance_tracking def _write_parquet( df, paths, - compression="snappy", - index=None, - statistics="ROWGROUP", - metadata_file_path=None, - int96_timestamps=False, - row_group_size_bytes=None, - 
row_group_size_rows=None, - max_page_size_bytes=None, - max_page_size_rows=None, - max_dictionary_size=None, + compression: Literal["snappy", "ZSTD", "ZLIB", "LZ4", None] = "snappy", + index: bool | None = None, + statistics: Literal["ROWGROUP", "PAGE", "COLUMN", "NONE"] = "ROWGROUP", + metadata_file_path: str | None = None, + int96_timestamps: bool = False, + row_group_size_bytes: int | None = None, + row_group_size_rows: int | None = None, + max_page_size_bytes: int | None = None, + max_page_size_rows: int | None = None, + max_dictionary_size: int | None = None, partitions_info=None, storage_options=None, - force_nullable_schema=False, - header_version="1.0", - use_dictionary=True, - skip_compression=None, - column_encoding=None, - column_type_length=None, - output_as_binary=None, - write_arrow_schema=True, -): + force_nullable_schema: bool = False, + header_version: Literal["1.0", "2.0"] = "1.0", + use_dictionary: bool = True, + skip_compression: set[Hashable] | None = None, + column_encoding: dict[ + Hashable, + Literal[ + "PLAIN", + "DICTIONARY", + "DELTA_BINARY_PACKED", + "DELTA_LENGTH_BYTE_ARRAY", + "DELTA_BYTE_ARRAY", + "BYTE_STREAM_SPLIT", + "USE_DEFAULT", + ], + ] + | None = None, + column_type_length: dict | None = None, + output_as_binary: set[Hashable] | None = None, + write_arrow_schema: bool = True, +) -> np.ndarray | None: if is_list_like(paths) and len(paths) > 1: if partitions_info is None: ValueError("partition info is required for multiple paths") @@ -124,11 +313,11 @@ def _write_parquet( file_objs = [ ioutils.get_IOBase_writer(file_obj) for file_obj in fsspec_objs ] - write_parquet_res = libparquet.write_parquet( + write_parquet_res = _plc_write_parquet( df, filepaths_or_buffers=file_objs, **common_args ) else: - write_parquet_res = libparquet.write_parquet( + write_parquet_res = _plc_write_parquet( df, filepaths_or_buffers=paths_or_bufs, **common_args ) @@ -141,26 +330,38 @@ def _write_parquet( def write_to_dataset( df, root_path, - compression="snappy", + compression: Literal["snappy", "ZSTD", "ZLIB", "LZ4", None] = "snappy", filename=None, partition_cols=None, fs=None, - preserve_index=False, - return_metadata=False, - statistics="ROWGROUP", - int96_timestamps=False, - row_group_size_bytes=None, - row_group_size_rows=None, - max_page_size_bytes=None, - max_page_size_rows=None, + preserve_index: bool = False, + return_metadata: bool = False, + statistics: Literal["ROWGROUP", "PAGE", "COLUMN", "NONE"] = "ROWGROUP", + int96_timestamps: bool = False, + row_group_size_bytes: int | None = None, + row_group_size_rows: int | None = None, + max_page_size_bytes: int | None = None, + max_page_size_rows: int | None = None, storage_options=None, - force_nullable_schema=False, - header_version="1.0", - use_dictionary=True, - skip_compression=None, - column_encoding=None, - column_type_length=None, - output_as_binary=None, + force_nullable_schema: bool = False, + header_version: Literal["1.0", "2.0"] = "1.0", + use_dictionary: bool = True, + skip_compression: set[Hashable] | None = None, + column_encoding: dict[ + Hashable, + Literal[ + "PLAIN", + "DICTIONARY", + "DELTA_BINARY_PACKED", + "DELTA_LENGTH_BYTE_ARRAY", + "DELTA_BYTE_ARRAY", + "BYTE_STREAM_SPLIT", + "USE_DEFAULT", + ], + ] + | None = None, + column_type_length: dict | None = None, + output_as_binary: set[Hashable] | None = None, store_schema=False, ): """Wraps `to_parquet` to write partitioned Parquet datasets. 
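As a usage sketch for the `write_to_dataset` signature above: partitioned writes lay data out hive-style under the root path. The frame, output path, and partition column below are illustrative, assuming a local filesystem:

    import cudf
    from cudf.io.parquet import write_to_dataset

    df = cudf.DataFrame({"year": [2023, 2023, 2024], "val": [1.0, 2.0, 3.0]})
    # Produces dataset_root/year=2023/... and dataset_root/year=2024/...
    write_to_dataset(df, "dataset_root", partition_cols=["year"])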
@@ -330,9 +531,29 @@ def write_to_dataset( return metadata +def _parse_metadata(meta) -> tuple[bool, Any, Any]: + file_is_range_index = False + file_index_cols = None + file_column_dtype = None + + if "index_columns" in meta and len(meta["index_columns"]) > 0: + file_index_cols = meta["index_columns"] + + if ( + isinstance(file_index_cols[0], dict) + and file_index_cols[0]["kind"] == "range" + ): + file_is_range_index = True + if "column_indexes" in meta and len(meta["column_indexes"]) == 1: + file_column_dtype = meta["column_indexes"][0]["numpy_type"] + return file_is_range_index, file_index_cols, file_column_dtype + + @ioutils.doc_read_parquet_metadata() @_performance_tracking -def read_parquet_metadata(filepath_or_buffer): +def read_parquet_metadata( + filepath_or_buffer, +) -> tuple[int, int, list[Hashable], int, list[dict[str, int]]]: """{docstring}""" # List of filepaths or buffers @@ -341,7 +562,39 @@ def read_parquet_metadata(filepath_or_buffer): bytes_per_thread=None, ) - return libparquet.read_parquet_metadata(filepaths_or_buffers) + parquet_metadata = plc.io.parquet_metadata.read_parquet_metadata( + plc.io.SourceInfo(filepaths_or_buffers) + ) + + # read all column names including index column, if any + col_names = [ + info.name() for info in parquet_metadata.schema().root().children() + ] + + index_col_names = set() + json_str = parquet_metadata.metadata()["pandas"] + if json_str != "": + meta = json.loads(json_str) + file_is_range_index, index_col, _ = _parse_metadata(meta) + if not file_is_range_index and index_col is not None: + columns = meta["columns"] + for idx_col in index_col: + for c in columns: + if c["field_name"] == idx_col: + index_col_names.add(idx_col) + + # remove the index column from the list of column names + # only if index_col_names is not None + if len(index_col_names) >= 0: + col_names = [name for name in col_names if name not in index_col_names] + + return ( + parquet_metadata.num_rows(), + parquet_metadata.num_rowgroups(), + col_names, + len(col_names), + parquet_metadata.rowgroup_metadata(), + ) @_performance_tracking @@ -913,16 +1166,18 @@ def _read_parquet( columns=None, row_groups=None, use_pandas_metadata=None, - nrows=None, - skip_rows=None, - allow_mismatched_pq_schemas=False, + nrows: int | None = None, + skip_rows: int | None = None, + allow_mismatched_pq_schemas: bool = False, *args, **kwargs, -): +) -> cudf.DataFrame: # Simple helper function to dispatch between # cudf and pyarrow to read parquet data if engine == "cudf": - if kwargs: + if set(kwargs.keys()).difference( + set(("_chunk_read_limit", "_pass_read_limit")) + ): raise ValueError( "cudf engine doesn't support the " f"following keyword arguments: {list(kwargs.keys())}" @@ -932,30 +1187,123 @@ def _read_parquet( "cudf engine doesn't support the " f"following positional arguments: {list(args)}" ) + if nrows is None: + nrows = -1 + if skip_rows is None: + skip_rows = 0 if cudf.get_option("io.parquet.low_memory"): - return libparquet.read_parquet_chunked( + # Note: If this function ever takes accepts filters + # allow_range_index needs to be False when a filter is passed + # (see read_parquet) + allow_range_index = columns is not None and len(columns) != 0 + + options = ( + plc.io.parquet.ParquetReaderOptions.builder( + plc.io.SourceInfo(filepaths_or_buffers) + ) + .use_pandas_metadata(use_pandas_metadata) + .allow_mismatched_pq_schemas(allow_mismatched_pq_schemas) + .build() + ) + if row_groups is not None: + options.set_row_groups(row_groups) + if nrows > -1: + 
options.set_num_rows(nrows) + if skip_rows != 0: + options.set_skip_rows(skip_rows) + if columns is not None: + options.set_columns(columns) + + reader = plc.io.parquet.ChunkedParquetReader( + options, + chunk_read_limit=kwargs.get("_chunk_read_limit", 0), + pass_read_limit=kwargs.get("_pass_read_limit", 1024000000), + ) + + tbl_w_meta = reader.read_chunk() + column_names = tbl_w_meta.column_names(include_children=False) + child_names = tbl_w_meta.child_names + per_file_user_data = tbl_w_meta.per_file_user_data + concatenated_columns = tbl_w_meta.tbl.columns() + + # save memory + del tbl_w_meta + + while reader.has_next(): + tbl = reader.read_chunk().tbl + + for i in range(tbl.num_columns()): + concatenated_columns[i] = plc.concatenate.concatenate( + [concatenated_columns[i], tbl._columns[i]] + ) + # Drop residual columns to save memory + tbl._columns[i] = None + + df = cudf.DataFrame._from_data( + *_data_from_columns( + columns=[ + Column.from_pylibcudf(plc) + for plc in concatenated_columns + ], + column_names=column_names, + index_names=None, + ) + ) + df = _process_metadata( + df, + column_names, + child_names, + per_file_user_data, + row_groups, filepaths_or_buffers, - columns=columns, - row_groups=row_groups, - use_pandas_metadata=use_pandas_metadata, - nrows=nrows if nrows is not None else -1, - skip_rows=skip_rows if skip_rows is not None else 0, - allow_mismatched_pq_schemas=allow_mismatched_pq_schemas, + allow_range_index, + use_pandas_metadata, + nrows=nrows, + skip_rows=skip_rows, ) + return df else: - if nrows is None: - nrows = -1 - if skip_rows is None: - skip_rows = 0 - return libparquet.read_parquet( + allow_range_index = True + filters = kwargs.get("filters", None) + if columns is not None and len(columns) == 0 or filters: + allow_range_index = False + + options = ( + plc.io.parquet.ParquetReaderOptions.builder( + plc.io.SourceInfo(filepaths_or_buffers) + ) + .use_pandas_metadata(use_pandas_metadata) + .allow_mismatched_pq_schemas(allow_mismatched_pq_schemas) + .build() + ) + if row_groups is not None: + options.set_row_groups(row_groups) + if nrows > -1: + options.set_num_rows(nrows) + if skip_rows != 0: + options.set_skip_rows(skip_rows) + if columns is not None: + options.set_columns(columns) + if filters is not None: + options.set_filter(filters) + + tbl_w_meta = plc.io.parquet.read_parquet(options) + + df = cudf.DataFrame._from_data(*data_from_pylibcudf_io(tbl_w_meta)) + + df = _process_metadata( + df, + tbl_w_meta.column_names(include_children=False), + tbl_w_meta.child_names, + tbl_w_meta.per_file_user_data, + row_groups, filepaths_or_buffers, - columns=columns, - row_groups=row_groups, - use_pandas_metadata=use_pandas_metadata, + allow_range_index, + use_pandas_metadata, nrows=nrows, skip_rows=skip_rows, - allow_mismatched_pq_schemas=allow_mismatched_pq_schemas, ) + return df else: if ( isinstance(filepaths_or_buffers, list) @@ -980,28 +1328,40 @@ def to_parquet( df, path, engine="cudf", - compression="snappy", - index=None, + compression: Literal["snappy", "ZSTD", "ZLIB", "LZ4", None] = "snappy", + index: bool | None = None, partition_cols=None, partition_file_name=None, partition_offsets=None, - statistics="ROWGROUP", - metadata_file_path=None, - int96_timestamps=False, - row_group_size_bytes=None, - row_group_size_rows=None, - max_page_size_bytes=None, - max_page_size_rows=None, - max_dictionary_size=None, + statistics: Literal["ROWGROUP", "PAGE", "COLUMN", "NONE"] = "ROWGROUP", + metadata_file_path: str | None = None, + int96_timestamps: bool = False, + 
row_group_size_bytes: int | None = None,
+    row_group_size_rows: int | None = None,
+    max_page_size_bytes: int | None = None,
+    max_page_size_rows: int | None = None,
+    max_dictionary_size: int | None = None,
     storage_options=None,
-    return_metadata=False,
-    force_nullable_schema=False,
-    header_version="1.0",
-    use_dictionary=True,
-    skip_compression=None,
-    column_encoding=None,
-    column_type_length=None,
-    output_as_binary=None,
+    return_metadata: bool = False,
+    force_nullable_schema: bool = False,
+    header_version: Literal["1.0", "2.0"] = "1.0",
+    use_dictionary: bool = True,
+    skip_compression: set[Hashable] | None = None,
+    column_encoding: dict[
+        Hashable,
+        Literal[
+            "PLAIN",
+            "DICTIONARY",
+            "DELTA_BINARY_PACKED",
+            "DELTA_LENGTH_BYTE_ARRAY",
+            "DELTA_BYTE_ARRAY",
+            "BYTE_STREAM_SPLIT",
+            "USE_DEFAULT",
+        ],
+    ]
+    | None = None,
+    column_type_length: dict | None = None,
+    output_as_binary: set[Hashable] | None = None,
     store_schema=False,
     *args,
     **kwargs,
@@ -1114,10 +1474,11 @@ def to_parquet(
 @ioutils.doc_merge_parquet_filemetadata()
-def merge_parquet_filemetadata(filemetadata_list):
+def merge_parquet_filemetadata(filemetadata_list: list) -> np.ndarray:
     """{docstring}"""
-
-    return libparquet.merge_filemetadata(filemetadata_list)
+    return np.asarray(
+        plc.io.parquet.merge_row_group_metadata(filemetadata_list).obj
+    )
 
 
 def _generate_filename():
@@ -1205,10 +1566,207 @@ def _get_groups_and_offsets(
     return part_names, grouped_df, part_offsets
 
 
-ParquetWriter = libparquet.ParquetWriter
+class ParquetWriter:
+    """
+    ParquetWriter lets you incrementally write out a Parquet file from a series
+    of cudf tables.
+
+    Parameters
+    ----------
+    filepath_or_buffer : str, io.IOBase, os.PathLike, or list
+        File path or buffer to write to. The argument may also correspond
+        to a list of file paths or buffers.
+    index : bool or None, default None
+        If ``True``, include a dataframe's index(es) in the file output.
+        If ``False``, they will not be written to the file. If ``None``,
+        index(es) other than RangeIndex will be saved as columns.
+    compression : {'snappy', 'ZSTD', 'ZLIB', 'LZ4', None}, default 'snappy'
+        Name of the compression to use. Use ``None`` for no compression.
+    statistics : {'ROWGROUP', 'PAGE', 'COLUMN', 'NONE'}, default 'ROWGROUP'
+        Level at which column statistics should be included in file.
+    row_group_size_bytes: int, default ``uint64 max``
+        Maximum size of each row group of the output.
+        By default, a virtually infinite size equal to ``uint64 max`` will be used.
+    row_group_size_rows: int, default 1000000
+        Maximum number of rows of each row group of the output.
+        By default, 1000000 (10^6 rows) will be used.
+    max_page_size_bytes: int, default 524288
+        Maximum uncompressed size of each page of the output.
+        By default, 524288 (512KB) will be used.
+    max_page_size_rows: int, default 20000
+        Maximum number of rows of each page of the output.
+        By default, 20000 will be used.
+    max_dictionary_size: int, default 1048576
+        Maximum size of the dictionary page for each output column chunk. Dictionary
+        encoding for column chunks that exceed this limit will be disabled.
+        By default, 1048576 (1MB) will be used.
+    use_dictionary : bool, default True
+        If ``True``, enable dictionary encoding for Parquet page data
+        subject to ``max_dictionary_size`` constraints.
+        If ``False``, disable dictionary encoding for Parquet page data.
+    store_schema : bool, default False
+        If ``True``, enable computing and writing the arrow schema to the Parquet
+        file footer's key-value metadata section for faithful round-tripping.
+ + See Also + -------- + cudf.io.parquet.write_parquet + """ + + def __init__( + self, + filepath_or_buffer, + index: bool | None = None, + compression: Literal["snappy", "ZSTD", "ZLIB", "LZ4", None] = "snappy", + statistics: Literal["ROWGROUP", "PAGE", "COLUMN", "NONE"] = "ROWGROUP", + row_group_size_bytes: int = int(np.iinfo(np.uint64).max), + row_group_size_rows: int = 1000000, + max_page_size_bytes: int = 524288, + max_page_size_rows: int = 20000, + max_dictionary_size: int = 1048576, + use_dictionary: bool = True, + store_schema: bool = False, + ): + filepaths_or_buffers = ( + list(filepath_or_buffer) + if is_list_like(filepath_or_buffer) + else [filepath_or_buffer] + ) + self.sink = plc.io.SinkInfo(filepaths_or_buffers) + self.statistics = statistics + self.compression = compression + self.index = index + self.initialized = False + self.row_group_size_bytes = row_group_size_bytes + self.row_group_size_rows = row_group_size_rows + self.max_page_size_bytes = max_page_size_bytes + self.max_page_size_rows = max_page_size_rows + self.max_dictionary_size = max_dictionary_size + self.use_dictionary = use_dictionary + self.write_arrow_schema = store_schema + + def write_table(self, table, partitions_info=None) -> None: + """Writes a single table to the file""" + if not self.initialized: + self._initialize_chunked_state( + table, + num_partitions=len(partitions_info) if partitions_info else 1, + ) + if self.index is not False and ( + table.index.name is not None + or isinstance(table.index, cudf.MultiIndex) + ): + columns = itertools.chain(table.index._columns, table._columns) + plc_table = plc.Table( + [col.to_pylibcudf(mode="read") for col in columns] + ) + else: + plc_table = plc.Table( + [col.to_pylibcudf(mode="read") for col in table._columns] + ) + self.writer.write(plc_table, partitions_info) + + def close(self, metadata_file_path=None) -> np.ndarray | None: + if not self.initialized: + return None + column_chunks_file_paths = [] + if metadata_file_path is not None: + if is_list_like(metadata_file_path): + column_chunks_file_paths = list(metadata_file_path) + else: + column_chunks_file_paths = [metadata_file_path] + blob = self.writer.close(column_chunks_file_paths) + if metadata_file_path is not None: + return np.asarray(blob.obj) + return None + + def __enter__(self) -> Self: + return self + + def __exit__(self, *args) -> None: + self.close() + + def _initialize_chunked_state( + self, table, num_partitions: int = 1 + ) -> None: + """Prepares all the values required to build the + chunked_parquet_writer_options and creates a writer + """ + # Set the table_metadata + num_index_cols_meta = 0 + plc_table = plc.Table( + [col.to_pylibcudf(mode="read") for col in table._columns] + ) + self.tbl_meta = plc.io.types.TableInputMetadata(plc_table) + if self.index is not False: + if isinstance(table.index, cudf.MultiIndex): + plc_table = plc.Table( + [ + col.to_pylibcudf(mode="read") + for col in itertools.chain( + table.index._columns, table._columns + ) + ] + ) + self.tbl_meta = plc.io.types.TableInputMetadata(plc_table) + for level, idx_name in enumerate(table.index.names): + self.tbl_meta.column_metadata[level].set_name(idx_name) + num_index_cols_meta = len(table.index.names) + else: + if table.index.name is not None: + plc_table = plc.Table( + [ + col.to_pylibcudf(mode="read") + for col in itertools.chain( + table.index._columns, table._columns + ) + ] + ) + self.tbl_meta = plc.io.types.TableInputMetadata(plc_table) + self.tbl_meta.column_metadata[0].set_name(table.index.name) + 
num_index_cols_meta = 1 + + for i, name in enumerate(table._column_names, num_index_cols_meta): + self.tbl_meta.column_metadata[i].set_name(name) + _set_col_metadata( + table[name]._column, + self.tbl_meta.column_metadata[i], + ) -def _parse_bytes(s): + index = ( + False if isinstance(table.index, cudf.RangeIndex) else self.index + ) + user_data = [ + {"pandas": generate_pandas_metadata(table, index)} + ] * num_partitions + comp_type = _get_comp_type(self.compression) + stat_freq = _get_stat_freq(self.statistics) + dict_policy = ( + plc.io.types.DictionaryPolicy.ADAPTIVE + if self.use_dictionary + else plc.io.types.DictionaryPolicy.NEVER + ) + options = ( + plc.io.parquet.ChunkedParquetWriterOptions.builder(self.sink) + .metadata(self.tbl_meta) + .key_value_metadata(user_data) + .compression(comp_type) + .stats_level(stat_freq) + .row_group_size_bytes(self.row_group_size_bytes) + .row_group_size_rows(self.row_group_size_rows) + .max_page_size_bytes(self.max_page_size_bytes) + .max_page_size_rows(self.max_page_size_rows) + .max_dictionary_size(self.max_dictionary_size) + .write_arrow_schema(self.write_arrow_schema) + .build() + ) + options.set_dictionary_policy(dict_policy) + self.writer = plc.io.parquet.ParquetChunkedWriter.from_options(options) + self.initialized = True + + +def _parse_bytes(s: str) -> int: """Parse byte string to numbers Utility function vendored from Dask. @@ -1345,8 +1903,8 @@ def __init__( path, partition_cols, index=None, - compression="snappy", - statistics="ROWGROUP", + compression: Literal["snappy", "ZSTD", "ZLIB", "LZ4", None] = "snappy", + statistics: Literal["ROWGROUP", "PAGE", "COLUMN", "NONE"] = "ROWGROUP", max_file_size=None, file_name_prefix=None, storage_options=None, @@ -1370,9 +1928,7 @@ def __init__( self.partition_cols = partition_cols # Collection of `ParquetWriter`s, and the corresponding # partition_col values they're responsible for - self._chunked_writers: list[ - tuple[libparquet.ParquetWriter, list[str], str] - ] = [] + self._chunked_writers: list[tuple[ParquetWriter, list[str], str]] = [] # Map of partition_col values to their ParquetWriter's index # in self._chunked_writers for reverse lookup self.path_cw_map: dict[str, int] = {} @@ -1563,3 +2119,257 @@ def _hive_dirname(name, val): if pd.isna(val): val = "__HIVE_DEFAULT_PARTITION__" return f"{name}={val}" + + +def _set_col_metadata( + col: ColumnBase, + col_meta: plc.io.types.ColumnInMetadata, + force_nullable_schema: bool = False, + path: str | None = None, + skip_compression: set[Hashable] | None = None, + column_encoding: dict[ + Hashable, + Literal[ + "PLAIN", + "DICTIONARY", + "DELTA_BINARY_PACKED", + "DELTA_LENGTH_BYTE_ARRAY", + "DELTA_BYTE_ARRAY", + "BYTE_STREAM_SPLIT", + "USE_DEFAULT", + ], + ] + | None = None, + column_type_length: dict | None = None, + output_as_binary: set[Hashable] | None = None, +) -> None: + need_path = ( + skip_compression is not None + or column_encoding is not None + or column_type_length is not None + or output_as_binary is not None + ) + name = col_meta.get_name() if need_path else None + full_path = ( + path + "." + name if (path is not None and name is not None) else name + ) + + if force_nullable_schema: + # Only set nullability if `force_nullable_schema` + # is true. 
+ col_meta.set_nullability(True) + + if skip_compression is not None and full_path in skip_compression: + col_meta.set_skip_compression(True) + + if column_encoding is not None and full_path in column_encoding: + encoding = column_encoding[full_path] + if encoding is None: + c_encoding = plc.io.types.ColumnEncoding.USE_DEFAULT + else: + enc = str(encoding).upper() + c_encoding = getattr(plc.io.types.ColumnEncoding, enc, None) + if c_encoding is None: + raise ValueError("Unsupported `column_encoding` type") + col_meta.set_encoding(c_encoding) + + if column_type_length is not None and full_path in column_type_length: + col_meta.set_output_as_binary(True) + col_meta.set_type_length(column_type_length[full_path]) + + if output_as_binary is not None and full_path in output_as_binary: + col_meta.set_output_as_binary(True) + + if isinstance(col.dtype, cudf.StructDtype): + for i, (child_col, name) in enumerate( + zip(col.children, list(col.dtype.fields)) + ): + col_meta.child(i).set_name(name) + _set_col_metadata( + child_col, + col_meta.child(i), + force_nullable_schema, + full_path, + skip_compression, + column_encoding, + column_type_length, + output_as_binary, + ) + elif isinstance(col.dtype, cudf.ListDtype): + if full_path is not None: + full_path = full_path + ".list" + col_meta.child(1).set_name("element") + _set_col_metadata( + col.children[1], + col_meta.child(1), + force_nullable_schema, + full_path, + skip_compression, + column_encoding, + column_type_length, + output_as_binary, + ) + elif isinstance(col.dtype, cudf.core.dtypes.DecimalDtype): + col_meta.set_decimal_precision(col.dtype.precision) + + +def _get_comp_type( + compression: Literal["snappy", "ZSTD", "ZLIB", "LZ4", None], +) -> plc.io.types.CompressionType: + if compression is None: + return plc.io.types.CompressionType.NONE + result = getattr(plc.io.types.CompressionType, compression.upper(), None) + if result is None: + raise ValueError("Unsupported `compression` type") + return result + + +def _get_stat_freq( + statistics: Literal["ROWGROUP", "PAGE", "COLUMN", "NONE"], +) -> plc.io.types.StatisticsFreq: + result = getattr( + plc.io.types.StatisticsFreq, f"STATISTICS_{statistics.upper()}", None + ) + if result is None: + raise ValueError("Unsupported `statistics_freq` type") + return result + + +def _process_metadata( + df: cudf.DataFrame, + names: list[Hashable], + child_names: dict, + per_file_user_data: list, + row_groups, + filepaths_or_buffers, + allow_range_index: bool, + use_pandas_metadata: bool, + nrows: int = -1, + skip_rows: int = 0, +) -> cudf.DataFrame: + ioutils._add_df_col_struct_names(df, child_names) + index_col = None + is_range_index = True + column_index_type = None + index_col_names = None + meta = None + for single_file in per_file_user_data: + if b"pandas" not in single_file: + continue + json_str = single_file[b"pandas"].decode("utf-8") + meta = json.loads(json_str) + file_is_range_index, index_col, column_index_type = _parse_metadata( + meta + ) + is_range_index &= file_is_range_index + + if ( + not file_is_range_index + and index_col is not None + and index_col_names is None + ): + index_col_names = {} + for idx_col in index_col: + for c in meta["columns"]: + if c["field_name"] == idx_col: + index_col_names[idx_col] = c["name"] + + if meta is not None: + # Book keep each column metadata as the order + # of `meta["columns"]` and `column_names` are not + # guaranteed to be deterministic and same always. 
+ meta_data_per_column = { + col_meta["name"]: col_meta for col_meta in meta["columns"] + } + + # update the decimal precision of each column + for col in names: + if isinstance(df._data[col].dtype, cudf.core.dtypes.DecimalDtype): + df._data[col].dtype.precision = meta_data_per_column[col][ + "metadata" + ]["precision"] + + # Set the index column + if index_col is not None and len(index_col) > 0: + if is_range_index: + if not allow_range_index: + return df + + if len(per_file_user_data) > 1: + range_index_meta = { + "kind": "range", + "name": None, + "start": 0, + "stop": len(df), + "step": 1, + } + else: + range_index_meta = index_col[0] + + if row_groups is not None: + per_file_metadata = [ + pa.parquet.read_metadata( + # Pyarrow cannot read directly from bytes + io.BytesIO(s) if isinstance(s, bytes) else s + ) + for s in filepaths_or_buffers + ] + + filtered_idx = [] + for i, file_meta in enumerate(per_file_metadata): + row_groups_i = [] + start = 0 + for row_group in range(file_meta.num_row_groups): + stop = start + file_meta.row_group(row_group).num_rows + row_groups_i.append((start, stop)) + start = stop + + for rg in row_groups[i]: + filtered_idx.append( + cudf.RangeIndex( + start=row_groups_i[rg][0], + stop=row_groups_i[rg][1], + step=range_index_meta["step"], + ) + ) + + if len(filtered_idx) > 0: + idx = cudf.concat(filtered_idx) + else: + idx = cudf.Index._from_column( + cudf.core.column.column_empty(0) + ) + else: + start = range_index_meta["start"] + skip_rows # type: ignore[operator] + stop = range_index_meta["stop"] + if nrows > -1: + stop = start + nrows + idx = cudf.RangeIndex( + start=start, + stop=stop, + step=range_index_meta["step"], + name=range_index_meta["name"], + ) + + df.index = idx + elif set(index_col).issubset(names): + index_data = df[index_col] + actual_index_names = iter(index_col_names.values()) + if index_data._num_columns == 1: + idx = cudf.Index._from_column( + index_data._columns[0], name=next(actual_index_names) + ) + else: + idx = cudf.MultiIndex.from_frame( + index_data, names=list(actual_index_names) + ) + df.drop(columns=index_col, inplace=True) + df.index = idx + else: + if use_pandas_metadata: + df.index.names = index_col + + if df._num_columns == 0 and column_index_type is not None: + df._data.label_dtype = cudf.dtype(column_index_type) + + return df diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 13efa71ebae..77d1f77d30b 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -22,7 +22,6 @@ from pyarrow import parquet as pq import cudf -from cudf._lib.parquet import read_parquet_chunked from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION from cudf.io.parquet import ( ParquetDatasetWriter, @@ -3775,13 +3774,14 @@ def test_parquet_chunked_reader( ) buffer = BytesIO() df.to_parquet(buffer, row_group_size=10000) - actual = read_parquet_chunked( - [buffer], - chunk_read_limit=chunk_read_limit, - pass_read_limit=pass_read_limit, - use_pandas_metadata=use_pandas_metadata, - row_groups=row_groups, - ) + with cudf.option_context("io.parquet.low_memory", True): + actual = cudf.read_parquet( + [buffer], + _chunk_read_limit=chunk_read_limit, + _pass_read_limit=pass_read_limit, + use_pandas_metadata=use_pandas_metadata, + row_groups=row_groups, + ) expected = cudf.read_parquet( buffer, use_pandas_metadata=use_pandas_metadata, row_groups=row_groups ) @@ -3825,12 +3825,13 @@ def test_parquet_chunked_reader_structs( # Number of rows to read 
nrows = num_rows if num_rows is not None else len(df) - actual = read_parquet_chunked( - [buffer], - chunk_read_limit=chunk_read_limit, - pass_read_limit=pass_read_limit, - nrows=nrows, - ) + with cudf.option_context("io.parquet.low_memory", True): + actual = cudf.read_parquet( + [buffer], + _chunk_read_limit=chunk_read_limit, + _pass_read_limit=pass_read_limit, + nrows=nrows, + ) expected = cudf.read_parquet( buffer, nrows=nrows, @@ -3877,12 +3878,13 @@ def test_parquet_chunked_reader_string_decoders( nrows = num_rows if num_rows is not None else len(df) # Check with num_rows specified - actual = read_parquet_chunked( - [buffer], - chunk_read_limit=chunk_read_limit, - pass_read_limit=pass_read_limit, - nrows=nrows, - ) + with cudf.option_context("io.parquet.low_memory", True): + actual = cudf.read_parquet( + [buffer], + _chunk_read_limit=chunk_read_limit, + _pass_read_limit=pass_read_limit, + nrows=nrows, + ) expected = cudf.read_parquet( buffer, nrows=nrows, @@ -3982,13 +3984,14 @@ def test_parquet_reader_with_mismatched_tables(store_schema): ).reset_index(drop=True) # Read with chunked reader (filter columns not supported) - got_chunked = read_parquet_chunked( - [buf1, buf2], - columns=["list", "d_list", "str"], - chunk_read_limit=240, - pass_read_limit=240, - allow_mismatched_pq_schemas=True, - ) + with cudf.option_context("io.parquet.low_memory", True): + got_chunked = cudf.read_parquet( + [buf1, buf2], + columns=["list", "d_list", "str"], + _chunk_read_limit=240, + _pass_read_limit=240, + allow_mismatched_pq_schemas=True, + ) # Construct the expected table without filter columns expected_chunked = cudf.concat( @@ -4054,13 +4057,14 @@ def test_parquet_reader_with_mismatched_structs(): ) # Read with chunked reader - got_chunked = read_parquet_chunked( - [buf1, buf2], - columns=["struct.b.b_b.b_b_a"], - chunk_read_limit=240, - pass_read_limit=240, - allow_mismatched_pq_schemas=True, - ) + with cudf.option_context("io.parquet.low_memory", True): + got_chunked = cudf.read_parquet( + [buf1, buf2], + columns=["struct.b.b_b.b_b_a"], + _chunk_read_limit=240, + _pass_read_limit=240, + allow_mismatched_pq_schemas=True, + ) got_chunked = ( cudf.Series(got_chunked["struct"]) .struct.field("b") diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index d9a3da6666d..a04fcb8df7a 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -43,7 +43,6 @@ } _BYTES_PER_THREAD_DEFAULT = 256 * 1024 * 1024 -_ROW_GROUP_SIZE_BYTES_DEFAULT = np.iinfo(np.uint64).max _docstring_remote_sources = """ - cuDF supports local and remote data stores. See configuration details for From f811c383b46d7a8acc8496593e3d0caff83d6c8f Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Thu, 12 Dec 2024 17:56:03 -0500 Subject: [PATCH 67/78] Allow large strings in nvbench strings benchmarks (#17571) Removes the 2GB limit check from the strings benchmarks and adjusts the parameters to be consistent across the benchmarks. The default parameters will still not exceed 2GB for automation purposes. 
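To illustrate the shape these benchmarks share after this change, here is a minimal sketch of a post-change nvbench strings benchmark (the `bench_example` name and the placeholder body are hypothetical; the helper calls and axis values mirror the hunks below). The former single `row_width` axis and its `size_type` overflow skip are replaced by explicit `min_width`/`max_width` axes:

static void bench_example(nvbench::state& state)
{
  // Width bounds now come directly from the benchmark axes;
  // the 2GB/size_type limit check is no longer performed here.
  auto const num_rows  = static_cast<cudf::size_type>(state.get_int64("num_rows"));
  auto const min_width = static_cast<cudf::size_type>(state.get_int64("min_width"));
  auto const max_width = static_cast<cudf::size_type>(state.get_int64("max_width"));

  // Generate random strings whose row lengths follow a normal
  // distribution over [min_width, max_width]
  data_profile const profile = data_profile_builder().distribution(
    cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width);
  auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile);
  cudf::strings_column_view input(column->view());
  // ... exercise the strings API under test via state.exec(...) ...
}

NVBENCH_BENCH(bench_example)
  .set_name("example")
  .add_int64_axis("min_width", {0})
  .add_int64_axis("max_width", {32, 64, 128, 256})
  .add_int64_axis("num_rows", {32768, 262144, 2097152});
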
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Paul Mattione (https://github.com/pmattione-nvidia) URL: https://github.com/rapidsai/cudf/pull/17571 --- cpp/benchmarks/string/case.cpp | 19 +++---- cpp/benchmarks/string/char_types.cpp | 15 +++--- cpp/benchmarks/string/contains.cpp | 13 ++--- cpp/benchmarks/string/copy_if_else.cpp | 15 +++--- cpp/benchmarks/string/copy_range.cpp | 15 +++--- cpp/benchmarks/string/count.cpp | 15 +++--- cpp/benchmarks/string/extract.cpp | 9 +--- cpp/benchmarks/string/join_strings.cpp | 15 +++--- cpp/benchmarks/string/lengths.cpp | 15 +++--- cpp/benchmarks/string/like.cpp | 9 +--- cpp/benchmarks/string/replace_re.cpp | 19 +++---- cpp/benchmarks/string/reverse.cpp | 15 +++--- cpp/benchmarks/string/slice.cpp | 9 +--- cpp/benchmarks/string/split.cpp | 15 +++--- cpp/benchmarks/string/split_re.cpp | 15 +++--- cpp/benchmarks/string/string_bench_args.hpp | 56 --------------------- 16 files changed, 80 insertions(+), 189 deletions(-) delete mode 100644 cpp/benchmarks/string/string_bench_args.hpp diff --git a/cpp/benchmarks/string/case.cpp b/cpp/benchmarks/string/case.cpp index cd4d3ca964b..9750475a079 100644 --- a/cpp/benchmarks/string/case.cpp +++ b/cpp/benchmarks/string/case.cpp @@ -24,18 +24,14 @@ void bench_case(nvbench::state& state) { - auto const n_rows = static_cast(state.get_int64("num_rows")); - auto const max_width = static_cast(state.get_int64("row_width")); + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); auto const encoding = state.get_string("encoding"); - if (static_cast(n_rows) * static_cast(max_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - data_profile const profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, max_width); - auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows}, profile); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); + auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); auto col_view = column->view(); @@ -74,6 +70,7 @@ void bench_case(nvbench::state& state) NVBENCH_BENCH(bench_case) .set_name("case") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048}) - .add_int64_axis("num_rows", {32768, 262144, 2097152, 16777216}) + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_string_axis("encoding", {"ascii", "utf8"}); diff --git a/cpp/benchmarks/string/char_types.cpp b/cpp/benchmarks/string/char_types.cpp index eec9a5f54d7..abc5254392e 100644 --- a/cpp/benchmarks/string/char_types.cpp +++ b/cpp/benchmarks/string/char_types.cpp @@ -25,16 +25,12 @@ static void bench_char_types(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); auto const api_type = state.get_string("api"); - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - data_profile const 
table_profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const table = create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile); cudf::strings_column_view input(table->view().column(0)); @@ -61,6 +57,7 @@ static void bench_char_types(nvbench::state& state) NVBENCH_BENCH(bench_char_types) .set_name("char_types") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048, 4096}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}) + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_string_axis("api", {"all", "filter"}); diff --git a/cpp/benchmarks/string/contains.cpp b/cpp/benchmarks/string/contains.cpp index a73017dda18..e3940cbc0c7 100644 --- a/cpp/benchmarks/string/contains.cpp +++ b/cpp/benchmarks/string/contains.cpp @@ -29,17 +29,12 @@ std::string patterns[] = {"^\\d+ [a-z]+", "[A-Z ]+\\d+ +\\d+[A-Z]+\\d+$", "5W43" static void bench_contains(nvbench::state& state) { - auto const n_rows = static_cast(state.get_int64("num_rows")); + auto const num_rows = static_cast(state.get_int64("num_rows")); auto const row_width = static_cast(state.get_int64("row_width")); auto const pattern_index = static_cast(state.get_int64("pattern")); auto const hit_rate = static_cast(state.get_int64("hit_rate")); - if (static_cast(n_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - - auto col = create_string_column(n_rows, row_width, hit_rate); + auto col = create_string_column(num_rows, row_width, hit_rate); auto input = cudf::strings_column_view(col->view()); auto pattern = patterns[pattern_index]; @@ -56,7 +51,7 @@ static void bench_contains(nvbench::state& state) NVBENCH_BENCH(bench_contains) .set_name("contains") - .add_int64_axis("row_width", {32, 64, 128, 256, 512}) - .add_int64_axis("num_rows", {32768, 262144, 2097152, 16777216}) + .add_int64_axis("row_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_int64_axis("hit_rate", {50, 100}) // percentage .add_int64_axis("pattern", {0, 1, 2}); diff --git a/cpp/benchmarks/string/copy_if_else.cpp b/cpp/benchmarks/string/copy_if_else.cpp index e06cca497c2..5a5743dfddf 100644 --- a/cpp/benchmarks/string/copy_if_else.cpp +++ b/cpp/benchmarks/string/copy_if_else.cpp @@ -25,15 +25,11 @@ static void bench_copy(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); - - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); data_profile const str_profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const source_table = create_random_table({cudf::type_id::STRING}, row_count{num_rows}, str_profile); auto const target_table = @@ -58,5 +54,6 @@ static void bench_copy(nvbench::state& state) NVBENCH_BENCH(bench_copy) .set_name("copy_if_else") - .add_int64_axis("row_width", {32, 64, 128, 
256, 512, 1024, 2048, 4096}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}); + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}); diff --git a/cpp/benchmarks/string/copy_range.cpp b/cpp/benchmarks/string/copy_range.cpp index af217a49195..7e7353a0e78 100644 --- a/cpp/benchmarks/string/copy_range.cpp +++ b/cpp/benchmarks/string/copy_range.cpp @@ -25,16 +25,12 @@ static void bench_copy_range(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); - - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); data_profile const table_profile = data_profile_builder() - .distribution(cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width) + .distribution(cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width) .no_validity(); auto const source_tables = create_random_table( {cudf::type_id::STRING, cudf::type_id::STRING}, row_count{num_rows}, table_profile); @@ -56,5 +52,6 @@ static void bench_copy_range(nvbench::state& state) NVBENCH_BENCH(bench_copy_range) .set_name("copy_range") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048, 4096}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}); + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}); diff --git a/cpp/benchmarks/string/count.cpp b/cpp/benchmarks/string/count.cpp index f964bc5d224..cf90e316f71 100644 --- a/cpp/benchmarks/string/count.cpp +++ b/cpp/benchmarks/string/count.cpp @@ -30,16 +30,12 @@ static std::string patterns[] = {"\\d+", "a"}; static void bench_count(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); auto const pattern_index = static_cast(state.get_int64("pattern")); - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - data_profile const table_profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const table = create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile); cudf::strings_column_view input(table->view().column(0)); @@ -61,6 +57,7 @@ static void bench_count(nvbench::state& state) NVBENCH_BENCH(bench_count) .set_name("count") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}) + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_int64_axis("pattern", {0, 1}); diff --git a/cpp/benchmarks/string/extract.cpp b/cpp/benchmarks/string/extract.cpp index af4fedb5799..d6866598ff4 100644 --- a/cpp/benchmarks/string/extract.cpp +++ 
b/cpp/benchmarks/string/extract.cpp @@ -32,11 +32,6 @@ static void bench_extract(nvbench::state& state) auto const num_rows = static_cast(state.get_int64("num_rows")); auto const row_width = static_cast(state.get_int64("row_width")); - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - auto groups = static_cast(state.get_int64("groups")); std::default_random_engine generator; @@ -79,6 +74,6 @@ static void bench_extract(nvbench::state& state) NVBENCH_BENCH(bench_extract) .set_name("extract") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}) + .add_int64_axis("row_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_int64_axis("groups", {1, 2, 4}); diff --git a/cpp/benchmarks/string/join_strings.cpp b/cpp/benchmarks/string/join_strings.cpp index 6dcf731ad3c..27652193b7b 100644 --- a/cpp/benchmarks/string/join_strings.cpp +++ b/cpp/benchmarks/string/join_strings.cpp @@ -25,15 +25,11 @@ static void bench_join(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); - - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); data_profile const table_profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const table = create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile); cudf::strings_column_view input(table->view().column(0)); @@ -54,5 +50,6 @@ static void bench_join(nvbench::state& state) NVBENCH_BENCH(bench_join) .set_name("strings_join") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}); + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}); diff --git a/cpp/benchmarks/string/lengths.cpp b/cpp/benchmarks/string/lengths.cpp index a19060ead3b..8156e19412b 100644 --- a/cpp/benchmarks/string/lengths.cpp +++ b/cpp/benchmarks/string/lengths.cpp @@ -25,15 +25,11 @@ static void bench_lengths(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); - - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); data_profile const table_profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const table = create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile); cudf::strings_column_view input(table->view().column(0)); @@ -51,5 +47,6 @@ static void bench_lengths(nvbench::state& state) 
NVBENCH_BENCH(bench_lengths) .set_name("lengths") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048, 4096}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}); + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}); diff --git a/cpp/benchmarks/string/like.cpp b/cpp/benchmarks/string/like.cpp index 105ae65cbe8..f6410aaef30 100644 --- a/cpp/benchmarks/string/like.cpp +++ b/cpp/benchmarks/string/like.cpp @@ -30,11 +30,6 @@ static void bench_like(nvbench::state& state) auto const row_width = static_cast(state.get_int64("row_width")); auto const hit_rate = static_cast(state.get_int64("hit_rate")); - if (static_cast(n_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - auto col = create_string_column(n_rows, row_width, hit_rate); auto input = cudf::strings_column_view(col->view()); @@ -54,6 +49,6 @@ static void bench_like(nvbench::state& state) NVBENCH_BENCH(bench_like) .set_name("strings_like") - .add_int64_axis("row_width", {32, 64, 128, 256, 512}) - .add_int64_axis("num_rows", {32768, 262144, 2097152, 16777216}) + .add_int64_axis("row_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_int64_axis("hit_rate", {10, 25, 70, 100}); diff --git a/cpp/benchmarks/string/replace_re.cpp b/cpp/benchmarks/string/replace_re.cpp index 4dcf1314f83..69426a2d484 100644 --- a/cpp/benchmarks/string/replace_re.cpp +++ b/cpp/benchmarks/string/replace_re.cpp @@ -26,18 +26,14 @@ static void bench_replace(nvbench::state& state) { - auto const n_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); auto const rtype = state.get_string("type"); - if (static_cast(n_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - data_profile const profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); - auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows}, profile); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); + auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); cudf::strings_column_view input(column->view()); auto program = cudf::strings::regex_program::create("(\\d+)"); @@ -62,6 +58,7 @@ static void bench_replace(nvbench::state& state) NVBENCH_BENCH(bench_replace) .set_name("replace_re") - .add_int64_axis("row_width", {32, 64, 128, 256, 512}) - .add_int64_axis("num_rows", {32768, 262144, 2097152, 16777216}) + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_string_axis("type", {"replace", "backref"}); diff --git a/cpp/benchmarks/string/reverse.cpp b/cpp/benchmarks/string/reverse.cpp index a2676609a40..e2e914cb350 100644 --- a/cpp/benchmarks/string/reverse.cpp +++ b/cpp/benchmarks/string/reverse.cpp @@ -25,15 +25,11 @@ static void bench_reverse(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = 
static_cast(state.get_int64("row_width")); - - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); data_profile const table_profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const table = create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile); cudf::strings_column_view input(table->view().column(0)); @@ -51,5 +47,6 @@ static void bench_reverse(nvbench::state& state) NVBENCH_BENCH(bench_reverse) .set_name("reverse") - .add_int64_axis("row_width", {8, 16, 32, 64, 128}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}); + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}); diff --git a/cpp/benchmarks/string/slice.cpp b/cpp/benchmarks/string/slice.cpp index 1898f0340b6..c828a8ed0b0 100644 --- a/cpp/benchmarks/string/slice.cpp +++ b/cpp/benchmarks/string/slice.cpp @@ -36,11 +36,6 @@ static void bench_slice(nvbench::state& state) auto const row_width = static_cast(state.get_int64("row_width")); auto const stype = state.get_string("type"); - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - data_profile const profile = data_profile_builder().distribution( cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); @@ -76,6 +71,6 @@ static void bench_slice(nvbench::state& state) NVBENCH_BENCH(bench_slice) .set_name("slice") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048}) - .add_int64_axis("num_rows", {262144, 2097152, 16777216}) + .add_int64_axis("row_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_string_axis("type", {"position", "multi"}); diff --git a/cpp/benchmarks/string/split.cpp b/cpp/benchmarks/string/split.cpp index 9ef58daf0fc..9c7c27c4f07 100644 --- a/cpp/benchmarks/string/split.cpp +++ b/cpp/benchmarks/string/split.cpp @@ -28,16 +28,12 @@ static void bench_split(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); auto const stype = state.get_string("type"); - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - data_profile const profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); cudf::strings_column_view input(column->view()); cudf::string_scalar target("+"); @@ -66,6 +62,7 @@ static void bench_split(nvbench::state& state) NVBENCH_BENCH(bench_split) .set_name("split") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 
1024, 2048}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}) + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_string_axis("type", {"split", "split_ws", "record", "record_ws"}); diff --git a/cpp/benchmarks/string/split_re.cpp b/cpp/benchmarks/string/split_re.cpp index 1fdb6e67109..34a7aa96e84 100644 --- a/cpp/benchmarks/string/split_re.cpp +++ b/cpp/benchmarks/string/split_re.cpp @@ -28,17 +28,13 @@ static void bench_split(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); - - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); auto prog = cudf::strings::regex_program::create("\\d+"); data_profile const profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); cudf::strings_column_view input(column->view()); @@ -56,5 +52,6 @@ static void bench_split(nvbench::state& state) NVBENCH_BENCH(bench_split) .set_name("split_re") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}); + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}); diff --git a/cpp/benchmarks/string/string_bench_args.hpp b/cpp/benchmarks/string/string_bench_args.hpp deleted file mode 100644 index a34026281e8..00000000000 --- a/cpp/benchmarks/string/string_bench_args.hpp +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include - -#include - -#include - -/** - * @brief Generate row count and row length argument ranges for a string benchmark. - * - * Generates a series of row count and row length arguments for string benchmarks. - * Combinations of row count and row length that would exceed the maximum string character - * column data length are not generated. - * - * @param b Benchmark to update with row count and row length arguments. - * @param min_rows Minimum row count argument to generate. - * @param max_rows Maximum row count argument to generate. - * @param rows_mult Row count multiplier to generate intermediate row count arguments. - * @param min_rowlen Minimum row length argument to generate. - * @param max_rowlen Maximum row length argument to generate. - * @param rowlen_mult Row length multiplier to generate intermediate row length arguments. 
- */ -inline void generate_string_bench_args(benchmark::internal::Benchmark* b, - int min_rows, - int max_rows, - int rows_mult, - int min_rowlen, - int max_rowlen, - int rowlen_mult) -{ - for (int row_count = min_rows; row_count <= max_rows; row_count *= rows_mult) { - for (int rowlen = min_rowlen; rowlen <= max_rowlen; rowlen *= rowlen_mult) { - // avoid generating combinations that exceed the cudf column limit - size_t total_chars = static_cast(row_count) * rowlen; - if (total_chars < static_cast(std::numeric_limits::max())) { - b->Args({row_count, rowlen}); - } - } - } -} From 8a3e5f1a7af6c638397fcabf17bea9192bd799d2 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 12 Dec 2024 17:40:20 -0800 Subject: [PATCH 68/78] Remove cudf._lib.nvtext in favor of inlining pylibcudf (#17535) Contributes to https://github.com/rapidsai/cudf/issues/17317 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/17535 --- python/cudf/cudf/_lib/CMakeLists.txt | 2 - python/cudf/cudf/_lib/__init__.py | 1 - python/cudf/cudf/_lib/nvtext/CMakeLists.txt | 24 -- python/cudf/cudf/_lib/nvtext/__init__.pxd | 0 python/cudf/cudf/_lib/nvtext/__init__.py | 0 .../cudf/_lib/nvtext/byte_pair_encode.pyx | 24 -- .../cudf/cudf/_lib/nvtext/edit_distance.pyx | 24 -- .../cudf/cudf/_lib/nvtext/generate_ngrams.pyx | 35 -- python/cudf/cudf/_lib/nvtext/jaccard.pyx | 17 - python/cudf/cudf/_lib/nvtext/minhash.pyx | 35 -- .../cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx | 24 -- python/cudf/cudf/_lib/nvtext/normalize.pyx | 28 -- python/cudf/cudf/_lib/nvtext/replace.pyx | 52 --- python/cudf/cudf/_lib/nvtext/stemmer.pyx | 55 --- .../cudf/_lib/nvtext/subword_tokenize.pyx | 38 -- python/cudf/cudf/_lib/nvtext/tokenize.pyx | 86 ---- python/cudf/cudf/_lib/strings/__init__.pxd | 0 python/cudf/cudf/_lib/strings/__init__.py | 30 -- python/cudf/cudf/core/byte_pair_encoding.py | 13 +- python/cudf/cudf/core/column/string.py | 388 ++++++++++++++---- python/cudf/cudf/core/subword_tokenizer.py | 7 +- python/cudf/cudf/core/tokenize_vocabulary.py | 9 +- 22 files changed, 328 insertions(+), 564 deletions(-) delete mode 100644 python/cudf/cudf/_lib/nvtext/CMakeLists.txt delete mode 100644 python/cudf/cudf/_lib/nvtext/__init__.pxd delete mode 100644 python/cudf/cudf/_lib/nvtext/__init__.py delete mode 100644 python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx delete mode 100644 python/cudf/cudf/_lib/nvtext/edit_distance.pyx delete mode 100644 python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx delete mode 100644 python/cudf/cudf/_lib/nvtext/jaccard.pyx delete mode 100644 python/cudf/cudf/_lib/nvtext/minhash.pyx delete mode 100644 python/cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx delete mode 100644 python/cudf/cudf/_lib/nvtext/normalize.pyx delete mode 100644 python/cudf/cudf/_lib/nvtext/replace.pyx delete mode 100644 python/cudf/cudf/_lib/nvtext/stemmer.pyx delete mode 100644 python/cudf/cudf/_lib/nvtext/subword_tokenize.pyx delete mode 100644 python/cudf/cudf/_lib/nvtext/tokenize.pyx delete mode 100644 python/cudf/cudf/_lib/strings/__init__.pxd delete mode 100644 python/cudf/cudf/_lib/strings/__init__.py diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index f422635d22a..c2677c6d88d 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -30,5 +30,3 @@ target_include_directories(interop PUBLIC "$ letter_type.CONSONANT - VOWEL = 
letter_type.VOWEL - - -@acquire_spill_lock() -def porter_stemmer_measure(Column strings): - return Column.from_pylibcudf( - nvtext.stemmer.porter_stemmer_measure( - strings.to_pylibcudf(mode="read"), - ) - ) - - -@acquire_spill_lock() -def is_letter(Column strings, - object ltype, - size_type index): - return Column.from_pylibcudf( - nvtext.stemmer.is_letter( - strings.to_pylibcudf(mode="read"), - ltype==LetterType.VOWEL, - index, - ) - ) - - -@acquire_spill_lock() -def is_letter_multi(Column strings, - object ltype, - Column indices): - return Column.from_pylibcudf( - nvtext.stemmer.is_letter( - strings.to_pylibcudf(mode="read"), - ltype==LetterType.VOWEL, - indices.to_pylibcudf(mode="read"), - ) - ) diff --git a/python/cudf/cudf/_lib/nvtext/subword_tokenize.pyx b/python/cudf/cudf/_lib/nvtext/subword_tokenize.pyx deleted file mode 100644 index 5e0bfb74705..00000000000 --- a/python/cudf/cudf/_lib/nvtext/subword_tokenize.pyx +++ /dev/null @@ -1,38 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from libc.stdint cimport uint32_t - -from cudf.core.buffer import acquire_spill_lock - -from libcpp cimport bool - -from cudf._lib.column cimport Column - -from pylibcudf import nvtext - - -@acquire_spill_lock() -def subword_tokenize_inmem_hash( - Column strings, - object hashed_vocabulary, - uint32_t max_sequence_length=64, - uint32_t stride=48, - bool do_lower=True, - bool do_truncate=False, -): - """ - Subword tokenizes text series by using the pre-loaded hashed vocabulary - """ - result = nvtext.subword_tokenize.subword_tokenize( - strings.to_pylibcudf(mode="read"), - hashed_vocabulary, - max_sequence_length, - stride, - do_lower, - do_truncate, - ) - # return the 3 tensor components - tokens = Column.from_pylibcudf(result[0]) - masks = Column.from_pylibcudf(result[1]) - metadata = Column.from_pylibcudf(result[2]) - return tokens, masks, metadata diff --git a/python/cudf/cudf/_lib/nvtext/tokenize.pyx b/python/cudf/cudf/_lib/nvtext/tokenize.pyx deleted file mode 100644 index f473c48e2f7..00000000000 --- a/python/cudf/cudf/_lib/nvtext/tokenize.pyx +++ /dev/null @@ -1,86 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. 
- -from cudf.core.buffer import acquire_spill_lock - -from pylibcudf.libcudf.types cimport size_type - -from pylibcudf.nvtext.tokenize import TokenizeVocabulary # no-cython-lint - -from cudf._lib.column cimport Column - -from pylibcudf import nvtext - - -@acquire_spill_lock() -def _tokenize_scalar(Column strings, object py_delimiter): - return Column.from_pylibcudf( - nvtext.tokenize.tokenize_scalar( - strings.to_pylibcudf(mode="read"), - py_delimiter.device_value.c_value - ) - ) - - -@acquire_spill_lock() -def _tokenize_column(Column strings, Column delimiters): - return Column.from_pylibcudf( - nvtext.tokenize.tokenize_column( - strings.to_pylibcudf(mode="read"), - delimiters.to_pylibcudf(mode="read"), - ) - ) - - -@acquire_spill_lock() -def _count_tokens_scalar(Column strings, object py_delimiter): - return Column.from_pylibcudf( - nvtext.tokenize.count_tokens_scalar( - strings.to_pylibcudf(mode="read"), - py_delimiter.device_value.c_value - ) - ) - - -@acquire_spill_lock() -def _count_tokens_column(Column strings, Column delimiters): - return Column.from_pylibcudf( - nvtext.tokenize.count_tokens_column( - strings.to_pylibcudf(mode="read"), - delimiters.to_pylibcudf(mode="read") - ) - ) - - -@acquire_spill_lock() -def character_tokenize(Column strings): - return Column.from_pylibcudf( - nvtext.tokenize.character_tokenize( - strings.to_pylibcudf(mode="read") - ) - ) - - -@acquire_spill_lock() -def detokenize(Column strings, Column indices, object py_separator): - return Column.from_pylibcudf( - nvtext.tokenize.detokenize( - strings.to_pylibcudf(mode="read"), - indices.to_pylibcudf(mode="read"), - py_separator.device_value.c_value - ) - ) - - -@acquire_spill_lock() -def tokenize_with_vocabulary(Column strings, - object vocabulary, - object py_delimiter, - size_type default_id): - return Column.from_pylibcudf( - nvtext.tokenize.tokenize_with_vocabulary( - strings.to_pylibcudf(mode="read"), - vocabulary, - py_delimiter.device_value.c_value, - default_id - ) - ) diff --git a/python/cudf/cudf/_lib/strings/__init__.pxd b/python/cudf/cudf/_lib/strings/__init__.pxd deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/python/cudf/cudf/_lib/strings/__init__.py b/python/cudf/cudf/_lib/strings/__init__.py deleted file mode 100644 index b9095a22a42..00000000000 --- a/python/cudf/cudf/_lib/strings/__init__.py +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
-from cudf._lib.nvtext.edit_distance import edit_distance, edit_distance_matrix -from cudf._lib.nvtext.generate_ngrams import ( - generate_character_ngrams, - generate_ngrams, - hash_character_ngrams, -) -from cudf._lib.nvtext.jaccard import jaccard_index -from cudf._lib.nvtext.minhash import ( - minhash, - minhash64, -) -from cudf._lib.nvtext.ngrams_tokenize import ngrams_tokenize -from cudf._lib.nvtext.normalize import normalize_characters, normalize_spaces -from cudf._lib.nvtext.replace import filter_tokens, replace_tokens -from cudf._lib.nvtext.stemmer import ( - LetterType, - is_letter, - is_letter_multi, - porter_stemmer_measure, -) -from cudf._lib.nvtext.tokenize import ( - _count_tokens_column, - _count_tokens_scalar, - _tokenize_column, - _tokenize_scalar, - character_tokenize, - detokenize, - tokenize_with_vocabulary, -) diff --git a/python/cudf/cudf/core/byte_pair_encoding.py b/python/cudf/cudf/core/byte_pair_encoding.py index 8d38a5f2272..b49f5154697 100644 --- a/python/cudf/cudf/core/byte_pair_encoding.py +++ b/python/cudf/cudf/core/byte_pair_encoding.py @@ -5,9 +5,6 @@ import pylibcudf as plc import cudf -from cudf._lib.nvtext.byte_pair_encode import ( - byte_pair_encoding as cpp_byte_pair_encoding, -) class BytePairEncoder: @@ -25,12 +22,12 @@ class BytePairEncoder: BytePairEncoder """ - def __init__(self, merges_pair: "cudf.Series"): + def __init__(self, merges_pair: cudf.Series) -> None: self.merge_pairs = plc.nvtext.byte_pair_encode.BPEMergePairs( merges_pair._column.to_pylibcudf(mode="read") ) - def __call__(self, text, separator: str = " ") -> cudf.Series: + def __call__(self, text: cudf.Series, separator: str = " ") -> cudf.Series: """ Parameters @@ -57,6 +54,6 @@ def __call__(self, text, separator: str = " ") -> cudf.Series: dtype: object """ sep = cudf.Scalar(separator, dtype="str") - result = cpp_byte_pair_encoding(text._column, self.merge_pairs, sep) - - return cudf.Series._from_column(result) + return cudf.Series._from_column( + text._column.byte_pair_encoding(self.merge_pairs, sep) + ) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 06196717ce3..c021554f3bd 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -20,7 +20,7 @@ import cudf.core.column.column as column import cudf.core.column.datetime as datetime from cudf import _lib as libcudf -from cudf._lib import string_casting as str_cast, strings as libstrings +from cudf._lib import string_casting as str_cast from cudf._lib.column import Column from cudf._lib.types import size_type_dtype from cudf.api.types import is_integer, is_scalar, is_string_dtype @@ -45,6 +45,7 @@ SeriesOrIndex, ) from cudf.core.buffer import Buffer + from cudf.core.column.lists import ListColumn from cudf.core.column.numerical import NumericalColumn @@ -624,7 +625,7 @@ def join( def _split_by_character(self): col = self._column.fillna("") # sanitize nulls - result_col = libstrings.character_tokenize(col) + result_col = col.character_tokenize() offset_col = col.children[0] @@ -4693,9 +4694,7 @@ def normalize_spaces(self) -> SeriesOrIndex: 1 test string dtype: object """ - return self._return_or_inplace( - libstrings.normalize_spaces(self._column) - ) + return self._return_or_inplace(self._column.normalize_spaces()) def normalize_characters(self, do_lower: bool = True) -> SeriesOrIndex: r""" @@ -4743,7 +4742,7 @@ def normalize_characters(self, do_lower: bool = True) -> SeriesOrIndex: dtype: object """ return self._return_or_inplace( - 
libstrings.normalize_characters(self._column, do_lower) + self._column.normalize_characters(do_lower) ) def tokenize(self, delimiter: str = " ") -> SeriesOrIndex: @@ -4775,16 +4774,16 @@ def tokenize(self, delimiter: str = " ") -> SeriesOrIndex: 2 goodbye dtype: object """ - delimiter = _massage_string_arg(delimiter, "delimiter", allow_col=True) + delim = _massage_string_arg(delimiter, "delimiter", allow_col=True) - if isinstance(delimiter, Column): + if isinstance(delim, Column): result = self._return_or_inplace( - libstrings._tokenize_column(self._column, delimiter), + self._column.tokenize_column(delim), retain_index=False, ) - elif isinstance(delimiter, cudf.Scalar): + elif isinstance(delim, cudf.Scalar): result = self._return_or_inplace( - libstrings._tokenize_scalar(self._column, delimiter), + self._column.tokenize_scalar(delim), retain_index=False, ) else: @@ -4799,7 +4798,7 @@ def tokenize(self, delimiter: str = " ") -> SeriesOrIndex: return result def detokenize( - self, indices: "cudf.Series", separator: str = " " + self, indices: cudf.Series, separator: str = " " ) -> SeriesOrIndex: """ Combines tokens into strings by concatenating them in the order @@ -4829,9 +4828,9 @@ def detokenize( 2 three dtype: object """ - separator = _massage_string_arg(separator, "separator") + sep = _massage_string_arg(separator, "separator") return self._return_or_inplace( - libstrings.detokenize(self._column, indices._column, separator), + self._column.detokenize(indices._column, sep), # type: ignore[arg-type] retain_index=False, ) @@ -4882,17 +4881,15 @@ def character_tokenize(self) -> SeriesOrIndex: 2 . dtype: object """ - result_col = libstrings.character_tokenize(self._column) + result_col = self._column.character_tokenize() if isinstance(self._parent, cudf.Series): lengths = self.len().fillna(0) index = self._parent.index.repeat(lengths) - return cudf.Series._from_column( + return type(self._parent)._from_column( result_col, name=self._parent.name, index=index ) - elif isinstance(self._parent, cudf.BaseIndex): - return cudf.Index._from_column(result_col, name=self._parent.name) else: - return result_col + return self._return_or_inplace(result_col) def token_count(self, delimiter: str = " ") -> SeriesOrIndex: """ @@ -4919,15 +4916,15 @@ def token_count(self, delimiter: str = " ") -> SeriesOrIndex: 2 0 dtype: int32 """ - delimiter = _massage_string_arg(delimiter, "delimiter", allow_col=True) - if isinstance(delimiter, Column): + delim = _massage_string_arg(delimiter, "delimiter", allow_col=True) + if isinstance(delim, Column): return self._return_or_inplace( - libstrings._count_tokens_column(self._column, delimiter) + self._column.count_tokens_column(delim) ) - elif isinstance(delimiter, cudf.Scalar): + elif isinstance(delim, cudf.Scalar): return self._return_or_inplace( - libstrings._count_tokens_scalar(self._column, delimiter) + self._column.count_tokens_scalar(delim) # type: ignore[arg-type] ) else: raise TypeError( @@ -4966,9 +4963,9 @@ def ngrams(self, n: int = 2, separator: str = "_") -> SeriesOrIndex: 2 xyz_hhh dtype: object """ - separator = _massage_string_arg(separator, "separator") + sep = _massage_string_arg(separator, "separator") return self._return_or_inplace( - libstrings.generate_ngrams(self._column, n, separator), + self._column.generate_ngrams(n, sep), # type: ignore[arg-type] retain_index=False, ) @@ -5015,7 +5012,7 @@ def character_ngrams( dtype: list """ result = self._return_or_inplace( - libstrings.generate_character_ngrams(self._column, n), + 
self._column.generate_character_ngrams(n), retain_index=True, ) if isinstance(result, cudf.Series) and not as_list: @@ -5060,7 +5057,7 @@ def hash_character_ngrams( """ result = self._return_or_inplace( - libstrings.hash_character_ngrams(self._column, n), + self._column.hash_character_ngrams(n), retain_index=True, ) if isinstance(result, cudf.Series) and not as_list: @@ -5098,10 +5095,10 @@ def ngrams_tokenize( 2 best_book dtype: object """ - delimiter = _massage_string_arg(delimiter, "delimiter") - separator = _massage_string_arg(separator, "separator") + delim = _massage_string_arg(delimiter, "delimiter") + sep = _massage_string_arg(separator, "separator") return self._return_or_inplace( - libstrings.ngrams_tokenize(self._column, n, delimiter, separator), + self._column.ngrams_tokenize(n, delim, sep), # type: ignore[arg-type] retain_index=False, ) @@ -5180,10 +5177,9 @@ def replace_tokens( ) return self._return_or_inplace( - libstrings.replace_tokens( - self._column, - targets_column, - replacements_column, + self._column.replace_tokens( + targets_column, # type: ignore[arg-type] + replacements_column, # type: ignore[arg-type] cudf.Scalar(delimiter, dtype="str"), ), ) @@ -5251,8 +5247,7 @@ def filter_tokens( ) return self._return_or_inplace( - libstrings.filter_tokens( - self._column, + self._column.filter_tokens( min_token_length, cudf.Scalar(replacement, dtype="str"), cudf.Scalar(delimiter, dtype="str"), @@ -5278,9 +5273,7 @@ def porter_stemmer_measure(self) -> SeriesOrIndex: 1 2 dtype: int32 """ - return self._return_or_inplace( - libstrings.porter_stemmer_measure(self._column) - ) + return self._return_or_inplace(self._column.porter_stemmer_measure()) def is_consonant(self, position) -> SeriesOrIndex: """ @@ -5313,17 +5306,10 @@ def is_consonant(self, position) -> SeriesOrIndex: 1 False dtype: bool """ - ltype = libstrings.LetterType.CONSONANT - if can_convert_to_column(position): - return self._return_or_inplace( - libstrings.is_letter_multi( - self._column, ltype, column.as_column(position) - ), - ) - + position = column.as_column(position) return self._return_or_inplace( - libstrings.is_letter(self._column, ltype, position) + self._column.is_letter(False, position) # type: ignore[arg-type] ) def is_vowel(self, position) -> SeriesOrIndex: @@ -5357,17 +5343,10 @@ def is_vowel(self, position) -> SeriesOrIndex: 1 True dtype: bool """ - ltype = libstrings.LetterType.VOWEL - if can_convert_to_column(position): - return self._return_or_inplace( - libstrings.is_letter_multi( - self._column, ltype, column.as_column(position) - ), - ) - + position = column.as_column(position) return self._return_or_inplace( - libstrings.is_letter(self._column, ltype, position) + self._column.is_letter(True, position) # type: ignore[arg-type] ) def edit_distance(self, targets) -> SeriesOrIndex: @@ -5416,7 +5395,7 @@ def edit_distance(self, targets) -> SeriesOrIndex: ) return self._return_or_inplace( - libstrings.edit_distance(self._column, targets_column) + self._column.edit_distance(targets_column) # type: ignore[arg-type] ) def edit_distance_matrix(self) -> SeriesOrIndex: @@ -5456,9 +5435,7 @@ def edit_distance_matrix(self) -> SeriesOrIndex: "Cannot compute edit distance between null strings. " "Consider removing them using `dropna` or fill with `fillna`." 
) - return self._return_or_inplace( - libstrings.edit_distance_matrix(self._column) - ) + return self._return_or_inplace(self._column.edit_distance_matrix()) def minhash( self, seed: np.uint32, a: ColumnLike, b: ColumnLike, width: int @@ -5508,7 +5485,7 @@ def minhash( f"Expecting a Series with dtype uint32, got {type(b)}" ) return self._return_or_inplace( - libstrings.minhash(self._column, seed, a_column, b_column, width) + self._column.minhash(seed, a_column, b_column, width) # type: ignore[arg-type] ) def minhash64( @@ -5559,7 +5536,7 @@ def minhash64( f"Expecting a Series with dtype uint64, got {type(b)}" ) return self._return_or_inplace( - libstrings.minhash64(self._column, seed, a_column, b_column, width) + self._column.minhash64(seed, a_column, b_column, width) # type: ignore[arg-type] ) def jaccard_index(self, input: cudf.Series, width: int) -> SeriesOrIndex: @@ -5585,13 +5562,14 @@ def jaccard_index(self, input: cudf.Series, width: int) -> SeriesOrIndex: 1 0.307692 dtype: float32 """ - return self._return_or_inplace( - libstrings.jaccard_index(self._column, input._column, width), + self._column.jaccard_index(input._column, width) ) -def _massage_string_arg(value, name, allow_col=False): +def _massage_string_arg( + value, name, allow_col: bool = False +) -> StringColumn | cudf.Scalar: if isinstance(value, cudf.Scalar): return value @@ -5602,9 +5580,9 @@ def _massage_string_arg(value, name, allow_col=False): if allow_col: if isinstance(value, list): - return column.as_column(value, dtype="str") + return column.as_column(value, dtype="str") # type: ignore[return-value] - if isinstance(value, Column) and is_string_dtype(value.dtype): + if isinstance(value, StringColumn): return value allowed_types.append("Column") @@ -6148,6 +6126,278 @@ def view(self, dtype) -> "cudf.core.column.ColumnBase": return to_view.view(dtype) + @acquire_spill_lock() + def minhash( + self, + seed: np.uint32, + a: NumericalColumn, + b: NumericalColumn, + width: int, + ) -> ListColumn: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.minhash.minhash( + self.to_pylibcudf(mode="read"), + seed, + a.to_pylibcudf(mode="read"), + b.to_pylibcudf(mode="read"), + width, + ) + ) + + @acquire_spill_lock() + def minhash64( + self, + seed: np.uint64, + a: NumericalColumn, + b: NumericalColumn, + width: int, + ) -> ListColumn: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.minhash.minhash64( + self.to_pylibcudf(mode="read"), + seed, + a.to_pylibcudf(mode="read"), + b.to_pylibcudf(mode="read"), + width, + ) + ) + + @acquire_spill_lock() + def jaccard_index(self, other: Self, width: int) -> NumericalColumn: + result = plc.nvtext.jaccard.jaccard_index( + self.to_pylibcudf(mode="read"), + other.to_pylibcudf(mode="read"), + width, + ) + return type(self).from_pylibcudf(result) # type: ignore[return-value] + + @acquire_spill_lock() + def generate_ngrams(self, ngrams: int, separator: cudf.Scalar) -> Self: + result = plc.nvtext.generate_ngrams.generate_ngrams( + self.to_pylibcudf(mode="read"), + ngrams, + separator.device_value.c_value, + ) + return type(self).from_pylibcudf(result) # type: ignore[return-value] + + @acquire_spill_lock() + def generate_character_ngrams(self, ngrams: int) -> ListColumn: + result = plc.nvtext.generate_ngrams.generate_character_ngrams( + self.to_pylibcudf(mode="read"), ngrams + ) + return type(self).from_pylibcudf(result) # type: ignore[return-value] + + @acquire_spill_lock() + def hash_character_ngrams(self, ngrams: int) -> ListColumn: + 
result = plc.nvtext.generate_ngrams.hash_character_ngrams( + self.to_pylibcudf(mode="read"), ngrams + ) + return type(self).from_pylibcudf(result) # type: ignore[return-value] + + @acquire_spill_lock() + def edit_distance(self, targets: Self) -> NumericalColumn: + result = plc.nvtext.edit_distance.edit_distance( + self.to_pylibcudf(mode="read"), targets.to_pylibcudf(mode="read") + ) + return type(self).from_pylibcudf(result) # type: ignore[return-value] + + @acquire_spill_lock() + def edit_distance_matrix(self) -> ListColumn: + result = plc.nvtext.edit_distance.edit_distance_matrix( + self.to_pylibcudf(mode="read") + ) + return type(self).from_pylibcudf(result) # type: ignore[return-value] + + @acquire_spill_lock() + def byte_pair_encoding( + self, + merge_pairs: plc.nvtext.byte_pair_encode.BPEMergePairs, + separator: cudf.Scalar, + ) -> Self: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.byte_pair_encode.byte_pair_encoding( + self.to_pylibcudf(mode="read"), + merge_pairs, + separator.device_value.c_value, + ) + ) + + @acquire_spill_lock() + def ngrams_tokenize( + self, + ngrams: int, + delimiter: cudf.Scalar, + separator: cudf.Scalar, + ) -> Self: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.ngrams_tokenize.ngrams_tokenize( + self.to_pylibcudf(mode="read"), + ngrams, + delimiter.device_value.c_value, + separator.device_value.c_value, + ) + ) + + @acquire_spill_lock() + def normalize_spaces(self) -> Self: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.normalize.normalize_spaces( + self.to_pylibcudf(mode="read") + ) + ) + + @acquire_spill_lock() + def normalize_characters(self, do_lower: bool = True) -> Self: + return Column.from_pylibcudf( # type: ignore[return-value] + plc.nvtext.normalize.normalize_characters( + self.to_pylibcudf(mode="read"), + do_lower, + ) + ) + + @acquire_spill_lock() + def replace_tokens( + self, targets: Self, replacements: Self, delimiter: cudf.Scalar + ) -> Self: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.replace.replace_tokens( + self.to_pylibcudf(mode="read"), + targets.to_pylibcudf(mode="read"), + replacements.to_pylibcudf(mode="read"), + delimiter.device_value.c_value, + ) + ) + + @acquire_spill_lock() + def filter_tokens( + self, + min_token_length: int, + replacement: cudf.Scalar, + delimiter: cudf.Scalar, + ) -> Self: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.replace.filter_tokens( + self.to_pylibcudf(mode="read"), + min_token_length, + replacement.device_value.c_value, + delimiter.device_value.c_value, + ) + ) + + @acquire_spill_lock() + def porter_stemmer_measure(self) -> NumericalColumn: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.stemmer.porter_stemmer_measure( + self.to_pylibcudf(mode="read") + ) + ) + + @acquire_spill_lock() + def is_letter(self, is_vowel: bool, index: int | NumericalColumn) -> Self: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.stemmer.is_letter( + self.to_pylibcudf(mode="read"), + is_vowel, + index + if isinstance(index, int) + else index.to_pylibcudf(mode="read"), + ) + ) + + @acquire_spill_lock() + def subword_tokenize( + self, + hashed_vocabulary: plc.nvtext.subword_tokenize.HashedVocabulary, + max_sequence_length: int = 64, + stride: int = 48, + do_lower: bool = True, + do_truncate: bool = False, + ) -> tuple[ColumnBase, ColumnBase, ColumnBase]: + """ + Subword tokenizes text series by using the 
pre-loaded hashed vocabulary + """ + result = plc.nvtext.subword_tokenize.subword_tokenize( + self.to_pylibcudf(mode="read"), + hashed_vocabulary, + max_sequence_length, + stride, + do_lower, + do_truncate, + ) + # return the 3 tensor components + tokens = type(self).from_pylibcudf(result[0]) + masks = type(self).from_pylibcudf(result[1]) + metadata = type(self).from_pylibcudf(result[2]) + return tokens, masks, metadata + + @acquire_spill_lock() + def tokenize_scalar(self, delimiter: cudf.Scalar) -> Self: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.tokenize.tokenize_scalar( + self.to_pylibcudf(mode="read"), delimiter.device_value.c_value + ) + ) + + @acquire_spill_lock() + def tokenize_column(self, delimiters: Self) -> Self: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.tokenize.tokenize_column( + self.to_pylibcudf(mode="read"), + delimiters.to_pylibcudf(mode="read"), + ) + ) + + @acquire_spill_lock() + def count_tokens_scalar(self, delimiter: cudf.Scalar) -> NumericalColumn: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.tokenize.count_tokens_scalar( + self.to_pylibcudf(mode="read"), delimiter.device_value.c_value + ) + ) + + @acquire_spill_lock() + def count_tokens_column(self, delimiters: Self) -> NumericalColumn: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.tokenize.count_tokens_column( + self.to_pylibcudf(mode="read"), + delimiters.to_pylibcudf(mode="read"), + ) + ) + + @acquire_spill_lock() + def character_tokenize(self) -> Self: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.tokenize.character_tokenize( + self.to_pylibcudf(mode="read") + ) + ) + + @acquire_spill_lock() + def tokenize_with_vocabulary( + self, + vocabulary: plc.nvtext.tokenize.TokenizeVocabulary, + delimiter: cudf.Scalar, + default_id: int, + ) -> Self: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.tokenize.tokenize_with_vocabulary( + self.to_pylibcudf(mode="read"), + vocabulary, + delimiter.device_value.c_value, + default_id, + ) + ) + + @acquire_spill_lock() + def detokenize(self, indices: ColumnBase, separator: cudf.Scalar) -> Self: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.tokenize.detokenize( + self.to_pylibcudf(mode="read"), + indices.to_pylibcudf(mode="read"), + separator.device_value.c_value, + ) + ) + def _modify_characters( self, method: Callable[[plc.Column], plc.Column] ) -> Self: diff --git a/python/cudf/cudf/core/subword_tokenizer.py b/python/cudf/cudf/core/subword_tokenizer.py index dda1f199078..479838ef2a8 100644 --- a/python/cudf/cudf/core/subword_tokenizer.py +++ b/python/cudf/cudf/core/subword_tokenizer.py @@ -8,10 +8,6 @@ import pylibcudf as plc -from cudf._lib.nvtext.subword_tokenize import ( - subword_tokenize_inmem_hash as cpp_subword_tokenize, -) - def _cast_to_appropriate_type(ar, cast_type): if cast_type == "cp": @@ -210,8 +206,7 @@ def __call__( stride = max_length - stride # behavior varies from subword_tokenize but maps with huggingface - input_ids, attention_mask, metadata = cpp_subword_tokenize( - text._column, + input_ids, attention_mask, metadata = text._column.subword_tokenize( self.vocab_file, max_sequence_length=max_length, stride=stride, diff --git a/python/cudf/cudf/core/tokenize_vocabulary.py b/python/cudf/cudf/core/tokenize_vocabulary.py index 1e31376cce8..fb8b9b3131c 100644 --- a/python/cudf/cudf/core/tokenize_vocabulary.py +++ 
b/python/cudf/cudf/core/tokenize_vocabulary.py
@@ -5,9 +5,6 @@
 import pylibcudf as plc
 
 import cudf
-from cudf._lib.nvtext.tokenize import (
-    tokenize_with_vocabulary as cpp_tokenize_with_vocabulary,
-)
 
 
 class TokenizeVocabulary:
@@ -20,7 +17,7 @@ class TokenizeVocabulary:
         Strings column of vocabulary terms
     """
 
-    def __init__(self, vocabulary: "cudf.Series"):
+    def __init__(self, vocabulary: cudf.Series) -> None:
         self.vocabulary = plc.nvtext.tokenize.TokenizeVocabulary(
             vocabulary._column.to_pylibcudf(mode="read")
         )
@@ -46,8 +43,8 @@ def tokenize(
         if delimiter is None:
             delimiter = ""
         delim = cudf.Scalar(delimiter, dtype="str")
-        result = cpp_tokenize_with_vocabulary(
-            text._column, self.vocabulary, delim, default_id
+        result = text._column.tokenize_with_vocabulary(
+            self.vocabulary, delim, default_id
         )
 
         return cudf.Series._from_column(result)

From 774970283bfa6ca5ac4bc0619fc8595f01b7362b Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 12 Dec 2024 18:06:44 -0800
Subject: [PATCH 69/78] Remove cudf._lib.csv in favor of inlining pylibcudf (#17485)

Contributes to https://github.com/rapidsai/cudf/issues/17317

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Matthew Murray (https://github.com/Matt711)

URL: https://github.com/rapidsai/cudf/pull/17485
---
 python/cudf/cudf/_lib/CMakeLists.txt |   5 +-
 python/cudf/cudf/_lib/__init__.py    |   1 -
 python/cudf/cudf/_lib/csv.pyx        | 414 ------------------
 python/cudf/cudf/io/csv.py           | 466 ++++++++++++++++++++++-----
 4 files changed, 385 insertions(+), 501 deletions(-)
 delete mode 100644 python/cudf/cudf/_lib/csv.pyx

diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt
index c2677c6d88d..b402db0443d 100644
--- a/python/cudf/cudf/_lib/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/CMakeLists.txt
@@ -12,9 +12,8 @@
 # the License.
 # =============================================================================
 
-set(cython_sources
-  column.pyx copying.pyx csv.pyx groupby.pyx interop.pyx reduce.pyx scalar.pyx sort.pyx
-  stream_compaction.pyx string_casting.pyx strings_udf.pyx types.pyx utils.pyx
+set(cython_sources column.pyx copying.pyx groupby.pyx interop.pyx reduce.pyx scalar.pyx sort.pyx
+  stream_compaction.pyx string_casting.pyx strings_udf.pyx types.pyx utils.pyx
 )
 set(linked_libraries cudf::cudf)
diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py
index f86a15b932b..0299b264189 100644
--- a/python/cudf/cudf/_lib/__init__.py
+++ b/python/cudf/cudf/_lib/__init__.py
@@ -3,7 +3,6 @@
 
 from . import (
     copying,
-    csv,
     groupby,
     interop,
     reduce,
diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx
deleted file mode 100644
index 641fc18c203..00000000000
--- a/python/cudf/cudf/_lib/csv.pyx
+++ /dev/null
@@ -1,414 +0,0 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
- -from libcpp cimport bool - -cimport pylibcudf.libcudf.types as libcudf_types - -from cudf._lib.types cimport dtype_to_pylibcudf_type - -import errno -import os -from collections import abc -from io import BytesIO, StringIO - -import numpy as np -import pandas as pd - -import cudf -from cudf.core.buffer import acquire_spill_lock - -from libcpp cimport bool - -from cudf._lib.utils cimport data_from_pylibcudf_io - -import pylibcudf as plc - -from cudf.api.types import is_hashable - -from pylibcudf.types cimport DataType - -CSV_HEX_TYPE_MAP = { - "hex": np.dtype("int64"), - "hex64": np.dtype("int64"), - "hex32": np.dtype("int32") -} - - -def validate_args( - object delimiter, - object sep, - bool delim_whitespace, - object decimal, - object thousands, - object nrows, - int skipfooter, - object byte_range, - int skiprows -): - if delim_whitespace: - if delimiter is not None: - raise ValueError("cannot set both delimiter and delim_whitespace") - if sep != ',': - raise ValueError("cannot set both sep and delim_whitespace") - - # Alias sep -> delimiter. - actual_delimiter = delimiter if delimiter else sep - - if decimal == actual_delimiter: - raise ValueError("decimal cannot be the same as delimiter") - - if thousands == actual_delimiter: - raise ValueError("thousands cannot be the same as delimiter") - - if nrows is not None and skipfooter != 0: - raise ValueError("cannot use both nrows and skipfooter parameters") - - if byte_range is not None: - if skipfooter != 0 or skiprows != 0 or nrows is not None: - raise ValueError("""cannot manually limit rows to be read when - using the byte range parameter""") - - -def read_csv( - object datasource, - object lineterminator="\n", - object quotechar='"', - int quoting=0, - bool doublequote=True, - object header="infer", - bool mangle_dupe_cols=True, - object usecols=None, - object sep=",", - object delimiter=None, - bool delim_whitespace=False, - bool skipinitialspace=False, - object names=None, - object dtype=None, - int skipfooter=0, - int skiprows=0, - bool dayfirst=False, - object compression="infer", - object thousands=None, - object decimal=".", - object true_values=None, - object false_values=None, - object nrows=None, - object byte_range=None, - bool skip_blank_lines=True, - object parse_dates=None, - object comment=None, - object na_values=None, - bool keep_default_na=True, - bool na_filter=True, - object prefix=None, - object index_col=None, -): - """ - Cython function to call into libcudf API, see `read_csv`. - - See Also - -------- - cudf.read_csv - """ - - if not isinstance(datasource, (BytesIO, StringIO, bytes)): - if not os.path.isfile(datasource): - raise FileNotFoundError( - errno.ENOENT, os.strerror(errno.ENOENT), datasource - ) - - if isinstance(datasource, StringIO): - datasource = datasource.read().encode() - elif isinstance(datasource, str) and not os.path.isfile(datasource): - datasource = datasource.encode() - - validate_args(delimiter, sep, delim_whitespace, decimal, thousands, - nrows, skipfooter, byte_range, skiprows) - - # Alias sep -> delimiter. 
- if delimiter is None: - delimiter = sep - - delimiter = str(delimiter) - - if byte_range is None: - byte_range = (0, 0) - - if compression is None: - c_compression = plc.io.types.CompressionType.NONE - else: - compression_map = { - "infer": plc.io.types.CompressionType.AUTO, - "gzip": plc.io.types.CompressionType.GZIP, - "bz2": plc.io.types.CompressionType.BZIP2, - "zip": plc.io.types.CompressionType.ZIP, - } - c_compression = compression_map[compression] - - # We need this later when setting index cols - orig_header = header - - if names is not None: - # explicitly mentioned name, so don't check header - if header is None or header == 'infer': - header = -1 - else: - header = header - names = list(names) - else: - if header is None: - header = -1 - elif header == 'infer': - header = 0 - - hex_cols = [] - - new_dtypes = [] - if dtype is not None: - if isinstance(dtype, abc.Mapping): - new_dtypes = dict() - for k, v in dtype.items(): - col_type = v - if is_hashable(v) and v in CSV_HEX_TYPE_MAP: - col_type = CSV_HEX_TYPE_MAP[v] - hex_cols.append(str(k)) - - new_dtypes[k] = _get_plc_data_type_from_dtype( - cudf.dtype(col_type) - ) - elif ( - cudf.api.types.is_scalar(dtype) or - isinstance(dtype, ( - np.dtype, pd.api.extensions.ExtensionDtype, type - )) - ): - if is_hashable(dtype) and dtype in CSV_HEX_TYPE_MAP: - dtype = CSV_HEX_TYPE_MAP[dtype] - hex_cols.append(0) - - new_dtypes.append( - _get_plc_data_type_from_dtype(dtype) - ) - elif isinstance(dtype, abc.Collection): - for index, col_dtype in enumerate(dtype): - if is_hashable(col_dtype) and col_dtype in CSV_HEX_TYPE_MAP: - col_dtype = CSV_HEX_TYPE_MAP[col_dtype] - hex_cols.append(index) - - new_dtypes.append( - _get_plc_data_type_from_dtype(col_dtype) - ) - else: - raise ValueError( - "dtype should be a scalar/str/list-like/dict-like" - ) - options = ( - plc.io.csv.CsvReaderOptions.builder(plc.io.SourceInfo([datasource])) - .compression(c_compression) - .mangle_dupe_cols(mangle_dupe_cols) - .byte_range_offset(byte_range[0]) - .byte_range_size(byte_range[1]) - .nrows(nrows if nrows is not None else -1) - .skiprows(skiprows) - .skipfooter(skipfooter) - .quoting(quoting) - .lineterminator(str(lineterminator)) - .quotechar(quotechar) - .decimal(decimal) - .delim_whitespace(delim_whitespace) - .skipinitialspace(skipinitialspace) - .skip_blank_lines(skip_blank_lines) - .doublequote(doublequote) - .keep_default_na(keep_default_na) - .na_filter(na_filter) - .dayfirst(dayfirst) - .build() - ) - - options.set_header(header) - - if names is not None: - options.set_names([str(name) for name in names]) - - if prefix is not None: - options.set_prefix(prefix) - - if usecols is not None: - if all(isinstance(col, int) for col in usecols): - options.set_use_cols_indexes(list(usecols)) - else: - options.set_use_cols_names([str(name) for name in usecols]) - - if delimiter is not None: - options.set_delimiter(delimiter) - - if thousands is not None: - options.set_thousands(thousands) - - if comment is not None: - options.set_comment(comment) - - if parse_dates is not None: - options.set_parse_dates(list(parse_dates)) - - if hex_cols is not None: - options.set_parse_hex(list(hex_cols)) - - options.set_dtypes(new_dtypes) - - if true_values is not None: - options.set_true_values([str(val) for val in true_values]) - - if false_values is not None: - options.set_false_values([str(val) for val in false_values]) - - if na_values is not None: - options.set_na_values([str(val) for val in na_values]) - - df = cudf.DataFrame._from_data( - 
*data_from_pylibcudf_io(plc.io.csv.read_csv(options)) - ) - - if dtype is not None: - if isinstance(dtype, abc.Mapping): - for k, v in dtype.items(): - if isinstance(cudf.dtype(v), cudf.CategoricalDtype): - df._data[str(k)] = df._data[str(k)].astype(v) - elif ( - cudf.api.types.is_scalar(dtype) or - isinstance(dtype, ( - np.dtype, pd.api.extensions.ExtensionDtype, type - )) - ): - if isinstance(cudf.dtype(dtype), cudf.CategoricalDtype): - df = df.astype(dtype) - elif isinstance(dtype, abc.Collection): - for index, col_dtype in enumerate(dtype): - if isinstance(cudf.dtype(col_dtype), cudf.CategoricalDtype): - col_name = df._column_names[index] - df._data[col_name] = df._data[col_name].astype(col_dtype) - - if names is not None and len(names) and isinstance(names[0], int): - df.columns = [int(x) for x in df._data] - elif names is None and header == -1 and cudf.get_option("mode.pandas_compatible"): - df.columns = [int(x) for x in df._column_names] - - # Set index if the index_col parameter is passed - if index_col is not None and index_col is not False: - if isinstance(index_col, int): - index_col_name = df._data.get_labels_by_index(index_col)[0] - df = df.set_index(index_col_name) - if isinstance(index_col_name, str) and \ - names is None and orig_header == "infer": - if index_col_name.startswith("Unnamed:"): - # TODO: Try to upstream it to libcudf - # csv reader in future - df._index.name = None - elif names is None: - df._index.name = index_col - else: - df = df.set_index(index_col) - - return df - - -@acquire_spill_lock() -def write_csv( - table, - object path_or_buf=None, - object sep=",", - object na_rep="", - bool header=True, - object lineterminator="\n", - int rows_per_chunk=8, - bool index=True, -): - """ - Cython function to call into libcudf API, see `write_csv`. - - See Also - -------- - cudf.to_csv - """ - index_and_not_empty = index is True and table.index is not None - columns = [ - col.to_pylibcudf(mode="read") for col in table.index._columns - ] if index_and_not_empty else [] - columns.extend(col.to_pylibcudf(mode="read") for col in table._columns) - col_names = [] - if header: - all_names = list(table.index.names) if index_and_not_empty else [] - all_names.extend( - na_rep if name is None or pd.isnull(name) - else name for name in table._column_names - ) - col_names = [ - '""' if (name in (None, '') and len(all_names) == 1) - else (str(name) if name not in (None, '') else '') - for name in all_names - ] - try: - plc.io.csv.write_csv( - ( - plc.io.csv.CsvWriterOptions.builder( - plc.io.SinkInfo([path_or_buf]), plc.Table(columns) - ) - .names(col_names) - .na_rep(na_rep) - .include_header(header) - .rows_per_chunk(rows_per_chunk) - .line_terminator(str(lineterminator)) - .inter_column_delimiter(str(sep)) - .true_value("True") - .false_value("False") - .build() - ) - ) - except OverflowError: - raise OverflowError( - f"Writing CSV file with chunksize={rows_per_chunk} failed. " - "Consider providing a smaller chunksize argument." 
- ) - - -cdef DataType _get_plc_data_type_from_dtype(object dtype) except *: - # TODO: Remove this work-around Dictionary types - # in libcudf are fully mapped to categorical columns: - # https://github.com/rapidsai/cudf/issues/3960 - if isinstance(dtype, cudf.CategoricalDtype): - dtype = dtype.categories.dtype - elif dtype == "category": - dtype = "str" - - if isinstance(dtype, str): - if str(dtype) == "date32": - return DataType( - libcudf_types.type_id.TIMESTAMP_DAYS - ) - elif str(dtype) in ("date", "date64"): - return DataType( - libcudf_types.type_id.TIMESTAMP_MILLISECONDS - ) - elif str(dtype) == "timestamp": - return DataType( - libcudf_types.type_id.TIMESTAMP_MILLISECONDS - ) - elif str(dtype) == "timestamp[us]": - return DataType( - libcudf_types.type_id.TIMESTAMP_MICROSECONDS - ) - elif str(dtype) == "timestamp[s]": - return DataType( - libcudf_types.type_id.TIMESTAMP_SECONDS - ) - elif str(dtype) == "timestamp[ms]": - return DataType( - libcudf_types.type_id.TIMESTAMP_MILLISECONDS - ) - elif str(dtype) == "timestamp[ns]": - return DataType( - libcudf_types.type_id.TIMESTAMP_NANOSECONDS - ) - - dtype = cudf.dtype(dtype) - return dtype_to_pylibcudf_type(dtype) diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py index 3dc8915bfd1..da9a66f3874 100644 --- a/python/cudf/cudf/io/csv.py +++ b/python/cudf/cudf/io/csv.py @@ -1,57 +1,73 @@ # Copyright (c) 2018-2024, NVIDIA CORPORATION. +from __future__ import annotations +import errno +import itertools +import os import warnings from collections import abc from io import BytesIO, StringIO +from typing import cast import numpy as np +import pandas as pd + +import pylibcudf as plc import cudf -from cudf import _lib as libcudf -from cudf.api.types import is_scalar +from cudf._lib.types import dtype_to_pylibcudf_type +from cudf._lib.utils import data_from_pylibcudf_io +from cudf.api.types import is_hashable, is_scalar +from cudf.core.buffer import acquire_spill_lock from cudf.utils import ioutils from cudf.utils.dtypes import _maybe_convert_to_default_type from cudf.utils.performance_tracking import _performance_tracking +_CSV_HEX_TYPE_MAP = { + "hex": np.dtype("int64"), + "hex64": np.dtype("int64"), + "hex32": np.dtype("int32"), +} + @_performance_tracking @ioutils.doc_read_csv() def read_csv( filepath_or_buffer, - sep=",", - delimiter=None, + sep: str = ",", + delimiter: str | None = None, header="infer", names=None, index_col=None, usecols=None, prefix=None, - mangle_dupe_cols=True, + mangle_dupe_cols: bool = True, dtype=None, true_values=None, false_values=None, - skipinitialspace=False, - skiprows=0, - skipfooter=0, - nrows=None, + skipinitialspace: bool = False, + skiprows: int = 0, + skipfooter: int = 0, + nrows: int | None = None, na_values=None, - keep_default_na=True, - na_filter=True, - skip_blank_lines=True, + keep_default_na: bool = True, + na_filter: bool = True, + skip_blank_lines: bool = True, parse_dates=None, - dayfirst=False, + dayfirst: bool = False, compression="infer", - thousands=None, - decimal=".", - lineterminator="\n", - quotechar='"', - quoting=0, - doublequote=True, - comment=None, - delim_whitespace=False, - byte_range=None, + thousands: str | None = None, + decimal: str = ".", + lineterminator: str = "\n", + quotechar: str = '"', + quoting: int = 0, + doublequote: bool = True, + comment: str | None = None, + delim_whitespace: bool = False, + byte_range: list[int] | tuple[int, int] | None = None, storage_options=None, - bytes_per_thread=None, -): + bytes_per_thread: int | None = None, +) -> 
cudf.DataFrame: """{docstring}""" if delim_whitespace is not False: @@ -77,60 +93,225 @@ def read_csv( if na_values is not None and is_scalar(na_values): na_values = [na_values] - df = libcudf.csv.read_csv( - filepath_or_buffer, - lineterminator=lineterminator, - quotechar=quotechar, - quoting=quoting, - doublequote=doublequote, - header=header, - mangle_dupe_cols=mangle_dupe_cols, - usecols=usecols, - sep=sep, - delimiter=delimiter, - delim_whitespace=delim_whitespace, - skipinitialspace=skipinitialspace, - names=names, - dtype=dtype, - skipfooter=skipfooter, - skiprows=skiprows, - dayfirst=dayfirst, - compression=compression, - thousands=thousands, - decimal=decimal, - true_values=true_values, - false_values=false_values, - nrows=nrows, - byte_range=byte_range, - skip_blank_lines=skip_blank_lines, - parse_dates=parse_dates, - comment=comment, - na_values=na_values, - keep_default_na=keep_default_na, - na_filter=na_filter, - prefix=prefix, - index_col=index_col, + if not isinstance(filepath_or_buffer, (BytesIO, StringIO, bytes)): + if not os.path.isfile(filepath_or_buffer): + raise FileNotFoundError( + errno.ENOENT, os.strerror(errno.ENOENT), filepath_or_buffer + ) + + if isinstance(filepath_or_buffer, StringIO): + filepath_or_buffer = filepath_or_buffer.read().encode() + elif isinstance(filepath_or_buffer, str) and not os.path.isfile( + filepath_or_buffer + ): + filepath_or_buffer = filepath_or_buffer.encode() + + _validate_args( + delimiter, + sep, + delim_whitespace, + decimal, + thousands, + nrows, + skipfooter, + byte_range, + skiprows, + ) + + # Alias sep -> delimiter. + if delimiter is None: + delimiter = sep + + delimiter = str(delimiter) + + if byte_range is None: + byte_range = (0, 0) + + if compression is None: + c_compression = plc.io.types.CompressionType.NONE + else: + compression_map = { + "infer": plc.io.types.CompressionType.AUTO, + "gzip": plc.io.types.CompressionType.GZIP, + "bz2": plc.io.types.CompressionType.BZIP2, + "zip": plc.io.types.CompressionType.ZIP, + } + c_compression = compression_map[compression] + + # We need this later when setting index cols + orig_header = header + + if names is not None: + # explicitly mentioned name, so don't check header + if header is None or header == "infer": + header = -1 + else: + header = header + names = list(names) + else: + if header is None: + header = -1 + elif header == "infer": + header = 0 + + hex_cols: list[abc.Hashable] = [] + new_dtypes: list[plc.DataType] | dict[abc.Hashable, plc.DataType] = [] + if dtype is not None: + if isinstance(dtype, abc.Mapping): + new_dtypes = {} + for k, col_type in dtype.items(): + if is_hashable(col_type) and col_type in _CSV_HEX_TYPE_MAP: + col_type = _CSV_HEX_TYPE_MAP[col_type] + hex_cols.append(str(k)) + + new_dtypes[k] = _get_plc_data_type_from_dtype( + cudf.dtype(col_type) + ) + elif cudf.api.types.is_scalar(dtype) or isinstance( + dtype, (np.dtype, pd.api.extensions.ExtensionDtype, type) + ): + if is_hashable(dtype) and dtype in _CSV_HEX_TYPE_MAP: + dtype = _CSV_HEX_TYPE_MAP[dtype] + hex_cols.append(0) + + cast(list, new_dtypes).append(_get_plc_data_type_from_dtype(dtype)) + elif isinstance(dtype, abc.Collection): + for index, col_dtype in enumerate(dtype): + if is_hashable(col_dtype) and col_dtype in _CSV_HEX_TYPE_MAP: + col_dtype = _CSV_HEX_TYPE_MAP[col_dtype] + hex_cols.append(index) + + new_dtypes.append(_get_plc_data_type_from_dtype(col_dtype)) + else: + raise ValueError( + "dtype should be a scalar/str/list-like/dict-like" + ) + options = ( + 
plc.io.csv.CsvReaderOptions.builder( + plc.io.SourceInfo([filepath_or_buffer]) + ) + .compression(c_compression) + .mangle_dupe_cols(mangle_dupe_cols) + .byte_range_offset(byte_range[0]) + .byte_range_size(byte_range[1]) + .nrows(nrows if nrows is not None else -1) + .skiprows(skiprows) + .skipfooter(skipfooter) + .quoting(quoting) + .lineterminator(str(lineterminator)) + .quotechar(quotechar) + .decimal(decimal) + .delim_whitespace(delim_whitespace) + .skipinitialspace(skipinitialspace) + .skip_blank_lines(skip_blank_lines) + .doublequote(doublequote) + .keep_default_na(keep_default_na) + .na_filter(na_filter) + .dayfirst(dayfirst) + .build() + ) + + options.set_header(header) + + if names is not None: + options.set_names([str(name) for name in names]) + + if prefix is not None: + options.set_prefix(prefix) + + if usecols is not None: + if all(isinstance(col, int) for col in usecols): + options.set_use_cols_indexes(list(usecols)) + else: + options.set_use_cols_names([str(name) for name in usecols]) + + if delimiter is not None: + options.set_delimiter(delimiter) + + if thousands is not None: + options.set_thousands(thousands) + + if comment is not None: + options.set_comment(comment) + + if parse_dates is not None: + options.set_parse_dates(list(parse_dates)) + + if hex_cols is not None: + options.set_parse_hex(list(hex_cols)) + + options.set_dtypes(new_dtypes) + + if true_values is not None: + options.set_true_values([str(val) for val in true_values]) + + if false_values is not None: + options.set_false_values([str(val) for val in false_values]) + + if na_values is not None: + options.set_na_values([str(val) for val in na_values]) + + df = cudf.DataFrame._from_data( + *data_from_pylibcudf_io(plc.io.csv.read_csv(options)) ) + if isinstance(dtype, abc.Mapping): + for k, v in dtype.items(): + if isinstance(cudf.dtype(v), cudf.CategoricalDtype): + df._data[str(k)] = df._data[str(k)].astype(v) + elif dtype == "category" or isinstance(dtype, cudf.CategoricalDtype): + df = df.astype(dtype) + elif isinstance(dtype, abc.Collection) and not is_scalar(dtype): + for index, col_dtype in enumerate(dtype): + if isinstance(cudf.dtype(col_dtype), cudf.CategoricalDtype): + col_name = df._column_names[index] + df._data[col_name] = df._data[col_name].astype(col_dtype) + + if names is not None and len(names) and isinstance(names[0], int): + df.columns = [int(x) for x in df._data] + elif ( + names is None + and header == -1 + and cudf.get_option("mode.pandas_compatible") + ): + df.columns = [int(x) for x in df._column_names] + + # Set index if the index_col parameter is passed + if index_col is not None and index_col is not False: + if isinstance(index_col, int): + index_col_name = df._data.get_labels_by_index(index_col)[0] + df = df.set_index(index_col_name) + if ( + isinstance(index_col_name, str) + and names is None + and orig_header == "infer" + ): + if index_col_name.startswith("Unnamed:"): + # TODO: Try to upstream it to libcudf + # csv reader in future + df.index.name = None + elif names is None: + df.index.name = index_col + else: + df = df.set_index(index_col) + if dtype is None or isinstance(dtype, abc.Mapping): # There exists some dtypes in the result columns that is inferred. # Find them and map them to the default dtypes. 
specified_dtypes = {} if dtype is None else dtype - unspecified_dtypes = { - name: dtype - for name, dtype in df._dtypes - if name not in specified_dtypes - } default_dtypes = {} - - for name, dt in unspecified_dtypes.items(): - if dt == np.dtype("i1"): + for name, dt in df._dtypes: + if name in specified_dtypes: + continue + elif dt == np.dtype("i1"): # csv reader reads all null column as int8. # The dtype should remain int8. default_dtypes[name] = dt else: default_dtypes[name] = _maybe_convert_to_default_type(dt) - df = df.astype(default_dtypes) + + if default_dtypes: + df = df.astype(default_dtypes) return df @@ -138,17 +319,17 @@ def read_csv( @_performance_tracking @ioutils.doc_to_csv() def to_csv( - df, + df: cudf.DataFrame, path_or_buf=None, - sep=",", - na_rep="", + sep: str = ",", + na_rep: str = "", columns=None, - header=True, - index=True, + header: bool = True, + index: bool = True, encoding=None, compression=None, - lineterminator="\n", - chunksize=None, + lineterminator: str = "\n", + chunksize: int | None = None, storage_options=None, ): """{docstring}""" @@ -187,15 +368,10 @@ def to_csv( ) for _, dtype in df._dtypes: - if isinstance(dtype, cudf.ListDtype): - raise NotImplementedError( - "Writing to csv format is not yet supported with " - "list columns." - ) - elif isinstance(dtype, cudf.StructDtype): + if isinstance(dtype, (cudf.ListDtype, cudf.StructDtype)): raise NotImplementedError( "Writing to csv format is not yet supported with " - "Struct columns." + f"{dtype} columns." ) # TODO: Need to typecast categorical columns to the underlying @@ -208,7 +384,7 @@ def to_csv( df = df.copy(deep=False) for col_name, col in df._column_labels_and_values: if isinstance(col.dtype, cudf.CategoricalDtype): - df._data[col_name] = col.astype(col.categories.dtype) + df._data[col_name] = col.astype(col.dtype.categories.dtype) if isinstance(df.index, cudf.CategoricalIndex): df.index = df.index.astype(df.index.categories.dtype) @@ -218,7 +394,7 @@ def to_csv( if ioutils.is_fsspec_open_file(path_or_buf): with path_or_buf as file_obj: file_obj = ioutils.get_IOBase_writer(file_obj) - libcudf.csv.write_csv( + _plc_write_csv( df, path_or_buf=file_obj, sep=sep, @@ -229,7 +405,7 @@ def to_csv( index=index, ) else: - libcudf.csv.write_csv( + _plc_write_csv( df, path_or_buf=path_or_buf, sep=sep, @@ -243,3 +419,127 @@ def to_csv( if return_as_string: path_or_buf.seek(0) return path_or_buf.read() + + +@acquire_spill_lock() +def _plc_write_csv( + table: cudf.DataFrame, + path_or_buf=None, + sep: str = ",", + na_rep: str = "", + header: bool = True, + lineterminator: str = "\n", + rows_per_chunk: int = 8, + index: bool = True, +) -> None: + iter_columns = ( + itertools.chain(table.index._columns, table._columns) + if index + else table._columns + ) + columns = [col.to_pylibcudf(mode="read") for col in iter_columns] + col_names = [] + if header: + table_names = ( + na_rep if name is None or pd.isnull(name) else name + for name in table._column_names + ) + iter_names = ( + itertools.chain(table.index.names, table_names) + if index + else table_names + ) + all_names = list(iter_names) + col_names = [ + '""' + if (name in (None, "") and len(all_names) == 1) + else (str(name) if name not in (None, "") else "") + for name in all_names + ] + try: + plc.io.csv.write_csv( + ( + plc.io.csv.CsvWriterOptions.builder( + plc.io.SinkInfo([path_or_buf]), plc.Table(columns) + ) + .names(col_names) + .na_rep(na_rep) + .include_header(header) + .rows_per_chunk(rows_per_chunk) + .line_terminator(str(lineterminator)) + 
.inter_column_delimiter(str(sep))
+            .true_value("True")
+            .false_value("False")
+            .build()
+            )
+        )
+    except OverflowError as err:
+        raise OverflowError(
+            f"Writing CSV file with chunksize={rows_per_chunk} failed. "
+            "Consider providing a smaller chunksize argument."
+        ) from err
+
+
+def _validate_args(
+    delimiter: str | None,
+    sep: str,
+    delim_whitespace: bool,
+    decimal: str,
+    thousands: str | None,
+    nrows: int | None,
+    skipfooter: int,
+    byte_range: list[int] | tuple[int, int] | None,
+    skiprows: int,
+) -> None:
+    if delim_whitespace:
+        if delimiter is not None:
+            raise ValueError("cannot set both delimiter and delim_whitespace")
+        if sep != ",":
+            raise ValueError("cannot set both sep and delim_whitespace")
+
+    # Alias sep -> delimiter.
+    actual_delimiter = delimiter if delimiter else sep
+
+    if decimal == actual_delimiter:
+        raise ValueError("decimal cannot be the same as delimiter")
+
+    if thousands == actual_delimiter:
+        raise ValueError("thousands cannot be the same as delimiter")
+
+    if nrows is not None and skipfooter != 0:
+        raise ValueError("cannot use both nrows and skipfooter parameters")
+
+    if byte_range is not None:
+        if skipfooter != 0 or skiprows != 0 or nrows is not None:
+            raise ValueError(
+                "cannot manually limit rows to be read when using the byte range parameter"
+            )
+
+
+def _get_plc_data_type_from_dtype(dtype) -> plc.DataType:
+    # TODO: Remove this work-around Dictionary types
+    # in libcudf are fully mapped to categorical columns:
+    # https://github.com/rapidsai/cudf/issues/3960
+    if isinstance(dtype, cudf.CategoricalDtype):
+        dtype = dtype.categories.dtype
+    elif dtype == "category":
+        dtype = "str"
+
+    if isinstance(dtype, str):
+        if dtype == "date32":
+            return plc.DataType(plc.types.TypeId.TIMESTAMP_DAYS)
+        elif dtype in ("date", "date64"):
+            return plc.DataType(plc.types.TypeId.TIMESTAMP_MILLISECONDS)
+        elif dtype == "timestamp":
+            return plc.DataType(plc.types.TypeId.TIMESTAMP_MILLISECONDS)
+        elif dtype == "timestamp[us]":
+            return plc.DataType(plc.types.TypeId.TIMESTAMP_MICROSECONDS)
+        elif dtype == "timestamp[s]":
+            return plc.DataType(plc.types.TypeId.TIMESTAMP_SECONDS)
+        elif dtype == "timestamp[ms]":
+            return plc.DataType(plc.types.TypeId.TIMESTAMP_MILLISECONDS)
+        elif dtype == "timestamp[ns]":
+            return plc.DataType(plc.types.TypeId.TIMESTAMP_NANOSECONDS)
+
+    dtype = cudf.dtype(dtype)
+    return dtype_to_pylibcudf_type(dtype)

From 5baaf6d7f868dc42f8e0213e164dca340a7bfcff Mon Sep 17 00:00:00 2001
From: Matthew Murray <41342305+Matt711@users.noreply.github.com>
Date: Thu, 12 Dec 2024 21:55:49 -0500
Subject: [PATCH 70/78] Propagate failures in pandas integration tests and Skip failing tests (#17521)

This PR ensures that the integration tests fail if any one of the test
modules fails. It also skips or xfails any tests that are not currently
passing. Finally, it fixes one incorrect use of `rng.random`.
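As context for the `rng.random` fix: `numpy.random.Generator.random` takes its
output shape as a single tuple argument, and a second positional argument is
parsed as the dtype. A minimal sketch (editorial illustration only, not part of
the patch; the variable name mirrors the stumpy test changed below):

    import numpy as np

    rng = np.random.default_rng(seed=42)
    # rng.random(3, 1000) raises TypeError: the 1000 is parsed as a dtype,
    # not as a second dimension.
    your_time_series = rng.random((3, 1000))  # one array of shape (3, 1000)
    assert your_time_series.shape == (3, 1000)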
Some of the changes were originally made in #17489

Authors:
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/17521
---
 .../third-party-integration/test.sh           |  16 ++-
 .../dependencies.yaml                         |  15 --
 .../tests/test_catboost.py                    | 129 ------------------
 .../tests/test_holoviews.py                   |   3 +
 .../tests/test_matplotlib.py                  |   6 +
 .../tests/test_numpy.py                       |   3 +
 .../tests/test_pytorch.py                     |   3 +
 .../tests/test_seaborn.py                     |   3 +
 .../tests/test_stumpy_distributed.py          |   2 +-
 .../tests/test_tensorflow.py                  |   1 +
 .../tests/test_xgboost.py                     |   3 +
 11 files changed, 34 insertions(+), 150 deletions(-)
 delete mode 100644 python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_catboost.py

diff --git a/ci/cudf_pandas_scripts/third-party-integration/test.sh b/ci/cudf_pandas_scripts/third-party-integration/test.sh
index f8ddbaba0f3..30e3ffc9a43 100755
--- a/ci/cudf_pandas_scripts/third-party-integration/test.sh
+++ b/ci/cudf_pandas_scripts/third-party-integration/test.sh
@@ -26,6 +26,8 @@ main() {
     LIBS=${LIBS#[}
     LIBS=${LIBS%]}
 
+    ANY_FAILURES=0
+
     for lib in ${LIBS//,/ }; do
         lib=$(echo "$lib" | tr -d '""')
         echo "Running tests for library $lib"
@@ -56,10 +58,6 @@ main() {
         rapids-logger "Check GPU usage"
         nvidia-smi
 
-        EXITCODE=0
-        trap "EXITCODE=1" ERR
-        set +e
-
         rapids-logger "pytest ${lib}"
 
         NUM_PROCESSES=8
@@ -72,12 +70,20 @@ main() {
             fi
         done
 
+        EXITCODE=0
+        trap "EXITCODE=1" ERR
+        set +e
+
         TEST_DIR=${TEST_DIR} NUM_PROCESSES=${NUM_PROCESSES} ci/cudf_pandas_scripts/third-party-integration/run-library-tests.sh ${lib}
+        set -e
 
         rapids-logger "Test script exiting with value: ${EXITCODE}"
+        if [[ ${EXITCODE} != 0 ]]; then
+            ANY_FAILURES=1
+        fi
     done
 
-    exit ${EXITCODE}
+    exit ${ANY_FAILURES}
 }
 
 main "$@"
diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml b/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml
index e726b7fdca1..3891110e9d3 100644
--- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml
+++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml
@@ -76,13 +76,6 @@ files:
       - py_version
       - test_base
       - test_xgboost
-  test_catboost:
-    output: none
-    includes:
-      - cuda_version
-      - py_version
-      - test_base
-      - test_catboost
   test_cuml:
     output: none
     includes:
@@ -251,14 +244,6 @@ dependencies:
         - pip
         - pip:
             - xgboost>=2.0.1
-  test_catboost:
-    common:
-      - output_types: conda
-        packages:
-          - numpy
-          - scipy
-          - scikit-learn
-          - catboost
   test_cuml:
     common:
       - output_types: conda
diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_catboost.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_catboost.py
deleted file mode 100644
index 04cc69231fe..00000000000
--- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_catboost.py
+++ /dev/null
@@ -1,129 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
- -import numpy as np -import pandas as pd -import pytest -from catboost import CatBoostClassifier, CatBoostRegressor, Pool -from sklearn.datasets import make_classification, make_regression - -rng = np.random.default_rng(seed=42) - - -def assert_catboost_equal(expect, got, rtol=1e-7, atol=0.0): - if isinstance(expect, (tuple, list)): - assert len(expect) == len(got) - for e, g in zip(expect, got): - assert_catboost_equal(e, g, rtol, atol) - elif isinstance(expect, np.ndarray): - np.testing.assert_allclose(expect, got, rtol=rtol, atol=atol) - elif isinstance(expect, pd.DataFrame): - pd.testing.assert_frame_equal(expect, got) - elif isinstance(expect, pd.Series): - pd.testing.assert_series_equal(expect, got) - else: - assert expect == got - - -pytestmark = pytest.mark.assert_eq(fn=assert_catboost_equal) - - -@pytest.fixture -def regression_data(): - X, y = make_regression(n_samples=100, n_features=10, random_state=42) - return pd.DataFrame(X), pd.Series(y) - - -@pytest.fixture -def classification_data(): - X, y = make_classification( - n_samples=100, n_features=10, n_classes=2, random_state=42 - ) - return pd.DataFrame(X), pd.Series(y) - - -def test_catboost_regressor_with_dataframe(regression_data): - X, y = regression_data - model = CatBoostRegressor(iterations=10, verbose=0) - model.fit(X, y) - predictions = model.predict(X) - return predictions - - -def test_catboost_regressor_with_numpy(regression_data): - X, y = regression_data - model = CatBoostRegressor(iterations=10, verbose=0) - model.fit(X.values, y.values) - predictions = model.predict(X.values) - return predictions - - -def test_catboost_classifier_with_dataframe(classification_data): - X, y = classification_data - model = CatBoostClassifier(iterations=10, verbose=0) - model.fit(X, y) - predictions = model.predict(X) - return predictions - - -def test_catboost_classifier_with_numpy(classification_data): - X, y = classification_data - model = CatBoostClassifier(iterations=10, verbose=0) - model.fit(X.values, y.values) - predictions = model.predict(X.values) - return predictions - - -def test_catboost_with_pool_and_dataframe(regression_data): - X, y = regression_data - train_pool = Pool(X, y) - model = CatBoostRegressor(iterations=10, verbose=0) - model.fit(train_pool) - predictions = model.predict(X) - return predictions - - -def test_catboost_with_pool_and_numpy(regression_data): - X, y = regression_data - train_pool = Pool(X.values, y.values) - model = CatBoostRegressor(iterations=10, verbose=0) - model.fit(train_pool) - predictions = model.predict(X.values) - return predictions - - -def test_catboost_with_categorical_features(): - data = { - "numerical_feature": rng.standard_normal(100), - "categorical_feature": rng.choice(["A", "B", "C"], size=100), - "target": rng.integers(0, 2, size=100), - } - df = pd.DataFrame(data) - X = df[["numerical_feature", "categorical_feature"]] - y = df["target"] - cat_features = ["categorical_feature"] - model = CatBoostClassifier( - iterations=10, verbose=0, cat_features=cat_features - ) - model.fit(X, y) - predictions = model.predict(X) - return predictions - - -@pytest.mark.parametrize( - "X, y", - [ - ( - pd.DataFrame(rng.standard_normal((100, 5))), - pd.Series(rng.standard_normal(100)), - ), - (rng.standard_normal((100, 5)), rng.standard_normal(100)), - ], -) -def test_catboost_train_test_split(X, y): - from sklearn.model_selection import train_test_split - - X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) - model = CatBoostRegressor(iterations=10, verbose=0) - 
model.fit(X_train, y_train) - predictions = model.predict(X_test) - return len(X_train), len(X_test), len(y_train), len(y_test), predictions diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_holoviews.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_holoviews.py index bef02c86355..8be48953974 100644 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_holoviews.py +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_holoviews.py @@ -71,6 +71,9 @@ def test_holoviews_heatmap(df): ) +@pytest.mark.skip( + reason="AttributeError: 'ndarray' object has no attribute '_fsproxy_wrapped'" +) def test_holoviews_histogram(df): return get_plot_info(hv.Histogram(df.values)) diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_matplotlib.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_matplotlib.py index 1909392b9f7..c91808021e8 100644 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_matplotlib.py +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_matplotlib.py @@ -33,6 +33,9 @@ def assert_plots_equal(expect, got): pytestmark = pytest.mark.assert_eq(fn=assert_plots_equal) +@pytest.mark.skip( + reason="AttributeError: 'ndarray' object has no attribute '_fsproxy_wrapped'" +) def test_line(): df = pd.DataFrame({"x": [1, 2, 3, 4, 5], "y": [2, 4, 6, 8, 10]}) (data,) = plt.plot(df["x"], df["y"], marker="o", linestyle="-") @@ -40,6 +43,9 @@ def test_line(): return plt.gca() +@pytest.mark.skip( + reason="AttributeError: 'ndarray' object has no attribute '_fsproxy_wrapped'" +) def test_bar(): data = pd.Series([1, 2, 3, 4, 5], index=["a", "b", "c", "d", "e"]) ax = data.plot(kind="bar") diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_numpy.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_numpy.py index 472f1889354..4d35d9e8946 100644 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_numpy.py +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_numpy.py @@ -37,6 +37,9 @@ def test_numpy_dot(df): return np.dot(df, df.T) +@pytest.mark.skip( + reason="AttributeError: 'ndarray' object has no attribute '_fsproxy_wrapped'" +) def test_numpy_fft(sr): fft = np.fft.fft(sr) return fft diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_pytorch.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_pytorch.py index ad287471aa0..7cea635afc4 100644 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_pytorch.py +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_pytorch.py @@ -116,6 +116,9 @@ def test_torch_train(data): return model(test_x1, test_x2) +@pytest.mark.skip( + reason="AssertionError: The values for attribute 'device' do not match: cpu != cuda:0." 
+) def test_torch_tensor_ctor(): s = pd.Series(range(5)) return torch.tensor(s.values) diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_seaborn.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_seaborn.py index 021c5bac9b7..f6a8a96ae3c 100644 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_seaborn.py +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_seaborn.py @@ -54,6 +54,9 @@ def test_scatter(df): return ax +@pytest.mark.skip( + reason="AttributeError: 'ndarray' object has no attribute '_fsproxy_wrapped'" +) def test_lineplot_with_sns_data(): df = sns.load_dataset("flights") ax = sns.lineplot(data=df, x="month", y="passengers") diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_stumpy_distributed.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_stumpy_distributed.py index 0777d982ac2..f275659288e 100644 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_stumpy_distributed.py +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_stumpy_distributed.py @@ -41,7 +41,7 @@ def test_multidimensional_distributed_timeseries(dask_client): rng = np.random.default_rng(seed=42) # Each row represents data from a different dimension while each column represents # data from the same dimension - your_time_series = rng.random(3, 1000) + your_time_series = rng.random((3, 1000)) # Approximately, how many data points might be found in a pattern window_size = 50 diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_tensorflow.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_tensorflow.py index ba1f518cbfd..b4fad3024e7 100644 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_tensorflow.py +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_tensorflow.py @@ -271,6 +271,7 @@ def call(self, values): return tf.concat(values, axis=-1) +@pytest.mark.xfail(reason="ValueError: Invalid dtype: object") def test_full_example_train_with_df(df, target): # https://www.tensorflow.org/tutorials/load_data/pandas_dataframe#full_example # Inputs are directly passed as dictionary of series diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_xgboost.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_xgboost.py index 70f1e6a4250..0fd632507a6 100644 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_xgboost.py +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_xgboost.py @@ -113,6 +113,9 @@ def test_with_external_memory( return predt +@pytest.mark.skip( + reason="TypeError: Implicit conversion to a NumPy array is not allowed. Please use `.get()` to construct a NumPy array explicitly." 
+) @pytest.mark.parametrize("device", ["cpu", "cuda"]) def test_predict(device: str) -> np.ndarray: reg = xgb.XGBRegressor(n_estimators=2, device=device) From 48aa08f6dca0d60da421adb4b1735f075881541d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 12 Dec 2024 20:46:26 -0800 Subject: [PATCH 71/78] Remove cudf._lib.reduce in favor of inlining pylibcudf (#17574) Contributes to https://github.com/rapidsai/cudf/issues/17317 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/17574 --- python/cudf/cudf/_lib/CMakeLists.txt | 2 +- python/cudf/cudf/_lib/__init__.py | 1 - python/cudf/cudf/_lib/copying.pyx | 4 +- python/cudf/cudf/_lib/reduce.pyx | 135 ------------------ python/cudf/cudf/core/column/column.py | 122 +++++++++++++--- python/cudf/cudf/core/column/interval.py | 14 -- python/cudf/cudf/core/column/numerical.py | 27 +--- .../cudf/cudf/core/column/numerical_base.py | 6 +- python/cudf/cudf/core/column/struct.py | 7 +- python/cudf/cudf/core/copy_types.py | 5 +- python/cudf/cudf/core/dataframe.py | 11 +- python/cudf/cudf/core/multiindex.py | 6 +- python/cudf/cudf/core/window/ewm.py | 10 +- 13 files changed, 120 insertions(+), 230 deletions(-) delete mode 100644 python/cudf/cudf/_lib/reduce.pyx diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index b402db0443d..8cec8af3c67 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -12,7 +12,7 @@ # the License. # ============================================================================= -set(cython_sources column.pyx copying.pyx groupby.pyx interop.pyx reduce.pyx scalar.pyx sort.pyx +set(cython_sources column.pyx copying.pyx groupby.pyx interop.pyx scalar.pyx sort.pyx stream_compaction.pyx string_casting.pyx strings_udf.pyx types.pyx utils.pyx ) set(linked_libraries cudf::cudf) diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py index 0299b264189..001e5cbb676 100644 --- a/python/cudf/cudf/_lib/__init__.py +++ b/python/cudf/cudf/_lib/__init__.py @@ -5,7 +5,6 @@ copying, groupby, interop, - reduce, sort, stream_compaction, string_casting, diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx index a7ea9c25a86..ef544dc89eb 100644 --- a/python/cudf/cudf/_lib/copying.pyx +++ b/python/cudf/cudf/_lib/copying.pyx @@ -12,8 +12,6 @@ from cudf._lib.scalar import as_device_scalar from cudf._lib.scalar cimport DeviceScalar -from cudf._lib.reduce import minmax - from pylibcudf.libcudf.types cimport size_type from cudf._lib.utils cimport columns_from_pylibcudf_table, data_from_pylibcudf_table @@ -34,7 +32,7 @@ def _gather_map_is_valid( """ if not check_bounds or nullify or len(gather_map) == 0: return True - gm_min, gm_max = minmax(gather_map) + gm_min, gm_max = gather_map.minmax() return gm_min >= -nrows and gm_max < nrows diff --git a/python/cudf/cudf/_lib/reduce.pyx b/python/cudf/cudf/_lib/reduce.pyx deleted file mode 100644 index 2850cab93a1..00000000000 --- a/python/cudf/cudf/_lib/reduce.pyx +++ /dev/null @@ -1,135 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
-import warnings - -import cudf -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar -from cudf._lib.types cimport dtype_to_pylibcudf_type, is_decimal_type_id - -import pylibcudf - -from cudf.core._internals.aggregation import make_aggregation - - -@acquire_spill_lock() -def reduce(reduction_op, Column incol, dtype=None, **kwargs): - """ - Top level Cython reduce function wrapping libcudf reductions. - - Parameters - ---------- - reduction_op : string - A string specifying the operation, e.g. sum, prod - incol : Column - A cuDF Column object - dtype: numpy.dtype, optional - A numpy data type to use for the output, defaults - to the same type as the input column - """ - if dtype is not None: - warnings.warn( - "dtype is deprecated and will be remove in a future release. " - "Cast the result (e.g. .astype) after the operation instead.", - FutureWarning - ) - col_dtype = dtype - else: - col_dtype = incol._reduction_result_dtype(reduction_op) - - # check empty case - if len(incol) <= incol.null_count: - if reduction_op == 'sum' or reduction_op == 'sum_of_squares': - return incol.dtype.type(0) - if reduction_op == 'product': - return incol.dtype.type(1) - if reduction_op == "any": - return False - - return cudf.utils.dtypes._get_nan_for_dtype(col_dtype) - - result = pylibcudf.reduce.reduce( - incol.to_pylibcudf(mode="read"), - make_aggregation(reduction_op, kwargs).c_obj, - dtype_to_pylibcudf_type(col_dtype), - ) - - if is_decimal_type_id(result.type().id()): - scale = -result.type().scale() - precision = _reduce_precision(col_dtype, reduction_op, len(incol)) - return DeviceScalar.from_pylibcudf( - result, - dtype=col_dtype.__class__(precision, scale), - ).value - scalar = DeviceScalar.from_pylibcudf(result).value - if isinstance(col_dtype, cudf.StructDtype): - # TODO: Utilize column_metadata in libcudf to maintain field labels - return dict(zip(col_dtype.fields.keys(), scalar.values())) - return scalar - - -@acquire_spill_lock() -def scan(scan_op, Column incol, inclusive, **kwargs): - """ - Top level Cython scan function wrapping libcudf scans. - - Parameters - ---------- - incol : Column - A cuDF Column object - scan_op : string - A string specifying the operation, e.g. cumprod - inclusive: bool - Flag for including nulls in relevant scan - """ - return Column.from_pylibcudf( - pylibcudf.reduce.scan( - incol.to_pylibcudf(mode="read"), - make_aggregation(scan_op, kwargs).c_obj, - pylibcudf.reduce.ScanType.INCLUSIVE if inclusive - else pylibcudf.reduce.ScanType.EXCLUSIVE, - ) - ) - - -@acquire_spill_lock() -def minmax(Column incol): - """ - Top level Cython minmax function wrapping libcudf minmax. - - Parameters - ---------- - incol : Column - A cuDF Column object - - Returns - ------- - A pair of ``(min, max)`` values of ``incol`` - """ - min, max = pylibcudf.reduce.minmax(incol.to_pylibcudf(mode="read")) - return ( - cudf.Scalar.from_device_scalar(DeviceScalar.from_pylibcudf(min)), - cudf.Scalar.from_device_scalar(DeviceScalar.from_pylibcudf(max)), - ) - - -def _reduce_precision(dtype, op, nrows): - """ - Returns the result precision when performing the reduce - operation `op` for the given dtype and column size. 
- - See: https://docs.microsoft.com/en-us/sql/t-sql/data-types/precision-scale-and-length-transact-sql - """ # noqa: E501 - p = dtype.precision - if op in ("min", "max"): - new_p = p - elif op == "sum": - new_p = p + nrows - 1 - elif op == "product": - new_p = p * nrows + nrows - 1 - elif op == "sum_of_squares": - new_p = 2 * p + nrows - else: - raise NotImplementedError() - return max(min(new_p, dtype.MAX_PRECISION), 0) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 68307f0e109..42b4fda8be2 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2,6 +2,7 @@ from __future__ import annotations +import warnings from collections import abc from collections.abc import MutableSequence, Sequence from functools import cached_property @@ -31,7 +32,7 @@ drop_duplicates, drop_nulls, ) -from cudf._lib.types import size_type_dtype +from cudf._lib.types import dtype_to_pylibcudf_type, size_type_dtype from cudf.api.types import ( _is_non_decimal_numeric_dtype, _is_pandas_nullable_extension_dtype, @@ -41,7 +42,7 @@ is_string_dtype, ) from cudf.core._compat import PANDAS_GE_210 -from cudf.core._internals import unary +from cudf.core._internals import aggregation, unary from cudf.core._internals.timezones import get_compatible_timezone from cudf.core.abc import Serializable from cudf.core.buffer import ( @@ -259,21 +260,17 @@ def all(self, skipna: bool = True) -> bool: # The skipna argument is only used for numerical columns. # If all entries are null the result is True, including when the column # is empty. - if self.null_count == self.size: return True - - return libcudf.reduce.reduce("all", self) + return self.reduce("all") def any(self, skipna: bool = True) -> bool: # Early exit for fast cases. - if not skipna and self.has_nulls(): return True elif skipna and self.null_count == self.size: return False - - return libcudf.reduce.reduce("any", self) + return self.reduce("any") def dropna(self) -> Self: if self.has_nulls(): @@ -1393,33 +1390,35 @@ def _reduce( ) if isinstance(preprocessed, ColumnBase): dtype = kwargs.pop("dtype", None) - return libcudf.reduce.reduce( - op, preprocessed, dtype=dtype, **kwargs - ) + return preprocessed.reduce(op, dtype, **kwargs) return preprocessed + def _can_return_nan(self, skipna: bool | None = None) -> bool: + return not skipna and self.has_nulls(include_nan=False) + def _process_for_reduction( self, skipna: bool | None = None, min_count: int = 0 ) -> ColumnBase | ScalarLike: - if skipna is None: - skipna = True + skipna = True if skipna is None else skipna - if self.has_nulls(): + if self._can_return_nan(skipna=skipna): + return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) + + col = self.nans_to_nulls() if skipna else self + if col.has_nulls(): if skipna: - result_col = self.dropna() + col = col.dropna() else: return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) - result_col = self - # TODO: If and when pandas decides to validate that `min_count` >= 0 we # should insert comparable behavior. 
# https://github.com/pandas-dev/pandas/issues/50022 if min_count > 0: - valid_count = len(result_col) - result_col.null_count + valid_count = len(col) - col.null_count if valid_count < min_count: return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) - return result_col + return col def _reduction_result_dtype(self, reduction_op: str) -> Dtype: """ @@ -1529,6 +1528,91 @@ def one_hot_encode( for col in plc_table.columns() ) + @acquire_spill_lock() + def scan(self, scan_op: str, inclusive: bool, **kwargs) -> Self: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.reduce.scan( + self.to_pylibcudf(mode="read"), + aggregation.make_aggregation(scan_op, kwargs).c_obj, + plc.reduce.ScanType.INCLUSIVE + if inclusive + else plc.reduce.ScanType.EXCLUSIVE, + ) + ) + + def reduce(self, reduction_op: str, dtype=None, **kwargs) -> ScalarLike: + if dtype is not None: + warnings.warn( + "dtype is deprecated and will be remove in a future release. " + "Cast the result (e.g. .astype) after the operation instead.", + FutureWarning, + ) + col_dtype = dtype + else: + col_dtype = self._reduction_result_dtype(reduction_op) + + # check empty case + if len(self) <= self.null_count: + if reduction_op == "sum" or reduction_op == "sum_of_squares": + return self.dtype.type(0) + if reduction_op == "product": + return self.dtype.type(1) + if reduction_op == "any": + return False + + return cudf.utils.dtypes._get_nan_for_dtype(col_dtype) + + with acquire_spill_lock(): + plc_scalar = plc.reduce.reduce( + self.to_pylibcudf(mode="read"), + aggregation.make_aggregation(reduction_op, kwargs).c_obj, + dtype_to_pylibcudf_type(col_dtype), + ) + result_col = type(self).from_pylibcudf( + plc.Column.from_scalar(plc_scalar, 1) + ) + if plc_scalar.type().id() in { + plc.TypeId.DECIMAL128, + plc.TypeId.DECIMAL64, + plc.TypeId.DECIMAL32, + }: + scale = -plc_scalar.type().scale() + # https://docs.microsoft.com/en-us/sql/t-sql/data-types/precision-scale-and-length-transact-sql + p = col_dtype.precision + nrows = len(self) + if reduction_op in {"min", "max"}: + new_p = p + elif reduction_op == "sum": + new_p = p + nrows - 1 + elif reduction_op == "product": + new_p = p * nrows + nrows - 1 + elif reduction_op == "sum_of_squares": + new_p = 2 * p + nrows + else: + raise NotImplementedError( + f"{reduction_op} not implemented for decimal types." 
+ ) + precision = max(min(new_p, col_dtype.MAX_PRECISION), 0) + new_dtype = type(col_dtype)(precision, scale) + result_col = result_col.astype(new_dtype) + elif isinstance(col_dtype, cudf.IntervalDtype): + result_col = type(self).from_struct_column( # type: ignore[attr-defined] + result_col, closed=col_dtype.closed + ) + return result_col.element_indexing(0) + + @acquire_spill_lock() + def minmax(self) -> tuple[ScalarLike, ScalarLike]: + min_val, max_val = plc.reduce.minmax(self.to_pylibcudf(mode="read")) + return ( + type(self) + .from_pylibcudf(plc.Column.from_scalar(min_val, 1)) + .element_indexing(0), + type(self) + .from_pylibcudf(plc.Column.from_scalar(max_val, 1)) + .element_indexing(0), + ) + def _has_any_nan(arbitrary: pd.Series | np.ndarray) -> bool: """Check if an object dtype Series or array contains NaN.""" diff --git a/python/cudf/cudf/core/column/interval.py b/python/cudf/cudf/core/column/interval.py index 34975fc94f4..dd8f58a118e 100644 --- a/python/cudf/cudf/core/column/interval.py +++ b/python/cudf/cudf/core/column/interval.py @@ -14,7 +14,6 @@ if TYPE_CHECKING: from typing_extensions import Self - from cudf._typing import ScalarLike from cudf.core.buffer import Buffer from cudf.core.column import ColumnBase @@ -211,16 +210,3 @@ def element_indexing(self, index: int): if cudf.get_option("mode.pandas_compatible"): return pd.Interval(**result, closed=self.dtype.closed) return result - - def _reduce( - self, - op: str, - skipna: bool | None = None, - min_count: int = 0, - *args, - **kwargs, - ) -> ScalarLike: - result = super()._reduce(op, skipna, min_count, *args, **kwargs) - if cudf.get_option("mode.pandas_compatible"): - return pd.Interval(**result, closed=self.dtype.closed) - return result diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 790cd6ea9bb..28a2bd7fa6c 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -420,22 +420,12 @@ def all(self, skipna: bool = True) -> bool: # If all entries are null the result is True, including when the column # is empty. result_col = self.nans_to_nulls() if skipna else self - - if result_col.null_count == result_col.size: - return True - - return libcudf.reduce.reduce("all", result_col) + return super(type(self), result_col).all(skipna=skipna) def any(self, skipna: bool = True) -> bool: # Early exit for fast cases. 
result_col = self.nans_to_nulls() if skipna else self - - if not skipna and result_col.has_nulls(): - return True - elif skipna and result_col.null_count == result_col.size: - return False - - return libcudf.reduce.reduce("any", result_col) + return super(type(self), result_col).any(skipna=skipna) @functools.cached_property def nan_count(self) -> int: @@ -483,19 +473,6 @@ def _process_values_for_isin( def _can_return_nan(self, skipna: bool | None = None) -> bool: return not skipna and self.has_nulls(include_nan=True) - def _process_for_reduction( - self, skipna: bool | None = None, min_count: int = 0 - ) -> NumericalColumn | ScalarLike: - skipna = True if skipna is None else skipna - - if self._can_return_nan(skipna=skipna): - return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) - - col = self.nans_to_nulls() if skipna else self - return super(NumericalColumn, col)._process_for_reduction( - skipna=skipna, min_count=min_count - ) - def find_and_replace( self, to_replace: ColumnLike, diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py index 3f9abdabc2f..e06a0447f5c 100644 --- a/python/cudf/cudf/core/column/numerical_base.py +++ b/python/cudf/cudf/core/column/numerical_base.py @@ -263,6 +263,6 @@ def round( ) def _scan(self, op: str) -> ColumnBase: - return libcudf.reduce.scan( - op.replace("cum", ""), self, True - )._with_type_metadata(self.dtype) + return self.scan(op.replace("cum", ""), True)._with_type_metadata( + self.dtype + ) diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py index db6ad72ab56..ba765b50729 100644 --- a/python/cudf/cudf/core/column/struct.py +++ b/python/cudf/cudf/core/column/struct.py @@ -107,12 +107,9 @@ def memory_usage(self) -> int: return n - def element_indexing(self, index: int): + def element_indexing(self, index: int) -> dict: result = super().element_indexing(index) - return { - field: value - for field, value in zip(self.dtype.fields, result.values()) - } + return dict(zip(self.dtype.fields, result.values())) def __setitem__(self, key, value): if isinstance(value, dict): diff --git a/python/cudf/cudf/core/copy_types.py b/python/cudf/cudf/core/copy_types.py index 16d8964f083..4b6ad59c8e1 100644 --- a/python/cudf/cudf/core/copy_types.py +++ b/python/cudf/cudf/core/copy_types.py @@ -5,7 +5,6 @@ from typing_extensions import Self import cudf -import cudf._lib as libcudf from cudf._lib.types import size_type_dtype if TYPE_CHECKING: @@ -70,8 +69,8 @@ def __init__(self, column: Any, nrows: int, *, nullify: bool): if self.column.dtype.kind not in {"i", "u"}: raise TypeError("Gather map must have integer dtype") if not nullify: - lo, hi = libcudf.reduce.minmax(self.column) - if lo.value < -nrows or hi.value >= nrows: + lo, hi = self.column.minmax() + if lo < -nrows or hi >= nrows: raise IndexError( f"Gather map is out of bounds for [0, {nrows})" ) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index b74128a8a61..8cdc45e12da 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -2505,16 +2505,7 @@ def scatter_by_map( ) if map_index.size > 0: - plc_lo, plc_hi = plc.reduce.minmax( - map_index.to_pylibcudf(mode="read") - ) - # TODO: Use pylibcudf Scalar once APIs are more developed - lo = libcudf.column.Column.from_pylibcudf( - plc.Column.from_scalar(plc_lo, 1) - ).element_indexing(0) - hi = libcudf.column.Column.from_pylibcudf( - plc.Column.from_scalar(plc_hi, 1) - ).element_indexing(0) + lo, hi 
= map_index.minmax()
             if lo < 0 or hi >= map_size:
                 raise ValueError("Partition map has invalid values")

diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
index 5a41a33e583..f5ee36f851c 100644
--- a/python/cudf/cudf/core/multiindex.py
+++ b/python/cudf/cudf/core/multiindex.py
@@ -191,12 +191,12 @@ def __init__(
         source_data = {}
         for i, (code, level) in enumerate(zip(new_codes, new_levels)):
             if len(code):
-                lo, hi = libcudf.reduce.minmax(code)
-                if lo.value < -1 or hi.value > len(level) - 1:
+                lo, hi = code.minmax()
+                if lo < -1 or hi > len(level) - 1:
                     raise ValueError(
                         f"Codes must be -1 <= codes <= {len(level) - 1}"
                     )
-                if lo.value == -1:
+                if lo == -1:
                     # Now we can gather and insert null automatically
                     code[code == -1] = np.iinfo(size_type_dtype).min
                 result_col = libcudf.copying.gather(
diff --git a/python/cudf/cudf/core/window/ewm.py b/python/cudf/cudf/core/window/ewm.py
index 094df955273..c4a063a50e8 100644
--- a/python/cudf/cudf/core/window/ewm.py
+++ b/python/cudf/cudf/core/window/ewm.py
@@ -6,7 +6,6 @@
 
 import numpy as np
 
-from cudf._lib.reduce import scan
 from cudf.api.types import is_numeric_dtype
 from cudf.core.window.rolling import _RollingBase
 
@@ -194,13 +193,8 @@ def _apply_agg_column(
         # as such we need to convert the nans to nulls before
         # passing them in.
         to_libcudf_column = source_column.astype("float64").nans_to_nulls()
-
-        return scan(
-            agg_name,
-            to_libcudf_column,
-            True,
-            com=self.com,
-            adjust=self.adjust,
+        return to_libcudf_column.scan(
+            agg_name, True, com=self.com, adjust=self.adjust
         )

From f3f159ae166426125347e7d6f8dd7210d4075179 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Fri, 13 Dec 2024 08:46:57 -0500
Subject: [PATCH 72/78] Use no-sync copy for fixed-width types in cudf::concatenate (#17584)

Replacing `thrust::copy` with `cudaMemcpyAsync` improves performance up to 2x
in specific cases in `cudf::concatenate`.

`thrust::copy` performs a synchronization for a device-to-device copy even
though none is necessary. Using `rmm::exec_policy_nosync` had no effect. Will
work with CCCL to determine if this is a bug in `thrust::copy`, since
computing the return value does not require a sync.

Also moved the benchmark for concatenate from googlebench to nvbench.
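For context (not part of this patch), a minimal standalone sketch of the two
copy paths is shown below; the element type, buffer size, and stream handling
are illustrative assumptions, not code taken from this change:

```cpp
#include <cuda_runtime.h>

#include <thrust/copy.h>
#include <thrust/device_vector.h>

int main()
{
  thrust::device_vector<int> src(1 << 20, 7);  // illustrative size and value
  thrust::device_vector<int> dst(src.size());

  // thrust::copy returns an output iterator; per the observation above, it
  // synchronizes the host even for a pure device-to-device copy.
  thrust::copy(src.begin(), src.end(), dst.begin());

  // cudaMemcpyAsync only enqueues the same copy on a stream and returns;
  // the host blocks only at the explicit synchronize below.
  cudaStream_t stream;
  cudaStreamCreate(&stream);
  cudaMemcpyAsync(thrust::raw_pointer_cast(dst.data()),
                  thrust::raw_pointer_cast(src.data()),
                  src.size() * sizeof(int),
                  cudaMemcpyDeviceToDevice,
                  stream);
  cudaStreamSynchronize(stream);
  cudaStreamDestroy(stream);
  return 0;
}
```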
Closes #17172 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/17584 --- cpp/benchmarks/CMakeLists.txt | 5 +- cpp/benchmarks/column/concatenate.cpp | 169 ------------------------- cpp/benchmarks/copying/concatenate.cpp | 84 ++++++++++++ cpp/src/copying/concatenate.cu | 6 +- 4 files changed, 92 insertions(+), 172 deletions(-) delete mode 100644 cpp/benchmarks/column/concatenate.cpp create mode 100644 cpp/benchmarks/copying/concatenate.cpp diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 8e5ea900efa..b1456600c95 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -140,8 +140,9 @@ function(ConfigureNVBench CMAKE_BENCH_NAME) endfunction() # ################################################################################################## -# * column benchmarks ----------------------------------------------------------------------------- -ConfigureBench(COLUMN_CONCAT_BENCH column/concatenate.cpp) +# * copying benchmarks +# ----------------------------------------------------------------------------- +ConfigureNVBench(COPYING_NVBENCH copying/concatenate.cpp) # ################################################################################################## # * gather benchmark ------------------------------------------------------------------------------ diff --git a/cpp/benchmarks/column/concatenate.cpp b/cpp/benchmarks/column/concatenate.cpp deleted file mode 100644 index 51106c72137..00000000000 --- a/cpp/benchmarks/column/concatenate.cpp +++ /dev/null @@ -1,169 +0,0 @@ -/* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include -#include -#include -#include - -#include - -#include -#include -#include - -#include -#include -#include - -#include -#include - -class Concatenate : public cudf::benchmark {}; - -template -static void BM_concatenate(benchmark::State& state) -{ - cudf::size_type const num_rows = state.range(0); - cudf::size_type const num_cols = state.range(1); - - auto input = create_sequence_table(cycle_dtypes({cudf::type_to_id()}, num_cols), - row_count{num_rows}, - Nullable ? 
std::optional{2.0 / 3.0} : std::nullopt); - auto input_columns = input->view(); - std::vector column_views(input_columns.begin(), input_columns.end()); - - CUDF_CHECK_CUDA(0); - - for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::get_default_stream()); - auto result = cudf::concatenate(column_views); - } - - state.SetBytesProcessed(state.iterations() * num_cols * num_rows * sizeof(T)); -} - -#define CONCAT_BENCHMARK_DEFINE(type, nullable) \ - BENCHMARK_DEFINE_F(Concatenate, BM_concatenate##_##nullable_##nullable) \ - (::benchmark::State & st) { BM_concatenate(st); } \ - BENCHMARK_REGISTER_F(Concatenate, BM_concatenate##_##nullable_##nullable) \ - ->RangeMultiplier(8) \ - ->Ranges({{1 << 6, 1 << 18}, {2, 1024}}) \ - ->Unit(benchmark::kMillisecond) \ - ->UseManualTime(); - -CONCAT_BENCHMARK_DEFINE(int64_t, false) -CONCAT_BENCHMARK_DEFINE(int64_t, true) - -template -static void BM_concatenate_tables(benchmark::State& state) -{ - cudf::size_type const num_rows = state.range(0); - cudf::size_type const num_cols = state.range(1); - cudf::size_type const num_tables = state.range(2); - - std::vector> tables(num_tables); - std::generate_n(tables.begin(), num_tables, [&]() { - return create_sequence_table(cycle_dtypes({cudf::type_to_id()}, num_cols), - row_count{num_rows}, - Nullable ? std::optional{2.0 / 3.0} : std::nullopt); - }); - - // Generate table views - std::vector table_views(num_tables); - std::transform(tables.begin(), tables.end(), table_views.begin(), [](auto& table) mutable { - return table->view(); - }); - - CUDF_CHECK_CUDA(0); - - for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::get_default_stream()); - auto result = cudf::concatenate(table_views); - } - - state.SetBytesProcessed(state.iterations() * num_cols * num_rows * num_tables * sizeof(T)); -} - -#define CONCAT_TABLES_BENCHMARK_DEFINE(type, nullable) \ - BENCHMARK_DEFINE_F(Concatenate, BM_concatenate_tables##_##nullable_##nullable) \ - (::benchmark::State & st) { BM_concatenate_tables(st); } \ - BENCHMARK_REGISTER_F(Concatenate, BM_concatenate_tables##_##nullable_##nullable) \ - ->RangeMultiplier(8) \ - ->Ranges({{1 << 8, 1 << 12}, {2, 32}, {2, 128}}) \ - ->Unit(benchmark::kMillisecond) \ - ->UseManualTime(); - -CONCAT_TABLES_BENCHMARK_DEFINE(int64_t, false) -CONCAT_TABLES_BENCHMARK_DEFINE(int64_t, true) - -class ConcatenateStrings : public cudf::benchmark {}; - -template -static void BM_concatenate_strings(benchmark::State& state) -{ - using column_wrapper = cudf::test::strings_column_wrapper; - - auto const num_rows = state.range(0); - auto const num_chars = state.range(1); - auto const num_cols = state.range(2); - - std::string str(num_chars, 'a'); - - // Create owning columns - std::vector columns; - columns.reserve(num_cols); - std::generate_n(std::back_inserter(columns), num_cols, [num_rows, c_str = str.c_str()]() { - auto iter = thrust::make_constant_iterator(c_str); - if (Nullable) { - auto count_it = thrust::make_counting_iterator(0); - auto valid_iter = - thrust::make_transform_iterator(count_it, [](auto i) { return i % 3 == 0; }); - return column_wrapper(iter, iter + num_rows, valid_iter); - } else { - return column_wrapper(iter, iter + num_rows); - } - }); - - // Generate column views - std::vector column_views; - column_views.reserve(columns.size()); - std::transform( - columns.begin(), columns.end(), std::back_inserter(column_views), [](auto const& col) { - return static_cast(col); - }); - - CUDF_CHECK_CUDA(0); - - for (auto _ : state) { - cuda_event_timer raii(state, true, 
cudf::get_default_stream()); - auto result = cudf::concatenate(column_views); - } - - state.SetBytesProcessed(state.iterations() * num_cols * num_rows * - (sizeof(int32_t) + num_chars)); // offset + chars -} - -#define CONCAT_STRINGS_BENCHMARK_DEFINE(nullable) \ - BENCHMARK_DEFINE_F(Concatenate, BM_concatenate_strings##_##nullable_##nullable) \ - (::benchmark::State & st) { BM_concatenate_strings(st); } \ - BENCHMARK_REGISTER_F(Concatenate, BM_concatenate_strings##_##nullable_##nullable) \ - ->RangeMultiplier(8) \ - ->Ranges({{1 << 8, 1 << 14}, {8, 128}, {2, 256}}) \ - ->Unit(benchmark::kMillisecond) \ - ->UseManualTime(); - -CONCAT_STRINGS_BENCHMARK_DEFINE(false) -CONCAT_STRINGS_BENCHMARK_DEFINE(true) diff --git a/cpp/benchmarks/copying/concatenate.cpp b/cpp/benchmarks/copying/concatenate.cpp new file mode 100644 index 00000000000..586b479d0ad --- /dev/null +++ b/cpp/benchmarks/copying/concatenate.cpp @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include + +#include +#include +#include +#include + +#include + +#include + +static void bench_concatenate(nvbench::state& state) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const num_cols = static_cast(state.get_int64("num_cols")); + auto const nulls = static_cast(state.get_float64("nulls")); + + auto input = create_sequence_table( + cycle_dtypes({cudf::type_to_id()}, num_cols), row_count{num_rows}, nulls); + auto input_columns = input->view(); + auto column_views = std::vector(input_columns.begin(), input_columns.end()); + + auto stream = cudf::get_default_stream(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + state.add_global_memory_reads(num_rows * num_cols); + state.add_global_memory_writes(num_rows * num_cols); + + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch&) { auto result = cudf::concatenate(column_views); }); +} + +NVBENCH_BENCH(bench_concatenate) + .set_name("concatenate") + .add_int64_axis("num_rows", {64, 512, 4096, 32768, 262144}) + .add_int64_axis("num_cols", {2, 8, 64, 512, 1024}) + .add_float64_axis("nulls", {0.0, 0.3}); + +static void bench_concatenate_strings(nvbench::state& state) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const num_cols = static_cast(state.get_int64("num_cols")); + auto const row_width = static_cast(state.get_int64("row_width")); + auto const nulls = static_cast(state.get_float64("nulls")); + + data_profile const profile = + data_profile_builder() + .distribution(cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width) + .null_probability(nulls); + auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); + auto const input = column->view(); + + auto column_views = std::vector(num_cols, input); + + auto stream = cudf::get_default_stream(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + auto const sv = cudf::strings_column_view(input); + 
state.add_global_memory_reads(sv.chars_size(stream) * num_cols); + state.add_global_memory_writes(sv.chars_size(stream) * num_cols); + + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch&) { auto result = cudf::concatenate(column_views); }); +} + +NVBENCH_BENCH(bench_concatenate_strings) + .set_name("concatenate_strings") + .add_int64_axis("num_rows", {256, 512, 4096, 16384}) + .add_int64_axis("num_cols", {2, 8, 64, 256}) + .add_int64_axis("row_width", {32, 128}) + .add_float64_axis("nulls", {0.0, 0.3}); diff --git a/cpp/src/copying/concatenate.cu b/cpp/src/copying/concatenate.cu index d8419760120..6fc49afd7ac 100644 --- a/cpp/src/copying/concatenate.cu +++ b/cpp/src/copying/concatenate.cu @@ -308,7 +308,11 @@ std::unique_ptr for_each_concatenate(host_span views, auto count = 0; for (auto& v : views) { - thrust::copy(rmm::exec_policy(stream), v.begin(), v.end(), m_view.begin() + count); + cudaMemcpyAsync(m_view.begin() + count, + v.begin(), + v.size() * sizeof(T), + cudaMemcpyDeviceToDevice, + stream.value()); count += v.size(); } From a0957273a686875c8c3da19dfb80f4048e472e19 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Fri, 13 Dec 2024 08:47:35 -0500 Subject: [PATCH 73/78] Allow large strings in nvtext benchmarks (#17579) Removes the 2GB limit check from the nvtext benchmarks and adjusts the parameters to be consistent across the benchmarks. Also converts the subword-tokenizer to nvbench and removes the unused `word_minhash.cpp` source file. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/17579 --- cpp/benchmarks/CMakeLists.txt | 15 ++++-- cpp/benchmarks/text/edit_distance.cpp | 15 +++--- cpp/benchmarks/text/hash_ngrams.cpp | 15 +++--- cpp/benchmarks/text/jaccard.cpp | 13 ++--- cpp/benchmarks/text/normalize.cpp | 15 +++--- cpp/benchmarks/text/replace.cpp | 9 +--- cpp/benchmarks/text/subword.cpp | 58 +++++++++----------- cpp/benchmarks/text/tokenize.cpp | 15 +++--- cpp/benchmarks/text/vocab.cpp | 17 +++--- cpp/benchmarks/text/word_minhash.cpp | 77 --------------------------- 10 files changed, 74 insertions(+), 175 deletions(-) delete mode 100644 cpp/benchmarks/text/word_minhash.cpp diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index b1456600c95..749e1b628ee 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -352,11 +352,18 @@ ConfigureNVBench(BINARYOP_NVBENCH binaryop/binaryop.cpp binaryop/compiled_binary # ################################################################################################## # * nvtext benchmark ------------------------------------------------------------------- -ConfigureBench(TEXT_BENCH text/subword.cpp) - ConfigureNVBench( - TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/minhash.cpp - text/ngrams.cpp text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp + TEXT_NVBENCH + text/edit_distance.cpp + text/hash_ngrams.cpp + text/jaccard.cpp + text/minhash.cpp + text/ngrams.cpp + text/normalize.cpp + text/replace.cpp + text/subword.cpp + text/tokenize.cpp + text/vocab.cpp ) # ################################################################################################## diff --git a/cpp/benchmarks/text/edit_distance.cpp b/cpp/benchmarks/text/edit_distance.cpp index 6ffa90edb8f..0ad1ae30f8c 100644 --- a/cpp/benchmarks/text/edit_distance.cpp +++ 
b/cpp/benchmarks/text/edit_distance.cpp @@ -27,15 +27,11 @@ static void bench_edit_distance(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); - - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); data_profile const strings_profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const strings_table = create_random_table( {cudf::type_id::STRING, cudf::type_id::STRING}, row_count{num_rows}, strings_profile); cudf::strings_column_view input1(strings_table->view().column(0)); @@ -55,5 +51,6 @@ static void bench_edit_distance(nvbench::state& state) NVBENCH_BENCH(bench_edit_distance) .set_name("edit_distance") - .add_int64_axis("num_rows", {1024, 4096, 8192, 16364, 32768, 262144}) - .add_int64_axis("row_width", {8, 16, 32, 64, 128, 256}); + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144}); diff --git a/cpp/benchmarks/text/hash_ngrams.cpp b/cpp/benchmarks/text/hash_ngrams.cpp index 4e5daf83a3c..7577cf00c0f 100644 --- a/cpp/benchmarks/text/hash_ngrams.cpp +++ b/cpp/benchmarks/text/hash_ngrams.cpp @@ -27,16 +27,12 @@ static void bench_hash_ngrams(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); auto const ngrams = static_cast(state.get_int64("ngrams")); - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - data_profile const strings_profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const strings_table = create_random_table({cudf::type_id::STRING}, row_count{num_rows}, strings_profile); cudf::strings_column_view input(strings_table->view().column(0)); @@ -55,6 +51,7 @@ static void bench_hash_ngrams(nvbench::state& state) NVBENCH_BENCH(bench_hash_ngrams) .set_name("hash_ngrams") - .add_int64_axis("num_rows", {1024, 4096, 8192, 16364, 32768, 262144}) - .add_int64_axis("row_width", {128, 512, 2048}) + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {128, 512, 2048}) + .add_int64_axis("num_rows", {16384, 32768, 262144}) .add_int64_axis("ngrams", {5, 10}); diff --git a/cpp/benchmarks/text/jaccard.cpp b/cpp/benchmarks/text/jaccard.cpp index d5b74da6773..5506501138b 100644 --- a/cpp/benchmarks/text/jaccard.cpp +++ b/cpp/benchmarks/text/jaccard.cpp @@ -28,17 +28,13 @@ static void bench_jaccard(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); auto const substring_width = 
static_cast(state.get_int64("substring_width")); - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - data_profile const strings_profile = data_profile_builder() - .distribution(cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width) + .distribution(cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width) .no_validity(); auto const input_table = create_random_table( {cudf::type_id::STRING, cudf::type_id::STRING}, row_count{num_rows}, strings_profile); @@ -59,6 +55,7 @@ static void bench_jaccard(nvbench::state& state) NVBENCH_BENCH(bench_jaccard) .set_name("jaccard") + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {128, 512, 1024, 2048}) .add_int64_axis("num_rows", {32768, 131072, 262144}) - .add_int64_axis("row_width", {128, 512, 1024, 2048}) .add_int64_axis("substring_width", {5, 10}); diff --git a/cpp/benchmarks/text/normalize.cpp b/cpp/benchmarks/text/normalize.cpp index 71bccd80d39..594dc0de28a 100644 --- a/cpp/benchmarks/text/normalize.cpp +++ b/cpp/benchmarks/text/normalize.cpp @@ -28,16 +28,12 @@ static void bench_normalize(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); auto const normalize_type = state.get_string("type"); - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - data_profile const profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); cudf::strings_column_view input(column->view()); @@ -60,6 +56,7 @@ static void bench_normalize(nvbench::state& state) NVBENCH_BENCH(bench_normalize) .set_name("normalize") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}) + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_string_axis("type", {"spaces", "characters", "to_lower"}); diff --git a/cpp/benchmarks/text/replace.cpp b/cpp/benchmarks/text/replace.cpp index 767ebab3eee..24ca4e5dfd7 100644 --- a/cpp/benchmarks/text/replace.cpp +++ b/cpp/benchmarks/text/replace.cpp @@ -31,11 +31,6 @@ static void bench_replace(nvbench::state& state) auto const num_rows = static_cast(state.get_int64("num_rows")); auto const row_width = static_cast(state.get_int64("row_width")); - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - std::vector words{" ", "one ", "two ", "three ", "four ", "five ", "six ", "sevén ", "eight ", "nine ", "ten ", "eleven ", "twelve ", "thirteen ", "fourteen ", @@ -71,5 +66,5 @@ static void bench_replace(nvbench::state& state) NVBENCH_BENCH(bench_replace) .set_name("replace") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}); + .add_int64_axis("row_width", {32, 64, 128, 256}) + 
.add_int64_axis("num_rows", {32768, 262144, 2097152}); diff --git a/cpp/benchmarks/text/subword.cpp b/cpp/benchmarks/text/subword.cpp index dd8df695d3e..0b4e3bdefa5 100644 --- a/cpp/benchmarks/text/subword.cpp +++ b/cpp/benchmarks/text/subword.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,9 +14,6 @@ * limitations under the License. */ -#include -#include - #include #include @@ -24,6 +21,8 @@ #include +#include + #include #include #include @@ -54,40 +53,33 @@ static std::string create_hash_vocab_file() return hash_file; } -static void BM_subword_tokenizer(benchmark::State& state) +static void bench_subword_tokenizer(nvbench::state& state) { - auto const nrows = static_cast(state.range(0)); - std::vector h_strings(nrows, "This is a test "); + auto const num_rows = static_cast(state.get_int64("num_rows")); + + std::vector h_strings(num_rows, "This is a test "); cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end()); static std::string hash_file = create_hash_vocab_file(); std::vector offsets{14}; - uint32_t max_sequence_length = 64; - uint32_t stride = 48; - uint32_t do_truncate = 0; - uint32_t do_lower = 1; - // - auto vocab = nvtext::load_vocabulary_file(hash_file); - for (auto _ : state) { - cuda_event_timer raii(state, true); - auto result = nvtext::subword_tokenize(cudf::strings_column_view{strings}, - *vocab, - max_sequence_length, - stride, - do_lower, - do_truncate); - } -} + uint32_t max_sequence = 64; + uint32_t stride = 48; + uint32_t do_truncate = 0; + uint32_t do_lower = 1; -class Subword : public cudf::benchmark {}; + auto input = cudf::strings_column_view{strings}; -#define SUBWORD_BM_BENCHMARK_DEFINE(name) \ - BENCHMARK_DEFINE_F(Subword, name)(::benchmark::State & state) { BM_subword_tokenizer(state); } \ - BENCHMARK_REGISTER_F(Subword, name) \ - ->RangeMultiplier(2) \ - ->Range(1 << 10, 1 << 17) \ - ->UseManualTime() \ - ->Unit(benchmark::kMillisecond); + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + auto chars_size = input.chars_size(cudf::get_default_stream()); + state.add_global_memory_reads(chars_size); + state.add_global_memory_writes(num_rows * max_sequence); -SUBWORD_BM_BENCHMARK_DEFINE(BM_subword_tokenizer); + auto vocab = nvtext::load_vocabulary_file(hash_file); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto result = + nvtext::subword_tokenize(input, *vocab, max_sequence, stride, do_lower, do_truncate); + }); +} -// BENCHMARK_MAIN(); +NVBENCH_BENCH(bench_subword_tokenizer) + .set_name("subword_tokenize") + .add_int64_axis("num_rows", {32768, 262144, 2097152}); diff --git a/cpp/benchmarks/text/tokenize.cpp b/cpp/benchmarks/text/tokenize.cpp index e83310e0343..b9590c5539f 100644 --- a/cpp/benchmarks/text/tokenize.cpp +++ b/cpp/benchmarks/text/tokenize.cpp @@ -31,17 +31,13 @@ static void bench_tokenize(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); auto const tokenize_type = state.get_string("type"); - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip 
benchmarks greater than size_type limit"); - } - data_profile const profile = data_profile_builder() - .distribution(cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width) + .distribution(cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width) .no_validity(); auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); cudf::strings_column_view input(column->view()); @@ -82,6 +78,7 @@ static void bench_tokenize(nvbench::state& state) NVBENCH_BENCH(bench_tokenize) .set_name("tokenize") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}) + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_string_axis("type", {"whitespace", "multi", "count", "count_multi", "ngrams", "characters"}); diff --git a/cpp/benchmarks/text/vocab.cpp b/cpp/benchmarks/text/vocab.cpp index 523d277df18..0502f375d99 100644 --- a/cpp/benchmarks/text/vocab.cpp +++ b/cpp/benchmarks/text/vocab.cpp @@ -33,16 +33,12 @@ static void bench_vocab_tokenize(nvbench::state& state) { auto const stream = cudf::get_default_stream(); auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - - auto const column = [num_rows, row_width] { + auto const column = [num_rows, min_width, max_width] { data_profile const profile = data_profile_builder().no_validity().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const col = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); return cudf::strings::filter_characters_of_type( cudf::strings_column_view(col->view()), @@ -85,5 +81,6 @@ static void bench_vocab_tokenize(nvbench::state& state) NVBENCH_BENCH(bench_vocab_tokenize) .set_name("vocab_tokenize") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024}) - .add_int64_axis("num_rows", {262144, 524288, 1048576, 2097152, 4194304, 16777216}); + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}); diff --git a/cpp/benchmarks/text/word_minhash.cpp b/cpp/benchmarks/text/word_minhash.cpp deleted file mode 100644 index adc3dddc59c..00000000000 --- a/cpp/benchmarks/text/word_minhash.cpp +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include - -#include -#include -#include -#include - -#include - -#include - -#include - -static void bench_word_minhash(nvbench::state& state) -{ - auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); - auto const seed_count = static_cast(state.get_int64("seed_count")); - auto const base64 = state.get_int64("hash_type") == 64; - - data_profile const strings_profile = - data_profile_builder().distribution(cudf::type_id::STRING, distribution_id::NORMAL, 0, 5); - auto strings_table = - create_random_table({cudf::type_id::STRING}, row_count{num_rows}, strings_profile); - - auto const num_offsets = (num_rows / row_width) + 1; - auto offsets = cudf::sequence(num_offsets, - cudf::numeric_scalar(0), - cudf::numeric_scalar(row_width)); - - auto source = cudf::make_lists_column(num_offsets - 1, - std::move(offsets), - std::move(strings_table->release().front()), - 0, - rmm::device_buffer{}); - - data_profile const seeds_profile = data_profile_builder().no_validity().distribution( - cudf::type_to_id(), distribution_id::NORMAL, 0, 256); - auto const seed_type = base64 ? cudf::type_id::UINT64 : cudf::type_id::UINT32; - auto const seeds_table = create_random_table({seed_type}, row_count{seed_count}, seeds_profile); - auto seeds = seeds_table->get_column(0); - - state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); - - cudf::strings_column_view input(cudf::lists_column_view(source->view()).child()); - auto chars_size = input.chars_size(cudf::get_default_stream()); - state.add_global_memory_reads(chars_size); - state.add_global_memory_writes(num_rows); // output are hashes - - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - auto result = base64 ? nvtext::word_minhash64(source->view(), seeds.view()) - : nvtext::word_minhash(source->view(), seeds.view()); - }); -} - -NVBENCH_BENCH(bench_word_minhash) - .set_name("word_minhash") - .add_int64_axis("num_rows", {131072, 262144, 524288, 1048576, 2097152}) - .add_int64_axis("row_width", {10, 100, 1000}) - .add_int64_axis("seed_count", {2, 25}) - .add_int64_axis("hash_type", {32, 64}); From 62669e04cc11bd53dab1102e83aba76804f4dbde Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Fri, 13 Dec 2024 10:10:02 -0500 Subject: [PATCH 74/78] Fix ctest fail running libcudf tests in a Debug build (#17576) Fixes libcudf gtest failures when running with ctest on a Debug build. The error from `LastTest.log` indicates: ``` 1/106 Testing: COLUMN_TEST 1/106 Test: COLUMN_TEST Command: "/conda/envs/rapids/bin/cmake" "-Dcommand_to_run=/cudf/cpp/build/gtests/COLUMN_TEST" "-Dcommand_args=" "-P=/cudf/cpp/build/rapids-cmake/./run_gpu_test.cmake" Directory: /cudf/cpp/build/tests "COLUMN_TEST" start time: Dec 11 15:46 UTC Output: ---------------------------------------------------------- /conda/envs/rapids/bin/cmake: symbol lookup error: /cudf/cpp/build/libcudf_identify_stream_usage_mode_cudf.so: undefined symbol: _ZN3rmm6loggerD1Ev Test time = 0.00 sec ---------------------------------------------------------- Test Failed. 
"COLUMN_TEST" end time: Dec 11 15:46 UTC "COLUMN_TEST" time elapsed: 00:00:00 ``` Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/17576 --- cpp/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 2f17b57b0a4..78f529a44d3 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -1105,7 +1105,7 @@ if(CUDF_BUILD_STREAMS_TEST_UTIL) ${_tgt} PRIVATE "$:${CUDF_CXX_FLAGS}>>" ) target_include_directories(${_tgt} PRIVATE "$") - target_link_libraries(${_tgt} PUBLIC CUDA::cudart rmm::rmm) + target_link_libraries(${_tgt} PUBLIC CUDA::cudart rmm::rmm rmm::rmm_logger rmm::rmm_logger_impl) if(CUDF_BUILD_STACKTRACE_DEBUG) target_link_libraries(${_tgt} PRIVATE cudf_backtrace) endif() From 4d6925ce1b83e10ea249346436ff8fdc4d28d73d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 13 Dec 2024 10:30:45 -0800 Subject: [PATCH 75/78] Remove unused masked keyword in column_empty (#17530) Follow up to https://github.com/rapidsai/cudf/pull/16715. Now that the usages of the `masked` keyword in RAPIDS have been address (https://github.com/rapidsai/cuspatial/pull/1496 is the only one I could find), I think we can remove this keyword all together in this method Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/17530 --- python/cudf/cudf/core/column/categorical.py | 2 +- python/cudf/cudf/core/column/column.py | 12 ++---- python/cudf/cudf/core/column/datetime.py | 6 +-- .../cudf/cudf/core/column/numerical_base.py | 2 +- python/cudf/cudf/core/column/string.py | 2 +- python/cudf/cudf/core/column/timedelta.py | 2 +- python/cudf/cudf/core/dataframe.py | 39 +++++++------------ python/cudf/cudf/core/dtypes.py | 4 +- python/cudf/cudf/core/groupby/groupby.py | 7 ++-- python/cudf/cudf/core/index.py | 2 +- python/cudf/cudf/core/indexed_frame.py | 1 - python/cudf/cudf/io/parquet.py | 1 - 12 files changed, 28 insertions(+), 52 deletions(-) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 71ec11e75af..a0cf38c6f51 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -1193,7 +1193,7 @@ def _concat( f"size > {libcudf.MAX_COLUMN_SIZE_STR}" ) elif newsize == 0: - codes_col = column.column_empty(0, head.codes.dtype, masked=True) + codes_col = column.column_empty(0, head.codes.dtype) else: codes_col = column.concat_columns(codes) # type: ignore[arg-type] diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 42b4fda8be2..624a3ac95ed 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -551,7 +551,7 @@ def slice(self, start: int, stop: int, stride: int | None = None) -> Self: if stop < 0 and not (stride < 0 and stop == -1): stop = stop + len(self) if (stride > 0 and start >= stop) or (stride < 0 and start <= stop): - return cast(Self, column_empty(0, self.dtype, masked=True)) + return cast(Self, column_empty(0, self.dtype)) # compute mask slice if stride == 1: return libcudf.copying.column_slice(self, [start, stop])[ @@ -1054,7 +1054,7 @@ def astype(self, dtype: Dtype, copy: bool = False) -> ColumnBase: if self.dtype == dtype: result = self else: - 
result = column_empty(0, dtype=dtype, masked=self.nullable) + result = column_empty(0, dtype=dtype) elif dtype == "category": # TODO: Figure out why `cudf.dtype("category")` # astype's different than just the string @@ -1625,7 +1625,6 @@ def _has_any_nan(arbitrary: pd.Series | np.ndarray) -> bool: def column_empty( row_count: int, dtype: Dtype = "object", - masked: bool = False, for_numba: bool = False, ) -> ColumnBase: """ @@ -1642,9 +1641,6 @@ def column_empty( dtype : Dtype Type of the column. - masked : bool - Unused. - for_numba : bool, default False If True, don't allocate a mask as it's not supported by numba. """ @@ -2420,7 +2416,7 @@ def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase: """Concatenate a sequence of columns.""" if len(objs) == 0: dtype = cudf.dtype(None) - return column_empty(0, dtype=dtype, masked=True) + return column_empty(0, dtype=dtype) # If all columns are `NumericalColumn` with different dtypes, # we cast them to a common dtype. @@ -2467,7 +2463,7 @@ def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase: f"size > {libcudf.MAX_COLUMN_SIZE_STR}" ) elif newsize == 0: - return column_empty(0, head.dtype, masked=True) + return column_empty(0, head.dtype) # Filter out inputs that have 0 length, then concatenate. objs_with_len = [o for o in objs if len(o)] diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index b526a6efa51..81b82040b8d 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -598,14 +598,12 @@ def strftime(self, format: str) -> cudf.core.column.StringColumn: if len(self) == 0: return cast( cudf.core.column.StringColumn, - column.column_empty(0, dtype="object", masked=False), + column.column_empty(0, dtype="object"), ) if format in _DATETIME_SPECIAL_FORMATS: names = as_column(_DATETIME_NAMES) else: - names = cudf.core.column.column_empty( - 0, dtype="object", masked=False - ) + names = column.column_empty(0, dtype="object") return string._datetime_to_str_typecast_functions[self.dtype]( self, format, names ) diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py index e06a0447f5c..7a39355dd50 100644 --- a/python/cudf/cudf/core/column/numerical_base.py +++ b/python/cudf/cudf/core/column/numerical_base.py @@ -139,7 +139,7 @@ def quantile( result = cast( NumericalBaseColumn, cudf.core.column.column_empty( - row_count=len(q), dtype=self.dtype, masked=True + row_count=len(q), dtype=self.dtype ), ) else: diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index c021554f3bd..d76caa5c3b8 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5855,7 +5855,7 @@ def strptime( f"dtype must be datetime or timedelta type, not {dtype}" ) elif self.null_count == len(self): - return column.column_empty(len(self), dtype=dtype, masked=True) # type: ignore[return-value] + return column.column_empty(len(self), dtype=dtype) # type: ignore[return-value] elif (self == "None").any(): raise ValueError( "Cannot convert `None` value to datetime or timedelta." 
diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index f3a7916aa35..8b1515acae2 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -294,7 +294,7 @@ def strftime(self, format: str) -> cudf.core.column.StringColumn: if len(self) == 0: return cast( cudf.core.column.StringColumn, - column.column_empty(0, dtype="object", masked=False), + column.column_empty(0, dtype="object"), ) else: return string._timedelta_to_str_typecast_functions[self.dtype]( diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 8cdc45e12da..fce361e18ea 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -774,9 +774,7 @@ def __init__( label_dtype = getattr(columns, "dtype", None) self._data = ColumnAccessor( { - k: column.column_empty( - len(self), dtype="object", masked=True - ) + k: column_empty(len(self), dtype="object") for k in columns }, level_names=tuple(columns.names) @@ -979,8 +977,8 @@ def _init_from_series_list(self, data, columns, index): if columns is not None: for col_name in columns: if col_name not in self._data: - self._data[col_name] = column.column_empty( - row_count=len(self), dtype=None, masked=True + self._data[col_name] = column_empty( + row_count=len(self), dtype=None ) self._data._level_names = ( tuple(columns.names) @@ -1031,11 +1029,7 @@ def _init_from_list_like(self, data, index=None, columns=None): data = list(itertools.zip_longest(*data)) if columns is not None and len(data) == 0: - data = [ - cudf.core.column.column_empty(row_count=0, dtype=None) - for _ in columns - ] - + data = [column_empty(row_count=0, dtype=None) for _ in columns] for col_name, col in enumerate(data): self._data[col_name] = column.as_column(col) self._data.rangeindex = True @@ -1074,9 +1068,8 @@ def _init_from_dict_like( # the provided index, so we need to return a masked # array of nulls if an index is given. 
empty_column = functools.partial( - cudf.core.column.column_empty, - row_count=(0 if index is None else len(index)), - masked=index is not None, + column_empty, + row_count=0 if index is None else len(index), ) data = { @@ -1421,7 +1414,7 @@ def __setitem__(self, arg, value): new_columns = ( value if key == arg - else column.column_empty( + else column_empty( row_count=length, dtype=col.dtype ) for key, col in self._column_labels_and_values @@ -3373,7 +3366,7 @@ def _insert(self, loc, name, value, nan_as_null=None, ignore_index=True): if num_cols != 0: ca = self._data._from_columns_like_self( ( - column.column_empty(row_count=length, dtype=dtype) + column_empty(row_count=length, dtype=dtype) for _, dtype in self._dtypes ), verify=False, @@ -3479,7 +3472,7 @@ def diff(self, periods=1, axis=0): if abs(periods) > len(self): df = cudf.DataFrame._from_data( { - name: column_empty(len(self), dtype=dtype, masked=True) + name: column_empty(len(self), dtype=dtype) for name, dtype in zip(self._column_names, self.dtypes) } ) @@ -3859,9 +3852,7 @@ def agg(self, aggs, axis=None): result = DataFrame(index=idxs, columns=cols) for key in aggs.keys(): col = self[key] - col_empty = column_empty( - len(idxs), dtype=col.dtype, masked=True - ) + col_empty = column_empty(len(idxs), dtype=col.dtype) ans = cudf.Series._from_column( col_empty, index=cudf.Index(idxs) ) @@ -6177,9 +6168,7 @@ def quantile( quant_index=False, )._column if len(res) == 0: - res = column.column_empty( - row_count=len(qs), dtype=ser.dtype - ) + res = column_empty(row_count=len(qs), dtype=ser.dtype) result[k] = res result = DataFrame._from_data(result) @@ -7333,9 +7322,7 @@ def unnamed_group_generator(): ) all_nulls = functools.cache( - functools.partial( - column_empty, self.shape[0], common_type, masked=True - ) + functools.partial(column_empty, self.shape[0], common_type) ) # homogenize the dtypes of the columns @@ -8582,7 +8569,7 @@ def _cast_cols_to_common_dtypes(col_idxs, list_of_columns, dtypes, categories): # If column not in this df, fill with an all-null column if idx >= len(cols) or cols[idx] is None: n = len(next(x for x in cols if x is not None)) - cols[idx] = column_empty(row_count=n, dtype=dtype, masked=True) + cols[idx] = column_empty(row_count=n, dtype=dtype) else: # If column is categorical, rebase the codes with the # combined categories, and cast the new codes to the diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 9bb29f1920a..971f0be77f8 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -189,9 +189,7 @@ def categories(self) -> cudf.Index: Index(['b', 'a'], dtype='object') """ if self._categories is None: - col = cudf.core.column.column_empty( - 0, dtype="object", masked=False - ) + col = cudf.core.column.column_empty(0, dtype="object") else: col = self._categories return cudf.Index._from_column(col) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index d4f3394833a..a8d82f977d5 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -493,9 +493,7 @@ def size(self): """ Return the size of each group. 
""" - col = cudf.core.column.column_empty( - len(self.obj), "int8", masked=False - ) + col = cudf.core.column.column_empty(len(self.obj), "int8") result = ( cudf.Series._from_column(col, name=getattr(self.obj, "name", None)) .groupby(self.grouping, sort=self._sort, dropna=self._dropna) @@ -523,7 +521,8 @@ def cumcount(self, ascending: bool = True): return ( cudf.Series._from_column( cudf.core.column.column_empty( - len(self.obj), "int8", masked=False + len(self.obj), + "int8", ), index=self.obj.index, ) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index eeb6e3bd547..8d3ef1036d1 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -336,7 +336,7 @@ def _values(self) -> ColumnBase: if len(self) > 0: return column.as_column(self._range, dtype=self.dtype) else: - return column.column_empty(0, masked=False, dtype=self.dtype) + return column.column_empty(0, dtype=self.dtype) def _clean_nulls_from_index(self) -> Self: return self diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 0e6a5e03ea6..81d954960e2 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -3851,7 +3851,6 @@ def _reindex( if name in df._data else cudf.core.column.column.column_empty( dtype=dtypes.get(name, np.float64), - masked=True, row_count=len(index), ) ) diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index 66095d4a155..153ee0fa01a 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -1139,7 +1139,6 @@ def _parquet_to_frame( dfs[-1][name] = column_empty( row_count=_len, dtype=_dtype, - masked=True, ) else: dfs[-1][name] = as_column( From 1a67646fa3998788757b05a08eae1c8d1ee73eb2 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 13 Dec 2024 12:23:30 -0800 Subject: [PATCH 76/78] Move cudf._lib.sort to cudf.core._internals (#17488) Contributes to https://github.com/rapidsai/cudf/issues/17317 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/17488 --- python/cudf/cudf/_lib/CMakeLists.txt | 4 +- python/cudf/cudf/_lib/__init__.py | 1 - python/cudf/cudf/_lib/sort.pyx | 365 ------------------ python/cudf/cudf/core/_internals/sorting.py | 205 ++++++++++ python/cudf/cudf/core/column/column.py | 23 +- python/cudf/cudf/core/column/numerical.py | 65 ++-- .../cudf/cudf/core/column/numerical_base.py | 4 +- python/cudf/cudf/core/frame.py | 3 +- python/cudf/cudf/core/groupby/groupby.py | 25 +- python/cudf/cudf/core/indexed_frame.py | 44 ++- python/cudf/cudf/core/join/join.py | 5 +- python/cudf/cudf/core/multiindex.py | 3 +- python/cudf/cudf/core/series.py | 7 +- 13 files changed, 324 insertions(+), 430 deletions(-) delete mode 100644 python/cudf/cudf/_lib/sort.pyx create mode 100644 python/cudf/cudf/core/_internals/sorting.py diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index 8cec8af3c67..427ffcc8c12 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -12,8 +12,8 @@ # the License. 
# ============================================================================= -set(cython_sources column.pyx copying.pyx groupby.pyx interop.pyx scalar.pyx sort.pyx - stream_compaction.pyx string_casting.pyx strings_udf.pyx types.pyx utils.pyx +set(cython_sources column.pyx copying.pyx groupby.pyx interop.pyx scalar.pyx stream_compaction.pyx + string_casting.pyx strings_udf.pyx types.pyx utils.pyx ) set(linked_libraries cudf::cudf) diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py index 001e5cbb676..26afdd62caf 100644 --- a/python/cudf/cudf/_lib/__init__.py +++ b/python/cudf/cudf/_lib/__init__.py @@ -5,7 +5,6 @@ copying, groupby, interop, - sort, stream_compaction, string_casting, strings_udf, diff --git a/python/cudf/cudf/_lib/sort.pyx b/python/cudf/cudf/_lib/sort.pyx deleted file mode 100644 index eefe37d9880..00000000000 --- a/python/cudf/cudf/_lib/sort.pyx +++ /dev/null @@ -1,365 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from itertools import repeat - -from cudf.core.buffer import acquire_spill_lock - -from libcpp cimport bool - -from pylibcudf.libcudf.aggregation cimport rank_method -from cudf._lib.column cimport Column -from cudf._lib.utils cimport columns_from_pylibcudf_table - -import pylibcudf - - -@acquire_spill_lock() -def is_sorted( - list source_columns, object ascending=None, object null_position=None -): - """ - Checks whether the rows of a `table` are sorted in lexicographical order. - - Parameters - ---------- - source_columns : list of columns - columns to be checked for sort order - ascending : None or list-like of booleans - None or list-like of boolean values indicating expected sort order of - each column. If list-like, size of list-like must be len(columns). If - None, all columns expected sort order is set to ascending. False (0) - - descending, True (1) - ascending. - null_position : None or list-like of booleans - None or list-like of boolean values indicating desired order of nulls - compared to other elements. If list-like, size of list-like must be - len(columns). If None, null order is set to before. False (0) - after, - True (1) - before. - - Returns - ------- - returns : boolean - Returns True, if sorted as expected by ``ascending`` and - ``null_position``, False otherwise. 
- """ - - if ascending is None: - column_order = [pylibcudf.types.Order.ASCENDING] * len(source_columns) - else: - if len(ascending) != len(source_columns): - raise ValueError( - f"Expected a list-like of length {len(source_columns)}, " - f"got length {len(ascending)} for `ascending`" - ) - column_order = [pylibcudf.types.Order.DESCENDING] * len(source_columns) - for idx, val in enumerate(ascending): - if val: - column_order[idx] = pylibcudf.types.Order.ASCENDING - - if null_position is None: - null_precedence = [pylibcudf.types.NullOrder.AFTER] * len(source_columns) - else: - if len(null_position) != len(source_columns): - raise ValueError( - f"Expected a list-like of length {len(source_columns)}, " - f"got length {len(null_position)} for `null_position`" - ) - null_precedence = [pylibcudf.types.NullOrder.AFTER] * len(source_columns) - for idx, val in enumerate(null_position): - if val: - null_precedence[idx] = pylibcudf.types.NullOrder.BEFORE - - return pylibcudf.sorting.is_sorted( - pylibcudf.Table( - [c.to_pylibcudf(mode="read") for c in source_columns] - ), - column_order, - null_precedence - ) - - -def ordering(column_order, null_precedence): - """ - Construct order and null order vectors - - Parameters - ---------- - column_order - Iterable of bool (True for ascending order, False for descending) - null_precedence - Iterable string for null positions ("first" for start, "last" for end) - - Both iterables must be the same length (not checked) - - Returns - ------- - pair of vectors (order, and null_order) - """ - c_column_order = [] - c_null_precedence = [] - for asc, null in zip(column_order, null_precedence): - c_column_order.append( - pylibcudf.types.Order.ASCENDING if asc else pylibcudf.types.Order.DESCENDING - ) - if asc ^ (null == "first"): - c_null_precedence.append(pylibcudf.types.NullOrder.AFTER) - elif asc ^ (null == "last"): - c_null_precedence.append(pylibcudf.types.NullOrder.BEFORE) - else: - raise ValueError(f"Invalid null precedence {null}") - return c_column_order, c_null_precedence - - -@acquire_spill_lock() -def order_by( - list columns_from_table, - object ascending, - str na_position, - *, - bool stable -): - """ - Get index to sort the table in ascending/descending order. - - Parameters - ---------- - columns_from_table : list[Column] - Columns from the table which will be sorted - ascending : sequence[bool] - Sequence of boolean values which correspond to each column - in the table to be sorted signifying the order of each column - True - Ascending and False - Descending - na_position : str - Whether null values should show up at the "first" or "last" - position of **all** sorted column. - stable : bool - Should the sort be stable? (no default) - - Returns - ------- - Column of indices that sorts the table - """ - order = ordering(ascending, repeat(na_position)) - func = getattr(pylibcudf.sorting, f"{'stable_' if stable else ''}sorted_order") - - return Column.from_pylibcudf( - func( - pylibcudf.Table( - [c.to_pylibcudf(mode="read") for c in columns_from_table], - ), - order[0], - order[1], - ) - ) - - -@acquire_spill_lock() -def sort( - list values, - list column_order=None, - list null_precedence=None, -): - """ - Sort the table in ascending/descending order. - - Parameters - ---------- - values : list[Column] - Columns of the table which will be sorted - column_order : list[bool], optional - Sequence of boolean values which correspond to each column in - keys providing the sort order (default all True). - With True <=> ascending; False <=> descending. 
- null_precedence : list[str], optional - Sequence of "first" or "last" values (default "first") - indicating the position of null values when sorting the keys. - """ - ncol = len(values) - order = ordering( - column_order or repeat(True, ncol), - null_precedence or repeat("first", ncol), - ) - return columns_from_pylibcudf_table( - pylibcudf.sorting.sort( - pylibcudf.Table([c.to_pylibcudf(mode="read") for c in values]), - order[0], - order[1], - ) - ) - - -@acquire_spill_lock() -def sort_by_key( - list values, - list keys, - object ascending, - object na_position, - *, - bool stable, -): - """ - Sort a table by given keys - - Parameters - ---------- - values : list[Column] - Columns of the table which will be sorted - keys : list[Column] - Columns making up the sort key - ascending : list[bool] - Sequence of boolean values which correspond to each column - in the table to be sorted signifying the order of each column - True - Ascending and False - Descending - na_position : list[str] - Sequence of "first" or "last" values (default "first") - indicating the position of null values when sorting the keys. - stable : bool - Should the sort be stable? (no default) - - Returns - ------- - list[Column] - list of value columns sorted by keys - """ - order = ordering(ascending, na_position) - func = getattr(pylibcudf.sorting, f"{'stable_' if stable else ''}sort_by_key") - return columns_from_pylibcudf_table( - func( - pylibcudf.Table([c.to_pylibcudf(mode="read") for c in values]), - pylibcudf.Table([c.to_pylibcudf(mode="read") for c in keys]), - order[0], - order[1], - ) - ) - - -@acquire_spill_lock() -def segmented_sort_by_key( - list values, - list keys, - Column segment_offsets, - list column_order=None, - list null_precedence=None, - *, - bool stable, -): - """ - Sort segments of a table by given keys - - Parameters - ---------- - values : list[Column] - Columns of the table which will be sorted - keys : list[Column] - Columns making up the sort key - offsets : Column - Segment offsets - column_order : list[bool], optional - Sequence of boolean values which correspond to each column in - keys providing the sort order (default all True). - With True <=> ascending; False <=> descending. - null_precedence : list[str], optional - Sequence of "first" or "last" values (default "first") - indicating the position of null values when sorting the keys. - stable : bool - Should the sort be stable? (no default) - - Returns - ------- - list[Column] - list of value columns sorted by keys - """ - ncol = len(values) - order = ordering( - column_order or repeat(True, ncol), - null_precedence or repeat("first", ncol), - ) - func = getattr( - pylibcudf.sorting, - f"{'stable_' if stable else ''}segmented_sort_by_key" - ) - return columns_from_pylibcudf_table( - func( - pylibcudf.Table([c.to_pylibcudf(mode="read") for c in values]), - pylibcudf.Table([c.to_pylibcudf(mode="read") for c in keys]), - segment_offsets.to_pylibcudf(mode="read"), - order[0], - order[1], - ) - ) - - -@acquire_spill_lock() -def digitize(list source_columns, list bins, bool right=False): - """ - Return the indices of the bins to which each value in source_table belongs. - - Parameters - ---------- - source_columns : Input columns to be binned. - bins : List containing columns of bins - right : Indicating whether the intervals include the - right or the left bin edge. 
- """ - return Column.from_pylibcudf( - getattr(pylibcudf.search, "lower_bound" if right else "upper_bound")( - pylibcudf.Table( - [c.to_pylibcudf(mode="read") for c in bins] - ), - pylibcudf.Table( - [c.to_pylibcudf(mode="read") for c in source_columns] - ), - [pylibcudf.types.Order.ASCENDING]*len(bins), - [pylibcudf.types.NullOrder.BEFORE]*len(bins) - ) - ) - - -@acquire_spill_lock() -def rank_columns(list source_columns, rank_method method, str na_option, - bool ascending, bool pct - ): - """ - Compute numerical data ranks (1 through n) of each column in the dataframe - """ - column_order = ( - pylibcudf.types.Order.ASCENDING - if ascending - else pylibcudf.types.Order.DESCENDING - ) - # ascending - # #top = na_is_smallest - # #bottom = na_is_largest - # #keep = na_is_largest - # descending - # #top = na_is_largest - # #bottom = na_is_smallest - # #keep = na_is_smallest - if ascending: - if na_option == 'top': - null_precedence = pylibcudf.types.NullOrder.BEFORE - else: - null_precedence = pylibcudf.types.NullOrder.AFTER - else: - if na_option == 'top': - null_precedence = pylibcudf.types.NullOrder.AFTER - else: - null_precedence = pylibcudf.types.NullOrder.BEFORE - c_null_handling = ( - pylibcudf.types.NullPolicy.EXCLUDE - if na_option == 'keep' - else pylibcudf.types.NullPolicy.INCLUDE - ) - - return [ - Column.from_pylibcudf( - pylibcudf.sorting.rank( - col.to_pylibcudf(mode="read"), - method, - column_order, - c_null_handling, - null_precedence, - pct, - ) - ) - for col in source_columns - ] diff --git a/python/cudf/cudf/core/_internals/sorting.py b/python/cudf/cudf/core/_internals/sorting.py new file mode 100644 index 00000000000..69f9e7664b1 --- /dev/null +++ b/python/cudf/cudf/core/_internals/sorting.py @@ -0,0 +1,205 @@ +# Copyright (c) 2020-2024, NVIDIA CORPORATION. +from __future__ import annotations + +import itertools +from typing import TYPE_CHECKING, Literal + +import pylibcudf as plc + +from cudf._lib.column import Column +from cudf.core.buffer import acquire_spill_lock + +if TYPE_CHECKING: + from collections.abc import Iterable + + from cudf.core.column import ColumnBase + + +@acquire_spill_lock() +def is_sorted( + source_columns: list[ColumnBase], + ascending: list[bool] | None = None, + null_position: list[bool] | None = None, +) -> bool: + """ + Checks whether the rows of a `table` are sorted in lexicographical order. + + Parameters + ---------- + source_columns : list of columns + columns to be checked for sort order + ascending : None or list-like of booleans + None or list-like of boolean values indicating expected sort order of + each column. If list-like, size of list-like must be len(columns). If + None, all columns expected sort order is set to ascending. False (0) - + descending, True (1) - ascending. + null_position : None or list-like of booleans + None or list-like of boolean values indicating desired order of nulls + compared to other elements. If list-like, size of list-like must be + len(columns). If None, null order is set to before. False (0) - after, + True (1) - before. + + Returns + ------- + returns : boolean + Returns True, if sorted as expected by ``ascending`` and + ``null_position``, False otherwise. 
+ """ + if ascending is None: + column_order = [plc.types.Order.ASCENDING] * len(source_columns) + else: + if len(ascending) != len(source_columns): + raise ValueError( + f"Expected a list-like of length {len(source_columns)}, " + f"got length {len(ascending)} for `ascending`" + ) + column_order = [ + plc.types.Order.ASCENDING if asc else plc.types.Order.DESCENDING + for asc in ascending + ] + + if null_position is None: + null_precedence = [plc.types.NullOrder.AFTER] * len(source_columns) + else: + if len(null_position) != len(source_columns): + raise ValueError( + f"Expected a list-like of length {len(source_columns)}, " + f"got length {len(null_position)} for `null_position`" + ) + null_precedence = [ + plc.types.NullOrder.BEFORE if null else plc.types.NullOrder.AFTER + for null in null_position + ] + + return plc.sorting.is_sorted( + plc.Table([col.to_pylibcudf(mode="read") for col in source_columns]), + column_order, + null_precedence, + ) + + +def ordering( + column_order: list[bool], + null_precedence: Iterable[Literal["first", "last"]], +) -> tuple[list[plc.types.Order], list[plc.types.NullOrder]]: + """ + Construct order and null order vectors + + Parameters + ---------- + column_order + Iterable of bool (True for ascending order, False for descending) + null_precedence + Iterable string for null positions ("first" for start, "last" for end) + + Both iterables must be the same length (not checked) + + Returns + ------- + pair of vectors (order, and null_order) + """ + c_column_order = [] + c_null_precedence = [] + for asc, null in zip(column_order, null_precedence): + c_column_order.append( + plc.types.Order.ASCENDING if asc else plc.types.Order.DESCENDING + ) + if asc ^ (null == "first"): + c_null_precedence.append(plc.types.NullOrder.AFTER) + elif asc ^ (null == "last"): + c_null_precedence.append(plc.types.NullOrder.BEFORE) + else: + raise ValueError(f"Invalid null precedence {null}") + return c_column_order, c_null_precedence + + +@acquire_spill_lock() +def order_by( + columns_from_table: list[ColumnBase], + ascending: list[bool], + na_position: Literal["first", "last"], + *, + stable: bool, +): + """ + Get index to sort the table in ascending/descending order. + + Parameters + ---------- + columns_from_table : list[Column] + Columns from the table which will be sorted + ascending : sequence[bool] + Sequence of boolean values which correspond to each column + in the table to be sorted signifying the order of each column + True - Ascending and False - Descending + na_position : str + Whether null values should show up at the "first" or "last" + position of **all** sorted column. + stable : bool + Should the sort be stable? 
(no default) + + Returns + ------- + Column of indices that sorts the table + """ + order = ordering(ascending, itertools.repeat(na_position)) + func = ( + plc.sorting.stable_sorted_order if stable else plc.sorting.sorted_order + ) + return Column.from_pylibcudf( + func( + plc.Table( + [col.to_pylibcudf(mode="read") for col in columns_from_table], + ), + order[0], + order[1], + ) + ) + + +@acquire_spill_lock() +def sort_by_key( + values: list[ColumnBase], + keys: list[ColumnBase], + ascending: list[bool], + na_position: list[Literal["first", "last"]], + *, + stable: bool, +) -> list[ColumnBase]: + """ + Sort a table by given keys + + Parameters + ---------- + values : list[Column] + Columns of the table which will be sorted + keys : list[Column] + Columns making up the sort key + ascending : list[bool] + Sequence of boolean values which correspond to each column + in the table to be sorted signifying the order of each column + True - Ascending and False - Descending + na_position : list[str] + Sequence of "first" or "last" values (default "first") + indicating the position of null values when sorting the keys. + stable : bool + Should the sort be stable? (no default) + + Returns + ------- + list[Column] + list of value columns sorted by keys + """ + order = ordering(ascending, na_position) + func = ( + plc.sorting.stable_sort_by_key if stable else plc.sorting.sort_by_key + ) + return [ + Column.from_pylibcudf(col) + for col in func( + plc.Table([col.to_pylibcudf(mode="read") for col in values]), + plc.Table([col.to_pylibcudf(mode="read") for col in keys]), + order[0], + order[1], + ).columns() + ] diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 624a3ac95ed..cc07af0f669 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -42,7 +42,7 @@ is_string_dtype, ) from cudf.core._compat import PANDAS_GE_210 -from cudf.core._internals import aggregation, unary +from cudf.core._internals import aggregation, sorting, unary from cudf.core._internals.timezones import get_compatible_timezone from cudf.core.abc import Serializable from cudf.core.buffer import ( @@ -996,13 +996,13 @@ def is_unique(self) -> bool: @cached_property def is_monotonic_increasing(self) -> bool: - return not self.has_nulls(include_nan=True) and libcudf.sort.is_sorted( + return not self.has_nulls(include_nan=True) and sorting.is_sorted( [self], [True], None ) @cached_property def is_monotonic_decreasing(self) -> bool: - return not self.has_nulls(include_nan=True) and libcudf.sort.is_sorted( + return not self.has_nulls(include_nan=True) and sorting.is_sorted( [self], [False], None ) @@ -1026,15 +1026,20 @@ def contains(self, other: ColumnBase) -> ColumnBase: def sort_values( self: Self, ascending: bool = True, - na_position: str = "last", + na_position: Literal["first", "last"] = "last", ) -> Self: if (not ascending and self.is_monotonic_decreasing) or ( ascending and self.is_monotonic_increasing ): return self.copy() - return libcudf.sort.sort( - [self], column_order=[ascending], null_precedence=[na_position] - )[0] + order = sorting.ordering([ascending], [na_position]) + with acquire_spill_lock(): + plc_table = plc.sorting.sort( + plc.Table([self.to_pylibcudf(mode="read")]), + order[0], + order[1], + ) + return type(self).from_pylibcudf(plc_table.columns()[0]) # type: ignore[return-value] def distinct_count(self, dropna: bool = True) -> int: try: @@ -1204,7 +1209,7 @@ def argsort( as_column(range(len(self) - 1, -1, -1)), ) else: - 
return libcudf.sort.order_by( + return sorting.order_by( [self], [ascending], na_position, stable=True ) @@ -1511,7 +1516,7 @@ def _return_sentinel_column(): del right_rows # reorder `codes` so that its values correspond to the # values of `self`: - (codes,) = libcudf.sort.sort_by_key( + (codes,) = sorting.sort_by_key( codes, [left_gather_map], [True], ["last"], stable=True ) return codes.fillna(na_sentinel.value) diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 28a2bd7fa6c..f099cef3331 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -718,6 +718,40 @@ def _reduction_result_dtype(self, reduction_op: str) -> Dtype: return super()._reduction_result_dtype(reduction_op) + @acquire_spill_lock() + def digitize(self, bins: np.ndarray, right: bool = False) -> Self: + """Return the indices of the bins to which each value in column belongs. + + Parameters + ---------- + bins : np.ndarray + 1-D column-like object of bins with same type as `column`, should be + monotonically increasing. + right : bool + Indicates whether interval contains the right or left bin edge. + + Returns + ------- + A column containing the indices + """ + if self.dtype != bins.dtype: + raise ValueError( + "digitize() expects bins and input column have the same dtype." + ) + + bin_col = as_column(bins, dtype=bins.dtype) + if bin_col.nullable: + raise ValueError("`bins` cannot contain null entries.") + + return type(self).from_pylibcudf( # type: ignore[return-value] + getattr(plc.search, "lower_bound" if right else "upper_bound")( + plc.Table([bin_col.to_pylibcudf(mode="read")]), + plc.Table([self.to_pylibcudf(mode="read")]), + [plc.types.Order.ASCENDING], + [plc.types.NullOrder.BEFORE], + ) + ) + def _normalize_find_and_replace_input( input_column_dtype: DtypeObj, col_to_normalize: ColumnBase | list @@ -772,34 +806,3 @@ def _normalize_find_and_replace_input( if not normalized_column.can_cast_safely(input_column_dtype): return normalized_column return normalized_column.astype(input_column_dtype) - - -def digitize( - column: ColumnBase, bins: np.ndarray, right: bool = False -) -> ColumnBase: - """Return the indices of the bins to which each value in column belongs. - - Parameters - ---------- - column : Column - Input column. - bins : Column-like - 1-D column-like object of bins with same type as `column`, should be - monotonically increasing. - right : bool - Indicates whether interval contains the right or left bin edge. - - Returns - ------- - A column containing the indices - """ - if not column.dtype == bins.dtype: - raise ValueError( - "Digitize() expects bins and input column have the same dtype." 
- ) - - bin_col = as_column(bins, dtype=bins.dtype) - if bin_col.nullable: - raise ValueError("`bins` cannot contain null entries.") - - return as_column(libcudf.sort.digitize([column], [bin_col], right)) diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py index 7a39355dd50..aaf2239a71e 100644 --- a/python/cudf/cudf/core/column/numerical_base.py +++ b/python/cudf/cudf/core/column/numerical_base.py @@ -10,7 +10,7 @@ import pylibcudf as plc import cudf -from cudf import _lib as libcudf +from cudf.core._internals import sorting from cudf.core.buffer import Buffer, acquire_spill_lock from cudf.core.column.column import ColumnBase from cudf.core.missing import NA @@ -144,7 +144,7 @@ def quantile( ) else: # get sorted indices and exclude nulls - indices = libcudf.sort.order_by( + indices = sorting.order_by( [self], [True], "first", stable=True ).slice(self.null_count, len(self)) with acquire_spill_lock(): diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 00199cca828..4f40ba0bd92 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -22,6 +22,7 @@ from cudf import _lib as libcudf from cudf.api.types import is_dtype_equal, is_scalar from cudf.core._compat import PANDAS_LT_300 +from cudf.core._internals import sorting from cudf.core._internals.search import search_sorted from cudf.core.abc import Serializable from cudf.core.buffer import acquire_spill_lock @@ -1476,7 +1477,7 @@ def _get_sorted_inds( else: ascending_lst = list(ascending) - return libcudf.sort.order_by( + return sorting.order_by( list(to_sort), ascending_lst, na_position, diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index a8d82f977d5..b772d35846d 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -18,11 +18,11 @@ import cudf from cudf import _lib as libcudf from cudf._lib import groupby as libgroupby -from cudf._lib.sort import segmented_sort_by_key from cudf._lib.types import size_type_dtype from cudf.api.extensions import no_default from cudf.api.types import is_list_like, is_numeric_dtype from cudf.core._compat import PANDAS_LT_300 +from cudf.core._internals import sorting from cudf.core.abc import Serializable from cudf.core.buffer import acquire_spill_lock from cudf.core.column.column import ColumnBase, StructDtype, as_column @@ -792,7 +792,7 @@ def agg(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs): # want, and right order is a matching gather map for # the result table. Get the correct order by sorting # the right gather map. 
- (right_order,) = libcudf.sort.sort_by_key( + (right_order,) = sorting.sort_by_key( [right_order], [left_order], [True], @@ -1248,15 +1248,20 @@ def sample( for off, size in zip(group_offsets, size_per_group): rs.shuffle(indices[off : off + size]) else: - rng = cp.random.default_rng(seed=random_state) - (indices,) = segmented_sort_by_key( - [as_column(indices)], - [as_column(rng.random(size=nrows))], - as_column(group_offsets), - [], - [], - stable=True, + keys = cp.random.default_rng(seed=random_state).random( + size=nrows ) + with acquire_spill_lock(): + plc_table = plc.sorting.stable_segmented_sort_by_key( + plc.Table( + [as_column(indices).to_pylibcudf(mode="read")] + ), + plc.Table([as_column(keys).to_pylibcudf(mode="read")]), + as_column(group_offsets).to_pylibcudf(mode="read"), + [plc.types.Order.ASCENDING], + [plc.types.NullOrder.AFTER], + ) + indices = ColumnBase.from_pylibcudf(plc_table.columns()[0]) indices = cp.asarray(indices.data_array_view(mode="read")) # Which indices are we going to want? want = np.arange(samples_per_group.sum(), dtype=size_type_dtype) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 81d954960e2..1a667e24bef 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -6367,9 +6367,49 @@ def rank( elif source._num_columns != num_cols: dropped_cols = True - result_columns = libcudf.sort.rank_columns( - [*source._columns], method_enum, na_option, ascending, pct + column_order = ( + plc.types.Order.ASCENDING + if ascending + else plc.types.Order.DESCENDING ) + # ascending + # #top = na_is_smallest + # #bottom = na_is_largest + # #keep = na_is_largest + # descending + # #top = na_is_largest + # #bottom = na_is_smallest + # #keep = na_is_smallest + if ascending: + if na_option == "top": + null_precedence = plc.types.NullOrder.BEFORE + else: + null_precedence = plc.types.NullOrder.AFTER + else: + if na_option == "top": + null_precedence = plc.types.NullOrder.AFTER + else: + null_precedence = plc.types.NullOrder.BEFORE + c_null_handling = ( + plc.types.NullPolicy.EXCLUDE + if na_option == "keep" + else plc.types.NullPolicy.INCLUDE + ) + + with acquire_spill_lock(): + result_columns = [ + libcudf.column.Column.from_pylibcudf( + plc.sorting.rank( + col.to_pylibcudf(mode="read"), + method_enum, + column_order, + c_null_handling, + null_precedence, + pct, + ) + ) + for col in source._columns + ] if dropped_cols: result = type(source)._from_data( diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index 5c224176730..e7ea91c1f21 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -9,6 +9,7 @@ import cudf from cudf import _lib as libcudf from cudf._lib.types import size_type_dtype +from cudf.core._internals import sorting from cudf.core.buffer import acquire_spill_lock from cudf.core.copy_types import GatherMap from cudf.core.join._join_helpers import ( @@ -256,7 +257,7 @@ def _gather_maps(self, left_cols, right_cols): for map_, n, null in zip(maps, lengths, nullify) ) ) - return libcudf.sort.sort_by_key( + return sorting.sort_by_key( list(maps), # If how is right, right map is primary sort key. 
key_order[:: -1 if self.how == "right" else 1], @@ -426,7 +427,7 @@ def _sort_result(self, result: cudf.DataFrame) -> cudf.DataFrame: else: to_sort = [*result._columns] index_names = None - result_columns = libcudf.sort.sort_by_key( + result_columns = sorting.sort_by_key( to_sort, by, [True] * len(by), diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index f5ee36f851c..a99e06e4a8e 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -22,6 +22,7 @@ from cudf.api.types import is_integer, is_list_like, is_object_dtype, is_scalar from cudf.core import column from cudf.core._base_index import _return_get_indexer_result +from cudf.core._internals import sorting from cudf.core.algorithms import factorize from cudf.core.buffer import acquire_spill_lock from cudf.core.column_accessor import ColumnAccessor @@ -1677,7 +1678,7 @@ def _is_sorted(self, ascending=None, null_position=None) -> bool: f"Expected a list-like or None for `null_position`, got " f"{type(null_position)}" ) - return libcudf.sort.is_sorted( + return sorting.is_sorted( [*self._columns], ascending=ascending, null_position=null_position ) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 647e20fc16b..961e5e11bc0 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -3410,7 +3410,7 @@ def describe( ) @_performance_tracking - def digitize(self, bins, right=False): + def digitize(self, bins: np.ndarray, right: bool = False) -> Self: """Return the indices of the bins to which each value belongs. Notes @@ -3441,9 +3441,8 @@ def digitize(self, bins, right=False): 3 2 dtype: int32 """ - return Series._from_column( - cudf.core.column.numerical.digitize(self._column, bins, right), - name=self.name, + return type(self)._from_column( + self._column.digitize(bins, right), name=self.name ) @_performance_tracking From 34e20451cf5452ecea74092dae3c6f5078ade0bd Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 13 Dec 2024 15:36:55 -0800 Subject: [PATCH 77/78] Mark more constexpr functions as device-available (#17545) Contributes to #7795. Also contributes to https://github.com/rapidsai/build-planning/issues/76. 
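
The changes are mechanical: each `constexpr` utility that device code needs
gains a `CUDF_HOST_DEVICE` annotation, and host-only standard-library
facilities (`std::min`/`std::max`, `std::pair`, `std::optional`,
`std::numeric_limits`) are swapped for their `cuda::std` counterparts from
libcu++. A minimal sketch of why the annotation matters, assuming
`CUDF_HOST_DEVICE` expands to `__host__ __device__` under nvcc (toy function,
not taken from this patch):

```
#ifdef __CUDACC__
#define CUDF_HOST_DEVICE __host__ __device__
#else
#define CUDF_HOST_DEVICE
#endif

// Host-only: calling this from a __global__ or __device__ function is a
// compile error unless nvcc's --expt-relaxed-constexpr escape hatch is on.
constexpr int round_up_host(int n, int m) { return ((n + m - 1) / m) * m; }

// Annotated: callable from both host and device code.
CUDF_HOST_DEVICE constexpr int round_up(int n, int m) { return ((n + m - 1) / m) * m; }

__global__ void demo_kernel(int* out) { *out = round_up(10, 4); }
```
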
Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Nghia Truong (https://github.com/ttnghia) - Yunsong Wang (https://github.com/PointKernel) - Bradley Dice (https://github.com/bdice) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/17545 --- ci/build_docs.sh | 6 + .../cudf/column/column_device_view.cuh | 18 ++- .../cudf/detail/aggregation/aggregation.cuh | 2 +- cpp/include/cudf/detail/utilities/cuda.cuh | 11 +- .../detail/utilities/device_operators.cuh | 30 ++-- .../cudf/detail/utilities/integer_utils.hpp | 4 +- .../detail/floating_conversion.hpp | 7 +- .../cudf/hashing/detail/hash_functions.cuh | 5 +- cpp/include/cudf/hashing/detail/hashing.hpp | 2 +- cpp/include/cudf/strings/detail/utf8.hpp | 21 +-- cpp/include/cudf/strings/string_view.cuh | 8 +- .../cudf/table/experimental/row_operators.cuh | 74 +++++----- cpp/include/cudf/types.hpp | 9 +- cpp/include/cudf/utilities/span.hpp | 138 ++++++++++++------ cpp/include/cudf/utilities/traits.hpp | 42 +++--- cpp/src/binaryop/compiled/binary_ops.cuh | 6 +- cpp/src/copying/contiguous_split.cu | 3 +- cpp/src/groupby/sort/group_rank_scan.cu | 3 +- cpp/src/hash/murmurhash3_x64_128.cu | 4 +- cpp/src/hash/sha_hash.cuh | 4 +- cpp/src/hash/xxhash_64.cu | 3 +- cpp/src/io/avro/avro_common.hpp | 2 +- cpp/src/io/comp/unsnap.cu | 3 +- cpp/src/io/fst/agent_dfa.cuh | 14 +- cpp/src/io/statistics/byte_array_view.cuh | 33 +++-- .../io/statistics/typed_statistics_chunk.cuh | 5 +- cpp/src/io/utilities/parsing_utils.cuh | 19 ++- cpp/src/io/utilities/trie.cuh | 4 +- cpp/src/quantiles/quantiles_util.hpp | 9 +- cpp/src/strings/search/find.cu | 3 +- cpp/src/strings/slice.cu | 7 +- docs/cudf/source/conf.py | 2 + 32 files changed, 302 insertions(+), 199 deletions(-) diff --git a/ci/build_docs.sh b/ci/build_docs.sh index 4290d013fe4..52d8f659611 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -35,6 +35,10 @@ rapids-mamba-retry install \ export RAPIDS_DOCS_DIR="$(mktemp -d)" +EXITCODE=0 +trap "EXITCODE=1" ERR +set +e + rapids-logger "Build CPP docs" pushd cpp/doxygen aws s3 cp s3://rapidsai-docs/librmm/html/${RAPIDS_VERSION_MAJOR_MINOR}/rmm.tag . || echo "Failed to download rmm Doxygen tag" @@ -58,3 +62,5 @@ mv build/dirhtml/* "${RAPIDS_DOCS_DIR}/dask-cudf/html" popd RAPIDS_VERSION_NUMBER="${RAPIDS_VERSION_MAJOR_MINOR}" rapids-upload-docs + +exit ${EXITCODE} diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh index db6d5255616..ea480b133dc 100644 --- a/cpp/include/cudf/column/column_device_view.cuh +++ b/cpp/include/cudf/column/column_device_view.cuh @@ -33,11 +33,13 @@ #include #include +#include #include #include #include #include +#include /** * @file column_device_view.cuh @@ -56,8 +58,8 @@ namespace CUDF_EXPORT cudf { * */ struct nullate { - struct YES : std::bool_constant {}; - struct NO : std::bool_constant {}; + struct YES : cuda::std::bool_constant {}; + struct NO : cuda::std::bool_constant {}; /** * @brief `nullate::DYNAMIC` defers the determination of nullability to run time rather than * compile time. 
The calling code is responsible for specifying whether or not nulls are @@ -80,7 +82,7 @@ struct nullate { * @return `true` if nulls are expected in the operation in which this object is applied, * otherwise false */ - constexpr operator bool() const noexcept { return value; } + CUDF_HOST_DEVICE constexpr operator bool() const noexcept { return value; } bool value; ///< True if nulls are expected }; }; @@ -319,14 +321,14 @@ class alignas(16) column_device_view_base { } template - struct has_element_accessor_impl : std::false_type {}; + struct has_element_accessor_impl : cuda::std::false_type {}; template struct has_element_accessor_impl< C, T, - void_t().template element(std::declval()))>> - : std::true_type {}; + void_t().template element(cuda::std::declval()))>> + : cuda::std::true_type {}; }; // @cond // Forward declaration @@ -534,7 +536,7 @@ class alignas(16) column_device_view : public detail::column_device_view_base { * @return `true` if `column_device_view::element()` has a valid overload, `false` otherwise */ template - static constexpr bool has_element_accessor() + CUDF_HOST_DEVICE static constexpr bool has_element_accessor() { return has_element_accessor_impl::value; } @@ -1044,7 +1046,7 @@ class alignas(16) mutable_column_device_view : public detail::column_device_view * @return `true` if `mutable_column_device_view::element()` has a valid overload, `false` */ template - static constexpr bool has_element_accessor() + CUDF_HOST_DEVICE static constexpr bool has_element_accessor() { return has_element_accessor_impl::value; } diff --git a/cpp/include/cudf/detail/aggregation/aggregation.cuh b/cpp/include/cudf/detail/aggregation/aggregation.cuh index de53e7586cd..c30c3d6f4bd 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.cuh +++ b/cpp/include/cudf/detail/aggregation/aggregation.cuh @@ -36,7 +36,7 @@ namespace cudf { namespace detail { template -constexpr bool is_product_supported() +CUDF_HOST_DEVICE constexpr bool is_product_supported() { return is_numeric(); } diff --git a/cpp/include/cudf/detail/utilities/cuda.cuh b/cpp/include/cudf/detail/utilities/cuda.cuh index 61a8e9f7ec3..72cdc3d8067 100644 --- a/cpp/include/cudf/detail/utilities/cuda.cuh +++ b/cpp/include/cudf/detail/utilities/cuda.cuh @@ -74,9 +74,10 @@ class grid_1d { * @param num_threads_per_block The number of threads per block * @return thread_index_type The global thread index */ - static constexpr thread_index_type global_thread_id(thread_index_type thread_id, - thread_index_type block_id, - thread_index_type num_threads_per_block) + __device__ static constexpr thread_index_type global_thread_id( + thread_index_type thread_id, + thread_index_type block_id, + thread_index_type num_threads_per_block) { return thread_id + block_id * num_threads_per_block; } @@ -114,8 +115,8 @@ class grid_1d { * @param num_threads_per_block The number of threads per block * @return thread_index_type The global thread index */ - static constexpr thread_index_type grid_stride(thread_index_type num_threads_per_block, - thread_index_type num_blocks_per_grid) + __device__ static constexpr thread_index_type grid_stride(thread_index_type num_threads_per_block, + thread_index_type num_blocks_per_grid) { return num_threads_per_block * num_blocks_per_grid; } diff --git a/cpp/include/cudf/detail/utilities/device_operators.cuh b/cpp/include/cudf/detail/utilities/device_operators.cuh index d16be5e22dd..923cd04479d 100644 --- a/cpp/include/cudf/detail/utilities/device_operators.cuh +++ 
b/cpp/include/cudf/detail/utilities/device_operators.cuh @@ -29,6 +29,8 @@ #include #include +#include + #include namespace cudf { @@ -42,7 +44,7 @@ template ()>* = nullptr> CUDF_HOST_DEVICE inline auto min(LHS const& lhs, RHS const& rhs) { - return std::min(lhs, rhs); + return cuda::std::min(lhs, rhs); } /** @@ -53,7 +55,7 @@ template ()>* = nullptr> CUDF_HOST_DEVICE inline auto max(LHS const& lhs, RHS const& rhs) { - return std::max(lhs, rhs); + return cuda::std::max(lhs, rhs); } } // namespace detail @@ -68,20 +70,20 @@ struct DeviceSum { } template ()>* = nullptr> - static constexpr T identity() + CUDF_HOST_DEVICE static constexpr T identity() { return T{typename T::duration{0}}; } template () && !cudf::is_fixed_point()>* = nullptr> - static constexpr T identity() + CUDF_HOST_DEVICE static constexpr T identity() { return T{0}; } template ()>* = nullptr> - static constexpr T identity() + CUDF_HOST_DEVICE static constexpr T identity() { #ifndef __CUDA_ARCH__ CUDF_FAIL("fixed_point does not yet support device operator identity"); @@ -109,7 +111,7 @@ struct DeviceCount { } template - static constexpr T identity() + CUDF_HOST_DEVICE static constexpr T identity() { return T{}; } @@ -129,7 +131,7 @@ struct DeviceMin { template && !cudf::is_dictionary() && !cudf::is_fixed_point()>* = nullptr> - static constexpr T identity() + CUDF_HOST_DEVICE static constexpr T identity() { // chrono types do not have std::numeric_limits specializations and should use T::max() // https://eel.is/c++draft/numeric.limits.general#6 @@ -143,7 +145,7 @@ struct DeviceMin { } template ()>* = nullptr> - static constexpr T identity() + CUDF_HOST_DEVICE static constexpr T identity() { #ifndef __CUDA_ARCH__ CUDF_FAIL("fixed_point does not yet support DeviceMin identity"); @@ -161,7 +163,7 @@ struct DeviceMin { } template ()>* = nullptr> - static constexpr T identity() + CUDF_HOST_DEVICE static constexpr T identity() { return static_cast(T::max_value()); } @@ -181,7 +183,7 @@ struct DeviceMax { template && !cudf::is_dictionary() && !cudf::is_fixed_point()>* = nullptr> - static constexpr T identity() + CUDF_HOST_DEVICE static constexpr T identity() { // chrono types do not have std::numeric_limits specializations and should use T::min() // https://eel.is/c++draft/numeric.limits.general#6 @@ -195,7 +197,7 @@ struct DeviceMax { } template ()>* = nullptr> - static constexpr T identity() + CUDF_HOST_DEVICE static constexpr T identity() { #ifndef __CUDA_ARCH__ CUDF_FAIL("fixed_point does not yet support DeviceMax identity"); @@ -212,7 +214,7 @@ struct DeviceMax { } template ()>* = nullptr> - static constexpr T identity() + CUDF_HOST_DEVICE static constexpr T identity() { return static_cast(T::lowest_value()); } @@ -229,13 +231,13 @@ struct DeviceProduct { } template ()>* = nullptr> - static constexpr T identity() + CUDF_HOST_DEVICE static constexpr T identity() { return T{1}; } template ()>* = nullptr> - static constexpr T identity() + CUDF_HOST_DEVICE static constexpr T identity() { #ifndef __CUDA_ARCH__ CUDF_FAIL("fixed_point does not yet support DeviceProduct identity"); diff --git a/cpp/include/cudf/detail/utilities/integer_utils.hpp b/cpp/include/cudf/detail/utilities/integer_utils.hpp index 957b6b70fe2..2e3d71815c0 100644 --- a/cpp/include/cudf/detail/utilities/integer_utils.hpp +++ b/cpp/include/cudf/detail/utilities/integer_utils.hpp @@ -86,7 +86,7 @@ constexpr S round_down_safe(S number_to_round, S modulus) noexcept * `modulus` is positive and does not check for overflow. 
*/ template -constexpr S round_up_unsafe(S number_to_round, S modulus) noexcept +CUDF_HOST_DEVICE constexpr S round_up_unsafe(S number_to_round, S modulus) noexcept { auto remainder = number_to_round % modulus; if (remainder == 0) { return number_to_round; } @@ -187,7 +187,7 @@ constexpr bool is_a_power_of_two(I val) noexcept * @return Absolute value if value type is signed. */ template -constexpr auto absolute_value(T value) -> T +CUDF_HOST_DEVICE constexpr auto absolute_value(T value) -> T { if constexpr (cuda::std::is_signed()) return numeric::detail::abs(value); return value; diff --git a/cpp/include/cudf/fixed_point/detail/floating_conversion.hpp b/cpp/include/cudf/fixed_point/detail/floating_conversion.hpp index fce08b4a5c4..9e68bafb09a 100644 --- a/cpp/include/cudf/fixed_point/detail/floating_conversion.hpp +++ b/cpp/include/cudf/fixed_point/detail/floating_conversion.hpp @@ -22,6 +22,7 @@ #include #include #include +#include #include @@ -183,7 +184,7 @@ struct floating_converter { * @param integer_rep The bit-casted floating value to extract the exponent from * @return The stored base-2 exponent and significand, shifted for denormals */ - CUDF_HOST_DEVICE inline static std::pair get_significand_and_pow2( + CUDF_HOST_DEVICE inline static cuda::std::pair get_significand_and_pow2( IntegralType integer_rep) { // Extract the significand @@ -1008,7 +1009,7 @@ CUDF_HOST_DEVICE inline auto shift_to_binary_pospow(DecimalRep decimal_rep, int } // Our shifting_rep is now the integer mantissa, return it and the powers of 2 - return std::pair{shifting_rep, pow2}; + return cuda::std::pair{shifting_rep, pow2}; } /** @@ -1075,7 +1076,7 @@ CUDF_HOST_DEVICE inline auto shift_to_binary_negpow(DecimalRep decimal_rep, int } // Our shifting_rep is now the integer mantissa, return it and the powers of 2 - return std::pair{shifting_rep, pow2}; + return cuda::std::pair{shifting_rep, pow2}; } /** diff --git a/cpp/include/cudf/hashing/detail/hash_functions.cuh b/cpp/include/cudf/hashing/detail/hash_functions.cuh index 0ec41a20ef1..fd3455e761d 100644 --- a/cpp/include/cudf/hashing/detail/hash_functions.cuh +++ b/cpp/include/cudf/hashing/detail/hash_functions.cuh @@ -18,7 +18,8 @@ #include -#include +#include +#include namespace cudf::hashing::detail { @@ -29,7 +30,7 @@ template T __device__ inline normalize_nans(T const& key) { if constexpr (cudf::is_floating_point()) { - if (std::isnan(key)) { return std::numeric_limits::quiet_NaN(); } + if (cuda::std::isnan(key)) { return cuda::std::numeric_limits::quiet_NaN(); } } return key; } diff --git a/cpp/include/cudf/hashing/detail/hashing.hpp b/cpp/include/cudf/hashing/detail/hashing.hpp index a978e54a1b9..7cb80081a95 100644 --- a/cpp/include/cudf/hashing/detail/hashing.hpp +++ b/cpp/include/cudf/hashing/detail/hashing.hpp @@ -82,7 +82,7 @@ std::unique_ptr xxhash_64(table_view const& input, * @param rhs The second hash value * @return Combined hash value */ -constexpr uint32_t hash_combine(uint32_t lhs, uint32_t rhs) +CUDF_HOST_DEVICE constexpr uint32_t hash_combine(uint32_t lhs, uint32_t rhs) { return lhs ^ (rhs + 0x9e37'79b9 + (lhs << 6) + (lhs >> 2)); } diff --git a/cpp/include/cudf/strings/detail/utf8.hpp b/cpp/include/cudf/strings/detail/utf8.hpp index 85349a421b1..84957ab9f1d 100644 --- a/cpp/include/cudf/strings/detail/utf8.hpp +++ b/cpp/include/cudf/strings/detail/utf8.hpp @@ -31,7 +31,7 @@ namespace strings::detail { * @param chr Any single byte from a valid UTF-8 character * @return true if this is not the first byte of the character */ -constexpr bool 
is_utf8_continuation_char(unsigned char chr) +CUDF_HOST_DEVICE constexpr bool is_utf8_continuation_char(unsigned char chr) { // The (0xC0 & 0x80) bit pattern identifies a continuation byte of a character. return (chr & 0xC0) == 0x80; @@ -43,7 +43,10 @@ constexpr bool is_utf8_continuation_char(unsigned char chr) * @param chr Any single byte from a valid UTF-8 character * @return true if this the first byte of the character */ -constexpr bool is_begin_utf8_char(unsigned char chr) { return not is_utf8_continuation_char(chr); } +CUDF_HOST_DEVICE constexpr bool is_begin_utf8_char(unsigned char chr) +{ + return not is_utf8_continuation_char(chr); +} /** * @brief This will return true if the passed in byte could be the start of @@ -55,7 +58,7 @@ constexpr bool is_begin_utf8_char(unsigned char chr) { return not is_utf8_contin * @param byte The byte to be tested * @return true if this can be the first byte of a character */ -constexpr bool is_valid_begin_utf8_char(uint8_t byte) +CUDF_HOST_DEVICE constexpr bool is_valid_begin_utf8_char(uint8_t byte) { // to be the first byte of a valid (up to 4 byte) UTF-8 char, byte must be one of: // 0b0vvvvvvv a 1 byte character @@ -72,7 +75,7 @@ constexpr bool is_valid_begin_utf8_char(uint8_t byte) * @param character Single character * @return Number of bytes */ -constexpr size_type bytes_in_char_utf8(char_utf8 character) +CUDF_HOST_DEVICE constexpr size_type bytes_in_char_utf8(char_utf8 character) { return 1 + static_cast((character & 0x0000'FF00u) > 0) + static_cast((character & 0x00FF'0000u) > 0) + @@ -89,7 +92,7 @@ constexpr size_type bytes_in_char_utf8(char_utf8 character) * @param byte Byte from an encoded character. * @return Number of bytes. */ -constexpr size_type bytes_in_utf8_byte(uint8_t byte) +CUDF_HOST_DEVICE constexpr size_type bytes_in_utf8_byte(uint8_t byte) { return 1 + static_cast((byte & 0xF0) == 0xF0) // 4-byte character prefix + static_cast((byte & 0xE0) == 0xE0) // 3-byte character prefix @@ -104,7 +107,7 @@ constexpr size_type bytes_in_utf8_byte(uint8_t byte) * @param[out] character Single char_utf8 value. * @return The number of bytes in the character */ -constexpr size_type to_char_utf8(char const* str, char_utf8& character) +CUDF_HOST_DEVICE constexpr size_type to_char_utf8(char const* str, char_utf8& character) { size_type const chr_width = bytes_in_utf8_byte(static_cast(*str)); @@ -131,7 +134,7 @@ constexpr size_type to_char_utf8(char const* str, char_utf8& character) * @param[out] str Output array. * @return The number of bytes in the character */ -constexpr inline size_type from_char_utf8(char_utf8 character, char* str) +CUDF_HOST_DEVICE constexpr inline size_type from_char_utf8(char_utf8 character, char* str) { size_type const chr_width = bytes_in_char_utf8(character); for (size_type idx = 0; idx < chr_width; ++idx) { @@ -148,7 +151,7 @@ constexpr inline size_type from_char_utf8(char_utf8 character, char* str) * @param utf8_char Single UTF-8 character to convert. * @return Code-point for the UTF-8 character. */ -constexpr uint32_t utf8_to_codepoint(cudf::char_utf8 utf8_char) +CUDF_HOST_DEVICE constexpr uint32_t utf8_to_codepoint(cudf::char_utf8 utf8_char) { uint32_t unchr = 0; if (utf8_char < 0x0000'0080) // single-byte pass thru @@ -178,7 +181,7 @@ constexpr uint32_t utf8_to_codepoint(cudf::char_utf8 utf8_char) * @param unchr Character code-point to convert. * @return Single UTF-8 character. 
*/ -constexpr cudf::char_utf8 codepoint_to_utf8(uint32_t unchr) +CUDF_HOST_DEVICE constexpr cudf::char_utf8 codepoint_to_utf8(uint32_t unchr) { cudf::char_utf8 utf8 = 0; if (unchr < 0x0000'0080) // single byte utf8 diff --git a/cpp/include/cudf/strings/string_view.cuh b/cpp/include/cudf/strings/string_view.cuh index 1ae4c3703b2..f0040e069d8 100644 --- a/cpp/include/cudf/strings/string_view.cuh +++ b/cpp/include/cudf/strings/string_view.cuh @@ -31,6 +31,8 @@ #include #endif +#include + #include // This file should only include device code logic. @@ -75,8 +77,8 @@ __device__ inline size_type characters_in_string(char const* str, size_type byte * @param pos Character position to count to * @return The number of bytes and the left over non-counted position value */ -__device__ inline std::pair bytes_to_character_position(string_view d_str, - size_type pos) +__device__ inline cuda::std::pair bytes_to_character_position( + string_view d_str, size_type pos) { size_type bytes = 0; auto ptr = d_str.data(); @@ -303,7 +305,7 @@ __device__ inline char_utf8 string_view::operator[](size_type pos) const __device__ inline size_type string_view::byte_offset(size_type pos) const { if (length() == size_bytes()) return pos; - return std::get<0>(strings::detail::bytes_to_character_position(*this, pos)); + return cuda::std::get<0>(strings::detail::bytes_to_character_position(*this, pos)); } __device__ inline int string_view::compare(string_view const& in) const diff --git a/cpp/include/cudf/table/experimental/row_operators.cuh b/cpp/include/cudf/table/experimental/row_operators.cuh index 3f33c70c29a..8214ea6e83b 100644 --- a/cpp/include/cudf/table/experimental/row_operators.cuh +++ b/cpp/include/cudf/table/experimental/row_operators.cuh @@ -33,6 +33,8 @@ #include #include +#include +#include #include #include #include @@ -48,11 +50,8 @@ #include #include -#include #include -#include #include -#include namespace CUDF_EXPORT cudf { @@ -287,15 +286,16 @@ class device_row_comparator { * `null_order::BEFORE` for all columns. * @param comparator Physical element relational comparison functor. 
diff --git a/cpp/include/cudf/table/experimental/row_operators.cuh b/cpp/include/cudf/table/experimental/row_operators.cuh
index 3f33c70c29a..8214ea6e83b 100644
--- a/cpp/include/cudf/table/experimental/row_operators.cuh
+++ b/cpp/include/cudf/table/experimental/row_operators.cuh
@@ -33,6 +33,8 @@
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
 
+#include <cuda/std/limits>
+#include <cuda/std/optional>
 #include <cuda/std/tuple>
 #include <cuda/std/utility>
 #include <thrust/detail/use_default.h>
@@ -48,11 +50,8 @@
 #include <thrust/swap.h>
 #include <thrust/transform_reduce.h>
 
-#include <limits>
 #include <memory>
-#include <optional>
 #include <type_traits>
-#include <utility>
 
 namespace CUDF_EXPORT cudf {
@@ -287,15 +286,16 @@ class device_row_comparator {
    * `null_order::BEFORE` for all columns.
    * @param comparator Physical element relational comparison functor.
    */
-  device_row_comparator(Nullate check_nulls,
-                        table_device_view lhs,
-                        table_device_view rhs,
-                        device_span<detail::dremel_device_view const> l_dremel_device_views,
-                        device_span<detail::dremel_device_view const> r_dremel_device_views,
-                        std::optional<device_span<int const>> depth = std::nullopt,
-                        std::optional<device_span<order const>> column_order = std::nullopt,
-                        std::optional<device_span<null_order const>> null_precedence = std::nullopt,
-                        PhysicalElementComparator comparator = {}) noexcept
+  device_row_comparator(
+    Nullate check_nulls,
+    table_device_view lhs,
+    table_device_view rhs,
+    device_span<detail::dremel_device_view const> l_dremel_device_views,
+    device_span<detail::dremel_device_view const> r_dremel_device_views,
+    cuda::std::optional<device_span<int const>> depth = cuda::std::nullopt,
+    cuda::std::optional<device_span<order const>> column_order = cuda::std::nullopt,
+    cuda::std::optional<device_span<null_order const>> null_precedence = cuda::std::nullopt,
+    PhysicalElementComparator comparator = {}) noexcept
     : _lhs{lhs},
       _rhs{rhs},
       _l_dremel(l_dremel_device_views),
@@ -331,9 +331,9 @@ class device_row_comparator {
     Nullate check_nulls,
     table_device_view lhs,
     table_device_view rhs,
-    std::optional<device_span<order const>> column_order = std::nullopt,
-    std::optional<device_span<null_order const>> null_precedence = std::nullopt,
-    PhysicalElementComparator comparator = {}) noexcept
+    cuda::std::optional<device_span<order const>> column_order = cuda::std::nullopt,
+    cuda::std::optional<device_span<null_order const>> null_precedence = cuda::std::nullopt,
+    PhysicalElementComparator comparator = {}) noexcept
     : _lhs{lhs},
       _rhs{rhs},
       _l_dremel{},
@@ -410,7 +410,7 @@ class device_row_comparator {
     return cuda::std::pair(_comparator(_lhs.element<Element>(lhs_element_index),
                                        _rhs.element<Element>(rhs_element_index)),
-                           std::numeric_limits<int>::max());
+                           cuda::std::numeric_limits<int>::max());
   }
 
   /**
@@ -455,7 +455,7 @@ class device_row_comparator {
     }
 
     if (lcol.num_child_columns() == 0) {
-      return cuda::std::pair(weak_ordering::EQUIVALENT, std::numeric_limits<int>::max());
+      return cuda::std::pair(weak_ordering::EQUIVALENT, cuda::std::numeric_limits<int>::max());
     }
 
     // Non-empty structs have been modified to only have 1 child when using this.
@@ -607,7 +607,7 @@ class device_row_comparator {
   __device__ constexpr weak_ordering operator()(size_type const lhs_index,
                                                 size_type const rhs_index) const noexcept
   {
-    int last_null_depth = std::numeric_limits<int>::max();
+    int last_null_depth = cuda::std::numeric_limits<int>::max();
     size_type list_column_index{-1};
     for (size_type i = 0; i < _lhs.num_columns(); ++i) {
       if (_lhs.column(i).type().id() == type_id::LIST) { ++list_column_index; }
@@ -626,9 +626,9 @@ class device_row_comparator {
       // here, otherwise the current code would be failing.
       auto const [l_dremel_i, r_dremel_i] =
         _lhs.column(i).type().id() == type_id::LIST
-          ? std::make_tuple(optional_dremel_view(_l_dremel[list_column_index]),
-                            optional_dremel_view(_r_dremel[list_column_index]))
-          : std::make_tuple(optional_dremel_view{}, optional_dremel_view{});
+          ? cuda::std::make_tuple(optional_dremel_view(_l_dremel[list_column_index]),
+                                  optional_dremel_view(_r_dremel[list_column_index]))
+          : cuda::std::make_tuple(optional_dremel_view{}, optional_dremel_view{});
       auto element_comp = element_comparator{_check_nulls,
                                              _lhs.column(i),
@@ -658,9 +658,9 @@ class device_row_comparator {
   device_span<detail::dremel_device_view const> const _l_dremel;
   device_span<detail::dremel_device_view const> const _r_dremel;
   Nullate const _check_nulls;
-  std::optional<device_span<int const>> const _depth;
-  std::optional<device_span<order const>> const _column_order;
-  std::optional<device_span<null_order const>> const _null_precedence;
+  cuda::std::optional<device_span<int const>> const _depth;
+  cuda::std::optional<device_span<order const>> const _column_order;
+  cuda::std::optional<device_span<null_order const>> const _null_precedence;
   PhysicalElementComparator const _comparator;
 };  // class device_row_comparator
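The same rule drives the row_operators.cuh changes: std::optional, std::numeric_limits, and std::make_tuple are not usable inside __device__ member functions, so the comparator switches to the cuda::std counterparts from <cuda/std/optional>, <cuda/std/limits>, and <cuda/std/tuple>. A minimal sketch of the optional-with-sentinel idiom used above, with illustrative names rather than cudf API:

#include <cuda/std/limits>
#include <cuda/std/optional>

// An optional parameter with a sentinel fallback, callable from device code.
__device__ int depth_or_sentinel(cuda::std::optional<int> depth)
{
  // cuda::std::numeric_limits mirrors std::numeric_limits on device.
  return depth.has_value() ? *depth : cuda::std::numeric_limits<int>::max();
}

__global__ void kernel(int* out)
{
  out[0] = depth_or_sentinel(cuda::std::optional<int>{3});  // 3
  out[1] = depth_or_sentinel(cuda::std::nullopt);           // INT_MAX sentinel
}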
@@ -882,10 +882,10 @@ struct preprocessed_table {
    * @return Device array containing respective column orders. If no explicit column orders were
    *         specified during the creation of this object then this will be `nullopt`.
    */
-  [[nodiscard]] std::optional<device_span<order const>> column_order() const
+  [[nodiscard]] cuda::std::optional<device_span<order const>> column_order() const
   {
-    return _column_order.size() ? std::optional<device_span<order const>>(_column_order)
-                                : std::nullopt;
+    return _column_order.size() ? cuda::std::optional<device_span<order const>>(_column_order)
+                                : cuda::std::nullopt;
   }
 
   /**
@@ -895,10 +895,11 @@ struct preprocessed_table {
    * @return Device array containing respective column null precedence. If no explicit column null
    *         precedences were specified during the creation of this object then this will be
    *         `nullopt`.
    */
-  [[nodiscard]] std::optional<device_span<null_order const>> null_precedence() const
+  [[nodiscard]] cuda::std::optional<device_span<null_order const>> null_precedence() const
   {
-    return _null_precedence.size() ? std::optional<device_span<null_order const>>(_null_precedence)
-                                   : std::nullopt;
+    return _null_precedence.size()
+             ? cuda::std::optional<device_span<null_order const>>(_null_precedence)
+             : cuda::std::nullopt;
   }
 
   /**
@@ -909,9 +910,10 @@ struct preprocessed_table {
    * @return std::optional<device_span<int const>> Device array containing respective column depths.
    *         If there are no nested columns in the table then this will be `nullopt`.
    */
-  [[nodiscard]] std::optional<device_span<int const>> depths() const
+  [[nodiscard]] cuda::std::optional<device_span<int const>> depths() const
   {
-    return _depths.size() ? std::optional<device_span<int const>>(_depths) : std::nullopt;
+    return _depths.size() ? cuda::std::optional<device_span<int const>>(_depths)
+                          : cuda::std::nullopt;
   }
 
   [[nodiscard]] device_span<detail::dremel_device_view const> dremel_device_views() const
@@ -940,8 +942,8 @@ struct preprocessed_table {
   rmm::device_uvector<int> const _depths;
 
   // Dremel encoding of list columns used for the comparison algorithm
-  std::optional<std::vector<detail::dremel_data>> _dremel_data;
-  std::optional<rmm::device_uvector<detail::dremel_device_view>> _dremel_device_views;
+  cuda::std::optional<std::vector<detail::dremel_data>> _dremel_data;
+  cuda::std::optional<rmm::device_uvector<detail::dremel_device_view>> _dremel_device_views;
 
   // Intermediate columns generated from transforming nested children columns into
   // integers columns using `cudf::rank()`, need to be kept alive.
@@ -1808,7 +1810,7 @@ class element_hasher {
   __device__ element_hasher(
     Nullate nulls,
     uint32_t seed             = DEFAULT_HASH_SEED,
-    hash_value_type null_hash = std::numeric_limits<hash_value_type>::max()) noexcept
+    hash_value_type null_hash = cuda::std::numeric_limits<hash_value_type>::max()) noexcept
     : _check_nulls(nulls), _seed(seed), _null_hash(null_hash)
   {
   }
@@ -1892,7 +1894,7 @@ class device_row_hasher {
    */
   template