diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 7c0bd6d52e2..49ca5ca0fb9 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -52,6 +52,7 @@ jobs: OTEL_SERVICE_NAME: 'pr-cudf' steps: - name: Telemetry setup + if: ${{ vars.TELEMETRY_ENABLED == 'true' }} uses: rapidsai/shared-actions/telemetry-dispatch-stash-base-env-vars@main changed-files: secrets: inherit @@ -329,7 +330,7 @@ jobs: telemetry-summarize: runs-on: ubuntu-latest needs: pr-builder - if: always() + if: ${{ vars.TELEMETRY_ENABLED == 'true' && !cancelled() }} continue-on-error: true steps: - name: Load stashed telemetry env vars diff --git a/.github/workflows/trigger-breaking-change-alert.yaml b/.github/workflows/trigger-breaking-change-alert.yaml index 3b972f31ca4..01dd2436beb 100644 --- a/.github/workflows/trigger-breaking-change-alert.yaml +++ b/.github/workflows/trigger-breaking-change-alert.yaml @@ -12,7 +12,7 @@ jobs: trigger-notifier: if: contains(github.event.pull_request.labels.*.name, 'breaking') secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@branch-24.12 + uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@branch-25.02 with: sender_login: ${{ github.event.sender.login }} sender_avatar: ${{ github.event.sender.avatar_url }} diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 37b26949804..39869b67547 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -2,7 +2,7 @@ repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.6.0 + rev: v5.0.0 hooks: - id: trailing-whitespace exclude: | @@ -17,11 +17,11 @@ repos: ^python/cudf/cudf/tests/data/subword_tokenizer_data/.* ) - repo: https://github.com/MarcoGorelli/cython-lint - rev: v0.16.2 + rev: v0.16.6 hooks: - id: cython-lint - repo: https://github.com/pre-commit/mirrors-mypy - rev: 'v1.10.0' + rev: 'v1.13.0' hooks: - id: mypy additional_dependencies: [types-cachetools] @@ -33,7 +33,7 @@ repos: "python/dask_cudf/dask_cudf"] pass_filenames: false - repo: https://github.com/nbQA-dev/nbQA - rev: 1.8.5 + rev: 1.9.1 hooks: - id: nbqa-isort # Use the cudf_kafka isort orderings in notebooks so that dask @@ -52,7 +52,7 @@ repos: ^cpp/include/cudf_test/cxxopts.hpp ) - repo: https://github.com/sirosen/texthooks - rev: 0.6.6 + rev: 0.6.7 hooks: - id: fix-smartquotes exclude: | @@ -133,7 +133,7 @@ repos: pass_filenames: false verbose: true - repo: https://github.com/codespell-project/codespell - rev: v2.2.6 + rev: v2.3.0 hooks: - id: codespell additional_dependencies: [tomli] @@ -144,7 +144,7 @@ repos: ^CHANGELOG.md$ ) - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.4.8 + rev: v0.8.0 hooks: - id: ruff args: ["--fix"] diff --git a/ci/build_docs.sh b/ci/build_docs.sh index 4290d013fe4..52d8f659611 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -35,6 +35,10 @@ rapids-mamba-retry install \ export RAPIDS_DOCS_DIR="$(mktemp -d)" +EXITCODE=0 +trap "EXITCODE=1" ERR +set +e + rapids-logger "Build CPP docs" pushd cpp/doxygen aws s3 cp s3://rapidsai-docs/librmm/html/${RAPIDS_VERSION_MAJOR_MINOR}/rmm.tag . 
|| echo "Failed to download rmm Doxygen tag" @@ -58,3 +62,5 @@ mv build/dirhtml/* "${RAPIDS_DOCS_DIR}/dask-cudf/html" popd RAPIDS_VERSION_NUMBER="${RAPIDS_VERSION_MAJOR_MINOR}" rapids-upload-docs + +exit ${EXITCODE} diff --git a/ci/cpp_linters.sh b/ci/cpp_linters.sh index 4d5b62ba280..9702b055512 100755 --- a/ci/cpp_linters.sh +++ b/ci/cpp_linters.sh @@ -27,7 +27,7 @@ source rapids-configure-sccache # Run the build via CMake, which will run clang-tidy when CUDF_STATIC_LINTERS is enabled. iwyu_flag="" -if [[ "${RAPIDS_BUILD_TYPE}" == "nightly" ]]; then +if [[ "${RAPIDS_BUILD_TYPE:-}" == "nightly" ]]; then iwyu_flag="-DCUDF_IWYU=ON" fi cmake -S cpp -B cpp/build -DCMAKE_BUILD_TYPE=Release -DCUDF_CLANG_TIDY=ON ${iwyu_flag} -DBUILD_TESTS=OFF -GNinja diff --git a/ci/cudf_pandas_scripts/third-party-integration/test.sh b/ci/cudf_pandas_scripts/third-party-integration/test.sh index f8ddbaba0f3..30e3ffc9a43 100755 --- a/ci/cudf_pandas_scripts/third-party-integration/test.sh +++ b/ci/cudf_pandas_scripts/third-party-integration/test.sh @@ -26,6 +26,8 @@ main() { LIBS=${LIBS#[} LIBS=${LIBS%]} + ANY_FAILURES=0 + for lib in ${LIBS//,/ }; do lib=$(echo "$lib" | tr -d '""') echo "Running tests for library $lib" @@ -56,10 +58,6 @@ main() { rapids-logger "Check GPU usage" nvidia-smi - EXITCODE=0 - trap "EXITCODE=1" ERR - set +e - rapids-logger "pytest ${lib}" NUM_PROCESSES=8 @@ -72,12 +70,20 @@ main() { fi done + EXITCODE=0 + trap "EXITCODE=1" ERR + set +e + TEST_DIR=${TEST_DIR} NUM_PROCESSES=${NUM_PROCESSES} ci/cudf_pandas_scripts/third-party-integration/run-library-tests.sh ${lib} + set -e rapids-logger "Test script exiting with value: ${EXITCODE}" + if [[ ${EXITCODE} != 0 ]]; then + ANY_FAILURES=1 + fi done - exit ${EXITCODE} + exit ${ANY_FAILURES} } main "$@" diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 97c72ec8042..33fc2f651c6 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -19,7 +19,7 @@ dependencies: - cramjam - cubinlinker - cuda-nvtx=11.8 -- cuda-python>=11.7.1,<12.0a0,<=11.8.3 +- cuda-python>=11.8.5,<12.0a0 - cuda-sanitizer-api=11.8.86 - cuda-version=11.8 - cudatoolkit @@ -80,7 +80,6 @@ dependencies: - python-confluent-kafka>=2.5.0,<2.6.0a0 - python-xxhash - python>=3.10,<3.13 -- pytorch>=2.1.0 - rapids-build-backend>=0.3.0,<0.4.0.dev0 - rapids-dask-dependency==25.2.*,>=0.0.0a0 - rich @@ -88,7 +87,6 @@ dependencies: - s3fs>=2022.3.0 - scikit-build-core>=0.10.0 - scipy -- spdlog>=1.14.1,<1.15 - sphinx - sphinx-autobuild - sphinx-copybutton @@ -97,8 +95,6 @@ dependencies: - sphinxcontrib-websupport - streamz - sysroot_linux-64==2.17 -- tokenizers==0.15.2 -- transformers==4.39.3 - typing_extensions>=4.0.0 - zlib>=1.2.13 name: all_cuda-118_arch-x86_64 diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index 84b58b6d7a4..c290a83a37f 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -21,7 +21,7 @@ dependencies: - cuda-nvcc - cuda-nvrtc-dev - cuda-nvtx-dev -- cuda-python>=12.0,<13.0a0,<=12.6.0 +- cuda-python>=12.6.2,<13.0a0 - cuda-sanitizer-api - cuda-version=12.5 - cupy>=12.0.0 @@ -78,7 +78,7 @@ dependencies: - python-confluent-kafka>=2.5.0,<2.6.0a0 - python-xxhash - python>=3.10,<3.13 -- pytorch>=2.1.0 +- pytorch>=2.4.0 - rapids-build-backend>=0.3.0,<0.4.0.dev0 - rapids-dask-dependency==25.2.*,>=0.0.0a0 - rich @@ -86,7 +86,6 @@ 
dependencies: - s3fs>=2022.3.0 - scikit-build-core>=0.10.0 - scipy -- spdlog>=1.14.1,<1.15 - sphinx - sphinx-autobuild - sphinx-copybutton diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index 04904e95630..2c16deeed82 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -91,7 +91,7 @@ requirements: - cudatoolkit - ptxcompiler >=0.7.0 - cubinlinker # CUDA enhanced compatibility. - - cuda-python >=11.7.1,<12.0a0,<=11.8.3 + - cuda-python >=11.8.5,<12.0a0 {% else %} - cuda-cudart - libcufile # [linux64] @@ -100,7 +100,7 @@ requirements: # TODO: Add nvjitlink here # xref: https://github.com/rapidsai/cudf/issues/12822 - cuda-nvrtc - - cuda-python >=12.0,<13.0a0,<=12.6.0 + - cuda-python >=12.6.2,<13.0a0 - pynvjitlink {% endif %} - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} diff --git a/conda/recipes/libcudf/conda_build_config.yaml b/conda/recipes/libcudf/conda_build_config.yaml index c78ca326005..00020fdf6b8 100644 --- a/conda/recipes/libcudf/conda_build_config.yaml +++ b/conda/recipes/libcudf/conda_build_config.yaml @@ -31,9 +31,6 @@ fmt_version: flatbuffers_version: - "=24.3.25" -spdlog_version: - - ">=1.14.1,<1.15" - nvcomp_version: - "=4.1.0.6" diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 1c2e9e8dd98..b585aafc397 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -68,7 +68,6 @@ requirements: - librdkafka {{ librdkafka_version }} - fmt {{ fmt_version }} - flatbuffers {{ flatbuffers_version }} - - spdlog {{ spdlog_version }} - zlib {{ zlib_version }} outputs: diff --git a/conda/recipes/pylibcudf/meta.yaml b/conda/recipes/pylibcudf/meta.yaml index ec3fcd59c62..08eab363af0 100644 --- a/conda/recipes/pylibcudf/meta.yaml +++ b/conda/recipes/pylibcudf/meta.yaml @@ -83,9 +83,9 @@ requirements: - {{ pin_compatible('rmm', max_pin='x.x') }} - fsspec >=0.6.0 {% if cuda_major == "11" %} - - cuda-python >=11.7.1,<12.0a0,<=11.8.3 + - cuda-python >=11.8.5,<12.0a0 {% else %} - - cuda-python >=12.0,<13.0a0,<=12.6.0 + - cuda-python >=12.6.2,<13.0a0 {% endif %} - nvtx >=0.2.1 - packaging diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index f25b46a52cd..78f529a44d3 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -93,6 +93,7 @@ option( mark_as_advanced(CUDF_BUILD_STREAMS_TEST_UTIL) option(CUDF_CLANG_TIDY "Enable clang-tidy during compilation" OFF) option(CUDF_IWYU "Enable IWYU during compilation" OFF) +option(CUDF_CLANG_TIDY_AUTOFIX "Enable clang-tidy autofixes" OFF) option( CUDF_KVIKIO_REMOTE_IO @@ -205,9 +206,16 @@ function(enable_static_checkers target) if(_LINT_CLANG_TIDY) # clang will complain about unused link libraries on the compile line unless we specify # -Qunused-arguments. - set_target_properties( - ${target} PROPERTIES CXX_CLANG_TIDY "${CLANG_TIDY_EXE};--extra-arg=-Qunused-arguments" - ) + if(CUDF_CLANG_TIDY_AUTOFIX) + set_target_properties( + ${target} PROPERTIES CXX_CLANG_TIDY + "${CLANG_TIDY_EXE};--extra-arg=-Qunused-arguments;--fix" + ) + else() + set_target_properties( + ${target} PROPERTIES CXX_CLANG_TIDY "${CLANG_TIDY_EXE};--extra-arg=-Qunused-arguments" + ) + endif() endif() if(_LINT_IWYU) # A few extra warnings pop up when building with IWYU. I'm not sure why, but they are not @@ -265,6 +273,14 @@ endif() # add third party dependencies using CPM rapids_cpm_init() + +# Not using rapids-cmake since we never want to find, always download. 
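With spdlog removed from the conda recipes and build configuration above, log filtering in libcudf moves to the rapids-logger package pulled in just below, gated by the CUDF_LOG_ACTIVE_LEVEL compile definition further down in this CMakeLists.txt change. A minimal, self-contained sketch of compile-time level filtering in that spirit; the EXAMPLE_* names are hypothetical and are not the rapids-logger API:

#include <cstdio>

// Hypothetical level constants; real logging libraries define their own.
#define EXAMPLE_LEVEL_DEBUG 1
#define EXAMPLE_LEVEL_INFO 2
#define EXAMPLE_LEVEL_WARN 3

// Normally injected on the compile line (compare the
// CUDF_LOG_ACTIVE_LEVEL=${LIBCUDF_LOGGING_LEVEL} definition below).
#ifndef EXAMPLE_LOG_ACTIVE_LEVEL
#define EXAMPLE_LOG_ACTIVE_LEVEL EXAMPLE_LEVEL_INFO
#endif

// Calls below the active level are discarded at compile time (C++17),
// so disabled logging has zero runtime cost.
#define EXAMPLE_LOG(level, msg)                          \
  do {                                                   \
    if constexpr ((level) >= EXAMPLE_LOG_ACTIVE_LEVEL) { \
      std::puts(msg);                                    \
    }                                                    \
  } while (0)

int main()
{
  EXAMPLE_LOG(EXAMPLE_LEVEL_DEBUG, "compiled out when the active level is INFO");
  EXAMPLE_LOG(EXAMPLE_LEVEL_WARN, "emitted");
  return 0;
}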
+CPMAddPackage( + NAME rapids_logger GITHUB_REPOSITORY rapidsai/rapids-logger GIT_SHALLOW TRUE GIT_TAG + c510947ae9d3a67530cfe3e5eaccb5a3b8ea0e55 VERSION c510947ae9d3a67530cfe3e5eaccb5a3b8ea0e55 +) +rapids_make_logger(cudf EXPORT_SET cudf-exports) + # find jitify include(cmake/thirdparty/get_jitify.cmake) # find NVTX @@ -291,8 +307,6 @@ include(cmake/Modules/JitifyPreprocessKernels.cmake) include(cmake/thirdparty/get_kvikio.cmake) # find fmt include(cmake/thirdparty/get_fmt.cmake) -# find spdlog -include(cmake/thirdparty/get_spdlog.cmake) # find nanoarrow include(cmake/thirdparty/get_nanoarrow.cmake) # find thread_pool @@ -764,7 +778,6 @@ add_library( src/utilities/default_stream.cpp src/utilities/host_memory.cpp src/utilities/linked_column.cpp - src/utilities/logger.cpp src/utilities/prefetch.cpp src/utilities/stacktrace.cpp src/utilities/stream_pool.cpp @@ -902,11 +915,8 @@ if(CUDF_LARGE_STRINGS_DISABLED) target_compile_definitions(cudf PRIVATE CUDF_LARGE_STRINGS_DISABLED) endif() -# Define RMM logging level -target_compile_definitions(cudf PRIVATE "RMM_LOGGING_LEVEL=LIBCUDF_LOGGING_LEVEL") - -# Define spdlog level -target_compile_definitions(cudf PUBLIC "SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_${LIBCUDF_LOGGING_LEVEL}") +# Define logging level +target_compile_definitions(cudf PRIVATE "CUDF_LOG_ACTIVE_LEVEL=${LIBCUDF_LOGGING_LEVEL}") # Enable remote IO through KvikIO target_compile_definitions(cudf PRIVATE $<$:CUDF_KVIKIO_REMOTE_IO>) @@ -920,14 +930,17 @@ if(TARGET CUDA::cuFile${_cufile_suffix}) target_compile_definitions(cudf PRIVATE CUDF_CUFILE_FOUND) endif() +# Remove this after upgrading to a CCCL that has a proper CMake option. See +# https://github.com/NVIDIA/cccl/pull/2844 +target_compile_definitions(cudf PRIVATE THRUST_FORCE_32_BIT_OFFSET_TYPE=1) + # Compile stringified JIT sources first add_dependencies(cudf jitify_preprocess_run) # Specify the target module library dependencies target_link_libraries( cudf - PUBLIC CCCL::CCCL rmm::rmm rmm::rmm_logger $ - spdlog::spdlog_header_only + PUBLIC CCCL::CCCL rmm::rmm rmm::rmm_logger $ cudf_logger PRIVATE $ cuco::cuco ZLIB::ZLIB @@ -936,6 +949,7 @@ target_link_libraries( $ nanoarrow rmm::rmm_logger_impl + cudf_logger_impl ) # Add Conda library, and include paths if specified @@ -1091,7 +1105,7 @@ if(CUDF_BUILD_STREAMS_TEST_UTIL) ${_tgt} PRIVATE "$:${CUDF_CXX_FLAGS}>>" ) target_include_directories(${_tgt} PRIVATE "$") - target_link_libraries(${_tgt} PUBLIC CUDA::cudart rmm::rmm) + target_link_libraries(${_tgt} PUBLIC CUDA::cudart rmm::rmm rmm::rmm_logger rmm::rmm_logger_impl) if(CUDF_BUILD_STACKTRACE_DEBUG) target_link_libraries(${_tgt} PRIVATE cudf_backtrace) endif() diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index d3de9b39977..749e1b628ee 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -140,8 +140,9 @@ function(ConfigureNVBench CMAKE_BENCH_NAME) endfunction() # ################################################################################################## -# * column benchmarks ----------------------------------------------------------------------------- -ConfigureBench(COLUMN_CONCAT_BENCH column/concatenate.cpp) +# * copying benchmarks +# ----------------------------------------------------------------------------- +ConfigureNVBench(COPYING_NVBENCH copying/concatenate.cpp) # ################################################################################################## # * gather benchmark ------------------------------------------------------------------------------ @@ 
-351,17 +352,22 @@ ConfigureNVBench(BINARYOP_NVBENCH binaryop/binaryop.cpp binaryop/compiled_binary # ################################################################################################## # * nvtext benchmark ------------------------------------------------------------------- -ConfigureBench(TEXT_BENCH text/subword.cpp) - ConfigureNVBench( - TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/minhash.cpp - text/ngrams.cpp text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp + TEXT_NVBENCH + text/edit_distance.cpp + text/hash_ngrams.cpp + text/jaccard.cpp + text/minhash.cpp + text/ngrams.cpp + text/normalize.cpp + text/replace.cpp + text/subword.cpp + text/tokenize.cpp + text/vocab.cpp ) # ################################################################################################## # * strings benchmark ------------------------------------------------------------------- -ConfigureBench(STRINGS_BENCH string/factory.cu) - ConfigureNVBench( STRINGS_NVBENCH string/case.cpp @@ -377,6 +383,7 @@ ConfigureNVBench( string/copy_range.cpp string/count.cpp string/extract.cpp + string/factory.cpp string/filter.cpp string/find.cpp string/find_multiple.cpp diff --git a/cpp/benchmarks/column/concatenate.cpp b/cpp/benchmarks/column/concatenate.cpp deleted file mode 100644 index 51106c72137..00000000000 --- a/cpp/benchmarks/column/concatenate.cpp +++ /dev/null @@ -1,169 +0,0 @@ -/* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include -#include -#include -#include - -#include - -#include -#include -#include - -#include -#include -#include - -#include -#include - -class Concatenate : public cudf::benchmark {}; - -template -static void BM_concatenate(benchmark::State& state) -{ - cudf::size_type const num_rows = state.range(0); - cudf::size_type const num_cols = state.range(1); - - auto input = create_sequence_table(cycle_dtypes({cudf::type_to_id()}, num_cols), - row_count{num_rows}, - Nullable ? 
std::optional{2.0 / 3.0} : std::nullopt); - auto input_columns = input->view(); - std::vector column_views(input_columns.begin(), input_columns.end()); - - CUDF_CHECK_CUDA(0); - - for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::get_default_stream()); - auto result = cudf::concatenate(column_views); - } - - state.SetBytesProcessed(state.iterations() * num_cols * num_rows * sizeof(T)); -} - -#define CONCAT_BENCHMARK_DEFINE(type, nullable) \ - BENCHMARK_DEFINE_F(Concatenate, BM_concatenate##_##nullable_##nullable) \ - (::benchmark::State & st) { BM_concatenate(st); } \ - BENCHMARK_REGISTER_F(Concatenate, BM_concatenate##_##nullable_##nullable) \ - ->RangeMultiplier(8) \ - ->Ranges({{1 << 6, 1 << 18}, {2, 1024}}) \ - ->Unit(benchmark::kMillisecond) \ - ->UseManualTime(); - -CONCAT_BENCHMARK_DEFINE(int64_t, false) -CONCAT_BENCHMARK_DEFINE(int64_t, true) - -template -static void BM_concatenate_tables(benchmark::State& state) -{ - cudf::size_type const num_rows = state.range(0); - cudf::size_type const num_cols = state.range(1); - cudf::size_type const num_tables = state.range(2); - - std::vector> tables(num_tables); - std::generate_n(tables.begin(), num_tables, [&]() { - return create_sequence_table(cycle_dtypes({cudf::type_to_id()}, num_cols), - row_count{num_rows}, - Nullable ? std::optional{2.0 / 3.0} : std::nullopt); - }); - - // Generate table views - std::vector table_views(num_tables); - std::transform(tables.begin(), tables.end(), table_views.begin(), [](auto& table) mutable { - return table->view(); - }); - - CUDF_CHECK_CUDA(0); - - for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::get_default_stream()); - auto result = cudf::concatenate(table_views); - } - - state.SetBytesProcessed(state.iterations() * num_cols * num_rows * num_tables * sizeof(T)); -} - -#define CONCAT_TABLES_BENCHMARK_DEFINE(type, nullable) \ - BENCHMARK_DEFINE_F(Concatenate, BM_concatenate_tables##_##nullable_##nullable) \ - (::benchmark::State & st) { BM_concatenate_tables(st); } \ - BENCHMARK_REGISTER_F(Concatenate, BM_concatenate_tables##_##nullable_##nullable) \ - ->RangeMultiplier(8) \ - ->Ranges({{1 << 8, 1 << 12}, {2, 32}, {2, 128}}) \ - ->Unit(benchmark::kMillisecond) \ - ->UseManualTime(); - -CONCAT_TABLES_BENCHMARK_DEFINE(int64_t, false) -CONCAT_TABLES_BENCHMARK_DEFINE(int64_t, true) - -class ConcatenateStrings : public cudf::benchmark {}; - -template -static void BM_concatenate_strings(benchmark::State& state) -{ - using column_wrapper = cudf::test::strings_column_wrapper; - - auto const num_rows = state.range(0); - auto const num_chars = state.range(1); - auto const num_cols = state.range(2); - - std::string str(num_chars, 'a'); - - // Create owning columns - std::vector columns; - columns.reserve(num_cols); - std::generate_n(std::back_inserter(columns), num_cols, [num_rows, c_str = str.c_str()]() { - auto iter = thrust::make_constant_iterator(c_str); - if (Nullable) { - auto count_it = thrust::make_counting_iterator(0); - auto valid_iter = - thrust::make_transform_iterator(count_it, [](auto i) { return i % 3 == 0; }); - return column_wrapper(iter, iter + num_rows, valid_iter); - } else { - return column_wrapper(iter, iter + num_rows); - } - }); - - // Generate column views - std::vector column_views; - column_views.reserve(columns.size()); - std::transform( - columns.begin(), columns.end(), std::back_inserter(column_views), [](auto const& col) { - return static_cast(col); - }); - - CUDF_CHECK_CUDA(0); - - for (auto _ : state) { - cuda_event_timer raii(state, true, 
cudf::get_default_stream()); - auto result = cudf::concatenate(column_views); - } - - state.SetBytesProcessed(state.iterations() * num_cols * num_rows * - (sizeof(int32_t) + num_chars)); // offset + chars -} - -#define CONCAT_STRINGS_BENCHMARK_DEFINE(nullable) \ - BENCHMARK_DEFINE_F(Concatenate, BM_concatenate_strings##_##nullable_##nullable) \ - (::benchmark::State & st) { BM_concatenate_strings(st); } \ - BENCHMARK_REGISTER_F(Concatenate, BM_concatenate_strings##_##nullable_##nullable) \ - ->RangeMultiplier(8) \ - ->Ranges({{1 << 8, 1 << 14}, {8, 128}, {2, 256}}) \ - ->Unit(benchmark::kMillisecond) \ - ->UseManualTime(); - -CONCAT_STRINGS_BENCHMARK_DEFINE(false) -CONCAT_STRINGS_BENCHMARK_DEFINE(true) diff --git a/cpp/benchmarks/copying/concatenate.cpp b/cpp/benchmarks/copying/concatenate.cpp new file mode 100644 index 00000000000..586b479d0ad --- /dev/null +++ b/cpp/benchmarks/copying/concatenate.cpp @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include + +#include +#include +#include +#include + +#include + +#include + +static void bench_concatenate(nvbench::state& state) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const num_cols = static_cast(state.get_int64("num_cols")); + auto const nulls = static_cast(state.get_float64("nulls")); + + auto input = create_sequence_table( + cycle_dtypes({cudf::type_to_id()}, num_cols), row_count{num_rows}, nulls); + auto input_columns = input->view(); + auto column_views = std::vector(input_columns.begin(), input_columns.end()); + + auto stream = cudf::get_default_stream(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + state.add_global_memory_reads(num_rows * num_cols); + state.add_global_memory_writes(num_rows * num_cols); + + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch&) { auto result = cudf::concatenate(column_views); }); +} + +NVBENCH_BENCH(bench_concatenate) + .set_name("concatenate") + .add_int64_axis("num_rows", {64, 512, 4096, 32768, 262144}) + .add_int64_axis("num_cols", {2, 8, 64, 512, 1024}) + .add_float64_axis("nulls", {0.0, 0.3}); + +static void bench_concatenate_strings(nvbench::state& state) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const num_cols = static_cast(state.get_int64("num_cols")); + auto const row_width = static_cast(state.get_int64("row_width")); + auto const nulls = static_cast(state.get_float64("nulls")); + + data_profile const profile = + data_profile_builder() + .distribution(cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width) + .null_probability(nulls); + auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); + auto const input = column->view(); + + auto column_views = std::vector(num_cols, input); + + auto stream = cudf::get_default_stream(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + auto const sv = cudf::strings_column_view(input); + 
state.add_global_memory_reads(sv.chars_size(stream) * num_cols); + state.add_global_memory_writes(sv.chars_size(stream) * num_cols); + + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch&) { auto result = cudf::concatenate(column_views); }); +} + +NVBENCH_BENCH(bench_concatenate_strings) + .set_name("concatenate_strings") + .add_int64_axis("num_rows", {256, 512, 4096, 16384}) + .add_int64_axis("num_cols", {2, 8, 64, 256}) + .add_int64_axis("row_width", {32, 128}) + .add_float64_axis("nulls", {0.0, 0.3}); diff --git a/cpp/benchmarks/io/cuio_common.cpp b/cpp/benchmarks/io/cuio_common.cpp index 45b46005c47..38a21961735 100644 --- a/cpp/benchmarks/io/cuio_common.cpp +++ b/cpp/benchmarks/io/cuio_common.cpp @@ -17,7 +17,7 @@ #include #include -#include +#include #include #include diff --git a/cpp/benchmarks/stream_compaction/apply_boolean_mask.cpp b/cpp/benchmarks/stream_compaction/apply_boolean_mask.cpp index fa017ca9e29..267aa3a93f3 100644 --- a/cpp/benchmarks/stream_compaction/apply_boolean_mask.cpp +++ b/cpp/benchmarks/stream_compaction/apply_boolean_mask.cpp @@ -63,8 +63,8 @@ void apply_boolean_mask_benchmark(nvbench::state& state, nvbench::type_list) cudf::size_type const num_rows = state.get_int64("NumRows"); auto const keep = get_keep(state.get_string("keep")); cudf::size_type const cardinality = state.get_int64("cardinality"); + auto const null_probability = state.get_float64("null_probability"); if (cardinality > num_rows) { state.skip("cardinality > num_rows"); @@ -42,7 +43,7 @@ void nvbench_distinct(nvbench::state& state, nvbench::type_list) data_profile profile = data_profile_builder() .cardinality(cardinality) - .null_probability(0.01) + .null_probability(null_probability) .distribution(cudf::type_to_id(), distribution_id::UNIFORM, static_cast(0), @@ -65,6 +66,7 @@ using data_type = nvbench::type_list; NVBENCH_BENCH_TYPES(nvbench_distinct, NVBENCH_TYPE_AXES(data_type)) .set_name("distinct") .set_type_axes_names({"Type"}) + .add_float64_axis("null_probability", {0.01}) .add_string_axis("keep", {"any", "first", "last", "none"}) .add_int64_axis("cardinality", {100, 100'000, 10'000'000, 1'000'000'000}) .add_int64_axis("NumRows", {100, 100'000, 10'000'000, 1'000'000'000}); diff --git a/cpp/benchmarks/string/case.cpp b/cpp/benchmarks/string/case.cpp index cd4d3ca964b..9750475a079 100644 --- a/cpp/benchmarks/string/case.cpp +++ b/cpp/benchmarks/string/case.cpp @@ -24,18 +24,14 @@ void bench_case(nvbench::state& state) { - auto const n_rows = static_cast(state.get_int64("num_rows")); - auto const max_width = static_cast(state.get_int64("row_width")); + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); auto const encoding = state.get_string("encoding"); - if (static_cast(n_rows) * static_cast(max_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - data_profile const profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, max_width); - auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows}, profile); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); + auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); auto col_view = column->view(); @@ -74,6 +70,7 @@ void bench_case(nvbench::state& state) NVBENCH_BENCH(bench_case) .set_name("case") 
- .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048}) - .add_int64_axis("num_rows", {32768, 262144, 2097152, 16777216}) + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_string_axis("encoding", {"ascii", "utf8"}); diff --git a/cpp/benchmarks/string/char_types.cpp b/cpp/benchmarks/string/char_types.cpp index eec9a5f54d7..abc5254392e 100644 --- a/cpp/benchmarks/string/char_types.cpp +++ b/cpp/benchmarks/string/char_types.cpp @@ -25,16 +25,12 @@ static void bench_char_types(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); auto const api_type = state.get_string("api"); - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - data_profile const table_profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const table = create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile); cudf::strings_column_view input(table->view().column(0)); @@ -61,6 +57,7 @@ static void bench_char_types(nvbench::state& state) NVBENCH_BENCH(bench_char_types) .set_name("char_types") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048, 4096}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}) + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_string_axis("api", {"all", "filter"}); diff --git a/cpp/benchmarks/string/contains.cpp b/cpp/benchmarks/string/contains.cpp index a73017dda18..e3940cbc0c7 100644 --- a/cpp/benchmarks/string/contains.cpp +++ b/cpp/benchmarks/string/contains.cpp @@ -29,17 +29,12 @@ std::string patterns[] = {"^\\d+ [a-z]+", "[A-Z ]+\\d+ +\\d+[A-Z]+\\d+$", "5W43" static void bench_contains(nvbench::state& state) { - auto const n_rows = static_cast(state.get_int64("num_rows")); + auto const num_rows = static_cast(state.get_int64("num_rows")); auto const row_width = static_cast(state.get_int64("row_width")); auto const pattern_index = static_cast(state.get_int64("pattern")); auto const hit_rate = static_cast(state.get_int64("hit_rate")); - if (static_cast(n_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - - auto col = create_string_column(n_rows, row_width, hit_rate); + auto col = create_string_column(num_rows, row_width, hit_rate); auto input = cudf::strings_column_view(col->view()); auto pattern = patterns[pattern_index]; @@ -56,7 +51,7 @@ static void bench_contains(nvbench::state& state) NVBENCH_BENCH(bench_contains) .set_name("contains") - .add_int64_axis("row_width", {32, 64, 128, 256, 512}) - .add_int64_axis("num_rows", {32768, 262144, 2097152, 16777216}) + .add_int64_axis("row_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_int64_axis("hit_rate", {50, 100}) // percentage .add_int64_axis("pattern", {0, 1, 2}); diff --git a/cpp/benchmarks/string/copy_if_else.cpp b/cpp/benchmarks/string/copy_if_else.cpp index 
e06cca497c2..5a5743dfddf 100644 --- a/cpp/benchmarks/string/copy_if_else.cpp +++ b/cpp/benchmarks/string/copy_if_else.cpp @@ -25,15 +25,11 @@ static void bench_copy(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); - - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); data_profile const str_profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const source_table = create_random_table({cudf::type_id::STRING}, row_count{num_rows}, str_profile); auto const target_table = @@ -58,5 +54,6 @@ static void bench_copy(nvbench::state& state) NVBENCH_BENCH(bench_copy) .set_name("copy_if_else") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048, 4096}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}); + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}); diff --git a/cpp/benchmarks/string/copy_range.cpp b/cpp/benchmarks/string/copy_range.cpp index af217a49195..7e7353a0e78 100644 --- a/cpp/benchmarks/string/copy_range.cpp +++ b/cpp/benchmarks/string/copy_range.cpp @@ -25,16 +25,12 @@ static void bench_copy_range(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); - - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); data_profile const table_profile = data_profile_builder() - .distribution(cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width) + .distribution(cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width) .no_validity(); auto const source_tables = create_random_table( {cudf::type_id::STRING, cudf::type_id::STRING}, row_count{num_rows}, table_profile); @@ -56,5 +52,6 @@ static void bench_copy_range(nvbench::state& state) NVBENCH_BENCH(bench_copy_range) .set_name("copy_range") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048, 4096}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}); + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}); diff --git a/cpp/benchmarks/string/count.cpp b/cpp/benchmarks/string/count.cpp index f964bc5d224..cf90e316f71 100644 --- a/cpp/benchmarks/string/count.cpp +++ b/cpp/benchmarks/string/count.cpp @@ -30,16 +30,12 @@ static std::string patterns[] = {"\\d+", "a"}; static void bench_count(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); auto const pattern_index = static_cast(state.get_int64("pattern")); - if 
(static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - data_profile const table_profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const table = create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile); cudf::strings_column_view input(table->view().column(0)); @@ -61,6 +57,7 @@ static void bench_count(nvbench::state& state) NVBENCH_BENCH(bench_count) .set_name("count") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}) + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_int64_axis("pattern", {0, 1}); diff --git a/cpp/benchmarks/string/extract.cpp b/cpp/benchmarks/string/extract.cpp index af4fedb5799..d6866598ff4 100644 --- a/cpp/benchmarks/string/extract.cpp +++ b/cpp/benchmarks/string/extract.cpp @@ -32,11 +32,6 @@ static void bench_extract(nvbench::state& state) auto const num_rows = static_cast(state.get_int64("num_rows")); auto const row_width = static_cast(state.get_int64("row_width")); - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - auto groups = static_cast(state.get_int64("groups")); std::default_random_engine generator; @@ -79,6 +74,6 @@ static void bench_extract(nvbench::state& state) NVBENCH_BENCH(bench_extract) .set_name("extract") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}) + .add_int64_axis("row_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_int64_axis("groups", {1, 2, 4}); diff --git a/cpp/benchmarks/string/factory.cpp b/cpp/benchmarks/string/factory.cpp new file mode 100644 index 00000000000..03870b0ae23 --- /dev/null +++ b/cpp/benchmarks/string/factory.cpp @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include +#include +#include +#include + +#include + +#include + +#include + +static void bench_factory(nvbench::state& state) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); + + data_profile const profile = data_profile_builder().distribution( + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); + auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); + auto const sv = cudf::strings_column_view(column->view()); + + auto stream = cudf::get_default_stream(); + auto mr = cudf::get_current_device_resource_ref(); + auto d_strings = cudf::strings::detail::create_string_vector_from_column(sv, stream, mr); + + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + auto chars_size = sv.chars_size(stream); + state.add_global_memory_reads(chars_size); + state.add_global_memory_writes(chars_size); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + cudf::make_strings_column(d_strings, cudf::string_view{nullptr, 0}); + }); +} + +NVBENCH_BENCH(bench_factory) + .set_name("factory") + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}); diff --git a/cpp/benchmarks/string/factory.cu b/cpp/benchmarks/string/factory.cu deleted file mode 100644 index c4e74c4d97e..00000000000 --- a/cpp/benchmarks/string/factory.cu +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "string_bench_args.hpp" - -#include -#include -#include - -#include - -#include -#include -#include - -#include - -#include -#include -#include - -#include - -namespace { -using string_pair = thrust::pair; -struct string_view_to_pair { - __device__ string_pair operator()(thrust::pair const& p) - { - return (p.second) ? 
string_pair{p.first.data(), p.first.size_bytes()} : string_pair{nullptr, 0}; - } -}; -} // namespace - -class StringsFactory : public cudf::benchmark {}; - -static void BM_factory(benchmark::State& state) -{ - cudf::size_type const n_rows{static_cast(state.range(0))}; - cudf::size_type const max_str_length{static_cast(state.range(1))}; - data_profile const profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length); - auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows}, profile); - auto d_column = cudf::column_device_view::create(column->view()); - rmm::device_uvector pairs(d_column->size(), cudf::get_default_stream()); - thrust::transform(thrust::device, - d_column->pair_begin(), - d_column->pair_end(), - pairs.data(), - string_view_to_pair{}); - - for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::get_default_stream()); - cudf::make_strings_column(pairs, cudf::get_default_stream()); - } - - cudf::strings_column_view input(column->view()); - state.SetBytesProcessed(state.iterations() * input.chars_size(cudf::get_default_stream())); -} - -static void generate_bench_args(benchmark::internal::Benchmark* b) -{ - int const min_rows = 1 << 12; - int const max_rows = 1 << 24; - int const row_mult = 8; - int const min_rowlen = 1 << 5; - int const max_rowlen = 1 << 13; - int const len_mult = 4; - generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult); -} - -#define STRINGS_BENCHMARK_DEFINE(name) \ - BENCHMARK_DEFINE_F(StringsFactory, name) \ - (::benchmark::State & st) { BM_factory(st); } \ - BENCHMARK_REGISTER_F(StringsFactory, name) \ - ->Apply(generate_bench_args) \ - ->UseManualTime() \ - ->Unit(benchmark::kMillisecond); - -STRINGS_BENCHMARK_DEFINE(factory) diff --git a/cpp/benchmarks/string/find.cpp b/cpp/benchmarks/string/find.cpp index 3ea3ff13a2f..2ba793e998e 100644 --- a/cpp/benchmarks/string/find.cpp +++ b/cpp/benchmarks/string/find.cpp @@ -28,21 +28,19 @@ static void bench_find_string(nvbench::state& state) { - auto const n_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const max_width = static_cast(state.get_int64("max_width")); auto const hit_rate = static_cast(state.get_int64("hit_rate")); auto const api = state.get_string("api"); - - if (static_cast(n_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } + auto const tgt_type = state.get_string("target"); auto const stream = cudf::get_default_stream(); - auto const col = create_string_column(n_rows, row_width, hit_rate); + auto const col = create_string_column(num_rows, max_width, hit_rate); auto const input = cudf::strings_column_view(col->view()); - cudf::string_scalar target("0987 5W43"); + auto target = cudf::string_scalar("0987 5W43"); + auto targets_col = cudf::make_column_from_scalar(target, num_rows); + auto const targets = cudf::strings_column_view(targets_col->view()); state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); auto const chars_size = input.chars_size(stream); @@ -55,23 +53,44 @@ static void bench_find_string(nvbench::state& state) } if (api == "find") { - state.exec(nvbench::exec_tag::sync, - [&](nvbench::launch& launch) { cudf::strings::find(input, target); }); + if (tgt_type == "scalar") { + 
state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { cudf::strings::find(input, target); }); + } else if (tgt_type == "column") { + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { cudf::strings::find(input, targets); }); + } } else if (api == "contains") { - state.exec(nvbench::exec_tag::sync, - [&](nvbench::launch& launch) { cudf::strings::contains(input, target); }); + if (tgt_type == "scalar") { + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { cudf::strings::contains(input, target); }); + } else if (tgt_type == "column") { + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { cudf::strings::contains(input, targets); }); + } } else if (api == "starts_with") { - state.exec(nvbench::exec_tag::sync, - [&](nvbench::launch& launch) { cudf::strings::starts_with(input, target); }); + if (tgt_type == "scalar") { + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { cudf::strings::starts_with(input, target); }); + } else if (tgt_type == "column") { + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { cudf::strings::starts_with(input, targets); }); + } } else if (api == "ends_with") { - state.exec(nvbench::exec_tag::sync, - [&](nvbench::launch& launch) { cudf::strings::ends_with(input, target); }); + if (tgt_type == "scalar") { + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { cudf::strings::ends_with(input, target); }); + } else if (tgt_type == "column") { + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { cudf::strings::ends_with(input, targets); }); + } } } NVBENCH_BENCH(bench_find_string) .set_name("find_string") + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) + .add_int64_axis("hit_rate", {20, 80}) // percentage .add_string_axis("api", {"find", "contains", "starts_with", "ends_with"}) - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024}) - .add_int64_axis("num_rows", {260'000, 1'953'000, 16'777'216}) - .add_int64_axis("hit_rate", {20, 80}); // percentage + .add_string_axis("target", {"scalar", "column"}); diff --git a/cpp/benchmarks/string/join_strings.cpp b/cpp/benchmarks/string/join_strings.cpp index 6dcf731ad3c..27652193b7b 100644 --- a/cpp/benchmarks/string/join_strings.cpp +++ b/cpp/benchmarks/string/join_strings.cpp @@ -25,15 +25,11 @@ static void bench_join(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); - - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); data_profile const table_profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const table = create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile); cudf::strings_column_view input(table->view().column(0)); @@ -54,5 +50,6 @@ static void bench_join(nvbench::state& state) NVBENCH_BENCH(bench_join) .set_name("strings_join") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}); + .add_int64_axis("min_width", {0}) + 
.add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}); diff --git a/cpp/benchmarks/string/lengths.cpp b/cpp/benchmarks/string/lengths.cpp index a19060ead3b..8156e19412b 100644 --- a/cpp/benchmarks/string/lengths.cpp +++ b/cpp/benchmarks/string/lengths.cpp @@ -25,15 +25,11 @@ static void bench_lengths(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); - - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); data_profile const table_profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const table = create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile); cudf::strings_column_view input(table->view().column(0)); @@ -51,5 +47,6 @@ static void bench_lengths(nvbench::state& state) NVBENCH_BENCH(bench_lengths) .set_name("lengths") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048, 4096}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}); + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}); diff --git a/cpp/benchmarks/string/like.cpp b/cpp/benchmarks/string/like.cpp index 105ae65cbe8..f6410aaef30 100644 --- a/cpp/benchmarks/string/like.cpp +++ b/cpp/benchmarks/string/like.cpp @@ -30,11 +30,6 @@ static void bench_like(nvbench::state& state) auto const row_width = static_cast(state.get_int64("row_width")); auto const hit_rate = static_cast(state.get_int64("hit_rate")); - if (static_cast(n_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - auto col = create_string_column(n_rows, row_width, hit_rate); auto input = cudf::strings_column_view(col->view()); @@ -54,6 +49,6 @@ static void bench_like(nvbench::state& state) NVBENCH_BENCH(bench_like) .set_name("strings_like") - .add_int64_axis("row_width", {32, 64, 128, 256, 512}) - .add_int64_axis("num_rows", {32768, 262144, 2097152, 16777216}) + .add_int64_axis("row_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_int64_axis("hit_rate", {10, 25, 70, 100}); diff --git a/cpp/benchmarks/string/replace_re.cpp b/cpp/benchmarks/string/replace_re.cpp index 4dcf1314f83..69426a2d484 100644 --- a/cpp/benchmarks/string/replace_re.cpp +++ b/cpp/benchmarks/string/replace_re.cpp @@ -26,18 +26,14 @@ static void bench_replace(nvbench::state& state) { - auto const n_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); auto const rtype = state.get_string("type"); - if (static_cast(n_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - data_profile const profile = data_profile_builder().distribution( - 
cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); - auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows}, profile); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); + auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); cudf::strings_column_view input(column->view()); auto program = cudf::strings::regex_program::create("(\\d+)"); @@ -62,6 +58,7 @@ static void bench_replace(nvbench::state& state) NVBENCH_BENCH(bench_replace) .set_name("replace_re") - .add_int64_axis("row_width", {32, 64, 128, 256, 512}) - .add_int64_axis("num_rows", {32768, 262144, 2097152, 16777216}) + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_string_axis("type", {"replace", "backref"}); diff --git a/cpp/benchmarks/string/reverse.cpp b/cpp/benchmarks/string/reverse.cpp index a2676609a40..e2e914cb350 100644 --- a/cpp/benchmarks/string/reverse.cpp +++ b/cpp/benchmarks/string/reverse.cpp @@ -25,15 +25,11 @@ static void bench_reverse(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); - - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); data_profile const table_profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const table = create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile); cudf::strings_column_view input(table->view().column(0)); @@ -51,5 +47,6 @@ static void bench_reverse(nvbench::state& state) NVBENCH_BENCH(bench_reverse) .set_name("reverse") - .add_int64_axis("row_width", {8, 16, 32, 64, 128}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}); + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}); diff --git a/cpp/benchmarks/string/slice.cpp b/cpp/benchmarks/string/slice.cpp index 1898f0340b6..c828a8ed0b0 100644 --- a/cpp/benchmarks/string/slice.cpp +++ b/cpp/benchmarks/string/slice.cpp @@ -36,11 +36,6 @@ static void bench_slice(nvbench::state& state) auto const row_width = static_cast(state.get_int64("row_width")); auto const stype = state.get_string("type"); - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - data_profile const profile = data_profile_builder().distribution( cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); @@ -76,6 +71,6 @@ static void bench_slice(nvbench::state& state) NVBENCH_BENCH(bench_slice) .set_name("slice") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048}) - .add_int64_axis("num_rows", {262144, 2097152, 16777216}) + .add_int64_axis("row_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_string_axis("type", {"position", "multi"}); diff --git a/cpp/benchmarks/string/split.cpp 
b/cpp/benchmarks/string/split.cpp index 9ef58daf0fc..9c7c27c4f07 100644 --- a/cpp/benchmarks/string/split.cpp +++ b/cpp/benchmarks/string/split.cpp @@ -28,16 +28,12 @@ static void bench_split(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); auto const stype = state.get_string("type"); - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - data_profile const profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); cudf::strings_column_view input(column->view()); cudf::string_scalar target("+"); @@ -66,6 +62,7 @@ static void bench_split(nvbench::state& state) NVBENCH_BENCH(bench_split) .set_name("split") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}) + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_string_axis("type", {"split", "split_ws", "record", "record_ws"}); diff --git a/cpp/benchmarks/string/split_re.cpp b/cpp/benchmarks/string/split_re.cpp index 1fdb6e67109..34a7aa96e84 100644 --- a/cpp/benchmarks/string/split_re.cpp +++ b/cpp/benchmarks/string/split_re.cpp @@ -28,17 +28,13 @@ static void bench_split(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); - - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); auto prog = cudf::strings::regex_program::create("\\d+"); data_profile const profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); cudf::strings_column_view input(column->view()); @@ -56,5 +52,6 @@ static void bench_split(nvbench::state& state) NVBENCH_BENCH(bench_split) .set_name("split_re") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}); + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}); diff --git a/cpp/benchmarks/string/string_bench_args.hpp b/cpp/benchmarks/string/string_bench_args.hpp deleted file mode 100644 index a34026281e8..00000000000 --- a/cpp/benchmarks/string/string_bench_args.hpp +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include - -#include - -#include - -/** - * @brief Generate row count and row length argument ranges for a string benchmark. - * - * Generates a series of row count and row length arguments for string benchmarks. - * Combinations of row count and row length that would exceed the maximum string character - * column data length are not generated. - * - * @param b Benchmark to update with row count and row length arguments. - * @param min_rows Minimum row count argument to generate. - * @param max_rows Maximum row count argument to generate. - * @param rows_mult Row count multiplier to generate intermediate row count arguments. - * @param min_rowlen Minimum row length argument to generate. - * @param max_rowlen Maximum row length argument to generate. - * @param rowlen_mult Row length multiplier to generate intermediate row length arguments. - */ -inline void generate_string_bench_args(benchmark::internal::Benchmark* b, - int min_rows, - int max_rows, - int rows_mult, - int min_rowlen, - int max_rowlen, - int rowlen_mult) -{ - for (int row_count = min_rows; row_count <= max_rows; row_count *= rows_mult) { - for (int rowlen = min_rowlen; rowlen <= max_rowlen; rowlen *= rowlen_mult) { - // avoid generating combinations that exceed the cudf column limit - size_t total_chars = static_cast(row_count) * rowlen; - if (total_chars < static_cast(std::numeric_limits::max())) { - b->Args({row_count, rowlen}); - } - } - } -} diff --git a/cpp/benchmarks/text/edit_distance.cpp b/cpp/benchmarks/text/edit_distance.cpp index 6ffa90edb8f..0ad1ae30f8c 100644 --- a/cpp/benchmarks/text/edit_distance.cpp +++ b/cpp/benchmarks/text/edit_distance.cpp @@ -27,15 +27,11 @@ static void bench_edit_distance(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); - - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); data_profile const strings_profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const strings_table = create_random_table( {cudf::type_id::STRING, cudf::type_id::STRING}, row_count{num_rows}, strings_profile); cudf::strings_column_view input1(strings_table->view().column(0)); @@ -55,5 +51,6 @@ static void bench_edit_distance(nvbench::state& state) NVBENCH_BENCH(bench_edit_distance) .set_name("edit_distance") - .add_int64_axis("num_rows", {1024, 4096, 8192, 16364, 32768, 262144}) - .add_int64_axis("row_width", {8, 16, 32, 64, 128, 256}); + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144}); diff --git a/cpp/benchmarks/text/hash_ngrams.cpp b/cpp/benchmarks/text/hash_ngrams.cpp index 4e5daf83a3c..7577cf00c0f 100644 
--- a/cpp/benchmarks/text/hash_ngrams.cpp +++ b/cpp/benchmarks/text/hash_ngrams.cpp @@ -27,16 +27,12 @@ static void bench_hash_ngrams(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); auto const ngrams = static_cast(state.get_int64("ngrams")); - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - data_profile const strings_profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const strings_table = create_random_table({cudf::type_id::STRING}, row_count{num_rows}, strings_profile); cudf::strings_column_view input(strings_table->view().column(0)); @@ -55,6 +51,7 @@ static void bench_hash_ngrams(nvbench::state& state) NVBENCH_BENCH(bench_hash_ngrams) .set_name("hash_ngrams") - .add_int64_axis("num_rows", {1024, 4096, 8192, 16364, 32768, 262144}) - .add_int64_axis("row_width", {128, 512, 2048}) + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {128, 512, 2048}) + .add_int64_axis("num_rows", {16384, 32768, 262144}) .add_int64_axis("ngrams", {5, 10}); diff --git a/cpp/benchmarks/text/jaccard.cpp b/cpp/benchmarks/text/jaccard.cpp index d5b74da6773..5506501138b 100644 --- a/cpp/benchmarks/text/jaccard.cpp +++ b/cpp/benchmarks/text/jaccard.cpp @@ -28,17 +28,13 @@ static void bench_jaccard(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); auto const substring_width = static_cast(state.get_int64("substring_width")); - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - data_profile const strings_profile = data_profile_builder() - .distribution(cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width) + .distribution(cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width) .no_validity(); auto const input_table = create_random_table( {cudf::type_id::STRING, cudf::type_id::STRING}, row_count{num_rows}, strings_profile); @@ -59,6 +55,7 @@ static void bench_jaccard(nvbench::state& state) NVBENCH_BENCH(bench_jaccard) .set_name("jaccard") + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {128, 512, 1024, 2048}) .add_int64_axis("num_rows", {32768, 131072, 262144}) - .add_int64_axis("row_width", {128, 512, 1024, 2048}) .add_int64_axis("substring_width", {5, 10}); diff --git a/cpp/benchmarks/text/minhash.cpp b/cpp/benchmarks/text/minhash.cpp index a80d0dcbdb8..8c86e8d4366 100644 --- a/cpp/benchmarks/text/minhash.cpp +++ b/cpp/benchmarks/text/minhash.cpp @@ -54,9 +54,8 @@ static void bench_minhash(nvbench::state& state) state.add_global_memory_writes(num_rows); // output are hashes state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - auto result = base64 - ? 
nvtext::minhash64_permuted(input, 0, parameters_a, parameters_b, hash_width) - : nvtext::minhash_permuted(input, 0, parameters_a, parameters_b, hash_width); + auto result = base64 ? nvtext::minhash64(input, 0, parameters_a, parameters_b, hash_width) + : nvtext::minhash(input, 0, parameters_a, parameters_b, hash_width); }); } diff --git a/cpp/benchmarks/text/normalize.cpp b/cpp/benchmarks/text/normalize.cpp index 71bccd80d39..594dc0de28a 100644 --- a/cpp/benchmarks/text/normalize.cpp +++ b/cpp/benchmarks/text/normalize.cpp @@ -28,16 +28,12 @@ static void bench_normalize(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); auto const normalize_type = state.get_string("type"); - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - data_profile const profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); cudf::strings_column_view input(column->view()); @@ -60,6 +56,7 @@ static void bench_normalize(nvbench::state& state) NVBENCH_BENCH(bench_normalize) .set_name("normalize") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}) + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_string_axis("type", {"spaces", "characters", "to_lower"}); diff --git a/cpp/benchmarks/text/replace.cpp b/cpp/benchmarks/text/replace.cpp index 767ebab3eee..24ca4e5dfd7 100644 --- a/cpp/benchmarks/text/replace.cpp +++ b/cpp/benchmarks/text/replace.cpp @@ -31,11 +31,6 @@ static void bench_replace(nvbench::state& state) auto const num_rows = static_cast(state.get_int64("num_rows")); auto const row_width = static_cast(state.get_int64("row_width")); - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - std::vector words{" ", "one ", "two ", "three ", "four ", "five ", "six ", "sevén ", "eight ", "nine ", "ten ", "eleven ", "twelve ", "thirteen ", "fourteen ", @@ -71,5 +66,5 @@ static void bench_replace(nvbench::state& state) NVBENCH_BENCH(bench_replace) .set_name("replace") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}); + .add_int64_axis("row_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}); diff --git a/cpp/benchmarks/text/subword.cpp b/cpp/benchmarks/text/subword.cpp index dd8df695d3e..0b4e3bdefa5 100644 --- a/cpp/benchmarks/text/subword.cpp +++ b/cpp/benchmarks/text/subword.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,9 +14,6 @@ * limitations under the License. 
*/ -#include -#include - #include #include @@ -24,6 +21,8 @@ #include +#include + #include #include #include @@ -54,40 +53,33 @@ static std::string create_hash_vocab_file() return hash_file; } -static void BM_subword_tokenizer(benchmark::State& state) +static void bench_subword_tokenizer(nvbench::state& state) { - auto const nrows = static_cast(state.range(0)); - std::vector h_strings(nrows, "This is a test "); + auto const num_rows = static_cast(state.get_int64("num_rows")); + + std::vector h_strings(num_rows, "This is a test "); cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end()); static std::string hash_file = create_hash_vocab_file(); std::vector offsets{14}; - uint32_t max_sequence_length = 64; - uint32_t stride = 48; - uint32_t do_truncate = 0; - uint32_t do_lower = 1; - // - auto vocab = nvtext::load_vocabulary_file(hash_file); - for (auto _ : state) { - cuda_event_timer raii(state, true); - auto result = nvtext::subword_tokenize(cudf::strings_column_view{strings}, - *vocab, - max_sequence_length, - stride, - do_lower, - do_truncate); - } -} + uint32_t max_sequence = 64; + uint32_t stride = 48; + uint32_t do_truncate = 0; + uint32_t do_lower = 1; -class Subword : public cudf::benchmark {}; + auto input = cudf::strings_column_view{strings}; -#define SUBWORD_BM_BENCHMARK_DEFINE(name) \ - BENCHMARK_DEFINE_F(Subword, name)(::benchmark::State & state) { BM_subword_tokenizer(state); } \ - BENCHMARK_REGISTER_F(Subword, name) \ - ->RangeMultiplier(2) \ - ->Range(1 << 10, 1 << 17) \ - ->UseManualTime() \ - ->Unit(benchmark::kMillisecond); + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + auto chars_size = input.chars_size(cudf::get_default_stream()); + state.add_global_memory_reads(chars_size); + state.add_global_memory_writes(num_rows * max_sequence); -SUBWORD_BM_BENCHMARK_DEFINE(BM_subword_tokenizer); + auto vocab = nvtext::load_vocabulary_file(hash_file); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto result = + nvtext::subword_tokenize(input, *vocab, max_sequence, stride, do_lower, do_truncate); + }); +} -// BENCHMARK_MAIN(); +NVBENCH_BENCH(bench_subword_tokenizer) + .set_name("subword_tokenize") + .add_int64_axis("num_rows", {32768, 262144, 2097152}); diff --git a/cpp/benchmarks/text/tokenize.cpp b/cpp/benchmarks/text/tokenize.cpp index e83310e0343..b9590c5539f 100644 --- a/cpp/benchmarks/text/tokenize.cpp +++ b/cpp/benchmarks/text/tokenize.cpp @@ -31,17 +31,13 @@ static void bench_tokenize(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); auto const tokenize_type = state.get_string("type"); - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - data_profile const profile = data_profile_builder() - .distribution(cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width) + .distribution(cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width) .no_validity(); auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); cudf::strings_column_view input(column->view()); @@ -82,6 +78,7 @@ static void bench_tokenize(nvbench::state& state) NVBENCH_BENCH(bench_tokenize) .set_name("tokenize") - 
.add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}) + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_string_axis("type", {"whitespace", "multi", "count", "count_multi", "ngrams", "characters"}); diff --git a/cpp/benchmarks/text/vocab.cpp b/cpp/benchmarks/text/vocab.cpp index 523d277df18..0502f375d99 100644 --- a/cpp/benchmarks/text/vocab.cpp +++ b/cpp/benchmarks/text/vocab.cpp @@ -33,16 +33,12 @@ static void bench_vocab_tokenize(nvbench::state& state) { auto const stream = cudf::get_default_stream(); auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - - auto const column = [num_rows, row_width] { + auto const column = [num_rows, min_width, max_width] { data_profile const profile = data_profile_builder().no_validity().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const col = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); return cudf::strings::filter_characters_of_type( cudf::strings_column_view(col->view()), @@ -85,5 +81,6 @@ static void bench_vocab_tokenize(nvbench::state& state) NVBENCH_BENCH(bench_vocab_tokenize) .set_name("vocab_tokenize") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024}) - .add_int64_axis("num_rows", {262144, 524288, 1048576, 2097152, 4194304, 16777216}); + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}); diff --git a/cpp/benchmarks/text/word_minhash.cpp b/cpp/benchmarks/text/word_minhash.cpp deleted file mode 100644 index adc3dddc59c..00000000000 --- a/cpp/benchmarks/text/word_minhash.cpp +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include - -#include -#include -#include -#include - -#include - -#include - -#include - -static void bench_word_minhash(nvbench::state& state) -{ - auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); - auto const seed_count = static_cast(state.get_int64("seed_count")); - auto const base64 = state.get_int64("hash_type") == 64; - - data_profile const strings_profile = - data_profile_builder().distribution(cudf::type_id::STRING, distribution_id::NORMAL, 0, 5); - auto strings_table = - create_random_table({cudf::type_id::STRING}, row_count{num_rows}, strings_profile); - - auto const num_offsets = (num_rows / row_width) + 1; - auto offsets = cudf::sequence(num_offsets, - cudf::numeric_scalar(0), - cudf::numeric_scalar(row_width)); - - auto source = cudf::make_lists_column(num_offsets - 1, - std::move(offsets), - std::move(strings_table->release().front()), - 0, - rmm::device_buffer{}); - - data_profile const seeds_profile = data_profile_builder().no_validity().distribution( - cudf::type_to_id(), distribution_id::NORMAL, 0, 256); - auto const seed_type = base64 ? cudf::type_id::UINT64 : cudf::type_id::UINT32; - auto const seeds_table = create_random_table({seed_type}, row_count{seed_count}, seeds_profile); - auto seeds = seeds_table->get_column(0); - - state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); - - cudf::strings_column_view input(cudf::lists_column_view(source->view()).child()); - auto chars_size = input.chars_size(cudf::get_default_stream()); - state.add_global_memory_reads(chars_size); - state.add_global_memory_writes(num_rows); // output are hashes - - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - auto result = base64 ? nvtext::word_minhash64(source->view(), seeds.view()) - : nvtext::word_minhash(source->view(), seeds.view()); - }); -} - -NVBENCH_BENCH(bench_word_minhash) - .set_name("word_minhash") - .add_int64_axis("num_rows", {131072, 262144, 524288, 1048576, 2097152}) - .add_int64_axis("row_width", {10, 100, 1000}) - .add_int64_axis("seed_count", {2, 25}) - .add_int64_axis("hash_type", {32, 64}); diff --git a/cpp/cmake/thirdparty/get_spdlog.cmake b/cpp/cmake/thirdparty/get_spdlog.cmake deleted file mode 100644 index 90b0f4d8a8e..00000000000 --- a/cpp/cmake/thirdparty/get_spdlog.cmake +++ /dev/null @@ -1,27 +0,0 @@ -# ============================================================================= -# Copyright (c) 2023-2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. 
-# ============================================================================= - -# Use CPM to find or clone speedlog -function(find_and_configure_spdlog) - - include(${rapids-cmake-dir}/cpm/spdlog.cmake) - rapids_cpm_spdlog( - FMT_OPTION "EXTERNAL_FMT_HO" - INSTALL_EXPORT_SET cudf-exports - BUILD_EXPORT_SET cudf-exports - ) - -endfunction() - -find_and_configure_spdlog() diff --git a/cpp/cmake/thirdparty/patches/cccl_override.json b/cpp/cmake/thirdparty/patches/cccl_override.json index dcf9c1139f9..d5cadce40c2 100644 --- a/cpp/cmake/thirdparty/patches/cccl_override.json +++ b/cpp/cmake/thirdparty/patches/cccl_override.json @@ -3,16 +3,6 @@ "packages" : { "CCCL" : { "patches" : [ - { - "file" : "${current_json_dir}/cccl_symbol_visibility.diff", - "issue" : "Correct symbol visibility issues in libcudacxx [https://github.com/NVIDIA/cccl/pull/1832/]", - "fixed_in" : "2.6" - }, - { - "file" : "${current_json_dir}/thrust_disable_64bit_dispatching.diff", - "issue" : "Remove 64bit dispatching as not needed by libcudf and results in compiling twice as many kernels [https://github.com/rapidsai/cudf/pull/11437]", - "fixed_in" : "" - }, { "file" : "${current_json_dir}/thrust_faster_sort_compile_times.diff", "issue" : "Improve Thrust sort compile times by not unrolling loops for inlined comparators [https://github.com/rapidsai/cudf/pull/10577]", diff --git a/cpp/cmake/thirdparty/patches/cccl_symbol_visibility.diff b/cpp/cmake/thirdparty/patches/cccl_symbol_visibility.diff deleted file mode 100644 index f745d5fa314..00000000000 --- a/cpp/cmake/thirdparty/patches/cccl_symbol_visibility.diff +++ /dev/null @@ -1,27 +0,0 @@ -diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__config b/libcudacxx/include/cuda/std/detail/libcxx/include/__config -index e7c62c031b..5db861853a 100644 ---- a/libcudacxx/include/cuda/std/detail/libcxx/include/__config -+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__config -@@ -1049,7 +1049,6 @@ typedef __char32_t char32_t; - # define _LIBCUDACXX_EXPORTED_FROM_ABI __declspec(dllimport) - # endif - --# define _LIBCUDACXX_TYPE_VIS _LIBCUDACXX_DLL_VIS - # define _LIBCUDACXX_FUNC_VIS _LIBCUDACXX_DLL_VIS - # define _LIBCUDACXX_EXCEPTION_ABI _LIBCUDACXX_DLL_VIS - # define _LIBCUDACXX_HIDDEN -@@ -1448,14 +1447,6 @@ __sanitizer_annotate_contiguous_container(const void*, const void*, const void*, - # define _LIBCUDACXX_WEAK __attribute__((__weak__)) - # endif - --// Redefine some macros for internal use --# if defined(__cuda_std__) --# undef _LIBCUDACXX_FUNC_VIS --# define _LIBCUDACXX_FUNC_VIS _LIBCUDACXX_INLINE_VISIBILITY --# undef _LIBCUDACXX_TYPE_VIS --# define _LIBCUDACXX_TYPE_VIS --# endif // __cuda_std__ -- - // Thread API - # ifndef _LIBCUDACXX_HAS_THREAD_API_EXTERNAL - # if defined(_CCCL_COMPILER_NVRTC) || defined(__EMSCRIPTEN__) diff --git a/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff b/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff deleted file mode 100644 index 6ae1e1c917b..00000000000 --- a/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff +++ /dev/null @@ -1,25 +0,0 @@ -diff --git a/thrust/thrust/system/cuda/detail/dispatch.h b/thrust/thrust/system/cuda/detail/dispatch.h -index 2a3cc4e33..8fb337b26 100644 ---- a/thrust/thrust/system/cuda/detail/dispatch.h -+++ b/thrust/thrust/system/cuda/detail/dispatch.h -@@ -44,8 +44,7 @@ - } \ - else \ - { \ -- auto THRUST_PP_CAT2(count, _fixed) = static_cast(count); \ -- status = call arguments; \ -+ throw std::runtime_error("THRUST_INDEX_TYPE_DISPATCH 
64-bit count is unsupported in libcudf"); \ - } - - /** -@@ -66,9 +65,7 @@ - } \ - else \ - { \ -- auto THRUST_PP_CAT2(count1, _fixed) = static_cast(count1); \ -- auto THRUST_PP_CAT2(count2, _fixed) = static_cast(count2); \ -- status = call arguments; \ -+ throw std::runtime_error("THRUST_DOUBLE_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \ - } - /** - * Dispatch between 32-bit and 64-bit index based versions of the same algorithm diff --git a/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times.diff b/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times.diff index cb0cc55f4d2..5f1981e9806 100644 --- a/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times.diff +++ b/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times.diff @@ -1,20 +1,20 @@ diff --git a/cub/cub/block/block_merge_sort.cuh b/cub/cub/block/block_merge_sort.cuh -index eb76ebb0b..c6c529a50 100644 +index 29510db5e..cf57e5786 100644 --- a/cub/cub/block/block_merge_sort.cuh +++ b/cub/cub/block/block_merge_sort.cuh @@ -95,7 +95,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void SerialMerge( KeyT key1 = keys_shared[keys1_beg]; KeyT key2 = keys_shared[keys2_beg]; - + -#pragma unroll +#pragma unroll 1 for (int item = 0; item < ITEMS_PER_THREAD; ++item) { - bool p = (keys2_beg < keys2_end) && ((keys1_beg >= keys1_end) || compare_op(key2, key1)); -@@ -376,7 +376,7 @@ public: + const bool p = (keys2_beg < keys2_end) && ((keys1_beg >= keys1_end) || compare_op(key2, key1)); +@@ -374,7 +374,7 @@ public: // KeyT max_key = oob_default; - + -#pragma unroll +#pragma unroll 1 for (int item = 1; item < ITEMS_PER_THREAD; ++item) @@ -27,7 +27,7 @@ index 7d9e8622f..da5627306 100644 @@ -87,10 +87,10 @@ StableOddEvenSort(KeyT (&keys)[ITEMS_PER_THREAD], ValueT (&items)[ITEMS_PER_THRE { constexpr bool KEYS_ONLY = ::cuda::std::is_same::value; - + -#pragma unroll +#pragma unroll 1 for (int i = 0; i < ITEMS_PER_THREAD; ++i) diff --git a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md index 1c1052487f2..5032a073b58 100644 --- a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md +++ b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md @@ -1082,15 +1082,15 @@ initialization. If this setting is higher than the compile-time CMake variable, in between the two settings will be excluded from the written log. The available levels are the same as for the CMake variable. * Global logger object exposed via `cudf::logger()` - sets the minimum logging level at runtime. -For example, calling `cudf::logger().set_level(spdlog::level::err)`, will exclude any messages that +For example, calling `cudf::default_logger().set_level(level_enum::err)`, will exclude any messages that are not errors or critical errors. This API should not be used within libcudf to manipulate logging, its purpose is to allow upstream users to configure libcudf logging to fit their application. By default, logging messages are output to stderr. Setting the environment variable `LIBCUDF_DEBUG_LOG_FILE` redirects the log to a file with the specified path (can be relative to the current directory). -Upstream users can also manipulate `cudf::logger().sinks()` to add sinks or divert the log to -standard output or even a custom spdlog sink. +Upstream users can also manipulate `cudf::default_logger().sinks()` to add sinks or divert the log to +standard output. 
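As an illustration of the runtime controls described in the guide text above, a minimal usage sketch follows. Only `cudf::default_logger().set_level(level_enum::err)` and `cudf::default_logger().sinks()` are taken from the guide; the include path and the `cudf::` scoping of `level_enum` are assumptions, not confirmed by this patch.

```cpp
// Hypothetical upstream-application snippet; the header path and the enum
// scoping are assumptions based on the guide excerpt above, not on this patch.
#include <cudf/logger.hpp>

void configure_libcudf_logging()
{
  // Runtime filter: per the guide, anything below error severity is excluded,
  // so only errors and critical errors are emitted.
  cudf::default_logger().set_level(cudf::level_enum::err);

  // The guide also notes that sinks can be added or the log diverted to
  // standard output via cudf::default_logger().sinks(); the concrete sink
  // types are not shown in this patch.
}
```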
# Data Types diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh index 35a39ef9758..ea480b133dc 100644 --- a/cpp/include/cudf/column/column_device_view.cuh +++ b/cpp/include/cudf/column/column_device_view.cuh @@ -33,11 +33,13 @@ #include #include +#include #include #include #include #include +#include /** * @file column_device_view.cuh @@ -56,8 +58,8 @@ namespace CUDF_EXPORT cudf { * */ struct nullate { - struct YES : std::bool_constant {}; - struct NO : std::bool_constant {}; + struct YES : cuda::std::bool_constant {}; + struct NO : cuda::std::bool_constant {}; /** * @brief `nullate::DYNAMIC` defers the determination of nullability to run time rather than * compile time. The calling code is responsible for specifying whether or not nulls are @@ -80,7 +82,7 @@ struct nullate { * @return `true` if nulls are expected in the operation in which this object is applied, * otherwise false */ - constexpr operator bool() const noexcept { return value; } + CUDF_HOST_DEVICE constexpr operator bool() const noexcept { return value; } bool value; ///< True if nulls are expected }; }; @@ -319,14 +321,14 @@ class alignas(16) column_device_view_base { } template - struct has_element_accessor_impl : std::false_type {}; + struct has_element_accessor_impl : cuda::std::false_type {}; template struct has_element_accessor_impl< C, T, - void_t().template element(std::declval()))>> - : std::true_type {}; + void_t().template element(cuda::std::declval()))>> + : cuda::std::true_type {}; }; // @cond // Forward declaration @@ -460,7 +462,7 @@ class alignas(16) column_device_view : public detail::column_device_view_base { */ struct index_element_fn { template () and std::is_unsigned_v)> + CUDF_ENABLE_IF(is_index_type() and std::is_signed_v)> __device__ size_type operator()(column_device_view const& indices, size_type index) { return static_cast(indices.element(index)); @@ -468,10 +470,10 @@ class alignas(16) column_device_view : public detail::column_device_view_base { template () and std::is_unsigned_v))> + CUDF_ENABLE_IF(not(is_index_type() and std::is_signed_v))> __device__ size_type operator()(Args&&... 
args) { - CUDF_UNREACHABLE("dictionary indices must be an unsigned integral type"); + CUDF_UNREACHABLE("dictionary indices must be a signed integral type"); } }; @@ -534,7 +536,7 @@ class alignas(16) column_device_view : public detail::column_device_view_base { * @return `true` if `column_device_view::element()` has a valid overload, `false` otherwise */ template - static constexpr bool has_element_accessor() + CUDF_HOST_DEVICE static constexpr bool has_element_accessor() { return has_element_accessor_impl::value; } @@ -1044,7 +1046,7 @@ class alignas(16) mutable_column_device_view : public detail::column_device_view * @return `true` if `mutable_column_device_view::element()` has a valid overload, `false` */ template - static constexpr bool has_element_accessor() + CUDF_HOST_DEVICE static constexpr bool has_element_accessor() { return has_element_accessor_impl::value; } diff --git a/cpp/include/cudf/detail/aggregation/aggregation.cuh b/cpp/include/cudf/detail/aggregation/aggregation.cuh index de53e7586cd..c30c3d6f4bd 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.cuh +++ b/cpp/include/cudf/detail/aggregation/aggregation.cuh @@ -36,7 +36,7 @@ namespace cudf { namespace detail { template -constexpr bool is_product_supported() +CUDF_HOST_DEVICE constexpr bool is_product_supported() { return is_numeric(); } diff --git a/cpp/include/cudf/detail/copy_if.cuh b/cpp/include/cudf/detail/copy_if.cuh index 4159e324472..9226697a7f6 100644 --- a/cpp/include/cudf/detail/copy_if.cuh +++ b/cpp/include/cudf/detail/copy_if.cuh @@ -16,300 +16,25 @@ #pragma once -#include -#include #include -#include #include #include -#include -#include -#include -#include #include #include #include #include -#include #include -#include -#include #include -#include #include #include -#include -#include #include #include -#include - namespace cudf { namespace detail { -// Compute the count of elements that pass the mask within each block -template -CUDF_KERNEL void compute_block_counts(cudf::size_type* __restrict__ block_counts, - cudf::size_type size, - cudf::size_type per_thread, - Filter filter) -{ - int tid = threadIdx.x + per_thread * block_size * blockIdx.x; - int count = 0; - - for (int i = 0; i < per_thread; i++) { - bool mask_true = (tid < size) && filter(tid); - count += __syncthreads_count(mask_true); - tid += block_size; - } - - if (threadIdx.x == 0) block_counts[blockIdx.x] = count; -} - -// Compute the exclusive prefix sum of each thread's mask value within each block -template -__device__ cudf::size_type block_scan_mask(bool mask_true, cudf::size_type& block_sum) -{ - int offset = 0; - - using BlockScan = cub::BlockScan; - __shared__ typename BlockScan::TempStorage temp_storage; - BlockScan(temp_storage).ExclusiveSum(mask_true, offset, block_sum); - - return offset; -} - -// This kernel scatters data and validity mask of a column based on the -// scan of the boolean mask. The block offsets for the scan are already computed. -// Just compute the scan of the mask in each block and add it to the block's -// output offset. This is the output index of each element. Scattering -// the valid mask is not as easy, because each thread is only responsible for -// one bit. Warp-level processing (ballot) makes this simpler. -// To make scattering efficient, we "coalesce" the block's scattered data and -// valids in shared memory, and then write from shared memory to global memory -// in a contiguous manner. 
-// The has_validity template parameter specializes this kernel for the -// non-nullable case for performance without writing another kernel. -// -// Note: `filter` is not run on indices larger than the input column size -template -__launch_bounds__(block_size) CUDF_KERNEL - void scatter_kernel(cudf::mutable_column_device_view output_view, - cudf::size_type* output_null_count, - cudf::column_device_view input_view, - cudf::size_type const* __restrict__ block_offsets, - cudf::size_type size, - cudf::size_type per_thread, - Filter filter) -{ - T* __restrict__ output_data = output_view.data(); - cudf::bitmask_type* __restrict__ output_valid = output_view.null_mask(); - static_assert(block_size <= 1024, "Maximum thread block size exceeded"); - - int tid = threadIdx.x + per_thread * block_size * blockIdx.x; - cudf::size_type block_offset = block_offsets[blockIdx.x]; - - // one extra warp worth in case the block is not aligned - __shared__ bool temp_valids[has_validity ? block_size + cudf::detail::warp_size : 1]; - __shared__ T temp_data[block_size]; - - cudf::size_type warp_valid_counts{0}; // total valid sum over the `per_thread` loop below - cudf::size_type block_sum = 0; // count passing filter over the `per_thread` loop below - - // Note that since the maximum gridDim.x on all supported GPUs is as big as - // cudf::size_type, this loop is sufficient to cover our maximum column size - // regardless of the value of block_size and per_thread. - for (int i = 0; i < per_thread; i++) { - bool mask_true = (tid < size) && filter(tid); - - cudf::size_type tmp_block_sum = 0; - // get output location using a scan of the mask result - cudf::size_type const local_index = block_scan_mask(mask_true, tmp_block_sum); - block_sum += tmp_block_sum; - - if (has_validity) { - temp_valids[threadIdx.x] = false; // init shared memory - if (threadIdx.x < cudf::detail::warp_size) temp_valids[block_size + threadIdx.x] = false; - __syncthreads(); // wait for init - } - - if (mask_true) { - temp_data[local_index] = input_view.data()[tid]; // scatter data to shared - - // scatter validity mask to shared memory - if (has_validity and input_view.is_valid(tid)) { - // determine aligned offset for this warp's output - cudf::size_type const aligned_offset = block_offset % cudf::detail::warp_size; - temp_valids[local_index + aligned_offset] = true; - } - } - - __syncthreads(); // wait for shared data and validity mask to be complete - - // Copy output data coalesced from shared to global - if (threadIdx.x < tmp_block_sum) - output_data[block_offset + threadIdx.x] = temp_data[threadIdx.x]; - - if (has_validity) { - // Since the valid bools are contiguous in shared memory now, we can use - // __popc to combine them into a single mask element. - // Then, most mask elements can be directly copied from shared to global - // memory. Only the first and last 32-bit mask elements of each block must - // use an atomicOr, because these are where other blocks may overlap. 
- - constexpr int num_warps = block_size / cudf::detail::warp_size; - // account for partial blocks with non-warp-aligned offsets - int const last_index = tmp_block_sum + (block_offset % cudf::detail::warp_size) - 1; - int const last_warp = min(num_warps, last_index / cudf::detail::warp_size); - int const wid = threadIdx.x / cudf::detail::warp_size; - int const lane = threadIdx.x % cudf::detail::warp_size; - - cudf::size_type tmp_warp_valid_counts{0}; - - if (tmp_block_sum > 0 && wid <= last_warp) { - int valid_index = (block_offset / cudf::detail::warp_size) + wid; - - // compute the valid mask for this warp - uint32_t valid_warp = __ballot_sync(0xffff'ffffu, temp_valids[threadIdx.x]); - - // Note the atomicOr's below assume that output_valid has been set to - // all zero before the kernel - if (lane == 0 && valid_warp != 0) { - tmp_warp_valid_counts = __popc(valid_warp); - if (wid > 0 && wid < last_warp) - output_valid[valid_index] = valid_warp; - else { - cuda::atomic_ref ref{ - output_valid[valid_index]}; - ref.fetch_or(valid_warp, cuda::std::memory_order_relaxed); - } - } - - // if the block is full and not aligned then we have one more warp to cover - if ((wid == 0) && (last_warp == num_warps)) { - uint32_t valid_warp = __ballot_sync(0xffff'ffffu, temp_valids[block_size + threadIdx.x]); - if (lane == 0 && valid_warp != 0) { - tmp_warp_valid_counts += __popc(valid_warp); - cuda::atomic_ref ref{ - output_valid[valid_index + num_warps]}; - ref.fetch_or(valid_warp, cuda::std::memory_order_relaxed); - } - } - } - warp_valid_counts += tmp_warp_valid_counts; - } - - block_offset += tmp_block_sum; - tid += block_size; - } - // Compute total null_count for this block and add it to global count - constexpr cudf::size_type leader_lane{0}; - cudf::size_type block_valid_count = - cudf::detail::single_lane_block_sum_reduce(warp_valid_counts); - - if (threadIdx.x == 0) { // one thread computes and adds to null count - cuda::atomic_ref ref{*output_null_count}; - ref.fetch_add(block_sum - block_valid_count, cuda::std::memory_order_relaxed); - } -} - -template -struct DeviceType { - using type = T; -}; - -template -struct DeviceType()>> { - using type = typename T::rep; -}; - -template -struct DeviceType()>> { - using type = typename cudf::device_storage_type_t; -}; - -// Dispatch functor which performs the scatter for fixed column types and gather for other -template -struct scatter_gather_functor { - template ()>* = nullptr> - std::unique_ptr operator()(cudf::column_view const& input, - cudf::size_type const& output_size, - cudf::size_type const* block_offsets, - Filter filter, - cudf::size_type per_thread, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) - { - auto output_column = - cudf::allocate_like(input, output_size, cudf::mask_allocation_policy::RETAIN, stream, mr); - auto output = output_column->mutable_view(); - - bool has_valid = input.nullable(); - - using Type = typename DeviceType::type; - - auto scatter = (has_valid) ? scatter_kernel - : scatter_kernel; - - cudf::detail::grid_1d grid{input.size(), block_size, per_thread}; - - cudf::detail::device_scalar null_count{0, stream}; - if (output.nullable()) { - // Have to initialize the output mask to all zeros because we may update - // it with atomicOr(). 
- CUDF_CUDA_TRY(cudaMemsetAsync(static_cast(output.null_mask()), - 0, - cudf::bitmask_allocation_size_bytes(output.size()), - stream.value())); - } - - auto output_device_view = cudf::mutable_column_device_view::create(output, stream); - auto input_device_view = cudf::column_device_view::create(input, stream); - scatter<<>>(*output_device_view, - null_count.data(), - *input_device_view, - block_offsets, - input.size(), - per_thread, - filter); - - if (has_valid) { output_column->set_null_count(null_count.value(stream)); } - return output_column; - } - - template () and !cudf::is_fixed_point()>* = nullptr> - std::unique_ptr operator()(cudf::column_view const& input, - cudf::size_type const& output_size, - cudf::size_type const*, - Filter filter, - cudf::size_type, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) - { - rmm::device_uvector indices(output_size, stream); - - thrust::copy_if(rmm::exec_policy(stream), - thrust::counting_iterator(0), - thrust::counting_iterator(input.size()), - indices.begin(), - filter); - - auto output_table = cudf::detail::gather(cudf::table_view{{input}}, - indices, - cudf::out_of_bounds_policy::DONT_CHECK, - cudf::detail::negative_index_policy::NOT_ALLOWED, - stream, - mr); - - // There will be only one column - return std::make_unique(std::move(output_table->get_column(0))); - } -}; - /** * @brief Filters `input` using a Filter function object * @@ -319,9 +44,11 @@ struct scatter_gather_functor { * false otherwise. * * @tparam Filter the filter functor type - * @param[in] input The table_view to filter - * @param[in] filter A function object that takes an index and returns a bool - * @return unique_ptr The table generated from filtered `input`. + * @param input The table_view to filter + * @param filter A function object that takes an index and returns a bool + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used for allocating the returned memory + * @return The table generated from filtered `input` */ template std::unique_ptr
copy_if(table_view const& input,
@@ -333,76 +60,22 @@ std::unique_ptr
copy_if(table_view const& input, if (0 == input.num_rows() || 0 == input.num_columns()) { return empty_like(input); } - constexpr int block_size = 256; - cudf::size_type per_thread = - elements_per_thread(compute_block_counts, input.num_rows(), block_size); - cudf::detail::grid_1d grid{input.num_rows(), block_size, per_thread}; - - // temp storage for block counts and offsets - rmm::device_uvector block_counts(grid.num_blocks, stream); - rmm::device_uvector block_offsets(grid.num_blocks + 1, stream); - - // 1. Find the count of elements in each block that "pass" the mask - compute_block_counts<<>>( - block_counts.begin(), input.num_rows(), per_thread, filter); - - // initialize just the first element of block_offsets to 0 since the InclusiveSum below - // starts at the second element. - CUDF_CUDA_TRY(cudaMemsetAsync(block_offsets.begin(), 0, sizeof(cudf::size_type), stream.value())); - - // 2. Find the offset for each block's output using a scan of block counts - if (grid.num_blocks > 1) { - // Determine and allocate temporary device storage - size_t temp_storage_bytes = 0; - cub::DeviceScan::InclusiveSum(nullptr, - temp_storage_bytes, - block_counts.begin(), - block_offsets.begin() + 1, - grid.num_blocks, - stream.value()); - rmm::device_buffer d_temp_storage(temp_storage_bytes, stream); - - // Run exclusive prefix sum - cub::DeviceScan::InclusiveSum(d_temp_storage.data(), - temp_storage_bytes, - block_counts.begin(), - block_offsets.begin() + 1, - grid.num_blocks, - stream.value()); - } - - // As it is InclusiveSum, last value in block_offsets will be output_size - // unless num_blocks == 1, in which case output_size is just block_counts[0] - cudf::size_type output_size{0}; - CUDF_CUDA_TRY(cudaMemcpyAsync( - &output_size, - grid.num_blocks > 1 ? block_offsets.begin() + grid.num_blocks : block_counts.begin(), - sizeof(cudf::size_type), - cudaMemcpyDefault, - stream.value())); + auto indices = rmm::device_uvector(input.num_rows(), stream); + auto const begin = thrust::counting_iterator(0); + auto const end = begin + input.num_rows(); + auto const indices_end = + thrust::copy_if(rmm::exec_policy(stream), begin, end, indices.begin(), filter); - stream.synchronize(); + auto const output_size = static_cast(thrust::distance(indices.begin(), indices_end)); - if (output_size == input.num_rows()) { - return std::make_unique
(input, stream, mr); - } else if (output_size > 0) { - std::vector> out_columns(input.num_columns()); - std::transform(input.begin(), input.end(), out_columns.begin(), [&](auto col_view) { - return cudf::type_dispatcher(col_view.type(), - scatter_gather_functor{}, - col_view, - output_size, - block_offsets.begin(), - filter, - per_thread, - stream, - mr); - }); + // nothing selected + if (output_size == 0) { return empty_like(input); } + // everything selected + if (output_size == input.num_rows()) { return std::make_unique
(input, stream, mr); } - return std::make_unique
(std::move(out_columns)); - } else { - return empty_like(input); - } + auto const map = device_span(indices.data(), output_size); + return cudf::detail::gather( + input, map, out_of_bounds_policy::DONT_CHECK, negative_index_policy::NOT_ALLOWED, stream, mr); } } // namespace detail diff --git a/cpp/include/cudf/detail/copy_if_else.cuh b/cpp/include/cudf/detail/copy_if_else.cuh index 5dc75b1a3fb..a7efb4e6e93 100644 --- a/cpp/include/cudf/detail/copy_if_else.cuh +++ b/cpp/include/cudf/detail/copy_if_else.cuh @@ -44,10 +44,11 @@ __launch_bounds__(block_size) CUDF_KERNEL mutable_column_device_view out, size_type* __restrict__ const valid_count) { - auto tidx = cudf::detail::grid_1d::global_thread_id(); - auto const stride = cudf::detail::grid_1d::grid_stride(); - int const warp_id = tidx / cudf::detail::warp_size; - size_type const warps_per_grid = gridDim.x * block_size / cudf::detail::warp_size; + auto tidx = cudf::detail::grid_1d::global_thread_id(); + + auto const stride = cudf::detail::grid_1d::grid_stride(); + auto const warp_id = tidx / cudf::detail::warp_size; + auto const warps_per_grid = stride / cudf::detail::warp_size; // begin/end indices for the column data size_type const begin = 0; @@ -60,7 +61,7 @@ __launch_bounds__(block_size) CUDF_KERNEL // lane id within the current warp constexpr size_type leader_lane{0}; - int const lane_id = threadIdx.x % cudf::detail::warp_size; + auto const lane_id = threadIdx.x % cudf::detail::warp_size; size_type warp_valid_count{0}; diff --git a/cpp/include/cudf/detail/copy_range.cuh b/cpp/include/cudf/detail/copy_range.cuh index fcb80fe45f7..022c5c40ea0 100644 --- a/cpp/include/cudf/detail/copy_range.cuh +++ b/cpp/include/cudf/detail/copy_range.cuh @@ -56,15 +56,15 @@ CUDF_KERNEL void copy_range_kernel(SourceValueIterator source_value_begin, constexpr cudf::size_type leader_lane{0}; int const lane_id = threadIdx.x % warp_size; - cudf::size_type const tid = threadIdx.x + blockIdx.x * blockDim.x; - int const warp_id = tid / warp_size; + auto const tid = cudf::detail::grid_1d::global_thread_id(); + auto const warp_id = tid / warp_size; cudf::size_type const offset = target.offset(); cudf::size_type const begin_mask_idx = cudf::word_index(offset + target_begin); cudf::size_type const end_mask_idx = cudf::word_index(offset + target_end); cudf::size_type mask_idx = begin_mask_idx + warp_id; - cudf::size_type const masks_per_grid = gridDim.x * blockDim.x / warp_size; + cudf::size_type const masks_per_grid = cudf::detail::grid_1d::grid_stride() / warp_size; cudf::size_type target_offset = begin_mask_idx * warp_size - (offset + target_begin); cudf::size_type source_idx = tid + target_offset; @@ -92,7 +92,7 @@ CUDF_KERNEL void copy_range_kernel(SourceValueIterator source_value_begin, } } - source_idx += blockDim.x * gridDim.x; + source_idx += cudf::detail::grid_1d::grid_stride(); mask_idx += masks_per_grid; } diff --git a/cpp/include/cudf/detail/get_value.cuh b/cpp/include/cudf/detail/get_value.cuh index 5ea0d06039f..1bfb40e5916 100644 --- a/cpp/include/cudf/detail/get_value.cuh +++ b/cpp/include/cudf/detail/get_value.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -17,6 +17,7 @@ #pragma once #include +#include #include #include #include @@ -48,11 +49,9 @@ T get_value(column_view const& col_view, size_type element_index, rmm::cuda_stre CUDF_EXPECTS(data_type(type_to_id()) == col_view.type(), "get_value data type mismatch"); CUDF_EXPECTS(element_index >= 0 && element_index < col_view.size(), "invalid element_index value"); - T result; - CUDF_CUDA_TRY(cudaMemcpyAsync( - &result, col_view.data() + element_index, sizeof(T), cudaMemcpyDefault, stream.value())); - stream.synchronize(); - return result; + return cudf::detail::make_host_vector_sync( + device_span{col_view.data() + element_index, 1}, stream) + .front(); } } // namespace detail diff --git a/cpp/include/cudf/detail/null_mask.cuh b/cpp/include/cudf/detail/null_mask.cuh index 025e2ccc3ec..17ecc0f5539 100644 --- a/cpp/include/cudf/detail/null_mask.cuh +++ b/cpp/include/cudf/detail/null_mask.cuh @@ -67,7 +67,7 @@ CUDF_KERNEL void offset_bitmask_binop(Binop op, size_type source_size_bits, size_type* count_ptr) { - auto const tid = threadIdx.x + blockIdx.x * blockDim.x; + auto const tid = cudf::detail::grid_1d::global_thread_id(); auto const last_bit_index = source_size_bits - 1; auto const last_word_index = cudf::word_index(last_bit_index); @@ -75,7 +75,7 @@ CUDF_KERNEL void offset_bitmask_binop(Binop op, size_type thread_count = 0; for (size_type destination_word_index = tid; destination_word_index < destination.size(); - destination_word_index += blockDim.x * gridDim.x) { + destination_word_index += cudf::detail::grid_1d::grid_stride()) { bitmask_type destination_word = detail::get_mask_offset_word(source[0], destination_word_index, @@ -214,8 +214,7 @@ CUDF_KERNEL void subtract_set_bits_range_boundaries_kernel(bitmask_type const* b { constexpr size_type const word_size_in_bits{detail::size_in_bits()}; - size_type const tid = threadIdx.x + blockIdx.x * blockDim.x; - size_type range_id = tid; + auto range_id = cudf::detail::grid_1d::global_thread_id(); while (range_id < num_ranges) { size_type const first_bit_index = *(first_bit_indices + range_id); @@ -243,7 +242,7 @@ CUDF_KERNEL void subtract_set_bits_range_boundaries_kernel(bitmask_type const* b // Update the null count with the computed delta. 
size_type updated_null_count = *(null_counts + range_id) + delta; *(null_counts + range_id) = updated_null_count; - range_id += blockDim.x * gridDim.x; + range_id += cudf::detail::grid_1d::grid_stride(); } } diff --git a/cpp/include/cudf/detail/utilities/cuda.cuh b/cpp/include/cudf/detail/utilities/cuda.cuh index 61a8e9f7ec3..72cdc3d8067 100644 --- a/cpp/include/cudf/detail/utilities/cuda.cuh +++ b/cpp/include/cudf/detail/utilities/cuda.cuh @@ -74,9 +74,10 @@ class grid_1d { * @param num_threads_per_block The number of threads per block * @return thread_index_type The global thread index */ - static constexpr thread_index_type global_thread_id(thread_index_type thread_id, - thread_index_type block_id, - thread_index_type num_threads_per_block) + __device__ static constexpr thread_index_type global_thread_id( + thread_index_type thread_id, + thread_index_type block_id, + thread_index_type num_threads_per_block) { return thread_id + block_id * num_threads_per_block; } @@ -114,8 +115,8 @@ class grid_1d { * @param num_threads_per_block The number of threads per block * @return thread_index_type The global thread index */ - static constexpr thread_index_type grid_stride(thread_index_type num_threads_per_block, - thread_index_type num_blocks_per_grid) + __device__ static constexpr thread_index_type grid_stride(thread_index_type num_threads_per_block, + thread_index_type num_blocks_per_grid) { return num_threads_per_block * num_blocks_per_grid; } diff --git a/cpp/include/cudf/detail/utilities/device_operators.cuh b/cpp/include/cudf/detail/utilities/device_operators.cuh index 46f424e051b..923cd04479d 100644 --- a/cpp/include/cudf/detail/utilities/device_operators.cuh +++ b/cpp/include/cudf/detail/utilities/device_operators.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -29,6 +29,8 @@ #include #include +#include + #include namespace cudf { @@ -42,7 +44,7 @@ template ()>* = nullptr> CUDF_HOST_DEVICE inline auto min(LHS const& lhs, RHS const& rhs) { - return std::min(lhs, rhs); + return cuda::std::min(lhs, rhs); } /** @@ -53,7 +55,7 @@ template ()>* = nullptr> CUDF_HOST_DEVICE inline auto max(LHS const& lhs, RHS const& rhs) { - return std::max(lhs, rhs); + return cuda::std::max(lhs, rhs); } } // namespace detail @@ -68,22 +70,26 @@ struct DeviceSum { } template ()>* = nullptr> - static constexpr T identity() + CUDF_HOST_DEVICE static constexpr T identity() { return T{typename T::duration{0}}; } template () && !cudf::is_fixed_point()>* = nullptr> - static constexpr T identity() + CUDF_HOST_DEVICE static constexpr T identity() { return T{0}; } template ()>* = nullptr> - static constexpr T identity() + CUDF_HOST_DEVICE static constexpr T identity() { +#ifndef __CUDA_ARCH__ CUDF_FAIL("fixed_point does not yet support device operator identity"); +#else + CUDF_UNREACHABLE("fixed_point does not yet support device operator identity"); +#endif return T{}; } }; @@ -105,7 +111,7 @@ struct DeviceCount { } template - static constexpr T identity() + CUDF_HOST_DEVICE static constexpr T identity() { return T{}; } @@ -125,7 +131,7 @@ struct DeviceMin { template && !cudf::is_dictionary() && !cudf::is_fixed_point()>* = nullptr> - static constexpr T identity() + CUDF_HOST_DEVICE static constexpr T identity() { // chrono types do not have std::numeric_limits specializations and should use T::max() // https://eel.is/c++draft/numeric.limits.general#6 @@ -139,9 +145,13 @@ struct DeviceMin { } template ()>* = nullptr> - static constexpr T identity() + CUDF_HOST_DEVICE static constexpr T identity() { +#ifndef __CUDA_ARCH__ CUDF_FAIL("fixed_point does not yet support DeviceMin identity"); +#else + CUDF_UNREACHABLE("fixed_point does not yet support DeviceMin identity"); +#endif return cuda::std::numeric_limits::max(); } @@ -153,7 +163,7 @@ struct DeviceMin { } template ()>* = nullptr> - static constexpr T identity() + CUDF_HOST_DEVICE static constexpr T identity() { return static_cast(T::max_value()); } @@ -173,7 +183,7 @@ struct DeviceMax { template && !cudf::is_dictionary() && !cudf::is_fixed_point()>* = nullptr> - static constexpr T identity() + CUDF_HOST_DEVICE static constexpr T identity() { // chrono types do not have std::numeric_limits specializations and should use T::min() // https://eel.is/c++draft/numeric.limits.general#6 @@ -187,9 +197,13 @@ struct DeviceMax { } template ()>* = nullptr> - static constexpr T identity() + CUDF_HOST_DEVICE static constexpr T identity() { +#ifndef __CUDA_ARCH__ CUDF_FAIL("fixed_point does not yet support DeviceMax identity"); +#else + CUDF_UNREACHABLE("fixed_point does not yet support DeviceMax identity"); +#endif return cuda::std::numeric_limits::lowest(); } @@ -200,7 +214,7 @@ struct DeviceMax { } template ()>* = nullptr> - static constexpr T identity() + CUDF_HOST_DEVICE static constexpr T identity() { return static_cast(T::lowest_value()); } @@ -217,15 +231,19 @@ struct DeviceProduct { } template ()>* = nullptr> - static constexpr T identity() + CUDF_HOST_DEVICE static constexpr T identity() { return T{1}; } template ()>* = nullptr> - static constexpr T identity() + CUDF_HOST_DEVICE static constexpr T identity() { +#ifndef __CUDA_ARCH__ CUDF_FAIL("fixed_point does not yet support DeviceProduct identity"); +#else + CUDF_UNREACHABLE("fixed_point does not yet support DeviceProduct identity"); +#endif return T{1, numeric::scale_type{0}}; } 
}; diff --git a/cpp/include/cudf/detail/utilities/integer_utils.hpp b/cpp/include/cudf/detail/utilities/integer_utils.hpp index 8b709f2a8f8..2e3d71815c0 100644 --- a/cpp/include/cudf/detail/utilities/integer_utils.hpp +++ b/cpp/include/cudf/detail/utilities/integer_utils.hpp @@ -1,7 +1,7 @@ /* * Copyright 2019 BlazingDB, Inc. * Copyright 2019 Eyal Rozenberg - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -86,7 +86,7 @@ constexpr S round_down_safe(S number_to_round, S modulus) noexcept * `modulus` is positive and does not check for overflow. */ template -constexpr S round_up_unsafe(S number_to_round, S modulus) noexcept +CUDF_HOST_DEVICE constexpr S round_up_unsafe(S number_to_round, S modulus) noexcept { auto remainder = number_to_round % modulus; if (remainder == 0) { return number_to_round; } @@ -134,16 +134,20 @@ constexpr I div_rounding_up_safe(std::integral_constant, I dividend, } // namespace detail /** - * Divides the left-hand-side by the right-hand-side, rounding up + * @brief Divides the left-hand-side by the right-hand-side, rounding up * to an integral multiple of the right-hand-side, e.g. (9,5) -> 2 , (10,5) -> 2, (11,5) -> 3. * - * @param dividend the number to divide - * @param divisor the number of by which to divide - * @return The least integer multiple of {@link divisor} which is greater than or equal to - * the non-integral division dividend/divisor. + * The result is undefined if `divisor == 0` or + * if `divisor == -1` and `dividend == min()`. + * + * Will not overflow, and may _or may not_ be slower than the intuitive + * approach of using `(dividend + divisor - 1) / divisor`. * - * @note will not overflow, and may _or may not_ be slower than the intuitive - * approach of using (dividend + divisor - 1) / divisor + * @tparam I Integer type for `dividend`, `divisor`, and the return type + * @param dividend The number to divide + * @param divisor The number by which to divide + * @return The least integer multiple of `divisor` which is greater than or equal to + * the non-integral division `dividend/divisor` */ template constexpr I div_rounding_up_safe(I dividend, I divisor) noexcept @@ -183,7 +187,7 @@ constexpr bool is_a_power_of_two(I val) noexcept * @return Absolute value if value type is signed. */ template -constexpr auto absolute_value(T value) -> T +CUDF_HOST_DEVICE constexpr auto absolute_value(T value) -> T { if constexpr (cuda::std::is_signed()) return numeric::detail::abs(value); return value; diff --git a/cpp/include/cudf/detail/utilities/logger.hpp b/cpp/include/cudf/detail/utilities/logger.hpp deleted file mode 100644 index e7643eb44bd..00000000000 --- a/cpp/include/cudf/detail/utilities/logger.hpp +++ /dev/null @@ -1,27 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include - -// Log messages that require computation should only be used at level TRACE and DEBUG -#define CUDF_LOG_TRACE(...) SPDLOG_LOGGER_TRACE(&cudf::detail::logger(), __VA_ARGS__) -#define CUDF_LOG_DEBUG(...) SPDLOG_LOGGER_DEBUG(&cudf::detail::logger(), __VA_ARGS__) -#define CUDF_LOG_INFO(...) SPDLOG_LOGGER_INFO(&cudf::detail::logger(), __VA_ARGS__) -#define CUDF_LOG_WARN(...) SPDLOG_LOGGER_WARN(&cudf::detail::logger(), __VA_ARGS__) -#define CUDF_LOG_ERROR(...) SPDLOG_LOGGER_ERROR(&cudf::detail::logger(), __VA_ARGS__) -#define CUDF_LOG_CRITICAL(...) SPDLOG_LOGGER_CRITICAL(&cudf::detail::logger(), __VA_ARGS__) diff --git a/cpp/include/cudf/dictionary/encode.hpp b/cpp/include/cudf/dictionary/encode.hpp index dc81fd74992..ced6bd2afa4 100644 --- a/cpp/include/cudf/dictionary/encode.hpp +++ b/cpp/include/cudf/dictionary/encode.hpp @@ -41,7 +41,7 @@ namespace dictionary { * * The null mask and null count are copied from the input column to the output column. * - * @throw cudf::logic_error if indices type is not an unsigned integer type + * @throw cudf::logic_error if indices type is not a signed integer type * @throw cudf::logic_error if the column to encode is already a DICTIONARY type * * @code{.pseudo} @@ -58,7 +58,7 @@ namespace dictionary { */ std::unique_ptr encode( column_view const& column, - data_type indices_type = data_type{type_id::UINT32}, + data_type indices_type = data_type{type_id::INT32}, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); diff --git a/cpp/include/cudf/fixed_point/detail/floating_conversion.hpp b/cpp/include/cudf/fixed_point/detail/floating_conversion.hpp index fce08b4a5c4..9e68bafb09a 100644 --- a/cpp/include/cudf/fixed_point/detail/floating_conversion.hpp +++ b/cpp/include/cudf/fixed_point/detail/floating_conversion.hpp @@ -22,6 +22,7 @@ #include #include #include +#include #include @@ -183,7 +184,7 @@ struct floating_converter { * @param integer_rep The bit-casted floating value to extract the exponent from * @return The stored base-2 exponent and significand, shifted for denormals */ - CUDF_HOST_DEVICE inline static std::pair get_significand_and_pow2( + CUDF_HOST_DEVICE inline static cuda::std::pair get_significand_and_pow2( IntegralType integer_rep) { // Extract the significand @@ -1008,7 +1009,7 @@ CUDF_HOST_DEVICE inline auto shift_to_binary_pospow(DecimalRep decimal_rep, int } // Our shifting_rep is now the integer mantissa, return it and the powers of 2 - return std::pair{shifting_rep, pow2}; + return cuda::std::pair{shifting_rep, pow2}; } /** @@ -1075,7 +1076,7 @@ CUDF_HOST_DEVICE inline auto shift_to_binary_negpow(DecimalRep decimal_rep, int } // Our shifting_rep is now the integer mantissa, return it and the powers of 2 - return std::pair{shifting_rep, pow2}; + return cuda::std::pair{shifting_rep, pow2}; } /** diff --git a/cpp/include/cudf/hashing/detail/hash_functions.cuh b/cpp/include/cudf/hashing/detail/hash_functions.cuh index 0ec41a20ef1..fd3455e761d 100644 --- a/cpp/include/cudf/hashing/detail/hash_functions.cuh +++ b/cpp/include/cudf/hashing/detail/hash_functions.cuh @@ -18,7 +18,8 @@ #include -#include +#include +#include namespace cudf::hashing::detail { @@ -29,7 +30,7 @@ template T __device__ inline normalize_nans(T const& key) { if constexpr (cudf::is_floating_point()) { - if (std::isnan(key)) { return std::numeric_limits::quiet_NaN(); } + if (cuda::std::isnan(key)) { return cuda::std::numeric_limits::quiet_NaN(); } } return 
key; } diff --git a/cpp/include/cudf/hashing/detail/hashing.hpp b/cpp/include/cudf/hashing/detail/hashing.hpp index a978e54a1b9..7cb80081a95 100644 --- a/cpp/include/cudf/hashing/detail/hashing.hpp +++ b/cpp/include/cudf/hashing/detail/hashing.hpp @@ -82,7 +82,7 @@ std::unique_ptr xxhash_64(table_view const& input, * @param rhs The second hash value * @return Combined hash value */ -constexpr uint32_t hash_combine(uint32_t lhs, uint32_t rhs) +CUDF_HOST_DEVICE constexpr uint32_t hash_combine(uint32_t lhs, uint32_t rhs) { return lhs ^ (rhs + 0x9e37'79b9 + (lhs << 6) + (lhs >> 2)); } diff --git a/cpp/include/cudf/hashing/detail/murmurhash3_x64_128.cuh b/cpp/include/cudf/hashing/detail/murmurhash3_x64_128.cuh index 5e88b905023..31390aa3edf 100644 --- a/cpp/include/cudf/hashing/detail/murmurhash3_x64_128.cuh +++ b/cpp/include/cudf/hashing/detail/murmurhash3_x64_128.cuh @@ -15,177 +15,63 @@ */ #pragma once +#include +#include #include #include -#include +#include +#include +#include namespace cudf::hashing::detail { -// MurmurHash3_x64_128 implementation from -// https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp -//----------------------------------------------------------------------------- -// MurmurHash3 was written by Austin Appleby, and is placed in the public -// domain. The author hereby disclaims copyright to this source code. -// Note - The x86 and x64 versions do _not_ produce the same results, as the -// algorithms are optimized for their respective platforms. You can still -// compile and run any of them on any platform, but your performance with the -// non-native version will be less than optimal. template struct MurmurHash3_x64_128 { - using result_type = thrust::pair; + using result_type = cuda::std::array; - constexpr MurmurHash3_x64_128() = default; - constexpr MurmurHash3_x64_128(uint64_t seed) : m_seed(seed) {} - - __device__ inline uint32_t getblock32(std::byte const* data, cudf::size_type offset) const + CUDF_HOST_DEVICE constexpr MurmurHash3_x64_128(uint64_t seed = cudf::DEFAULT_HASH_SEED) + : _impl{seed} { - // Read a 4-byte value from the data pointer as individual bytes for safe - // unaligned access (very likely for string types). - auto block = reinterpret_cast(data + offset); - return block[0] | (block[1] << 8) | (block[2] << 16) | (block[3] << 24); } - __device__ inline uint64_t getblock64(std::byte const* data, cudf::size_type offset) const - { - uint64_t result = getblock32(data, offset + 4); - result = result << 32; - return result | getblock32(data, offset); - } + __device__ constexpr result_type operator()(Key const& key) const { return this->_impl(key); } - __device__ inline uint64_t fmix64(uint64_t k) const + __device__ constexpr result_type compute_bytes(cuda::std::byte const* bytes, + std::uint64_t size) const { - k ^= k >> 33; - k *= 0xff51afd7ed558ccdUL; - k ^= k >> 33; - k *= 0xc4ceb9fe1a85ec53UL; - k ^= k >> 33; - return k; + return this->_impl.compute_hash(bytes, size); } - result_type __device__ inline operator()(Key const& key) const { return compute(key); } - + private: template - result_type __device__ inline compute(T const& key) const - { - return compute_bytes(reinterpret_cast(&key), sizeof(T)); - } - - result_type __device__ inline compute_remaining_bytes(std::byte const* data, - cudf::size_type len, - cudf::size_type tail_offset, - result_type h) const - { - // Process remaining bytes that do not fill a 8-byte chunk. 
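hash_combine above is now CUDF_HOST_DEVICE, so the same Boost-style mixing step can fold per-column hashes on both host and device. A host-only sketch of that folding is below; the zero seed and the fold order are illustrative, not cuDF's row-hashing policy.

#include <cstdint>
#include <iostream>
#include <vector>

// Same mixing step as the hash_combine above (Boost-style) for 32-bit hash values.
constexpr std::uint32_t hash_combine(std::uint32_t lhs, std::uint32_t rhs)
{
  return lhs ^ (rhs + 0x9e37'79b9 + (lhs << 6) + (lhs >> 2));
}

int main()
{
  // Fold per-column hash values into a single row hash (seed of 0 is illustrative).
  std::vector<std::uint32_t> column_hashes{0x1234u, 0xabcdu, 0x42u};
  std::uint32_t row_hash = 0;
  for (auto h : column_hashes) { row_hash = hash_combine(row_hash, h); }
  std::cout << std::hex << row_hash << '\n';
  return 0;
}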
- uint64_t k1 = 0; - uint64_t k2 = 0; - auto const tail = reinterpret_cast(data) + tail_offset; - switch (len & (BLOCK_SIZE - 1)) { - case 15: k2 ^= static_cast(tail[14]) << 48; - case 14: k2 ^= static_cast(tail[13]) << 40; - case 13: k2 ^= static_cast(tail[12]) << 32; - case 12: k2 ^= static_cast(tail[11]) << 24; - case 11: k2 ^= static_cast(tail[10]) << 16; - case 10: k2 ^= static_cast(tail[9]) << 8; - case 9: - k2 ^= static_cast(tail[8]) << 0; - k2 *= c2; - k2 = rotate_bits_left(k2, 33); - k2 *= c1; - h.second ^= k2; - - case 8: k1 ^= static_cast(tail[7]) << 56; - case 7: k1 ^= static_cast(tail[6]) << 48; - case 6: k1 ^= static_cast(tail[5]) << 40; - case 5: k1 ^= static_cast(tail[4]) << 32; - case 4: k1 ^= static_cast(tail[3]) << 24; - case 3: k1 ^= static_cast(tail[2]) << 16; - case 2: k1 ^= static_cast(tail[1]) << 8; - case 1: - k1 ^= static_cast(tail[0]) << 0; - k1 *= c1; - k1 = rotate_bits_left(k1, 31); - k1 *= c2; - h.first ^= k1; - }; - return h; - } - - result_type __device__ compute_bytes(std::byte const* data, cudf::size_type const len) const + __device__ constexpr result_type compute(T const& key) const { - auto const nblocks = len / BLOCK_SIZE; - uint64_t h1 = m_seed; - uint64_t h2 = m_seed; - - // Process all four-byte chunks. - for (cudf::size_type i = 0; i < nblocks; i++) { - uint64_t k1 = getblock64(data, (i * BLOCK_SIZE)); // 1st 8 bytes - uint64_t k2 = getblock64(data, (i * BLOCK_SIZE) + (BLOCK_SIZE / 2)); // 2nd 8 bytes - - k1 *= c1; - k1 = rotate_bits_left(k1, 31); - k1 *= c2; - - h1 ^= k1; - h1 = rotate_bits_left(h1, 27); - h1 += h2; - h1 = h1 * 5 + 0x52dce729; - - k2 *= c2; - k2 = rotate_bits_left(k2, 33); - k2 *= c1; - - h2 ^= k2; - h2 = rotate_bits_left(h2, 31); - h2 += h1; - h2 = h2 * 5 + 0x38495ab5; - } - - thrust::tie(h1, h2) = compute_remaining_bytes(data, len, nblocks * BLOCK_SIZE, {h1, h2}); - - // Finalize hash. 
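The MurmurHash3_x64_128 rewrite above delegates to cuco::murmurhash3_x64_128 and, together with the cuda::std::isnan change earlier, keeps NaNs normalized before hashing. The host-only sketch below illustrates why that normalization matters for a byte-wise hash; byte_hash (64-bit FNV-1a) and hash_double are illustrative helpers, not cuDF APIs.

#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <iostream>
#include <limits>

// Illustrative byte-wise hash (64-bit FNV-1a); stands in for a Murmur-style hash.
std::uint64_t byte_hash(void const* data, std::size_t size)
{
  auto const* bytes = static_cast<unsigned char const*>(data);
  std::uint64_t h   = 0xcbf29ce484222325ULL;
  for (std::size_t i = 0; i < size; ++i) {
    h ^= bytes[i];
    h *= 0x100000001b3ULL;
  }
  return h;
}

// Same idea as normalize_nans() above: collapse every NaN payload to one canonical
// NaN so that all NaNs hash to the same value.
double normalize(double key)
{
  return std::isnan(key) ? std::numeric_limits<double>::quiet_NaN() : key;
}

std::uint64_t hash_double(double key)
{
  key = normalize(key);
  std::uint64_t bits;
  std::memcpy(&bits, &key, sizeof(bits));
  return byte_hash(&bits, sizeof(bits));
}

int main()
{
  double const a = std::nan("1");  // NaNs with potentially different payload bits
  double const b = std::nan("2");
  std::cout << (hash_double(a) == hash_double(b)) << '\n';  // prints 1
  return 0;
}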
- h1 ^= len; - h2 ^= len; - - h1 += h2; - h2 += h1; - - h1 = fmix64(h1); - h2 = fmix64(h2); - - h1 += h2; - h2 += h1; - - return {h1, h2}; + return this->compute_bytes(reinterpret_cast(&key), sizeof(T)); } - private: - uint64_t m_seed{}; - static constexpr uint32_t BLOCK_SIZE = 16; // 2 x 64-bit = 16 bytes - - static constexpr uint64_t c1 = 0x87c37b91114253d5UL; - static constexpr uint64_t c2 = 0x4cf5ad432745937fUL; + cuco::murmurhash3_x64_128 _impl; }; template <> MurmurHash3_x64_128::result_type __device__ inline MurmurHash3_x64_128::operator()( bool const& key) const { - return compute(key); + return this->compute(key); } template <> MurmurHash3_x64_128::result_type __device__ inline MurmurHash3_x64_128::operator()( float const& key) const { - return compute(normalize_nans(key)); + return this->compute(normalize_nans(key)); } template <> MurmurHash3_x64_128::result_type __device__ inline MurmurHash3_x64_128::operator()( double const& key) const { - return compute(normalize_nans(key)); + return this->compute(normalize_nans(key)); } template <> @@ -193,9 +79,8 @@ MurmurHash3_x64_128::result_type __device__ inline MurmurHash3_x64_128::operator()( cudf::string_view const& key) const { - auto const data = reinterpret_cast(key.data()); - auto const len = key.size_bytes(); - return compute_bytes(data, len); + return this->compute_bytes(reinterpret_cast(key.data()), + key.size_bytes()); } template <> @@ -203,7 +88,7 @@ MurmurHash3_x64_128::result_type __device__ inline MurmurHash3_x64_128::operator()( numeric::decimal32 const& key) const { - return compute(key.value()); + return this->compute(key.value()); } template <> @@ -211,7 +96,7 @@ MurmurHash3_x64_128::result_type __device__ inline MurmurHash3_x64_128::operator()( numeric::decimal64 const& key) const { - return compute(key.value()); + return this->compute(key.value()); } template <> @@ -219,7 +104,7 @@ MurmurHash3_x64_128::result_type __device__ inline MurmurHash3_x64_128::operator()( numeric::decimal128 const& key) const { - return compute(key.value()); + return this->compute(key.value()); } } // namespace cudf::hashing::detail diff --git a/cpp/include/cudf/hashing/detail/murmurhash3_x86_32.cuh b/cpp/include/cudf/hashing/detail/murmurhash3_x86_32.cuh index 38a7d927b9c..e0c7ce840d7 100644 --- a/cpp/include/cudf/hashing/detail/murmurhash3_x86_32.cuh +++ b/cpp/include/cudf/hashing/detail/murmurhash3_x86_32.cuh @@ -33,7 +33,7 @@ template struct MurmurHash3_x86_32 { using result_type = hash_value_type; - __host__ __device__ constexpr MurmurHash3_x86_32(uint32_t seed = cudf::DEFAULT_HASH_SEED) + CUDF_HOST_DEVICE constexpr MurmurHash3_x86_32(uint32_t seed = cudf::DEFAULT_HASH_SEED) : _impl{seed} { } diff --git a/cpp/include/cudf/hashing/detail/xxhash_64.cuh b/cpp/include/cudf/hashing/detail/xxhash_64.cuh index 7d72349e340..d77d040b365 100644 --- a/cpp/include/cudf/hashing/detail/xxhash_64.cuh +++ b/cpp/include/cudf/hashing/detail/xxhash_64.cuh @@ -31,7 +31,7 @@ template struct XXHash_64 { using result_type = std::uint64_t; - __host__ __device__ constexpr XXHash_64(uint64_t seed = cudf::DEFAULT_HASH_SEED) : _impl{seed} {} + CUDF_HOST_DEVICE constexpr XXHash_64(uint64_t seed = cudf::DEFAULT_HASH_SEED) : _impl{seed} {} __device__ constexpr result_type operator()(Key const& key) const { return this->_impl(key); } diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index bfe76d5690c..b561d0989e9 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -410,6 +410,7 @@ class 
parquet_reader_options_builder { * * @param val Boolean value whether to read matching projected and filter columns from mismatched * Parquet sources. + * * @return this for chaining. */ parquet_reader_options_builder& allow_mismatched_pq_schemas(bool val) diff --git a/cpp/include/cudf/strings/detail/utf8.hpp b/cpp/include/cudf/strings/detail/utf8.hpp index 85349a421b1..84957ab9f1d 100644 --- a/cpp/include/cudf/strings/detail/utf8.hpp +++ b/cpp/include/cudf/strings/detail/utf8.hpp @@ -31,7 +31,7 @@ namespace strings::detail { * @param chr Any single byte from a valid UTF-8 character * @return true if this is not the first byte of the character */ -constexpr bool is_utf8_continuation_char(unsigned char chr) +CUDF_HOST_DEVICE constexpr bool is_utf8_continuation_char(unsigned char chr) { // The (0xC0 & 0x80) bit pattern identifies a continuation byte of a character. return (chr & 0xC0) == 0x80; @@ -43,7 +43,10 @@ constexpr bool is_utf8_continuation_char(unsigned char chr) * @param chr Any single byte from a valid UTF-8 character * @return true if this the first byte of the character */ -constexpr bool is_begin_utf8_char(unsigned char chr) { return not is_utf8_continuation_char(chr); } +CUDF_HOST_DEVICE constexpr bool is_begin_utf8_char(unsigned char chr) +{ + return not is_utf8_continuation_char(chr); +} /** * @brief This will return true if the passed in byte could be the start of @@ -55,7 +58,7 @@ constexpr bool is_begin_utf8_char(unsigned char chr) { return not is_utf8_contin * @param byte The byte to be tested * @return true if this can be the first byte of a character */ -constexpr bool is_valid_begin_utf8_char(uint8_t byte) +CUDF_HOST_DEVICE constexpr bool is_valid_begin_utf8_char(uint8_t byte) { // to be the first byte of a valid (up to 4 byte) UTF-8 char, byte must be one of: // 0b0vvvvvvv a 1 byte character @@ -72,7 +75,7 @@ constexpr bool is_valid_begin_utf8_char(uint8_t byte) * @param character Single character * @return Number of bytes */ -constexpr size_type bytes_in_char_utf8(char_utf8 character) +CUDF_HOST_DEVICE constexpr size_type bytes_in_char_utf8(char_utf8 character) { return 1 + static_cast((character & 0x0000'FF00u) > 0) + static_cast((character & 0x00FF'0000u) > 0) + @@ -89,7 +92,7 @@ constexpr size_type bytes_in_char_utf8(char_utf8 character) * @param byte Byte from an encoded character. * @return Number of bytes. */ -constexpr size_type bytes_in_utf8_byte(uint8_t byte) +CUDF_HOST_DEVICE constexpr size_type bytes_in_utf8_byte(uint8_t byte) { return 1 + static_cast((byte & 0xF0) == 0xF0) // 4-byte character prefix + static_cast((byte & 0xE0) == 0xE0) // 3-byte character prefix @@ -104,7 +107,7 @@ constexpr size_type bytes_in_utf8_byte(uint8_t byte) * @param[out] character Single char_utf8 value. * @return The number of bytes in the character */ -constexpr size_type to_char_utf8(char const* str, char_utf8& character) +CUDF_HOST_DEVICE constexpr size_type to_char_utf8(char const* str, char_utf8& character) { size_type const chr_width = bytes_in_utf8_byte(static_cast(*str)); @@ -131,7 +134,7 @@ constexpr size_type to_char_utf8(char const* str, char_utf8& character) * @param[out] str Output array. 
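The utf8.hpp helpers above become CUDF_HOST_DEVICE so the same byte tests run in kernels and on the host. Below is a host-only sketch of how the continuation-byte test ((chr & 0xC0) == 0x80) is typically used to count code points; count_characters is an illustrative helper, not the cuDF API.

#include <cassert>
#include <cstddef>
#include <string>

// A UTF-8 continuation byte has the bit pattern 10xxxxxx, i.e. (chr & 0xC0) == 0x80,
// the same test as is_utf8_continuation_char above.
constexpr bool is_continuation(unsigned char chr) { return (chr & 0xC0) == 0x80; }

// Count code points by counting the bytes that begin a character.
std::size_t count_characters(std::string const& str)
{
  std::size_t count = 0;
  for (unsigned char chr : str) {
    if (!is_continuation(chr)) { ++count; }
  }
  return count;
}

int main()
{
  assert(count_characters("abc") == 3);
  assert(count_characters("caf\xC3\xA9") == 4);    // 2-byte e-acute counts once
  assert(count_characters("\xE2\x82\xAC") == 1);   // 3-byte U+20AC counts once
  return 0;
}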
* @return The number of bytes in the character */ -constexpr inline size_type from_char_utf8(char_utf8 character, char* str) +CUDF_HOST_DEVICE constexpr inline size_type from_char_utf8(char_utf8 character, char* str) { size_type const chr_width = bytes_in_char_utf8(character); for (size_type idx = 0; idx < chr_width; ++idx) { @@ -148,7 +151,7 @@ constexpr inline size_type from_char_utf8(char_utf8 character, char* str) * @param utf8_char Single UTF-8 character to convert. * @return Code-point for the UTF-8 character. */ -constexpr uint32_t utf8_to_codepoint(cudf::char_utf8 utf8_char) +CUDF_HOST_DEVICE constexpr uint32_t utf8_to_codepoint(cudf::char_utf8 utf8_char) { uint32_t unchr = 0; if (utf8_char < 0x0000'0080) // single-byte pass thru @@ -178,7 +181,7 @@ constexpr uint32_t utf8_to_codepoint(cudf::char_utf8 utf8_char) * @param unchr Character code-point to convert. * @return Single UTF-8 character. */ -constexpr cudf::char_utf8 codepoint_to_utf8(uint32_t unchr) +CUDF_HOST_DEVICE constexpr cudf::char_utf8 codepoint_to_utf8(uint32_t unchr) { cudf::char_utf8 utf8 = 0; if (unchr < 0x0000'0080) // single byte utf8 diff --git a/cpp/include/cudf/strings/string_view.cuh b/cpp/include/cudf/strings/string_view.cuh index 34ed3c5618e..f0040e069d8 100644 --- a/cpp/include/cudf/strings/string_view.cuh +++ b/cpp/include/cudf/strings/string_view.cuh @@ -31,6 +31,8 @@ #include #endif +#include + #include // This file should only include device code logic. @@ -75,8 +77,8 @@ __device__ inline size_type characters_in_string(char const* str, size_type byte * @param pos Character position to count to * @return The number of bytes and the left over non-counted position value */ -__device__ inline std::pair bytes_to_character_position(string_view d_str, - size_type pos) +__device__ inline cuda::std::pair bytes_to_character_position( + string_view d_str, size_type pos) { size_type bytes = 0; auto ptr = d_str.data(); @@ -303,7 +305,7 @@ __device__ inline char_utf8 string_view::operator[](size_type pos) const __device__ inline size_type string_view::byte_offset(size_type pos) const { if (length() == size_bytes()) return pos; - return std::get<0>(strings::detail::bytes_to_character_position(*this, pos)); + return cuda::std::get<0>(strings::detail::bytes_to_character_position(*this, pos)); } __device__ inline int string_view::compare(string_view const& in) const @@ -373,24 +375,23 @@ __device__ inline size_type string_view::find_impl(char const* str, size_type pos, size_type count) const { - auto const nchars = length(); - if (!str || pos < 0 || pos > nchars) return npos; - if (count < 0) count = nchars; + if (!str || pos < 0) { return npos; } + if (pos > 0 && pos > length()) { return npos; } // use iterator to help reduce character/byte counting - auto itr = begin() + pos; + auto const itr = begin() + pos; auto const spos = itr.byte_offset(); - auto const epos = ((pos + count) < nchars) ? (itr + count).byte_offset() : size_bytes(); + auto const epos = + (count >= 0) && ((pos + count) < length()) ? (itr + count).byte_offset() : size_bytes(); auto const find_length = (epos - spos) - bytes + 1; + auto const d_target = string_view{str, bytes}; auto ptr = data() + (forward ? spos : (epos - bytes)); for (size_type idx = 0; idx < find_length; ++idx) { - bool match = true; - for (size_type jdx = 0; match && (jdx < bytes); ++jdx) { - match = (ptr[jdx] == str[jdx]); + if (d_target.compare(ptr, bytes) == 0) { + return forward ? pos : character_offset(epos - bytes - idx); } - if (match) { return forward ? 
pos : character_offset(epos - bytes - idx); } // use pos to record the current find position pos += strings::detail::is_begin_utf8_char(*ptr); forward ? ++ptr : --ptr; diff --git a/cpp/include/cudf/table/experimental/row_operators.cuh b/cpp/include/cudf/table/experimental/row_operators.cuh index 3f33c70c29a..8214ea6e83b 100644 --- a/cpp/include/cudf/table/experimental/row_operators.cuh +++ b/cpp/include/cudf/table/experimental/row_operators.cuh @@ -33,6 +33,8 @@ #include #include +#include +#include #include #include #include @@ -48,11 +50,8 @@ #include #include -#include #include -#include #include -#include namespace CUDF_EXPORT cudf { @@ -287,15 +286,16 @@ class device_row_comparator { * `null_order::BEFORE` for all columns. * @param comparator Physical element relational comparison functor. */ - device_row_comparator(Nullate check_nulls, - table_device_view lhs, - table_device_view rhs, - device_span l_dremel_device_views, - device_span r_dremel_device_views, - std::optional> depth = std::nullopt, - std::optional> column_order = std::nullopt, - std::optional> null_precedence = std::nullopt, - PhysicalElementComparator comparator = {}) noexcept + device_row_comparator( + Nullate check_nulls, + table_device_view lhs, + table_device_view rhs, + device_span l_dremel_device_views, + device_span r_dremel_device_views, + cuda::std::optional> depth = cuda::std::nullopt, + cuda::std::optional> column_order = cuda::std::nullopt, + cuda::std::optional> null_precedence = cuda::std::nullopt, + PhysicalElementComparator comparator = {}) noexcept : _lhs{lhs}, _rhs{rhs}, _l_dremel(l_dremel_device_views), @@ -331,9 +331,9 @@ class device_row_comparator { Nullate check_nulls, table_device_view lhs, table_device_view rhs, - std::optional> column_order = std::nullopt, - std::optional> null_precedence = std::nullopt, - PhysicalElementComparator comparator = {}) noexcept + cuda::std::optional> column_order = cuda::std::nullopt, + cuda::std::optional> null_precedence = cuda::std::nullopt, + PhysicalElementComparator comparator = {}) noexcept : _lhs{lhs}, _rhs{rhs}, _l_dremel{}, @@ -410,7 +410,7 @@ class device_row_comparator { return cuda::std::pair(_comparator(_lhs.element(lhs_element_index), _rhs.element(rhs_element_index)), - std::numeric_limits::max()); + cuda::std::numeric_limits::max()); } /** @@ -455,7 +455,7 @@ class device_row_comparator { } if (lcol.num_child_columns() == 0) { - return cuda::std::pair(weak_ordering::EQUIVALENT, std::numeric_limits::max()); + return cuda::std::pair(weak_ordering::EQUIVALENT, cuda::std::numeric_limits::max()); } // Non-empty structs have been modified to only have 1 child when using this. @@ -607,7 +607,7 @@ class device_row_comparator { __device__ constexpr weak_ordering operator()(size_type const lhs_index, size_type const rhs_index) const noexcept { - int last_null_depth = std::numeric_limits::max(); + int last_null_depth = cuda::std::numeric_limits::max(); size_type list_column_index{-1}; for (size_type i = 0; i < _lhs.num_columns(); ++i) { if (_lhs.column(i).type().id() == type_id::LIST) { ++list_column_index; } @@ -626,9 +626,9 @@ class device_row_comparator { // here, otherwise the current code would be failing. auto const [l_dremel_i, r_dremel_i] = _lhs.column(i).type().id() == type_id::LIST - ? std::make_tuple(optional_dremel_view(_l_dremel[list_column_index]), - optional_dremel_view(_r_dremel[list_column_index])) - : std::make_tuple(optional_dremel_view{}, optional_dremel_view{}); + ? 
cuda::std::make_tuple(optional_dremel_view(_l_dremel[list_column_index]), + optional_dremel_view(_r_dremel[list_column_index])) + : cuda::std::make_tuple(optional_dremel_view{}, optional_dremel_view{}); auto element_comp = element_comparator{_check_nulls, _lhs.column(i), @@ -658,9 +658,9 @@ class device_row_comparator { device_span const _l_dremel; device_span const _r_dremel; Nullate const _check_nulls; - std::optional> const _depth; - std::optional> const _column_order; - std::optional> const _null_precedence; + cuda::std::optional> const _depth; + cuda::std::optional> const _column_order; + cuda::std::optional> const _null_precedence; PhysicalElementComparator const _comparator; }; // class device_row_comparator @@ -882,10 +882,10 @@ struct preprocessed_table { * @return Device array containing respective column orders. If no explicit column orders were * specified during the creation of this object then this will be `nullopt`. */ - [[nodiscard]] std::optional> column_order() const + [[nodiscard]] cuda::std::optional> column_order() const { - return _column_order.size() ? std::optional>(_column_order) - : std::nullopt; + return _column_order.size() ? cuda::std::optional>(_column_order) + : cuda::std::nullopt; } /** @@ -895,10 +895,11 @@ struct preprocessed_table { * @return Device array containing respective column null precedence. If no explicit column null * precedences were specified during the creation of this object then this will be `nullopt`. */ - [[nodiscard]] std::optional> null_precedence() const + [[nodiscard]] cuda::std::optional> null_precedence() const { - return _null_precedence.size() ? std::optional>(_null_precedence) - : std::nullopt; + return _null_precedence.size() + ? cuda::std::optional>(_null_precedence) + : cuda::std::nullopt; } /** @@ -909,9 +910,10 @@ struct preprocessed_table { * @return std::optional> Device array containing respective column depths. * If there are no nested columns in the table then this will be `nullopt`. */ - [[nodiscard]] std::optional> depths() const + [[nodiscard]] cuda::std::optional> depths() const { - return _depths.size() ? std::optional>(_depths) : std::nullopt; + return _depths.size() ? cuda::std::optional>(_depths) + : cuda::std::nullopt; } [[nodiscard]] device_span dremel_device_views() const @@ -940,8 +942,8 @@ struct preprocessed_table { rmm::device_uvector const _depths; // Dremel encoding of list columns used for the comparison algorithm - std::optional> _dremel_data; - std::optional> _dremel_device_views; + cuda::std::optional> _dremel_data; + cuda::std::optional> _dremel_device_views; // Intermediate columns generated from transforming nested children columns into // integers columns using `cudf::rank()`, need to be kept alive. @@ -1808,7 +1810,7 @@ class element_hasher { __device__ element_hasher( Nullate nulls, uint32_t seed = DEFAULT_HASH_SEED, - hash_value_type null_hash = std::numeric_limits::max()) noexcept + hash_value_type null_hash = cuda::std::numeric_limits::max()) noexcept : _check_nulls(nulls), _seed(seed), _null_hash(null_hash) { } @@ -1892,7 +1894,7 @@ class device_row_hasher { */ template
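The row_operators.cuh hunks above replace std::optional, std::pair, and std::numeric_limits with their cuda::std counterparts because the std versions cannot be used in device code. A minimal nvcc-compilable sketch of the same substitution follows; clamp_or_default is an illustrative function, not part of cuDF.

#include <cstdio>

#include <cuda/std/limits>
#include <cuda/std/optional>

// cuda::std types are annotated for host and device, unlike std::optional and
// std::numeric_limits, so this function compiles for both sides.
__host__ __device__ int clamp_or_default(cuda::std::optional<int> value)
{
  if (!value.has_value()) { return cuda::std::numeric_limits<int>::max(); }
  return *value < 0 ? 0 : *value;
}

int main()
{
  // Host call shown here; the same body is equally valid inside a __global__ kernel.
  std::printf("%d %d\n", clamp_or_default(cuda::std::nullopt), clamp_or_default(-5));
  return 0;
}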
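Relatedly, the string_view::find_impl rewrite earlier in this diff matches a byte pattern while tracking the character position with the begin-byte test. A simplified host-only sketch of that idea is below; find_char_pos and the local begin-byte helper are illustrative, not the cuDF implementation.

#include <cassert>
#include <cstddef>
#include <cstring>
#include <string>

// Local reimplementation of the test above: any byte that is not a continuation
// byte begins a character.
constexpr bool begins_char(unsigned char chr) { return (chr & 0xC0) != 0x80; }

// Return the character (not byte) index of the first occurrence of target in str,
// or -1 if absent.
long find_char_pos(std::string const& str, std::string const& target)
{
  if (target.size() > str.size()) { return -1; }
  long pos = 0;  // character position of the byte window currently being tested
  for (std::size_t byte = 0; byte + target.size() <= str.size(); ++byte) {
    if (std::memcmp(str.data() + byte, target.data(), target.size()) == 0) { return pos; }
    if (begins_char(static_cast<unsigned char>(str[byte]))) { ++pos; }
  }
  return -1;
}

int main()
{
  assert(find_char_pos("caf\xC3\xA9 latte", "latte") == 5);  // 2-byte e-acute is one character
  assert(find_char_pos("abc", "zzz") == -1);
  return 0;
}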