From 2eb71b28d9607e3dfa5b891cbc40ce53a5d27bc6 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Wed, 24 Apr 2024 16:05:34 -0400 Subject: [PATCH 1/6] Large strings gtest fixture and utilities (#15513) Creates the base class and utilities for testing APIs to produce large strings. The main purpose of the fixture is to enable the large strings environment variable(s) and to setup large test data that can be reused by multiple tests. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - MithunR (https://github.com/mythrocks) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/15513 --- cpp/include/cudf_test/testing_main.hpp | 37 ++++-- cpp/tests/CMakeLists.txt | 9 ++ cpp/tests/copying/concatenate_tests.cpp | 43 ------ cpp/tests/large_strings/concatenate_tests.cpp | 65 ++++++++++ .../large_strings/large_strings_fixture.cpp | 122 ++++++++++++++++++ .../large_strings/large_strings_fixture.hpp | 49 +++++++ cpp/tests/large_strings/merge_tests.cpp | 79 ++++++++++++ cpp/tests/merge/merge_string_test.cpp | 57 -------- 8 files changed, 351 insertions(+), 110 deletions(-) create mode 100644 cpp/tests/large_strings/concatenate_tests.cpp create mode 100644 cpp/tests/large_strings/large_strings_fixture.cpp create mode 100644 cpp/tests/large_strings/large_strings_fixture.hpp create mode 100644 cpp/tests/large_strings/merge_tests.cpp diff --git a/cpp/include/cudf_test/testing_main.hpp b/cpp/include/cudf_test/testing_main.hpp index ecac761f7cb..66b831b917f 100644 --- a/cpp/include/cudf_test/testing_main.hpp +++ b/cpp/include/cudf_test/testing_main.hpp @@ -145,6 +145,25 @@ inline auto parse_cudf_test_opts(int argc, char** argv) } } +/** + * @brief Sets up stream mode memory resource adaptor + * + * The resource adaptor is only set as the current device resource if the + * stream mode is enabled. + * + * The caller must keep the return object alive for the life of the test runs. + * + * @param cmd_opts Command line options returned by parse_cudf_test_opts + * @return Memory resource adaptor + */ +inline auto make_memory_resource_adaptor(cxxopts::ParseResult const& cmd_opts) +{ + auto const rmm_mode = cmd_opts["rmm_mode"].as(); + auto resource = cudf::test::create_memory_resource(rmm_mode); + rmm::mr::set_current_device_resource(resource.get()); + return resource; +} + /** * @brief Sets up stream mode memory resource adaptor * @@ -181,14 +200,12 @@ inline auto make_stream_mode_adaptor(cxxopts::ParseResult const& cmd_opts) * function parses the command line to customize test behavior, like the * allocation mode used for creating the default memory resource. 
*/ -#define CUDF_TEST_PROGRAM_MAIN() \ - int main(int argc, char** argv) \ - { \ - ::testing::InitGoogleTest(&argc, argv); \ - auto const cmd_opts = parse_cudf_test_opts(argc, argv); \ - auto const rmm_mode = cmd_opts["rmm_mode"].as(); \ - auto resource = cudf::test::create_memory_resource(rmm_mode); \ - rmm::mr::set_current_device_resource(resource.get()); \ - auto adaptor = make_stream_mode_adaptor(cmd_opts); \ - return RUN_ALL_TESTS(); \ +#define CUDF_TEST_PROGRAM_MAIN() \ + int main(int argc, char** argv) \ + { \ + ::testing::InitGoogleTest(&argc, argv); \ + auto const cmd_opts = parse_cudf_test_opts(argc, argv); \ + [[maybe_unused]] auto mr = make_memory_resource_adaptor(cmd_opts); \ + [[maybe_unused]] auto adaptor = make_stream_mode_adaptor(cmd_opts); \ + return RUN_ALL_TESTS(); \ } diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index f59e675e1d5..6c56d82007a 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -568,6 +568,15 @@ ConfigureTest( strings/urls_tests.cpp ) +# ################################################################################################## +# * large strings test ---------------------------------------------------------------------------- +ConfigureTest( + LARGE_STRINGS_TEST large_strings/large_strings_fixture.cpp large_strings/merge_tests.cpp + large_strings/concatenate_tests.cpp + GPUS 1 + PERCENT 100 +) + # ################################################################################################## # * json path test -------------------------------------------------------------------------------- ConfigureTest(JSON_PATH_TEST json/json_tests.cpp) diff --git a/cpp/tests/copying/concatenate_tests.cpp b/cpp/tests/copying/concatenate_tests.cpp index 3e2e332936e..c2d1e1d9f4f 100644 --- a/cpp/tests/copying/concatenate_tests.cpp +++ b/cpp/tests/copying/concatenate_tests.cpp @@ -197,49 +197,6 @@ TEST_F(StringColumnTest, ConcatenateTooLarge) EXPECT_THROW(cudf::concatenate(input_cols), std::overflow_error); } -TEST_F(StringColumnTest, ConcatenateLargeStrings) -{ - CUDF_TEST_ENABLE_LARGE_STRINGS(); - auto itr = thrust::constant_iterator( - "abcdefghijklmnopqrstuvwxyABCDEFGHIJKLMNOPQRSTUVWXY"); // 50 bytes - auto input = cudf::test::strings_column_wrapper(itr, itr + 5'000'000); // 250MB - auto view = cudf::column_view(input); - std::vector input_cols; - std::vector splits; - int const multiplier = 10; - for (int i = 0; i < multiplier; ++i) { // 2500MB > 2GB - input_cols.push_back(view); - splits.push_back(view.size() * (i + 1)); - } - splits.pop_back(); // remove last entry - auto result = cudf::concatenate(input_cols); - auto sv = cudf::strings_column_view(result->view()); - EXPECT_EQ(sv.size(), view.size() * multiplier); - EXPECT_EQ(sv.offsets().type(), cudf::data_type{cudf::type_id::INT64}); - - // verify results in sections - auto sliced = cudf::split(result->view(), splits); - for (auto c : sliced) { - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(c, input); - } - - // also test with large strings column as input - { - input_cols.clear(); - input_cols.push_back(input); // regular column - input_cols.push_back(result->view()); // large column - result = cudf::concatenate(input_cols); - sv = cudf::strings_column_view(result->view()); - EXPECT_EQ(sv.size(), view.size() * (multiplier + 1)); - EXPECT_EQ(sv.offsets().type(), cudf::data_type{cudf::type_id::INT64}); - splits.push_back(view.size() * multiplier); - sliced = cudf::split(result->view(), splits); - for (auto c : sliced) { - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(c, input); - } 
- } -} - struct TableTest : public cudf::test::BaseFixture {}; TEST_F(TableTest, ConcatenateTables) diff --git a/cpp/tests/large_strings/concatenate_tests.cpp b/cpp/tests/large_strings/concatenate_tests.cpp new file mode 100644 index 00000000000..aa445bf761b --- /dev/null +++ b/cpp/tests/large_strings/concatenate_tests.cpp @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "large_strings_fixture.hpp" + +#include + +#include +#include +#include + +#include + +struct ConcatenateTest : public cudf::test::StringsLargeTest {}; + +TEST_F(ConcatenateTest, ConcatenateVertical) +{ + auto input = this->long_column(); + auto view = cudf::column_view(input); + std::vector input_cols; + std::vector splits; + int const multiplier = 10; + for (int i = 0; i < multiplier; ++i) { // 2500MB > 2GB + input_cols.push_back(view); + splits.push_back(view.size() * (i + 1)); + } + splits.pop_back(); // remove last entry + auto result = cudf::concatenate(input_cols); + auto sv = cudf::strings_column_view(result->view()); + EXPECT_EQ(sv.size(), view.size() * multiplier); + EXPECT_EQ(sv.offsets().type(), cudf::data_type{cudf::type_id::INT64}); + + // verify results in sections + auto sliced = cudf::split(result->view(), splits); + for (auto c : sliced) { + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(c, input); + } + + // also test with large strings column as input + input_cols.clear(); + input_cols.push_back(input); // regular column + input_cols.push_back(result->view()); // large column + result = cudf::concatenate(input_cols); + sv = cudf::strings_column_view(result->view()); + EXPECT_EQ(sv.size(), view.size() * (multiplier + 1)); + EXPECT_EQ(sv.offsets().type(), cudf::data_type{cudf::type_id::INT64}); + splits.push_back(view.size() * multiplier); + sliced = cudf::split(result->view(), splits); + for (auto c : sliced) { + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(c, input); + } +} diff --git a/cpp/tests/large_strings/large_strings_fixture.cpp b/cpp/tests/large_strings/large_strings_fixture.cpp new file mode 100644 index 00000000000..59e0cd43d05 --- /dev/null +++ b/cpp/tests/large_strings/large_strings_fixture.cpp @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "large_strings_fixture.hpp" + +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include + +namespace cudf::test { +class LargeStringsData { + public: + using DataPointer = std::unique_ptr; + + virtual ~LargeStringsData() {} + + void add_table(std::string_view name, std::unique_ptr&& data) + { + _data[std::string(name)] = std::move(data); + } + + cudf::table_view get_table(std::string_view name) const + { + std::string key{name}; + return _data.find(key) != _data.end() ? _data.at(key)->view() : cudf::table_view{}; + } + + void add_column(std::string_view name, std::unique_ptr&& data) + { + std::vector> cols; + cols.emplace_back(std::move(data)); + _data[std::string(name)] = std::make_unique(std::move(cols)); + } + + cudf::column_view get_column(std::string_view name) const + { + std::string key{name}; + return _data.find(key) != _data.end() ? _data.at(key)->view().column(0) : cudf::column_view{}; + } + + bool has_key(std::string_view name) const { return _data.find(std::string(name)) != _data.end(); } + + protected: + std::map _data; +}; + +cudf::column_view StringsLargeTest::wide_column() +{ + std::string name{"wide1"}; + if (!g_ls_data->has_key(name)) { + auto input = + cudf::test::strings_column_wrapper({"the quick brown fox jumps over the lazy dog", + "the fat cat lays next to the other accénted cat", + "a slow moving turtlé cannot catch the bird", + "which can be composéd together to form a more complete", + "The result does not include the value in the sum in"}); + auto counts = cudf::test::fixed_width_column_wrapper({8, 8, 8, 8, 8}); + auto result = cudf::strings::repeat_strings(cudf::strings_column_view(input), counts); + g_ls_data->add_column(name, std::move(result)); + } + return g_ls_data->get_column(name); +} + +cudf::column_view StringsLargeTest::long_column() +{ + std::string name("long1"); + if (!g_ls_data->has_key(name)) { + auto itr = thrust::constant_iterator( + "abcdefghijklmnopqrstuvwxyABCDEFGHIJKLMNOPQRSTUVWXY"); // 50 bytes + auto input = cudf::test::strings_column_wrapper(itr, itr + 5'000'000); // 250MB + g_ls_data->add_column(name, input.release()); + } + return g_ls_data->get_column(name); +} + +std::unique_ptr StringsLargeTest::get_ls_data() +{ + CUDF_EXPECTS(g_ls_data == nullptr, "invalid call to get_ls_data"); + auto lsd_data = std::make_unique(); + g_ls_data = lsd_data.get(); + return lsd_data; +} + +LargeStringsData* StringsLargeTest::g_ls_data = nullptr; +} // namespace cudf::test + +int main(int argc, char** argv) +{ + ::testing::InitGoogleTest(&argc, argv); + auto const cmd_opts = parse_cudf_test_opts(argc, argv); + // hardcoding the CUDA memory resource to keep from exceeding the pool + auto mr = cudf::test::make_cuda(); + rmm::mr::set_current_device_resource(mr.get()); + auto adaptor = make_stream_mode_adaptor(cmd_opts); + + // create object to automatically be destroyed at the end of main() + auto lsd = cudf::test::StringsLargeTest::get_ls_data(); + + return RUN_ALL_TESTS(); +} diff --git a/cpp/tests/large_strings/large_strings_fixture.hpp b/cpp/tests/large_strings/large_strings_fixture.hpp new file mode 100644 index 00000000000..8827b65f1ce --- /dev/null +++ b/cpp/tests/large_strings/large_strings_fixture.hpp @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include + +namespace cudf::test { +class LargeStringsData; + +/** + * @brief Fixture for creating large strings tests + * + * Stores tests strings columns for reuse by specific tests. + * Creating the test input only once helps speed up the overall tests. + * + * Also automatically enables appropriate large strings environment variables. + */ +struct StringsLargeTest : public cudf::test::BaseFixture { + /** + * @brief Returns a column of long strings + */ + cudf::column_view wide_column(); + + /** + * @brief Returns a long column of strings + */ + cudf::column_view long_column(); + + large_strings_enabler g_ls_enabler; + static LargeStringsData* g_ls_data; + + static std::unique_ptr get_ls_data(); +}; +} // namespace cudf::test diff --git a/cpp/tests/large_strings/merge_tests.cpp b/cpp/tests/large_strings/merge_tests.cpp new file mode 100644 index 00000000000..afe6e424371 --- /dev/null +++ b/cpp/tests/large_strings/merge_tests.cpp @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "large_strings_fixture.hpp" + +#include + +#include +#include +#include + +#include + +struct MergeTest : public cudf::test::StringsLargeTest {}; + +TEST_F(MergeTest, MergeLargeStrings) +{ + auto const input = this->long_column(); + auto input_views = std::vector(); + auto const view = cudf::table_view({input}); + std::vector splits; + int const multiplier = 10; + for (int i = 0; i < multiplier; ++i) { // 2500MB > 2GB + input_views.push_back(view); + splits.push_back(view.num_rows() * (i + 1)); + } + splits.pop_back(); // remove last entry + auto const column_order = std::vector{cudf::order::ASCENDING}; + auto const null_precedence = std::vector{cudf::null_order::AFTER}; + + auto result = cudf::merge(input_views, {0}, column_order, null_precedence); + auto sv = cudf::strings_column_view(result->view().column(0)); + EXPECT_EQ(sv.size(), view.num_rows() * multiplier); + EXPECT_EQ(sv.offsets().type(), cudf::data_type{cudf::type_id::INT64}); + + auto sliced = cudf::split(sv.parent(), splits); + for (auto c : sliced) { + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(c, input); + } + + // also test with large strings column as input + input_views.clear(); + input_views.push_back(view); // regular column + input_views.push_back(result->view()); // large column + result = cudf::merge(input_views, {0}, column_order, null_precedence); + sv = cudf::strings_column_view(result->view().column(0)); + EXPECT_EQ(sv.size(), view.num_rows() * (multiplier + 1)); + EXPECT_EQ(sv.offsets().type(), cudf::data_type{cudf::type_id::INT64}); + splits.push_back(view.num_rows() * multiplier); + sliced = cudf::split(sv.parent(), splits); + for (auto c : sliced) { + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(c, input); + } + + // also check merge still returns 32-bit offsets for regular columns + input_views.clear(); + input_views.push_back(view); + input_views.push_back(view); + result = cudf::merge(input_views, {0}, column_order, null_precedence); + sv = cudf::strings_column_view(result->view().column(0)); + EXPECT_EQ(sv.size(), view.num_rows() * 2); + EXPECT_EQ(sv.offsets().type(), cudf::data_type{cudf::type_id::INT32}); + sliced = cudf::split(sv.parent(), {view.num_rows()}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(sliced[0], input); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(sliced[1], input); +} diff --git a/cpp/tests/merge/merge_string_test.cpp b/cpp/tests/merge/merge_string_test.cpp index d7368d31944..28179a7341c 100644 --- a/cpp/tests/merge/merge_string_test.cpp +++ b/cpp/tests/merge/merge_string_test.cpp @@ -411,60 +411,3 @@ TYPED_TEST(MergeStringTest, Merge2StringKeyNullColumns) CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_column_view2, output_column_view2); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_column_view3, output_column_view3); } - -class MergeLargeStringsTest : public cudf::test::BaseFixture {}; - -TEST_F(MergeLargeStringsTest, MergeLargeStrings) -{ - CUDF_TEST_ENABLE_LARGE_STRINGS(); - auto itr = thrust::constant_iterator( - "abcdefghijklmnopqrstuvwxyABCDEFGHIJKLMNOPQRSTUVWXY"); // 50 bytes - auto const input = cudf::test::strings_column_wrapper(itr, itr + 5'000'000); // 250MB - auto input_views = std::vector(); - auto const view = cudf::table_view({input}); - std::vector splits; - int const multiplier = 10; - for (int i = 0; i < multiplier; ++i) { // 2500MB > 2GB - input_views.push_back(view); - splits.push_back(view.num_rows() * (i + 1)); - } - splits.pop_back(); // remove last entry - auto const column_order = std::vector{cudf::order::ASCENDING}; - auto const null_precedence = 
std::vector{cudf::null_order::AFTER}; - - auto result = cudf::merge(input_views, {0}, column_order, null_precedence); - auto sv = cudf::strings_column_view(result->view().column(0)); - EXPECT_EQ(sv.size(), view.num_rows() * multiplier); - EXPECT_EQ(sv.offsets().type(), cudf::data_type{cudf::type_id::INT64}); - - auto sliced = cudf::split(sv.parent(), splits); - for (auto c : sliced) { - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(c, input); - } - - // also test with large strings column as input - input_views.clear(); - input_views.push_back(view); // regular column - input_views.push_back(result->view()); // large column - result = cudf::merge(input_views, {0}, column_order, null_precedence); - sv = cudf::strings_column_view(result->view().column(0)); - EXPECT_EQ(sv.size(), view.num_rows() * (multiplier + 1)); - EXPECT_EQ(sv.offsets().type(), cudf::data_type{cudf::type_id::INT64}); - splits.push_back(view.num_rows() * multiplier); - sliced = cudf::split(sv.parent(), splits); - for (auto c : sliced) { - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(c, input); - } - - // also check merge still returns 32-bit offsets for regular columns - input_views.clear(); - input_views.push_back(view); - input_views.push_back(view); - result = cudf::merge(input_views, {0}, column_order, null_precedence); - sv = cudf::strings_column_view(result->view().column(0)); - EXPECT_EQ(sv.size(), view.num_rows() * 2); - EXPECT_EQ(sv.offsets().type(), cudf::data_type{cudf::type_id::INT32}); - sliced = cudf::split(sv.parent(), {view.num_rows()}); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(sliced[0], input); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(sliced[1], input); -} From 8b4dc91fbee585e0f03cccc2b60ce7b68baa9a5f Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 24 Apr 2024 10:53:36 -1000 Subject: [PATCH 2/6] Replace RangeIndex._start/_stop/_step with _range (#15576) The `._start/_stop/_step` attributes are wholly redundant with the similar attributes on a `range` object, so replacing with those attributes where needed Authors: - Matthew Roeschke (https://github.com/mroeschke) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/15576 --- python/cudf/cudf/core/index.py | 128 +++++++++++---------------- python/cudf/cudf/tests/test_index.py | 2 +- 2 files changed, 55 insertions(+), 75 deletions(-) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 6f08b1d83b3..e457e818129 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -36,7 +36,6 @@ is_integer, is_list_like, is_scalar, - is_signed_integer_dtype, ) from cudf.core._base_index import BaseIndex from cudf.core._compat import PANDAS_LT_300 @@ -149,6 +148,15 @@ def _index_from_data(data: MutableMapping, name: Any = no_default): return index_class_type._from_data(data, name) +def validate_range_arg(arg, arg_name: Literal["start", "stop", "step"]) -> int: + """Validate start/stop/step argument in RangeIndex.__init__""" + if not is_integer(arg): + raise TypeError( + f"{arg_name} must be an integer, not {type(arg).__name__}" + ) + return int(arg) + + class RangeIndex(BaseIndex, BinaryOperand): """ Immutable Index implementing a monotonic integer range. 
@@ -197,44 +205,29 @@ class RangeIndex(BaseIndex, BinaryOperand): def __init__( self, start, stop=None, step=1, dtype=None, copy=False, name=None ): - if step == 0: - raise ValueError("Step must not be zero.") if not cudf.api.types.is_hashable(name): raise ValueError("Name must be a hashable value.") - if dtype is not None and not is_signed_integer_dtype(dtype): + self._name = name + if dtype is not None and cudf.dtype(dtype).kind != "i": raise ValueError(f"{dtype=} must be a signed integer type") if isinstance(start, range): - therange = start - start = therange.start - stop = therange.stop - step = therange.step - if stop is None: - start, stop = 0, start - if not is_integer(start): - raise TypeError( - f"start must be an integer, not {type(start).__name__}" - ) - self._start = int(start) - if not is_integer(stop): - raise TypeError( - f"stop must be an integer, not {type(stop).__name__}" - ) - self._stop = int(stop) - if step is not None: - if not is_integer(step): - raise TypeError( - f"step must be an integer, not {type(step).__name__}" - ) - self._step = int(step) + self._range = start else: - self._step = 1 - self._index = None - self._name = name - self._range = range(self._start, self._stop, self._step) - # _end is the actual last element of RangeIndex, - # whereas _stop is an upper bound. - self._end = self._start + self._step * (len(self._range) - 1) + if stop is None: + start, stop = 0, start + start = validate_range_arg(start, "start") + stop = validate_range_arg(stop, "stop") + if step is not None: + step = validate_range_arg(step, "step") + else: + step = 1 + try: + self._range = range(start, stop, step) + except ValueError as err: + if step == 0: + raise ValueError("Step must not be zero.") from err + raise def _copy_type_metadata( self, other: RangeIndex, *, override_dtypes=None @@ -251,9 +244,9 @@ def searchsorted( na_position: Literal["first", "last"] = "last", ): assert (len(self) <= 1) or ( - ascending == (self._step > 0) + ascending == (self.step > 0) ), "Invalid ascending flag" - return search_range(value, self.as_range, side=side) + return search_range(value, self._range, side=side) @property # type: ignore @_cudf_nvtx_annotate @@ -271,7 +264,7 @@ def start(self): """ The value of the `start` parameter (0 if this was not supplied). """ - return self._start + return self._range.start @property # type: ignore @_cudf_nvtx_annotate @@ -279,7 +272,7 @@ def stop(self): """ The value of the stop parameter. """ - return self._stop + return self._range.stop @property # type: ignore @_cudf_nvtx_annotate @@ -287,7 +280,7 @@ def step(self): """ The value of the step parameter. 
""" - return self._step + return self._range.step @property # type: ignore @_cudf_nvtx_annotate @@ -368,9 +361,7 @@ def copy(self, name=None, deep=False): name = self.name if name is None else name - return RangeIndex( - start=self._start, stop=self._stop, step=self._step, name=name - ) + return RangeIndex(self._range, name=name) @_cudf_nvtx_annotate def astype(self, dtype, copy: bool = True): @@ -389,8 +380,8 @@ def duplicated(self, keep="first"): @_cudf_nvtx_annotate def __repr__(self): return ( - f"{self.__class__.__name__}(start={self._start}, stop={self._stop}" - f", step={self._step}" + f"{self.__class__.__name__}(start={self.start}, stop={self.stop}" + f", step={self.step}" + ( f", name={pd.io.formats.printing.default_pprint(self.name)}" if self.name is not None @@ -401,16 +392,16 @@ def __repr__(self): @_cudf_nvtx_annotate def __len__(self): - return len(range(self._start, self._stop, self._step)) + return len(self._range) @_cudf_nvtx_annotate def __getitem__(self, index): if isinstance(index, slice): sl_start, sl_stop, sl_step = index.indices(len(self)) - lo = self._start + sl_start * self._step - hi = self._start + sl_stop * self._step - st = self._step * sl_step + lo = self.start + sl_start * self.step + hi = self.start + sl_stop * self.step + st = self.step * sl_step return RangeIndex(start=lo, stop=hi, step=st, name=self._name) elif isinstance(index, Number): @@ -419,18 +410,13 @@ def __getitem__(self, index): index += len_self if not (0 <= index < len_self): raise IndexError("Index out of bounds") - return self._start + index * self._step + return self.start + index * self.step return self._as_int_index()[index] @_cudf_nvtx_annotate def equals(self, other): if isinstance(other, RangeIndex): - if (self._start, self._stop, self._step) == ( - other._start, - other._stop, - other._step, - ): - return True + return self._range == other._range return self._as_int_index().equals(other) @_cudf_nvtx_annotate @@ -442,9 +428,9 @@ def serialize(self): # We don't need to store the GPU buffer for RangeIndexes # cuDF only needs to store start/stop and rehydrate # during de-serialization - header["index_column"]["start"] = self._start - header["index_column"]["stop"] = self._stop - header["index_column"]["step"] = self._step + header["index_column"]["start"] = self.start + header["index_column"]["stop"] = self.stop + header["index_column"]["step"] = self.step frames = [] header["name"] = pickle.dumps(self.name) @@ -484,9 +470,9 @@ def to_pandas( elif arrow_type: raise NotImplementedError(f"{arrow_type=} is not implemented.") return pd.RangeIndex( - start=self._start, - stop=self._stop, - step=self._step, + start=self.start, + stop=self.stop, + step=self.step, dtype=self.dtype, name=self.name, ) @@ -495,19 +481,15 @@ def to_pandas( def is_unique(self): return True - @cached_property - def as_range(self): - return range(self._start, self._stop, self._step) - @cached_property # type: ignore @_cudf_nvtx_annotate def is_monotonic_increasing(self): - return self._step > 0 or len(self) <= 1 + return self.step > 0 or len(self) <= 1 @cached_property # type: ignore @_cudf_nvtx_annotate def is_monotonic_decreasing(self): - return self._step < 0 or len(self) <= 1 + return self.step < 0 or len(self) <= 1 @_cudf_nvtx_annotate def memory_usage(self, deep=False): @@ -590,12 +572,12 @@ def get_indexer(self, target, limit=None, method=None, tolerance=None): def get_loc(self, key): if not is_scalar(key): raise TypeError("Should be a scalar-like") - idx = (key - self._start) / self._step - idx_int_upper_bound = 
(self._stop - self._start) // self._step + idx = (key - self.start) / self.step + idx_int_upper_bound = (self.stop - self.start) // self.step if idx > idx_int_upper_bound or idx < 0: raise KeyError(key) - idx_int = (key - self._start) // self._step + idx_int = (key - self.start) // self.step if idx_int != idx: raise KeyError(key) return idx_int @@ -607,9 +589,9 @@ def _union(self, other, sort=None): # following notation: *_o -> other, *_s -> self, # and *_r -> result start_s, step_s = self.start, self.step - end_s = self._end + end_s = self.start + self.step * (len(self) - 1) start_o, step_o = other.start, other.step - end_o = other._end + end_o = other.start + other.step * (len(other) - 1) if self.step < 0: start_s, step_s, end_s = end_s, -step_s, start_s if other.step < 0: @@ -854,9 +836,7 @@ def argsort( raise ValueError(f"invalid na_position: {na_position}") indices = cupy.arange(0, len(self)) - if (ascending and self._step < 0) or ( - not ascending and self._step > 0 - ): + if (ascending and self.step < 0) or (not ascending and self.step > 0): indices = indices[::-1] return indices diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index ebbca57bd40..08a7a9148dd 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -1606,7 +1606,7 @@ def test_rangeindex_name_not_hashable(): def test_index_rangeindex_search_range(): # step > 0 ridx = RangeIndex(-13, 17, 4) - ri = ridx.as_range + ri = ridx._range for i in range(len(ridx)): assert i == search_range(ridx[i], ri, side="left") assert i + 1 == search_range(ridx[i], ri, side="right") From 70a5b2bda500fe46cd14860b4e2ca0109893c434 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 24 Apr 2024 13:40:03 -1000 Subject: [PATCH 3/6] Don't materialize column during RangeIndex methods (#15582) Additionally implements some methods that are defined on `BaseIndex` that were not implemented on `RangeIndex` and adds some typing Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/15582 --- python/cudf/cudf/core/_base_index.py | 10 ++- python/cudf/cudf/core/index.py | 108 +++++++++++++++++---------- python/cudf/cudf/tests/test_index.py | 23 ++++++ 3 files changed, 100 insertions(+), 41 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index de44f392eef..b5630ff9a54 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -517,7 +517,7 @@ def where(self, cond, other=None, inplace=False): """ raise NotImplementedError - def factorize(self, sort=False, na_sentinel=None, use_na_sentinel=None): + def factorize(self, sort: bool = False, use_na_sentinel: bool = True): raise NotImplementedError def union(self, other, sort=None): @@ -2061,7 +2061,13 @@ def dropna(self, how="any"): one null value. "all" drops only rows containing *all* null values. 
""" - + if how not in {"any", "all"}: + raise ValueError(f"{how=} must be 'any' or 'all'") + try: + if not self.hasnans: + return self.copy() + except NotImplementedError: + pass # This is to be consistent with IndexedFrame.dropna to handle nans # as nulls by default data_columns = [ diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index e457e818129..6c0acdc5fb0 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -21,6 +21,7 @@ import cupy import numpy as np import pandas as pd +import pyarrow as pa from typing_extensions import Self import cudf @@ -248,6 +249,15 @@ def searchsorted( ), "Invalid ascending flag" return search_range(value, self._range, side=side) + def factorize(self, sort: bool = False, use_na_sentinel: bool = True): + if sort and self.step < 0: + codes = cupy.arange(len(self) - 1, -1, -1) + uniques = self[::-1] + else: + codes = cupy.arange(len(self), dtype=np.intp) + uniques = self + return codes, uniques + @property # type: ignore @_cudf_nvtx_annotate def name(self): @@ -260,7 +270,7 @@ def name(self, value): @property # type: ignore @_cudf_nvtx_annotate - def start(self): + def start(self) -> int: """ The value of the `start` parameter (0 if this was not supplied). """ @@ -268,7 +278,7 @@ def start(self): @property # type: ignore @_cudf_nvtx_annotate - def stop(self): + def stop(self) -> int: """ The value of the stop parameter. """ @@ -276,7 +286,7 @@ def stop(self): @property # type: ignore @_cudf_nvtx_annotate - def step(self): + def step(self) -> int: """ The value of the step parameter. """ @@ -284,7 +294,7 @@ def step(self): @property # type: ignore @_cudf_nvtx_annotate - def _num_rows(self): + def _num_rows(self) -> int: return len(self) @cached_property # type: ignore @@ -295,33 +305,33 @@ def _values(self): else: return column.column_empty(0, masked=False, dtype=self.dtype) - def _clean_nulls_from_index(self): + def _clean_nulls_from_index(self) -> Self: return self - def _is_numeric(self): + def _is_numeric(self) -> bool: return True - def _is_boolean(self): + def _is_boolean(self) -> bool: return False - def _is_integer(self): + def _is_integer(self) -> bool: return True - def _is_floating(self): + def _is_floating(self) -> bool: return False - def _is_object(self): + def _is_object(self) -> bool: return False - def _is_categorical(self): + def _is_categorical(self) -> bool: return False - def _is_interval(self): + def _is_interval(self) -> bool: return False @property # type: ignore @_cudf_nvtx_annotate - def hasnans(self): + def hasnans(self) -> bool: return False @property # type: ignore @@ -369,12 +379,15 @@ def astype(self, dtype, copy: bool = True): return self return self._as_int_index().astype(dtype, copy=copy) + def fillna(self, value, downcast=None): + return self.copy() + @_cudf_nvtx_annotate def drop_duplicates(self, keep="first"): return self @_cudf_nvtx_annotate - def duplicated(self, keep="first"): + def duplicated(self, keep="first") -> cupy.ndarray: return cupy.zeros(len(self), dtype=bool) @_cudf_nvtx_annotate @@ -390,6 +403,11 @@ def __repr__(self): + ")" ) + @property + @_cudf_nvtx_annotate + def size(self) -> int: + return len(self) + @_cudf_nvtx_annotate def __len__(self): return len(self._range) @@ -478,12 +496,12 @@ def to_pandas( ) @property - def is_unique(self): + def is_unique(self) -> bool: return True @cached_property # type: ignore @_cudf_nvtx_annotate - def is_monotonic_increasing(self): + def is_monotonic_increasing(self) -> bool: return self.step > 0 or len(self) <= 1 
@cached_property # type: ignore @@ -492,7 +510,7 @@ def is_monotonic_decreasing(self): return self.step < 0 or len(self) <= 1 @_cudf_nvtx_annotate - def memory_usage(self, deep=False): + def memory_usage(self, deep: bool = False) -> int: if deep: warnings.warn( "The deep parameter is ignored and is only included " @@ -500,7 +518,7 @@ def memory_usage(self, deep=False): ) return 0 - def unique(self): + def unique(self) -> Self: # RangeIndex always has unique values return self @@ -823,34 +841,37 @@ def _columns(self): @property # type: ignore @_cudf_nvtx_annotate - def values_host(self): - return self.to_pandas().values + def values_host(self) -> np.ndarray: + return np.arange(start=self.start, stop=self.stop, step=self.step) @_cudf_nvtx_annotate def argsort( self, ascending=True, na_position="last", - ): + ) -> cupy.ndarray: if na_position not in {"first", "last"}: raise ValueError(f"invalid na_position: {na_position}") - - indices = cupy.arange(0, len(self)) if (ascending and self.step < 0) or (not ascending and self.step > 0): - indices = indices[::-1] - return indices + return cupy.arange(len(self) - 1, -1, -1) + else: + return cupy.arange(len(self)) @_cudf_nvtx_annotate def where(self, cond, other=None, inplace=False): return self._as_int_index().where(cond, other, inplace) @_cudf_nvtx_annotate - def to_numpy(self): + def to_numpy(self) -> np.ndarray: return self.values_host @_cudf_nvtx_annotate - def to_arrow(self): - return self._as_int_index().to_arrow() + def to_cupy(self) -> cupy.ndarray: + return self.values + + @_cudf_nvtx_annotate + def to_arrow(self) -> pa.Array: + return pa.array(self._range, type=pa.from_numpy_dtype(self.dtype)) def __array__(self, dtype=None): raise TypeError( @@ -861,17 +882,17 @@ def __array__(self, dtype=None): ) @_cudf_nvtx_annotate - def nunique(self): + def nunique(self) -> int: return len(self) @_cudf_nvtx_annotate - def isna(self): + def isna(self) -> cupy.ndarray: return cupy.zeros(len(self), dtype=bool) isnull = isna @_cudf_nvtx_annotate - def notna(self): + def notna(self) -> cupy.ndarray: return cupy.ones(len(self), dtype=bool) notnull = isna @@ -895,12 +916,15 @@ def max(self): return self._minmax("max") @property - def values(self): + def values(self) -> cupy.ndarray: return cupy.arange(self.start, self.stop, self.step) - def any(self): + def any(self) -> bool: return any(self._range) + def all(self) -> bool: + return 0 not in self._range + def append(self, other): result = self._as_int_index().append(other) return self._try_reconstruct_range_index(result) @@ -926,14 +950,20 @@ def isin(self, values): return self._values.isin(values).values - def __neg__(self): - return -self._as_int_index() + def __pos__(self) -> Self: + return self.copy() - def __pos__(self): - return +self._as_int_index() + def __neg__(self) -> Self: + rng = range(-self.start, -self.stop, -self.step) + return type(self)(rng, name=self.name) - def __abs__(self): - return abs(self._as_int_index()) + def __abs__(self) -> Self | Index: + if len(self) == 0 or self.min() >= 0: + return self.copy() + elif self.max() <= 0: + return -self + else: + return abs(self._as_int_index()) @_warn_no_dask_cudf def __dask_tokenize__(self): diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 08a7a9148dd..c7875b81440 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -3176,3 +3176,26 @@ def test_index_to_pandas_arrow_type(scalar): result = idx.to_pandas(arrow_type=True) expected = 
pd.Index(pd.arrays.ArrowExtensionArray(pa_array)) pd.testing.assert_index_equal(result, expected) + + +@pytest.mark.parametrize("data", [range(-3, 3), range(1, 3), range(0)]) +def test_rangeindex_all(data): + result = cudf.RangeIndex(data).all() + expected = cudf.Index(list(data)).all() + assert result == expected + + +@pytest.mark.parametrize("sort", [True, False]) +@pytest.mark.parametrize("data", [range(2), range(2, -1, -1)]) +def test_rangeindex_factorize(sort, data): + res_codes, res_uniques = cudf.RangeIndex(data).factorize(sort=sort) + exp_codes, exp_uniques = cudf.Index(list(data)).factorize(sort=sort) + assert_eq(res_codes, exp_codes) + assert_eq(res_uniques, exp_uniques) + + +def test_rangeindex_dropna(): + ri = cudf.RangeIndex(range(2)) + result = ri.dropna() + expected = ri.copy() + assert_eq(result, expected) From 4dc9ebbfe5b2a22949c5f24114918e4369d055cd Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Thu, 25 Apr 2024 08:53:11 -0400 Subject: [PATCH 4/6] Improve performance for cudf::strings::count_re (#15578) Improves performance of `cudf::strings::count_re` when pattern starts with a literal character. Although this is a specific use case, the regex code has special logic to help speed up the search in this case. Since the pattern indicates the target must contain this character as the start of the matching sequence, it first does a normal find for the character before continuing matching the remaining pattern. The `find()` function can be inefficient for long strings since it is character based and must resolve the character's byte-position by counting from the beginning of the string. For a function like `count_re()` all occurrences are matched within a target meaning longer target strings can incur expensive counting. The solution included here is to introduce a more efficient `find_char()` utility that accepts a `string_view::const_iterator()` which automatically keeps track of its byte and character positions. This helps minimize byte/character counting in between calls from `count_re()` and other similar functions that make repeated calls for all matches (e.g. `replace_re()` and `split_re()`). 
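As a rough illustration of the idea only (not the device code in this PR; `utf8_cursor`, `find_char`, and `count_matches` below are hypothetical names over a plain `std::string`), a cursor that tracks both its byte offset and its character position lets each scan resume from the previous match instead of re-counting characters from the start of the string:

```cpp
#include <cstddef>
#include <cstdint>
#include <string>

// Hypothetical sketch: a cursor that tracks both the byte offset and the
// character (code point) position while walking a UTF-8 buffer, so a search
// can resume from the previous match instead of re-counting from byte 0.
struct utf8_cursor {
  std::string const* str;
  std::size_t byte_offset   = 0;  // current byte position
  std::size_t char_position = 0;  // current character position

  bool at_end() const { return byte_offset >= str->size(); }

  // Advance one code point: skip any UTF-8 continuation bytes (0b10xxxxxx)
  void next()
  {
    ++byte_offset;
    while (!at_end() && (static_cast<std::uint8_t>((*str)[byte_offset]) & 0xC0) == 0x80) {
      ++byte_offset;
    }
    ++char_position;
  }
};

// Scan forward until `chr` (an ASCII byte, for simplicity) is found;
// the cursor's character position stays up to date as a side effect.
utf8_cursor find_char(char chr, utf8_cursor itr)
{
  while (!itr.at_end() && (*itr.str)[itr.byte_offset] != chr) {
    itr.next();
  }
  return itr;
}

// Count all occurrences in a single pass: each find_char call resumes from
// the previous match, so nothing is ever re-counted from the beginning.
int count_matches(std::string const& s, char chr)
{
  int count = 0;
  utf8_cursor itr{&s};
  while (true) {
    itr = find_char(chr, itr);
    if (itr.at_end()) break;
    ++count;
    itr.next();  // continue just past the match
  }
  return count;
}
```

The device-side `find_char()` added to `regex.inl` below applies the same idea using `cudf::string_view::const_iterator`, which already tracks its byte offset and character position.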
Close #15567 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Yunsong Wang (https://github.com/PointKernel) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/15578 --- cpp/benchmarks/string/contains.cpp | 4 ++-- cpp/benchmarks/string/count.cpp | 12 ++++++++---- cpp/src/strings/regex/regex.inl | 19 ++++++++++++++----- 3 files changed, 24 insertions(+), 11 deletions(-) diff --git a/cpp/benchmarks/string/contains.cpp b/cpp/benchmarks/string/contains.cpp index 6d839c1de64..ae6c8b844c8 100644 --- a/cpp/benchmarks/string/contains.cpp +++ b/cpp/benchmarks/string/contains.cpp @@ -80,7 +80,7 @@ std::unique_ptr build_input_column(cudf::size_type n_rows, } // longer pattern lengths demand more working memory per string -std::string patterns[] = {"^\\d+ [a-z]+", "[A-Z ]+\\d+ +\\d+[A-Z]+\\d+$"}; +std::string patterns[] = {"^\\d+ [a-z]+", "[A-Z ]+\\d+ +\\d+[A-Z]+\\d+$", "5W43"}; static void bench_contains(nvbench::state& state) { @@ -114,4 +114,4 @@ NVBENCH_BENCH(bench_contains) .add_int64_axis("row_width", {32, 64, 128, 256, 512}) .add_int64_axis("num_rows", {32768, 262144, 2097152, 16777216}) .add_int64_axis("hit_rate", {50, 100}) // percentage - .add_int64_axis("pattern", {0, 1}); + .add_int64_axis("pattern", {0, 1, 2}); diff --git a/cpp/benchmarks/string/count.cpp b/cpp/benchmarks/string/count.cpp index a656010dca5..f964bc5d224 100644 --- a/cpp/benchmarks/string/count.cpp +++ b/cpp/benchmarks/string/count.cpp @@ -25,10 +25,13 @@ #include +static std::string patterns[] = {"\\d+", "a"}; + static void bench_count(nvbench::state& state) { - auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const row_width = static_cast(state.get_int64("row_width")); + auto const pattern_index = static_cast(state.get_int64("pattern")); if (static_cast(num_rows) * static_cast(row_width) >= static_cast(std::numeric_limits::max())) { @@ -41,7 +44,7 @@ static void bench_count(nvbench::state& state) create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile); cudf::strings_column_view input(table->view().column(0)); - std::string pattern = "\\d+"; + auto const pattern = patterns[pattern_index]; auto prog = cudf::strings::regex_program::create(pattern); @@ -59,4 +62,5 @@ static void bench_count(nvbench::state& state) NVBENCH_BENCH(bench_count) .set_name("count") .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}); + .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}) + .add_int64_axis("pattern", {0, 1}); diff --git a/cpp/src/strings/regex/regex.inl b/cpp/src/strings/regex/regex.inl index ce12dc17aa4..10e06505094 100644 --- a/cpp/src/strings/regex/regex.inl +++ b/cpp/src/strings/regex/regex.inl @@ -217,6 +217,15 @@ __device__ __forceinline__ reprog_device reprog_device::load(reprog_device const : reinterpret_cast(buffer)[0]; } +__device__ __forceinline__ static string_view::const_iterator find_char( + cudf::char_utf8 chr, string_view const d_str, string_view::const_iterator itr) +{ + while (itr.byte_offset() < d_str.size_bytes() && *itr != chr) { + ++itr; + } + return itr; +} + /** * @brief Evaluate a specific string against regex pattern compiled to this instance. 
* @@ -253,16 +262,16 @@ __device__ __forceinline__ match_result reprog_device::regexec(string_view const case BOL: if (pos == 0) break; if (jnk.startchar != '^') { return thrust::nullopt; } - --pos; + --itr; startchar = static_cast('\n'); case CHAR: { - auto const fidx = dstr.find(startchar, pos); - if (fidx == string_view::npos) { return thrust::nullopt; } - pos = fidx + (jnk.starttype == BOL); + auto const find_itr = find_char(startchar, dstr, itr); + if (find_itr.byte_offset() >= dstr.size_bytes()) { return thrust::nullopt; } + itr = find_itr + (jnk.starttype == BOL); + pos = itr.position(); break; } } - itr += (pos - itr.position()); // faster to increment position } if (((eos < 0) || (pos < eos)) && match == 0) { From 65c2b53602d70f7f50c7dd7544ca0fd07ac8b455 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Thu, 25 Apr 2024 15:12:01 -0400 Subject: [PATCH 5/6] Fix debug warnings/errors in from_arrow_device_test.cpp (#15596) Fixes debug build errors introduced by #15458 These warnings show up in a debug build where warnings become errors. Some of the errors: ``` /cudf/cpp/tests/interop/from_arrow_device_test.cpp:103:27: error: ignoring return value of 'ArrowErrorCode cudfArrowSchemaSetTypeStruct(ArrowSchema*, int64_t)' declared with attribute 'warn_unused_result' [-Werror=unused-result] 103 | ArrowSchemaSetTypeStruct(input_schema.get(), 1); /cudf/cpp/tests/interop/from_arrow_device_test.cpp:105:29: error: ignoring return value of 'ArrowErrorCode cudfArrowSchemaSetTypeDateTime(ArrowSchema*, ArrowType, ArrowTimeUnit, const char*)' declared with attribute 'warn_unused_result' [-Werror=unused-result] 105 | ArrowSchemaSetTypeDateTime( /cudf/cpp/tests/interop/from_arrow_device_test.cpp:107:21: error: ignoring return value of 'ArrowErrorCode cudfArrowSchemaSetName(ArrowSchema*, const char*)' declared with attribute 'warn_unused_result' [-Werror=unused-result] 107 | ArrowSchemaSetName(input_schema->children[0], "a"); /cudf/cpp/tests/interop/from_arrow_device_test.cpp:110:27: error: ignoring return value of 'ArrowErrorCode cudfArrowArrayInitFromSchema(ArrowArray*, const ArrowSchema*, ArrowError*)' declared with attribute 'warn_unused_result' [-Werror=unused-result] 110 | ArrowArrayInitFromSchema(input_array.get(), input_schema.get(), nullptr); /cudf/cpp/tests/interop/from_arrow_device_test.cpp:115:26: error: ignoring return value of 'ArrowErrorCode ArrowBufferSetAllocator(ArrowBuffer*, ArrowBufferAllocator)' declared with attribute 'warn_unused_result' [-Werror=unused-result] 115 | ArrowBufferSetAllocator(ArrowArrayBuffer(input_array->children[0], 1), noop_alloc); | ~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ /cudf/cpp/tests/interop/from_arrow_device_test.cpp:118:27: error: ignoring return value of 'ArrowErrorCode cudfArrowArrayFinishBuilding(ArrowArray*, ArrowValidationLevel, ArrowError*)' declared with attribute 'warn_unused_result' [-Werror=unused-result] 118 | ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_MINIMAL, nullptr); /cudf/cpp/tests/interop/from_arrow_device_test.cpp: In member function 'virtual void FromArrowDeviceTest_NestedList_Test::TestBody()': /cudf/cpp/tests/interop/from_arrow_device_test.cpp:202:27: error: ignoring return value of 'ArrowErrorCode cudfArrowSchemaSetTypeStruct(ArrowSchema*, int64_t)' declared with attribute 'warn_unused_result' [-Werror=unused-result] 202 | ArrowSchemaSetTypeStruct(input_schema.get(), 1); 
/cudf/cpp/tests/interop/from_arrow_device_test.cpp:204:26: error: ignoring return value of 'ArrowErrorCode cudfArrowSchemaInitFromType(ArrowSchema*, ArrowType)' declared with attribute 'warn_unused_result' [-Werror=unused-result] 204 | ArrowSchemaInitFromType(input_schema->children[0], NANOARROW_TYPE_LIST); /cudf/cpp/tests/interop/from_arrow_device_test.cpp:205:21: error: ignoring return value of 'ArrowErrorCode cudfArrowSchemaSetName(ArrowSchema*, const char*)' declared with attribute 'warn_unused_result' [-Werror=unused-result] 205 | ArrowSchemaSetName(input_schema->children[0], "a"); /cudf/cpp/tests/interop/from_arrow_device_test.cpp:208:26: error: ignoring return value of 'ArrowErrorCode cudfArrowSchemaInitFromType(ArrowSchema*, ArrowType)' declared with attribute 'warn_unused_result' [-Werror=unused-result] 208 | ArrowSchemaInitFromType(input_schema->children[0]->children[0], NANOARROW_TYPE_LIST); /cudf/cpp/tests/interop/from_arrow_device_test.cpp:209:21: error: ignoring return value of 'ArrowErrorCode cudfArrowSchemaSetName(ArrowSchema*, const char*)' declared with attribute 'warn_unused_result' [-Werror=unused-result] 209 | ArrowSchemaSetName(input_schema->children[0]->children[0], "element"); /cudf/cpp/tests/interop/from_arrow_device_test.cpp:212:26: error: ignoring return value of 'ArrowErrorCode cudfArrowSchemaInitFromType(ArrowSchema*, ArrowType)' declared with attribute 'warn_unused_result' [-Werror=unused-result] 212 | ArrowSchemaInitFromType(input_schema->children[0]->children[0]->children[0], /cudf/cpp/tests/interop/from_arrow_device_test.cpp:214:21: error: ignoring return value of 'ArrowErrorCode cudfArrowSchemaSetName(ArrowSchema*, const char*)' declared with attribute 'warn_unused_result' [-Werror=unused-result] 214 | ArrowSchemaSetName(input_schema->children[0]->children[0]->children[0], "element"); /cudf/cpp/tests/interop/from_arrow_device_test.cpp:226:27: error: ignoring return value of 'ArrowErrorCode cudfArrowArrayFinishBuilding(ArrowArray*, ArrowValidationLevel, ArrowError*)' declared with attribute 'warn_unused_result' [-Werror=unused-result] 226 | ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr); /cudf/cpp/tests/interop/from_arrow_device_test.cpp: In member function 'virtual void FromArrowDeviceTest_StructColumn_Test::TestBody()': ``` Closes #15597 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Muhammad Haseeb (https://github.com/mhaseeb123) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/15596 --- cpp/tests/interop/from_arrow_device_test.cpp | 229 +++++++++++-------- cpp/tests/interop/nanoarrow_utils.hpp | 4 +- cpp/tests/interop/to_arrow_device_test.cpp | 7 +- 3 files changed, 135 insertions(+), 105 deletions(-) diff --git a/cpp/tests/interop/from_arrow_device_test.cpp b/cpp/tests/interop/from_arrow_device_test.cpp index 95cbe8057d1..66bd4dd1bfb 100644 --- a/cpp/tests/interop/from_arrow_device_test.cpp +++ b/cpp/tests/interop/from_arrow_device_test.cpp @@ -100,22 +100,26 @@ TEST_F(FromArrowDeviceTest, DateTimeTable) nanoarrow::UniqueSchema input_schema; ArrowSchemaInit(input_schema.get()); - ArrowSchemaSetTypeStruct(input_schema.get(), 1); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(input_schema.get(), 1)); ArrowSchemaInit(input_schema->children[0]); - ArrowSchemaSetTypeDateTime( - input_schema->children[0], NANOARROW_TYPE_TIMESTAMP, NANOARROW_TIME_UNIT_MILLI, nullptr); - ArrowSchemaSetName(input_schema->children[0], "a"); + 
NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeDateTime(
+    input_schema->children[0], NANOARROW_TYPE_TIMESTAMP, NANOARROW_TIME_UNIT_MILLI, nullptr));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(input_schema->children[0], "a"));
 
   nanoarrow::UniqueArray input_array;
-  ArrowArrayInitFromSchema(input_array.get(), input_schema.get(), nullptr);
+  NANOARROW_THROW_NOT_OK(ArrowArrayInitFromSchema(input_array.get(), input_schema.get(), nullptr));
   input_array->length = 6;
   input_array->null_count = 0;
   input_array->children[0]->length = 6;
   input_array->children[0]->null_count = 0;
-  ArrowBufferSetAllocator(ArrowArrayBuffer(input_array->children[0], 1), noop_alloc);
+  NANOARROW_THROW_NOT_OK(
+    ArrowBufferSetAllocator(ArrowArrayBuffer(input_array->children[0], 1), noop_alloc));
   ArrowArrayBuffer(input_array->children[0], 1)->data =
     const_cast<uint8_t*>(cudf::column_view(col).data<uint8_t>());
-  ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_MINIMAL, nullptr);
+  ArrowArrayBuffer(input_array->children[0], 1)->size_bytes =
+    sizeof(int64_t) * cudf::column_view(col).size();
+  NANOARROW_THROW_NOT_OK(
+    ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_MINIMAL, nullptr));
 
   ArrowDeviceArray input_device_array;
   input_device_array.device_id = rmm::get_current_cuda_device().value();
@@ -155,23 +159,27 @@ TYPED_TEST(FromArrowDeviceTestDurationsTest, DurationTable)
 
   nanoarrow::UniqueSchema input_schema;
   ArrowSchemaInit(input_schema.get());
-  ArrowSchemaSetTypeStruct(input_schema.get(), 1);
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(input_schema.get(), 1));
   ArrowSchemaInit(input_schema->children[0]);
-  ArrowSchemaSetTypeDateTime(
-    input_schema->children[0], NANOARROW_TYPE_DURATION, time_unit, nullptr);
-  ArrowSchemaSetName(input_schema->children[0], "a");
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeDateTime(
+    input_schema->children[0], NANOARROW_TYPE_DURATION, time_unit, nullptr));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(input_schema->children[0], "a"));
 
-  auto data_ptr = expected_table_view.column(0).data<uint8_t>();
+  auto data_ptr  = expected_table_view.column(0).data<uint8_t>();
+  auto data_size = expected_table_view.column(0).size();
 
   nanoarrow::UniqueArray input_array;
-  ArrowArrayInitFromSchema(input_array.get(), input_schema.get(), nullptr);
+  NANOARROW_THROW_NOT_OK(ArrowArrayInitFromSchema(input_array.get(), input_schema.get(), nullptr));
   input_array->length = expected_table_view.num_rows();
   input_array->null_count = 0;
   input_array->children[0]->length = expected_table_view.num_rows();
   input_array->children[0]->null_count = 0;
-  ArrowBufferSetAllocator(ArrowArrayBuffer(input_array->children[0], 1), noop_alloc);
-  ArrowArrayBuffer(input_array->children[0], 1)->data = const_cast<uint8_t*>(data_ptr);
-  ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_MINIMAL, nullptr);
+  NANOARROW_THROW_NOT_OK(
+    ArrowBufferSetAllocator(ArrowArrayBuffer(input_array->children[0], 1), noop_alloc));
+  ArrowArrayBuffer(input_array->children[0], 1)->data = const_cast<uint8_t*>(data_ptr);
+  ArrowArrayBuffer(input_array->children[0], 1)->size_bytes = sizeof(T) * data_size;
+  NANOARROW_THROW_NOT_OK(
+    ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_MINIMAL, nullptr));
 
   ArrowDeviceArray input_device_array;
   input_device_array.device_id = rmm::get_current_cuda_device().value();
@@ -199,19 +207,21 @@ TEST_F(FromArrowDeviceTest, NestedList)
 
   nanoarrow::UniqueSchema input_schema;
   ArrowSchemaInit(input_schema.get());
-  ArrowSchemaSetTypeStruct(input_schema.get(), 1);
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(input_schema.get(), 1));
-  ArrowSchemaInitFromType(input_schema->children[0], NANOARROW_TYPE_LIST);
-  ArrowSchemaSetName(input_schema->children[0], "a");
+  NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(input_schema->children[0], NANOARROW_TYPE_LIST));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(input_schema->children[0], "a"));
   input_schema->children[0]->flags = ARROW_FLAG_NULLABLE;
-  ArrowSchemaInitFromType(input_schema->children[0]->children[0], NANOARROW_TYPE_LIST);
-  ArrowSchemaSetName(input_schema->children[0]->children[0], "element");
+  NANOARROW_THROW_NOT_OK(
+    ArrowSchemaInitFromType(input_schema->children[0]->children[0], NANOARROW_TYPE_LIST));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(input_schema->children[0]->children[0], "element"));
   input_schema->children[0]->children[0]->flags = 0;
-  ArrowSchemaInitFromType(input_schema->children[0]->children[0]->children[0],
-                          NANOARROW_TYPE_INT64);
-  ArrowSchemaSetName(input_schema->children[0]->children[0]->children[0], "element");
+  NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(
+    input_schema->children[0]->children[0]->children[0], NANOARROW_TYPE_INT64));
+  NANOARROW_THROW_NOT_OK(
+    ArrowSchemaSetName(input_schema->children[0]->children[0]->children[0], "element"));
   input_schema->children[0]->children[0]->children[0]->flags = ARROW_FLAG_NULLABLE;
 
   nanoarrow::UniqueArray input_array;
@@ -223,7 +233,8 @@ TEST_F(FromArrowDeviceTest, NestedList)
   cudf::lists_column_view nested_view{lview.child()};
   populate_list_from_col(top_list->children[0], nested_view);
   populate_from_col(top_list->children[0]->children[0], nested_view.child());
-  ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr);
+  NANOARROW_THROW_NOT_OK(
+    ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr));
 
   ArrowDeviceArray input_device_array;
   input_device_array.device_id = rmm::get_current_cuda_device().value();
@@ -287,47 +298,52 @@ TEST_F(FromArrowDeviceTest, StructColumn)
 
   nanoarrow::UniqueSchema input_schema;
   ArrowSchemaInit(input_schema.get());
-  ArrowSchemaSetTypeStruct(input_schema.get(), 1);
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(input_schema.get(), 1));
   ArrowSchemaInit(input_schema->children[0]);
-  ArrowSchemaSetTypeStruct(input_schema->children[0], 5);
-  ArrowSchemaSetName(input_schema->children[0], "a");
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(input_schema->children[0], 5));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(input_schema->children[0], "a"));
   input_schema->children[0]->flags = 0;
 
   auto child = input_schema->children[0];
-  ArrowSchemaInitFromType(child->children[0], NANOARROW_TYPE_STRING);
-  ArrowSchemaSetName(child->children[0], "string");
+  NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(child->children[0], NANOARROW_TYPE_STRING));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[0], "string"));
   child->children[0]->flags = 0;
 
-  ArrowSchemaInitFromType(child->children[1], NANOARROW_TYPE_INT32);
-  ArrowSchemaSetName(child->children[1], "integral");
+  NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(child->children[1], NANOARROW_TYPE_INT32));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[1], "integral"));
   child->children[1]->flags = 0;
 
-  ArrowSchemaInitFromType(child->children[2], NANOARROW_TYPE_BOOL);
-  ArrowSchemaSetName(child->children[2], "bool");
+  NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(child->children[2], NANOARROW_TYPE_BOOL));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[2], "bool"));
   child->children[2]->flags = 0;
 
-  ArrowSchemaInitFromType(child->children[3], NANOARROW_TYPE_LIST);
-  ArrowSchemaSetName(child->children[3], "nested_list");
+  NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(child->children[3], NANOARROW_TYPE_LIST));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[3], "nested_list"));
   child->children[3]->flags = 0;
-  ArrowSchemaInitFromType(child->children[3]->children[0], NANOARROW_TYPE_LIST);
-  ArrowSchemaSetName(child->children[3]->children[0], "element");
+  NANOARROW_THROW_NOT_OK(
+    ArrowSchemaInitFromType(child->children[3]->children[0], NANOARROW_TYPE_LIST));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[3]->children[0], "element"));
   child->children[3]->children[0]->flags = 0;
-  ArrowSchemaInitFromType(child->children[3]->children[0]->children[0], NANOARROW_TYPE_INT64);
-  ArrowSchemaSetName(child->children[3]->children[0]->children[0], "element");
+  NANOARROW_THROW_NOT_OK(
+    ArrowSchemaInitFromType(child->children[3]->children[0]->children[0], NANOARROW_TYPE_INT64));
+  NANOARROW_THROW_NOT_OK(
+    ArrowSchemaSetName(child->children[3]->children[0]->children[0], "element"));
   child->children[3]->children[0]->children[0]->flags = 0;
 
   ArrowSchemaInit(child->children[4]);
-  ArrowSchemaSetTypeStruct(child->children[4], 2);
-  ArrowSchemaSetName(child->children[4], "struct");
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(child->children[4], 2));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[4], "struct"));
 
-  ArrowSchemaInitFromType(child->children[4]->children[0], NANOARROW_TYPE_STRING);
-  ArrowSchemaSetName(child->children[4]->children[0], "string2");
-  ArrowSchemaInitFromType(child->children[4]->children[1], NANOARROW_TYPE_INT32);
-  ArrowSchemaSetName(child->children[4]->children[1], "integral2");
+  NANOARROW_THROW_NOT_OK(
+    ArrowSchemaInitFromType(child->children[4]->children[0], NANOARROW_TYPE_STRING));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[4]->children[0], "string2"));
+  NANOARROW_THROW_NOT_OK(
+    ArrowSchemaInitFromType(child->children[4]->children[1], NANOARROW_TYPE_INT32));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[4]->children[1], "integral2"));
 
   nanoarrow::UniqueArray input_array;
-  ArrowArrayInitFromSchema(input_array.get(), input_schema.get(), nullptr);
+  NANOARROW_THROW_NOT_OK(ArrowArrayInitFromSchema(input_array.get(), input_schema.get(), nullptr));
   input_array->length = expected_table_view.num_rows();
@@ -336,7 +352,7 @@ TEST_F(FromArrowDeviceTest, StructColumn)
 
   array_a->length = view_a.size();
   array_a->null_count = view_a.null_count();
-  ArrowBufferSetAllocator(ArrowArrayBuffer(array_a, 0), noop_alloc);
+  NANOARROW_THROW_NOT_OK(ArrowBufferSetAllocator(ArrowArrayBuffer(array_a, 0), noop_alloc));
   ArrowArrayValidityBitmap(array_a)->buffer.data =
     const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(view_a.null_mask()));
@@ -354,14 +370,15 @@ TEST_F(FromArrowDeviceTest, StructColumn)
 
   array_struct->length = view_struct.size();
   array_struct->null_count = view_struct.null_count();
-  ArrowBufferSetAllocator(ArrowArrayBuffer(array_struct, 0), noop_alloc);
+  NANOARROW_THROW_NOT_OK(ArrowBufferSetAllocator(ArrowArrayBuffer(array_struct, 0), noop_alloc));
   ArrowArrayValidityBitmap(array_struct)->buffer.data =
     const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(view_struct.null_mask()));
 
   populate_from_col(array_struct->children[0], view_struct.child(0));
   populate_from_col(array_struct->children[1], view_struct.child(1));
-  ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr);
+  NANOARROW_THROW_NOT_OK(
+    ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr));
 
   ArrowDeviceArray input_device_array;
   input_device_array.device_id = rmm::get_current_cuda_device().value();
@@ -406,25 +423,28 @@ TEST_F(FromArrowDeviceTest, DictionaryIndicesType)
 
   nanoarrow::UniqueSchema input_schema;
   ArrowSchemaInit(input_schema.get());
-  ArrowSchemaSetTypeStruct(input_schema.get(), 3);
-
-  ArrowSchemaInitFromType(input_schema->children[0], NANOARROW_TYPE_INT8);
-  ArrowSchemaSetName(input_schema->children[0], "a");
-  ArrowSchemaAllocateDictionary(input_schema->children[0]);
-  ArrowSchemaInitFromType(input_schema->children[0]->dictionary, NANOARROW_TYPE_INT64);
-
-  ArrowSchemaInitFromType(input_schema->children[1], NANOARROW_TYPE_INT16);
-  ArrowSchemaSetName(input_schema->children[1], "b");
-  ArrowSchemaAllocateDictionary(input_schema->children[1]);
-  ArrowSchemaInitFromType(input_schema->children[1]->dictionary, NANOARROW_TYPE_INT64);
-
-  ArrowSchemaInitFromType(input_schema->children[2], NANOARROW_TYPE_INT64);
-  ArrowSchemaSetName(input_schema->children[2], "c");
-  ArrowSchemaAllocateDictionary(input_schema->children[2]);
-  ArrowSchemaInitFromType(input_schema->children[2]->dictionary, NANOARROW_TYPE_INT64);
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(input_schema.get(), 3));
+
+  NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(input_schema->children[0], NANOARROW_TYPE_INT8));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(input_schema->children[0], "a"));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaAllocateDictionary(input_schema->children[0]));
+  NANOARROW_THROW_NOT_OK(
+    ArrowSchemaInitFromType(input_schema->children[0]->dictionary, NANOARROW_TYPE_INT64));
+
+  NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(input_schema->children[1], NANOARROW_TYPE_INT16));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(input_schema->children[1], "b"));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaAllocateDictionary(input_schema->children[1]));
+  NANOARROW_THROW_NOT_OK(
+    ArrowSchemaInitFromType(input_schema->children[1]->dictionary, NANOARROW_TYPE_INT64));
+
+  NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(input_schema->children[2], NANOARROW_TYPE_INT64));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(input_schema->children[2], "c"));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaAllocateDictionary(input_schema->children[2]));
+  NANOARROW_THROW_NOT_OK(
+    ArrowSchemaInitFromType(input_schema->children[2]->dictionary, NANOARROW_TYPE_INT64));
 
   nanoarrow::UniqueArray input_array;
-  ArrowArrayInitFromSchema(input_array.get(), input_schema.get(), nullptr);
+  NANOARROW_THROW_NOT_OK(ArrowArrayInitFromSchema(input_array.get(), input_schema.get(), nullptr));
   input_array->length = expected_table.num_rows();
   input_array->null_count = 0;
@@ -446,7 +466,8 @@ TEST_F(FromArrowDeviceTest, DictionaryIndicesType)
   populate_from_col(input_array->children[2]->dictionary,
                     cudf::dictionary_column_view{expected_table_view.column(2)}.keys());
-  ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr);
+  NANOARROW_THROW_NOT_OK(
+    ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr));
 
   ArrowDeviceArray input_device_array;
   input_device_array.device_id = rmm::get_current_cuda_device().value();
@@ -562,20 +583,22 @@ TEST_F(FromArrowDeviceTest, FixedPoint128Table)
 
   nanoarrow::UniqueSchema input_schema;
   ArrowSchemaInit(input_schema.get());
-  ArrowSchemaSetTypeStruct(input_schema.get(), 1);
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(input_schema.get(), 1));
   ArrowSchemaInit(input_schema->children[0]);
-  ArrowSchemaSetTypeDecimal(input_schema->children[0],
-                            NANOARROW_TYPE_DECIMAL128,
-                            cudf::detail::max_precision<__int128_t>(),
-                            -scale);
-  ArrowSchemaSetName(input_schema->children[0], "a");
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeDecimal(input_schema->children[0],
+                                                   NANOARROW_TYPE_DECIMAL128,
+                                                   cudf::detail::max_precision<__int128_t>(),
+                                                   -scale));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(input_schema->children[0], "a"));
 
   nanoarrow::UniqueArray input_array;
-  ArrowArrayInitFromSchema(input_array.get(), input_schema.get(), nullptr);
+  NANOARROW_THROW_NOT_OK(
+    ArrowArrayInitFromSchema(input_array.get(), input_schema.get(), nullptr));
   input_array->length = expected.num_rows();
   populate_from_col<__int128_t>(input_array->children[0], expected.column(0));
-  ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr);
+  NANOARROW_THROW_NOT_OK(
+    ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr));
 
   ArrowDeviceArray input_device_array;
   input_device_array.device_id = rmm::get_current_cuda_device().value();
@@ -607,20 +630,22 @@ TEST_F(FromArrowDeviceTest, FixedPoint128TableLarge)
 
   nanoarrow::UniqueSchema input_schema;
   ArrowSchemaInit(input_schema.get());
-  ArrowSchemaSetTypeStruct(input_schema.get(), 1);
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(input_schema.get(), 1));
   ArrowSchemaInit(input_schema->children[0]);
-  ArrowSchemaSetTypeDecimal(input_schema->children[0],
-                            NANOARROW_TYPE_DECIMAL128,
-                            cudf::detail::max_precision<__int128_t>(),
-                            -scale);
-  ArrowSchemaSetName(input_schema->children[0], "a");
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeDecimal(input_schema->children[0],
+                                                   NANOARROW_TYPE_DECIMAL128,
+                                                   cudf::detail::max_precision<__int128_t>(),
+                                                   -scale));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(input_schema->children[0], "a"));
 
   nanoarrow::UniqueArray input_array;
-  ArrowArrayInitFromSchema(input_array.get(), input_schema.get(), nullptr);
+  NANOARROW_THROW_NOT_OK(
+    ArrowArrayInitFromSchema(input_array.get(), input_schema.get(), nullptr));
   input_array->length = expected.num_rows();
   populate_from_col<__int128_t>(input_array->children[0], expected.column(0));
-  ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr);
+  NANOARROW_THROW_NOT_OK(
+    ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr));
 
   ArrowDeviceArray input_device_array;
   input_device_array.device_id = rmm::get_current_cuda_device().value();
@@ -652,20 +677,22 @@ TEST_F(FromArrowDeviceTest, FixedPoint128TableNulls)
 
   nanoarrow::UniqueSchema input_schema;
   ArrowSchemaInit(input_schema.get());
-  ArrowSchemaSetTypeStruct(input_schema.get(), 1);
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(input_schema.get(), 1));
   ArrowSchemaInit(input_schema->children[0]);
-  ArrowSchemaSetTypeDecimal(input_schema->children[0],
-                            NANOARROW_TYPE_DECIMAL128,
-                            cudf::detail::max_precision<__int128_t>(),
-                            -scale);
-  ArrowSchemaSetName(input_schema->children[0], "a");
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeDecimal(input_schema->children[0],
+                                                   NANOARROW_TYPE_DECIMAL128,
+                                                   cudf::detail::max_precision<__int128_t>(),
+                                                   -scale));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(input_schema->children[0], "a"));
 
   nanoarrow::UniqueArray input_array;
-  ArrowArrayInitFromSchema(input_array.get(), input_schema.get(), nullptr);
+  NANOARROW_THROW_NOT_OK(
+    ArrowArrayInitFromSchema(input_array.get(), input_schema.get(), nullptr));
   input_array->length = expected.num_rows();
   populate_from_col<__int128_t>(input_array->children[0], expected.column(0));
-  ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr);
+  NANOARROW_THROW_NOT_OK(
+    ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr));
 
   ArrowDeviceArray input_device_array;
   input_device_array.device_id = rmm::get_current_cuda_device().value();
@@ -699,20 +726,22 @@ TEST_F(FromArrowDeviceTest, FixedPoint128TableNullsLarge)
 
   nanoarrow::UniqueSchema input_schema;
   ArrowSchemaInit(input_schema.get());
-  ArrowSchemaSetTypeStruct(input_schema.get(), 1);
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(input_schema.get(), 1));
   ArrowSchemaInit(input_schema->children[0]);
-  ArrowSchemaSetTypeDecimal(input_schema->children[0],
-                            NANOARROW_TYPE_DECIMAL128,
-                            cudf::detail::max_precision<__int128_t>(),
-                            -scale);
-  ArrowSchemaSetName(input_schema->children[0], "a");
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeDecimal(input_schema->children[0],
+                                                   NANOARROW_TYPE_DECIMAL128,
+                                                   cudf::detail::max_precision<__int128_t>(),
+                                                   -scale));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(input_schema->children[0], "a"));
 
   nanoarrow::UniqueArray input_array;
-  ArrowArrayInitFromSchema(input_array.get(), input_schema.get(), nullptr);
+  NANOARROW_THROW_NOT_OK(
+    ArrowArrayInitFromSchema(input_array.get(), input_schema.get(), nullptr));
   input_array->length = expected.num_rows();
   populate_from_col<__int128_t>(input_array->children[0], expected.column(0));
-  ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr);
+  NANOARROW_THROW_NOT_OK(
+    ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr));
 
   ArrowDeviceArray input_device_array;
   input_device_array.device_id = rmm::get_current_cuda_device().value();
diff --git a/cpp/tests/interop/nanoarrow_utils.hpp b/cpp/tests/interop/nanoarrow_utils.hpp
index b795bafed97..fb5d1060f6f 100644
--- a/cpp/tests/interop/nanoarrow_utils.hpp
+++ b/cpp/tests/interop/nanoarrow_utils.hpp
@@ -122,13 +122,13 @@ void populate_dict_from_col(ArrowArray* arr, cudf::dictionary_column_view dview)
 {
   arr->length = dview.size();
   arr->null_count = dview.null_count();
-  ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 0), noop_alloc);
+  NANOARROW_THROW_NOT_OK(ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 0), noop_alloc));
   ArrowArrayValidityBitmap(arr)->buffer.size_bytes =
     cudf::bitmask_allocation_size_bytes(dview.size());
   ArrowArrayValidityBitmap(arr)->buffer.data =
     const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(dview.null_mask()));
-  ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 1), noop_alloc);
+  NANOARROW_THROW_NOT_OK(ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 1), noop_alloc));
   ArrowArrayBuffer(arr, 1)->size_bytes = sizeof(IND_TYPE) * dview.indices().size();
   ArrowArrayBuffer(arr, 1)->data = const_cast<uint8_t*>(dview.indices().data<uint8_t>());
diff --git a/cpp/tests/interop/to_arrow_device_test.cpp b/cpp/tests/interop/to_arrow_device_test.cpp
index fb346dad538..626aeb53cdd 100644
--- a/cpp/tests/interop/to_arrow_device_test.cpp
+++ b/cpp/tests/interop/to_arrow_device_test.cpp
@@ -217,7 +217,8 @@ get_nanoarrow_tables(cudf::size_type length)
   populate_from_col(arrow->children[5]->children[1], struct_view.child(1));
   arrow->children[5]->length = struct_view.size();
   arrow->children[5]->null_count = struct_view.null_count();
-  ArrowBufferSetAllocator(ArrowArrayBuffer(arrow->children[5], 0), noop_alloc);
+  NANOARROW_THROW_NOT_OK(
+    ArrowBufferSetAllocator(ArrowArrayBuffer(arrow->children[5], 0), noop_alloc));
   ArrowArrayValidityBitmap(arrow->children[5])->buffer.size_bytes =
     cudf::bitmask_allocation_size_bytes(struct_view.size());
   ArrowArrayValidityBitmap(arrow->children[5])->buffer.data =
@@ -241,13 +242,13 @@ void populate_list_from_col(ArrowArray* arr, cudf::lists_column_view view)
   arr->length = view.size();
   arr->null_count = view.null_count();
-  ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 0), noop_alloc);
+  NANOARROW_THROW_NOT_OK(ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 0), noop_alloc));
   ArrowArrayValidityBitmap(arr)->buffer.size_bytes =
     cudf::bitmask_allocation_size_bytes(view.size());
   ArrowArrayValidityBitmap(arr)->buffer.data =
     const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(view.null_mask()));
-  ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 1), noop_alloc);
+  NANOARROW_THROW_NOT_OK(ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 1), noop_alloc));
   ArrowArrayBuffer(arr, 1)->size_bytes = sizeof(int32_t) * view.offsets().size();
   ArrowArrayBuffer(arr, 1)->data = const_cast<uint8_t*>(view.offsets().data<uint8_t>());
 }

From c62c5f69ca5036d69188ab8e43ac2ab5276d6cfa Mon Sep 17 00:00:00 2001
From: "Robert (Bobby) Evans"
Date: Fri, 26 Apr 2024 04:02:25 -0500
Subject: [PATCH 6/6] Fix a JNI bug in JSON parsing fixup (#15550)

In the current code, when parsing JSON, an empty table is returned if no
columns can be parsed out of the data. Earlier we put in a workaround so
that the caller could pass in the number of rows needed and the JSON
parsing code would build a table of null values of that size. That
workaround had issues with structs and lists, which need an extended way
to produce the null scalar. This change adds the code to do just that.

Authors:
  - Robert (Bobby) Evans (https://github.com/revans2)
  - Nghia Truong (https://github.com/ttnghia)

Approvers:
  - Jason Lowe (https://github.com/jlowe)

URL: https://github.com/rapidsai/cudf/pull/15550
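
A minimal usage sketch (not part of the patch itself) of how the Scalar factories used
by the new code paths build all-null columns for nested types. Only Scalar.listFromNull,
Scalar.structFromNull, ColumnVector.fromScalar, and the HostColumnVector type descriptors
are taken from the diff below; the class and method names in the sketch are hypothetical.

  // Hypothetical standalone example; class and method names are illustrative only.
  import ai.rapids.cudf.ColumnVector;
  import ai.rapids.cudf.DType;
  import ai.rapids.cudf.HostColumnVector;
  import ai.rapids.cudf.Scalar;

  public class NullColumnSketch {
    // Builds rowCount null rows of LIST<INT64>, mirroring the new LIST branch.
    static ColumnVector nullListColumn(int rowCount) {
      HostColumnVector.DataType element = new HostColumnVector.BasicType(true, DType.INT64);
      try (Scalar s = Scalar.listFromNull(element)) {
        return ColumnVector.fromScalar(s, rowCount);
      }
    }

    // Builds rowCount null rows of STRUCT<STRING, INT32>, mirroring the new STRUCT
    // branch, which derives its child descriptors via Schema.asHostDataType().
    static ColumnVector nullStructColumn(int rowCount) {
      HostColumnVector.DataType[] children = new HostColumnVector.DataType[] {
          new HostColumnVector.BasicType(true, DType.STRING),
          new HostColumnVector.BasicType(true, DType.INT32)};
      try (Scalar s = Scalar.structFromNull(children)) {
        return ColumnVector.fromScalar(s, rowCount);
      }
    }
  }

In gatherJSONColumns the element and child descriptors are not spelled out like this;
they are derived from the requested read Schema via the new asHostDataType() method in
the diff below.
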
---
 java/src/main/java/ai/rapids/cudf/Schema.java | 28 ++++++++++++++++++-
 java/src/main/java/ai/rapids/cudf/Table.java | 22 +++++++++++++--
 2 files changed, 47 insertions(+), 3 deletions(-)

diff --git a/java/src/main/java/ai/rapids/cudf/Schema.java b/java/src/main/java/ai/rapids/cudf/Schema.java
index c8571dd841c..43603386649 100644
--- a/java/src/main/java/ai/rapids/cudf/Schema.java
+++ b/java/src/main/java/ai/rapids/cudf/Schema.java
@@ -20,6 +20,7 @@
 
 import java.util.ArrayList;
 import java.util.List;
+import java.util.stream.Collectors;
 
 /**
  * The schema of data to be read in.
@@ -221,6 +222,13 @@ public DType[] getChildTypes() {
     return ret;
   }
 
+  public int getNumChildren() {
+    if (childSchemas == null) {
+      return 0;
+    }
+    return childSchemas.size();
+  }
+
   int[] getFlattenedNumChildren() {
     flattenIfNeeded();
     return flattenedCounts;
@@ -243,7 +251,25 @@ public boolean isStructOrHasStructDescendant() {
     return false;
   }
 
-  public static class Builder {
+  public HostColumnVector.DataType asHostDataType() {
+    if (topLevelType == DType.LIST) {
+      assert(childSchemas != null && childSchemas.size() == 1);
+      HostColumnVector.DataType element = childSchemas.get(0).asHostDataType();
+      return new HostColumnVector.ListType(true, element);
+    } else if (topLevelType == DType.STRUCT) {
+      if (childSchemas == null) {
+        return new HostColumnVector.StructType(true);
+      } else {
+        List<HostColumnVector.DataType> childTypes =
+            childSchemas.stream().map(Schema::asHostDataType).collect(Collectors.toList());
+        return new HostColumnVector.StructType(true, childTypes);
+      }
+    } else {
+      return new HostColumnVector.BasicType(true, topLevelType);
+    }
+  }
+
+  public static class Builder {
     private final DType topLevelType;
     private final List<String> names;
     private final List<DType> types;
diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java
index 4038b3a40b8..4e737451ed6 100644
--- a/java/src/main/java/ai/rapids/cudf/Table.java
+++ b/java/src/main/java/ai/rapids/cudf/Table.java
@@ -1220,8 +1220,26 @@ private static Table gatherJSONColumns(Schema schema, TableWithMeta twm, int emp
           columns[i] = tbl.getColumn(index).incRefCount();
         }
       } else {
-        try (Scalar s = Scalar.fromNull(types[i])) {
-          columns[i] = ColumnVector.fromScalar(s, rowCount);
+        if (types[i] == DType.LIST) {
+          Schema listSchema = schema.getChild(i);
+          Schema elementSchema = listSchema.getChild(0);
+          try (Scalar s = Scalar.listFromNull(elementSchema.asHostDataType())) {
+            columns[i] = ColumnVector.fromScalar(s, rowCount);
+          }
+        } else if (types[i] == DType.STRUCT) {
+          Schema structSchema = schema.getChild(i);
+          int numStructChildren = structSchema.getNumChildren();
+          DataType[] structChildrenTypes = new DataType[numStructChildren];
+          for (int j = 0; j < numStructChildren; j++) {
+            structChildrenTypes[j] = structSchema.getChild(j).asHostDataType();
+          }
+          try (Scalar s = Scalar.structFromNull(structChildrenTypes)) {
+            columns[i] = ColumnVector.fromScalar(s, rowCount);
+          }
+        } else {
+          try (Scalar s = Scalar.fromNull(types[i])) {
+            columns[i] = ColumnVector.fromScalar(s, rowCount);
+          }
         }
       }
     }