From d1b92e2ec3b943a99299db24873a89fe31e3c0e3 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Thu, 18 Apr 2024 18:17:27 -0400
Subject: [PATCH] Large strings support in regex replace APIs (#15524)

Updates the `replace_re()` and `replace_with_backrefs()` internal logic to support large strings.
These functions use a regex-specific version of make-strings-children.

Depends on #15363

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Vukasin Milovanovic (https://github.com/vuule)

URL: https://github.com/rapidsai/cudf/pull/15524
---
 cpp/src/strings/regex/utilities.cuh    | 19 +++++++++----------
 cpp/src/strings/replace/backref_re.cuh |  7 ++++---
 cpp/src/strings/replace/replace_re.cu  |  7 ++++---
 3 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/cpp/src/strings/regex/utilities.cuh b/cpp/src/strings/regex/utilities.cuh
index cfe53937e66..afbfe9de049 100644
--- a/cpp/src/strings/regex/utilities.cuh
+++ b/cpp/src/strings/regex/utilities.cuh
@@ -19,8 +19,10 @@
 #include "strings/regex/regex.cuh"
 
 #include
+#include
 #include
 #include
+#include
 #include
 #include
 
@@ -116,10 +118,8 @@ auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn,
                            rmm::cuda_stream_view stream,
                            rmm::device_async_resource_ref mr)
 {
-  auto offsets = make_numeric_column(
-    data_type{type_id::INT32}, strings_count + 1, mask_state::UNALLOCATED, stream, mr);
-  auto d_offsets = offsets->mutable_view().template data<int32_t>();
-  size_and_exec_fn.d_offsets = d_offsets;
+  auto output_sizes = rmm::device_uvector<size_type>(strings_count, stream);
+  size_and_exec_fn.d_sizes = output_sizes.data();
 
   auto [buffer_size, thread_count] = d_prog.compute_strided_working_memory(strings_count);
 
@@ -133,12 +133,11 @@ auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn,
     for_each_kernel<<>>(
       size_and_exec_fn, d_prog, strings_count);
   }
-
-  auto const char_bytes =
-    cudf::detail::sizes_to_offsets(d_offsets, 
d_offsets + strings_count + 1, d_offsets, stream);
-  CUDF_EXPECTS(char_bytes <= std::numeric_limits<size_type>::max(),
-               "Size of output exceeds the column size limit",
-               std::overflow_error);
+  // Convert the sizes to offsets
+  auto [offsets, char_bytes] = cudf::strings::detail::make_offsets_child_column(
+    output_sizes.begin(), output_sizes.end(), stream, mr);
+  size_and_exec_fn.d_offsets =
+    cudf::detail::offsetalator_factory::make_input_iterator(offsets->view());
 
   // Now build the chars column
   rmm::device_uvector<char> chars(char_bytes, stream, mr);
diff --git a/cpp/src/strings/replace/backref_re.cuh b/cpp/src/strings/replace/backref_re.cuh
index edd85f29e6c..b5b75cf8f40 100644
--- a/cpp/src/strings/replace/backref_re.cuh
+++ b/cpp/src/strings/replace/backref_re.cuh
@@ -45,13 +45,14 @@ struct backrefs_fn {
   string_view const d_repl; // string replacement template
   Iterator backrefs_begin;
   Iterator backrefs_end;
-  size_type* d_offsets{};
+  size_type* d_sizes{};
   char* d_chars{};
+  cudf::detail::input_offsetalator d_offsets;
 
   __device__ void operator()(size_type const idx, reprog_device const prog, int32_t const prog_idx)
   {
     if (d_strings.is_null(idx)) {
-      if (!d_chars) d_offsets[idx] = 0;
+      if (!d_chars) { d_sizes[idx] = 0; }
       return;
     }
     auto const d_str = d_strings.element<string_view>(idx);
@@ -113,7 +114,7 @@ struct backrefs_fn {
       thrust::copy_n(
         thrust::seq, in_ptr + itr.byte_offset(), d_str.size_bytes() - itr.byte_offset(), out_ptr);
     } else {
-      d_offsets[idx] = nbytes;
+      d_sizes[idx] = nbytes;
     }
   }
 };
diff --git a/cpp/src/strings/replace/replace_re.cu b/cpp/src/strings/replace/replace_re.cu
index 1290302340b..fd988855424 100644
--- a/cpp/src/strings/replace/replace_re.cu
+++ b/cpp/src/strings/replace/replace_re.cu
@@ -43,13 +43,14 @@ struct replace_regex_fn {
   column_device_view const d_strings;
   string_view const d_repl;
   size_type const maxrepl;
-  size_type* d_offsets{};
+  size_type* d_sizes{};
   char* d_chars{};
+  cudf::detail::input_offsetalator d_offsets;
 
   __device__ void operator()(size_type 
const idx, reprog_device const prog, int32_t const prog_idx)
   {
     if (d_strings.is_null(idx)) {
-      if (!d_chars) d_offsets[idx] = 0;
+      if (!d_chars) { d_sizes[idx] = 0; }
       return;
     }
 
@@ -90,7 +91,7 @@ struct replace_regex_fn {
                      d_str.size_bytes() - last_pos.byte_offset(), // ^ ^
                      out_ptr);
     } else {
-      d_offsets[idx] = nbytes;
+      d_sizes[idx] = nbytes;
     }
   }
 };