Skip to content

Commit

Permalink
Large strings support in regex replace APIs (#15524)
Browse files Browse the repository at this point in the history
Updates the `replace_re()` and `replace_with_backrefs()` internal logic to support large strings.
These functions use a regex-specific version of `make_strings_children()`.

Depends on #15363

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Vukasin Milovanovic (https://github.com/vuule)

URL: #15524
  • Loading branch information
davidwendt authored Apr 18, 2024
1 parent 7b9e815 commit d1b92e2
Show file tree
Hide file tree
Showing 3 changed files with 17 additions and 16 deletions.
19 changes: 9 additions & 10 deletions cpp/src/strings/regex/utilities.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,10 @@
#include "strings/regex/regex.cuh"

#include <cudf/column/column_factories.hpp>
#include <cudf/detail/offsets_iterator_factory.cuh>
#include <cudf/detail/sizes_to_offsets_iterator.cuh>
#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/strings/detail/strings_children.cuh>
#include <cudf/strings/detail/utilities.hpp>

#include <rmm/cuda_stream_view.hpp>
Expand Down Expand Up @@ -116,10 +118,8 @@ auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
auto offsets = make_numeric_column(
data_type{type_id::INT32}, strings_count + 1, mask_state::UNALLOCATED, stream, mr);
auto d_offsets = offsets->mutable_view().template data<int32_t>();
size_and_exec_fn.d_offsets = d_offsets;
auto output_sizes = rmm::device_uvector<size_type>(strings_count, stream);
size_and_exec_fn.d_sizes = output_sizes.data();

auto [buffer_size, thread_count] = d_prog.compute_strided_working_memory(strings_count);

Expand All @@ -133,12 +133,11 @@ auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn,
for_each_kernel<<<grid.num_blocks, grid.num_threads_per_block, shmem_size, stream.value()>>>(
size_and_exec_fn, d_prog, strings_count);
}

auto const char_bytes =
cudf::detail::sizes_to_offsets(d_offsets, d_offsets + strings_count + 1, d_offsets, stream);
CUDF_EXPECTS(char_bytes <= std::numeric_limits<size_type>::max(),
"Size of output exceeds the column size limit",
std::overflow_error);
// Convert the sizes to offsets
auto [offsets, char_bytes] = cudf::strings::detail::make_offsets_child_column(
output_sizes.begin(), output_sizes.end(), stream, mr);
size_and_exec_fn.d_offsets =
cudf::detail::offsetalator_factory::make_input_iterator(offsets->view());

// Now build the chars column
rmm::device_uvector<char> chars(char_bytes, stream, mr);
Expand Down
7 changes: 4 additions & 3 deletions cpp/src/strings/replace/backref_re.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -45,13 +45,14 @@ struct backrefs_fn {
string_view const d_repl; // string replacement template
Iterator backrefs_begin;
Iterator backrefs_end;
size_type* d_offsets{};
size_type* d_sizes{};
char* d_chars{};
cudf::detail::input_offsetalator d_offsets;

__device__ void operator()(size_type const idx, reprog_device const prog, int32_t const prog_idx)
{
if (d_strings.is_null(idx)) {
if (!d_chars) d_offsets[idx] = 0;
if (!d_chars) { d_sizes[idx] = 0; }
return;
}
auto const d_str = d_strings.element<string_view>(idx);
Expand Down Expand Up @@ -113,7 +114,7 @@ struct backrefs_fn {
thrust::copy_n(
thrust::seq, in_ptr + itr.byte_offset(), d_str.size_bytes() - itr.byte_offset(), out_ptr);
} else {
d_offsets[idx] = nbytes;
d_sizes[idx] = nbytes;
}
}
};
Expand Down
7 changes: 4 additions & 3 deletions cpp/src/strings/replace/replace_re.cu
Original file line number Diff line number Diff line change
Expand Up @@ -43,13 +43,14 @@ struct replace_regex_fn {
column_device_view const d_strings;
string_view const d_repl;
size_type const maxrepl;
size_type* d_offsets{};
size_type* d_sizes{};
char* d_chars{};
cudf::detail::input_offsetalator d_offsets;

__device__ void operator()(size_type const idx, reprog_device const prog, int32_t const prog_idx)
{
if (d_strings.is_null(idx)) {
if (!d_chars) d_offsets[idx] = 0;
if (!d_chars) { d_sizes[idx] = 0; }
return;
}

Expand Down Expand Up @@ -90,7 +91,7 @@ struct replace_regex_fn {
d_str.size_bytes() - last_pos.byte_offset(), // ^ ^
out_ptr);
} else {
d_offsets[idx] = nbytes;
d_sizes[idx] = nbytes;
}
}
};
Expand Down

0 comments on commit d1b92e2

Please sign in to comment.