Enable text build without relying on relaxed constexpr #17647

Draft · wants to merge 5 commits into branch-25.02
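
The pattern running through this PR: device code paths replace host-only std:: utilities (std::min, std::max, std::minmax, std::numeric_limits) with their cuda::std:: equivalents from libcu++, which are callable from device code without nvcc's --expt-relaxed-constexpr flag. A minimal sketch of the idea, using a hypothetical device helper rather than code from the diff:

// Hypothetical helper illustrating the cuda::std:: pattern used in this PR;
// the includes mirror the ones added in the files below.
#include <cuda/std/functional>  // cuda::std::min, cuda::std::max
#include <cuda/std/limits>      // cuda::std::numeric_limits

__device__ int clamp_count(int count, int bound)
{
  // These calls compile in device code without relaxed constexpr, unlike
  // their std:: counterparts, which nvcc treats as host-only functions.
  auto const hi = cuda::std::min(bound, cuda::std::numeric_limits<int>::max());
  return cuda::std::min(count, hi);
}
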
2 changes: 1 addition & 1 deletion cpp/include/cudf/utilities/type_dispatcher.hpp
@@ -53,7 +53,7 @@ namespace CUDF_EXPORT cudf {
* @return The `cudf::type_id` corresponding to the specified type
*/
template <typename T>
-inline constexpr type_id type_to_id()
+CUDF_HOST_DEVICE inline constexpr type_id type_to_id()
{
return type_id::EMPTY;
};
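
A note on the type_dispatcher.hpp hunk above: without --expt-relaxed-constexpr, nvcc treats a plain constexpr function as host-only, so device-side callers of type_to_id<T>() would not compile. The CUDF_HOST_DEVICE annotation (which, under nvcc, is expected to expand to __host__ __device__) makes the function explicitly callable from both sides. A minimal sketch of the same pattern with a hypothetical macro and function, not taken from cudf:

// Hypothetical stand-in for CUDF_HOST_DEVICE: annotate a constexpr function
// so device code can call it without relying on relaxed constexpr.
#ifdef __CUDACC__
#define MY_HOST_DEVICE __host__ __device__
#else
#define MY_HOST_DEVICE
#endif

template <typename T>
MY_HOST_DEVICE inline constexpr int size_in_bits()
{
  return static_cast<int>(sizeof(T) * 8);
}

__global__ void write_bits(int* out) { *out = size_in_bits<unsigned long long>(); }
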
15 changes: 8 additions & 7 deletions cpp/src/text/edit_distance.cu
@@ -30,6 +30,7 @@
#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>

+#include <cuda/std/functional>
#include <thrust/execution_policy.h>
#include <thrust/for_each.h>
#include <thrust/functional.h>
@@ -64,10 +65,10 @@ __device__ cudf::size_type compute_distance(cudf::string_view const& d_str,
if (str_length == 0) return tgt_length;
if (tgt_length == 0) return str_length;

-auto begin = str_length < tgt_length ? d_str.begin() : d_tgt.begin();
-auto itr = str_length < tgt_length ? d_tgt.begin() : d_str.begin();
-// .first is min and .second is max
-auto const [n, m] = std::minmax(str_length, tgt_length);
+auto begin = str_length < tgt_length ? d_str.begin() : d_tgt.begin();
+auto itr = str_length < tgt_length ? d_tgt.begin() : d_str.begin();
+auto const n = cuda::std::min(str_length, tgt_length);
+auto const m = cuda::std::max(str_length, tgt_length);
// setup compute buffer pointers
auto v0 = buffer;
auto v1 = v0 + n + 1;
@@ -81,7 +82,7 @@ __device__ cudf::size_type compute_distance(cudf::string_view const& d_str,
auto sub_cost = v0[j] + (*itr != *itr_tgt);
auto del_cost = v0[j + 1] + 1;
auto ins_cost = v1[j] + 1;
-v1[j + 1] = std::min(std::min(sub_cost, del_cost), ins_cost);
+v1[j + 1] = cuda::std::min(cuda::std::min(sub_cost, del_cost), ins_cost);
}
thrust::swap(v0, v1);
}
@@ -170,7 +171,7 @@ std::unique_ptr<cudf::column> edit_distance(cudf::strings_column_view const& str
? d_targets.element<cudf::string_view>(0)
: d_targets.element<cudf::string_view>(idx);
// just need 2 integers for each character of the shorter string
-return (std::min(d_str.length(), d_tgt.length()) + 1) * 2;
+return (cuda::std::min(d_str.length(), d_tgt.length()) + 1) * 2;
});

// get the total size of the temporary compute buffer
@@ -241,7 +242,7 @@ std::unique_ptr<cudf::column> edit_distance_matrix(cudf::strings_column_view con
if (d_str1.empty() || d_str2.empty()) { return; }
// the temp size needed is 2 integers per character of the shorter string
d_offsets[idx - ((row + 1) * (row + 2)) / 2] =
-(std::min(d_str1.length(), d_str2.length()) + 1) * 2;
+(cuda::std::min(d_str1.length(), d_str2.length()) + 1) * 2;
});

// get the total size for the compute buffer
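
One detail in the edit_distance.cu hunks above: std::minmax is a host-only call when relaxed constexpr is disabled, so the structured-binding form is dropped in favor of two separate libcu++ calls. The equivalent transformation, shown standalone with the same variable names as the diff:

// Before (needs relaxed constexpr in device code):
//   auto const [n, m] = std::minmax(str_length, tgt_length);
// After (valid in device code without the flag):
auto const n = cuda::std::min(str_length, tgt_length);
auto const m = cuda::std::max(str_length, tgt_length);
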
3 changes: 2 additions & 1 deletion cpp/src/text/jaccard.cu
@@ -36,6 +36,7 @@
#include <rmm/exec_policy.hpp>

#include <cub/cub.cuh>
+#include <cuda/std/functional>
#include <thrust/binary_search.h>
#include <thrust/execution_policy.h>
#include <thrust/iterator/counting_iterator.h>
@@ -243,7 +244,7 @@ CUDF_KERNEL void count_substrings_kernel(cudf::column_device_view const d_string
}
}
auto const char_count = warp_reduce(temp_storage).Sum(count);
-if (lane_idx == 0) { d_counts[str_idx] = std::max(1, char_count - width + 1); }
+if (lane_idx == 0) { d_counts[str_idx] = cuda::std::max(1, char_count - width + 1); }
}

/**
9 changes: 4 additions & 5 deletions cpp/src/text/minhash.cu
@@ -40,14 +40,13 @@

#include <cooperative_groups.h>
#include <cuda/atomic>
+#include <cuda/std/limits>
#include <thrust/binary_search.h>
#include <thrust/execution_policy.h>
#include <thrust/fill.h>
#include <thrust/sequence.h>
#include <thrust/sort.h>

-#include <limits>
-
namespace nvtext {
namespace detail {
namespace {
@@ -156,7 +155,7 @@ CUDF_KERNEL void minhash_seed_kernel(cudf::column_device_view const d_strings,
// initialize the output -- only needed for wider strings
auto d_output = d_results + (str_idx * param_count);
for (auto i = lane_idx; i < param_count; i += tile_size) {
-d_output[i] = std::numeric_limits<hash_value_type>::max();
+d_output[i] = cuda::std::numeric_limits<hash_value_type>::max();
}
}
}
@@ -226,7 +225,7 @@ CUDF_KERNEL void minhash_kernel(cudf::column_device_view const d_strings,
? section_size
: cuda::std::max(static_cast<cudf::size_type>(size_bytes > 0), section_size - width + 1);

-auto const init = size_bytes == 0 ? 0 : std::numeric_limits<hash_value_type>::max();
+auto const init = size_bytes == 0 ? 0 : cuda::std::numeric_limits<hash_value_type>::max();
auto const lane_idx = block.thread_rank();
auto const d_output = d_results + (str_idx * parameter_a.size());

@@ -235,7 +234,7 @@ CUDF_KERNEL void minhash_kernel(cudf::column_device_view const d_strings,

// constants used in the permutation calculations
constexpr uint64_t mersenne_prime = (1UL << 61) - 1;
-constexpr hash_value_type hash_max = std::numeric_limits<hash_value_type>::max();
+constexpr hash_value_type hash_max = cuda::std::numeric_limits<hash_value_type>::max();

// found to be an efficient shared memory size for both hash types
__shared__ hash_value_type block_values[block_size * params_per_thread];
3 changes: 2 additions & 1 deletion cpp/src/text/replace.cu
@@ -35,6 +35,7 @@
#include <rmm/cuda_stream_view.hpp>

#include <cuda/atomic>
+#include <cuda/std/functional>
#include <thrust/binary_search.h>
#include <thrust/distance.h>
#include <thrust/execution_policy.h>
@@ -196,7 +197,7 @@ struct sub_offset_fn {
{
// keep delimiter search within this sub-block
auto const end =
-d_input_chars + std::min(last_offset, ((idx + 2) * LS_SUB_BLOCK_SIZE) + first_offset);
+d_input_chars + cuda::std::min(last_offset, ((idx + 2) * LS_SUB_BLOCK_SIZE) + first_offset);
// starting point of this sub-block
auto itr = d_input_chars + first_offset + ((idx + 1) * LS_SUB_BLOCK_SIZE);
while ((itr < end) &&
5 changes: 3 additions & 2 deletions cpp/src/text/subword/data_normalizer.cu
@@ -26,6 +26,7 @@
#include <rmm/cuda_stream_view.hpp>
#include <rmm/exec_policy.hpp>

+#include <cuda/std/functional>
#include <thrust/for_each.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/pair.h>
@@ -134,8 +135,8 @@ extract_code_points_from_utf8(unsigned char const* strings,
constexpr uint8_t max_utf8_blocks_for_char = 4;
uint8_t utf8_blocks[max_utf8_blocks_for_char] = {0};

-for (int i = 0; i < std::min(static_cast<size_t>(max_utf8_blocks_for_char),
-total_bytes - start_byte_for_thread);
+for (int i = 0; i < cuda::std::min(static_cast<size_t>(max_utf8_blocks_for_char),
+total_bytes - start_byte_for_thread);
++i) {
utf8_blocks[i] = strings[start_byte_for_thread + i];
}
30 changes: 18 additions & 12 deletions cpp/src/text/subword/wordpiece_tokenizer.cu
@@ -27,6 +27,8 @@
#include <rmm/cuda_stream_view.hpp>
#include <rmm/exec_policy.hpp>

+#include <cuda/std/functional>
+#include <cuda/std/limits>
#include <thrust/copy.h>
#include <thrust/distance.h>
#include <thrust/execution_policy.h>
@@ -87,23 +89,23 @@ CUDF_KERNEL void init_data_and_mark_word_start_and_ends(uint32_t const* code_poi

// Deal with the start_word_indices array
if (char_for_thread < num_code_points) {
-uint32_t val_to_write = std::numeric_limits<uint32_t>::max();
+uint32_t val_to_write = cuda::std::numeric_limits<uint32_t>::max();
if ((code_points[char_for_thread] != SPACE_CODE_POINT) && (char_for_thread > 0) &&
(code_points[char_for_thread - 1] == SPACE_CODE_POINT)) {
val_to_write = char_for_thread;
}
start_word_indices[char_for_thread] = val_to_write;

// Deal with the end_word_indices_array
-val_to_write = std::numeric_limits<uint32_t>::max();
+val_to_write = cuda::std::numeric_limits<uint32_t>::max();
if ((code_points[char_for_thread] != SPACE_CODE_POINT) &&
(char_for_thread + 1 < num_code_points) &&
(code_points[char_for_thread + 1] == SPACE_CODE_POINT)) {
val_to_write = char_for_thread + 1;
}
end_word_indices[char_for_thread] = val_to_write;

-token_ids[char_for_thread] = std::numeric_limits<uint32_t>::max();
+token_ids[char_for_thread] = cuda::std::numeric_limits<uint32_t>::max();
tokens_per_word[char_for_thread] = 0;
}
}
@@ -214,7 +216,7 @@ struct mark_special_tokens {
__device__ void operator()(size_t idx) const
{
uint32_t const start_index = start_word_indices[idx];
-if ((start_index == std::numeric_limits<uint32_t>::max()) ||
+if ((start_index == cuda::std::numeric_limits<uint32_t>::max()) ||
((start_index + MIN_ST_WIDTH + 2) > num_code_points))
return;
if (code_points[start_index] != '[') return;
@@ -225,12 +227,12 @@ struct mark_special_tokens {
uint32_t const end_index = [&] {
auto const begin = start_word_indices + start_pos;
auto const width =
-std::min(static_cast<size_t>(MAX_ST_WIDTH + 1), (num_code_points - start_pos));
+cuda::std::min(static_cast<size_t>(MAX_ST_WIDTH + 1), (num_code_points - start_pos));
auto const end = begin + width;
// checking the next start-word is more reliable than arbitrarily searching for ']'
// in case the text is split across string rows
auto const iter = thrust::find_if(thrust::seq, begin + 1, end, [](auto swi) {
-return swi != std::numeric_limits<uint32_t>::max();
+return swi != cuda::std::numeric_limits<uint32_t>::max();
});
return iter == end ? start_index : static_cast<uint32_t>(iter - start_word_indices);
}();
@@ -254,11 +256,11 @@ struct mark_special_tokens {
thrust::fill(thrust::seq,
start_word_indices + start_index + 1, // keep the first one
start_word_indices + end_index + 1,
-std::numeric_limits<uint32_t>::max());
+cuda::std::numeric_limits<uint32_t>::max());
thrust::fill(thrust::seq,
end_word_indices + start_index,
end_word_indices + end_index + 1,
-std::numeric_limits<uint32_t>::max());
+cuda::std::numeric_limits<uint32_t>::max());

// reset the new end-word index
end_word_indices[end_pos] = end_pos + 1;
@@ -382,7 +384,7 @@ CUDF_KERNEL void kernel_wordpiece_tokenizer(uint32_t const* code_points,
// We need to clean up the global array. This case is very uncommon.
// Only 0.016% of words cannot be resolved to a token from the squad dev set.
for (uint32_t i = 1; i < num_values_tokenized; ++i) {
-token_ids[token_start + i] = std::numeric_limits<uint32_t>::max();
+token_ids[token_start + i] = cuda::std::numeric_limits<uint32_t>::max();
}
num_values_tokenized = 0;
}
@@ -423,7 +425,10 @@ uvector_pair wordpiece_tokenizer::tokenize(cudf::strings_column_view const& inpu
}

struct copy_if_fn { // inline lambda not allowed in private or protected member function
-__device__ bool operator()(uint32_t cp) { return cp != std::numeric_limits<uint32_t>::max(); }
+__device__ bool operator()(uint32_t cp)
+{
+return cp != cuda::std::numeric_limits<uint32_t>::max();
+}
};

struct tranform_fn { // just converting uint8 value to uint32
@@ -487,7 +492,7 @@ void wordpiece_tokenizer::tokenize(uvector_pair& cps_and_offsets, rmm::cuda_stre
auto itr_end = thrust::remove(rmm::exec_policy(stream),
device_word_indices.begin(),
device_word_indices.end(),
-std::numeric_limits<uint32_t>::max());
+cuda::std::numeric_limits<uint32_t>::max());

// The number of tokens selected will be double the number of words since we
// select from both the start and end index arrays.
@@ -523,7 +528,8 @@ void wordpiece_tokenizer::tokenize(uvector_pair& cps_and_offsets, rmm::cuda_stre
// token so this will always have enough memory to store the contiguous tokens.
uint32_t* contiguous_token_ids = device_code_points;
auto const copy_size = // thrust::copy_if limited to copying int-max values
-std::min(device_token_ids.size(), static_cast<std::size_t>(std::numeric_limits<int>::max()));
+cuda::std::min(device_token_ids.size(),
+static_cast<std::size_t>(cuda::std::numeric_limits<int>::max()));
auto ids_itr = device_token_ids.begin();
auto const ids_end = device_token_ids.end();
while (ids_itr != ids_end) {