Enable text build without relying on relaxed constexpr #17647

Draft · wants to merge 5 commits into branch-25.02
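
The pattern running through this PR: device code paths replace host-only std:: utilities (std::min, std::max, std::minmax, std::numeric_limits) with their cuda::std:: equivalents from libcu++, which are callable from device code without nvcc's --expt-relaxed-constexpr flag. A minimal sketch of the idea, using a hypothetical device helper rather than code from the diff:

// Hypothetical helper illustrating the cuda::std:: pattern used in this PR;
// the includes mirror the ones added in the files below.
#include <cuda/std/functional>  // cuda::std::min, cuda::std::max
#include <cuda/std/limits>      // cuda::std::numeric_limits

__device__ int clamp_count(int count, int bound)
{
  // These calls compile in device code without relaxed constexpr, unlike
  // their std:: counterparts, which nvcc treats as host-only functions.
  auto const hi = cuda::std::min(bound, cuda::std::numeric_limits<int>::max());
  return cuda::std::min(count, hi);
}
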
2 changes: 1 addition & 1 deletion cpp/include/cudf/utilities/type_dispatcher.hpp
@@ -53,7 +53,7 @@ namespace CUDF_EXPORT cudf {
* @return The `cudf::type_id` corresponding to the specified type
*/
template <typename T>
-inline constexpr type_id type_to_id()
+CUDF_HOST_DEVICE inline constexpr type_id type_to_id()
{
return type_id::EMPTY;
};
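
A note on the type_dispatcher.hpp hunk above: without --expt-relaxed-constexpr, nvcc treats a plain constexpr function as host-only, so device-side callers of type_to_id<T>() would not compile. The CUDF_HOST_DEVICE annotation (which, under nvcc, is expected to expand to __host__ __device__) makes the function explicitly callable from both sides. A minimal sketch of the same pattern with a hypothetical macro and function, not taken from cudf:

// Hypothetical stand-in for CUDF_HOST_DEVICE: annotate a constexpr function
// so device code can call it without relying on relaxed constexpr.
#ifdef __CUDACC__
#define MY_HOST_DEVICE __host__ __device__
#else
#define MY_HOST_DEVICE
#endif

template <typename T>
MY_HOST_DEVICE inline constexpr int size_in_bits()
{
  return static_cast<int>(sizeof(T) * 8);
}

__global__ void write_bits(int* out) { *out = size_in_bits<unsigned long long>(); }
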
15 changes: 8 additions & 7 deletions cpp/src/text/edit_distance.cu
@@ -30,6 +30,7 @@
#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>

+#include <cuda/std/functional>
#include <thrust/execution_policy.h>
#include <thrust/for_each.h>
#include <thrust/functional.h>
@@ -64,10 +65,10 @@ __device__ cudf::size_type compute_distance(cudf::string_view const& d_str,
if (str_length == 0) return tgt_length;
if (tgt_length == 0) return str_length;

-auto begin = str_length < tgt_length ? d_str.begin() : d_tgt.begin();
-auto itr = str_length < tgt_length ? d_tgt.begin() : d_str.begin();
-// .first is min and .second is max
-auto const [n, m] = std::minmax(str_length, tgt_length);
+auto begin = str_length < tgt_length ? d_str.begin() : d_tgt.begin();
+auto itr = str_length < tgt_length ? d_tgt.begin() : d_str.begin();
+auto const n = cuda::std::min(str_length, tgt_length);
+auto const m = cuda::std::max(str_length, tgt_length);
// setup compute buffer pointers
auto v0 = buffer;
auto v1 = v0 + n + 1;
@@ -81,7 +82,7 @@ __device__ cudf::size_type compute_distance(cudf::string_view const& d_str,
auto sub_cost = v0[j] + (*itr != *itr_tgt);
auto del_cost = v0[j + 1] + 1;
auto ins_cost = v1[j] + 1;
-v1[j + 1] = std::min(std::min(sub_cost, del_cost), ins_cost);
+v1[j + 1] = cuda::std::min(cuda::std::min(sub_cost, del_cost), ins_cost);
}
thrust::swap(v0, v1);
}
@@ -170,7 +171,7 @@ std::unique_ptr<cudf::column> edit_distance(cudf::strings_column_view const& str
? d_targets.element<cudf::string_view>(0)
: d_targets.element<cudf::string_view>(idx);
// just need 2 integers for each character of the shorter string
-return (std::min(d_str.length(), d_tgt.length()) + 1) * 2;
+return (cuda::std::min(d_str.length(), d_tgt.length()) + 1) * 2;
});

// get the total size of the temporary compute buffer
@@ -241,7 +242,7 @@ std::unique_ptr<cudf::column> edit_distance_matrix(cudf::strings_column_view con
if (d_str1.empty() || d_str2.empty()) { return; }
// the temp size needed is 2 integers per character of the shorter string
d_offsets[idx - ((row + 1) * (row + 2)) / 2] =
-(std::min(d_str1.length(), d_str2.length()) + 1) * 2;
+(cuda::std::min(d_str1.length(), d_str2.length()) + 1) * 2;
});

// get the total size for the compute buffer
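
One detail in the edit_distance.cu hunks above: std::minmax is a host-only call when relaxed constexpr is disabled, so the structured-binding form is dropped in favor of two separate libcu++ calls. The equivalent transformation, shown standalone with the same variable names as the diff:

// Before (needs relaxed constexpr in device code):
//   auto const [n, m] = std::minmax(str_length, tgt_length);
// After (valid in device code without the flag):
auto const n = cuda::std::min(str_length, tgt_length);
auto const m = cuda::std::max(str_length, tgt_length);
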
3 changes: 2 additions & 1 deletion cpp/src/text/jaccard.cu
@@ -36,6 +36,7 @@
#include <rmm/exec_policy.hpp>

#include <cub/cub.cuh>
+#include <cuda/std/functional>
#include <thrust/binary_search.h>
#include <thrust/execution_policy.h>
#include <thrust/iterator/counting_iterator.h>
@@ -243,7 +244,7 @@ CUDF_KERNEL void count_substrings_kernel(cudf::column_device_view const d_string
}
}
auto const char_count = warp_reduce(temp_storage).Sum(count);
-if (lane_idx == 0) { d_counts[str_idx] = std::max(1, char_count - width + 1); }
+if (lane_idx == 0) { d_counts[str_idx] = cuda::std::max(1, char_count - width + 1); }
}

/**
9 changes: 4 additions & 5 deletions cpp/src/text/minhash.cu
@@ -40,14 +40,13 @@

#include <cooperative_groups.h>
#include <cuda/atomic>
+#include <cuda/std/limits>
#include <thrust/binary_search.h>
#include <thrust/execution_policy.h>
#include <thrust/fill.h>
#include <thrust/sequence.h>
#include <thrust/sort.h>

-#include <limits>
-
namespace nvtext {
namespace detail {
namespace {
@@ -156,7 +155,7 @@ CUDF_KERNEL void minhash_seed_kernel(cudf::column_device_view const d_strings,
// initialize the output -- only needed for wider strings
auto d_output = d_results + (str_idx * param_count);
for (auto i = lane_idx; i < param_count; i += tile_size) {
-d_output[i] = std::numeric_limits<hash_value_type>::max();
+d_output[i] = cuda::std::numeric_limits<hash_value_type>::max();
}
}
}
@@ -226,7 +225,7 @@ CUDF_KERNEL void minhash_kernel(cudf::column_device_view const d_strings,
? section_size
: cuda::std::max(static_cast<cudf::size_type>(size_bytes > 0), section_size - width + 1);

-auto const init = size_bytes == 0 ? 0 : std::numeric_limits<hash_value_type>::max();
+auto const init = size_bytes == 0 ? 0 : cuda::std::numeric_limits<hash_value_type>::max();
auto const lane_idx = block.thread_rank();
auto const d_output = d_results + (str_idx * parameter_a.size());

@@ -235,7 +234,7 @@ CUDF_KERNEL void minhash_kernel(cudf::column_device_view const d_strings,

// constants used in the permutation calculations
constexpr uint64_t mersenne_prime = (1UL << 61) - 1;
-constexpr hash_value_type hash_max = std::numeric_limits<hash_value_type>::max();
+constexpr hash_value_type hash_max = cuda::std::numeric_limits<hash_value_type>::max();

// found to be an efficient shared memory size for both hash types
__shared__ hash_value_type block_values[block_size * params_per_thread];
3 changes: 2 additions & 1 deletion cpp/src/text/replace.cu
@@ -35,6 +35,7 @@
#include <rmm/cuda_stream_view.hpp>

#include <cuda/atomic>
+#include <cuda/std/functional>
#include <thrust/binary_search.h>
#include <thrust/distance.h>
#include <thrust/execution_policy.h>
@@ -196,7 +197,7 @@ struct sub_offset_fn {
{
// keep delimiter search within this sub-block
auto const end =
-d_input_chars + std::min(last_offset, ((idx + 2) * LS_SUB_BLOCK_SIZE) + first_offset);
+d_input_chars + cuda::std::min(last_offset, ((idx + 2) * LS_SUB_BLOCK_SIZE) + first_offset);
// starting point of this sub-block
auto itr = d_input_chars + first_offset + ((idx + 1) * LS_SUB_BLOCK_SIZE);
while ((itr < end) &&
5 changes: 3 additions & 2 deletions cpp/src/text/subword/data_normalizer.cu
@@ -26,6 +26,7 @@
#include <rmm/cuda_stream_view.hpp>
#include <rmm/exec_policy.hpp>

+#include <cuda/std/functional>
#include <thrust/for_each.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/pair.h>
@@ -134,8 +135,8 @@ extract_code_points_from_utf8(unsigned char const* strings,
constexpr uint8_t max_utf8_blocks_for_char = 4;
uint8_t utf8_blocks[max_utf8_blocks_for_char] = {0};

-for (int i = 0; i < std::min(static_cast<size_t>(max_utf8_blocks_for_char),
-total_bytes - start_byte_for_thread);
+for (int i = 0; i < cuda::std::min(static_cast<size_t>(max_utf8_blocks_for_char),
+total_bytes - start_byte_for_thread);
++i) {
utf8_blocks[i] = strings[start_byte_for_thread + i];
}
30 changes: 18 additions & 12 deletions cpp/src/text/subword/wordpiece_tokenizer.cu
@@ -27,6 +27,8 @@
#include <rmm/cuda_stream_view.hpp>
#include <rmm/exec_policy.hpp>

+#include <cuda/std/functional>
+#include <cuda/std/limits>
#include <thrust/copy.h>
#include <thrust/distance.h>
#include <thrust/execution_policy.h>
@@ -87,23 +89,23 @@ CUDF_KERNEL void init_data_and_mark_word_start_and_ends(uint32_t const* code_poi

// Deal with the start_word_indices array
if (char_for_thread < num_code_points) {
-uint32_t val_to_write = std::numeric_limits<uint32_t>::max();
+uint32_t val_to_write = cuda::std::numeric_limits<uint32_t>::max();
if ((code_points[char_for_thread] != SPACE_CODE_POINT) && (char_for_thread > 0) &&
(code_points[char_for_thread - 1] == SPACE_CODE_POINT)) {
val_to_write = char_for_thread;
}
start_word_indices[char_for_thread] = val_to_write;

// Deal with the end_word_indices_array
-val_to_write = std::numeric_limits<uint32_t>::max();
+val_to_write = cuda::std::numeric_limits<uint32_t>::max();
if ((code_points[char_for_thread] != SPACE_CODE_POINT) &&
(char_for_thread + 1 < num_code_points) &&
(code_points[char_for_thread + 1] == SPACE_CODE_POINT)) {
val_to_write = char_for_thread + 1;
}
end_word_indices[char_for_thread] = val_to_write;

-token_ids[char_for_thread] = std::numeric_limits<uint32_t>::max();
+token_ids[char_for_thread] = cuda::std::numeric_limits<uint32_t>::max();
tokens_per_word[char_for_thread] = 0;
}
}
@@ -214,7 +216,7 @@ struct mark_special_tokens {
__device__ void operator()(size_t idx) const
{
uint32_t const start_index = start_word_indices[idx];
-if ((start_index == std::numeric_limits<uint32_t>::max()) ||
+if ((start_index == cuda::std::numeric_limits<uint32_t>::max()) ||
((start_index + MIN_ST_WIDTH + 2) > num_code_points))
return;
if (code_points[start_index] != '[') return;
@@ -225,12 +227,12 @@ struct mark_special_tokens {
uint32_t const end_index = [&] {
auto const begin = start_word_indices + start_pos;
auto const width =
-std::min(static_cast<size_t>(MAX_ST_WIDTH + 1), (num_code_points - start_pos));
+cuda::std::min(static_cast<size_t>(MAX_ST_WIDTH + 1), (num_code_points - start_pos));
auto const end = begin + width;
// checking the next start-word is more reliable than arbitrarily searching for ']'
// in case the text is split across string rows
auto const iter = thrust::find_if(thrust::seq, begin + 1, end, [](auto swi) {
-return swi != std::numeric_limits<uint32_t>::max();
+return swi != cuda::std::numeric_limits<uint32_t>::max();
});
return iter == end ? start_index : static_cast<uint32_t>(iter - start_word_indices);
}();
@@ -254,11 +256,11 @@ struct mark_special_tokens {
thrust::fill(thrust::seq,
start_word_indices + start_index + 1, // keep the first one
start_word_indices + end_index + 1,
-std::numeric_limits<uint32_t>::max());
+cuda::std::numeric_limits<uint32_t>::max());
thrust::fill(thrust::seq,
end_word_indices + start_index,
end_word_indices + end_index + 1,
-std::numeric_limits<uint32_t>::max());
+cuda::std::numeric_limits<uint32_t>::max());

// reset the new end-word index
end_word_indices[end_pos] = end_pos + 1;
@@ -382,7 +384,7 @@ CUDF_KERNEL void kernel_wordpiece_tokenizer(uint32_t const* code_points,
// We need to clean up the global array. This case is very uncommon.
// Only 0.016% of words cannot be resolved to a token from the squad dev set.
for (uint32_t i = 1; i < num_values_tokenized; ++i) {
-token_ids[token_start + i] = std::numeric_limits<uint32_t>::max();
+token_ids[token_start + i] = cuda::std::numeric_limits<uint32_t>::max();
}
num_values_tokenized = 0;
}
@@ -423,7 +425,10 @@ uvector_pair wordpiece_tokenizer::tokenize(cudf::strings_column_view const& inpu
}

struct copy_if_fn { // inline lambda not allowed in private or protected member function
-__device__ bool operator()(uint32_t cp) { return cp != std::numeric_limits<uint32_t>::max(); }
+__device__ bool operator()(uint32_t cp)
+{
+return cp != cuda::std::numeric_limits<uint32_t>::max();
+}
};

struct tranform_fn { // just converting uint8 value to uint32
@@ -487,7 +492,7 @@ void wordpiece_tokenizer::tokenize(uvector_pair& cps_and_offsets, rmm::cuda_stre
auto itr_end = thrust::remove(rmm::exec_policy(stream),
device_word_indices.begin(),
device_word_indices.end(),
-std::numeric_limits<uint32_t>::max());
+cuda::std::numeric_limits<uint32_t>::max());

// The number of tokens selected will be double the number of words since we
// select from both the start and end index arrays.
@@ -523,7 +528,8 @@ void wordpiece_tokenizer::tokenize(uvector_pair& cps_and_offsets, rmm::cuda_stre
// token so this will always have enough memory to store the contiguous tokens.
uint32_t* contiguous_token_ids = device_code_points;
auto const copy_size = // thrust::copy_if limited to copying int-max values
-std::min(device_token_ids.size(), static_cast<std::size_t>(std::numeric_limits<int>::max()));
+cuda::std::min(device_token_ids.size(),
+static_cast<std::size_t>(cuda::std::numeric_limits<int>::max()));
auto ids_itr = device_token_ids.begin();
auto const ids_end = device_token_ids.end();
while (ids_itr != ids_end) {