diff --git a/cpp/include/cudf/io/detail/tokenize_json.hpp b/cpp/include/cudf/io/detail/tokenize_json.hpp
index de67b7cd052..d8afcbc6040 100644
--- a/cpp/include/cudf/io/detail/tokenize_json.hpp
+++ b/cpp/include/cudf/io/detail/tokenize_json.hpp
@@ -130,6 +130,7 @@ enum class LineEndTokenOption { Keep, Discard };
  *
  * @param json_in The JSON input
  * @param options Parsing options specifying the parsing behaviour
+ * @param line_end_option Option specifying whether to keep or discard line-end tokens
  * @param stream The CUDA stream to which kernels are dispatched
  * @param mr Optional, resource with which to allocate
  * @return Pair of device vectors, where the first vector represents the token types and the second
@@ -142,6 +143,23 @@ std::pair<rmm::device_uvector<PdaTokenT>, rmm::device_uvector<SymbolOffsetT>> get_token_stream(
   rmm::cuda_stream_view stream,
   rmm::device_async_resource_ref mr);
 
+/**
+ * @brief Parses the given JSON string and emits a sequence of tokens that demarcate relevant
+ * sections from the input. Line-end tokens are discarded.
+ *
+ * @param json_in The JSON input
+ * @param options Parsing options specifying the parsing behaviour
+ * @param stream The CUDA stream to which kernels are dispatched
+ * @param mr Optional, resource with which to allocate
+ * @return Pair of device vectors, where the first vector represents the token types and the second
+ * vector represents the index within the input corresponding to each token
+ */
+std::pair<rmm::device_uvector<PdaTokenT>, rmm::device_uvector<SymbolOffsetT>> get_token_stream(
+  device_span<SymbolT const> json_in,
+  cudf::io::json_reader_options const& options,
+  rmm::cuda_stream_view stream,
+  rmm::device_async_resource_ref mr);
+
 }  // namespace detail
 
 }  // namespace cudf::io::json
diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp
index ca6ece4d2dc..2fd7b32750f 100644
--- a/cpp/src/io/json/nested_json.hpp
+++ b/cpp/src/io/json/nested_json.hpp
@@ -211,6 +211,7 @@ void get_stack_context(device_span<SymbolT const> json_in,
  *
  * @param tokens The tokens to be post-processed
  * @param token_indices The tokens' corresponding indices that are post-processed
+ * @param line_end_option Option specifying whether to keep or discard line-end tokens
  * @param stream The cuda stream to dispatch GPU kernels to
  * @return Returns the post-processed token stream
  */
diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu
index c6a1c51993d..68339c5c276 100644
--- a/cpp/src/io/json/nested_json_gpu.cu
+++ b/cpp/src/io/json/nested_json_gpu.cu
@@ -1656,6 +1656,15 @@ std::pair<rmm::device_uvector<PdaTokenT>, rmm::device_uvector<SymbolOffsetT>> get_token_stream(
   return std::make_pair(std::move(tokens), std::move(tokens_indices));
 }
 
+std::pair<rmm::device_uvector<PdaTokenT>, rmm::device_uvector<SymbolOffsetT>> get_token_stream(
+  device_span<SymbolT const> json_in,
+  cudf::io::json_reader_options const& options,
+  rmm::cuda_stream_view stream,
+  rmm::device_async_resource_ref mr)
+{
+  return get_token_stream(json_in, options, LineEndTokenOption::Discard, stream, mr);
+}
+
 /**
  * @brief Parses the given JSON string and generates a tree representation of the given input.
  *
diff --git a/cpp/tests/io/nested_json_test.cpp b/cpp/tests/io/nested_json_test.cpp
index 552a87eacd7..eb90c0a8509 100644
--- a/cpp/tests/io/nested_json_test.cpp
+++ b/cpp/tests/io/nested_json_test.cpp
@@ -436,12 +436,8 @@ TEST_F(JsonTest, TokenStream)
     cudf::device_span<SymbolT const>{d_scalar.data(), static_cast<size_t>(d_scalar.size())};
 
   // Parse the JSON and get the token stream
-  auto [d_tokens_gpu, d_token_indices_gpu] =
-    cuio_json::detail::get_token_stream(d_input,
-                                        default_options,
-                                        cuio_json::detail::LineEndTokenOption::Discard,
-                                        stream,
-                                        rmm::mr::get_current_device_resource());
+  auto [d_tokens_gpu, d_token_indices_gpu] = cuio_json::detail::get_token_stream(
+    d_input, default_options, stream, rmm::mr::get_current_device_resource());
   // Copy back the number of tokens that were written
   auto const tokens_gpu        = cudf::detail::make_std_vector_async(d_tokens_gpu, stream);
   auto const token_indices_gpu = cudf::detail::make_std_vector_async(d_token_indices_gpu, stream);
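
Note: the convenience overload added above simply forwards to the existing get_token_stream with LineEndTokenOption::Discard, so callers that do not care about line-end tokens keep the previous behaviour. Below is a minimal usage sketch; the wrapper function name, the JSON literal, and the default-constructed reader options are illustrative assumptions, not part of this diff.

#include <cudf/io/detail/tokenize_json.hpp>
#include <cudf/io/json.hpp>
#include <cudf/scalar/scalar.hpp>
#include <cudf/utilities/default_stream.hpp>
#include <cudf/utilities/span.hpp>

#include <rmm/mr/device/per_device_resource.hpp>

#include <cstddef>

// Hypothetical caller: tokenizes a small JSON document with the new
// convenience overload, which discards line-end tokens internally.
void tokenize_example()
{
  auto const stream = cudf::get_default_stream();

  // Copy the JSON text to the device and view it as a span of symbols
  auto const d_scalar = cudf::string_scalar{R"([{"a": 1}, {"a": 2}])", true, stream};
  auto const d_input  = cudf::device_span<cudf::io::json::SymbolT const>{
    d_scalar.data(), static_cast<std::size_t>(d_scalar.size())};

  // Default parsing options, as in the test above
  cudf::io::json_reader_options const options{};

  // No LineEndTokenOption argument: the overload added by this diff
  // forwards LineEndTokenOption::Discard on the caller's behalf
  auto [tokens, token_indices] = cudf::io::json::detail::get_token_stream(
    d_input, options, stream, rmm::mr::get_current_device_resource());
}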