From e2bd05e69b207af1bf8764eb7884c41a5f8abe16 Mon Sep 17 00:00:00 2001 From: Suraj Aralihalli Date: Fri, 26 Apr 2024 16:30:43 -0700 Subject: [PATCH] remove_line_end_token fix Signed-off-by: Suraj Aralihalli --- cpp/src/io/json/nested_json_gpu.cu | 82 ++++++++++++++++++++---------- cpp/tests/io/nested_json_test.cpp | 18 +++---- 2 files changed, 63 insertions(+), 37 deletions(-) diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 3fb1bf8dde8..541adc264b7 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -297,17 +297,16 @@ struct TransduceTokenKeepLineEnd { if (is_end_of_invalid_line) { switch (relative_offset) { - case 0 : return SymbolT{token_t::StructEnd, 0}; - case 1 : return SymbolT{token_t::StructBegin, 0}; - case 2 : return SymbolT{token_t::LineEnd, 0}; - default: return SymbolT{token_t::LineEnd, 0}; // doesn't appear + case 0: return SymbolT{token_t::StructEnd, 0}; + case 1: return SymbolT{token_t::StructBegin, 0}; + case 2: return SymbolT{token_t::LineEnd, 0}; + default: return SymbolT{token_t::LineEnd, 0}; // doesn't appear } } else if (is_end_of_valid_line) { return SymbolT{token_t::LineEnd, 0}; } else { return read_symbol; } - } template @@ -1539,34 +1538,63 @@ std::pair, rmm::device_uvector> pr // Instantiate FST for post-processing the token stream to remove all tokens that belong to an // invalid JSON line token_filter::UnwrapTokenFromSymbolOp sgid_op{}; - auto filter_fst = - fst::detail::make_fst(fst::detail::make_symbol_group_lut(token_filter::symbol_groups, sgid_op), - fst::detail::make_transition_table(token_filter::transition_table), - fst::detail::make_translation_functor(token_filter::TransduceTokenKeepLineEnd{}), - stream); auto const mr = rmm::mr::get_current_device_resource(); rmm::device_scalar d_num_selected_tokens(stream, mr); rmm::device_uvector filtered_tokens_out{tokens.size(), stream, mr}; rmm::device_uvector filtered_token_indices_out{tokens.size(), stream, mr}; - // The FST is run on the reverse token stream, discarding all tokens between ErrorBegin and the - // next LineEnd (LineEnd, inv_token_0, inv_token_1, ..., inv_token_n, ErrorBegin, LineEnd, ...), - // emitting a [StructBegin, StructEnd] pair on the end of such an invalid line. In that example, - // inv_token_i for i in [0, n] together with the ErrorBegin are removed and replaced with - // StructBegin, StructEnd. Also, all LineEnd are removed as well, as these are not relevant after - // this stage anymore - filter_fst.Transduce( - thrust::make_reverse_iterator(thrust::make_zip_iterator(tokens.data(), token_indices.data()) + - tokens.size()), - static_cast(tokens.size()), - thrust::make_reverse_iterator( - thrust::make_zip_iterator(filtered_tokens_out.data(), filtered_token_indices_out.data()) + - tokens.size()), - thrust::make_discard_iterator(), - d_num_selected_tokens.data(), - token_filter::start_state, - stream); + if (remove_line_end_token) { + // The FST is run on the reverse token stream, discarding all tokens between ErrorBegin and the + // next LineEnd (LineEnd, inv_token_0, inv_token_1, ..., inv_token_n, ErrorBegin, LineEnd, ...), + // emitting a [StructBegin, StructEnd] pair on the end of such an invalid line. In that example, + // inv_token_i for i in [0, n] together with the ErrorBegin are removed and replaced with + // StructBegin, StructEnd. Also, all LineEnd are removed as well, as these are not relevant + // after this stage anymore + + auto filter_fst = fst::detail::make_fst( + fst::detail::make_symbol_group_lut(token_filter::symbol_groups, sgid_op), + fst::detail::make_transition_table(token_filter::transition_table), + fst::detail::make_translation_functor(token_filter::TransduceToken{}), + stream); + + filter_fst.Transduce( + thrust::make_reverse_iterator(thrust::make_zip_iterator(tokens.data(), token_indices.data()) + + tokens.size()), + static_cast(tokens.size()), + thrust::make_reverse_iterator( + thrust::make_zip_iterator(filtered_tokens_out.data(), filtered_token_indices_out.data()) + + tokens.size()), + thrust::make_discard_iterator(), + d_num_selected_tokens.data(), + token_filter::start_state, + stream); + } else { + // The FST is run on the reverse token stream, discarding all tokens between ErrorBegin and the + // next LineEnd (LineEnd, inv_token_0, inv_token_1, ..., inv_token_n, ErrorBegin, LineEnd, ...), + // emitting a [LineEnd, StructBegin, StructEnd] on the end of such an invalid line. In that + // example, inv_token_i for i in [0, n] together with the ErrorBegin are removed and replaced + // with LineEnd, StructBegin, StructEnd. Unlike the previous case, LineEnd tokens are retained + // however, the corresponding token index is written as 0. + + auto filter_fst = fst::detail::make_fst( + fst::detail::make_symbol_group_lut(token_filter::symbol_groups, sgid_op), + fst::detail::make_transition_table(token_filter::transition_table), + fst::detail::make_translation_functor(token_filter::TransduceTokenKeepLineEnd{}), + stream); + + filter_fst.Transduce( + thrust::make_reverse_iterator(thrust::make_zip_iterator(tokens.data(), token_indices.data()) + + tokens.size()), + static_cast(tokens.size()), + thrust::make_reverse_iterator( + thrust::make_zip_iterator(filtered_tokens_out.data(), filtered_token_indices_out.data()) + + tokens.size()), + thrust::make_discard_iterator(), + d_num_selected_tokens.data(), + token_filter::start_state, + stream); + } auto const num_total_tokens = d_num_selected_tokens.value(stream); rmm::device_uvector tokens_out{num_total_tokens, stream, mr}; diff --git a/cpp/tests/io/nested_json_test.cpp b/cpp/tests/io/nested_json_test.cpp index a1aba738737..ecd0f41f349 100644 --- a/cpp/tests/io/nested_json_test.cpp +++ b/cpp/tests/io/nested_json_test.cpp @@ -1162,7 +1162,7 @@ TEST_F(JsonTest, TokenStreamWithLineEnd) using cuio_json::SymbolOffsetT; using cuio_json::SymbolT; // Test input - std::string const input = R"({"a":1} + std::string const input = R"({"a":1} {"b":2} {"c" {"d":4})"; @@ -1181,14 +1181,14 @@ TEST_F(JsonTest, TokenStreamWithLineEnd) // Parse the JSON and get the token stream auto [d_tokens_gpu, d_token_indices_gpu] = cuio_json::detail::get_token_stream( - d_input, default_options, stream, rmm::mr::get_current_device_resource()); + d_input, default_options, stream, rmm::mr::get_current_device_resource(), false); // Copy back the number of tokens that were written - auto const tokens_gpu = cudf::detail::make_std_vector_async(d_tokens_gpu, stream); + auto const tokens_gpu = cudf::detail::make_std_vector_async(d_tokens_gpu, stream); // // Golden token stream sample - using token_t = cuio_json::token_t; + using token_t = cuio_json::token_t; std::vector const golden_token_stream = { - //Line 1 + // Line 1 token_t::LineEnd, token_t::StructBegin, token_t::StructMemberBegin, @@ -1198,7 +1198,7 @@ TEST_F(JsonTest, TokenStreamWithLineEnd) token_t::ValueEnd, token_t::StructMemberEnd, token_t::StructEnd, - //Line 2 + // Line 2 token_t::LineEnd, token_t::StructBegin, token_t::StructMemberBegin, @@ -1208,11 +1208,11 @@ TEST_F(JsonTest, TokenStreamWithLineEnd) token_t::ValueEnd, token_t::StructMemberEnd, token_t::StructEnd, - //Error + // Error token_t::LineEnd, token_t::StructBegin, token_t::StructEnd, - //Line 3 + // Line 3 token_t::LineEnd, token_t::StructBegin, token_t::StructMemberBegin, @@ -1233,6 +1233,4 @@ TEST_F(JsonTest, TokenStreamWithLineEnd) } } - - CUDF_TEST_PROGRAM_MAIN()