From 876b669627ded22f2f4446d1e599828334b2de3c Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Mon, 2 Dec 2024 08:26:52 +0000 Subject: [PATCH 1/5] bug fix --- cpp/src/io/json/json_tree.cu | 96 +++++++++++++++++++++++++++++++++ cpp/tests/io/json/json_test.cpp | 11 ++++ 2 files changed, 107 insertions(+) diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu index e2fe926ea19..9908fbecb81 100644 --- a/cpp/src/io/json/json_tree.cu +++ b/cpp/src/io/json/json_tree.cu @@ -272,6 +272,102 @@ tree_meta_t get_tree_representation(device_span tokens, CUDF_FAIL("JSON Parser encountered an invalid format at location " + std::to_string(error_index[0])); } + // Ensure that begin tokens have matching end tokens + { + // Struct, List, StructMember, FieldName, String, Value + auto not_ok = thrust::transform_reduce( + rmm::exec_policy(stream), + tokens.begin(), + tokens.end(), + cuda::proclaim_return_type([] __device__(auto token) -> std::uint8_t { + std::uint8_t token_bits{0}; + switch (token) { + case token_t::StructBegin: [[fallthrough]]; + case token_t::StructEnd: { + token_bits |= (std::uint8_t{1} << 5); + break; + } + case token_t::ListBegin: [[fallthrough]]; + case token_t::ListEnd: { + token_bits |= (std::uint8_t{1} << 4); + break; + } + case token_t::StructMemberBegin: [[fallthrough]]; + case token_t::StructMemberEnd: { + token_bits |= (std::uint8_t{1} << 3); + break; + } + case token_t::FieldNameBegin: [[fallthrough]]; + case token_t::FieldNameEnd: { + token_bits |= (std::uint8_t{1} << 2); + break; + } + case token_t::StringBegin: [[fallthrough]]; + case token_t::StringEnd: { + token_bits |= (std::uint8_t{1} << 1); + break; + } + case token_t::ValueBegin: [[fallthrough]]; + case token_t::ValueEnd: { + token_bits |= std::uint8_t{1}; + break; + } + default: break; + } + return token_bits; + }), + std::uint8_t{0}, + thrust::bit_xor()); + /* + auto token_bitmask = cudf::detail::make_zeroed_device_uvector_async(tokens.size(), + stream, mr); thrust::for_each(rmm::exec_policy(stream), thrust::make_counting_iterator(0), + thrust::make_counting_iterator(0) + tokens.size(), + [tokens = tokens.begin(), token_bitmask = token_bitmask.begin()] __device__(auto idx) { + auto token = tokens[idx]; + auto token_bits = token_bitmask[idx]; + switch(token) { + case token_t::StructBegin: [[fallthrough]]; + case token_t::StructEnd: { + token_bits |= (std::uint8_t{1} << 5); + break; + } + case token_t::ListBegin: [[fallthrough]]; + case token_t::ListEnd: { + token_bits |= (std::uint8_t{1} << 4); + break; + } + case token_t::StructMemberBegin: [[fallthrough]]; + case token_t::StructMemberEnd: { + token_bits |= (std::uint8_t{1} << 3); + break; + } + case token_t::FieldNameBegin: [[fallthrough]]; + case token_t::FieldNameEnd: { + token_bits |= (std::uint8_t{1} << 2); + break; + } + case token_t::StringBegin: [[fallthrough]]; + case token_t::StringEnd: { + token_bits |= (std::uint8_t{1} << 1); + break; + } + case token_t::ValueBegin: [[fallthrough]]; + case token_t::ValueEnd: { + token_bits |= std::uint8_t{1}; + break; + } + default: break; + } + token_bitmask[idx] = token_bits; + }); + auto not_ok = thrust::reduce(rmm::exec_policy(stream), token_bitmask.begin(), + token_bitmask.end(), 0, + [] __device__(std::uint8_t a, std::uint8_t b) -> std::uint8_t { + return a ^ b; + }); + */ + CUDF_EXPECTS(!not_ok, "Some begin token does not have a matching end token"); + } auto const num_tokens = tokens.size(); auto const num_nodes = diff --git a/cpp/tests/io/json/json_test.cpp b/cpp/tests/io/json/json_test.cpp index 3c8db99c3c7..bc496d323ed 100644 --- a/cpp/tests/io/json/json_test.cpp +++ b/cpp/tests/io/json/json_test.cpp @@ -3310,4 +3310,15 @@ TEST_P(JsonCompressedIOTest, BasicJsonLines) CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(1), float64_wrapper{{1.1, 2.2, 3.3}}); } +TEST_F(JsonReaderTest, MismatchedBeginEndTokens) +{ + std::string data = R"({"not_valid":"json)"; + auto opts = + cudf::io::json_reader_options::builder(cudf::io::source_info{data.data(), data.size()}) + .lines(true) + .recovery_mode(cudf::io::json_recovery_mode_t::FAIL) + .build(); + EXPECT_THROW(cudf::io::read_json(opts), cudf::logic_error); +} + CUDF_TEST_PROGRAM_MAIN() From 1751347ee41d716769b525022188722b7e421080 Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Mon, 2 Dec 2024 09:01:03 +0000 Subject: [PATCH 2/5] cleanup --- cpp/src/io/json/json_tree.cu | 48 ------------------------------------ 1 file changed, 48 deletions(-) diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu index 9908fbecb81..96e5197b3a0 100644 --- a/cpp/src/io/json/json_tree.cu +++ b/cpp/src/io/json/json_tree.cu @@ -318,54 +318,6 @@ tree_meta_t get_tree_representation(device_span tokens, }), std::uint8_t{0}, thrust::bit_xor()); - /* - auto token_bitmask = cudf::detail::make_zeroed_device_uvector_async(tokens.size(), - stream, mr); thrust::for_each(rmm::exec_policy(stream), thrust::make_counting_iterator(0), - thrust::make_counting_iterator(0) + tokens.size(), - [tokens = tokens.begin(), token_bitmask = token_bitmask.begin()] __device__(auto idx) { - auto token = tokens[idx]; - auto token_bits = token_bitmask[idx]; - switch(token) { - case token_t::StructBegin: [[fallthrough]]; - case token_t::StructEnd: { - token_bits |= (std::uint8_t{1} << 5); - break; - } - case token_t::ListBegin: [[fallthrough]]; - case token_t::ListEnd: { - token_bits |= (std::uint8_t{1} << 4); - break; - } - case token_t::StructMemberBegin: [[fallthrough]]; - case token_t::StructMemberEnd: { - token_bits |= (std::uint8_t{1} << 3); - break; - } - case token_t::FieldNameBegin: [[fallthrough]]; - case token_t::FieldNameEnd: { - token_bits |= (std::uint8_t{1} << 2); - break; - } - case token_t::StringBegin: [[fallthrough]]; - case token_t::StringEnd: { - token_bits |= (std::uint8_t{1} << 1); - break; - } - case token_t::ValueBegin: [[fallthrough]]; - case token_t::ValueEnd: { - token_bits |= std::uint8_t{1}; - break; - } - default: break; - } - token_bitmask[idx] = token_bits; - }); - auto not_ok = thrust::reduce(rmm::exec_policy(stream), token_bitmask.begin(), - token_bitmask.end(), 0, - [] __device__(std::uint8_t a, std::uint8_t b) -> std::uint8_t { - return a ^ b; - }); - */ CUDF_EXPECTS(!not_ok, "Some begin token does not have a matching end token"); } From 0535cc97d55c58979fc4e7067e6c4c7185743945 Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Mon, 2 Dec 2024 14:58:49 +0000 Subject: [PATCH 3/5] corner case fix --- cpp/src/io/json/json_tree.cu | 95 ++++++++++++++++++++---------------- 1 file changed, 52 insertions(+), 43 deletions(-) diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu index 96e5197b3a0..b1d202f1932 100644 --- a/cpp/src/io/json/json_tree.cu +++ b/cpp/src/io/json/json_tree.cu @@ -275,50 +275,59 @@ tree_meta_t get_tree_representation(device_span tokens, // Ensure that begin tokens have matching end tokens { // Struct, List, StructMember, FieldName, String, Value - auto not_ok = thrust::transform_reduce( - rmm::exec_policy(stream), - tokens.begin(), - tokens.end(), - cuda::proclaim_return_type([] __device__(auto token) -> std::uint8_t { - std::uint8_t token_bits{0}; - switch (token) { - case token_t::StructBegin: [[fallthrough]]; - case token_t::StructEnd: { - token_bits |= (std::uint8_t{1} << 5); - break; - } - case token_t::ListBegin: [[fallthrough]]; - case token_t::ListEnd: { - token_bits |= (std::uint8_t{1} << 4); - break; - } - case token_t::StructMemberBegin: [[fallthrough]]; - case token_t::StructMemberEnd: { - token_bits |= (std::uint8_t{1} << 3); - break; + // Corner case: If the input is a literal such as "true" or "false" that does not end in a + // newline character, then we will have only the ValueBegin token without the corresponding + // ValueEnd + if (tokens.size() == 1) { + auto h_tokens = cudf::detail::make_std_vector_sync(tokens, stream); + CUDF_EXPECTS(h_tokens[0] == token_t::ValueBegin, + "Some begin token does not have a matching end token"); + } else { + auto not_ok = thrust::transform_reduce( + rmm::exec_policy(stream), + tokens.begin(), + tokens.end(), + cuda::proclaim_return_type([] __device__(auto token) -> std::uint8_t { + std::uint8_t token_bits{0}; + switch (token) { + case token_t::StructBegin: [[fallthrough]]; + case token_t::StructEnd: { + token_bits |= (std::uint8_t{1} << 5); + break; + } + case token_t::ListBegin: [[fallthrough]]; + case token_t::ListEnd: { + token_bits |= (std::uint8_t{1} << 4); + break; + } + case token_t::StructMemberBegin: [[fallthrough]]; + case token_t::StructMemberEnd: { + token_bits |= (std::uint8_t{1} << 3); + break; + } + case token_t::FieldNameBegin: [[fallthrough]]; + case token_t::FieldNameEnd: { + token_bits |= (std::uint8_t{1} << 2); + break; + } + case token_t::StringBegin: [[fallthrough]]; + case token_t::StringEnd: { + token_bits |= (std::uint8_t{1} << 1); + break; + } + case token_t::ValueBegin: [[fallthrough]]; + case token_t::ValueEnd: { + token_bits |= std::uint8_t{1}; + break; + } + default: break; } - case token_t::FieldNameBegin: [[fallthrough]]; - case token_t::FieldNameEnd: { - token_bits |= (std::uint8_t{1} << 2); - break; - } - case token_t::StringBegin: [[fallthrough]]; - case token_t::StringEnd: { - token_bits |= (std::uint8_t{1} << 1); - break; - } - case token_t::ValueBegin: [[fallthrough]]; - case token_t::ValueEnd: { - token_bits |= std::uint8_t{1}; - break; - } - default: break; - } - return token_bits; - }), - std::uint8_t{0}, - thrust::bit_xor()); - CUDF_EXPECTS(!not_ok, "Some begin token does not have a matching end token"); + return token_bits; + }), + std::uint8_t{0}, + thrust::bit_xor()); + CUDF_EXPECTS(!not_ok, "Some begin token does not have a matching end token"); + } } auto const num_tokens = tokens.size(); From 2936e7f2f87f2c10c47d20fd91e485cb6617bf35 Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Mon, 16 Dec 2024 06:29:01 +0000 Subject: [PATCH 4/5] simplifying logic --- cpp/src/io/fst/logical_stack.cuh | 6 ++++ cpp/src/io/json/json_tree.cu | 57 ------------------------------ cpp/src/io/json/nested_json_gpu.cu | 10 +++--- cpp/tests/io/json/json_test.cpp | 2 +- 4 files changed, 11 insertions(+), 64 deletions(-) diff --git a/cpp/src/io/fst/logical_stack.cuh b/cpp/src/io/fst/logical_stack.cuh index 0f1fc7d572b..3b1ff7dc26f 100644 --- a/cpp/src/io/fst/logical_stack.cuh +++ b/cpp/src/io/fst/logical_stack.cuh @@ -513,6 +513,12 @@ void sparse_stack_op_to_top_of_stack(StackSymbolItT d_symbols, stream)); } + // Check if the last element of d_kv_operations is 0. If not, then we have a problem. + if (num_symbols_in) { + StackOpT last_symbol = d_kv_ops_current.element(num_symbols_in - 1, stream); + CUDF_EXPECTS(last_symbol.stack_level == 0, "The logical stack is not empty!"); + } + // Stable radix sort, sorting by stack level of the operations d_kv_operations_unsigned = cub::DoubleBuffer{ reinterpret_cast(d_kv_operations.Current()), diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu index b1d202f1932..e2fe926ea19 100644 --- a/cpp/src/io/json/json_tree.cu +++ b/cpp/src/io/json/json_tree.cu @@ -272,63 +272,6 @@ tree_meta_t get_tree_representation(device_span tokens, CUDF_FAIL("JSON Parser encountered an invalid format at location " + std::to_string(error_index[0])); } - // Ensure that begin tokens have matching end tokens - { - // Struct, List, StructMember, FieldName, String, Value - // Corner case: If the input is a literal such as "true" or "false" that does not end in a - // newline character, then we will have only the ValueBegin token without the corresponding - // ValueEnd - if (tokens.size() == 1) { - auto h_tokens = cudf::detail::make_std_vector_sync(tokens, stream); - CUDF_EXPECTS(h_tokens[0] == token_t::ValueBegin, - "Some begin token does not have a matching end token"); - } else { - auto not_ok = thrust::transform_reduce( - rmm::exec_policy(stream), - tokens.begin(), - tokens.end(), - cuda::proclaim_return_type([] __device__(auto token) -> std::uint8_t { - std::uint8_t token_bits{0}; - switch (token) { - case token_t::StructBegin: [[fallthrough]]; - case token_t::StructEnd: { - token_bits |= (std::uint8_t{1} << 5); - break; - } - case token_t::ListBegin: [[fallthrough]]; - case token_t::ListEnd: { - token_bits |= (std::uint8_t{1} << 4); - break; - } - case token_t::StructMemberBegin: [[fallthrough]]; - case token_t::StructMemberEnd: { - token_bits |= (std::uint8_t{1} << 3); - break; - } - case token_t::FieldNameBegin: [[fallthrough]]; - case token_t::FieldNameEnd: { - token_bits |= (std::uint8_t{1} << 2); - break; - } - case token_t::StringBegin: [[fallthrough]]; - case token_t::StringEnd: { - token_bits |= (std::uint8_t{1} << 1); - break; - } - case token_t::ValueBegin: [[fallthrough]]; - case token_t::ValueEnd: { - token_bits |= std::uint8_t{1}; - break; - } - default: break; - } - return token_bits; - }), - std::uint8_t{0}, - thrust::bit_xor()); - CUDF_EXPECTS(!not_ok, "Some begin token does not have a matching end token"); - } - } auto const num_tokens = tokens.size(); auto const num_nodes = diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index f1c2826c62a..30a28a1cf98 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -1473,10 +1473,11 @@ void get_stack_context(device_span json_in, to_stack_op::start_state, stream); - auto stack_ops_bufsize = d_num_stack_ops.value(stream); + // Copy back to actual number of stack operations + auto num_stack_ops = d_num_stack_ops.value(stream); // Sequence of stack symbols and their position in the original input (sparse representation) - rmm::device_uvector stack_ops{stack_ops_bufsize, stream}; - rmm::device_uvector stack_op_indices{stack_ops_bufsize, stream}; + rmm::device_uvector stack_ops{num_stack_ops, stream}; + rmm::device_uvector stack_op_indices{num_stack_ops, stream}; // Run bracket-brace FST to retrieve starting positions of structs and lists json_to_stack_ops_fst.Transduce(json_in.begin(), @@ -1487,9 +1488,6 @@ void get_stack_context(device_span json_in, to_stack_op::start_state, stream); - // Copy back to actual number of stack operations - auto const num_stack_ops = d_num_stack_ops.value(stream); - // Stack operations with indices are converted to top of the stack for each character in the input if (stack_behavior == stack_behavior_t::ResetOnDelimiter) { fst::sparse_stack_op_to_top_of_stack( diff --git a/cpp/tests/io/json/json_test.cpp b/cpp/tests/io/json/json_test.cpp index bc496d323ed..58e51cf0389 100644 --- a/cpp/tests/io/json/json_test.cpp +++ b/cpp/tests/io/json/json_test.cpp @@ -3312,7 +3312,7 @@ TEST_P(JsonCompressedIOTest, BasicJsonLines) TEST_F(JsonReaderTest, MismatchedBeginEndTokens) { - std::string data = R"({"not_valid":"json)"; + std::string data = R"({"not_valid": "json)"; auto opts = cudf::io::json_reader_options::builder(cudf::io::source_info{data.data(), data.size()}) .lines(true) From a07be577ac2ab8a4acca8e82be208b65eb6b9094 Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Mon, 16 Dec 2024 08:49:36 +0000 Subject: [PATCH 5/5] check needed only when recover with null is disabled --- cpp/src/io/fst/logical_stack.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/fst/logical_stack.cuh b/cpp/src/io/fst/logical_stack.cuh index 3b1ff7dc26f..98641f2c893 100644 --- a/cpp/src/io/fst/logical_stack.cuh +++ b/cpp/src/io/fst/logical_stack.cuh @@ -514,7 +514,7 @@ void sparse_stack_op_to_top_of_stack(StackSymbolItT d_symbols, } // Check if the last element of d_kv_operations is 0. If not, then we have a problem. - if (num_symbols_in) { + if (num_symbols_in && !supports_reset_op) { StackOpT last_symbol = d_kv_ops_current.element(num_symbols_in - 1, stream); CUDF_EXPECTS(last_symbol.stack_level == 0, "The logical stack is not empty!"); }