Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Detect mismatches in begin and end tokens returned by JSON tokenizer FST #17471

Merged
merged 9 commits into from
Dec 19, 2024
6 changes: 6 additions & 0 deletions cpp/src/io/fst/logical_stack.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -513,6 +513,12 @@ void sparse_stack_op_to_top_of_stack(StackSymbolItT d_symbols,
stream));
}

// When reset operations are not supported, the last stack operation must be at stack level 0;
// otherwise the logical stack is not empty.
if (num_symbols_in && !supports_reset_op) {
StackOpT last_symbol = d_kv_ops_current.element(num_symbols_in - 1, stream);
CUDF_EXPECTS(last_symbol.stack_level == 0, "The logical stack is not empty!");
}

// Stable radix sort, sorting by stack level of the operations
d_kv_operations_unsigned = cub::DoubleBuffer<StackOpUnsignedT>{
reinterpret_cast<StackOpUnsignedT*>(d_kv_operations.Current()),
Expand Down
10 changes: 4 additions & 6 deletions cpp/src/io/json/nested_json_gpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -1473,10 +1473,11 @@ void get_stack_context(device_span<SymbolT const> json_in,
to_stack_op::start_state,
stream);

auto stack_ops_bufsize = d_num_stack_ops.value(stream);
// Copy back to actual number of stack operations
auto num_stack_ops = d_num_stack_ops.value(stream);
// Sequence of stack symbols and their position in the original input (sparse representation)
rmm::device_uvector<StackSymbolT> stack_ops{stack_ops_bufsize, stream};
rmm::device_uvector<SymbolOffsetT> stack_op_indices{stack_ops_bufsize, stream};
rmm::device_uvector<StackSymbolT> stack_ops{num_stack_ops, stream};
rmm::device_uvector<SymbolOffsetT> stack_op_indices{num_stack_ops, stream};

// Run bracket-brace FST to retrieve starting positions of structs and lists
json_to_stack_ops_fst.Transduce(json_in.begin(),
Expand All @@ -1487,9 +1488,6 @@ void get_stack_context(device_span<SymbolT const> json_in,
to_stack_op::start_state,
stream);

// Copy back to actual number of stack operations
auto const num_stack_ops = d_num_stack_ops.value(stream);

// Stack operations with indices are converted to top of the stack for each character in the input
if (stack_behavior == stack_behavior_t::ResetOnDelimiter) {
fst::sparse_stack_op_to_top_of_stack<fst::stack_op_support::WITH_RESET_SUPPORT, StackLevelT>(
Expand Down
11 changes: 11 additions & 0 deletions cpp/tests/io/json/json_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3450,4 +3450,15 @@ TEST_P(JsonCompressedIOTest, BasicJsonLines)
CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(1), float64_wrapper{{1.1, 2.2, 3.3}});
}

// Malformed JSON Lines input must make the reader throw when it runs in FAIL recovery mode.
TEST_F(JsonReaderTest, MismatchedBeginEndTokens)
{
  // The record stops inside a string value, so neither the string nor the enclosing object is
  // ever closed.
  std::string const malformed_record = R"({"not_valid": "json)";
  cudf::io::source_info const source{malformed_record.data(), malformed_record.size()};

  auto const reader_options = cudf::io::json_reader_options::builder(source)
                                .lines(true)
                                .recovery_mode(cudf::io::json_recovery_mode_t::FAIL)
                                .build();

  EXPECT_THROW(cudf::io::read_json(reader_options), cudf::logic_error);
}

CUDF_TEST_PROGRAM_MAIN()
Loading