From 064dd7b02166cc67e882b708d66621bc3fafd70b Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Fri, 26 Apr 2024 16:20:32 -0700 Subject: [PATCH] Add fields to Parquet Statistics structure that were added in parquet-format 2.10 (#15412) [PARQUET-2352](https://github.com/apache/parquet-format/pull/216) added fields to the `Statistics` struct to indicate whether the min and max values were exact or had been truncated. This was somewhat ambiguous in the past. One reason to want to know this is to allow avoiding the decoding of pages (or column chunks) that contain a single value (if the min and max are the same value, and are known to be exact values, and there are no nulls, then the only valid value for the page will be that value). This PR adds these new fields, which will always be `true` in cuDF since cuDF does not support truncating min and max values in the statistics (but does support truncation in the page indexes). Authors: - Ed Seidl (https://github.com/etseidl) - Nghia Truong (https://github.com/ttnghia) Approvers: - Paul Mattione (https://github.com/pmattione-nvidia) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/15412 --- cpp/src/io/parquet/compact_protocol_reader.cpp | 5 ++++- cpp/src/io/parquet/compact_protocol_writer.cpp | 2 ++ cpp/src/io/parquet/page_enc.cu | 3 +++ cpp/src/io/parquet/parquet.hpp | 4 ++++ cpp/tests/io/parquet_writer_test.cpp | 6 ++++++ 5 files changed, 19 insertions(+), 1 deletion(-) diff --git a/cpp/src/io/parquet/compact_protocol_reader.cpp b/cpp/src/io/parquet/compact_protocol_reader.cpp index a3b58347e20..c9212334a96 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.cpp +++ b/cpp/src/io/parquet/compact_protocol_reader.cpp @@ -763,13 +763,16 @@ void CompactProtocolReader::read(Statistics* s) { using optional_binary = parquet_field_optional, parquet_field_binary>; using optional_int64 = parquet_field_optional; + using optional_bool = parquet_field_optional; auto op = std::make_tuple(optional_binary(1, s->max), optional_binary(2, s->min), optional_int64(3, s->null_count), optional_int64(4, s->distinct_count), optional_binary(5, s->max_value), - optional_binary(6, s->min_value)); + optional_binary(6, s->min_value), + optional_bool(7, s->is_max_value_exact), + optional_bool(8, s->is_min_value_exact)); function_builder(this, op); } diff --git a/cpp/src/io/parquet/compact_protocol_writer.cpp b/cpp/src/io/parquet/compact_protocol_writer.cpp index 2174fe46663..14c99f728de 100644 --- a/cpp/src/io/parquet/compact_protocol_writer.cpp +++ b/cpp/src/io/parquet/compact_protocol_writer.cpp @@ -202,6 +202,8 @@ size_t CompactProtocolWriter::write(Statistics const& s) if (s.distinct_count.has_value()) { c.field_int(4, s.distinct_count.value()); } if (s.max_value.has_value()) { c.field_binary(5, s.max_value.value()); } if (s.min_value.has_value()) { c.field_binary(6, s.min_value.value()); } + if (s.is_max_value_exact.has_value()) { c.field_bool(7, s.is_max_value_exact.value()); } + if (s.is_min_value_exact.has_value()) { c.field_bool(8, s.is_min_value_exact.value()); } return c.value(); } diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index 227f13db60e..11b18579c58 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -2944,6 +2944,9 @@ __device__ uint8_t* EncodeStatistics(uint8_t* start, auto const [min_ptr, min_size] = get_extremum(&s->min_value, dtype, scratch, true, NO_TRUNC_STATS); encoder.field_binary(6, min_ptr, min_size); + // cudf min/max statistics are always exact (i.e. not truncated) + encoder.field_bool(7, true); + encoder.field_bool(8, true); } encoder.end(&end); return end; diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp index fe9b6ead6d4..756726945cf 100644 --- a/cpp/src/io/parquet/parquet.hpp +++ b/cpp/src/io/parquet/parquet.hpp @@ -259,6 +259,10 @@ struct Statistics { thrust::optional> max_value; // min value for column determined by ColumnOrder thrust::optional> min_value; + // If true, max_value is the actual maximum value for a column + thrust::optional is_max_value_exact; + // If true, min_value is the actual minimum value for a column + thrust::optional is_min_value_exact; }; /** diff --git a/cpp/tests/io/parquet_writer_test.cpp b/cpp/tests/io/parquet_writer_test.cpp index e88afd73290..3a8763ed9f3 100644 --- a/cpp/tests/io/parquet_writer_test.cpp +++ b/cpp/tests/io/parquet_writer_test.cpp @@ -903,6 +903,12 @@ TEST_F(ParquetWriterTest, CheckColumnIndexTruncation) ASSERT_TRUE(stats.min_value.has_value()); ASSERT_TRUE(stats.max_value.has_value()); + // check that min and max for the column chunk are exact (i.e. not truncated) + ASSERT_TRUE(stats.is_max_value_exact.has_value()); + EXPECT_TRUE(stats.is_max_value_exact.value()); + ASSERT_TRUE(stats.is_min_value_exact.has_value()); + EXPECT_TRUE(stats.is_min_value_exact.value()); + // check trunc(page.min) <= stats.min && trun(page.max) >= stats.max auto const ptype = fmd.schema[c + 1].type; auto const ctype = fmd.schema[c + 1].converted_type;