// Patch summary (text before the first `diff --git` header is not part of any hunk):
//
// Adds two optional boolean members, `is_max_value_exact` and `is_min_value_exact`,
// to the Parquet `Statistics` struct and wires them through as field ids 7 and 8:
//
//   * compact_protocol_reader.cpp - CompactProtocolReader::read(Statistics*) gains an
//     `optional_bool` alias and appends fields 7 and 8 to the tuple handed to
//     function_builder, after the existing fields 1-6.
//   * compact_protocol_writer.cpp - CompactProtocolWriter::write(Statistics const&)
//     emits field_bool(7, ...) / field_bool(8, ...) only when the corresponding
//     optional has a value, mirroring the has_value() guards on fields 4-6.
//   * page_enc.cu - EncodeStatistics writes field_bool(7, true) and field_bool(8, true)
//     right after the min value (field 6); the in-patch comment gives the rationale:
//     cudf min/max statistics are never truncated.
//   * parquet.hpp - declares the two new members at the end of `struct Statistics`.
//   * parquet_writer_test.cpp - CheckColumnIndexTruncation now asserts that both flags
//     are present and true for the column chunk statistics.
//
// NOTE(review): this copy of the patch is collapsed onto two physical lines, and the
// template argument lists appear to have been stripped (e.g. after
// `parquet_field_optional` and `thrust::optional`) - verify against the original
// patch before attempting to apply it.
// NOTE(review): the unchanged context comment in the test hunk reads `trun(page.max)`,
// presumably a typo for `trunc`. It is deliberately left as-is here because context
// lines have to match the file being patched; fix it in the source file separately.
diff --git a/cpp/src/io/parquet/compact_protocol_reader.cpp b/cpp/src/io/parquet/compact_protocol_reader.cpp index a3b58347e20..c9212334a96 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.cpp +++ b/cpp/src/io/parquet/compact_protocol_reader.cpp @@ -763,13 +763,16 @@ void CompactProtocolReader::read(Statistics* s) { using optional_binary = parquet_field_optional, parquet_field_binary>; using optional_int64 = parquet_field_optional; + using optional_bool = parquet_field_optional; auto op = std::make_tuple(optional_binary(1, s->max), optional_binary(2, s->min), optional_int64(3, s->null_count), optional_int64(4, s->distinct_count), optional_binary(5, s->max_value), - optional_binary(6, s->min_value)); + optional_binary(6, s->min_value), + optional_bool(7, s->is_max_value_exact), + optional_bool(8, s->is_min_value_exact)); function_builder(this, op); } diff --git a/cpp/src/io/parquet/compact_protocol_writer.cpp b/cpp/src/io/parquet/compact_protocol_writer.cpp index 2174fe46663..14c99f728de 100644 --- a/cpp/src/io/parquet/compact_protocol_writer.cpp +++ b/cpp/src/io/parquet/compact_protocol_writer.cpp @@ -202,6 +202,8 @@ size_t CompactProtocolWriter::write(Statistics const& s) if (s.distinct_count.has_value()) { c.field_int(4, s.distinct_count.value()); } if (s.max_value.has_value()) { c.field_binary(5, s.max_value.value()); } if (s.min_value.has_value()) { c.field_binary(6, s.min_value.value()); } + if (s.is_max_value_exact.has_value()) { c.field_bool(7, s.is_max_value_exact.value()); } + if (s.is_min_value_exact.has_value()) { c.field_bool(8, s.is_min_value_exact.value()); } return c.value(); } diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index 227f13db60e..11b18579c58 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -2944,6 +2944,9 @@ __device__ uint8_t* EncodeStatistics(uint8_t* start, auto const [min_ptr, min_size] = get_extremum(&s->min_value, dtype, scratch, true, NO_TRUNC_STATS); 
encoder.field_binary(6, min_ptr, min_size); + // cudf min/max statistics are always exact (i.e. not truncated) + encoder.field_bool(7, true); + encoder.field_bool(8, true); } encoder.end(&end); return end; diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp index fe9b6ead6d4..756726945cf 100644 --- a/cpp/src/io/parquet/parquet.hpp +++ b/cpp/src/io/parquet/parquet.hpp @@ -259,6 +259,10 @@ struct Statistics { thrust::optional> max_value; // min value for column determined by ColumnOrder thrust::optional> min_value; + // If true, max_value is the actual maximum value for a column + thrust::optional is_max_value_exact; + // If true, min_value is the actual minimum value for a column + thrust::optional is_min_value_exact; }; /** diff --git a/cpp/tests/io/parquet_writer_test.cpp b/cpp/tests/io/parquet_writer_test.cpp index e88afd73290..3a8763ed9f3 100644 --- a/cpp/tests/io/parquet_writer_test.cpp +++ b/cpp/tests/io/parquet_writer_test.cpp @@ -903,6 +903,12 @@ TEST_F(ParquetWriterTest, CheckColumnIndexTruncation) ASSERT_TRUE(stats.min_value.has_value()); ASSERT_TRUE(stats.max_value.has_value()); + // check that min and max for the column chunk are exact (i.e. not truncated) + ASSERT_TRUE(stats.is_max_value_exact.has_value()); + EXPECT_TRUE(stats.is_max_value_exact.value()); + ASSERT_TRUE(stats.is_min_value_exact.has_value()); + EXPECT_TRUE(stats.is_min_value_exact.value()); + // check trunc(page.min) <= stats.min && trun(page.max) >= stats.max auto const ptype = fmd.schema[c + 1].type; auto const ctype = fmd.schema[c + 1].converted_type;