From 67b147523301c8815e9292d31f8dd3e7d2116f99 Mon Sep 17 00:00:00 2001 From: seidl Date: Thu, 28 Mar 2024 13:11:37 -0700 Subject: [PATCH 1/3] statistics are always exact --- cpp/src/io/parquet/compact_protocol_reader.cpp | 5 ++++- cpp/src/io/parquet/compact_protocol_writer.cpp | 2 ++ cpp/src/io/parquet/page_enc.cu | 3 +++ cpp/src/io/parquet/parquet.hpp | 4 ++++ cpp/tests/io/parquet_writer_test.cpp | 6 ++++++ 5 files changed, 19 insertions(+), 1 deletion(-) diff --git a/cpp/src/io/parquet/compact_protocol_reader.cpp b/cpp/src/io/parquet/compact_protocol_reader.cpp index d39d832c18c..f39bd2b102c 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.cpp +++ b/cpp/src/io/parquet/compact_protocol_reader.cpp @@ -746,13 +746,16 @@ void CompactProtocolReader::read(Statistics* s) { using optional_binary = parquet_field_optional, parquet_field_binary>; using optional_int64 = parquet_field_optional; + using optional_bool = parquet_field_optional; auto op = std::make_tuple(optional_binary(1, s->max), optional_binary(2, s->min), optional_int64(3, s->null_count), optional_int64(4, s->distinct_count), optional_binary(5, s->max_value), - optional_binary(6, s->min_value)); + optional_binary(6, s->min_value), + optional_bool(7, s->is_max_value_exact), + optional_bool(8, s->is_min_value_exact)); function_builder(this, op); } diff --git a/cpp/src/io/parquet/compact_protocol_writer.cpp b/cpp/src/io/parquet/compact_protocol_writer.cpp index d610ec6c546..91d3fe24874 100644 --- a/cpp/src/io/parquet/compact_protocol_writer.cpp +++ b/cpp/src/io/parquet/compact_protocol_writer.cpp @@ -195,6 +195,8 @@ size_t CompactProtocolWriter::write(Statistics const& s) if (s.distinct_count.has_value()) { c.field_int(4, s.distinct_count.value()); } if (s.max_value.has_value()) { c.field_binary(5, s.max_value.value()); } if (s.min_value.has_value()) { c.field_binary(6, s.min_value.value()); } + if (s.is_max_value_exact.has_value()) { c.field_bool(7, s.is_max_value_exact.value()); } + if (s.is_min_value_exact.has_value()) { c.field_bool(8, s.is_min_value_exact.value()); } return c.value(); } diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index d881ab6f9b7..d72bf8f5c77 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -2926,6 +2926,9 @@ __device__ uint8_t* EncodeStatistics(uint8_t* start, auto const [min_ptr, min_size] = get_extremum(&s->min_value, dtype, scratch, true, NO_TRUNC_STATS); encoder.field_binary(6, min_ptr, min_size); + // cudf min/max statistics are always exect (i.e. not truncated) + encoder.field_bool(7, true); + encoder.field_bool(8, true); } encoder.end(&end); return end; diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp index 08f9fae145b..c9305eb2c12 100644 --- a/cpp/src/io/parquet/parquet.hpp +++ b/cpp/src/io/parquet/parquet.hpp @@ -259,6 +259,10 @@ struct Statistics { thrust::optional> max_value; // min value for column determined by ColumnOrder thrust::optional> min_value; + // If true, max_value is the actual maximum value for a column + thrust::optional is_max_value_exact; + // If true, min_value is the actual minimum value for a column + thrust::optional is_min_value_exact; }; /** diff --git a/cpp/tests/io/parquet_writer_test.cpp b/cpp/tests/io/parquet_writer_test.cpp index ffa672fb564..8dda66c0185 100644 --- a/cpp/tests/io/parquet_writer_test.cpp +++ b/cpp/tests/io/parquet_writer_test.cpp @@ -892,6 +892,12 @@ TEST_F(ParquetWriterTest, CheckColumnIndexTruncation) ASSERT_TRUE(stats.min_value.has_value()); ASSERT_TRUE(stats.max_value.has_value()); + // check that min and max for the column chunk are exact (i.e. not truncated) + ASSERT_TRUE(stats.is_max_value_exact.has_value()); + EXPECT_TRUE(stats.is_max_value_exact.value()); + ASSERT_TRUE(stats.is_min_value_exact.has_value()); + EXPECT_TRUE(stats.is_min_value_exact.value()); + // check trunc(page.min) <= stats.min && trun(page.max) >= stats.max auto const ptype = fmd.schema[c + 1].type; auto const ctype = fmd.schema[c + 1].converted_type; From 3957e09b467d3bb920b54d51fa19f8e3f7eba74b Mon Sep 17 00:00:00 2001 From: seidl Date: Thu, 28 Mar 2024 23:09:26 +0000 Subject: [PATCH 2/3] update copyright date --- cpp/src/io/parquet/parquet.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp index c9305eb2c12..a0d8c48272d 100644 --- a/cpp/src/io/parquet/parquet.hpp +++ b/cpp/src/io/parquet/parquet.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2023, NVIDIA CORPORATION. + * Copyright (c) 2018-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. From 2f64fb9c023fca384a8c2155fb12c2107ecb0c09 Mon Sep 17 00:00:00 2001 From: seidl Date: Tue, 2 Apr 2024 14:32:20 +0000 Subject: [PATCH 3/3] typo --- cpp/src/io/parquet/page_enc.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index d72bf8f5c77..83cae83f513 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -2926,7 +2926,7 @@ __device__ uint8_t* EncodeStatistics(uint8_t* start, auto const [min_ptr, min_size] = get_extremum(&s->min_value, dtype, scratch, true, NO_TRUNC_STATS); encoder.field_binary(6, min_ptr, min_size); - // cudf min/max statistics are always exect (i.e. not truncated) + // cudf min/max statistics are always exact (i.e. not truncated) encoder.field_bool(7, true); encoder.field_bool(8, true); }