From 73b53196dce3d7ac9527183bb3f6134c3cb5f30e Mon Sep 17 00:00:00 2001
From: Ryan Kim
Date: Thu, 31 Oct 2024 13:42:30 +0900
Subject: [PATCH 1/3] perf(crypto): optimize `TwoAdicFRI::CreateOpeningProof()`

- Parallelize the copy.
- Merge `ReverseMatrixIndexBits()` into a single parallel loop.
---
 tachyon/crypto/commitments/fri/two_adic_fri.h | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/tachyon/crypto/commitments/fri/two_adic_fri.h b/tachyon/crypto/commitments/fri/two_adic_fri.h
index 0bfbd6ab6..96fb7721d 100644
--- a/tachyon/crypto/commitments/fri/two_adic_fri.h
+++ b/tachyon/crypto/commitments/fri/two_adic_fri.h
@@ -148,9 +148,13 @@ class TwoAdicFRI {
           reduced_openings[log_num_rows];
       CHECK_EQ(reduced_opening_for_log_num_rows.size(), num_rows);
 
-      math::RowMajorMatrix<F> block =
-          mat.topRows(num_rows >> config_.log_blowup);
-      ReverseMatrixIndexBits(block);
+      math::RowMajorMatrix<F> block;
+      block.resize(num_rows >> config_.log_blowup, mat.cols());
+      OMP_PARALLEL_FOR(size_t row = 0; row < num_rows >> config_.log_blowup;
+                       ++row) {
+        block.row(row) = mat.row(base::bits::ReverseBitsLen(
+            row, log_num_rows - config_.log_blowup));
+      }
       std::vector<ExtF> reduced_rows = DotExtPowers(mat, alpha);
 
       // TODO(ashjeong): Determine if using a matrix is a better fit.
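For reference, the rewritten loop fuses the copy and the bit-reversal permutation: each destination row is filled directly from its bit-reversed source index, and every iteration is independent, which is what makes `OMP_PARALLEL_FOR` safe here. Since bit reversal is an involution, `block.row(row) = mat.row(rev(row))` yields the same matrix as the old "copy, then `ReverseMatrixIndexBits()`" sequence. A minimal scalar sketch of the same permutation (self-contained; the flat-vector "matrix" and the helper are illustrative stand-ins, not tachyon's actual types):

#include <cstddef>
#include <cstdint>
#include <vector>

// Stand-in for base::bits::ReverseBitsLen(): reverse the low |bit_len| bits.
uint32_t ReverseBitsLen(uint32_t x, uint32_t bit_len) {
  uint32_t r = 0;
  for (uint32_t i = 0; i < bit_len; ++i) r = (r << 1) | ((x >> i) & 1);
  return r;
}

// Copy the first 2^log_rows rows of row-major |src| (|cols| wide) into |dst|,
// permuting rows by bit reversal in a single pass. Each destination row is
// written exactly once, so the outer loop parallelizes trivially.
void BitReversedCopy(const std::vector<uint64_t>& src, size_t cols,
                     size_t log_rows, std::vector<uint64_t>* dst) {
  size_t rows = size_t{1} << log_rows;
  dst->resize(rows * cols);
  for (size_t row = 0; row < rows; ++row) {
    size_t src_row = ReverseBitsLen(static_cast<uint32_t>(row),
                                    static_cast<uint32_t>(log_rows));
    for (size_t c = 0; c < cols; ++c) {
      (*dst)[row * cols + c] = src[src_row * cols + c];
    }
  }
}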
From aaf5896cf42e117fcb2a2ff44eb832a7b2ea250c Mon Sep 17 00:00:00 2001
From: Ryan Kim
Date: Thu, 31 Oct 2024 14:59:39 +0900
Subject: [PATCH 2/3] chore(math): add missing license comments

---
 tachyon/math/finite_fields/packed_prime_field32_avx2.h   | 5 +++++
 tachyon/math/finite_fields/packed_prime_field32_avx512.h | 5 +++++
 tachyon/math/finite_fields/packed_prime_field32_neon.h   | 5 +++++
 3 files changed, 15 insertions(+)

diff --git a/tachyon/math/finite_fields/packed_prime_field32_avx2.h b/tachyon/math/finite_fields/packed_prime_field32_avx2.h
index e610abce4..0ed7d9bea 100644
--- a/tachyon/math/finite_fields/packed_prime_field32_avx2.h
+++ b/tachyon/math/finite_fields/packed_prime_field32_avx2.h
@@ -1,3 +1,8 @@
+// Copyright (c) 2022 The Plonky3 Authors
+// Use of this source code is governed by an MIT/Apache-2.0 style license that
+// can be found in the LICENSE-MIT.plonky3 and the LICENCE-APACHE.plonky3
+// files.
+
 #ifndef TACHYON_MATH_FINITE_FIELDS_PACKED_PRIME_FIELD32_AVX2_H_
 #define TACHYON_MATH_FINITE_FIELDS_PACKED_PRIME_FIELD32_AVX2_H_
 
diff --git a/tachyon/math/finite_fields/packed_prime_field32_avx512.h b/tachyon/math/finite_fields/packed_prime_field32_avx512.h
index 05bcd05f5..b157d9990 100644
--- a/tachyon/math/finite_fields/packed_prime_field32_avx512.h
+++ b/tachyon/math/finite_fields/packed_prime_field32_avx512.h
@@ -1,3 +1,8 @@
+// Copyright (c) 2022 The Plonky3 Authors
+// Use of this source code is governed by an MIT/Apache-2.0 style license that
+// can be found in the LICENSE-MIT.plonky3 and the LICENCE-APACHE.plonky3
+// files.
+
 #ifndef TACHYON_MATH_FINITE_FIELDS_PACKED_PRIME_FIELD32_AVX512_H_
 #define TACHYON_MATH_FINITE_FIELDS_PACKED_PRIME_FIELD32_AVX512_H_
 
diff --git a/tachyon/math/finite_fields/packed_prime_field32_neon.h b/tachyon/math/finite_fields/packed_prime_field32_neon.h
index 57c8416c2..daa828dc5 100644
--- a/tachyon/math/finite_fields/packed_prime_field32_neon.h
+++ b/tachyon/math/finite_fields/packed_prime_field32_neon.h
@@ -1,3 +1,8 @@
+// Copyright (c) 2022 The Plonky3 Authors
+// Use of this source code is governed by an MIT/Apache-2.0 style license that
+// can be found in the LICENSE-MIT.plonky3 and the LICENCE-APACHE.plonky3
+// files.
+
 #ifndef TACHYON_MATH_FINITE_FIELDS_PACKED_PRIME_FIELD32_NEON_H_
 #define TACHYON_MATH_FINITE_FIELDS_PACKED_PRIME_FIELD32_NEON_H_
 

From 2e0f7f0b4e3024ebd9fb0d73b3afdb4f5fa16d25 Mon Sep 17 00:00:00 2001
From: Ryan Kim
Date: Thu, 31 Oct 2024 15:53:04 +0900
Subject: [PATCH 3/3] perf(math): add dedicated pow 3, 5, 7 operations for
 Packed(Baby|Koala)Bear

See https://github.com/Plonky3/Plonky3/commit/b314769.
---
 tachyon/math/base/semigroups.h                |  45 ++++--
 tachyon/math/finite_fields/BUILD.bazel        |   5 +-
 .../internal/packed_baby_bear_avx2.cc         |  12 ++
 .../internal/packed_baby_bear_avx2.h          |   4 +
 .../internal/packed_koala_bear_avx2.cc        |  12 ++
 .../internal/packed_koala_bear_avx2.h         |   4 +
 .../finite_fields/packed_prime_field32_avx2.h | 138 +++++++++++++++++-
 7 files changed, 202 insertions(+), 18 deletions(-)

diff --git a/tachyon/math/base/semigroups.h b/tachyon/math/base/semigroups.h
index e832865bb..ed655a313 100644
--- a/tachyon/math/base/semigroups.h
+++ b/tachyon/math/base/semigroups.h
@@ -53,6 +53,14 @@
                                  std::declval<T>().Name##InPlace()))>      \
       : std::true_type {}
 
+#define SUPPORTS_DEDICATED_EXP_OPERATOR(Pow)                                \
+  template <typename T, typename = void>                                   \
+  struct SupportsExp##Pow : std::false_type {};                            \
+                                                                           \
+  template <typename T>                                                    \
+  struct SupportsExp##Pow<T, decltype(void(std::declval<T>().Exp##Pow()))> \
+      : std::true_type {};
+
 namespace tachyon::math {
 
 namespace internal {
@@ -60,6 +68,9 @@ SUPPORTS_BINARY_OPERATOR(Mul);
 SUPPORTS_UNARY_OPERATOR(SquareImpl);
 SUPPORTS_BINARY_OPERATOR(Add);
 SUPPORTS_UNARY_OPERATOR(DoubleImpl);
+SUPPORTS_DEDICATED_EXP_OPERATOR(3);
+SUPPORTS_DEDICATED_EXP_OPERATOR(5);
+SUPPORTS_DEDICATED_EXP_OPERATOR(7);
 
 template <typename T, typename = void>
 struct SupportsSize : std::false_type {};
@@ -157,24 +168,36 @@ class MultiplicativeSemigroup {
       return g;
     else if constexpr (Power == 2)
       return Square();
-    else if constexpr (Power == 3)
-      return Square() * g;
-    else if constexpr (Power == 4)
+    else if constexpr (Power == 3) {
+      if constexpr (internal::SupportsExp3<G>::value) {
+        return g.Exp3();
+      } else {
+        return Square() * g;
+      }
+    } else if constexpr (Power == 4) {
       return Square().Square();
-    else if constexpr (Power == 5) {
-      MulResult g4 = Square();
-      g4.SquareInPlace();
-      return g4 * g;
+    } else if constexpr (Power == 5) {
+      if constexpr (internal::SupportsExp5<G>::value) {
+        return g.Exp5();
+      } else {
+        MulResult g4 = Square();
+        g4.SquareInPlace();
+        return g4 * g;
+      }
     } else if constexpr (Power == 6) {
      MulResult g2 = Square();
      MulResult g4 = g2;
      g4.SquareInPlace();
      return g4 * g2;
    } else if constexpr (Power == 7) {
-      MulResult g2 = Square();
-      MulResult g4 = g2;
-      g4.SquareInPlace();
-      return g4 * g2 * g;
+      if constexpr (internal::SupportsExp7<G>::value) {
+        return g.Exp7();
+      } else {
+        MulResult g2 = Square();
+        MulResult g4 = g2;
+        g4.SquareInPlace();
+        return g4 * g2 * g;
+      }
     } else {
       return DoPow(BigInt<1>(Power));
     }
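The `SupportsExpN` traits above are the standard detection idiom: the partial specialization is viable only when `T().ExpN()` is well-formed, and `Pow<N>()` then branches on the trait with `if constexpr`, so types without a dedicated `ExpN()` still compile down to the generic square-and-multiply fallback at zero runtime cost. A minimal self-contained sketch of the same mechanism (the `WithExp3`/`WithoutExp3` types are illustrative, not from the patch):

#include <type_traits>
#include <utility>

// Primary template: assume no Exp3() unless proven otherwise.
template <typename T, typename = void>
struct SupportsExp3 : std::false_type {};

// Chosen only if T().Exp3() is a well-formed expression (SFINAE on the
// second template argument, which evaluates to void on success).
template <typename T>
struct SupportsExp3<T, decltype(void(std::declval<T>().Exp3()))>
    : std::true_type {};

struct WithExp3 {
  WithExp3 Exp3() const { return *this; }
};
struct WithoutExp3 {};

static_assert(SupportsExp3<WithExp3>::value, "Exp3() detected");
static_assert(!SupportsExp3<WithoutExp3>::value, "no Exp3() to detect");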
diff --git a/tachyon/math/finite_fields/BUILD.bazel b/tachyon/math/finite_fields/BUILD.bazel
index 3fda2b6d4..0c976b63f 100644
--- a/tachyon/math/finite_fields/BUILD.bazel
+++ b/tachyon/math/finite_fields/BUILD.bazel
@@ -164,7 +164,10 @@ tachyon_cc_library(
 tachyon_cc_library(
     name = "packed_prime_field32_avx2",
     hdrs = ["packed_prime_field32_avx2.h"],
-    deps = ["//tachyon/base:compiler_specific"],
+    deps = [
+        "//tachyon/base:compiler_specific",
+        "//tachyon/base/functional:callback",
+    ],
 )
 
 tachyon_cc_library(
diff --git a/tachyon/math/finite_fields/baby_bear/internal/packed_baby_bear_avx2.cc b/tachyon/math/finite_fields/baby_bear/internal/packed_baby_bear_avx2.cc
index 2ca99139f..4543f9882 100644
--- a/tachyon/math/finite_fields/baby_bear/internal/packed_baby_bear_avx2.cc
+++ b/tachyon/math/finite_fields/baby_bear/internal/packed_baby_bear_avx2.cc
@@ -103,4 +103,16 @@ PackedBabyBearAVX2 PackedBabyBearAVX2::Mul(
   return FromVector(math::Mul(ToVector(*this), ToVector(other)));
 }
 
+PackedBabyBearAVX2 PackedBabyBearAVX2::Exp3() const {
+  return FromVector(math::Exp3(ToVector(*this), kP, kInv));
+}
+
+PackedBabyBearAVX2 PackedBabyBearAVX2::Exp5() const {
+  return FromVector(math::Exp5(ToVector(*this), kP, kInv));
+}
+
+PackedBabyBearAVX2 PackedBabyBearAVX2::Exp7() const {
+  return FromVector(math::Exp7(ToVector(*this), kP, kInv));
+}
+
 }  // namespace tachyon::math
diff --git a/tachyon/math/finite_fields/baby_bear/internal/packed_baby_bear_avx2.h b/tachyon/math/finite_fields/baby_bear/internal/packed_baby_bear_avx2.h
index 0da1a80ee..45b8edee6 100644
--- a/tachyon/math/finite_fields/baby_bear/internal/packed_baby_bear_avx2.h
+++ b/tachyon/math/finite_fields/baby_bear/internal/packed_baby_bear_avx2.h
@@ -62,6 +62,10 @@ class TACHYON_EXPORT PackedBabyBearAVX2 final
 
   // MultiplicativeSemigroup methods
   PackedBabyBearAVX2 Mul(const PackedBabyBearAVX2& other) const;
+
+  PackedBabyBearAVX2 Exp3() const;
+  PackedBabyBearAVX2 Exp5() const;
+  PackedBabyBearAVX2 Exp7() const;
 };
 
 }  // namespace tachyon::math
diff --git a/tachyon/math/finite_fields/koala_bear/internal/packed_koala_bear_avx2.cc b/tachyon/math/finite_fields/koala_bear/internal/packed_koala_bear_avx2.cc
index 1ebc91798..35a98a318 100644
--- a/tachyon/math/finite_fields/koala_bear/internal/packed_koala_bear_avx2.cc
+++ b/tachyon/math/finite_fields/koala_bear/internal/packed_koala_bear_avx2.cc
@@ -107,4 +107,16 @@ PackedKoalaBearAVX2 PackedKoalaBearAVX2::Mul(
   return FromVector(math::Mul(ToVector(*this), ToVector(other)));
 }
 
+PackedKoalaBearAVX2 PackedKoalaBearAVX2::Exp3() const {
+  return FromVector(math::Exp3(ToVector(*this), kP, kInv));
+}
+
+PackedKoalaBearAVX2 PackedKoalaBearAVX2::Exp5() const {
+  return FromVector(math::Exp5(ToVector(*this), kP, kInv));
+}
+
+PackedKoalaBearAVX2 PackedKoalaBearAVX2::Exp7() const {
+  return FromVector(math::Exp7(ToVector(*this), kP, kInv));
+}
+
 }  // namespace tachyon::math
diff --git a/tachyon/math/finite_fields/koala_bear/internal/packed_koala_bear_avx2.h b/tachyon/math/finite_fields/koala_bear/internal/packed_koala_bear_avx2.h
index b7e009973..c1a5881a0 100644
--- a/tachyon/math/finite_fields/koala_bear/internal/packed_koala_bear_avx2.h
+++ b/tachyon/math/finite_fields/koala_bear/internal/packed_koala_bear_avx2.h
@@ -62,6 +62,10 @@ class TACHYON_EXPORT PackedKoalaBearAVX2 final
 
   // MultiplicativeSemigroup methods
   PackedKoalaBearAVX2 Mul(const PackedKoalaBearAVX2& other) const;
+
+  PackedKoalaBearAVX2 Exp3() const;
+  PackedKoalaBearAVX2 Exp5() const;
+  PackedKoalaBearAVX2 Exp7() const;
 };
 
 }  // namespace tachyon::math
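The `kP`/`kInv` constants passed to `math::Exp3/5/7` are not defined in this patch. Judging from the reduction code in the next file (q = C * inv mod 2³², then C - q * P), `kInv` has to be the Montgomery constant P⁻¹ mod 2³² broadcast across lanes. Assuming that convention holds, the scalar value can be derived by Newton iteration; `MontgomeryInv` below is a hypothetical helper for illustration, not tachyon API:

#include <cstdint>

// Compute P^-1 mod 2^32 for odd P by Newton iteration: each step doubles
// the number of correct low bits (x0 = P is correct to 3 bits, since
// P * P ≡ 1 (mod 8) for any odd P).
constexpr uint32_t MontgomeryInv(uint32_t p) {
  uint32_t inv = p;     // correct mod 2^3
  inv *= 2u - p * inv;  // correct mod 2^6
  inv *= 2u - p * inv;  // correct mod 2^12
  inv *= 2u - p * inv;  // correct mod 2^24
  inv *= 2u - p * inv;  // correct mod 2^48, hence mod 2^32
  return inv;
}

constexpr uint32_t kBabyBearP = 0x78000001;  // 2^31 - 2^27 + 1
static_assert(kBabyBearP * MontgomeryInv(kBabyBearP) == 1u,
              "P * P^-1 == 1 (mod 2^32)");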
diff --git a/tachyon/math/finite_fields/packed_prime_field32_avx2.h b/tachyon/math/finite_fields/packed_prime_field32_avx2.h
index 0ed7d9bea..bedc5b6fe 100644
--- a/tachyon/math/finite_fields/packed_prime_field32_avx2.h
+++ b/tachyon/math/finite_fields/packed_prime_field32_avx2.h
@@ -9,6 +9,7 @@
 #include <immintrin.h>
 
 #include "tachyon/base/compiler_specific.h"
+#include "tachyon/base/functional/callback.h"
 
 namespace tachyon::math {
 
@@ -171,11 +172,59 @@ ALWAYS_INLINE __m256i NegateMod32(__m256i val, __m256i p) {
 //
 // [1] Modern Computer Arithmetic, Richard Brent and Paul Zimmermann,
 //     Cambridge University Press, 2010, algorithm 2.7.
-ALWAYS_INLINE __m256i MontyD(__m256i lhs, __m256i rhs, __m256i p, __m256i inv) {
-  __m256i prod = _mm256_mul_epu32(lhs, rhs);
-  __m256i q = _mm256_mul_epu32(prod, inv);
+
+// We provide two variants of Montgomery reduction depending on whether the
+// inputs are unsigned or signed. The unsigned variant follows steps 1 and 2
+// in the above protocol to produce D in (-P, ..., P). For the signed variant,
+// we assume -PB/2 < C < PB/2 and let Q := μ C mod B be the unique
+// representative in [-B/2, ..., B/2 - 1]. The division in step 2 is clearly
+// still exact and |C - Q P| <= |C| + |Q||P| < PB, so D still lies in
+// (-P, ..., P).
+
+// Perform a partial Montgomery reduction on each 64-bit element.
+// Input must lie in {0, ..., 2³²P}.
+// The output will lie in {-P, ..., P} and be stored in the upper 32 bits.
+ALWAYS_INLINE __m256i PartialMontyRedUnsignedToSigned(__m256i input, __m256i p,
+                                                      __m256i inv) {
+  __m256i q = _mm256_mul_epu32(input, inv);
   __m256i q_p = _mm256_mul_epu32(q, p);
-  return _mm256_sub_epi64(prod, q_p);
+  // By construction, the bottom 32 bits of input and q_p are equal.
+  // Thus |_mm256_sub_epi32| and |_mm256_sub_epi64| should act identically.
+  // However, for some reason, the compiler gets confused if we use
+  // |_mm256_sub_epi64| and outputs a load of nonsense; see:
+  // https://godbolt.org/z/3W8M7Tv84.
+  return _mm256_sub_epi32(input, q_p);
+}
+
+// Perform a partial Montgomery reduction on each 64-bit element.
+// Input must lie in {-2³¹P, ..., 2³¹P}.
+// The output will lie in {-P, ..., P} and be stored in the upper 32 bits.
+ALWAYS_INLINE __m256i PartialMontyRedSignedToSigned(__m256i input, __m256i p,
+                                                    __m256i inv) {
+  __m256i q = _mm256_mul_epi32(input, inv);
+  __m256i q_p = _mm256_mul_epi32(q, p);
+  // Unlike the previous case, the compiler output is essentially identical
+  // between |_mm256_sub_epi32| and |_mm256_sub_epi64|. We use
+  // |_mm256_sub_epi32| again just for consistency.
+  return _mm256_sub_epi32(input, q_p);
+}
+
+// Multiply the field elements in the even index entries.
+// |lhs[2i]| and |rhs[2i]| must be unsigned 32-bit integers such that
+// |lhs[2i]| * |rhs[2i]| lies in {0, ..., 2³²P}.
+// The output will lie in {-P, ..., P} and be stored in |output[2i + 1]|.
+ALWAYS_INLINE __m256i MontyMul(__m256i lhs, __m256i rhs, __m256i p,
+                               __m256i inv) {
+  __m256i prod = _mm256_mul_epu32(lhs, rhs);
+  return PartialMontyRedUnsignedToSigned(prod, p, inv);
+}
+
+// Multiply the field elements in the even index entries.
+// |lhs[2i]| and |rhs[2i]| must be signed 32-bit integers such that
+// |lhs[2i]| * |rhs[2i]| lies in {-2³¹P, ..., 2³¹P}.
+// The output will lie in {-P, ..., P} and be stored in |output[2i + 1]|.
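Note that `MontyMul` must call the unsigned reduction: its product comes from `_mm256_mul_epu32` and lies in {0, ..., 2³²P}, which exceeds the signed variant's input range. A scalar reference model of the signed reduction may help check the bounds argument (illustrative only, not tachyon API; the narrowing cast to int32_t mirrors how `_mm256_mul_epi32` reinterprets its low 32-bit inputs and is formally implementation-defined before C++20):

#include <cstdint>

// Scalar model of the signed partial Montgomery reduction.
// Requires -2^31 * p < c < 2^31 * p, p < 2^31 odd, and p_inv == p^-1 mod 2^32.
// Returns d with d * 2^32 ≡ c (mod p) and -p < d < p.
int64_t PartialMontyRedSignedToSignedScalar(int64_t c, uint32_t p,
                                            uint32_t p_inv) {
  // q: the representative of c * p^-1 mod 2^32 in [-2^31, 2^31).
  int32_t q = static_cast<int32_t>(static_cast<uint32_t>(c) * p_inv);
  // c - q * p ≡ 0 (mod 2^32) by construction, so the division is exact.
  int64_t d = (c - static_cast<int64_t>(q) * p) / (int64_t{1} << 32);
  // |c - q p| <= |c| + 2^31 * p < 2^32 * p, hence |d| < p.
  return d;
}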
+ALWAYS_INLINE __m256i MontyMulSigned(__m256i lhs, __m256i rhs, __m256i p,
+                                     __m256i inv) {
+  __m256i prod = _mm256_mul_epi32(lhs, rhs);
+  return PartialMontyRedSignedToSigned(prod, p, inv);
 }
 
 ALWAYS_INLINE __m256i movehdup_epi32(__m256i x) {
@@ -210,8 +259,8 @@ ALWAYS_INLINE __m256i MontMulMod32(__m256i lhs, __m256i rhs, __m256i p,
   __m256i lhs_odd = movehdup_epi32(lhs);
   __m256i rhs_odd = movehdup_epi32(rhs);
 
-  __m256i d_evn = MontyD(lhs_evn, rhs_evn, p, inv);
-  __m256i d_odd = MontyD(lhs_odd, rhs_odd, p, inv);
+  __m256i d_evn = MontyMul(lhs_evn, rhs_evn, p, inv);
+  __m256i d_odd = MontyMul(lhs_odd, rhs_odd, p, inv);
 
   __m256i d_evn_hi = movehdup_epi32(d_evn);
   __m256i t = _mm256_blend_epi32(d_evn_hi, d_odd, 0b10101010);
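The two instructions ending MontMulMod32, `_mm256_add_epi32(t, p)` followed by `_mm256_min_epu32(t, u)`, are the trick that maps the signed partial result back to a canonical residue without a compare or branch. A scalar sketch of why it works for P < 2³¹ (illustrative, not tachyon API):

#include <algorithm>
#include <cstdint>

// Map t in (-P, P), stored as a 32-bit two's-complement word, to the
// canonical residue t mod P in [0, P), using only an add and an unsigned
// min, exactly what _mm256_add_epi32 + _mm256_min_epu32 do per lane.
uint32_t Canonicalize(int32_t t, uint32_t p) {
  uint32_t t_bits = static_cast<uint32_t>(t);  // t, or t + 2^32 if t < 0
  uint32_t u = t_bits + p;                     // t + P (mod 2^32)
  // t >= 0: t_bits = t < P <= t + P = u, so min picks t.
  // t <  0: t_bits = t + 2^32 wraps high, while u = t + P lies in (0, P);
  //         since P < 2^31, t_bits > u and min picks t + P.
  return std::min(t_bits, u);
}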
@@ -220,6 +269,83 @@ ALWAYS_INLINE __m256i MontMulMod32(__m256i lhs, __m256i rhs, __m256i p,
   return _mm256_min_epu32(t, u);
 }
 
+// Square the field elements in the even index entries.
+// Inputs must be signed 32-bit integers.
+// Outputs will be signed integers in (-P, ..., P) copied into both the even
+// and odd indices.
+ALWAYS_INLINE __m256i ShiftedSquare(__m256i input, __m256i p, __m256i inv) {
+  // Note that we do not need a restriction on the size of |input[i]²|, as
+  // 2³⁰ < P and |i32| <= 2³¹, so |input[i]²| <= 2⁶² < 2³²P.
+  __m256i square = _mm256_mul_epi32(input, input);
+  __m256i square_red = PartialMontyRedSignedToSigned(square, p, inv);
+  return movehdup_epi32(square_red);
+}
+
+// Apply |callback| to the even and odd indices of the input vector.
+// |callback| should only depend on the 32-bit entries in the even indices.
+// The output of |callback| must lie in (-P, ..., P) and be stored in the odd
+// indices. The even indices of the output of |callback| will not be read.
+// The input should conform to the requirements of |callback|.
+// NOTE(chokobole): This is to suppress the error below.
+// clang-format off
+// error: ignoring attributes on template argument '__m256i(__m256i, __m256i, __m256i)' [-Werror=ignored-attributes]
+// clang-format on
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wignored-attributes"
+ALWAYS_INLINE __m256i ApplyFuncToEvenOdd(
+    __m256i input, __m256i p, __m256i inv,
+    base::RepeatingCallback<__m256i(__m256i, __m256i, __m256i)> callback) {
+  __m256i input_evn = input;
+  __m256i input_odd = movehdup_epi32(input);
+  __m256i d_evn = callback.Run(input_evn, p, inv);
+  __m256i d_odd = callback.Run(input_odd, p, inv);
+  __m256i d_evn_hi = movehdup_epi32(d_evn);
+  __m256i t = _mm256_blend_epi32(d_evn_hi, d_odd, 0b10101010);
+  __m256i u = _mm256_add_epi32(t, p);
+  return _mm256_min_epu32(t, u);
+}
+#pragma GCC diagnostic pop
+
+// Cube the field elements in the even index entries.
+// Inputs must be signed 32-bit integers in [-P, ..., P].
+// Outputs will be signed integers in (-P, ..., P) stored in the odd indices.
+ALWAYS_INLINE __m256i DoExp3(__m256i input, __m256i p, __m256i inv) {
+  __m256i square = ShiftedSquare(input, p, inv);
+  return MontyMulSigned(square, input, p, inv);
+}
+
+ALWAYS_INLINE __m256i Exp3(__m256i input, __m256i p, __m256i inv) {
+  return ApplyFuncToEvenOdd(input, p, inv, &DoExp3);
+}
+
+// Take the fifth power of the field elements in the even index entries.
+// Inputs must be signed 32-bit integers in [-P, ..., P].
+// Outputs will be signed integers in (-P, ..., P) stored in the odd indices.
+ALWAYS_INLINE __m256i DoExp5(__m256i input, __m256i p, __m256i inv) {
+  __m256i square = ShiftedSquare(input, p, inv);
+  __m256i quad = ShiftedSquare(square, p, inv);
+  return MontyMulSigned(quad, input, p, inv);
+}
+
+ALWAYS_INLINE __m256i Exp5(__m256i input, __m256i p, __m256i inv) {
+  return ApplyFuncToEvenOdd(input, p, inv, &DoExp5);
+}
+
+// Take the seventh power of the field elements in the even index entries.
+// Inputs must be signed 32-bit integers in [-P, ..., P].
+// Outputs will be signed integers in (-P, ..., P) stored in the odd indices.
+ALWAYS_INLINE __m256i DoExp7(__m256i input, __m256i p, __m256i inv) {
+  __m256i square = ShiftedSquare(input, p, inv);
+  __m256i cube = MontyMulSigned(square, input, p, inv);
+  __m256i cube_shifted = movehdup_epi32(cube);
+  __m256i quad = ShiftedSquare(square, p, inv);
+  return MontyMulSigned(quad, cube_shifted, p, inv);
+}
+
+ALWAYS_INLINE __m256i Exp7(__m256i input, __m256i p, __m256i inv) {
+  return ApplyFuncToEvenOdd(input, p, inv, &DoExp7);
+}
+
 }  // namespace tachyon::math
 
 #endif  // TACHYON_MATH_FINITE_FIELDS_PACKED_PRIME_FIELD32_AVX2_H_
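Two details above are easy to miss. First, `cube_shifted`: MontyMulSigned reads its operands from the even indices, but `cube` lands in the odd indices, so movehdup_epi32 realigns it before the final multiply. Second, the dedicated DoExp3/5/7 chains win less by saving multiplies than by staying in the relaxed signed range (-P, ..., P) between steps; only the single ApplyFuncToEvenOdd wrapper pays for the blend/add/min canonicalization that the generic `Pow<N>()` path would perform after every MontMulMod32. As a sanity check on the Exp7 addition chain (x⁴ · x³ = x⁷), a scalar mirror over plain modular arithmetic:

#include <cstdint>

constexpr uint64_t kP = 2013265921;  // BabyBear prime, as a plain modulus

constexpr uint64_t MulMod(uint64_t a, uint64_t b) {
  return a * b % kP;  // safe: a, b < 2^31, so a * b < 2^62 cannot overflow
}

// Mirror of DoExp7's chain: x^2, x^3 = x^2 * x, x^4 = (x^2)^2, x^7 = x^4 * x^3.
constexpr uint64_t Exp7(uint64_t x) {
  uint64_t square = MulMod(x, x);
  uint64_t cube = MulMod(square, x);
  uint64_t quad = MulMod(square, square);
  return MulMod(quad, cube);
}

// Generic square-and-multiply, the path Pow<7>() would otherwise take.
constexpr uint64_t Pow(uint64_t x, uint32_t e) {
  uint64_t acc = 1;
  for (; e > 0; e >>= 1, x = MulMod(x, x)) {
    if (e & 1) acc = MulMod(acc, x);
  }
  return acc;
}

static_assert(Exp7(12345) == Pow(12345, 7), "x^4 * x^3 == x^7");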