From 34a5a501c232993c9cd10ca75f51b932953fa8f0 Mon Sep 17 00:00:00 2001 From: Shahar Papini Date: Sat, 16 Mar 2024 10:03:08 +0200 Subject: [PATCH] WideFib test with AVX Backend --- src/commitment_scheme/blake2_hash.rs | 1 + src/core/air/accumulation.rs | 7 +- src/core/backend/avx512/accumulation.rs | 2 +- src/core/backend/avx512/blake2s.rs | 37 ++-- src/core/backend/avx512/mod.rs | 4 + src/examples/wide_fibonacci/avx.rs | 167 ++++++++++++++++++ .../wide_fibonacci/constraint_eval.rs | 18 +- src/examples/wide_fibonacci/mod.rs | 7 +- 8 files changed, 209 insertions(+), 34 deletions(-) create mode 100644 src/examples/wide_fibonacci/avx.rs diff --git a/src/commitment_scheme/blake2_hash.rs b/src/commitment_scheme/blake2_hash.rs index b530a083a..5bc8f26a5 100644 --- a/src/commitment_scheme/blake2_hash.rs +++ b/src/commitment_scheme/blake2_hash.rs @@ -3,6 +3,7 @@ use std::fmt; use blake2::{Blake2s256, Digest}; // Wrapper for the blake2s hash type. +#[repr(align(32))] #[derive(Clone, Copy, PartialEq, Default, Eq)] pub struct Blake2sHash([u8; 32]); diff --git a/src/core/air/accumulation.rs b/src/core/air/accumulation.rs index 91dd81aa2..9ec531180 100644 --- a/src/core/air/accumulation.rs +++ b/src/core/air/accumulation.rs @@ -140,6 +140,9 @@ impl DomainEvaluationAccumulator { .zip(self.n_cols_per_size.iter()) .skip(1) { + if *n_cols == 0 { + continue; + } let coeffs = SecureColumn:: { columns: values.columns.map(|c| { CircleEvaluation::::new( @@ -162,8 +165,8 @@ impl DomainEvaluationAccumulator { /// An domain accumulator for polynomials of a single size. pub struct ColumnAccumulator<'a, B: Backend> { - random_coeff_pow: SecureField, - col: &'a mut SecureColumn, + pub random_coeff_pow: SecureField, + pub col: &'a mut SecureColumn, } impl<'a> ColumnAccumulator<'a, CPUBackend> { pub fn accumulate(&mut self, index: usize, evaluation: SecureField) { diff --git a/src/core/backend/avx512/accumulation.rs b/src/core/backend/avx512/accumulation.rs index 080170015..da9178183 100644 --- a/src/core/backend/avx512/accumulation.rs +++ b/src/core/backend/avx512/accumulation.rs @@ -7,7 +7,7 @@ use crate::core::fields::secure_column::SecureColumn; impl AccumulationOps for AVX512Backend { fn accumulate(column: &mut SecureColumn, alpha: SecureField, other: &SecureColumn) { let alpha = PackedQM31::broadcast(alpha); - for i in 0..column.len() { + for i in 0..column.n_packs() { let res_coeff = column.packed_at(i) * alpha + other.packed_at(i); column.set_packed(i, res_coeff); } diff --git a/src/core/backend/avx512/blake2s.rs b/src/core/backend/avx512/blake2s.rs index 23329c52c..0552d1c7f 100644 --- a/src/core/backend/avx512/blake2s.rs +++ b/src/core/backend/avx512/blake2s.rs @@ -1,4 +1,4 @@ -use std::arch::x86_64::__m512i; +use std::arch::x86_64::{__m512i, _mm512_loadu_si512}; use itertools::Itertools; @@ -6,8 +6,8 @@ use super::blake2s_avx::{compress16, set1, transpose_msgs, untranspose_states}; use super::{AVX512Backend, VECS_LOG_SIZE}; use crate::commitment_scheme::blake2_hash::Blake2sHash; use crate::commitment_scheme::blake2_merkle::Blake2sMerkleHasher; -use crate::commitment_scheme::ops::MerkleOps; -use crate::core::backend::{Col, ColumnOps}; +use crate::commitment_scheme::ops::{MerkleHasher, MerkleOps}; +use crate::core::backend::{Col, Column, ColumnOps}; use crate::core::fields::m31::BaseField; impl ColumnOps for AVX512Backend { @@ -25,19 +25,20 @@ impl MerkleOps for AVX512Backend { columns: &[&Col], ) -> Vec { // Pad prev_layer if too small. - let mut padded_buffer = vec![]; - let prev_layer = if log_size < 4 { - prev_layer.map(|prev_layer| { - padded_buffer = prev_layer - .iter() - .copied() - .chain(std::iter::repeat(Blake2sHash::default())) - .collect_vec(); - &padded_buffer - }) - } else { - prev_layer - }; + if log_size < 4 { + return (0..(1 << log_size)) + .map(|i| { + Blake2sMerkleHasher::hash_node( + prev_layer.map(|prev_layer| (prev_layer[2 * i], prev_layer[2 * i + 1])), + &columns.iter().map(|column| column.at(i)).collect_vec(), + ) + }) + .collect(); + } + + if let Some(prev_layer) = prev_layer { + assert_eq!(prev_layer.len(), 1 << (log_size + 1)); + } // Commit to columns. let mut res = Vec::with_capacity(1 << log_size); @@ -46,7 +47,9 @@ impl MerkleOps for AVX512Backend { // Hash prev_layer. if let Some(prev_layer) = prev_layer { let ptr = prev_layer[(i << 5)..(i << 5) + 32].as_ptr() as *const __m512i; - let msgs: [__m512i; 16] = std::array::from_fn(|j| unsafe { *ptr.add(j) }); + let msgs: [__m512i; 16] = std::array::from_fn(|j| unsafe { + _mm512_loadu_si512(ptr.add(j) as *const i32) + }); state = unsafe { compress16( state, diff --git a/src/core/backend/avx512/mod.rs b/src/core/backend/avx512/mod.rs index a927eb35e..ff00bcb1c 100644 --- a/src/core/backend/avx512/mod.rs +++ b/src/core/backend/avx512/mod.rs @@ -136,6 +136,10 @@ impl FromIterator for BaseFieldVec { } impl SecureColumn { + pub fn n_packs(&self) -> usize { + self.columns[0].data.len() + } + pub fn packed_at(&self, vec_index: usize) -> PackedQM31 { unsafe { PackedQM31([ diff --git a/src/examples/wide_fibonacci/avx.rs b/src/examples/wide_fibonacci/avx.rs new file mode 100644 index 000000000..a04903f9e --- /dev/null +++ b/src/examples/wide_fibonacci/avx.rs @@ -0,0 +1,167 @@ +use itertools::Itertools; +use num_traits::One; + +use super::structs::WideFibComponent; +use crate::core::air::accumulation::{DomainEvaluationAccumulator, PointEvaluationAccumulator}; +use crate::core::air::{Air, Component, ComponentTrace, Mask}; +use crate::core::backend::avx512::qm31::PackedQM31; +use crate::core::backend::avx512::{AVX512Backend, PackedBaseField, VECS_LOG_SIZE}; +use crate::core::backend::{Col, Column}; +use crate::core::circle::{CirclePoint, Coset}; +use crate::core::constraints::coset_vanishing; +use crate::core::fields::m31::BaseField; +use crate::core::fields::qm31::SecureField; +use crate::core::fields::FieldExpOps; +use crate::core::poly::circle::{CanonicCoset, CircleEvaluation}; +use crate::core::poly::BitReversedOrder; +use crate::core::utils::bit_reverse_index; +use crate::core::ColumnVec; + +const N_COLS: usize = 1 << 8; + +pub struct WideFibAir { + component: WideFibComponent, +} +impl Air for WideFibAir { + fn components(&self) -> Vec<&dyn Component> { + vec![&self.component] + } +} + +pub fn gen_trace( + log_size: usize, +) -> ColumnVec> { + assert!(log_size >= VECS_LOG_SIZE); + let mut trace = (0..N_COLS) + .map(|_| Col::::zeros(1 << log_size)) + .collect_vec(); + for vec_index in 0..(1 << (log_size - VECS_LOG_SIZE)) { + let mut a = PackedBaseField::one(); + let mut b = PackedBaseField::one(); + trace[0].data[vec_index] = a; + trace[1].data[vec_index] = b; + trace.iter_mut().take(log_size).skip(2).for_each(|col| { + (a, b) = (b, a.square() + b.square()); + col.data[vec_index] = b; + }); + } + let domain = CanonicCoset::new(log_size as u32).circle_domain(); + trace + .into_iter() + .map(|eval| CircleEvaluation::::new(domain, eval)) + .collect_vec() +} + +impl Component for WideFibComponent { + fn max_constraint_log_degree_bound(&self) -> u32 { + self.log_size + 1 + } + + fn trace_log_degree_bounds(&self) -> Vec { + vec![self.log_size; N_COLS] + } + + fn evaluate_constraint_quotients_on_domain( + &self, + trace: &ComponentTrace<'_, AVX512Backend>, + evaluation_accumulator: &mut DomainEvaluationAccumulator, + ) { + assert_eq!(trace.columns.len(), N_COLS); + // TODO(spapini): Steal evaluation from commitment. + let eval_domain = CanonicCoset::new(self.log_size + 1).circle_domain(); + let trace_eval = trace + .columns + .iter() + .map(|poly| poly.evaluate(eval_domain)) + .collect_vec(); + let random_coeff = PackedQM31::broadcast(evaluation_accumulator.random_coeff); + let column_coeffs = (0..N_COLS) + .scan(PackedQM31::one(), |state, _| { + let res = *state; + *state *= random_coeff; + Some(res) + }) + .collect_vec(); + + let constraint_log_degree_bound = self.log_size + 1; + let [accum] = evaluation_accumulator.columns([(constraint_log_degree_bound, N_COLS - 2)]); + + for vec_row in 0..(1 << (eval_domain.log_size() - VECS_LOG_SIZE as u32)) { + // Numerator. + let mut row_res = PackedQM31::zero(); + let mut a = trace_eval[0].data[vec_row]; + let mut b = trace_eval[1].data[vec_row]; + #[allow(clippy::needless_range_loop)] + for i in 0..(N_COLS - 2) { + unsafe { + let c = *trace_eval.get_unchecked(i + 2).data.get_unchecked(vec_row); + row_res = row_res + column_coeffs[i] * (a.square() + b.square() - c); + (a, b) = (b, c); + } + } + + // Denominator. + // TODO(spapini): Optimized this, for the small number of columns case. + let points = std::array::from_fn(|i| { + eval_domain.at(bit_reverse_index( + (vec_row << VECS_LOG_SIZE) + i, + eval_domain.log_size(), + ) + 1) + }); + let mut shifted_xs = PackedBaseField::from_array(points.map(|p| p.x)); + for _ in 1..self.log_size { + shifted_xs = shifted_xs.square() - PackedBaseField::one(); + } + + accum.col.set_packed( + vec_row, + accum.col.packed_at(vec_row) * PackedQM31::broadcast(accum.random_coeff_pow) + + row_res, + ) + } + } + + fn mask(&self) -> Mask { + Mask(vec![vec![0]; N_COLS]) + } + + fn evaluate_constraint_quotients_at_point( + &self, + point: CirclePoint, + mask: &ColumnVec>, + evaluation_accumulator: &mut PointEvaluationAccumulator, + ) { + let constraint_zero_domain = Coset::subgroup(self.log_size); + let constraint_log_degree_bound = self.log_size + 1; + for i in 0..(N_COLS - 2) { + let numerator = mask[i][0].square() + mask[i + 1][0].square() - mask[i + 2][0]; + let denominator = coset_vanishing(constraint_zero_domain, point); + evaluation_accumulator.accumulate(constraint_log_degree_bound, numerator / denominator); + } + } +} + +#[cfg(all(target_arch = "x86_64", target_feature = "avx512f"))] +#[cfg(test)] +mod tests { + use crate::commitment_scheme::blake2_hash::Blake2sHasher; + use crate::commitment_scheme::hasher::Hasher; + use crate::core::channel::{Blake2sChannel, Channel}; + use crate::core::fields::m31::BaseField; + use crate::core::fields::IntoSlice; + use crate::core::prover::prove; + use crate::examples::wide_fibonacci::avx::{gen_trace, WideFibAir}; + use crate::examples::wide_fibonacci::structs::WideFibComponent; + + #[test] + fn test_avx_wide_fib_prove() { + // TODO(spapini): Increase to 20, to get 1GB of trace. + const LOG_SIZE: u32 = 12; + let component = WideFibComponent { log_size: LOG_SIZE }; + let air = WideFibAir { component }; + let trace = gen_trace(LOG_SIZE as usize); + let channel = &mut Blake2sChannel::new(Blake2sHasher::hash(BaseField::into_slice(&[]))); + // TODO(spapini): Fix the constraints. + prove(&air, channel, trace).unwrap_err(); + } +} diff --git a/src/examples/wide_fibonacci/constraint_eval.rs b/src/examples/wide_fibonacci/constraint_eval.rs index 81eda35be..31ad86b2e 100644 --- a/src/examples/wide_fibonacci/constraint_eval.rs +++ b/src/examples/wide_fibonacci/constraint_eval.rs @@ -33,12 +33,12 @@ impl Component for WideFibComponent { trace: &ComponentTrace<'_, CPUBackend>, evaluation_accumulator: &mut DomainEvaluationAccumulator, ) { + let constraint_log_degree = Component::::max_constraint_log_degree_bound(self); let mut trace_evals = vec![]; // TODO(ShaharS), Share this LDE with the commitment LDE. for poly_index in 0..64 { let poly = &trace.columns[poly_index]; - let trace_eval_domain = - CanonicCoset::new(self.max_constraint_log_degree_bound()).circle_domain(); + let trace_eval_domain = CanonicCoset::new(constraint_log_degree).circle_domain(); trace_evals.push(poly.evaluate(trace_eval_domain).bit_reverse()); } let zero_domain = CanonicCoset::new(self.log_size).coset; @@ -47,14 +47,11 @@ impl Component for WideFibComponent { for point in eval_domain.iter() { denoms.push(coset_vanishing(zero_domain, point)); } - let mut denom_inverses = - vec![BaseField::zero(); 1 << (self.max_constraint_log_degree_bound())]; + let mut denom_inverses = vec![BaseField::zero(); 1 << (constraint_log_degree)]; BaseField::batch_inverse(&denoms, &mut denom_inverses); - let mut numerators = - vec![SecureField::zero(); 1 << (self.max_constraint_log_degree_bound())]; + let mut numerators = vec![SecureField::zero(); 1 << (constraint_log_degree)]; let random_coeff = evaluation_accumulator.random_coeff; - let [mut accum] = - evaluation_accumulator.columns([(self.max_constraint_log_degree_bound(), 64)]); + let [mut accum] = evaluation_accumulator.columns([(constraint_log_degree, 64)]); for (i, point_index) in eval_domain.iter_indices().enumerate() { numerators[i] = numerators[i] * random_coeff + (trace_evals[2].get_at(point_index) @@ -420,10 +417,7 @@ impl Component for WideFibComponent { * trace_evals[62].get_at(point_index)))); } for (i, (num, denom)) in numerators.iter().zip(denom_inverses.iter()).enumerate() { - accum.accumulate( - bit_reverse_index(i, self.max_constraint_log_degree_bound()), - *num * *denom, - ); + accum.accumulate(bit_reverse_index(i, constraint_log_degree), *num * *denom); } } diff --git a/src/examples/wide_fibonacci/mod.rs b/src/examples/wide_fibonacci/mod.rs index 3680946f2..e9c64992d 100644 --- a/src/examples/wide_fibonacci/mod.rs +++ b/src/examples/wide_fibonacci/mod.rs @@ -1,3 +1,4 @@ +pub mod avx; pub mod constraint_eval; pub mod structs; pub mod trace_asserts; @@ -14,6 +15,7 @@ mod tests { use crate::core::air::accumulation::DomainEvaluationAccumulator; use crate::core::air::{Component, ComponentTrace}; use crate::core::backend::cpu::CPUCircleEvaluation; + use crate::core::backend::CPUBackend; use crate::core::fields::m31::BaseField; use crate::core::fields::qm31::QM31; use crate::core::poly::circle::CanonicCoset; @@ -73,8 +75,9 @@ mod tests { let res = acc.finalize(); let poly = res.0[0].clone(); - for coeff in - poly.coeffs[(1 << (wide_fib.max_constraint_log_degree_bound() - 1)) + 1..].iter() + for coeff in poly.coeffs + [(1 << (Component::::max_constraint_log_degree_bound(&wide_fib) - 1)) + 1..] + .iter() { assert_eq!(*coeff, BaseField::zero()); }