diff --git a/benches/README.md b/benches/README.md new file mode 100644 index 000000000..8e6d73f85 --- /dev/null +++ b/benches/README.md @@ -0,0 +1,2 @@ +dev benchmark results can be seen at +https://starkware-libs.github.io/stwo/dev/bench/index.html diff --git a/benches/eval_at_point.rs b/benches/eval_at_point.rs index ab7f2dc15..c1af615da 100644 --- a/benches/eval_at_point.rs +++ b/benches/eval_at_point.rs @@ -10,7 +10,7 @@ pub fn cpu_eval_at_secure_point(c: &mut criterion::Criterion) { use stwo::core::fields::qm31::QM31; use stwo::core::poly::circle::{CanonicCoset, CircleEvaluation, PolyOps}; use stwo::core::poly::NaturalOrder; - let log_size = 23; + let log_size = 20; let rng = &mut StdRng::seed_from_u64(0); let domain = CanonicCoset::new(log_size as u32).circle_domain(); @@ -35,7 +35,7 @@ pub fn cpu_eval_at_secure_point(c: &mut criterion::Criterion) { ); let point = CirclePoint { x, y }; - c.bench_function("cpu eval_at_secure_field_point", |b| { + c.bench_function("cpu eval_at_secure_field_point 2^20", |b| { b.iter(|| { black_box(::eval_at_point(&poly, point)); }) @@ -52,7 +52,7 @@ pub fn avx512_eval_at_secure_point(c: &mut criterion::Criterion) { use stwo::core::fields::qm31::QM31; use stwo::core::poly::circle::{CanonicCoset, CircleEvaluation, PolyOps}; use stwo::core::poly::NaturalOrder; - let log_size = 23; + let log_size = 20; let rng = &mut StdRng::seed_from_u64(0); let domain = CanonicCoset::new(log_size as u32).circle_domain(); @@ -77,7 +77,7 @@ pub fn avx512_eval_at_secure_point(c: &mut criterion::Criterion) { ); let point = CirclePoint { x, y }; - c.bench_function("avx eval_at_secure_field_point", |b| { + c.bench_function("avx eval_at_secure_field_point 2^20", |b| { b.iter(|| { black_box(::eval_at_point(&poly, point)); }) diff --git a/benches/fft.rs b/benches/fft.rs index f84303648..6235e5b60 100644 --- a/benches/fft.rs +++ b/benches/fft.rs @@ -1,63 +1,108 @@ #![feature(iter_array_chunks)] -use criterion::Criterion; +use criterion::{BenchmarkId, 
Criterion, Throughput}; use stwo::core::backend::avx512::fft::ifft::get_itwiddle_dbls; -use stwo::core::backend::avx512::fft::rfft::get_twiddle_dbls; -use stwo::core::backend::avx512::PackedBaseField; +use stwo::core::backend::avx512::fft::transpose_vecs; +use stwo::core::backend::avx512::{BaseFieldVec, PackedBaseField}; +use stwo::core::fields::m31::BaseField; +use stwo::core::poly::circle::CanonicCoset; +#[cfg(target_arch = "x86_64")] pub fn avx512_ifft(c: &mut criterion::Criterion) { use stwo::core::backend::avx512::fft::ifft; - use stwo::core::backend::avx512::BaseFieldVec; - use stwo::core::fields::m31::BaseField; - use stwo::core::poly::circle::CanonicCoset; use stwo::platform; if !platform::avx512_detected() { return; } - const LOG_SIZE: u32 = 26; - let domain = CanonicCoset::new(LOG_SIZE).circle_domain(); - let values = (0..domain.size()) - .map(|i| BaseField::from_u32_unchecked(i as u32)) - .collect::>(); + let mut group = c.benchmark_group("iffts"); + for log_size in 16..=28 { + let (mut values, twiddle_dbls) = prepare_values(log_size); - // Compute. - let mut values = BaseFieldVec::from_iter(values); - let twiddle_dbls = get_itwiddle_dbls(domain.half_coset); + group.throughput(Throughput::Bytes( + (std::mem::size_of::() as u64) << log_size, + )); + group.bench_function(BenchmarkId::new("avx ifft", log_size), |b| { + b.iter(|| unsafe { + ifft::ifft( + std::mem::transmute(values.data.as_mut_ptr()), + &twiddle_dbls + .iter() + .map(|x| x.as_slice()) + .collect::>(), + log_size as usize, + ); + }); + }); + } +} - c.bench_function("avx ifft 26bit", |b| { +#[cfg(target_arch = "x86_64")] +pub fn avx512_ifft_parts(c: &mut criterion::Criterion) { + use stwo::core::backend::avx512::fft::ifft; + use stwo::platform; + if !platform::avx512_detected() { + return; + } + + let (mut values, twiddle_dbls) = prepare_values(14); + let mut group = c.benchmark_group("ifft parts"); + + // Note: These benchmarks run only on 2^14 elements because of their parameters. 
+ // Increasing the figure above won't change the runtime of these benchmarks. + group.throughput(Throughput::Bytes(4 << 14)); + group.bench_function("avx ifft_vecwise_loop 2^14", |b| { b.iter(|| unsafe { - ifft::ifft( + ifft::ifft_vecwise_loop( std::mem::transmute(values.data.as_mut_ptr()), &twiddle_dbls .iter() .map(|x| x.as_slice()) .collect::>(), - LOG_SIZE as usize, + 9, + 0, ); - }) + }); + }); + + group.bench_function("avx ifft3_loop 2^14", |b| { + b.iter(|| unsafe { + ifft::ifft3_loop( + std::mem::transmute(values.data.as_mut_ptr()), + &twiddle_dbls + .iter() + .skip(3) + .map(|x| x.as_slice()) + .collect::>(), + 7, + 4, + 0, + ); + }); + }); + + let (mut values, _twiddle_dbls) = prepare_values(20); + group.throughput(Throughput::Bytes(4 << 20)); + group.bench_function("avx transpose_vecs 2^20", |b| { + b.iter(|| unsafe { + transpose_vecs( + std::mem::transmute(values.data.as_mut_ptr()), + (20 - 4) as usize, + ); + }); }); } +#[cfg(target_arch = "x86_64")] pub fn avx512_rfft(c: &mut criterion::Criterion) { use stwo::core::backend::avx512::fft::rfft; - use stwo::core::backend::avx512::BaseFieldVec; - use stwo::core::fields::m31::BaseField; - use stwo::core::poly::circle::CanonicCoset; use stwo::platform; if !platform::avx512_detected() { return; } const LOG_SIZE: u32 = 20; - let domain = CanonicCoset::new(LOG_SIZE).circle_domain(); - let values = (0..domain.size()) - .map(|i| BaseField::from_u32_unchecked(i as u32)) - .collect::>(); - - // Compute. 
- let values = BaseFieldVec::from_iter(values); - let twiddle_dbls = get_twiddle_dbls(domain.half_coset); + let (values, twiddle_dbls) = prepare_values(LOG_SIZE); c.bench_function("avx rfft 20bit", |b| { b.iter(|| unsafe { @@ -78,8 +123,20 @@ pub fn avx512_rfft(c: &mut criterion::Criterion) { }); } +#[cfg(target_arch = "x86_64")] +fn prepare_values(log_size: u32) -> (BaseFieldVec, Vec>) { + let domain = CanonicCoset::new(log_size).circle_domain(); + let values = (0..domain.size()) + .map(|i| BaseField::from_u32_unchecked(i as u32)) + .collect::>(); + let values = BaseFieldVec::from_iter(values); + let twiddle_dbls = get_itwiddle_dbls(domain.half_coset); + (values, twiddle_dbls) +} + +#[cfg(target_arch = "x86_64")] criterion::criterion_group!( name=avx_ifft; config = Criterion::default().sample_size(10); - targets=avx512_ifft, avx512_rfft); + targets=avx512_ifft, avx512_ifft_parts, avx512_rfft); criterion::criterion_main!(avx_ifft); diff --git a/src/core/backend/avx512/fft/ifft.rs b/src/core/backend/avx512/fft/ifft.rs index 0a4093126..6abfe0981 100644 --- a/src/core/backend/avx512/fft/ifft.rs +++ b/src/core/backend/avx512/fft/ifft.rs @@ -147,7 +147,7 @@ pub unsafe fn ifft_lower_without_vecwise( /// high_bits - The number of bits this loops needs to run on. /// index_h - The higher part of the index, iterated by the caller. /// # Safety -unsafe fn ifft_vecwise_loop( +pub unsafe fn ifft_vecwise_loop( values: *mut i32, twiddle_dbl: &[&[i32]], loop_bits: usize, @@ -183,7 +183,7 @@ unsafe fn ifft_vecwise_loop( /// The layers `layer`, `layer + 1`, `layer + 2` are applied. /// index_h - The higher part of the index, iterated by the caller. /// # Safety -unsafe fn ifft3_loop( +pub unsafe fn ifft3_loop( values: *mut i32, twiddle_dbl: &[&[i32]], loop_bits: usize,