Skip to content

Commit

Permalink
More detailed fft benchmarks (#532)
Browse files Browse the repository at this point in the history
  • Loading branch information
spapinistarkware authored Mar 31, 2024
1 parent 2a18eb8 commit df3b5d1
Show file tree
Hide file tree
Showing 4 changed files with 95 additions and 36 deletions.
2 changes: 2 additions & 0 deletions benches/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
dev benchmark results can be seen at
https://starkware-libs.github.io/stwo/dev/bench/index.html
8 changes: 4 additions & 4 deletions benches/eval_at_point.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ pub fn cpu_eval_at_secure_point(c: &mut criterion::Criterion) {
use stwo::core::fields::qm31::QM31;
use stwo::core::poly::circle::{CanonicCoset, CircleEvaluation, PolyOps};
use stwo::core::poly::NaturalOrder;
let log_size = 23;
let log_size = 20;
let rng = &mut StdRng::seed_from_u64(0);

let domain = CanonicCoset::new(log_size as u32).circle_domain();
Expand All @@ -35,7 +35,7 @@ pub fn cpu_eval_at_secure_point(c: &mut criterion::Criterion) {
);

let point = CirclePoint { x, y };
c.bench_function("cpu eval_at_secure_field_point", |b| {
c.bench_function("cpu eval_at_secure_field_point 2^20", |b| {
b.iter(|| {
black_box(<CPUBackend as PolyOps>::eval_at_point(&poly, point));
})
Expand All @@ -52,7 +52,7 @@ pub fn avx512_eval_at_secure_point(c: &mut criterion::Criterion) {
use stwo::core::fields::qm31::QM31;
use stwo::core::poly::circle::{CanonicCoset, CircleEvaluation, PolyOps};
use stwo::core::poly::NaturalOrder;
let log_size = 23;
let log_size = 20;
let rng = &mut StdRng::seed_from_u64(0);

let domain = CanonicCoset::new(log_size as u32).circle_domain();
Expand All @@ -77,7 +77,7 @@ pub fn avx512_eval_at_secure_point(c: &mut criterion::Criterion) {
);

let point = CirclePoint { x, y };
c.bench_function("avx eval_at_secure_field_point", |b| {
c.bench_function("avx eval_at_secure_field_point 2^20", |b| {
b.iter(|| {
black_box(<AVX512Backend as PolyOps>::eval_at_point(&poly, point));
})
Expand Down
117 changes: 87 additions & 30 deletions benches/fft.rs
Original file line number Diff line number Diff line change
@@ -1,63 +1,108 @@
#![feature(iter_array_chunks)]

use criterion::Criterion;
use criterion::{BenchmarkId, Criterion, Throughput};
use stwo::core::backend::avx512::fft::ifft::get_itwiddle_dbls;
use stwo::core::backend::avx512::fft::rfft::get_twiddle_dbls;
use stwo::core::backend::avx512::PackedBaseField;
use stwo::core::backend::avx512::fft::transpose_vecs;
use stwo::core::backend::avx512::{BaseFieldVec, PackedBaseField};
use stwo::core::fields::m31::BaseField;
use stwo::core::poly::circle::CanonicCoset;

#[cfg(target_arch = "x86_64")]
pub fn avx512_ifft(c: &mut criterion::Criterion) {
use stwo::core::backend::avx512::fft::ifft;
use stwo::core::backend::avx512::BaseFieldVec;
use stwo::core::fields::m31::BaseField;
use stwo::core::poly::circle::CanonicCoset;
use stwo::platform;
if !platform::avx512_detected() {
return;
}

const LOG_SIZE: u32 = 26;
let domain = CanonicCoset::new(LOG_SIZE).circle_domain();
let values = (0..domain.size())
.map(|i| BaseField::from_u32_unchecked(i as u32))
.collect::<Vec<_>>();
let mut group = c.benchmark_group("iffts");
for log_size in 16..=28 {
let (mut values, twiddle_dbls) = prepare_values(log_size);

// Compute.
let mut values = BaseFieldVec::from_iter(values);
let twiddle_dbls = get_itwiddle_dbls(domain.half_coset);
group.throughput(Throughput::Bytes(
(std::mem::size_of::<BaseField>() as u64) << log_size,
));
group.bench_function(BenchmarkId::new("avx ifft", log_size), |b| {
b.iter(|| unsafe {
ifft::ifft(
std::mem::transmute(values.data.as_mut_ptr()),
&twiddle_dbls
.iter()
.map(|x| x.as_slice())
.collect::<Vec<_>>(),
log_size as usize,
);
});
});
}
}

c.bench_function("avx ifft 26bit", |b| {
#[cfg(target_arch = "x86_64")]
pub fn avx512_ifft_parts(c: &mut criterion::Criterion) {
use stwo::core::backend::avx512::fft::ifft;
use stwo::platform;
if !platform::avx512_detected() {
return;
}

let (mut values, twiddle_dbls) = prepare_values(14);
let mut group = c.benchmark_group("ifft parts");

// Note: These benchmarks run only on 2^14 elements because of their parameters.
// Increasing the figure above won't change the runtime of these benchmarks.
group.throughput(Throughput::Bytes(4 << 14));
group.bench_function("avx ifft_vecwise_loop 2^14", |b| {
b.iter(|| unsafe {
ifft::ifft(
ifft::ifft_vecwise_loop(
std::mem::transmute(values.data.as_mut_ptr()),
&twiddle_dbls
.iter()
.map(|x| x.as_slice())
.collect::<Vec<_>>(),
LOG_SIZE as usize,
9,
0,
);
})
});
});

group.bench_function("avx ifft3_loop 2^14", |b| {
b.iter(|| unsafe {
ifft::ifft3_loop(
std::mem::transmute(values.data.as_mut_ptr()),
&twiddle_dbls
.iter()
.skip(3)
.map(|x| x.as_slice())
.collect::<Vec<_>>(),
7,
4,
0,
);
});
});

let (mut values, _twiddle_dbls) = prepare_values(20);
group.throughput(Throughput::Bytes(4 << 20));
group.bench_function("avx transpose_vecs 2^20", |b| {
b.iter(|| unsafe {
transpose_vecs(
std::mem::transmute(values.data.as_mut_ptr()),
(20 - 4) as usize,
);
});
});
}

#[cfg(target_arch = "x86_64")]
pub fn avx512_rfft(c: &mut criterion::Criterion) {
use stwo::core::backend::avx512::fft::rfft;
use stwo::core::backend::avx512::BaseFieldVec;
use stwo::core::fields::m31::BaseField;
use stwo::core::poly::circle::CanonicCoset;
use stwo::platform;
if !platform::avx512_detected() {
return;
}

const LOG_SIZE: u32 = 20;
let domain = CanonicCoset::new(LOG_SIZE).circle_domain();
let values = (0..domain.size())
.map(|i| BaseField::from_u32_unchecked(i as u32))
.collect::<Vec<_>>();

// Compute.
let values = BaseFieldVec::from_iter(values);
let twiddle_dbls = get_twiddle_dbls(domain.half_coset);
let (values, twiddle_dbls) = prepare_values(LOG_SIZE);

c.bench_function("avx rfft 20bit", |b| {
b.iter(|| unsafe {
Expand All @@ -78,8 +123,20 @@ pub fn avx512_rfft(c: &mut criterion::Criterion) {
});
}

/// Builds the shared input data for the FFT benchmarks in this file.
///
/// `log_size` is the log2 size passed to `CanonicCoset::new`; the circle domain of that coset
/// determines how many values are generated.
///
/// Returns a pair of:
/// * a `BaseFieldVec` holding the values `0, 1, 2, ...`, one per domain point, and
/// * the result of `get_itwiddle_dbls` for the domain's half coset.
///
/// NOTE(review): `avx512_rfft` also takes its twiddles from this helper, so it receives the
/// output of `get_itwiddle_dbls` rather than `get_twiddle_dbls` - confirm this is intended.
#[cfg(target_arch = "x86_64")]
fn prepare_values(log_size: u32) -> (BaseFieldVec, Vec<Vec<i32>>) {
    let circle_domain = CanonicCoset::new(log_size).circle_domain();

    // Element `i` is simply the index `i` interpreted as a base field element.
    let mut elements = Vec::new();
    elements.extend((0..circle_domain.size()).map(|i| BaseField::from_u32_unchecked(i as u32)));

    (
        BaseFieldVec::from_iter(elements),
        get_itwiddle_dbls(circle_domain.half_coset),
    )
}

#[cfg(target_arch = "x86_64")]
criterion::criterion_group!(
name=avx_ifft;
config = Criterion::default().sample_size(10);
targets=avx512_ifft, avx512_rfft);
targets=avx512_ifft, avx512_ifft_parts, avx512_rfft);
criterion::criterion_main!(avx_ifft);
4 changes: 2 additions & 2 deletions src/core/backend/avx512/fft/ifft.rs
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ pub unsafe fn ifft_lower_without_vecwise(
/// high_bits - The number of bits this loops needs to run on.
/// index_h - The higher part of the index, iterated by the caller.
/// # Safety
unsafe fn ifft_vecwise_loop(
pub unsafe fn ifft_vecwise_loop(
values: *mut i32,
twiddle_dbl: &[&[i32]],
loop_bits: usize,
Expand Down Expand Up @@ -183,7 +183,7 @@ unsafe fn ifft_vecwise_loop(
/// The layers `layer`, `layer + 1`, `layer + 2` are applied.
/// index_h - The higher part of the index, iterated by the caller.
/// # Safety
unsafe fn ifft3_loop(
pub unsafe fn ifft3_loop(
values: *mut i32,
twiddle_dbl: &[&[i32]],
loop_bits: usize,
Expand Down

0 comments on commit df3b5d1

Please sign in to comment.