diff --git a/BENCHMARKS.md b/BENCHMARKS.md new file mode 100644 index 0000000..c320e26 --- /dev/null +++ b/BENCHMARKS.md @@ -0,0 +1,58 @@ +# Benchmarks + +## Table of Contents + +- [Overview](#overview) +- [Benchmark Results](#benchmark-results) + - [Number-Theoretic Transform Benchmarks](#number-theoretic-transform-benchmarks) + - [Polynomial Multiplication Benchmarks](#polynomial-multiplication-benchmarks) + +## Overview + +This benchmark comparison report shows the difference in performance between parallel, NTT-based and serial, brute-force +polynomial multiplication algorithms. Each entry in the first table reports the measured time of the NTT benchmark for the input size shown in its row. + +Computer Stats: + +``` +CPU(s): 16 +Thread(s) per core: 2 +Core(s) per socket: 8 +Socket(s): 1 +``` + +## Benchmark Results + +### Number-Theoretic Transform Benchmarks + +| | `NTT` | +|:------------|:-------------------------- | +| **`64`** | `202.26 us` (✅ **1.00x**) | +| **`128`** | `354.08 us` (✅ **1.00x**) | +| **`256`** | `665.54 us` (✅ **1.00x**) | +| **`512`** | `1.12 ms` (✅ **1.00x**) | +| **`1024`** | `2.00 ms` (✅ **1.00x**) | +| **`2048`** | `3.94 ms` (✅ **1.00x**) | +| **`4096`** | `7.69 ms` (✅ **1.00x**) | +| **`8192`** | `16.13 ms` (✅ **1.00x**) | +| **`16384`** | `34.01 ms` (✅ **1.00x**) | +| **`32768`** | `74.65 ms` (✅ **1.00x**) | + +### Polynomial Multiplication Benchmarks + +| | `NTT-Based` | `Brute-Force` | +|:------------|:--------------------------|:---------------------------------- | +| **`64`** | `1.18 ms` (✅ **1.00x**) | `48.62 us` (🚀 **24.21x faster**) | +| **`128`** | `2.30 ms` (✅ **1.00x**) | `198.30 us` (🚀 **11.59x faster**) | +| **`256`** | `3.54 ms` (✅ **1.00x**) | `766.71 us` (🚀 **4.62x faster**) | +| **`512`** | `6.50 ms` (✅ **1.00x**) | `3.11 ms` (🚀 **2.09x faster**) | +| **`1024`** | `12.43 ms` (✅ **1.00x**) | `12.34 ms` (✅ **1.01x faster**) | +| **`2048`** | `24.68 ms` (✅ **1.00x**) | `49.90 ms` (❌ *2.02x slower*) | +| **`4096`** | `51.36 ms` (✅ **1.00x**) | `200.91 ms` (❌ *3.91x slower*) | +| **`8192`** | `106.21 
ms` (✅ **1.00x**) | `803.87 ms` (❌ *7.57x slower*) | +| **`16384`** | `226.19 ms` (✅ **1.00x**) | `3.24 s` (❌ *14.31x slower*) | +| **`32768`** | `467.75 ms` (✅ **1.00x**) | `12.75 s` (❌ *27.25x slower*) | + +--- +Made with [criterion-table](https://github.com/nu11ptr/criterion-table) + diff --git a/benches/benchmark.rs b/benches/benchmark.rs index da45c0f..8484abd 100644 --- a/benches/benchmark.rs +++ b/benches/benchmark.rs @@ -29,41 +29,38 @@ fn bench_forward(n: usize, c: &Constants) { } fn criterion_forward(c: &mut Criterion) { - let mut group = c.benchmark_group("bench_forward"); - (6..deg).for_each(|x| { - group.bench_function(BenchmarkId::from_parameter(x), |b| { - let c = working_modulus(BigInt::from(x), BigInt::from(2 * x + 1)); - b.iter(|| bench_forward(black_box(1 << x), black_box(&c))) + let mut group = c.benchmark_group("Number-Theoretic Transform Benchmarks"); + (6..deg).for_each(|n| { + let id = BenchmarkId::new("NTT", 1 << n); + let c = working_modulus(BigInt::from(n), BigInt::from(2 * n + 1)); + group.bench_with_input(id, &n, |b, n| { + b.iter(|| bench_forward(black_box(1 << n), black_box(&c))) }); }); } -fn criterion_mul(c: &mut Criterion) { - let mut group = c.benchmark_group("bench_mul"); - (6..deg).for_each(|x| { - group.bench_function(BenchmarkId::from_parameter(x), |b| { - let N = BigInt::from((2 * x as usize).next_power_of_two()); - let M = N << 1 + 1; - let c = working_modulus(N, M); - b.iter(|| bench_mul(black_box(1 << x), black_box(1 << x), black_box(&c))) +fn criterion_benchmark(c: &mut Criterion) { + let mut group = c.benchmark_group("Polynomial Multiplication Benchmarks"); + + (6..deg).for_each(|n| { + let id = BenchmarkId::new("NTT-Based", 1 << n); + let N = BigInt::from((2 * n).next_power_of_two()); + let M = N << 1 + 1; + let c = working_modulus(N, M); + group.bench_with_input(id, &n, |b, n| { + b.iter(|| bench_mul(black_box(1 << n), black_box(1 << n), black_box(&c))) }); - }); - group.finish(); -} -fn criterion_brute_mul(c: &mut 
Criterion) { - let mut group = c.benchmark_group("bench_brute_mul"); - (6..deg).for_each(|x| { - group.bench_function(BenchmarkId::from_parameter(x), |b| { - b.iter(|| bench_mul_brute(black_box(1 << x), black_box(1 << x))) + let id = BenchmarkId::new("Brute-Force", 1 << n); + group.bench_with_input(id, &n, |b, n| { + b.iter(|| bench_mul_brute(black_box(1 << n), black_box(1 << n))) }); }); - group.finish(); } criterion_group! { name = benches; config = Criterion::default().sample_size(10); - targets = criterion_forward, criterion_mul, criterion_brute_mul + targets = criterion_forward, criterion_benchmark } criterion_main!(benches); diff --git a/tables.toml b/tables.toml new file mode 100644 index 0000000..872382d --- /dev/null +++ b/tables.toml @@ -0,0 +1,17 @@ +[top_comments] +Overview = """ +This benchmark comparison report shows the difference in performance between parallel, NTT-based and serial, brute-force +polynomial multiplication algorithms. Each entry in the first table reports the measured time of the NTT benchmark for the input size shown in its row. + +Computer Stats: + +``` +CPU(s): 16 +Thread(s) per core: 2 +Core(s) per socket: 8 +Socket(s): 1 +``` +""" + +[table_comments] +criterion_benchmark = """"""