From 4a714c804eb768faeac75f57fd7e79fc18898aae Mon Sep 17 00:00:00 2001 From: Sagar Dhawan Date: Mon, 2 Dec 2024 12:34:14 -0800 Subject: [PATCH 1/3] feat(icicle): Add icicle MSM support --- .github/workflows/rust.yml | 22 ++ Cargo.lock | 66 ++++ Cargo.toml | 1 + examples/alloc/Cargo.toml | 3 + examples/collatz/Cargo.toml | 3 + examples/fibonacci/Cargo.toml | 3 + examples/muldiv/Cargo.toml | 2 + examples/multi-function/Cargo.toml | 3 + examples/overflow/Cargo.toml | 5 +- examples/sha2-chain/Cargo.toml | 3 + examples/sha2-ex/Cargo.toml | 3 + examples/sha3-chain/Cargo.toml | 4 +- examples/sha3-ex/Cargo.toml | 4 +- examples/stdlib/Cargo.toml | 3 + jolt-core/Cargo.toml | 36 +- jolt-core/benches/iai.rs | 2 +- jolt-core/benches/msm.rs | 136 +++++++ jolt-core/benches/msm_batch.rs | 176 +++++++++ jolt-core/src/jolt/vm/bytecode.rs | 1 + jolt-core/src/jolt/vm/mod.rs | 40 +- .../src/jolt/vm/timestamp_range_check.rs | 18 +- jolt-core/src/msm/icicle/adapter.rs | 372 ++++++++++++++++++ jolt-core/src/msm/icicle/mod.rs | 104 +++++ jolt-core/src/msm/mod.rs | 336 +++++++++++++++- jolt-core/src/poly/commitment/hyperkzg.rs | 86 ++-- jolt-core/src/poly/commitment/hyrax.rs | 107 ++--- jolt-core/src/poly/commitment/kzg.rs | 189 ++++++++- jolt-core/src/poly/commitment/pedersen.rs | 108 ++++- jolt-core/src/poly/commitment/zeromorph.rs | 96 +++-- jolt-core/src/poly/unipoly.rs | 3 +- jolt-core/src/utils/errors.rs | 2 +- jolt-core/src/utils/mod.rs | 61 +++ jolt-evm-verifier/script/Cargo.lock | 20 +- jolt-sdk/Cargo.toml | 2 +- src/main.rs | 3 + 35 files changed, 1807 insertions(+), 216 deletions(-) create mode 100644 jolt-core/benches/msm.rs create mode 100644 jolt-core/benches/msm_batch.rs create mode 100644 jolt-core/src/msm/icicle/adapter.rs create mode 100644 jolt-core/src/msm/icicle/mod.rs diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 37f6f528e..8de965792 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -40,6 +40,11 @@ jobs: with: 
command: clippy args: --all + - name: cargo clippy icicle + uses: actions-rs/cargo@v1 + with: + command: clippy + args: --all --features icicle machete: runs-on: ubuntu-latest @@ -77,6 +82,23 @@ jobs: - name: Run jolt-core tests run: cargo nextest run --release -p jolt-core + test-icicle: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: actions-rust-lang/setup-rust-toolchain@v1 + - name: Cache Jolt RISC-V Rust toolchain + uses: actions/cache@v4 + with: + key: jolt-rust-toolchain-${{hashFiles('guest-toolchain-tag')}} + path: ~/.jolt + - name: Install Jolt RISC-V Rust toolchain + run: cargo run install-toolchain + - name: Install nextest + uses: taiki-e/install-action@nextest + - name: Run jolt-core tests + run: cargo nextest run --release -p jolt-core --features icicle + on-chain: name: Onchain Verifier Tests runs-on: ubuntu-latest diff --git a/Cargo.lock b/Cargo.lock index eab739140..18184b0b5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -696,6 +696,15 @@ version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "afb84c814227b90d6895e01398aee0d8033c00e7466aca416fb6a8e0eb19d8a7" +[[package]] +name = "cmake" +version = "0.1.51" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fb1e43aa7fd152b1f968787f7dbcdeb306d1867ff373c69955211876c053f91a" +dependencies = [ + "cc", +] + [[package]] name = "cobs" version = "0.2.3" @@ -1546,6 +1555,48 @@ dependencies = [ "serde", ] +[[package]] +name = "icicle-bn254" +version = "3.1.0" +source = "git+https://github.com/ingonyama-zk/icicle.git?tag=v3.1.0#38712a95af4a118a124321c81383daa93f59f1e4" +dependencies = [ + "cmake", + "icicle-core", + "icicle-hash", + "icicle-runtime", +] + +[[package]] +name = "icicle-core" +version = "3.1.0" +source = "git+https://github.com/ingonyama-zk/icicle.git?tag=v3.1.0#38712a95af4a118a124321c81383daa93f59f1e4" +dependencies = [ + "hex", + "icicle-runtime", + "once_cell", + "rand 0.8.5", + "rayon", +] + 
+[[package]] +name = "icicle-hash" +version = "3.1.0" +source = "git+https://github.com/ingonyama-zk/icicle.git?tag=v3.1.0#38712a95af4a118a124321c81383daa93f59f1e4" +dependencies = [ + "cmake", + "icicle-core", + "icicle-runtime", + "rand 0.8.5", +] + +[[package]] +name = "icicle-runtime" +version = "3.1.0" +source = "git+https://github.com/ingonyama-zk/icicle.git?tag=v3.1.0#38712a95af4a118a124321c81383daa93f59f1e4" +dependencies = [ + "cmake", +] + [[package]] name = "icu_collections" version = "1.5.0" @@ -1830,10 +1881,14 @@ dependencies = [ "fixedbitset", "getrandom 0.2.15", "iai-callgrind", + "icicle-bn254", + "icicle-core", + "icicle-runtime", "indicatif", "itertools 0.10.5", "memory-stats", "num-integer", + "once_cell", "postcard", "rand 0.7.3", "rand_chacha 0.3.1", @@ -1844,6 +1899,7 @@ dependencies = [ "sha3", "strum", "strum_macros", + "sys-info", "target-lexicon", "thiserror", "tokio", @@ -3384,6 +3440,16 @@ dependencies = [ "syn 2.0.89", ] +[[package]] +name = "sys-info" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b3a0d0aba8bf96a0e1ddfdc352fc53b3df7f39318c71854910c3c4b024ae52c" +dependencies = [ + "cc", + "libc", +] + [[package]] name = "sysinfo" version = "0.30.13" diff --git a/Cargo.toml b/Cargo.toml index 6323b4071..4108f1966 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -54,6 +54,7 @@ members = [ [features] host = ["jolt-sdk/host"] +icicle = ["jolt-core/icicle"] [lib] path = "./src/lib.rs" diff --git a/examples/alloc/Cargo.toml b/examples/alloc/Cargo.toml index 0064df0e3..6eca4708d 100644 --- a/examples/alloc/Cargo.toml +++ b/examples/alloc/Cargo.toml @@ -6,3 +6,6 @@ edition = "2021" [dependencies] jolt-sdk = { path = "../../jolt-sdk", features = ["host"] } guest = { package = "alloc-guest", path = "./guest" } + +[features] +icicle = ["jolt-sdk/icicle"] \ No newline at end of file diff --git a/examples/collatz/Cargo.toml b/examples/collatz/Cargo.toml index e1c81a88a..7e57f89c3 100644 --- 
a/examples/collatz/Cargo.toml +++ b/examples/collatz/Cargo.toml @@ -6,3 +6,6 @@ edition = "2021" [dependencies] jolt-sdk = { path = "../../jolt-sdk", features = ["host"] } guest = { package = "collatz-guest", path = "./guest" } + +[features] +icicle = ["jolt-sdk/icicle"] \ No newline at end of file diff --git a/examples/fibonacci/Cargo.toml b/examples/fibonacci/Cargo.toml index a5274ed94..97439eddf 100644 --- a/examples/fibonacci/Cargo.toml +++ b/examples/fibonacci/Cargo.toml @@ -6,3 +6,6 @@ edition = "2021" [dependencies] jolt-sdk = { path = "../../jolt-sdk", features = ["host"] } guest = { package = "fibonacci-guest", path = "./guest" } + +[features] +icicle = ["jolt-sdk/icicle"] diff --git a/examples/muldiv/Cargo.toml b/examples/muldiv/Cargo.toml index 6910f3f60..876b325e2 100644 --- a/examples/muldiv/Cargo.toml +++ b/examples/muldiv/Cargo.toml @@ -7,3 +7,5 @@ edition = "2021" jolt-sdk = { path = "../../jolt-sdk", features = ["host"] } guest = { package = "muldiv-guest", path = "./guest" } +[features] +icicle = ["jolt-sdk/icicle"] \ No newline at end of file diff --git a/examples/multi-function/Cargo.toml b/examples/multi-function/Cargo.toml index 76ff01257..4b1e300fd 100644 --- a/examples/multi-function/Cargo.toml +++ b/examples/multi-function/Cargo.toml @@ -6,3 +6,6 @@ edition = "2021" [dependencies] jolt-sdk = { path = "../../jolt-sdk", features = ["host"] } guest = { package = "multi-function-guest", path = "./guest" } + +[features] +icicle = ["jolt-sdk/icicle"] diff --git a/examples/overflow/Cargo.toml b/examples/overflow/Cargo.toml index a8bf498cd..41beb30ff 100644 --- a/examples/overflow/Cargo.toml +++ b/examples/overflow/Cargo.toml @@ -5,4 +5,7 @@ edition = "2021" [dependencies] jolt-sdk = { path = "../../jolt-sdk", features = ["host"] } -guest = { package = "overflow-guest", path = "./guest" } \ No newline at end of file +guest = { package = "overflow-guest", path = "./guest" } + +[features] +icicle = ["jolt-sdk/icicle"] diff --git 
a/examples/sha2-chain/Cargo.toml b/examples/sha2-chain/Cargo.toml index 7d1261436..fbc091781 100644 --- a/examples/sha2-chain/Cargo.toml +++ b/examples/sha2-chain/Cargo.toml @@ -8,3 +8,6 @@ jolt-sdk = { path = "../../jolt-sdk", features = ["host"] } guest = { package = "sha2-chain-guest", path = "./guest" } hex = "0.4.3" + +[features] +icicle = ["jolt-sdk/icicle"] diff --git a/examples/sha2-ex/Cargo.toml b/examples/sha2-ex/Cargo.toml index 8b51be32b..a45c0f436 100644 --- a/examples/sha2-ex/Cargo.toml +++ b/examples/sha2-ex/Cargo.toml @@ -8,3 +8,6 @@ jolt-sdk = { path = "../../jolt-sdk", features = ["host"] } guest = { package = "sha2-guest", path = "./guest" } hex = "0.4.3" + +[features] +icicle = ["jolt-sdk/icicle"] diff --git a/examples/sha3-chain/Cargo.toml b/examples/sha3-chain/Cargo.toml index cf12c733a..f8ddd5e1f 100644 --- a/examples/sha3-chain/Cargo.toml +++ b/examples/sha3-chain/Cargo.toml @@ -6,5 +6,7 @@ edition = "2021" [dependencies] jolt-sdk = { path = "../../jolt-sdk", features = ["host"] } guest = { package = "sha3-chain-guest", path = "./guest" } +hex = "0.4.3" -hex = "0.4.3" \ No newline at end of file +[features] +icicle = ["jolt-sdk/icicle"] diff --git a/examples/sha3-ex/Cargo.toml b/examples/sha3-ex/Cargo.toml index cce963386..39c0ba43c 100644 --- a/examples/sha3-ex/Cargo.toml +++ b/examples/sha3-ex/Cargo.toml @@ -6,5 +6,7 @@ edition = "2021" [dependencies] jolt-sdk = { path = "../../jolt-sdk", features = ["host"] } guest = { package = "sha3-guest", path = "./guest" } - hex = "0.4.3" + +[features] +icicle = ["jolt-sdk/icicle"] diff --git a/examples/stdlib/Cargo.toml b/examples/stdlib/Cargo.toml index 1069f2eff..b606705ba 100644 --- a/examples/stdlib/Cargo.toml +++ b/examples/stdlib/Cargo.toml @@ -6,3 +6,6 @@ edition = "2021" [dependencies] jolt-sdk = { path = "../../jolt-sdk", features = ["host"] } guest = { package = "stdlib-guest", path = "./guest" } + +[features] +icicle = ["jolt-sdk/icicle"] \ No newline at end of file diff --git 
a/jolt-core/Cargo.toml b/jolt-core/Cargo.toml index d53e2d7e8..7f58c5c97 100644 --- a/jolt-core/Cargo.toml +++ b/jolt-core/Cargo.toml @@ -19,6 +19,18 @@ repository = "https://github.com/a16z/jolt" license-file = "LICENSE" keywords = ["SNARK", "cryptography", "proofs"] +[features] +default = [ + "ark-ec/parallel", + "ark-ff/parallel", + "ark-std/parallel", + "ark-ff/asm", + "host", + "rayon", +] +host = ["dep:reqwest", "dep:tokio"] +icicle = ["default", "dep:icicle-runtime", "dep:icicle-core", "dep:icicle-bn254"] + [dependencies] ark-bn254 = "0.4.0" ark-ec = { version = "0.4.2", default-features = false } @@ -65,6 +77,7 @@ bytemuck = "1.19.0" tokio = { version = "1.38.0", optional = true } alloy-primitives = "0.7.6" alloy-sol-types = "0.7.6" +once_cell = "1.19.0" [dev-dependencies] criterion = { version = "0.5.1", features = ["html_reports"] } @@ -93,23 +106,24 @@ harness = false name = "compute_cubic" harness = false +[[bench]] +name = "msm" +harness = false + +[[bench]] +name = "msm_batch" +harness = false + [lib] name = "jolt_core" path = "src/lib.rs" -[features] -default = [ - "ark-ec/parallel", - "ark-ff/parallel", - "ark-std/parallel", - "ark-ff/asm", - "host", - "rayon", -] -host = ["dep:reqwest", "dep:tokio"] - [target.'cfg(not(target_arch = "wasm32"))'.dependencies] +icicle-runtime = { git = "https://github.com/ingonyama-zk/icicle.git", tag = "v3.1.0", optional = true } +icicle-core = { git = "https://github.com/ingonyama-zk/icicle.git", tag = "v3.1.0", optional = true } +icicle-bn254 = { git = "https://github.com/ingonyama-zk/icicle.git", tag = "v3.1.0", optional = true } memory-stats = "1.0.0" +sys-info = "0.9.1" tokio = { version = "1.38.0", optional = true, features = ["rt-multi-thread"] } [target.'cfg(target_arch = "wasm32")'.dependencies] diff --git a/jolt-core/benches/iai.rs b/jolt-core/benches/iai.rs index e5d971d64..0535f9177 100644 --- a/jolt-core/benches/iai.rs +++ b/jolt-core/benches/iai.rs @@ -35,7 +35,7 @@ fn eval_poly_setup(size: usize) -> 
(DensePolynomial, Vec) { #[library_benchmark] #[bench::long(msm_setup::(4096))] fn bench_msm(input: (Vec, Vec)) -> G { - black_box(VariableBaseMSM::msm(&G::normalize_batch(&input.0), &input.1).unwrap()) + black_box(VariableBaseMSM::msm(&G::normalize_batch(&input.0), None, &input.1).unwrap()) } #[library_benchmark] diff --git a/jolt-core/benches/msm.rs b/jolt-core/benches/msm.rs new file mode 100644 index 000000000..75fded2ca --- /dev/null +++ b/jolt-core/benches/msm.rs @@ -0,0 +1,136 @@ +use ark_bn254::{Bn254, Fr, G1Affine, G1Projective}; +use ark_ff::{BigInteger, PrimeField}; +use ark_std::rand::Rng; +use ark_std::UniformRand; +use ark_std::{One, Zero}; +use criterion::Criterion; +use jolt_core::field::JoltField; +#[cfg(feature = "icicle")] +use jolt_core::msm::Icicle; +use jolt_core::msm::{icicle_init, GpuBaseType, MsmType, VariableBaseMSM}; +use jolt_core::poly::commitment::commitment_scheme::CommitmentScheme; +use jolt_core::poly::commitment::zeromorph::Zeromorph; +use jolt_core::utils::transcript::{KeccakTranscript, Transcript}; +use rand_chacha::ChaCha20Rng; +use rand_core::{RngCore, SeedableRng}; +use rayon::prelude::*; + +const SRS_SIZE: usize = 1 << 20; + +// Sets up the benchmark +fn setup_bench( + msm_type: MsmType, +) -> ( + Vec, + Option>>, + Vec, +) +where + F: JoltField, + PCS: CommitmentScheme, + ProofTranscript: Transcript, +{ + let mut rng = ChaCha20Rng::seed_from_u64(SRS_SIZE as u64); + + let scalars = match msm_type { + MsmType::Zero => { + vec![Fr::zero(); SRS_SIZE] + } + MsmType::One => { + vec![Fr::one(); SRS_SIZE] + } + MsmType::Small(_) => (0..SRS_SIZE) + .into_iter() + .map(|_| { + let i = rng.gen_range(0..(1 << 10)); + ::from_u64(i).unwrap() + }) + .collect(), + MsmType::Medium(_) => (0..SRS_SIZE) + .into_iter() + .map(|_| { + let i = rng.next_u64(); + ::from_u64(i).unwrap() + }) + .collect(), + MsmType::Large(_) => (0..SRS_SIZE) + .into_iter() + .map(|_| { + let values: [u64; 4] = [ + rng.next_u64(), + rng.next_u64(), + rng.next_u64(), + 
rng.next_u64(), + ]; + let bigint = ark_ff::BigInteger256::new(values); + ::from_bytes(&bigint.to_bytes_le()) + }) + .collect(), + }; + + let bases: Vec = std::iter::repeat_with(|| G1Affine::rand(&mut rng)) + .take(SRS_SIZE) + .collect(); + #[cfg(feature = "icicle")] + let gpu_bases = Some( + bases + .par_iter() + .map(|base| G1Projective::from_ark_affine(base)) + .collect(), + ); + + let max_num_bits = scalars + .par_iter() + .map(|s| s.clone().into_bigint().num_bits()) + .max() + .unwrap(); + + println!("Using max num bits: {}", max_num_bits); + #[cfg(not(feature = "icicle"))] + let gpu_bases = None; + (bases, gpu_bases, scalars) +} + +fn benchmark_msm(c: &mut Criterion, name: &str, msm_type: MsmType) +where + F: JoltField, + PCS: CommitmentScheme, + ProofTranscript: Transcript, +{ + let (bases, gpu_bases, scalars) = setup_bench::(msm_type); + icicle_init(); + #[cfg(feature = "icicle")] + let id = format!("{} [mode:Icicle]", name); + #[cfg(not(feature = "icicle"))] + let id = format!("{} [mode:JOLT CPU]", name); + c.bench_function(&id, |b| { + b.iter(|| { + let msm = + ::msm(&bases, gpu_bases.as_deref(), &scalars); + let _ = msm.expect("MSM failed"); + }); + }); +} + +fn main() { + let mut criterion = Criterion::default() + .configure_from_args() + .sample_size(20) + .warm_up_time(std::time::Duration::from_secs(5)); + benchmark_msm::, Fr, KeccakTranscript>( + &mut criterion, + "VariableBaseMSM::msm(Large)", + MsmType::Large(0 /* unused */), + ); + benchmark_msm::, Fr, KeccakTranscript>( + &mut criterion, + "VariableBaseMSM::msm(Medium)", + MsmType::Medium(0 /* unused */), + ); + benchmark_msm::, Fr, KeccakTranscript>( + &mut criterion, + "VariableBaseMSM::msm(Small)", + MsmType::Small(0 /* unused */), + ); + criterion.final_summary(); +} diff --git a/jolt-core/benches/msm_batch.rs b/jolt-core/benches/msm_batch.rs new file mode 100644 index 000000000..f6e674134 --- /dev/null +++ b/jolt-core/benches/msm_batch.rs @@ -0,0 +1,176 @@ +use ark_bn254::{Bn254, Fr, 
G1Affine, G1Projective}; +use ark_ff::BigInteger; +use ark_std::rand::seq::SliceRandom; +use ark_std::rand::Rng; +use ark_std::UniformRand; +use ark_std::{One, Zero}; +use criterion::Criterion; +use jolt_core::field::JoltField; +#[cfg(feature = "icicle")] +use jolt_core::msm::Icicle; +use jolt_core::msm::{icicle_init, GpuBaseType, MsmType, VariableBaseMSM}; +use jolt_core::poly::commitment::commitment_scheme::CommitmentScheme; +use jolt_core::poly::commitment::zeromorph::Zeromorph; +use jolt_core::utils::transcript::{KeccakTranscript, Transcript}; +use rand_chacha::ChaCha20Rng; +use rand_core::{RngCore, SeedableRng}; +#[cfg(feature = "icicle")] +use rayon::prelude::*; + +const SRS_SIZE: usize = 1 << 14; + +// Sets up the benchmark +fn setup_bench( + batch_config: BatchConfig, +) -> ( + Vec, + Option>>, + Vec>, +) +where + F: JoltField, + PCS: CommitmentScheme, + ProofTranscript: Transcript, +{ + let mut rng = ChaCha20Rng::seed_from_u64(SRS_SIZE as u64); + // For each type in the batch config create a vector of scalars + let mut scalar_batches: Vec> = vec![]; + + (0..batch_config.small) + .into_iter() + .for_each(|_| scalar_batches.push(get_scalars(MsmType::Small(0 /* unused */), SRS_SIZE))); + (0..batch_config.medium) + .into_iter() + .for_each(|_| scalar_batches.push(get_scalars(MsmType::Medium(0 /* unused */), SRS_SIZE))); + (0..batch_config.large) + .into_iter() + .for_each(|_| scalar_batches.push(get_scalars(MsmType::Large(0 /* unused */), SRS_SIZE))); + scalar_batches.shuffle(&mut rng); + + let bases: Vec = std::iter::repeat_with(|| G1Affine::rand(&mut rng)) + .take(SRS_SIZE) + .collect(); + #[cfg(feature = "icicle")] + let gpu_bases = Some( + bases + .par_iter() + .map(|base| G1Projective::from_ark_affine(base)) + .collect(), + ); + #[cfg(not(feature = "icicle"))] + let gpu_bases = None; + (bases, gpu_bases, scalar_batches) +} + +fn get_scalars(msm_type: MsmType, size: usize) -> Vec { + let mut rng = ChaCha20Rng::seed_from_u64(size as u64); + match msm_type { 
+ MsmType::Zero => { + vec![Fr::zero(); size] + } + MsmType::One => { + vec![Fr::one(); size] + } + MsmType::Small(_) => (0..size) + .into_iter() + .map(|_| { + let i = rng.gen_range(0..(1 << 10)); + ::from_u64(i).unwrap() + }) + .collect(), + MsmType::Medium(_) => (0..size) + .into_iter() + .map(|_| { + let i = rng.next_u64(); + ::from_u64(i).unwrap() + }) + .collect(), + MsmType::Large(_) => (0..size) + .into_iter() + .map(|_| { + let values: [u64; 4] = [ + rng.next_u64(), + rng.next_u64(), + rng.next_u64(), + rng.next_u64(), + ]; + let bigint = ark_ff::BigInteger256::new(values); + ::from_bytes(&bigint.to_bytes_le()) + }) + .collect(), + } +} + +fn benchmark_msm_batch( + c: &mut Criterion, + name: &str, + batch_config: BatchConfig, +) where + F: JoltField, + PCS: CommitmentScheme, + ProofTranscript: Transcript, +{ + let (bases, gpu_bases, scalar_batches) = setup_bench::(batch_config); + let scalar_batches_ref: Vec<_> = scalar_batches + .iter() + .map(|inner_vec| inner_vec.as_slice()) + .collect(); + icicle_init(); + println!("Running benchmark for {:?}", batch_config); + #[cfg(feature = "icicle")] + let id = format!("{} [mode:Icicle]", name); + #[cfg(not(feature = "icicle"))] + let id = format!("{} [mode:JOLT CPU]", name); + c.bench_function(&id, |b| { + b.iter(|| { + let msm = ::batch_msm( + &bases, + gpu_bases.as_deref(), + &scalar_batches_ref, + ); + assert_eq!(msm.len(), scalar_batches.len()); + }); + }); +} + +#[derive(Debug, Clone, Copy)] +struct BatchConfig { + small: usize, + medium: usize, + large: usize, +} + +fn main() { + let mut criterion = Criterion::default() + .configure_from_args() + .sample_size(10) + .warm_up_time(std::time::Duration::from_secs(10)); + benchmark_msm_batch::, Fr, KeccakTranscript>( + &mut criterion, + "VariableBaseMSM::msm_batch(bias: Large)", + BatchConfig { + small: 100, + medium: 100, + large: 300, + }, + ); + benchmark_msm_batch::, Fr, KeccakTranscript>( + &mut criterion, + "VariableBaseMSM::msm_batch(bias: Medium)", + 
BatchConfig { + small: 100, + medium: 300, + large: 100, + }, + ); + benchmark_msm_batch::, Fr, KeccakTranscript>( + &mut criterion, + "VariableBaseMSM::msm_batch(bias: Small)", + BatchConfig { + small: 300, + medium: 100, + large: 100, + }, + ); + criterion.final_summary(); +} diff --git a/jolt-core/src/jolt/vm/bytecode.rs b/jolt-core/src/jolt/vm/bytecode.rs index 156950fa1..f96debd4d 100644 --- a/jolt-core/src/jolt/vm/bytecode.rs +++ b/jolt-core/src/jolt/vm/bytecode.rs @@ -22,6 +22,7 @@ use rayon::prelude::*; use super::{JoltPolynomials, JoltTraceStep}; use crate::utils::transcript::Transcript; + use crate::{ lasso::memory_checking::{MemoryCheckingProof, MemoryCheckingProver, MemoryCheckingVerifier}, poly::{dense_mlpoly::DensePolynomial, identity_poly::IdentityPolynomial}, diff --git a/jolt-core/src/jolt/vm/mod.rs b/jolt-core/src/jolt/vm/mod.rs index 46177850e..e606439ad 100644 --- a/jolt-core/src/jolt/vm/mod.rs +++ b/jolt-core/src/jolt/vm/mod.rs @@ -14,6 +14,7 @@ use std::marker::PhantomData; use strum::EnumCount; use timestamp_range_check::TimestampRangeCheckStuff; +use crate::join_conditional; use crate::jolt::{ instruction::{ div::DIVInstruction, divu::DIVUInstruction, mulh::MULHInstruction, @@ -26,6 +27,7 @@ use crate::jolt::{ use crate::lasso::memory_checking::{ Initializable, MemoryCheckingProver, MemoryCheckingVerifier, StructuredPolynomialData, }; +use crate::msm::icicle; use crate::poly::commitment::commitment_scheme::{BatchType, CommitmentScheme}; use crate::poly::dense_mlpoly::DensePolynomial; use crate::r1cs::inputs::{ConstraintInput, R1CSPolynomials, R1CSProof, R1CSStuff}; @@ -238,31 +240,57 @@ impl JoltPolynomials { PCS: CommitmentScheme, ProofTranscript: Transcript, { + let span = tracing::span!(tracing::Level::INFO, "commit::initialize"); + let _guard = span.enter(); let mut commitments = JoltCommitments::::initialize(preprocessing); + drop(_guard); + drop(span); let trace_polys = self.read_write_values(); - let trace_comitments = + let span = 
tracing::span!(tracing::Level::INFO, "commit::trace_commitments"); + let _guard = span.enter(); + let trace_commitments = PCS::batch_commit_polys_ref(&trace_polys, &preprocessing.generators, BatchType::Big); + drop(_guard); + drop(span); + commitments .read_write_values_mut() .into_iter() - .zip(trace_comitments.into_iter()) + .zip(trace_commitments.into_iter()) .for_each(|(dest, src)| *dest = src); + let span = tracing::span!(tracing::Level::INFO, "commit::t_final"); + let _guard = span.enter(); commitments.bytecode.t_final = PCS::commit(&self.bytecode.t_final, &preprocessing.generators); + drop(_guard); + drop(span); + + let span = tracing::span!(tracing::Level::INFO, "commit::read_write_memory"); + let _guard = span.enter(); ( commitments.read_write_memory.v_final, commitments.read_write_memory.t_final, - ) = rayon::join( + ) = join_conditional!( || PCS::commit(&self.read_write_memory.v_final, &preprocessing.generators), - || PCS::commit(&self.read_write_memory.t_final, &preprocessing.generators), + || PCS::commit(&self.read_write_memory.t_final, &preprocessing.generators) ); + drop(_guard); + drop(span); + + let span = tracing::span!( + tracing::Level::INFO, + "commit::commit_instructions_final_cts" + ); + let _guard = span.enter(); commitments.instruction_lookups.final_cts = PCS::batch_commit_polys( &self.instruction_lookups.final_cts, &preprocessing.generators, BatchType::Big, ); + drop(_guard); + drop(span); commitments } @@ -287,6 +315,10 @@ where max_memory_address: usize, max_trace_length: usize, ) -> JoltPreprocessing { + //TODO(sagar): This should be moved to a more appropriate place - icicle makes a network request + // which impacts prover time. 
+ icicle::icicle_init(); + let bytecode_commitment_shapes = BytecodeProof::::commit_shapes( max_bytecode_size, max_trace_length, diff --git a/jolt-core/src/jolt/vm/timestamp_range_check.rs b/jolt-core/src/jolt/vm/timestamp_range_check.rs index 933e742f7..789a4a573 100644 --- a/jolt-core/src/jolt/vm/timestamp_range_check.rs +++ b/jolt-core/src/jolt/vm/timestamp_range_check.rs @@ -1,7 +1,9 @@ +use super::{JoltCommitments, JoltPolynomials, JoltStuff}; use crate::field::{JoltField, OptimizedMul}; use crate::lasso::memory_checking::{ ExogenousOpenings, Initializable, StructuredPolynomialData, VerifierComputedOpening, }; +use crate::poly::commitment::commitment_scheme::{BatchType, CommitShape, CommitmentScheme}; use crate::poly::opening_proof::{ProverOpeningAccumulator, VerifierOpeningAccumulator}; use crate::subprotocols::grand_product::{ BatchedDenseGrandProduct, BatchedGrandProduct, BatchedGrandProductLayer, @@ -9,14 +11,6 @@ use crate::subprotocols::grand_product::{ }; use crate::utils::math::Math; use crate::utils::thread::drop_in_background_thread; -use ark_serialize::{CanonicalDeserialize, CanonicalSerialize}; -use common::constants::MEMORY_OPS_PER_INSTRUCTION; -use itertools::interleave; -use rayon::prelude::*; -#[cfg(test)] -use std::collections::HashSet; - -use crate::poly::commitment::commitment_scheme::{BatchType, CommitShape, CommitmentScheme}; use crate::utils::transcript::Transcript; use crate::{ lasso::memory_checking::{ @@ -28,8 +22,12 @@ use crate::{ }, utils::errors::ProofVerifyError, }; - -use super::{JoltCommitments, JoltPolynomials, JoltStuff}; +use ark_serialize::{CanonicalDeserialize, CanonicalSerialize}; +use common::constants::MEMORY_OPS_PER_INSTRUCTION; +use itertools::interleave; +use rayon::prelude::*; +#[cfg(test)] +use std::collections::HashSet; #[derive(Default, CanonicalSerialize, CanonicalDeserialize)] pub struct TimestampRangeCheckStuff { diff --git a/jolt-core/src/msm/icicle/adapter.rs b/jolt-core/src/msm/icicle/adapter.rs new file 
mode 100644 index 000000000..0d40890ca --- /dev/null +++ b/jolt-core/src/msm/icicle/adapter.rs @@ -0,0 +1,372 @@ +use crate::msm::{GpuBaseType, MsmType, VariableBaseMSM}; +use ark_bn254::G1Projective; +use ark_ec::{CurveGroup, ScalarMul}; +use ark_ff::{BigInteger, Field, PrimeField}; +use icicle_bn254::curve::CurveCfg as IcicleBn254; +use icicle_core::curve::{Affine, Curve, Projective}; +use icicle_core::{ + msm::{msm, MSMConfig, MSM}, + traits::FieldImpl, +}; +use icicle_runtime::memory::HostOrDeviceSlice; +use icicle_runtime::stream::IcicleStreamHandle; +use icicle_runtime::{ + memory::{DeviceVec, HostSlice}, + stream::IcicleStream, +}; +use rayon::prelude::*; +use std::os::raw::c_void; + +impl Icicle for G1Projective { + type C = IcicleBn254; + + fn to_ark_projective(point: &Projective) -> Self { + let proj_x = + ::BaseField::from_random_bytes(&point.x.to_bytes_le()).unwrap(); + let proj_y = + ::BaseField::from_random_bytes(&point.y.to_bytes_le()).unwrap(); + let proj_z = + ::BaseField::from_random_bytes(&point.z.to_bytes_le()).unwrap(); + + let proj_x = proj_x * proj_z; + let proj_y = proj_y * proj_z * proj_z; + Self::new_unchecked(proj_x, proj_y, proj_z) + } + + fn from_ark_affine(point: &Self::MulBase) -> Affine { + let x_bytes: Vec = point + .x + .to_base_prime_field_elements() + .flat_map(|x| x.into_bigint().to_bytes_le()) + .collect(); + let y_bytes: Vec = point + .y + .to_base_prime_field_elements() + .flat_map(|x| x.into_bigint().to_bytes_le()) + .collect(); + let x = ::BaseField::from_bytes_le(&x_bytes); + let y = ::BaseField::from_bytes_le(&y_bytes); + Affine:: { x, y } + } +} + +pub trait Icicle: ScalarMul { + type C: Curve + MSM; + + // Note: To prevent excessive trait the arkworks conversion functions within icicle are reimplemented + fn to_ark_projective(point: &Projective) -> Self; + + fn from_ark_affine(point: &Self::MulBase) -> Affine; +} + +#[tracing::instrument(skip_all, name = "icicle_msm")] +pub fn icicle_msm( + bases: &[GpuBaseType], + 
scalars: &[V::ScalarField], + bit_size: usize, +) -> V { + assert!(scalars.len() <= bases.len()); + + let mut bases_slice = DeviceVec::>::device_malloc(bases.len()).unwrap(); + + let span = tracing::span!(tracing::Level::INFO, "convert_scalars"); + let _guard = span.enter(); + + let mut scalars_slice = + DeviceVec::<<::C as Curve>::ScalarField>::device_malloc(scalars.len()) + .unwrap(); + let scalars_mont = + unsafe { &*(scalars as *const _ as *const [<::C as Curve>::ScalarField]) }; + + drop(_guard); + drop(span); + + let mut stream = IcicleStream::create().unwrap(); + + let span = tracing::span!(tracing::Level::INFO, "copy_to_gpu"); + let _guard = span.enter(); + bases_slice + .copy_from_host_async(HostSlice::from_slice(bases), &stream) + .unwrap(); + scalars_slice + .copy_from_host_async(HostSlice::from_slice(scalars_mont), &stream) + .unwrap(); + drop(_guard); + drop(span); + + let mut msm_result = DeviceVec::>::device_malloc(1).unwrap(); + let mut cfg = MSMConfig::default(); + cfg.stream_handle = IcicleStreamHandle::from(&stream); + cfg.is_async = false; + cfg.are_scalars_montgomery_form = true; + cfg.bitsize = bit_size as i32; + + let span = tracing::span!(tracing::Level::INFO, "gpu_msm"); + let _guard = span.enter(); + + msm( + &scalars_slice, + &bases_slice[..scalars.len()], + &cfg, + &mut msm_result, + ) + .unwrap(); + + drop(_guard); + drop(span); + + let mut msm_host_result = [Projective::::zero(); 1]; + + let span = tracing::span!(tracing::Level::INFO, "copy_msm_result"); + let _guard = span.enter(); + msm_result + .copy_to_host(HostSlice::from_mut_slice(&mut msm_host_result)) + .unwrap(); + drop(_guard); + drop(span); + + stream.synchronize().unwrap(); + stream.destroy().unwrap(); + V::to_ark_projective(&msm_host_result[0]) +} + +/// Batch process msms - assumes batches are equal in size +/// Variable Batch sizes is not currently supported by icicle +#[tracing::instrument(skip_all)] +pub fn icicle_batch_msm( + bases: &[GpuBaseType], + scalar_batches: 
&[&[V::ScalarField]], + batch_type: MsmType, +) -> Vec { + let bases_len = bases.len(); + let batch_size = scalar_batches.len(); + assert!(scalar_batches.par_iter().all(|s| s.len() == bases_len)); + + let mut stream = IcicleStream::create().unwrap(); + icicle_runtime::warmup(&stream).unwrap(); + + let mut bases_slice = + DeviceVec::>::device_malloc_async(bases_len, &stream).unwrap(); + let span = tracing::span!(tracing::Level::INFO, "copy_bases_to_gpu"); + let _guard = span.enter(); + bases_slice + .copy_from_host_async(HostSlice::from_slice(bases), &stream) + .unwrap(); + drop(_guard); + drop(span); + + let mut msm_result = + DeviceVec::>::device_malloc_async(batch_size, &stream).unwrap(); + let mut msm_host_results = vec![Projective::::zero(); batch_size]; + let total_len: usize = scalar_batches.par_iter().map(|batch| batch.len()).sum(); + let mut scalars_slice = + DeviceVec::<<::C as Curve>::ScalarField>::device_malloc_async( + total_len, &stream, + ) + .unwrap(); + + let span = tracing::span!(tracing::Level::INFO, "copy_scalars_to_gpu"); + let _guard = span.enter(); + + let mut offset = 0; + for batch in scalar_batches { + let scalars_mont = unsafe { + &*(&batch[..] 
as *const _ as *const [<::C as Curve>::ScalarField]) + }; + copy_offset_from_host_async( + &mut scalars_slice, + HostSlice::from_slice(scalars_mont), + offset, + &stream, + ) + .unwrap(); + offset += batch.len(); + } + + drop(_guard); + drop(span); + + //TODO(sagar) why doesn't the GPU always go to 100% clock speeds + let mut cfg = MSMConfig::default(); + cfg.stream_handle = IcicleStreamHandle::from(&stream); + cfg.is_async = true; + cfg.are_scalars_montgomery_form = true; + cfg.batch_size = batch_size as i32; + cfg.bitsize = batch_type.num_bits() as i32; + cfg.ext + .set_int(icicle_core::msm::CUDA_MSM_LARGE_BUCKET_FACTOR, 5); + + let span = tracing::span!(tracing::Level::INFO, "msm_batch_gpu"); + let _guard = span.enter(); + msm(&scalars_slice, &bases_slice, &cfg, &mut msm_result).unwrap(); + drop(_guard); + drop(span); + + let span = tracing::span!(tracing::Level::INFO, "synchronize"); + let _guard = span.enter(); + stream.synchronize().unwrap(); + drop(_guard); + drop(span); + + let span = tracing::span!(tracing::Level::INFO, "copy_msm_result"); + let _guard = span.enter(); + msm_result + .copy_to_host(HostSlice::from_mut_slice(&mut msm_host_results)) + .unwrap(); + drop(_guard); + drop(span); + + let span = tracing::span!(tracing::Level::INFO, "converting_results"); + let _guard = span.enter(); + stream.destroy().unwrap(); + msm_host_results + .into_par_iter() + .map(|res| V::to_ark_projective(&res)) + .collect() +} + +pub fn copy_offset_from_host_async( + dest: &mut DeviceVec, + src: &HostSlice, + offset: usize, + stream: &IcicleStream, +) -> Result<(), icicle_runtime::errors::eIcicleError> { + if dest.is_empty() { + return Ok(()); + } + + if !dest.is_on_active_device() { + panic!("not allocated on an active device"); + } + + if (src.len() + offset) > dest.len() { + panic!( + "offset {} + HostSlice.len() {} exceeds the size of the destination DeviceVec {}", + offset, + src.len(), + dest.len() + ); + } + + let size = size_of::() * src.len(); + unsafe { + 
icicle_runtime::icicle_copy_to_device_async( + dest.as_mut_ptr().add(offset) as *mut c_void, + src.as_ptr() as *const c_void, + size, + stream.handle, + ) + .wrap() + } +} + +pub fn icicle_from_ark(ark: &T) -> I +where + T: PrimeField, + I: FieldImpl, +{ + let mut ark_bytes = + Vec::with_capacity(T::BigInt::NUM_LIMBS * 8 * T::extension_degree() as usize); + for base_elem in ark.to_base_prime_field_elements() { + ark_bytes.extend_from_slice(&base_elem.into_bigint().to_bytes_le()); + } + I::from_bytes_le(&ark_bytes) +} + +pub fn icicle_to_ark(icicle: &I) -> T +where + T: PrimeField, + I: FieldImpl, +{ + T::from_random_bytes(&icicle.to_bytes_le()).unwrap() +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::msm::total_memory_bits; + use ark_bn254::{Fr, G1Affine, G1Projective}; + use ark_ec::VariableBaseMSM as ark_VariableBaseMSM; + use ark_std::UniformRand; + use icicle_bn254::curve::ScalarField as GPUScalar; + use rand_core::SeedableRng; + + #[test] + fn test_icicle_msm_consistency() { + let pow = 10; + let n = 1 << pow; + let mut rng = rand_chacha::ChaCha20Rng::seed_from_u64(n as u64); + for _ in 0..10 { + let scalars: Vec = std::iter::repeat_with(|| Fr::rand(&mut rng)) + .take(n) + .collect(); + let bases: Vec = std::iter::repeat_with(|| G1Affine::rand(&mut rng)) + .take(n) + .collect(); + + let gpu_bases = bases + .par_iter() + .map(|base| ::from_ark_affine(base)) + .collect::>(); + let icicle_res = icicle_msm::(&gpu_bases, &scalars, 256); + let arkworks_res: G1Projective = ark_VariableBaseMSM::msm(&bases, &scalars).unwrap(); + let no_gpu_res: G1Projective = + VariableBaseMSM::inner_msm(&bases, None, &scalars, false, None).unwrap(); + + assert_eq!(icicle_res, arkworks_res); + assert_eq!(icicle_res, no_gpu_res); + } + } + + #[test] + fn test_icicle_batch_msm_consistency() { + let pow = 10; + let n = 1 << pow; + let mut rng = rand_chacha::ChaCha20Rng::seed_from_u64(n as u64); + for _ in 0..10 { + let scalars: Vec = std::iter::repeat_with(|| Fr::rand(&mut 
rng)) + .take(n) + .collect(); + let scalar_batches = [scalars.as_slice(); 20]; + + let bases: Vec = std::iter::repeat_with(|| G1Affine::rand(&mut rng)) + .take(n) + .collect(); + + let gpu_bases = bases + .par_iter() + .map(|base| ::from_ark_affine(base)) + .collect::>(); + let icicle_res = + icicle_batch_msm::(&gpu_bases, &scalar_batches, MsmType::Large(256)); + let arkworks_res: Vec = (0..20) + .into_iter() + .map(|_| ark_VariableBaseMSM::msm(&bases, &scalars).unwrap()) + .collect(); + let no_gpu_res: Vec = (0..20) + .into_iter() + .map(|_| VariableBaseMSM::inner_msm(&bases, None, &scalars, false, None).unwrap()) + .collect(); + + assert_eq!(icicle_res, arkworks_res); + assert_eq!(icicle_res, no_gpu_res); + } + } + + #[test] + fn test_casting() { + let ark = Fr::from(100); + let gpu: GPUScalar = icicle_from_ark(&ark); + + let ark_bytes: [u8; 32] = unsafe { std::mem::transmute(ark) }; + let gpu_bytes: [u8; 32] = + unsafe { std::mem::transmute(icicle_to_ark::(&gpu)) }; + assert_eq!(ark_bytes, gpu_bytes); + } + + #[test] + fn test_total_memory() { + let total = total_memory_bits(); + assert!(total > 0); + } +} diff --git a/jolt-core/src/msm/icicle/mod.rs b/jolt-core/src/msm/icicle/mod.rs new file mode 100644 index 000000000..b77daee3d --- /dev/null +++ b/jolt-core/src/msm/icicle/mod.rs @@ -0,0 +1,104 @@ +#[cfg(not(feature = "icicle"))] +use ark_bn254::G1Projective; +use ark_ec::{CurveGroup, ScalarMul}; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::Once; + +#[cfg(feature = "icicle")] +pub(crate) mod adapter; +#[cfg(feature = "icicle")] +pub use adapter::*; + +static ICICLE_INIT: Once = Once::new(); +static ICICLE_READY: AtomicBool = AtomicBool::new(false); + +#[cfg(feature = "icicle")] +pub trait CurveGroupConfig: CurveGroup + Icicle {} +#[cfg(not(feature = "icicle"))] +pub trait CurveGroupConfig: CurveGroup {} + +#[cfg(feature = "icicle")] +pub trait ScalarMulConfig: ScalarMul + Icicle {} +#[cfg(not(feature = "icicle"))] +pub trait 
ScalarMulConfig: ScalarMul {} +#[cfg(not(feature = "icicle"))] +pub trait Icicle {} +#[cfg(not(feature = "icicle"))] +impl Icicle for G1Projective {} + +/// Initializes the icicle backend and sets the CUDA device as active and returns true if successful. +/// +/// Safe to call multiple times on the main thread; will only initialize the backend once. +/// +/// Todo(sagar) this takes almost 1 second - likely due to license check +/// Todo(sagar) Remove set_device from here. +#[tracing::instrument()] +pub fn icicle_init() -> bool { + let mut initialized = false; + + ICICLE_INIT.call_once(|| { + #[cfg(feature = "icicle")] + if icicle_runtime::load_backend_from_env_or_default().is_ok() { + if let Ok(devices) = icicle_runtime::get_registered_devices() { + println!("Initializing icicle: available devices {:?}", devices); + + // Attempt to set the CUDA device as active + let device = icicle_runtime::Device::new("CUDA", 0); + if icicle_runtime::set_device(&device).is_ok() { + println!("icicle using device: {:?}", device); + initialized = true; + } else { + println!("Failed to set CUDA device; falling back to CPU."); + } + } + } + + #[cfg(not(feature = "icicle"))] + { + initialized = false; + } + + #[cfg(feature = "icicle")] + if !initialized { + println!("Failed to initialize icicle backend; using JOLT CPU implementations."); + } + + ICICLE_READY.store(initialized, Ordering::Relaxed); + }); + + ICICLE_READY.load(Ordering::Relaxed) +} + +/// Returns the total memory available on the system in bits. +/// +/// If icicle is enabled, it will return the total memory available on the GPU in bits. 
+#[allow(dead_code)] +pub fn total_memory_bits() -> usize { + const DEFAULT_MEM_GB: usize = 30; + const BITS_PER_BYTE: usize = 8; + const BYTES_PER_KB: usize = 1024; + const BYTES_PER_GB: usize = 1024 * 1024 * 1024; + + #[cfg(feature = "icicle")] + if let Ok((total_bytes, _)) = icicle_runtime::get_available_memory() { + // If icicle is enabled and memory is available, return the total memory in bits. + return total_bytes.checked_mul(BITS_PER_BYTE).unwrap_or(usize::MAX); + } + + // Fallback to system memory if icicle is unavailable or not enabled. + #[cfg(not(target_arch = "wasm32"))] + if let Ok(mem_info) = sys_info::mem_info() { + return (mem_info.total as usize * BYTES_PER_KB) + .checked_mul(BITS_PER_BYTE) + .unwrap_or(usize::MAX); + } + + // Fallback to "default" memory if system memory retrieval fails. + DEFAULT_MEM_GB + .checked_mul( + BYTES_PER_GB + .checked_mul(BITS_PER_BYTE) + .unwrap_or(usize::MAX), + ) + .unwrap_or(usize::MAX) +} diff --git a/jolt-core/src/msm/mod.rs b/jolt-core/src/msm/mod.rs index 0577bbb12..365935de5 100644 --- a/jolt-core/src/msm/mod.rs +++ b/jolt-core/src/msm/mod.rs @@ -1,56 +1,322 @@ +use ark_ec::pairing::Pairing; use ark_ec::{CurveGroup, ScalarMul}; use ark_ff::{prelude::*, PrimeField}; use ark_std::cmp::Ordering; use ark_std::vec::Vec; +#[cfg(feature = "icicle")] +use icicle_core::curve::Affine; use rayon::prelude::*; -impl VariableBaseMSM for G {} +pub(crate) mod icicle; +use crate::utils::errors::ProofVerifyError; +pub use icicle::*; + +impl VariableBaseMSM for G {} + +#[cfg(feature = "icicle")] +pub type GpuBaseType = Affine; +#[cfg(not(feature = "icicle"))] +pub type GpuBaseType = G::MulBase; + +use itertools::Either; + +#[derive(Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd)] +pub enum MsmType { + Zero, + One, + Small(usize), + Medium(usize), + Large(usize), +} + +impl MsmType { + fn from_u32(i: u32) -> MsmType { + match i { + 0 => MsmType::Zero, + 1 => MsmType::One, + 2..=10 => MsmType::Small(i as usize), + 11..=64 => 
MsmType::Medium(i as usize), + _ => MsmType::Large(i as usize), + } + } + + #[tracing::instrument(skip_all)] + fn from_scalars(scalars: &[S::ScalarField]) -> MsmType { + let max_num_bits = scalars + .par_iter() + .map(|s| s.into_bigint().num_bits()) + .max() + .unwrap(); + MsmType::from_u32(max_num_bits) + } + + #[allow(dead_code)] + fn num_bits(&self) -> usize { + match self { + MsmType::Zero => 0, + MsmType::One => 1, + MsmType::Small(i) => *i, + MsmType::Medium(i) => *i, + MsmType::Large(i) => *i, + } + } + + fn prefers_icicle(&self) -> bool { + match self { + MsmType::Zero | MsmType::One | MsmType::Small(_) => false, + #[cfg(feature = "icicle")] + MsmType::Medium(_) | MsmType::Large(_) => true, + #[cfg(not(feature = "icicle"))] + _ => false, + } + } +} + +type TrackedScalar<'a, P: Pairing> = (usize, &'a [P::ScalarField]); +pub type ScalarGroups<'a, P: Pairing> = (MsmType, Vec>); /// Copy of ark_ec::VariableBaseMSM with minor modifications to speed up /// known small element sized MSMs. 
-pub trait VariableBaseMSM: ScalarMul { - fn msm(bases: &[Self::MulBase], scalars: &[Self::ScalarField]) -> Result { +pub trait VariableBaseMSM: ScalarMul + Icicle { + #[tracing::instrument(skip_all)] + fn msm( + bases: &[Self::MulBase], + gpu_bases: Option<&[GpuBaseType]>, + scalars: &[Self::ScalarField], + ) -> Result { + Self::inner_msm(bases, gpu_bases, scalars, true, None) + } + + #[tracing::instrument(skip_all)] + fn msm_with_type( + bases: &[Self::MulBase], + gpu_bases: Option<&[GpuBaseType]>, + scalars: &[Self::ScalarField], + msm_type: MsmType, + ) -> Result { + Self::inner_msm(bases, gpu_bases, scalars, true, Some(msm_type)) + } + + #[tracing::instrument(skip_all)] + fn inner_msm( + bases: &[Self::MulBase], + gpu_bases: Option<&[GpuBaseType]>, + scalars: &[Self::ScalarField], + allow_icicle: bool, + msm_type: Option, + ) -> Result { + #[cfg(not(feature = "icicle"))] + assert!(gpu_bases.is_none()); + assert_eq!(bases.len(), gpu_bases.map_or(bases.len(), |b| b.len())); + (bases.len() == scalars.len()) .then(|| { - let max_num_bits = scalars - .par_iter() - .map(|s| s.into_bigint().num_bits()) - .max() - .unwrap(); - - match max_num_bits { - 0 => Self::zero(), - 1 => { + let msm_type = msm_type.unwrap_or_else(|| MsmType::from_scalars::(scalars)); + + match msm_type { + MsmType::Zero => Self::zero(), + MsmType::One => { let scalars_u64 = &map_field_elements_to_u64::(scalars); msm_binary(bases, scalars_u64) } - 2..=10 => { + MsmType::Small(max_num_bits) => { let scalars_u64 = &map_field_elements_to_u64::(scalars); - msm_small(bases, scalars_u64, max_num_bits as usize) + msm_small(bases, scalars_u64, max_num_bits) } - 11..=64 => { + MsmType::Medium(max_num_bits) => { + // TODO(sagar) caching this as "use_icicle = use_icicle" seems to cause a massive slowdown + if use_icicle(Some(msm_type.prefers_icicle() && allow_icicle)) { + #[cfg(feature = "icicle")] + { + let mut backup = vec![]; + let gpu_bases = gpu_bases.unwrap_or_else(|| { + backup = 
Self::get_gpu_bases(bases); + &backup + }); + return icicle_msm::(gpu_bases, scalars, max_num_bits); + } + #[cfg(not(feature = "icicle"))] + { + unreachable!( + "icicle_init must not return true without the icicle feature" + ); + } + } + let scalars_u64 = &map_field_elements_to_u64::(scalars); if Self::NEGATION_IS_CHEAP { - msm_u64_wnaf(bases, scalars_u64, max_num_bits as usize) + msm_u64_wnaf(bases, scalars_u64, max_num_bits) } else { - msm_u64(bases, scalars_u64, max_num_bits as usize) + msm_u64(bases, scalars_u64, max_num_bits) } } - _ => { + MsmType::Large(max_num_bits) => { + if use_icicle(Some(msm_type.prefers_icicle() && allow_icicle)) { + #[cfg(feature = "icicle")] + { + let mut backup = vec![]; + let gpu_bases = gpu_bases.unwrap_or_else(|| { + backup = Self::get_gpu_bases(bases); + &backup + }); + return icicle_msm::(gpu_bases, scalars, max_num_bits); + } + #[cfg(not(feature = "icicle"))] + { + unreachable!( + "icicle_init must not return true without the icicle feature" + ); + } + } + let scalars = scalars .par_iter() .map(|s| s.into_bigint()) .collect::>(); if Self::NEGATION_IS_CHEAP { - msm_bigint_wnaf(bases, &scalars, max_num_bits as usize) + msm_bigint_wnaf(bases, &scalars, max_num_bits) } else { - msm_bigint(bases, &scalars, max_num_bits as usize) + msm_bigint(bases, &scalars, max_num_bits) } } } }) - .ok_or_else(|| bases.len().min(scalars.len())) + .ok_or(ProofVerifyError::KeyLengthError(bases.len(), scalars.len())) + } + + #[tracing::instrument(skip_all)] + fn batch_msm( + bases: &[Self::MulBase], + gpu_bases: Option<&[GpuBaseType]>, + scalar_batches: &[&[Self::ScalarField]], + ) -> Vec { + Self::batch_msm_inner(bases, gpu_bases, scalar_batches, true, false) + } + + #[tracing::instrument(skip_all)] + fn variable_batch_msm( + bases: &[Self::MulBase], + gpu_bases: Option<&[GpuBaseType]>, + scalar_batches: &[&[Self::ScalarField]], + ) -> Vec { + Self::batch_msm_inner(bases, gpu_bases, scalar_batches, true, true) } + + #[tracing::instrument(skip_all)] 
+ fn batch_msm_inner( + bases: &[Self::MulBase], + gpu_bases: Option<&[GpuBaseType]>, + scalar_batches: &[&[Self::ScalarField]], + allow_icicle: bool, + _variable_batches: bool, + ) -> Vec { + assert!(scalar_batches.par_iter().all(|s| s.len() == bases.len())); + #[cfg(not(feature = "icicle"))] + assert!(gpu_bases.is_none()); + assert_eq!(bases.len(), gpu_bases.map_or(bases.len(), |b| b.len())); + + if !use_icicle(Some(allow_icicle)) { + let span = tracing::span!(tracing::Level::INFO, "batch_msm_cpu_only"); + let _guard = span.enter(); + return scalar_batches + .into_par_iter() + .map(|scalars| Self::inner_msm(bases, None, scalars, false, None).unwrap()) + .collect(); + } + + // Split scalar batches into CPU and GPU workloads + let span = tracing::span!(tracing::Level::INFO, "group_scalar_indices_parallel"); + let _guard = span.enter(); + let (cpu_slices, gpu_slices): (Vec<_>, Vec<_>) = scalar_batches + .par_iter() + .enumerate() + .partition_map(|(i, scalar_slice)| { + let msm_type = MsmType::from_scalars::(scalar_slice); + if use_icicle(Some(allow_icicle && msm_type.prefers_icicle())) { + Either::Right((i, msm_type, *scalar_slice)) + } else { + Either::Left((i, msm_type, *scalar_slice)) + } + }); + drop(_guard); + drop(span); + let mut results = vec![Self::zero(); scalar_batches.len()]; + + // Handle CPU computations in parallel + let span = tracing::span!(tracing::Level::INFO, "batch_msm_cpu"); + let _guard = span.enter(); + let cpu_results: Vec<(usize, Self)> = cpu_slices + .into_par_iter() + .map(|(i, msm_type, scalars)| { + ( + i, + Self::msm_with_type(bases, None, scalars, msm_type).unwrap(), + ) + }) + .collect(); + drop(_guard); + drop(span); + + // Store CPU results + for (i, result) in cpu_results { + results[i] = result; + } + + // Handle GPU computations if available + if !gpu_slices.is_empty() && use_icicle(Some(allow_icicle)) { + #[cfg(feature = "icicle")] + { + let span = tracing::span!(tracing::Level::INFO, "batch_msms_gpu"); + let _guard = 
span.enter();
+                let mut backup = vec![];
+                let gpu_bases = gpu_bases.unwrap_or_else(|| {
+                    backup = Self::get_gpu_bases(bases);
+                    &backup
+                });
+
+                // includes putting the scalars and bases on device
+                let slice_bit_size = 256 * gpu_slices[0].2.len() * 2;
+                let slices_at_a_time = (total_memory_bits() / slice_bit_size.max(1)).max(1); // avoid div-by-zero / chunks(0) panic
+
+                // Process GPU batches with memory constraints
+                for work_chunk in gpu_slices.chunks(slices_at_a_time) {
+                    let (scalar_types, chunk_scalars): (Vec<_>, Vec<&[Self::ScalarField]>) =
+                        work_chunk
+                            .par_iter()
+                            .map(|(_, msm_type, scalars)| (*msm_type, *scalars))
+                            .unzip();
+
+                    let max_scalar_type = scalar_types.par_iter().max().unwrap();
+                    let batch_results =
+                        icicle_batch_msm::<Self>(gpu_bases, &chunk_scalars, *max_scalar_type);
+
+                    // Store GPU results using original indices
+                    for ((original_idx, _, _), result) in work_chunk.iter().zip(batch_results) {
+                        results[*original_idx] = result;
+                    }
+                }
+            }
+            #[cfg(not(feature = "icicle"))]
+            {
+                unreachable!("icicle_init must not return true without the icicle feature");
+            }
+        }
+        results
+    }
+
+    #[cfg(feature = "icicle")]
+    #[tracing::instrument(skip_all)]
+    fn get_gpu_bases(bases: &[Self::MulBase]) -> Vec<GpuBaseType<Self>> {
+        bases
+            .par_iter()
+            .map(|base| <Self as Icicle>::from_ark_affine(base))
+            .collect()
+    }
+}
+
+fn use_icicle(additional_conditions: Option<bool>) -> bool {
+    let additional = additional_conditions.unwrap_or(true);
+    icicle_init() && additional
 }
 
 fn map_field_elements_to_u64<V: VariableBaseMSM>(field_elements: &[V::ScalarField]) -> Vec<u64> {
@@ -480,3 +746,31 @@ fn ln_without_floats(a: usize) -> usize {
     // log2(a) * ln(2)
     (ark_std::log2(a) * 69 / 100) as usize
 }
+
+#[cfg(test)]
+mod tests {
+    use crate::msm::MsmType;
+
+    #[test]
+    fn test_msm_type_conversion() {
+        let msm_type = MsmType::from_u32(0);
+        assert_eq!(msm_type, MsmType::Zero);
+        assert_eq!(msm_type.num_bits(), 0);
+
+        let msm_type = MsmType::from_u32(1);
+        assert_eq!(msm_type, MsmType::One);
+        assert_eq!(msm_type.num_bits(), 1);
+
+        let msm_type = MsmType::from_u32(2);
+        
assert_eq!(msm_type, MsmType::Small(2)); + assert_eq!(msm_type.num_bits(), 2); + + let msm_type = MsmType::from_u32(11); + assert_eq!(msm_type, MsmType::Medium(11)); + assert_eq!(msm_type.num_bits(), 11); + + let msm_type = MsmType::from_u32(65); + assert_eq!(msm_type, MsmType::Large(65)); + assert_eq!(msm_type.num_bits(), 65); + } +} diff --git a/jolt-core/src/poly/commitment/hyperkzg.rs b/jolt-core/src/poly/commitment/hyperkzg.rs index 8c4e3b2b4..ca78a90d9 100644 --- a/jolt-core/src/poly/commitment/hyperkzg.rs +++ b/jolt-core/src/poly/commitment/hyperkzg.rs @@ -9,16 +9,17 @@ //! and within the KZG commitment scheme implementation itself). use super::{ commitment_scheme::{BatchType, CommitmentScheme}, - kzg, kzg::{KZGProverKey, KZGVerifierKey, UnivariateKZG}, }; -use crate::field; +use crate::field::JoltField; use crate::poly::commitment::commitment_scheme::CommitShape; +use crate::poly::commitment::kzg::CommitMode; use crate::utils::mul_0_1_optimized; use crate::utils::thread::unsafe_allocate_zero_vec; use crate::utils::transcript::Transcript; +use crate::{field, into_optimal_iter}; use crate::{ - msm::VariableBaseMSM, + msm::{Icicle, VariableBaseMSM}, poly::{commitment::kzg::SRS, dense_mlpoly::DensePolynomial, unipoly::UniPoly}, utils::{errors::ProofVerifyError, transcript::AppendToTranscript}, }; @@ -34,10 +35,18 @@ use rayon::iter::{ use std::{marker::PhantomData, sync::Arc}; use tracing::trace_span; -pub struct HyperKZGSRS(Arc>); +pub struct HyperKZGSRS(Arc>) +where + P::G1: Icicle; -impl HyperKZGSRS

{ - pub fn setup(rng: &mut R, max_degree: usize) -> Self { +impl HyperKZGSRS

+where + P::G1: Icicle, +{ + pub fn setup(rng: &mut R, max_degree: usize) -> Self + where + P::ScalarField: JoltField, + { Self(Arc::new(SRS::setup(rng, max_degree, 2))) } @@ -48,7 +57,10 @@ impl HyperKZGSRS

{ } #[derive(Clone, Debug)] -pub struct HyperKZGProverKey { +pub struct HyperKZGProverKey +where + P::G1: Icicle, +{ pub kzg_pk: KZGProverKey

, } @@ -57,7 +69,7 @@ pub struct HyperKZGVerifierKey { pub kzg_vk: KZGVerifierKey

, } -#[derive(Debug, PartialEq, CanonicalSerialize, CanonicalDeserialize)] +#[derive(Debug, Clone, PartialEq, CanonicalSerialize, CanonicalDeserialize)] pub struct HyperKZGCommitment(pub P::G1Affine); impl Default for HyperKZGCommitment

{ @@ -99,6 +111,7 @@ fn kzg_open_no_rem( ) -> P::G1Affine where

::ScalarField: field::JoltField, +

::G1: Icicle, { let h = compute_witness_polynomial::

(f, u); UnivariateKZG::commit(&pk.kzg_pk, &UniPoly::from_coeff(h)).unwrap() @@ -128,6 +141,7 @@ fn scalar_vector_muladd( s: P::ScalarField, ) where

::ScalarField: field::JoltField, +

::G1: Icicle, { assert!(a.len() >= v.len()); for i in 0..v.len() { @@ -141,6 +155,7 @@ fn kzg_compute_batch_polynomial( ) -> Vec where

::ScalarField: field::JoltField, +

::G1: Icicle, { let k = f.len(); // Number of polynomials we're batching @@ -161,6 +176,7 @@ fn kzg_open_batch( ) -> (Vec, Vec>) where

::ScalarField: field::JoltField, +

::G1: Icicle, { let k = f.len(); let t = u.len(); @@ -182,8 +198,7 @@ where let B = kzg_compute_batch_polynomial::

(f, q_powers); // Now open B at u0, ..., u_{t-1} - let w = u - .into_par_iter() + let w = into_optimal_iter!(u) .map(|ui| kzg_open_no_rem(&B, *ui, pk)) .collect::>(); @@ -206,6 +221,7 @@ fn kzg_verify_batch( ) -> bool where

::ScalarField: field::JoltField, +

::G1: Icicle, { let k = C.len(); let t = u.len(); @@ -258,6 +274,7 @@ where let L = ::msm( &[&C[..k], &[W[0], W[1], W[2], vk.kzg_vk.g1]].concat(), + None, &[ &q_powers_multiplied[..k], &[ @@ -285,6 +302,7 @@ pub struct HyperKZG { impl HyperKZG where

::ScalarField: field::JoltField, +

::G1: Icicle, { pub fn protocol_name() -> &'static [u8] { b"HyperKZG" @@ -320,11 +338,13 @@ where // Phase 1 -- create commitments com_1, ..., com_\ell // We do not compute final Pi (and its commitment) as it is constant and equals to 'eval' // also known to verifier, so can be derived on its side as well + let span = trace_span!("phase_1"); + let _enter = span.enter(); let mut polys: Vec> = Vec::new(); polys.push(poly.Z.to_vec()); for i in 0..ell - 1 { let Pi_len = polys[i].len() / 2; - let mut Pi = vec![P::ScalarField::zero(); Pi_len]; + let mut Pi = unsafe_allocate_zero_vec(Pi_len); #[allow(clippy::needless_range_loop)] Pi.par_iter_mut().enumerate().for_each(|(j, Pi_j)| { @@ -334,14 +354,16 @@ where polys.push(Pi); } + drop(_enter); + drop(span); assert_eq!(polys.len(), ell); assert_eq!(polys[ell - 1].len(), 2); // We do not need to commit to the first polynomial as it is already committed. // Compute commitments in parallel - let com: Vec = (1..polys.len()) - .into_par_iter() + // TODO(sragss): This could be done by batch too if it gets progressively smaller. + let com: Vec = into_optimal_iter!(1..polys.len()) .map(|i| UnivariateKZG::commit_slice(&pk.kzg_pk, &polys[i]).unwrap()) .collect(); @@ -508,6 +530,7 @@ impl CommitmentScheme for HyperKZG where

::ScalarField: field::JoltField, +

::G1: Icicle, { type Field = P::ScalarField; type Setup = (HyperKZGProverKey

, HyperKZGVerifierKey

); @@ -526,6 +549,7 @@ where .trim(max_len) } + #[tracing::instrument(skip_all, name = "HyperKZG::commit")] fn commit(poly: &DensePolynomial, setup: &Self::Setup) -> Self::Commitment { assert!( setup.0.kzg_pk.g1_powers().len() >= poly.Z.len(), @@ -536,36 +560,22 @@ where HyperKZGCommitment(UnivariateKZG::commit_slice(&setup.0.kzg_pk, &poly.Z).unwrap()) } + #[tracing::instrument(skip_all, name = "HyperKZG::batch_commit")] fn batch_commit( evals: &[&[Self::Field]], gens: &Self::Setup, batch_type: BatchType, ) -> Vec { - // TODO: assert lengths are valid - evals - .par_iter() - .map(|evals| { - assert!( - gens.0.kzg_pk.g1_powers().len() >= evals.len(), - "COMMIT KEY LENGTH ERROR {}, {}", - gens.0.kzg_pk.g1_powers().len(), - evals.len() - ); - match batch_type { - BatchType::GrandProduct => HyperKZGCommitment( - UnivariateKZG::commit_slice_with_mode( - &gens.0.kzg_pk, - evals, - kzg::CommitMode::GrandProduct, - ) - .unwrap(), - ), - _ => HyperKZGCommitment( - UnivariateKZG::commit_slice(&gens.0.kzg_pk, evals).unwrap(), - ), - } - }) - .collect::>() + let mode = match batch_type { + BatchType::GrandProduct => CommitMode::GrandProduct, + _ => CommitMode::Default, + }; + + UnivariateKZG::commit_batch_with_mode(&gens.0.kzg_pk, evals, mode) + .unwrap() + .into_par_iter() + .map(|c| HyperKZGCommitment(c)) + .collect() } fn commit_slice(evals: &[Self::Field], setup: &Self::Setup) -> Self::Commitment { diff --git a/jolt-core/src/poly/commitment/hyrax.rs b/jolt-core/src/poly/commitment/hyrax.rs index 1bd43e5bc..5f1d75035 100644 --- a/jolt-core/src/poly/commitment/hyrax.rs +++ b/jolt-core/src/poly/commitment/hyrax.rs @@ -15,10 +15,10 @@ use num_integer::Roots; use rayon::prelude::*; use tracing::trace_span; -use crate::msm::VariableBaseMSM; +use crate::msm::{icicle::Icicle, VariableBaseMSM}; #[derive(Clone)] -pub struct HyraxScheme { +pub struct HyraxScheme { marker: PhantomData<(G, ProofTranscript)>, } @@ -48,7 +48,7 @@ pub fn matrix_dimensions(num_vars: usize, ratio: usize) -> 
(usize, usize) { (col_size, row_size) } -impl, ProofTranscript: Transcript> +impl + Icicle, ProofTranscript: Transcript> CommitmentScheme for HyraxScheme { type Field = G::ScalarField; @@ -84,31 +84,6 @@ impl, ProofTranscript: Transcript> fn commit_slice(eval_slice: &[Self::Field], generators: &Self::Setup) -> Self::Commitment { HyraxCommitment::commit_slice(eval_slice, generators) } - fn prove( - _setup: &Self::Setup, - poly: &DensePolynomial, - opening_point: &[Self::Field], - transcript: &mut ProofTranscript, - ) -> Self::Proof { - // Implicitly prove is "prove_single", with a ratio = 1 - HyraxOpeningProof::prove(poly, opening_point, 1, transcript) - } - fn batch_prove( - _setup: &Self::Setup, - polynomials: &[&DensePolynomial], - opening_point: &[Self::Field], - openings: &[Self::Field], - batch_type: BatchType, - transcript: &mut ProofTranscript, - ) -> Self::BatchedProof { - BatchedHyraxOpeningProof::prove( - polynomials, - opening_point, - openings, - batch_type, - transcript, - ) - } fn combine_commitments( commitments: &[&Self::Commitment], coeffs: &[Self::Field], @@ -141,6 +116,31 @@ impl, ProofTranscript: Transcript> ); HyraxCommitment { row_commitments } } + fn prove( + _setup: &Self::Setup, + poly: &DensePolynomial, + opening_point: &[Self::Field], + transcript: &mut ProofTranscript, + ) -> Self::Proof { + // Implicitly prove is "prove_single", with a ratio = 1 + HyraxOpeningProof::prove(poly, opening_point, 1, transcript) + } + fn batch_prove( + _setup: &Self::Setup, + polynomials: &[&DensePolynomial], + opening_point: &[Self::Field], + openings: &[Self::Field], + batch_type: BatchType, + transcript: &mut ProofTranscript, + ) -> Self::BatchedProof { + BatchedHyraxOpeningProof::prove( + polynomials, + opening_point, + openings, + batch_type, + transcript, + ) + } fn verify( proof: &Self::Proof, @@ -185,16 +185,16 @@ impl, ProofTranscript: Transcript> } #[derive(Clone, CanonicalSerialize, CanonicalDeserialize)] -pub struct HyraxGenerators { +pub struct 
HyraxGenerators { pub gens: PedersenGenerators, } #[derive(Default, Clone, Debug, PartialEq, CanonicalSerialize, CanonicalDeserialize)] -pub struct HyraxCommitment { +pub struct HyraxCommitment { pub row_commitments: Vec, } -impl> HyraxCommitment { +impl + Icicle> HyraxCommitment { #[tracing::instrument(skip_all, name = "HyraxCommitment::commit")] pub fn commit( poly: &DensePolynomial, @@ -211,10 +211,15 @@ impl> HyraxCommitment { let (L_size, R_size) = matrix_dimensions(ell, 1); assert_eq!(L_size * R_size, n); - let gens = CurveGroup::normalize_batch(&generators.generators[..R_size]); + let gens = &generators.generators[..R_size]; + let gpu_generators = generators + .gpu_generators + .as_ref() + .map(|gens| &gens[..R_size]); + let row_commitments = eval_slice .par_chunks(R_size) - .map(|row| PedersenCommitment::commit_vector(row, &gens)) + .map(|row| PedersenCommitment::commit_vector(row, gens, gpu_generators)) .collect(); Self { row_commitments } } @@ -234,12 +239,17 @@ impl> HyraxCommitment { let (L_size, R_size) = matrix_dimensions(ell, ratio); assert_eq!(L_size * R_size, n); - let gens = CurveGroup::normalize_batch(&generators.generators[..R_size]); + let gens = &generators.generators[..R_size]; + let gpu_gens = generators + .gpu_generators + .as_ref() + .map(|gens| &gens[..R_size]); - let rows = batch.par_iter().flat_map(|poly| poly.par_chunks(R_size)); - let row_commitments: Vec = rows - .map(|row| PedersenCommitment::commit_vector(row, &gens)) + let rows: Vec<&[G::ScalarField]> = batch + .par_iter() + .flat_map(|poly| poly.par_chunks(R_size)) .collect(); + let row_commitments: Vec = G::batch_msm(gens, gpu_gens, &rows); row_commitments .par_chunks(L_size) @@ -250,7 +260,7 @@ impl> HyraxCommitment { } } -impl AppendToTranscript for HyraxCommitment { +impl AppendToTranscript for HyraxCommitment { fn append_to_transcript(&self, transcript: &mut ProofTranscript) { transcript.append_message(b"poly_commitment_begin"); for i in 0..self.row_commitments.len() { @@ 
-261,7 +271,7 @@ impl AppendToTranscript for HyraxCommitment { } #[derive(Debug, CanonicalSerialize, CanonicalDeserialize)] -pub struct HyraxOpeningProof { +pub struct HyraxOpeningProof { pub vector_matrix_product: Vec, _marker: PhantomData, } @@ -270,7 +280,7 @@ pub struct HyraxOpeningProof { impl HyraxOpeningProof where F: JoltField, - G: CurveGroup, + G: CurveGroup + Icicle, ProofTranscript: Transcript, { fn protocol_name() -> &'static [u8] { @@ -323,13 +333,16 @@ where // Verifier-derived commitment to u * a = \prod Com(u_j)^{a_j} let homomorphically_derived_commitment: G = - VariableBaseMSM::msm(&G::normalize_batch(&commitment.row_commitments), &L).unwrap(); + VariableBaseMSM::msm(&G::normalize_batch(&commitment.row_commitments), None, &L)?; let product_commitment = VariableBaseMSM::msm( - &G::normalize_batch(&pedersen_generators.generators[..R_size]), + &pedersen_generators.generators[..R_size], + pedersen_generators + .gpu_generators + .as_ref() + .map(|g| &g[..R_size]), &self.vector_matrix_product, - ) - .unwrap(); + )?; let dot_product = compute_dotproduct(&self.vector_matrix_product, &R); @@ -367,14 +380,14 @@ where } #[derive(Debug, CanonicalSerialize, CanonicalDeserialize)] -pub struct BatchedHyraxOpeningProof { +pub struct BatchedHyraxOpeningProof { pub joint_proof: HyraxOpeningProof, pub ratio: usize, _marker: PhantomData, } /// See Section 16.1 of Thaler's Proofs, Arguments, and Zero-Knowledge -impl, ProofTranscript: Transcript> +impl + Icicle, ProofTranscript: Transcript> BatchedHyraxOpeningProof { #[tracing::instrument(skip_all, name = "BatchedHyraxOpeningProof::prove")] @@ -537,7 +550,7 @@ mod tests { fn check_polynomial_commit_helper< F: JoltField, - G: CurveGroup, + G: CurveGroup + Icicle, const RATIO: usize, >() { let Z = vec![ diff --git a/jolt-core/src/poly/commitment/kzg.rs b/jolt-core/src/poly/commitment/kzg.rs index 6aa77ec36..dc66b3927 100644 --- a/jolt-core/src/poly/commitment/kzg.rs +++ b/jolt-core/src/poly/commitment/kzg.rs @@ -1,5 
+1,6 @@ use crate::field::JoltField; -use crate::msm::VariableBaseMSM; +use crate::msm::{GpuBaseType, Icicle, VariableBaseMSM}; +use crate::optimal_iter; use crate::poly::unipoly::UniPoly; use crate::utils::errors::ProofVerifyError; use ark_ec::scalar_mul::fixed_base::FixedBase; @@ -12,18 +13,29 @@ use std::marker::PhantomData; use std::sync::Arc; #[derive(Clone, Debug)] -pub struct SRS { +pub struct SRS +where + P::G1: Icicle, +{ pub g1_powers: Vec, pub g2_powers: Vec, pub g_products: Vec, + // g1_powers in icicle's GPU types + pub gpu_g1: Option>>, } -impl SRS

{ +impl SRS

+where + P::G1: Icicle, +{ pub fn setup( mut rng: &mut R, num_g1_powers: usize, num_g2_powers: usize, - ) -> Self { + ) -> Self + where + P::ScalarField: JoltField, + { let beta = P::ScalarField::rand(&mut rng); let g1 = P::G1::rand(&mut rng); let g2 = P::G2::rand(&mut rng); @@ -69,16 +81,31 @@ impl SRS

{ let powers_of_2 = (0..num_powers).into_par_iter().map(|i| 1usize << i); let g_products = powers_of_2 .map(|power| { - ::msm(&g1_powers[..power], &all_ones_coeffs[..power]) - .unwrap() - .into_affine() + ::msm( + &g1_powers[..power], + None, + &all_ones_coeffs[..power], + ) + .unwrap() + .into_affine() }) .collect(); + #[cfg(feature = "icicle")] + let gpu_g1 = Some( + g1_powers + .par_iter() + .map(::from_ark_affine) + .collect::>(), + ); + #[cfg(not(feature = "icicle"))] + let gpu_g1 = None; + Self { g1_powers, g2_powers, g_products, + gpu_g1, } } @@ -98,7 +125,10 @@ impl SRS

{ } #[derive(Clone, Debug)] -pub struct KZGProverKey { +pub struct KZGProverKey +where + P::G1: Icicle, +{ srs: Arc>, // offset to read into SRS offset: usize, @@ -106,7 +136,10 @@ pub struct KZGProverKey { supported_size: usize, } -impl KZGProverKey

{ +impl KZGProverKey

+where + P::G1: Icicle, +{ pub fn new(srs: Arc>, offset: usize, supported_size: usize) -> Self { assert!( srs.g1_powers.len() >= offset + supported_size, @@ -125,6 +158,13 @@ impl KZGProverKey

{ pub fn g1_powers(&self) -> &[P::G1Affine] { &self.srs.g1_powers[self.offset..self.offset + self.supported_size] } + + pub fn gpu_g1(&self) -> Option<&[GpuBaseType]> { + self.srs + .gpu_g1 + .as_ref() + .map(|gpu_g1| &gpu_g1[self.offset..self.offset + self.supported_size]) + } } #[derive(Clone, Copy, Debug)] @@ -150,8 +190,111 @@ pub struct UnivariateKZG { impl UnivariateKZG

where -

::ScalarField: JoltField, + P::ScalarField: JoltField, + P::G1: Icicle, { + #[tracing::instrument(skip_all, name = "KZG::commit_batch")] + pub fn commit_batch( + pk: &KZGProverKey

, + coeffs: &[&[P::ScalarField]], + ) -> Result, ProofVerifyError> { + Self::commit_batch_with_mode(pk, coeffs, CommitMode::Default) + } + + #[tracing::instrument(skip_all, name = "KZG::commit_batch_with_mode")] + pub fn commit_batch_with_mode( + pk: &KZGProverKey

, + batches: &[&[P::ScalarField]], + mode: CommitMode, + ) -> Result, ProofVerifyError> { + let g1_powers = &pk.g1_powers(); + let gpu_g1 = pk.gpu_g1(); + + // batch commit requires all batches to have the same length + assert!(batches.par_iter().all(|s| s.len() == batches[0].len())); + assert!(batches[0].len() <= g1_powers.len()); + + if let Some(invalid) = batches.iter().find(|coeffs| coeffs.len() > g1_powers.len()) { + return Err(ProofVerifyError::KeyLengthError( + g1_powers.len(), + invalid.len(), + )); + } + + let batch_size = batches[0].len(); + match mode { + CommitMode::Default => { + let commitments = ::batch_msm( + &g1_powers[..batch_size], + gpu_g1.map(|g| &g[..batch_size]), + batches, + ); + Ok(commitments.into_iter().map(|c| c.into_affine()).collect()) + } + CommitMode::GrandProduct => { + // Commit to the non-1 coefficients first then combine them with the G commitment (all-1s vector) in the SRS + let (non_one_coeffs, (non_one_bases, non_one_gpu_bases)): ( + Vec<_>, + (Vec<_>, Vec<_>), + ) = batches + .par_iter() + .map(|coeff| { + let (coeffs, (bases, gpu_bases)): (Vec<_>, (Vec<_>, Vec<_>)) = coeff + .par_iter() + .enumerate() + .filter_map(|(i, coeff)| { + if *coeff != P::ScalarField::one() { + let gpu_base = gpu_g1.map(|g| g[i]); + // Subtract 1 from the coeff because we already have a commitment to all the 1s + Some((*coeff - P::ScalarField::one(), (g1_powers[i], gpu_base))) + } else { + None + } + }) + .unzip(); + let gpu_bases: Option> = gpu_bases.into_par_iter().collect(); + (coeffs, (bases, gpu_bases)) + }) + .unzip(); + + // Perform MSM for the non-1 coefficients + assert_eq!(non_one_bases.len(), non_one_coeffs.len()); + //TODO(sagar) batch msm this + let commitments = optimal_iter!(non_one_coeffs) + .enumerate() + .map(|(i, coeffs)| { + let non_one_commitment = if !coeffs.is_empty() { + ::msm( + &non_one_bases[i], + non_one_gpu_bases[i].as_deref(), + coeffs, + ) + .unwrap() + } else { + P::G1::zero() + }; + + // find the right 
precomputed g_product to use + let num_powers = (coeffs.len() as f64).log2(); + assert_ne!( + num_powers.fract(), + 0.0, + "Invalid key length: {}", + coeffs.len() + ); + let num_powers = num_powers.floor() as usize; + + // Combine G * H: Multiply the precomputed G commitment with the non-1 commitment (H) + let final_commitment = pk.srs.g_products[num_powers] + non_one_commitment; + final_commitment.into_affine() + }) + .collect(); + + Ok(commitments) + } + } + } + #[tracing::instrument(skip_all, name = "KZG::commit_offset")] pub fn commit_offset( pk: &KZGProverKey

, @@ -214,20 +357,32 @@ where CommitMode::Default => { let c = ::msm( &pk.g1_powers()[offset..coeffs.len()], + pk.gpu_g1().map(|g| &g[offset..coeffs.len()]), &coeffs[offset..], - ) - .unwrap(); + )?; Ok(c.into_affine()) } CommitMode::GrandProduct => { let g1_powers = &pk.g1_powers()[offset..coeffs.len()]; + let gpu_g1 = pk.gpu_g1().map(|g| &g[offset..coeffs.len()]); let coeffs = &coeffs[offset..]; + let mut non_one_gpu_bases = if gpu_g1.is_some() { + Some(Vec::new()) + } else { + None + }; + // Commit to the non-1 coefficients first then combine them with the G commitment (all-1s vector) in the SRS let (non_one_coeffs, non_one_bases): (Vec<_>, Vec<_>) = coeffs .iter() .enumerate() .filter_map(|(i, coeff)| { if *coeff != P::ScalarField::one() { + if let Some(gpu_g1) = gpu_g1 { + if let Some(v) = non_one_gpu_bases.as_mut() { + v.push(gpu_g1[i]) + } + } // Subtract 1 from the coeff because we already have a commitment to all the 1s Some((*coeff - P::ScalarField::one(), g1_powers[i])) } else { @@ -238,7 +393,11 @@ where // Perform MSM for the non-1 coefficients let non_one_commitment = if !non_one_coeffs.is_empty() { - ::msm(&non_one_bases, &non_one_coeffs).unwrap() + ::msm( + &non_one_bases, + non_one_gpu_bases.as_deref(), + &non_one_coeffs, + )? 
} else { P::G1::zero() }; @@ -270,9 +429,9 @@ where let (witness_poly, _) = poly.divide_with_remainder(&divisor).unwrap(); let proof = ::msm( &pk.g1_powers()[..witness_poly.coeffs.len()], + pk.gpu_g1().map(|g| &g[..witness_poly.coeffs.len()]), witness_poly.coeffs.as_slice(), - ) - .unwrap(); + )?; let evaluation = poly.evaluate(point); Ok((proof.into_affine(), evaluation)) } diff --git a/jolt-core/src/poly/commitment/pedersen.rs b/jolt-core/src/poly/commitment/pedersen.rs index d7405d6a4..59782ca09 100644 --- a/jolt-core/src/poly/commitment/pedersen.rs +++ b/jolt-core/src/poly/commitment/pedersen.rs @@ -1,19 +1,25 @@ +use crate::msm::Icicle; +use crate::msm::{GpuBaseType, VariableBaseMSM}; use ark_ec::CurveGroup; -use ark_serialize::{CanonicalDeserialize, CanonicalSerialize}; +use ark_serialize::{ + CanonicalDeserialize, CanonicalSerialize, Compress, SerializationError, Valid, Validate, +}; use ark_std::rand::SeedableRng; +use ark_std::UniformRand; use rand_chacha::ChaCha20Rng; +#[cfg(feature = "icicle")] +use rayon::prelude::*; use sha3::digest::{ExtendableOutput, Update}; use sha3::Shake256; -use std::io::Read; +use std::io::{Read, Write}; -use crate::msm::VariableBaseMSM; - -#[derive(Clone, CanonicalSerialize, CanonicalDeserialize)] -pub struct PedersenGenerators { - pub generators: Vec, +#[derive(Clone)] +pub struct PedersenGenerators { + pub generators: Vec, + pub gpu_generators: Option>>, } -impl PedersenGenerators { +impl PedersenGenerators { #[tracing::instrument(skip_all, name = "PedersenGenerators::new")] pub fn new(len: usize, label: &[u8]) -> Self { let mut shake = Shake256::default(); @@ -27,12 +33,25 @@ impl PedersenGenerators { reader.read_exact(&mut seed).unwrap(); let mut rng = ChaCha20Rng::from_seed(seed); - let mut generators: Vec = Vec::new(); + let mut generators: Vec = Vec::new(); for _ in 0..len { - generators.push(G::rand(&mut rng)); + generators.push(G::Affine::rand(&mut rng)); } - Self { generators } + #[cfg(feature = "icicle")] + let 
gpu_generators = Some( + generators + .par_iter() + .map(::from_ark_affine) + .collect::>(), + ); + #[cfg(not(feature = "icicle"))] + let gpu_generators = None; + + Self { + generators, + gpu_generators, + } } pub fn clone_n(&self, n: usize) -> PedersenGenerators { @@ -45,24 +64,81 @@ impl PedersenGenerators { let slice = &self.generators[..n]; PedersenGenerators { generators: slice.into(), + gpu_generators: self + .gpu_generators + .as_ref() + .map(|gpu_slice| gpu_slice[..n].into()), } } } -pub trait PedersenCommitment: Sized { +pub trait PedersenCommitment: Sized { fn commit(&self, gens: &PedersenGenerators) -> G; - fn commit_vector(inputs: &[Self], bases: &[G::Affine]) -> G; + fn commit_vector( + inputs: &[Self], + bases: &[G::Affine], + gpu_bases: Option<&[GpuBaseType]>, + ) -> G; } -impl PedersenCommitment for G::ScalarField { +impl PedersenCommitment for G::ScalarField { #[tracing::instrument(skip_all, name = "PedersenCommitment::commit")] fn commit(&self, gens: &PedersenGenerators) -> G { assert_eq!(gens.generators.len(), 1); gens.generators[0] * self } - fn commit_vector(inputs: &[Self], bases: &[G::Affine]) -> G { + #[tracing::instrument(skip_all, name = "PedersenCommitment::commit_vector")] + fn commit_vector( + inputs: &[Self], + bases: &[G::Affine], + gpu_bases: Option<&[GpuBaseType]>, + ) -> G { assert_eq!(bases.len(), inputs.len()); - VariableBaseMSM::msm(bases, inputs).unwrap() + VariableBaseMSM::msm(bases, gpu_bases, inputs).unwrap() + } +} + +impl CanonicalSerialize for PedersenGenerators { + fn serialize_with_mode( + &self, + writer: W, + compress: Compress, + ) -> Result<(), SerializationError> { + self.generators.serialize_with_mode(writer, compress) + } + + fn serialized_size(&self, compress: Compress) -> usize { + self.generators.serialized_size(compress) + } +} + +impl Valid for PedersenGenerators { + fn check(&self) -> Result<(), SerializationError> { + self.generators.check() + } +} + +impl CanonicalDeserialize for PedersenGenerators { + fn 
deserialize_with_mode( + reader: R, + compress: Compress, + validate: Validate, + ) -> Result { + let generators = Vec::::deserialize_with_mode(reader, compress, validate)?; + #[cfg(feature = "icicle")] + let gpu_generators = Some( + generators + .par_iter() + .map(::from_ark_affine) + .collect::>(), + ); + #[cfg(not(feature = "icicle"))] + let gpu_generators = None; + + Ok(Self { + generators, + gpu_generators, + }) } } diff --git a/jolt-core/src/poly/commitment/zeromorph.rs b/jolt-core/src/poly/commitment/zeromorph.rs index 505bd629c..fd0021a2c 100644 --- a/jolt-core/src/poly/commitment/zeromorph.rs +++ b/jolt-core/src/poly/commitment/zeromorph.rs @@ -1,10 +1,7 @@ #![allow(clippy::too_many_arguments)] #![allow(clippy::type_complexity)] -use std::{iter, marker::PhantomData}; - -use crate::field; -use crate::msm::VariableBaseMSM; +use crate::msm::{Icicle, VariableBaseMSM}; use crate::poly::{dense_mlpoly::DensePolynomial, unipoly::UniPoly}; use crate::utils::mul_0_1_optimized; use crate::utils::thread::unsafe_allocate_zero_vec; @@ -13,26 +10,36 @@ use crate::utils::{ transcript::{AppendToTranscript, Transcript}, }; use ark_ec::{pairing::Pairing, AffineRepr, CurveGroup}; -use ark_ff::{batch_inversion, Field}; +use ark_ff::batch_inversion; use ark_serialize::{CanonicalDeserialize, CanonicalSerialize}; use ark_std::{One, Zero}; use itertools::izip; use rand_chacha::{rand_core::SeedableRng, ChaCha20Rng}; use rand_core::{CryptoRng, RngCore}; use std::sync::Arc; +use std::{iter, marker::PhantomData}; use tracing::trace_span; -use rayon::prelude::*; - use super::{ commitment_scheme::{BatchType, CommitShape, CommitmentScheme}, kzg::{KZGProverKey, KZGVerifierKey, UnivariateKZG, SRS}, }; +use crate::field::JoltField; +use crate::optimal_iter; +use rayon::prelude::*; -pub struct ZeromorphSRS(Arc>); +pub struct ZeromorphSRS(Arc>) +where + P::G1: Icicle; -impl ZeromorphSRS

{ - pub fn setup(rng: &mut R, max_degree: usize) -> Self { +impl ZeromorphSRS

+where + P::G1: Icicle, +{ + pub fn setup(rng: &mut R, max_degree: usize) -> Self + where + P::ScalarField: JoltField, + { Self(Arc::new(SRS::setup(rng, max_degree, max_degree))) } @@ -53,7 +60,10 @@ impl ZeromorphSRS

{ //TODO: adapt interface to have prover and verifier key #[derive(Clone, Debug)] -pub struct ZeromorphProverKey { +pub struct ZeromorphProverKey +where + P::G1: Icicle, +{ pub commit_pp: KZGProverKey

, pub open_pp: KZGProverKey

, } @@ -91,7 +101,7 @@ fn compute_multilinear_quotients( point: &[P::ScalarField], ) -> (Vec>, P::ScalarField) where -

::ScalarField: field::JoltField, +

::ScalarField: JoltField, { let num_var = poly.get_num_vars(); assert_eq!(num_var, point.len()); @@ -134,7 +144,7 @@ fn compute_batched_lifted_degree_quotient( y_challenge: &P::ScalarField, ) -> (UniPoly, usize) where -

::ScalarField: field::JoltField, +

::ScalarField: JoltField, { let num_vars = quotients.len(); @@ -165,14 +175,15 @@ fn eval_and_quotient_scalars( challenges: &[P::ScalarField], ) -> (P::ScalarField, (Vec, Vec)) where -

::ScalarField: field::JoltField, +

::ScalarField: JoltField, { let num_vars = challenges.len(); // squares of x = [x, x^2, .. x^{2^k}, .. x^{2^num_vars}] - let squares_of_x: Vec<_> = iter::successors(Some(x_challenge), |&x| Some(x.square())) - .take(num_vars + 1) - .collect(); + let squares_of_x: Vec<_> = + iter::successors(Some(x_challenge), |&x| Some(JoltField::square(&x))) + .take(num_vars + 1) + .collect(); let offsets_of_x = { let mut offsets_of_x = squares_of_x @@ -228,7 +239,8 @@ pub struct Zeromorph { impl Zeromorph where -

::ScalarField: field::JoltField, +

::ScalarField: JoltField, +

::G1: Icicle, P: Pairing, ProofTranscript: Transcript, { @@ -277,12 +289,18 @@ where assert_eq!(quotients.len(), poly.get_num_vars()); assert_eq!(remainder, *eval); - // Compute the multilinear quotients q_k = q_k(X_0, ..., X_{k-1}) - let q_k_com: Vec = quotients - .par_iter() + // TODO(sagar): support variable_batch msms - or decide not to support them altogether + let q_k_com: Vec = optimal_iter!(quotients) .map(|q| UnivariateKZG::commit(&pp.commit_pp, q).unwrap()) .collect(); let q_comms: Vec = q_k_com.par_iter().map(|c| c.into_group()).collect(); + // Compute the multilinear quotients q_k = q_k(X_0, ..., X_{k-1}) + // let quotient_slices: Vec<&[P::ScalarField]> = + // quotients.iter().map(|q| q.coeffs.as_slice()).collect(); + // let q_k_com = UnivariateKZG::commit_batch(&pp.commit_pp, "ient_slices)?; + // let q_comms: Vec = q_k_com.par_iter().map(|c| c.into_group()).collect(); + // let quotient_max_len = quotient_slices.iter().map(|s| s.len()).max().unwrap(); + q_comms.iter().for_each(|c| transcript.append_point(c)); // Sample challenge y @@ -459,9 +477,7 @@ where proof.q_k_com.clone(), ] .concat(); - let zeta_z_com = ::msm(&bases, &scalars) - .unwrap() - .into_affine(); + let zeta_z_com = ::msm(&bases, None, &scalars)?.into_affine(); // e(pi, [tau]_2 - x * [1]_2) == e(C_{\zeta,Z}, -[X^(N_max - 2^n - 1)]_2) <==> e(C_{\zeta,Z} - x * pi, [X^{N_max - 2^n - 1}]_2) * e(-pi, [tau_2]) == 1 let pairing = P::multi_pairing( @@ -482,7 +498,8 @@ where impl CommitmentScheme for Zeromorph where -

::ScalarField: field::JoltField, +

::ScalarField: JoltField, +

::G1: Icicle, { type Field = P::ScalarField; type Setup = (ZeromorphProverKey

, ZeromorphVerifierKey

); @@ -490,7 +507,11 @@ where type Proof = ZeromorphProof

; type BatchedProof = ZeromorphProof

; - fn setup(shapes: &[CommitShape]) -> Self::Setup { + fn setup(shapes: &[CommitShape]) -> Self::Setup + where + P::ScalarField: JoltField, + P::G1: Icicle, + { let max_len = shapes.iter().map(|shape| shape.input_length).max().unwrap(); ZeromorphSRS(Arc::new(SRS::setup( @@ -519,22 +540,11 @@ where gens: &Self::Setup, _batch_type: BatchType, ) -> Vec { - // TODO: assert lengths are valid - evals - .par_iter() - .map(|evals| { - assert!( - gens.0.commit_pp.g1_powers().len() > evals.len(), - "COMMIT KEY LENGTH ERROR {}, {}", - gens.0.commit_pp.g1_powers().len(), - evals.len() - ); - ZeromorphCommitment( - UnivariateKZG::commit(&gens.0.commit_pp, &UniPoly::from_coeff(evals.to_vec())) - .unwrap(), - ) - }) - .collect::>() + UnivariateKZG::commit_batch(&gens.0.commit_pp, evals) + .unwrap() + .into_iter() + .map(|c| ZeromorphCommitment(c)) + .collect() } fn commit_slice(evals: &[Self::Field], setup: &Self::Setup) -> Self::Commitment { @@ -631,7 +641,7 @@ mod test { use crate::utils::math::Math; use crate::utils::transcript::{KeccakTranscript, Transcript}; use ark_bn254::{Bn254, Fr}; - use ark_ff::{BigInt, Zero}; + use ark_ff::{BigInt, Field, Zero}; use ark_std::{test_rng, UniformRand}; use rand_core::SeedableRng; diff --git a/jolt-core/src/poly/unipoly.rs b/jolt-core/src/poly/unipoly.rs index 85ba65a87..0f3bad8d2 100644 --- a/jolt-core/src/poly/unipoly.rs +++ b/jolt-core/src/poly/unipoly.rs @@ -7,7 +7,7 @@ use crate::utils::gaussian_elimination::gaussian_elimination; use crate::utils::transcript::{AppendToTranscript, Transcript}; use ark_serialize::*; use rand_core::{CryptoRng, RngCore}; -use rayon::iter::{IntoParallelIterator, IntoParallelRefMutIterator, ParallelIterator}; +use rayon::prelude::*; // ax^2 + bx + c stored as vec![c,b,a] // ax^3 + bx^2 + cx + d stored as vec![d,c,b,a] @@ -79,6 +79,7 @@ impl UniPoly { for (i, div_coeff) in divisor.coeffs.iter().enumerate() { remainder.coeffs[cur_q_degree + i] -= cur_q_coeff * *div_coeff; } + while let Some(true) = 
remainder.coeffs.last().map(|c| c == &F::zero()) { remainder.coeffs.pop(); } diff --git a/jolt-core/src/utils/errors.rs b/jolt-core/src/utils/errors.rs index 566b29651..0521dcd7b 100644 --- a/jolt-core/src/utils/errors.rs +++ b/jolt-core/src/utils/errors.rs @@ -14,7 +14,7 @@ pub enum ProofVerifyError { DecompressionError([u8; 32]), #[error("R1CS proof verification failed: {0}")] SpartanError(String), - #[error("Length Error: SRS Length: {0}, Key Length: {0}")] + #[error("Length Error: SRS Length: {0}, Key Length: {1}")] KeyLengthError(usize, usize), #[error("Invalid key length: {0}, expected power of 2")] InvalidKeyLength(usize), diff --git a/jolt-core/src/utils/mod.rs b/jolt-core/src/utils/mod.rs index 8a0e30cbd..78a1288fa 100644 --- a/jolt-core/src/utils/mod.rs +++ b/jolt-core/src/utils/mod.rs @@ -13,6 +13,67 @@ pub mod sol_types; pub mod thread; pub mod transcript; +/// Macros that determine the optimal iterator type based on the feature flags. +/// +/// For some cases (e.g. offloading to GPU), we may not want to use a parallel iterator. +/// Specifically when icicle is enabled we want to be careful to use serial iteration in the right places. +/// Based on observations, multiple calls into icicle_msm functions can dramatically slow down GPU performance. +#[macro_export] +macro_rules! optimal_iter { + ($T:expr) => {{ + #[cfg(feature = "icicle")] + { + $T.iter() + } + #[cfg(not(feature = "icicle"))] + { + $T.par_iter() + } + }}; +} + +#[macro_export] +macro_rules! into_optimal_iter { + ($T:expr) => {{ + #[cfg(feature = "icicle")] + { + $T.into_iter() + } + #[cfg(not(feature = "icicle"))] + { + $T.into_par_iter() + } + }}; +} + +#[macro_export] +macro_rules! optimal_iter_mut { + ($T:expr) => {{ + #[cfg(feature = "icicle")] + { + $T.iter_mut() + } + #[cfg(not(feature = "icicle"))] + { + $T.par_iter_mut() + } + }}; +} + +#[macro_export] +macro_rules! 
join_conditional { + ($f1:expr, $f2:expr) => {{ + #[cfg(feature = "icicle")] + { + ($f1(), $f2()) + } + #[cfg(not(feature = "icicle"))] + { + rayon::join($f1, $f2) + } + }}; +} + /// Converts an integer value to a bitvector (all values {0,1}) of field elements. /// Note: ordering has the MSB in the highest index. All of the following represent the integer 1: /// - [1] diff --git a/jolt-evm-verifier/script/Cargo.lock b/jolt-evm-verifier/script/Cargo.lock index 6b6e46adf..afa6c574d 100644 --- a/jolt-evm-verifier/script/Cargo.lock +++ b/jolt-evm-verifier/script/Cargo.lock @@ -1575,6 +1575,7 @@ dependencies = [ "itertools 0.10.5", "memory-stats", "num-integer", + "once_cell", "postcard", "rand 0.7.3", "rand_chacha 0.3.1", @@ -1585,6 +1586,7 @@ dependencies = [ "sha3", "strum", "strum_macros", + "sys-info", "target-lexicon", "thiserror", "tokio", @@ -2568,9 +2570,9 @@ dependencies = [ [[package]] name = "security-framework-sys" -version = "2.12.0" +version = "2.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea4a292869320c0272d7bc55a5a6aafaff59b4f63404a003887b679a2e05b4b6" +checksum = "fa39c7303dc58b5543c94d22c1766b0d31f2ee58306363ea622b10bbc075eaa2" dependencies = [ "core-foundation-sys", "libc", @@ -2850,6 +2852,16 @@ dependencies = [ "syn 2.0.87", ] +[[package]] +name = "sys-info" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b3a0d0aba8bf96a0e1ddfdc352fc53b3df7f39318c71854910c3c4b024ae52c" +dependencies = [ + "cc", + "libc", +] + [[package]] name = "system-configuration" version = "0.6.1" @@ -2957,9 +2969,9 @@ dependencies = [ [[package]] name = "tokio" -version = "1.41.0" +version = "1.41.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "145f3413504347a2be84393cc8a7d2fb4d863b375909ea59f2158261aa258bbb" +checksum = "22cfb5bee7a6a52939ca9224d6ac897bb669134078daa8735560897f69de4d33" dependencies = [ "backtrace", "bytes", diff --git 
a/jolt-sdk/Cargo.toml b/jolt-sdk/Cargo.toml index f34ad6f7c..bd25f46c9 100644 --- a/jolt-sdk/Cargo.toml +++ b/jolt-sdk/Cargo.toml @@ -21,11 +21,11 @@ host = [ "dep:ark-bn254", "postcard/use-std", ] - guest-std = [ "postcard/use-std", "jolt-sdk-macros/guest-std", ] +icicle = ["host", "jolt-core?/icicle"] [dependencies] postcard = { version = "1.0.8", default-features = false } diff --git a/src/main.rs b/src/main.rs index da0218fd7..81ea27e1c 100644 --- a/src/main.rs +++ b/src/main.rs @@ -189,6 +189,9 @@ lto = "fat" jolt = { package = "jolt-sdk", git = "https://github.com/a16z/jolt", features = ["host"] } guest = { path = "./guest" } +[features] +icicle = ["jolt-sdk/icicle"] + [patch.crates-io] ark-ff = { git = "https://github.com/a16z/arkworks-algebra", branch = "optimize/field-from-u64" } ark-ec = { git = "https://github.com/a16z/arkworks-algebra", branch = "optimize/field-from-u64" } From f65a0c7ccc28a1dd22189c61364e048837b42c7c Mon Sep 17 00:00:00 2001 From: Sagar Dhawan Date: Mon, 16 Dec 2024 08:47:38 -0800 Subject: [PATCH 2/3] use icicle-jolt to compile cuda deps --- Cargo.lock | 17 +++++++++-------- README.md | 23 +++++++++++++++++++++++ jolt-core/Cargo.toml | 6 +++--- 3 files changed, 35 insertions(+), 11 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 18184b0b5..aefc84ab4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1557,8 +1557,8 @@ dependencies = [ [[package]] name = "icicle-bn254" -version = "3.1.0" -source = "git+https://github.com/ingonyama-zk/icicle.git?tag=v3.1.0#38712a95af4a118a124321c81383daa93f59f1e4" +version = "3.2.0" +source = "git+https://github.com/ingonyama-zk/icicle-jolt.git?rev=ed93e21#ed93e21cbb405822b0aa1b58b5dc6c7837a04108" dependencies = [ "cmake", "icicle-core", @@ -1568,8 +1568,8 @@ dependencies = [ [[package]] name = "icicle-core" -version = "3.1.0" -source = "git+https://github.com/ingonyama-zk/icicle.git?tag=v3.1.0#38712a95af4a118a124321c81383daa93f59f1e4" +version = "3.2.0" +source = 
"git+https://github.com/ingonyama-zk/icicle-jolt.git?rev=ed93e21#ed93e21cbb405822b0aa1b58b5dc6c7837a04108" dependencies = [ "hex", "icicle-runtime", @@ -1580,8 +1580,8 @@ dependencies = [ [[package]] name = "icicle-hash" -version = "3.1.0" -source = "git+https://github.com/ingonyama-zk/icicle.git?tag=v3.1.0#38712a95af4a118a124321c81383daa93f59f1e4" +version = "3.2.0" +source = "git+https://github.com/ingonyama-zk/icicle-jolt.git?rev=ed93e21#ed93e21cbb405822b0aa1b58b5dc6c7837a04108" dependencies = [ "cmake", "icicle-core", @@ -1591,10 +1591,11 @@ dependencies = [ [[package]] name = "icicle-runtime" -version = "3.1.0" -source = "git+https://github.com/ingonyama-zk/icicle.git?tag=v3.1.0#38712a95af4a118a124321c81383daa93f59f1e4" +version = "3.2.0" +source = "git+https://github.com/ingonyama-zk/icicle-jolt.git?rev=ed93e21#ed93e21cbb405822b0aa1b58b5dc6c7837a04108" dependencies = [ "cmake", + "once_cell", ] [[package]] diff --git a/README.md b/README.md index 4765c81f9..a76463f8d 100644 --- a/README.md +++ b/README.md @@ -68,6 +68,29 @@ Examples in the [`examples`](./examples/) directory can be run using e.g. ```cargo run --release -p sha2-chain``` +## CUDA Support + +JOLT supports CUDA acceleration via [icicle](https://github.com/ingonyama-zk/icicle-jolt). + +Dependencies: +1. Install [CUDA Toolkit](https://developer.nvidia.com/cuda-toolkit) +2. Install [CMake](https://cmake.org/) + +Now you may build Jolt with CUDA acceleration using the `--features icicle` flag. + +### Build + +```cargo build -p jolt-core --features icicle``` + +### Bench + +``` +# Set the icicle backend path - this won't be needed in the future +export ICICLE_BACKEND_INSTALL_DIR=$(pwd)/target/debug/deps/icicle/lib/backend +cargo bench --bench msm_batch --no-fail-fast -p jolt-core --features icicle +``` + +Note - NVIDIA doesn't support cross compilation on MacOS. Only Windows or Linux. 
## Performance profiling diff --git a/jolt-core/Cargo.toml b/jolt-core/Cargo.toml index 7f58c5c97..0153b9dde 100644 --- a/jolt-core/Cargo.toml +++ b/jolt-core/Cargo.toml @@ -119,9 +119,9 @@ name = "jolt_core" path = "src/lib.rs" [target.'cfg(not(target_arch = "wasm32"))'.dependencies] -icicle-runtime = { git = "https://github.com/ingonyama-zk/icicle.git", tag = "v3.1.0", optional = true } -icicle-core = { git = "https://github.com/ingonyama-zk/icicle.git", tag = "v3.1.0", optional = true } -icicle-bn254 = { git = "https://github.com/ingonyama-zk/icicle.git", tag = "v3.1.0", optional = true } +icicle-runtime = { git = "https://github.com/ingonyama-zk/icicle-jolt.git", features = ["cuda_backend"], rev = "ed93e21", optional = true } +icicle-core = { git = "https://github.com/ingonyama-zk/icicle-jolt.git", rev = "ed93e21", optional = true } +icicle-bn254 = { git = "https://github.com/ingonyama-zk/icicle-jolt.git", features = ["cuda_backend"], rev = "ed93e21", optional = true } memory-stats = "1.0.0" sys-info = "0.9.1" tokio = { version = "1.38.0", optional = true, features = ["rt-multi-thread"] } From 146b4b8e849ca5693e9bd01f60eb9a737a9023d0 Mon Sep 17 00:00:00 2001 From: Sagar Dhawan Date: Mon, 16 Dec 2024 08:50:33 -0800 Subject: [PATCH 3/3] remove icicle from CI --- .github/workflows/rust.yml | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 8de965792..37f6f528e 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -40,11 +40,6 @@ jobs: with: command: clippy args: --all - - name: cargo clippy icicle - uses: actions-rs/cargo@v1 - with: - command: clippy - args: --all --features icicle machete: runs-on: ubuntu-latest @@ -82,23 +77,6 @@ jobs: - name: Run jolt-core tests run: cargo nextest run --release -p jolt-core - test-icicle: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - - uses: actions-rust-lang/setup-rust-toolchain@v1 - - name: Cache Jolt 
RISC-V Rust toolchain - uses: actions/cache@v4 - with: - key: jolt-rust-toolchain-${{hashFiles('guest-toolchain-tag')}} - path: ~/.jolt - - name: Install Jolt RISC-V Rust toolchain - run: cargo run install-toolchain - - name: Install nextest - uses: taiki-e/install-action@nextest - - name: Run jolt-core tests - run: cargo nextest run --release -p jolt-core --features icicle - on-chain: name: Onchain Verifier Tests runs-on: ubuntu-latest