From 4a714c804eb768faeac75f57fd7e79fc18898aae Mon Sep 17 00:00:00 2001 From: Sagar Dhawan Date: Mon, 2 Dec 2024 12:34:14 -0800 Subject: [PATCH 1/3] feat(icicle): Add icicle MSM support --- .github/workflows/rust.yml | 22 ++ Cargo.lock | 66 ++++ Cargo.toml | 1 + examples/alloc/Cargo.toml | 3 + examples/collatz/Cargo.toml | 3 + examples/fibonacci/Cargo.toml | 3 + examples/muldiv/Cargo.toml | 2 + examples/multi-function/Cargo.toml | 3 + examples/overflow/Cargo.toml | 5 +- examples/sha2-chain/Cargo.toml | 3 + examples/sha2-ex/Cargo.toml | 3 + examples/sha3-chain/Cargo.toml | 4 +- examples/sha3-ex/Cargo.toml | 4 +- examples/stdlib/Cargo.toml | 3 + jolt-core/Cargo.toml | 36 +- jolt-core/benches/iai.rs | 2 +- jolt-core/benches/msm.rs | 136 +++++++ jolt-core/benches/msm_batch.rs | 176 +++++++++ jolt-core/src/jolt/vm/bytecode.rs | 1 + jolt-core/src/jolt/vm/mod.rs | 40 +- .../src/jolt/vm/timestamp_range_check.rs | 18 +- jolt-core/src/msm/icicle/adapter.rs | 372 ++++++++++++++++++ jolt-core/src/msm/icicle/mod.rs | 104 +++++ jolt-core/src/msm/mod.rs | 336 +++++++++++++++- jolt-core/src/poly/commitment/hyperkzg.rs | 86 ++-- jolt-core/src/poly/commitment/hyrax.rs | 107 ++--- jolt-core/src/poly/commitment/kzg.rs | 189 ++++++++- jolt-core/src/poly/commitment/pedersen.rs | 108 ++++- jolt-core/src/poly/commitment/zeromorph.rs | 96 +++-- jolt-core/src/poly/unipoly.rs | 3 +- jolt-core/src/utils/errors.rs | 2 +- jolt-core/src/utils/mod.rs | 61 +++ jolt-evm-verifier/script/Cargo.lock | 20 +- jolt-sdk/Cargo.toml | 2 +- src/main.rs | 3 + 35 files changed, 1807 insertions(+), 216 deletions(-) create mode 100644 jolt-core/benches/msm.rs create mode 100644 jolt-core/benches/msm_batch.rs create mode 100644 jolt-core/src/msm/icicle/adapter.rs create mode 100644 jolt-core/src/msm/icicle/mod.rs diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 37f6f528e..8de965792 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -40,6 +40,11 @@ jobs: with: 
command: clippy args: --all + - name: cargo clippy icicle + uses: actions-rs/cargo@v1 + with: + command: clippy + args: --all --features icicle machete: runs-on: ubuntu-latest @@ -77,6 +82,23 @@ jobs: - name: Run jolt-core tests run: cargo nextest run --release -p jolt-core + test-icicle: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: actions-rust-lang/setup-rust-toolchain@v1 + - name: Cache Jolt RISC-V Rust toolchain + uses: actions/cache@v4 + with: + key: jolt-rust-toolchain-${{hashFiles('guest-toolchain-tag')}} + path: ~/.jolt + - name: Install Jolt RISC-V Rust toolchain + run: cargo run install-toolchain + - name: Install nextest + uses: taiki-e/install-action@nextest + - name: Run jolt-core tests + run: cargo nextest run --release -p jolt-core --features icicle + on-chain: name: Onchain Verifier Tests runs-on: ubuntu-latest diff --git a/Cargo.lock b/Cargo.lock index eab739140..18184b0b5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -696,6 +696,15 @@ version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "afb84c814227b90d6895e01398aee0d8033c00e7466aca416fb6a8e0eb19d8a7" +[[package]] +name = "cmake" +version = "0.1.51" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fb1e43aa7fd152b1f968787f7dbcdeb306d1867ff373c69955211876c053f91a" +dependencies = [ + "cc", +] + [[package]] name = "cobs" version = "0.2.3" @@ -1546,6 +1555,48 @@ dependencies = [ "serde", ] +[[package]] +name = "icicle-bn254" +version = "3.1.0" +source = "git+https://github.com/ingonyama-zk/icicle.git?tag=v3.1.0#38712a95af4a118a124321c81383daa93f59f1e4" +dependencies = [ + "cmake", + "icicle-core", + "icicle-hash", + "icicle-runtime", +] + +[[package]] +name = "icicle-core" +version = "3.1.0" +source = "git+https://github.com/ingonyama-zk/icicle.git?tag=v3.1.0#38712a95af4a118a124321c81383daa93f59f1e4" +dependencies = [ + "hex", + "icicle-runtime", + "once_cell", + "rand 0.8.5", + "rayon", +] + 
+[[package]] +name = "icicle-hash" +version = "3.1.0" +source = "git+https://github.com/ingonyama-zk/icicle.git?tag=v3.1.0#38712a95af4a118a124321c81383daa93f59f1e4" +dependencies = [ + "cmake", + "icicle-core", + "icicle-runtime", + "rand 0.8.5", +] + +[[package]] +name = "icicle-runtime" +version = "3.1.0" +source = "git+https://github.com/ingonyama-zk/icicle.git?tag=v3.1.0#38712a95af4a118a124321c81383daa93f59f1e4" +dependencies = [ + "cmake", +] + [[package]] name = "icu_collections" version = "1.5.0" @@ -1830,10 +1881,14 @@ dependencies = [ "fixedbitset", "getrandom 0.2.15", "iai-callgrind", + "icicle-bn254", + "icicle-core", + "icicle-runtime", "indicatif", "itertools 0.10.5", "memory-stats", "num-integer", + "once_cell", "postcard", "rand 0.7.3", "rand_chacha 0.3.1", @@ -1844,6 +1899,7 @@ dependencies = [ "sha3", "strum", "strum_macros", + "sys-info", "target-lexicon", "thiserror", "tokio", @@ -3384,6 +3440,16 @@ dependencies = [ "syn 2.0.89", ] +[[package]] +name = "sys-info" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b3a0d0aba8bf96a0e1ddfdc352fc53b3df7f39318c71854910c3c4b024ae52c" +dependencies = [ + "cc", + "libc", +] + [[package]] name = "sysinfo" version = "0.30.13" diff --git a/Cargo.toml b/Cargo.toml index 6323b4071..4108f1966 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -54,6 +54,7 @@ members = [ [features] host = ["jolt-sdk/host"] +icicle = ["jolt-core/icicle"] [lib] path = "./src/lib.rs" diff --git a/examples/alloc/Cargo.toml b/examples/alloc/Cargo.toml index 0064df0e3..6eca4708d 100644 --- a/examples/alloc/Cargo.toml +++ b/examples/alloc/Cargo.toml @@ -6,3 +6,6 @@ edition = "2021" [dependencies] jolt-sdk = { path = "../../jolt-sdk", features = ["host"] } guest = { package = "alloc-guest", path = "./guest" } + +[features] +icicle = ["jolt-sdk/icicle"] \ No newline at end of file diff --git a/examples/collatz/Cargo.toml b/examples/collatz/Cargo.toml index e1c81a88a..7e57f89c3 100644 --- 
a/examples/collatz/Cargo.toml +++ b/examples/collatz/Cargo.toml @@ -6,3 +6,6 @@ edition = "2021" [dependencies] jolt-sdk = { path = "../../jolt-sdk", features = ["host"] } guest = { package = "collatz-guest", path = "./guest" } + +[features] +icicle = ["jolt-sdk/icicle"] \ No newline at end of file diff --git a/examples/fibonacci/Cargo.toml b/examples/fibonacci/Cargo.toml index a5274ed94..97439eddf 100644 --- a/examples/fibonacci/Cargo.toml +++ b/examples/fibonacci/Cargo.toml @@ -6,3 +6,6 @@ edition = "2021" [dependencies] jolt-sdk = { path = "../../jolt-sdk", features = ["host"] } guest = { package = "fibonacci-guest", path = "./guest" } + +[features] +icicle = ["jolt-sdk/icicle"] diff --git a/examples/muldiv/Cargo.toml b/examples/muldiv/Cargo.toml index 6910f3f60..876b325e2 100644 --- a/examples/muldiv/Cargo.toml +++ b/examples/muldiv/Cargo.toml @@ -7,3 +7,5 @@ edition = "2021" jolt-sdk = { path = "../../jolt-sdk", features = ["host"] } guest = { package = "muldiv-guest", path = "./guest" } +[features] +icicle = ["jolt-sdk/icicle"] \ No newline at end of file diff --git a/examples/multi-function/Cargo.toml b/examples/multi-function/Cargo.toml index 76ff01257..4b1e300fd 100644 --- a/examples/multi-function/Cargo.toml +++ b/examples/multi-function/Cargo.toml @@ -6,3 +6,6 @@ edition = "2021" [dependencies] jolt-sdk = { path = "../../jolt-sdk", features = ["host"] } guest = { package = "multi-function-guest", path = "./guest" } + +[features] +icicle = ["jolt-sdk/icicle"] diff --git a/examples/overflow/Cargo.toml b/examples/overflow/Cargo.toml index a8bf498cd..41beb30ff 100644 --- a/examples/overflow/Cargo.toml +++ b/examples/overflow/Cargo.toml @@ -5,4 +5,7 @@ edition = "2021" [dependencies] jolt-sdk = { path = "../../jolt-sdk", features = ["host"] } -guest = { package = "overflow-guest", path = "./guest" } \ No newline at end of file +guest = { package = "overflow-guest", path = "./guest" } + +[features] +icicle = ["jolt-sdk/icicle"] diff --git 
a/examples/sha2-chain/Cargo.toml b/examples/sha2-chain/Cargo.toml index 7d1261436..fbc091781 100644 --- a/examples/sha2-chain/Cargo.toml +++ b/examples/sha2-chain/Cargo.toml @@ -8,3 +8,6 @@ jolt-sdk = { path = "../../jolt-sdk", features = ["host"] } guest = { package = "sha2-chain-guest", path = "./guest" } hex = "0.4.3" + +[features] +icicle = ["jolt-sdk/icicle"] diff --git a/examples/sha2-ex/Cargo.toml b/examples/sha2-ex/Cargo.toml index 8b51be32b..a45c0f436 100644 --- a/examples/sha2-ex/Cargo.toml +++ b/examples/sha2-ex/Cargo.toml @@ -8,3 +8,6 @@ jolt-sdk = { path = "../../jolt-sdk", features = ["host"] } guest = { package = "sha2-guest", path = "./guest" } hex = "0.4.3" + +[features] +icicle = ["jolt-sdk/icicle"] diff --git a/examples/sha3-chain/Cargo.toml b/examples/sha3-chain/Cargo.toml index cf12c733a..f8ddd5e1f 100644 --- a/examples/sha3-chain/Cargo.toml +++ b/examples/sha3-chain/Cargo.toml @@ -6,5 +6,7 @@ edition = "2021" [dependencies] jolt-sdk = { path = "../../jolt-sdk", features = ["host"] } guest = { package = "sha3-chain-guest", path = "./guest" } +hex = "0.4.3" -hex = "0.4.3" \ No newline at end of file +[features] +icicle = ["jolt-sdk/icicle"] diff --git a/examples/sha3-ex/Cargo.toml b/examples/sha3-ex/Cargo.toml index cce963386..39c0ba43c 100644 --- a/examples/sha3-ex/Cargo.toml +++ b/examples/sha3-ex/Cargo.toml @@ -6,5 +6,7 @@ edition = "2021" [dependencies] jolt-sdk = { path = "../../jolt-sdk", features = ["host"] } guest = { package = "sha3-guest", path = "./guest" } - hex = "0.4.3" + +[features] +icicle = ["jolt-sdk/icicle"] diff --git a/examples/stdlib/Cargo.toml b/examples/stdlib/Cargo.toml index 1069f2eff..b606705ba 100644 --- a/examples/stdlib/Cargo.toml +++ b/examples/stdlib/Cargo.toml @@ -6,3 +6,6 @@ edition = "2021" [dependencies] jolt-sdk = { path = "../../jolt-sdk", features = ["host"] } guest = { package = "stdlib-guest", path = "./guest" } + +[features] +icicle = ["jolt-sdk/icicle"] \ No newline at end of file diff --git 
a/jolt-core/Cargo.toml b/jolt-core/Cargo.toml index d53e2d7e8..7f58c5c97 100644 --- a/jolt-core/Cargo.toml +++ b/jolt-core/Cargo.toml @@ -19,6 +19,18 @@ repository = "https://github.com/a16z/jolt" license-file = "LICENSE" keywords = ["SNARK", "cryptography", "proofs"] +[features] +default = [ + "ark-ec/parallel", + "ark-ff/parallel", + "ark-std/parallel", + "ark-ff/asm", + "host", + "rayon", +] +host = ["dep:reqwest", "dep:tokio"] +icicle = ["default", "dep:icicle-runtime", "dep:icicle-core", "dep:icicle-bn254"] + [dependencies] ark-bn254 = "0.4.0" ark-ec = { version = "0.4.2", default-features = false } @@ -65,6 +77,7 @@ bytemuck = "1.19.0" tokio = { version = "1.38.0", optional = true } alloy-primitives = "0.7.6" alloy-sol-types = "0.7.6" +once_cell = "1.19.0" [dev-dependencies] criterion = { version = "0.5.1", features = ["html_reports"] } @@ -93,23 +106,24 @@ harness = false name = "compute_cubic" harness = false +[[bench]] +name = "msm" +harness = false + +[[bench]] +name = "msm_batch" +harness = false + [lib] name = "jolt_core" path = "src/lib.rs" -[features] -default = [ - "ark-ec/parallel", - "ark-ff/parallel", - "ark-std/parallel", - "ark-ff/asm", - "host", - "rayon", -] -host = ["dep:reqwest", "dep:tokio"] - [target.'cfg(not(target_arch = "wasm32"))'.dependencies] +icicle-runtime = { git = "https://github.com/ingonyama-zk/icicle.git", tag = "v3.1.0", optional = true } +icicle-core = { git = "https://github.com/ingonyama-zk/icicle.git", tag = "v3.1.0", optional = true } +icicle-bn254 = { git = "https://github.com/ingonyama-zk/icicle.git", tag = "v3.1.0", optional = true } memory-stats = "1.0.0" +sys-info = "0.9.1" tokio = { version = "1.38.0", optional = true, features = ["rt-multi-thread"] } [target.'cfg(target_arch = "wasm32")'.dependencies] diff --git a/jolt-core/benches/iai.rs b/jolt-core/benches/iai.rs index e5d971d64..0535f9177 100644 --- a/jolt-core/benches/iai.rs +++ b/jolt-core/benches/iai.rs @@ -35,7 +35,7 @@ fn eval_poly_setup(size: usize) -> 
(DensePolynomial, Vec) { #[library_benchmark] #[bench::long(msm_setup::(4096))] fn bench_msm(input: (Vec, Vec)) -> G { - black_box(VariableBaseMSM::msm(&G::normalize_batch(&input.0), &input.1).unwrap()) + black_box(VariableBaseMSM::msm(&G::normalize_batch(&input.0), None, &input.1).unwrap()) } #[library_benchmark] diff --git a/jolt-core/benches/msm.rs b/jolt-core/benches/msm.rs new file mode 100644 index 000000000..75fded2ca --- /dev/null +++ b/jolt-core/benches/msm.rs @@ -0,0 +1,136 @@ +use ark_bn254::{Bn254, Fr, G1Affine, G1Projective}; +use ark_ff::{BigInteger, PrimeField}; +use ark_std::rand::Rng; +use ark_std::UniformRand; +use ark_std::{One, Zero}; +use criterion::Criterion; +use jolt_core::field::JoltField; +#[cfg(feature = "icicle")] +use jolt_core::msm::Icicle; +use jolt_core::msm::{icicle_init, GpuBaseType, MsmType, VariableBaseMSM}; +use jolt_core::poly::commitment::commitment_scheme::CommitmentScheme; +use jolt_core::poly::commitment::zeromorph::Zeromorph; +use jolt_core::utils::transcript::{KeccakTranscript, Transcript}; +use rand_chacha::ChaCha20Rng; +use rand_core::{RngCore, SeedableRng}; +use rayon::prelude::*; + +const SRS_SIZE: usize = 1 << 20; + +// Sets up the benchmark +fn setup_bench( + msm_type: MsmType, +) -> ( + Vec, + Option>>, + Vec, +) +where + F: JoltField, + PCS: CommitmentScheme, + ProofTranscript: Transcript, +{ + let mut rng = ChaCha20Rng::seed_from_u64(SRS_SIZE as u64); + + let scalars = match msm_type { + MsmType::Zero => { + vec![Fr::zero(); SRS_SIZE] + } + MsmType::One => { + vec![Fr::one(); SRS_SIZE] + } + MsmType::Small(_) => (0..SRS_SIZE) + .into_iter() + .map(|_| { + let i = rng.gen_range(0..(1 << 10)); + ::from_u64(i).unwrap() + }) + .collect(), + MsmType::Medium(_) => (0..SRS_SIZE) + .into_iter() + .map(|_| { + let i = rng.next_u64(); + ::from_u64(i).unwrap() + }) + .collect(), + MsmType::Large(_) => (0..SRS_SIZE) + .into_iter() + .map(|_| { + let values: [u64; 4] = [ + rng.next_u64(), + rng.next_u64(), + rng.next_u64(), + 
rng.next_u64(), + ]; + let bigint = ark_ff::BigInteger256::new(values); + ::from_bytes(&bigint.to_bytes_le()) + }) + .collect(), + }; + + let bases: Vec = std::iter::repeat_with(|| G1Affine::rand(&mut rng)) + .take(SRS_SIZE) + .collect(); + #[cfg(feature = "icicle")] + let gpu_bases = Some( + bases + .par_iter() + .map(|base| G1Projective::from_ark_affine(base)) + .collect(), + ); + + let max_num_bits = scalars + .par_iter() + .map(|s| s.clone().into_bigint().num_bits()) + .max() + .unwrap(); + + println!("Using max num bits: {}", max_num_bits); + #[cfg(not(feature = "icicle"))] + let gpu_bases = None; + (bases, gpu_bases, scalars) +} + +fn benchmark_msm(c: &mut Criterion, name: &str, msm_type: MsmType) +where + F: JoltField, + PCS: CommitmentScheme, + ProofTranscript: Transcript, +{ + let (bases, gpu_bases, scalars) = setup_bench::(msm_type); + icicle_init(); + #[cfg(feature = "icicle")] + let id = format!("{} [mode:Icicle]", name); + #[cfg(not(feature = "icicle"))] + let id = format!("{} [mode:JOLT CPU]", name); + c.bench_function(&id, |b| { + b.iter(|| { + let msm = + ::msm(&bases, gpu_bases.as_deref(), &scalars); + let _ = msm.expect("MSM failed"); + }); + }); +} + +fn main() { + let mut criterion = Criterion::default() + .configure_from_args() + .sample_size(20) + .warm_up_time(std::time::Duration::from_secs(5)); + benchmark_msm::, Fr, KeccakTranscript>( + &mut criterion, + "VariableBaseMSM::msm(Large)", + MsmType::Large(0 /* unused */), + ); + benchmark_msm::, Fr, KeccakTranscript>( + &mut criterion, + "VariableBaseMSM::msm(Medium)", + MsmType::Medium(0 /* unused */), + ); + benchmark_msm::, Fr, KeccakTranscript>( + &mut criterion, + "VariableBaseMSM::msm(Small)", + MsmType::Small(0 /* unused */), + ); + criterion.final_summary(); +} diff --git a/jolt-core/benches/msm_batch.rs b/jolt-core/benches/msm_batch.rs new file mode 100644 index 000000000..f6e674134 --- /dev/null +++ b/jolt-core/benches/msm_batch.rs @@ -0,0 +1,176 @@ +use ark_bn254::{Bn254, Fr, 
G1Affine, G1Projective}; +use ark_ff::BigInteger; +use ark_std::rand::seq::SliceRandom; +use ark_std::rand::Rng; +use ark_std::UniformRand; +use ark_std::{One, Zero}; +use criterion::Criterion; +use jolt_core::field::JoltField; +#[cfg(feature = "icicle")] +use jolt_core::msm::Icicle; +use jolt_core::msm::{icicle_init, GpuBaseType, MsmType, VariableBaseMSM}; +use jolt_core::poly::commitment::commitment_scheme::CommitmentScheme; +use jolt_core::poly::commitment::zeromorph::Zeromorph; +use jolt_core::utils::transcript::{KeccakTranscript, Transcript}; +use rand_chacha::ChaCha20Rng; +use rand_core::{RngCore, SeedableRng}; +#[cfg(feature = "icicle")] +use rayon::prelude::*; + +const SRS_SIZE: usize = 1 << 14; + +// Sets up the benchmark +fn setup_bench( + batch_config: BatchConfig, +) -> ( + Vec, + Option>>, + Vec>, +) +where + F: JoltField, + PCS: CommitmentScheme, + ProofTranscript: Transcript, +{ + let mut rng = ChaCha20Rng::seed_from_u64(SRS_SIZE as u64); + // For each type in the batch config create a vector of scalars + let mut scalar_batches: Vec> = vec![]; + + (0..batch_config.small) + .into_iter() + .for_each(|_| scalar_batches.push(get_scalars(MsmType::Small(0 /* unused */), SRS_SIZE))); + (0..batch_config.medium) + .into_iter() + .for_each(|_| scalar_batches.push(get_scalars(MsmType::Medium(0 /* unused */), SRS_SIZE))); + (0..batch_config.large) + .into_iter() + .for_each(|_| scalar_batches.push(get_scalars(MsmType::Large(0 /* unused */), SRS_SIZE))); + scalar_batches.shuffle(&mut rng); + + let bases: Vec = std::iter::repeat_with(|| G1Affine::rand(&mut rng)) + .take(SRS_SIZE) + .collect(); + #[cfg(feature = "icicle")] + let gpu_bases = Some( + bases + .par_iter() + .map(|base| G1Projective::from_ark_affine(base)) + .collect(), + ); + #[cfg(not(feature = "icicle"))] + let gpu_bases = None; + (bases, gpu_bases, scalar_batches) +} + +fn get_scalars(msm_type: MsmType, size: usize) -> Vec { + let mut rng = ChaCha20Rng::seed_from_u64(size as u64); + match msm_type { 
+ MsmType::Zero => { + vec![Fr::zero(); size] + } + MsmType::One => { + vec![Fr::one(); size] + } + MsmType::Small(_) => (0..size) + .into_iter() + .map(|_| { + let i = rng.gen_range(0..(1 << 10)); + ::from_u64(i).unwrap() + }) + .collect(), + MsmType::Medium(_) => (0..size) + .into_iter() + .map(|_| { + let i = rng.next_u64(); + ::from_u64(i).unwrap() + }) + .collect(), + MsmType::Large(_) => (0..size) + .into_iter() + .map(|_| { + let values: [u64; 4] = [ + rng.next_u64(), + rng.next_u64(), + rng.next_u64(), + rng.next_u64(), + ]; + let bigint = ark_ff::BigInteger256::new(values); + ::from_bytes(&bigint.to_bytes_le()) + }) + .collect(), + } +} + +fn benchmark_msm_batch( + c: &mut Criterion, + name: &str, + batch_config: BatchConfig, +) where + F: JoltField, + PCS: CommitmentScheme, + ProofTranscript: Transcript, +{ + let (bases, gpu_bases, scalar_batches) = setup_bench::(batch_config); + let scalar_batches_ref: Vec<_> = scalar_batches + .iter() + .map(|inner_vec| inner_vec.as_slice()) + .collect(); + icicle_init(); + println!("Running benchmark for {:?}", batch_config); + #[cfg(feature = "icicle")] + let id = format!("{} [mode:Icicle]", name); + #[cfg(not(feature = "icicle"))] + let id = format!("{} [mode:JOLT CPU]", name); + c.bench_function(&id, |b| { + b.iter(|| { + let msm = ::batch_msm( + &bases, + gpu_bases.as_deref(), + &scalar_batches_ref, + ); + assert_eq!(msm.len(), scalar_batches.len()); + }); + }); +} + +#[derive(Debug, Clone, Copy)] +struct BatchConfig { + small: usize, + medium: usize, + large: usize, +} + +fn main() { + let mut criterion = Criterion::default() + .configure_from_args() + .sample_size(10) + .warm_up_time(std::time::Duration::from_secs(10)); + benchmark_msm_batch::, Fr, KeccakTranscript>( + &mut criterion, + "VariableBaseMSM::msm_batch(bias: Large)", + BatchConfig { + small: 100, + medium: 100, + large: 300, + }, + ); + benchmark_msm_batch::, Fr, KeccakTranscript>( + &mut criterion, + "VariableBaseMSM::msm_batch(bias: Medium)", + 
BatchConfig { + small: 100, + medium: 300, + large: 100, + }, + ); + benchmark_msm_batch::, Fr, KeccakTranscript>( + &mut criterion, + "VariableBaseMSM::msm_batch(bias: Small)", + BatchConfig { + small: 300, + medium: 100, + large: 100, + }, + ); + criterion.final_summary(); +} diff --git a/jolt-core/src/jolt/vm/bytecode.rs b/jolt-core/src/jolt/vm/bytecode.rs index 156950fa1..f96debd4d 100644 --- a/jolt-core/src/jolt/vm/bytecode.rs +++ b/jolt-core/src/jolt/vm/bytecode.rs @@ -22,6 +22,7 @@ use rayon::prelude::*; use super::{JoltPolynomials, JoltTraceStep}; use crate::utils::transcript::Transcript; + use crate::{ lasso::memory_checking::{MemoryCheckingProof, MemoryCheckingProver, MemoryCheckingVerifier}, poly::{dense_mlpoly::DensePolynomial, identity_poly::IdentityPolynomial}, diff --git a/jolt-core/src/jolt/vm/mod.rs b/jolt-core/src/jolt/vm/mod.rs index 46177850e..e606439ad 100644 --- a/jolt-core/src/jolt/vm/mod.rs +++ b/jolt-core/src/jolt/vm/mod.rs @@ -14,6 +14,7 @@ use std::marker::PhantomData; use strum::EnumCount; use timestamp_range_check::TimestampRangeCheckStuff; +use crate::join_conditional; use crate::jolt::{ instruction::{ div::DIVInstruction, divu::DIVUInstruction, mulh::MULHInstruction, @@ -26,6 +27,7 @@ use crate::jolt::{ use crate::lasso::memory_checking::{ Initializable, MemoryCheckingProver, MemoryCheckingVerifier, StructuredPolynomialData, }; +use crate::msm::icicle; use crate::poly::commitment::commitment_scheme::{BatchType, CommitmentScheme}; use crate::poly::dense_mlpoly::DensePolynomial; use crate::r1cs::inputs::{ConstraintInput, R1CSPolynomials, R1CSProof, R1CSStuff}; @@ -238,31 +240,57 @@ impl JoltPolynomials { PCS: CommitmentScheme, ProofTranscript: Transcript, { + let span = tracing::span!(tracing::Level::INFO, "commit::initialize"); + let _guard = span.enter(); let mut commitments = JoltCommitments::::initialize(preprocessing); + drop(_guard); + drop(span); let trace_polys = self.read_write_values(); - let trace_comitments = + let span = 
tracing::span!(tracing::Level::INFO, "commit::trace_commitments"); + let _guard = span.enter(); + let trace_commitments = PCS::batch_commit_polys_ref(&trace_polys, &preprocessing.generators, BatchType::Big); + drop(_guard); + drop(span); + commitments .read_write_values_mut() .into_iter() - .zip(trace_comitments.into_iter()) + .zip(trace_commitments.into_iter()) .for_each(|(dest, src)| *dest = src); + let span = tracing::span!(tracing::Level::INFO, "commit::t_final"); + let _guard = span.enter(); commitments.bytecode.t_final = PCS::commit(&self.bytecode.t_final, &preprocessing.generators); + drop(_guard); + drop(span); + + let span = tracing::span!(tracing::Level::INFO, "commit::read_write_memory"); + let _guard = span.enter(); ( commitments.read_write_memory.v_final, commitments.read_write_memory.t_final, - ) = rayon::join( + ) = join_conditional!( || PCS::commit(&self.read_write_memory.v_final, &preprocessing.generators), - || PCS::commit(&self.read_write_memory.t_final, &preprocessing.generators), + || PCS::commit(&self.read_write_memory.t_final, &preprocessing.generators) ); + drop(_guard); + drop(span); + + let span = tracing::span!( + tracing::Level::INFO, + "commit::commit_instructions_final_cts" + ); + let _guard = span.enter(); commitments.instruction_lookups.final_cts = PCS::batch_commit_polys( &self.instruction_lookups.final_cts, &preprocessing.generators, BatchType::Big, ); + drop(_guard); + drop(span); commitments } @@ -287,6 +315,10 @@ where max_memory_address: usize, max_trace_length: usize, ) -> JoltPreprocessing { + //TODO(sagar): This should be moved to a more appropriate place - icicle makes a network request + // which impacts prover time. 
+ icicle::icicle_init(); + let bytecode_commitment_shapes = BytecodeProof::::commit_shapes( max_bytecode_size, max_trace_length, diff --git a/jolt-core/src/jolt/vm/timestamp_range_check.rs b/jolt-core/src/jolt/vm/timestamp_range_check.rs index 933e742f7..789a4a573 100644 --- a/jolt-core/src/jolt/vm/timestamp_range_check.rs +++ b/jolt-core/src/jolt/vm/timestamp_range_check.rs @@ -1,7 +1,9 @@ +use super::{JoltCommitments, JoltPolynomials, JoltStuff}; use crate::field::{JoltField, OptimizedMul}; use crate::lasso::memory_checking::{ ExogenousOpenings, Initializable, StructuredPolynomialData, VerifierComputedOpening, }; +use crate::poly::commitment::commitment_scheme::{BatchType, CommitShape, CommitmentScheme}; use crate::poly::opening_proof::{ProverOpeningAccumulator, VerifierOpeningAccumulator}; use crate::subprotocols::grand_product::{ BatchedDenseGrandProduct, BatchedGrandProduct, BatchedGrandProductLayer, @@ -9,14 +11,6 @@ use crate::subprotocols::grand_product::{ }; use crate::utils::math::Math; use crate::utils::thread::drop_in_background_thread; -use ark_serialize::{CanonicalDeserialize, CanonicalSerialize}; -use common::constants::MEMORY_OPS_PER_INSTRUCTION; -use itertools::interleave; -use rayon::prelude::*; -#[cfg(test)] -use std::collections::HashSet; - -use crate::poly::commitment::commitment_scheme::{BatchType, CommitShape, CommitmentScheme}; use crate::utils::transcript::Transcript; use crate::{ lasso::memory_checking::{ @@ -28,8 +22,12 @@ use crate::{ }, utils::errors::ProofVerifyError, }; - -use super::{JoltCommitments, JoltPolynomials, JoltStuff}; +use ark_serialize::{CanonicalDeserialize, CanonicalSerialize}; +use common::constants::MEMORY_OPS_PER_INSTRUCTION; +use itertools::interleave; +use rayon::prelude::*; +#[cfg(test)] +use std::collections::HashSet; #[derive(Default, CanonicalSerialize, CanonicalDeserialize)] pub struct TimestampRangeCheckStuff { diff --git a/jolt-core/src/msm/icicle/adapter.rs b/jolt-core/src/msm/icicle/adapter.rs new file 
mode 100644 index 000000000..0d40890ca --- /dev/null +++ b/jolt-core/src/msm/icicle/adapter.rs @@ -0,0 +1,372 @@ +use crate::msm::{GpuBaseType, MsmType, VariableBaseMSM}; +use ark_bn254::G1Projective; +use ark_ec::{CurveGroup, ScalarMul}; +use ark_ff::{BigInteger, Field, PrimeField}; +use icicle_bn254::curve::CurveCfg as IcicleBn254; +use icicle_core::curve::{Affine, Curve, Projective}; +use icicle_core::{ + msm::{msm, MSMConfig, MSM}, + traits::FieldImpl, +}; +use icicle_runtime::memory::HostOrDeviceSlice; +use icicle_runtime::stream::IcicleStreamHandle; +use icicle_runtime::{ + memory::{DeviceVec, HostSlice}, + stream::IcicleStream, +}; +use rayon::prelude::*; +use std::os::raw::c_void; + +impl Icicle for G1Projective { + type C = IcicleBn254; + + fn to_ark_projective(point: &Projective) -> Self { + let proj_x = + ::BaseField::from_random_bytes(&point.x.to_bytes_le()).unwrap(); + let proj_y = + ::BaseField::from_random_bytes(&point.y.to_bytes_le()).unwrap(); + let proj_z = + ::BaseField::from_random_bytes(&point.z.to_bytes_le()).unwrap(); + + let proj_x = proj_x * proj_z; + let proj_y = proj_y * proj_z * proj_z; + Self::new_unchecked(proj_x, proj_y, proj_z) + } + + fn from_ark_affine(point: &Self::MulBase) -> Affine { + let x_bytes: Vec = point + .x + .to_base_prime_field_elements() + .flat_map(|x| x.into_bigint().to_bytes_le()) + .collect(); + let y_bytes: Vec = point + .y + .to_base_prime_field_elements() + .flat_map(|x| x.into_bigint().to_bytes_le()) + .collect(); + let x = ::BaseField::from_bytes_le(&x_bytes); + let y = ::BaseField::from_bytes_le(&y_bytes); + Affine:: { x, y } + } +} + +pub trait Icicle: ScalarMul { + type C: Curve + MSM; + + // Note: To prevent excessive trait the arkworks conversion functions within icicle are reimplemented + fn to_ark_projective(point: &Projective) -> Self; + + fn from_ark_affine(point: &Self::MulBase) -> Affine; +} + +#[tracing::instrument(skip_all, name = "icicle_msm")] +pub fn icicle_msm( + bases: &[GpuBaseType], + 
scalars: &[V::ScalarField], + bit_size: usize, +) -> V { + assert!(scalars.len() <= bases.len()); + + let mut bases_slice = DeviceVec::>::device_malloc(bases.len()).unwrap(); + + let span = tracing::span!(tracing::Level::INFO, "convert_scalars"); + let _guard = span.enter(); + + let mut scalars_slice = + DeviceVec::<<::C as Curve>::ScalarField>::device_malloc(scalars.len()) + .unwrap(); + let scalars_mont = + unsafe { &*(scalars as *const _ as *const [<::C as Curve>::ScalarField]) }; + + drop(_guard); + drop(span); + + let mut stream = IcicleStream::create().unwrap(); + + let span = tracing::span!(tracing::Level::INFO, "copy_to_gpu"); + let _guard = span.enter(); + bases_slice + .copy_from_host_async(HostSlice::from_slice(bases), &stream) + .unwrap(); + scalars_slice + .copy_from_host_async(HostSlice::from_slice(scalars_mont), &stream) + .unwrap(); + drop(_guard); + drop(span); + + let mut msm_result = DeviceVec::>::device_malloc(1).unwrap(); + let mut cfg = MSMConfig::default(); + cfg.stream_handle = IcicleStreamHandle::from(&stream); + cfg.is_async = false; + cfg.are_scalars_montgomery_form = true; + cfg.bitsize = bit_size as i32; + + let span = tracing::span!(tracing::Level::INFO, "gpu_msm"); + let _guard = span.enter(); + + msm( + &scalars_slice, + &bases_slice[..scalars.len()], + &cfg, + &mut msm_result, + ) + .unwrap(); + + drop(_guard); + drop(span); + + let mut msm_host_result = [Projective::::zero(); 1]; + + let span = tracing::span!(tracing::Level::INFO, "copy_msm_result"); + let _guard = span.enter(); + msm_result + .copy_to_host(HostSlice::from_mut_slice(&mut msm_host_result)) + .unwrap(); + drop(_guard); + drop(span); + + stream.synchronize().unwrap(); + stream.destroy().unwrap(); + V::to_ark_projective(&msm_host_result[0]) +} + +/// Batch process msms - assumes batches are equal in size +/// Variable Batch sizes is not currently supported by icicle +#[tracing::instrument(skip_all)] +pub fn icicle_batch_msm( + bases: &[GpuBaseType], + scalar_batches: 
&[&[V::ScalarField]], + batch_type: MsmType, +) -> Vec { + let bases_len = bases.len(); + let batch_size = scalar_batches.len(); + assert!(scalar_batches.par_iter().all(|s| s.len() == bases_len)); + + let mut stream = IcicleStream::create().unwrap(); + icicle_runtime::warmup(&stream).unwrap(); + + let mut bases_slice = + DeviceVec::>::device_malloc_async(bases_len, &stream).unwrap(); + let span = tracing::span!(tracing::Level::INFO, "copy_bases_to_gpu"); + let _guard = span.enter(); + bases_slice + .copy_from_host_async(HostSlice::from_slice(bases), &stream) + .unwrap(); + drop(_guard); + drop(span); + + let mut msm_result = + DeviceVec::>::device_malloc_async(batch_size, &stream).unwrap(); + let mut msm_host_results = vec![Projective::::zero(); batch_size]; + let total_len: usize = scalar_batches.par_iter().map(|batch| batch.len()).sum(); + let mut scalars_slice = + DeviceVec::<<::C as Curve>::ScalarField>::device_malloc_async( + total_len, &stream, + ) + .unwrap(); + + let span = tracing::span!(tracing::Level::INFO, "copy_scalars_to_gpu"); + let _guard = span.enter(); + + let mut offset = 0; + for batch in scalar_batches { + let scalars_mont = unsafe { + &*(&batch[..] 
as *const _ as *const [<::C as Curve>::ScalarField]) + }; + copy_offset_from_host_async( + &mut scalars_slice, + HostSlice::from_slice(scalars_mont), + offset, + &stream, + ) + .unwrap(); + offset += batch.len(); + } + + drop(_guard); + drop(span); + + //TODO(sagar) why doesn't the GPU always go to 100% clock speeds + let mut cfg = MSMConfig::default(); + cfg.stream_handle = IcicleStreamHandle::from(&stream); + cfg.is_async = true; + cfg.are_scalars_montgomery_form = true; + cfg.batch_size = batch_size as i32; + cfg.bitsize = batch_type.num_bits() as i32; + cfg.ext + .set_int(icicle_core::msm::CUDA_MSM_LARGE_BUCKET_FACTOR, 5); + + let span = tracing::span!(tracing::Level::INFO, "msm_batch_gpu"); + let _guard = span.enter(); + msm(&scalars_slice, &bases_slice, &cfg, &mut msm_result).unwrap(); + drop(_guard); + drop(span); + + let span = tracing::span!(tracing::Level::INFO, "synchronize"); + let _guard = span.enter(); + stream.synchronize().unwrap(); + drop(_guard); + drop(span); + + let span = tracing::span!(tracing::Level::INFO, "copy_msm_result"); + let _guard = span.enter(); + msm_result + .copy_to_host(HostSlice::from_mut_slice(&mut msm_host_results)) + .unwrap(); + drop(_guard); + drop(span); + + let span = tracing::span!(tracing::Level::INFO, "converting_results"); + let _guard = span.enter(); + stream.destroy().unwrap(); + msm_host_results + .into_par_iter() + .map(|res| V::to_ark_projective(&res)) + .collect() +} + +pub fn copy_offset_from_host_async( + dest: &mut DeviceVec, + src: &HostSlice, + offset: usize, + stream: &IcicleStream, +) -> Result<(), icicle_runtime::errors::eIcicleError> { + if dest.is_empty() { + return Ok(()); + } + + if !dest.is_on_active_device() { + panic!("not allocated on an active device"); + } + + if (src.len() + offset) > dest.len() { + panic!( + "offset {} + HostSlice.len() {} exceeds the size of the destination DeviceVec {}", + offset, + src.len(), + dest.len() + ); + } + + let size = size_of::() * src.len(); + unsafe { + 
icicle_runtime::icicle_copy_to_device_async( + dest.as_mut_ptr().add(offset) as *mut c_void, + src.as_ptr() as *const c_void, + size, + stream.handle, + ) + .wrap() + } +} + +pub fn icicle_from_ark(ark: &T) -> I +where + T: PrimeField, + I: FieldImpl, +{ + let mut ark_bytes = + Vec::with_capacity(T::BigInt::NUM_LIMBS * 8 * T::extension_degree() as usize); + for base_elem in ark.to_base_prime_field_elements() { + ark_bytes.extend_from_slice(&base_elem.into_bigint().to_bytes_le()); + } + I::from_bytes_le(&ark_bytes) +} + +pub fn icicle_to_ark(icicle: &I) -> T +where + T: PrimeField, + I: FieldImpl, +{ + T::from_random_bytes(&icicle.to_bytes_le()).unwrap() +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::msm::total_memory_bits; + use ark_bn254::{Fr, G1Affine, G1Projective}; + use ark_ec::VariableBaseMSM as ark_VariableBaseMSM; + use ark_std::UniformRand; + use icicle_bn254::curve::ScalarField as GPUScalar; + use rand_core::SeedableRng; + + #[test] + fn test_icicle_msm_consistency() { + let pow = 10; + let n = 1 << pow; + let mut rng = rand_chacha::ChaCha20Rng::seed_from_u64(n as u64); + for _ in 0..10 { + let scalars: Vec = std::iter::repeat_with(|| Fr::rand(&mut rng)) + .take(n) + .collect(); + let bases: Vec = std::iter::repeat_with(|| G1Affine::rand(&mut rng)) + .take(n) + .collect(); + + let gpu_bases = bases + .par_iter() + .map(|base| ::from_ark_affine(base)) + .collect::>(); + let icicle_res = icicle_msm::(&gpu_bases, &scalars, 256); + let arkworks_res: G1Projective = ark_VariableBaseMSM::msm(&bases, &scalars).unwrap(); + let no_gpu_res: G1Projective = + VariableBaseMSM::inner_msm(&bases, None, &scalars, false, None).unwrap(); + + assert_eq!(icicle_res, arkworks_res); + assert_eq!(icicle_res, no_gpu_res); + } + } + + #[test] + fn test_icicle_batch_msm_consistency() { + let pow = 10; + let n = 1 << pow; + let mut rng = rand_chacha::ChaCha20Rng::seed_from_u64(n as u64); + for _ in 0..10 { + let scalars: Vec = std::iter::repeat_with(|| Fr::rand(&mut 
rng)) + .take(n) + .collect(); + let scalar_batches = [scalars.as_slice(); 20]; + + let bases: Vec = std::iter::repeat_with(|| G1Affine::rand(&mut rng)) + .take(n) + .collect(); + + let gpu_bases = bases + .par_iter() + .map(|base| ::from_ark_affine(base)) + .collect::>(); + let icicle_res = + icicle_batch_msm::(&gpu_bases, &scalar_batches, MsmType::Large(256)); + let arkworks_res: Vec = (0..20) + .into_iter() + .map(|_| ark_VariableBaseMSM::msm(&bases, &scalars).unwrap()) + .collect(); + let no_gpu_res: Vec = (0..20) + .into_iter() + .map(|_| VariableBaseMSM::inner_msm(&bases, None, &scalars, false, None).unwrap()) + .collect(); + + assert_eq!(icicle_res, arkworks_res); + assert_eq!(icicle_res, no_gpu_res); + } + } + + #[test] + fn test_casting() { + let ark = Fr::from(100); + let gpu: GPUScalar = icicle_from_ark(&ark); + + let ark_bytes: [u8; 32] = unsafe { std::mem::transmute(ark) }; + let gpu_bytes: [u8; 32] = + unsafe { std::mem::transmute(icicle_to_ark::(&gpu)) }; + assert_eq!(ark_bytes, gpu_bytes); + } + + #[test] + fn test_total_memory() { + let total = total_memory_bits(); + assert!(total > 0); + } +} diff --git a/jolt-core/src/msm/icicle/mod.rs b/jolt-core/src/msm/icicle/mod.rs new file mode 100644 index 000000000..b77daee3d --- /dev/null +++ b/jolt-core/src/msm/icicle/mod.rs @@ -0,0 +1,104 @@ +#[cfg(not(feature = "icicle"))] +use ark_bn254::G1Projective; +use ark_ec::{CurveGroup, ScalarMul}; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::Once; + +#[cfg(feature = "icicle")] +pub(crate) mod adapter; +#[cfg(feature = "icicle")] +pub use adapter::*; + +static ICICLE_INIT: Once = Once::new(); +static ICICLE_READY: AtomicBool = AtomicBool::new(false); + +#[cfg(feature = "icicle")] +pub trait CurveGroupConfig: CurveGroup + Icicle {} +#[cfg(not(feature = "icicle"))] +pub trait CurveGroupConfig: CurveGroup {} + +#[cfg(feature = "icicle")] +pub trait ScalarMulConfig: ScalarMul + Icicle {} +#[cfg(not(feature = "icicle"))] +pub trait 
ScalarMulConfig: ScalarMul {} +#[cfg(not(feature = "icicle"))] +pub trait Icicle {} +#[cfg(not(feature = "icicle"))] +impl Icicle for G1Projective {} + +/// Initializes the icicle backend and sets the CUDA device as active and returns true if successful. +/// +/// Safe to call multiple times on the main thread; will only initialize the backend once. +/// +/// Todo(sagar) this takes almost 1 second - likely due to license check +/// Todo(sagar) Remove set_device from here. +#[tracing::instrument()] +pub fn icicle_init() -> bool { + let mut initialized = false; + + ICICLE_INIT.call_once(|| { + #[cfg(feature = "icicle")] + if icicle_runtime::load_backend_from_env_or_default().is_ok() { + if let Ok(devices) = icicle_runtime::get_registered_devices() { + println!("Initializing icicle: available devices {:?}", devices); + + // Attempt to set the CUDA device as active + let device = icicle_runtime::Device::new("CUDA", 0); + if icicle_runtime::set_device(&device).is_ok() { + println!("icicle using device: {:?}", device); + initialized = true; + } else { + println!("Failed to set CUDA device; falling back to CPU."); + } + } + } + + #[cfg(not(feature = "icicle"))] + { + initialized = false; + } + + #[cfg(feature = "icicle")] + if !initialized { + println!("Failed to initialize icicle backend; using JOLT CPU implementations."); + } + + ICICLE_READY.store(initialized, Ordering::Relaxed); + }); + + ICICLE_READY.load(Ordering::Relaxed) +} + +/// Returns the total memory available on the system in bits. +/// +/// If icicle is enabled, it will return the total memory available on the GPU in bits. 
+#[allow(dead_code)] +pub fn total_memory_bits() -> usize { + const DEFAULT_MEM_GB: usize = 30; + const BITS_PER_BYTE: usize = 8; + const BYTES_PER_KB: usize = 1024; + const BYTES_PER_GB: usize = 1024 * 1024 * 1024; + + #[cfg(feature = "icicle")] + if let Ok((total_bytes, _)) = icicle_runtime::get_available_memory() { + // If icicle is enabled and memory is available, return the total memory in bits. + return total_bytes.checked_mul(BITS_PER_BYTE).unwrap_or(usize::MAX); + } + + // Fallback to system memory if icicle is unavailable or not enabled. + #[cfg(not(target_arch = "wasm32"))] + if let Ok(mem_info) = sys_info::mem_info() { + return (mem_info.total as usize * BYTES_PER_KB) + .checked_mul(BITS_PER_BYTE) + .unwrap_or(usize::MAX); + } + + // Fallback to "default" memory if system memory retrieval fails. + DEFAULT_MEM_GB + .checked_mul( + BYTES_PER_GB + .checked_mul(BITS_PER_BYTE) + .unwrap_or(usize::MAX), + ) + .unwrap_or(usize::MAX) +} diff --git a/jolt-core/src/msm/mod.rs b/jolt-core/src/msm/mod.rs index 0577bbb12..365935de5 100644 --- a/jolt-core/src/msm/mod.rs +++ b/jolt-core/src/msm/mod.rs @@ -1,56 +1,322 @@ +use ark_ec::pairing::Pairing; use ark_ec::{CurveGroup, ScalarMul}; use ark_ff::{prelude::*, PrimeField}; use ark_std::cmp::Ordering; use ark_std::vec::Vec; +#[cfg(feature = "icicle")] +use icicle_core::curve::Affine; use rayon::prelude::*; -impl VariableBaseMSM for G {} +pub(crate) mod icicle; +use crate::utils::errors::ProofVerifyError; +pub use icicle::*; + +impl VariableBaseMSM for G {} + +#[cfg(feature = "icicle")] +pub type GpuBaseType = Affine; +#[cfg(not(feature = "icicle"))] +pub type GpuBaseType = G::MulBase; + +use itertools::Either; + +#[derive(Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd)] +pub enum MsmType { + Zero, + One, + Small(usize), + Medium(usize), + Large(usize), +} + +impl MsmType { + fn from_u32(i: u32) -> MsmType { + match i { + 0 => MsmType::Zero, + 1 => MsmType::One, + 2..=10 => MsmType::Small(i as usize), + 11..=64 => 
MsmType::Medium(i as usize), + _ => MsmType::Large(i as usize), + } + } + + #[tracing::instrument(skip_all)] + fn from_scalars(scalars: &[S::ScalarField]) -> MsmType { + let max_num_bits = scalars + .par_iter() + .map(|s| s.into_bigint().num_bits()) + .max() + .unwrap(); + MsmType::from_u32(max_num_bits) + } + + #[allow(dead_code)] + fn num_bits(&self) -> usize { + match self { + MsmType::Zero => 0, + MsmType::One => 1, + MsmType::Small(i) => *i, + MsmType::Medium(i) => *i, + MsmType::Large(i) => *i, + } + } + + fn prefers_icicle(&self) -> bool { + match self { + MsmType::Zero | MsmType::One | MsmType::Small(_) => false, + #[cfg(feature = "icicle")] + MsmType::Medium(_) | MsmType::Large(_) => true, + #[cfg(not(feature = "icicle"))] + _ => false, + } + } +} + +type TrackedScalar<'a, P: Pairing> = (usize, &'a [P::ScalarField]); +pub type ScalarGroups<'a, P: Pairing> = (MsmType, Vec>); /// Copy of ark_ec::VariableBaseMSM with minor modifications to speed up /// known small element sized MSMs. 
-pub trait VariableBaseMSM: ScalarMul { - fn msm(bases: &[Self::MulBase], scalars: &[Self::ScalarField]) -> Result { +pub trait VariableBaseMSM: ScalarMul + Icicle { + #[tracing::instrument(skip_all)] + fn msm( + bases: &[Self::MulBase], + gpu_bases: Option<&[GpuBaseType]>, + scalars: &[Self::ScalarField], + ) -> Result { + Self::inner_msm(bases, gpu_bases, scalars, true, None) + } + + #[tracing::instrument(skip_all)] + fn msm_with_type( + bases: &[Self::MulBase], + gpu_bases: Option<&[GpuBaseType]>, + scalars: &[Self::ScalarField], + msm_type: MsmType, + ) -> Result { + Self::inner_msm(bases, gpu_bases, scalars, true, Some(msm_type)) + } + + #[tracing::instrument(skip_all)] + fn inner_msm( + bases: &[Self::MulBase], + gpu_bases: Option<&[GpuBaseType]>, + scalars: &[Self::ScalarField], + allow_icicle: bool, + msm_type: Option, + ) -> Result { + #[cfg(not(feature = "icicle"))] + assert!(gpu_bases.is_none()); + assert_eq!(bases.len(), gpu_bases.map_or(bases.len(), |b| b.len())); + (bases.len() == scalars.len()) .then(|| { - let max_num_bits = scalars - .par_iter() - .map(|s| s.into_bigint().num_bits()) - .max() - .unwrap(); - - match max_num_bits { - 0 => Self::zero(), - 1 => { + let msm_type = msm_type.unwrap_or_else(|| MsmType::from_scalars::(scalars)); + + match msm_type { + MsmType::Zero => Self::zero(), + MsmType::One => { let scalars_u64 = &map_field_elements_to_u64::(scalars); msm_binary(bases, scalars_u64) } - 2..=10 => { + MsmType::Small(max_num_bits) => { let scalars_u64 = &map_field_elements_to_u64::(scalars); - msm_small(bases, scalars_u64, max_num_bits as usize) + msm_small(bases, scalars_u64, max_num_bits) } - 11..=64 => { + MsmType::Medium(max_num_bits) => { + // TODO(sagar) caching this as "use_icicle = use_icicle" seems to cause a massive slowdown + if use_icicle(Some(msm_type.prefers_icicle() && allow_icicle)) { + #[cfg(feature = "icicle")] + { + let mut backup = vec![]; + let gpu_bases = gpu_bases.unwrap_or_else(|| { + backup = 
Self::get_gpu_bases(bases); + &backup + }); + return icicle_msm::(gpu_bases, scalars, max_num_bits); + } + #[cfg(not(feature = "icicle"))] + { + unreachable!( + "icicle_init must not return true without the icicle feature" + ); + } + } + let scalars_u64 = &map_field_elements_to_u64::(scalars); if Self::NEGATION_IS_CHEAP { - msm_u64_wnaf(bases, scalars_u64, max_num_bits as usize) + msm_u64_wnaf(bases, scalars_u64, max_num_bits) } else { - msm_u64(bases, scalars_u64, max_num_bits as usize) + msm_u64(bases, scalars_u64, max_num_bits) } } - _ => { + MsmType::Large(max_num_bits) => { + if use_icicle(Some(msm_type.prefers_icicle() && allow_icicle)) { + #[cfg(feature = "icicle")] + { + let mut backup = vec![]; + let gpu_bases = gpu_bases.unwrap_or_else(|| { + backup = Self::get_gpu_bases(bases); + &backup + }); + return icicle_msm::(gpu_bases, scalars, max_num_bits); + } + #[cfg(not(feature = "icicle"))] + { + unreachable!( + "icicle_init must not return true without the icicle feature" + ); + } + } + let scalars = scalars .par_iter() .map(|s| s.into_bigint()) .collect::>(); if Self::NEGATION_IS_CHEAP { - msm_bigint_wnaf(bases, &scalars, max_num_bits as usize) + msm_bigint_wnaf(bases, &scalars, max_num_bits) } else { - msm_bigint(bases, &scalars, max_num_bits as usize) + msm_bigint(bases, &scalars, max_num_bits) } } } }) - .ok_or_else(|| bases.len().min(scalars.len())) + .ok_or(ProofVerifyError::KeyLengthError(bases.len(), scalars.len())) + } + + #[tracing::instrument(skip_all)] + fn batch_msm( + bases: &[Self::MulBase], + gpu_bases: Option<&[GpuBaseType]>, + scalar_batches: &[&[Self::ScalarField]], + ) -> Vec { + Self::batch_msm_inner(bases, gpu_bases, scalar_batches, true, false) + } + + #[tracing::instrument(skip_all)] + fn variable_batch_msm( + bases: &[Self::MulBase], + gpu_bases: Option<&[GpuBaseType]>, + scalar_batches: &[&[Self::ScalarField]], + ) -> Vec { + Self::batch_msm_inner(bases, gpu_bases, scalar_batches, true, true) } + + #[tracing::instrument(skip_all)] 
+ fn batch_msm_inner( + bases: &[Self::MulBase], + gpu_bases: Option<&[GpuBaseType]>, + scalar_batches: &[&[Self::ScalarField]], + allow_icicle: bool, + _variable_batches: bool, + ) -> Vec { + assert!(scalar_batches.par_iter().all(|s| s.len() == bases.len())); + #[cfg(not(feature = "icicle"))] + assert!(gpu_bases.is_none()); + assert_eq!(bases.len(), gpu_bases.map_or(bases.len(), |b| b.len())); + + if !use_icicle(Some(allow_icicle)) { + let span = tracing::span!(tracing::Level::INFO, "batch_msm_cpu_only"); + let _guard = span.enter(); + return scalar_batches + .into_par_iter() + .map(|scalars| Self::inner_msm(bases, None, scalars, false, None).unwrap()) + .collect(); + } + + // Split scalar batches into CPU and GPU workloads + let span = tracing::span!(tracing::Level::INFO, "group_scalar_indices_parallel"); + let _guard = span.enter(); + let (cpu_slices, gpu_slices): (Vec<_>, Vec<_>) = scalar_batches + .par_iter() + .enumerate() + .partition_map(|(i, scalar_slice)| { + let msm_type = MsmType::from_scalars::(scalar_slice); + if use_icicle(Some(allow_icicle && msm_type.prefers_icicle())) { + Either::Right((i, msm_type, *scalar_slice)) + } else { + Either::Left((i, msm_type, *scalar_slice)) + } + }); + drop(_guard); + drop(span); + let mut results = vec![Self::zero(); scalar_batches.len()]; + + // Handle CPU computations in parallel + let span = tracing::span!(tracing::Level::INFO, "batch_msm_cpu"); + let _guard = span.enter(); + let cpu_results: Vec<(usize, Self)> = cpu_slices + .into_par_iter() + .map(|(i, msm_type, scalars)| { + ( + i, + Self::msm_with_type(bases, None, scalars, msm_type).unwrap(), + ) + }) + .collect(); + drop(_guard); + drop(span); + + // Store CPU results + for (i, result) in cpu_results { + results[i] = result; + } + + // Handle GPU computations if available + if !gpu_slices.is_empty() && use_icicle(Some(allow_icicle)) { + #[cfg(feature = "icicle")] + { + let span = tracing::span!(tracing::Level::INFO, "batch_msms_gpu"); + let _guard = 
span.enter();
+                let mut backup = vec![];
+                let gpu_bases = gpu_bases.unwrap_or_else(|| {
+                    backup = Self::get_gpu_bases(bases);
+                    &backup
+                });
+
+                // includes putting the scalars and bases on device
+                let slice_bit_size = 256 * gpu_slices[0].2.len() * 2;
+                let slices_at_a_time = (total_memory_bits() / slice_bit_size.max(1)).max(1); // avoid div-by-zero / chunks(0) panic
+
+                // Process GPU batches with memory constraints
+                for work_chunk in gpu_slices.chunks(slices_at_a_time) {
+                    let (scalar_types, chunk_scalars): (Vec<_>, Vec<&[Self::ScalarField]>) =
+                        work_chunk
+                            .par_iter()
+                            .map(|(_, msm_type, scalars)| (*msm_type, *scalars))
+                            .unzip();
+
+                    let max_scalar_type = scalar_types.par_iter().max().unwrap();
+                    let batch_results =
+                        icicle_batch_msm::<Self>(gpu_bases, &chunk_scalars, *max_scalar_type);
+
+                    // Store GPU results using original indices
+                    for ((original_idx, _, _), result) in work_chunk.iter().zip(batch_results) {
+                        results[*original_idx] = result;
+                    }
+                }
+            }
+            #[cfg(not(feature = "icicle"))]
+            {
+                unreachable!("icicle_init must not return true without the icicle feature");
+            }
+        }
+        results
+    }
+
+    #[cfg(feature = "icicle")]
+    #[tracing::instrument(skip_all)]
+    fn get_gpu_bases(bases: &[Self::MulBase]) -> Vec<GpuBaseType<Self>> {
+        bases
+            .par_iter()
+            .map(|base| <Self as Icicle>::from_ark_affine(base))
+            .collect()
+    }
+}
+
+fn use_icicle(additional_conditions: Option<bool>) -> bool {
+    let additional = additional_conditions.unwrap_or(true);
+    icicle_init() && additional
 }
 
 fn map_field_elements_to_u64<V: VariableBaseMSM>(field_elements: &[V::ScalarField]) -> Vec<u64> {
@@ -480,3 +746,31 @@ fn ln_without_floats(a: usize) -> usize {
     // log2(a) * ln(2)
     (ark_std::log2(a) * 69 / 100) as usize
 }
+
+#[cfg(test)]
+mod tests {
+    use crate::msm::MsmType;
+
+    #[test]
+    fn test_msm_type_conversion() {
+        let msm_type = MsmType::from_u32(0);
+        assert_eq!(msm_type, MsmType::Zero);
+        assert_eq!(msm_type.num_bits(), 0);
+
+        let msm_type = MsmType::from_u32(1);
+        assert_eq!(msm_type, MsmType::One);
+        assert_eq!(msm_type.num_bits(), 1);
+
+        let msm_type = MsmType::from_u32(2);
+        
assert_eq!(msm_type, MsmType::Small(2)); + assert_eq!(msm_type.num_bits(), 2); + + let msm_type = MsmType::from_u32(11); + assert_eq!(msm_type, MsmType::Medium(11)); + assert_eq!(msm_type.num_bits(), 11); + + let msm_type = MsmType::from_u32(65); + assert_eq!(msm_type, MsmType::Large(65)); + assert_eq!(msm_type.num_bits(), 65); + } +} diff --git a/jolt-core/src/poly/commitment/hyperkzg.rs b/jolt-core/src/poly/commitment/hyperkzg.rs index 8c4e3b2b4..ca78a90d9 100644 --- a/jolt-core/src/poly/commitment/hyperkzg.rs +++ b/jolt-core/src/poly/commitment/hyperkzg.rs @@ -9,16 +9,17 @@ //! and within the KZG commitment scheme implementation itself). use super::{ commitment_scheme::{BatchType, CommitmentScheme}, - kzg, kzg::{KZGProverKey, KZGVerifierKey, UnivariateKZG}, }; -use crate::field; +use crate::field::JoltField; use crate::poly::commitment::commitment_scheme::CommitShape; +use crate::poly::commitment::kzg::CommitMode; use crate::utils::mul_0_1_optimized; use crate::utils::thread::unsafe_allocate_zero_vec; use crate::utils::transcript::Transcript; +use crate::{field, into_optimal_iter}; use crate::{ - msm::VariableBaseMSM, + msm::{Icicle, VariableBaseMSM}, poly::{commitment::kzg::SRS, dense_mlpoly::DensePolynomial, unipoly::UniPoly}, utils::{errors::ProofVerifyError, transcript::AppendToTranscript}, }; @@ -34,10 +35,18 @@ use rayon::iter::{ use std::{marker::PhantomData, sync::Arc}; use tracing::trace_span; -pub struct HyperKZGSRS(Arc>); +pub struct HyperKZGSRS(Arc>) +where + P::G1: Icicle; -impl HyperKZGSRS

{ - pub fn setup(rng: &mut R, max_degree: usize) -> Self { +impl HyperKZGSRS

+where + P::G1: Icicle, +{ + pub fn setup(rng: &mut R, max_degree: usize) -> Self + where + P::ScalarField: JoltField, + { Self(Arc::new(SRS::setup(rng, max_degree, 2))) } @@ -48,7 +57,10 @@ impl HyperKZGSRS

{ } #[derive(Clone, Debug)] -pub struct HyperKZGProverKey { +pub struct HyperKZGProverKey +where + P::G1: Icicle, +{ pub kzg_pk: KZGProverKey

, } @@ -57,7 +69,7 @@ pub struct HyperKZGVerifierKey { pub kzg_vk: KZGVerifierKey

, } -#[derive(Debug, PartialEq, CanonicalSerialize, CanonicalDeserialize)] +#[derive(Debug, Clone, PartialEq, CanonicalSerialize, CanonicalDeserialize)] pub struct HyperKZGCommitment(pub P::G1Affine); impl Default for HyperKZGCommitment

{ @@ -99,6 +111,7 @@ fn kzg_open_no_rem( ) -> P::G1Affine where

::ScalarField: field::JoltField, +

::G1: Icicle, { let h = compute_witness_polynomial::

(f, u); UnivariateKZG::commit(&pk.kzg_pk, &UniPoly::from_coeff(h)).unwrap() @@ -128,6 +141,7 @@ fn scalar_vector_muladd( s: P::ScalarField, ) where

::ScalarField: field::JoltField, +

::G1: Icicle, { assert!(a.len() >= v.len()); for i in 0..v.len() { @@ -141,6 +155,7 @@ fn kzg_compute_batch_polynomial( ) -> Vec where

::ScalarField: field::JoltField, +

::G1: Icicle, { let k = f.len(); // Number of polynomials we're batching @@ -161,6 +176,7 @@ fn kzg_open_batch( ) -> (Vec, Vec>) where

::ScalarField: field::JoltField, +

::G1: Icicle, { let k = f.len(); let t = u.len(); @@ -182,8 +198,7 @@ where let B = kzg_compute_batch_polynomial::

(f, q_powers); // Now open B at u0, ..., u_{t-1} - let w = u - .into_par_iter() + let w = into_optimal_iter!(u) .map(|ui| kzg_open_no_rem(&B, *ui, pk)) .collect::>(); @@ -206,6 +221,7 @@ fn kzg_verify_batch( ) -> bool where

::ScalarField: field::JoltField, +

::G1: Icicle, { let k = C.len(); let t = u.len(); @@ -258,6 +274,7 @@ where let L = ::msm( &[&C[..k], &[W[0], W[1], W[2], vk.kzg_vk.g1]].concat(), + None, &[ &q_powers_multiplied[..k], &[ @@ -285,6 +302,7 @@ pub struct HyperKZG { impl HyperKZG where

::ScalarField: field::JoltField, +

::G1: Icicle, { pub fn protocol_name() -> &'static [u8] { b"HyperKZG" @@ -320,11 +338,13 @@ where // Phase 1 -- create commitments com_1, ..., com_\ell // We do not compute final Pi (and its commitment) as it is constant and equals to 'eval' // also known to verifier, so can be derived on its side as well + let span = trace_span!("phase_1"); + let _enter = span.enter(); let mut polys: Vec> = Vec::new(); polys.push(poly.Z.to_vec()); for i in 0..ell - 1 { let Pi_len = polys[i].len() / 2; - let mut Pi = vec![P::ScalarField::zero(); Pi_len]; + let mut Pi = unsafe_allocate_zero_vec(Pi_len); #[allow(clippy::needless_range_loop)] Pi.par_iter_mut().enumerate().for_each(|(j, Pi_j)| { @@ -334,14 +354,16 @@ where polys.push(Pi); } + drop(_enter); + drop(span); assert_eq!(polys.len(), ell); assert_eq!(polys[ell - 1].len(), 2); // We do not need to commit to the first polynomial as it is already committed. // Compute commitments in parallel - let com: Vec = (1..polys.len()) - .into_par_iter() + // TODO(sragss): This could be done by batch too if it gets progressively smaller. + let com: Vec = into_optimal_iter!(1..polys.len()) .map(|i| UnivariateKZG::commit_slice(&pk.kzg_pk, &polys[i]).unwrap()) .collect(); @@ -508,6 +530,7 @@ impl CommitmentScheme for HyperKZG where

::ScalarField: field::JoltField, +

::G1: Icicle, { type Field = P::ScalarField; type Setup = (HyperKZGProverKey

, HyperKZGVerifierKey

); @@ -526,6 +549,7 @@ where .trim(max_len) } + #[tracing::instrument(skip_all, name = "HyperKZG::commit")] fn commit(poly: &DensePolynomial, setup: &Self::Setup) -> Self::Commitment { assert!( setup.0.kzg_pk.g1_powers().len() >= poly.Z.len(), @@ -536,36 +560,22 @@ where HyperKZGCommitment(UnivariateKZG::commit_slice(&setup.0.kzg_pk, &poly.Z).unwrap()) } + #[tracing::instrument(skip_all, name = "HyperKZG::batch_commit")] fn batch_commit( evals: &[&[Self::Field]], gens: &Self::Setup, batch_type: BatchType, ) -> Vec { - // TODO: assert lengths are valid - evals - .par_iter() - .map(|evals| { - assert!( - gens.0.kzg_pk.g1_powers().len() >= evals.len(), - "COMMIT KEY LENGTH ERROR {}, {}", - gens.0.kzg_pk.g1_powers().len(), - evals.len() - ); - match batch_type { - BatchType::GrandProduct => HyperKZGCommitment( - UnivariateKZG::commit_slice_with_mode( - &gens.0.kzg_pk, - evals, - kzg::CommitMode::GrandProduct, - ) - .unwrap(), - ), - _ => HyperKZGCommitment( - UnivariateKZG::commit_slice(&gens.0.kzg_pk, evals).unwrap(), - ), - } - }) - .collect::>() + let mode = match batch_type { + BatchType::GrandProduct => CommitMode::GrandProduct, + _ => CommitMode::Default, + }; + + UnivariateKZG::commit_batch_with_mode(&gens.0.kzg_pk, evals, mode) + .unwrap() + .into_par_iter() + .map(|c| HyperKZGCommitment(c)) + .collect() } fn commit_slice(evals: &[Self::Field], setup: &Self::Setup) -> Self::Commitment { diff --git a/jolt-core/src/poly/commitment/hyrax.rs b/jolt-core/src/poly/commitment/hyrax.rs index 1bd43e5bc..5f1d75035 100644 --- a/jolt-core/src/poly/commitment/hyrax.rs +++ b/jolt-core/src/poly/commitment/hyrax.rs @@ -15,10 +15,10 @@ use num_integer::Roots; use rayon::prelude::*; use tracing::trace_span; -use crate::msm::VariableBaseMSM; +use crate::msm::{icicle::Icicle, VariableBaseMSM}; #[derive(Clone)] -pub struct HyraxScheme { +pub struct HyraxScheme { marker: PhantomData<(G, ProofTranscript)>, } @@ -48,7 +48,7 @@ pub fn matrix_dimensions(num_vars: usize, ratio: usize) -> 
(usize, usize) { (col_size, row_size) } -impl, ProofTranscript: Transcript> +impl + Icicle, ProofTranscript: Transcript> CommitmentScheme for HyraxScheme { type Field = G::ScalarField; @@ -84,31 +84,6 @@ impl, ProofTranscript: Transcript> fn commit_slice(eval_slice: &[Self::Field], generators: &Self::Setup) -> Self::Commitment { HyraxCommitment::commit_slice(eval_slice, generators) } - fn prove( - _setup: &Self::Setup, - poly: &DensePolynomial, - opening_point: &[Self::Field], - transcript: &mut ProofTranscript, - ) -> Self::Proof { - // Implicitly prove is "prove_single", with a ratio = 1 - HyraxOpeningProof::prove(poly, opening_point, 1, transcript) - } - fn batch_prove( - _setup: &Self::Setup, - polynomials: &[&DensePolynomial], - opening_point: &[Self::Field], - openings: &[Self::Field], - batch_type: BatchType, - transcript: &mut ProofTranscript, - ) -> Self::BatchedProof { - BatchedHyraxOpeningProof::prove( - polynomials, - opening_point, - openings, - batch_type, - transcript, - ) - } fn combine_commitments( commitments: &[&Self::Commitment], coeffs: &[Self::Field], @@ -141,6 +116,31 @@ impl, ProofTranscript: Transcript> ); HyraxCommitment { row_commitments } } + fn prove( + _setup: &Self::Setup, + poly: &DensePolynomial, + opening_point: &[Self::Field], + transcript: &mut ProofTranscript, + ) -> Self::Proof { + // Implicitly prove is "prove_single", with a ratio = 1 + HyraxOpeningProof::prove(poly, opening_point, 1, transcript) + } + fn batch_prove( + _setup: &Self::Setup, + polynomials: &[&DensePolynomial], + opening_point: &[Self::Field], + openings: &[Self::Field], + batch_type: BatchType, + transcript: &mut ProofTranscript, + ) -> Self::BatchedProof { + BatchedHyraxOpeningProof::prove( + polynomials, + opening_point, + openings, + batch_type, + transcript, + ) + } fn verify( proof: &Self::Proof, @@ -185,16 +185,16 @@ impl, ProofTranscript: Transcript> } #[derive(Clone, CanonicalSerialize, CanonicalDeserialize)] -pub struct HyraxGenerators { +pub struct 
HyraxGenerators { pub gens: PedersenGenerators, } #[derive(Default, Clone, Debug, PartialEq, CanonicalSerialize, CanonicalDeserialize)] -pub struct HyraxCommitment { +pub struct HyraxCommitment { pub row_commitments: Vec, } -impl> HyraxCommitment { +impl + Icicle> HyraxCommitment { #[tracing::instrument(skip_all, name = "HyraxCommitment::commit")] pub fn commit( poly: &DensePolynomial, @@ -211,10 +211,15 @@ impl> HyraxCommitment { let (L_size, R_size) = matrix_dimensions(ell, 1); assert_eq!(L_size * R_size, n); - let gens = CurveGroup::normalize_batch(&generators.generators[..R_size]); + let gens = &generators.generators[..R_size]; + let gpu_generators = generators + .gpu_generators + .as_ref() + .map(|gens| &gens[..R_size]); + let row_commitments = eval_slice .par_chunks(R_size) - .map(|row| PedersenCommitment::commit_vector(row, &gens)) + .map(|row| PedersenCommitment::commit_vector(row, gens, gpu_generators)) .collect(); Self { row_commitments } } @@ -234,12 +239,17 @@ impl> HyraxCommitment { let (L_size, R_size) = matrix_dimensions(ell, ratio); assert_eq!(L_size * R_size, n); - let gens = CurveGroup::normalize_batch(&generators.generators[..R_size]); + let gens = &generators.generators[..R_size]; + let gpu_gens = generators + .gpu_generators + .as_ref() + .map(|gens| &gens[..R_size]); - let rows = batch.par_iter().flat_map(|poly| poly.par_chunks(R_size)); - let row_commitments: Vec = rows - .map(|row| PedersenCommitment::commit_vector(row, &gens)) + let rows: Vec<&[G::ScalarField]> = batch + .par_iter() + .flat_map(|poly| poly.par_chunks(R_size)) .collect(); + let row_commitments: Vec = G::batch_msm(gens, gpu_gens, &rows); row_commitments .par_chunks(L_size) @@ -250,7 +260,7 @@ impl> HyraxCommitment { } } -impl AppendToTranscript for HyraxCommitment { +impl AppendToTranscript for HyraxCommitment { fn append_to_transcript(&self, transcript: &mut ProofTranscript) { transcript.append_message(b"poly_commitment_begin"); for i in 0..self.row_commitments.len() { @@ 
-261,7 +271,7 @@ impl AppendToTranscript for HyraxCommitment { } #[derive(Debug, CanonicalSerialize, CanonicalDeserialize)] -pub struct HyraxOpeningProof { +pub struct HyraxOpeningProof { pub vector_matrix_product: Vec, _marker: PhantomData, } @@ -270,7 +280,7 @@ pub struct HyraxOpeningProof { impl HyraxOpeningProof where F: JoltField, - G: CurveGroup, + G: CurveGroup + Icicle, ProofTranscript: Transcript, { fn protocol_name() -> &'static [u8] { @@ -323,13 +333,16 @@ where // Verifier-derived commitment to u * a = \prod Com(u_j)^{a_j} let homomorphically_derived_commitment: G = - VariableBaseMSM::msm(&G::normalize_batch(&commitment.row_commitments), &L).unwrap(); + VariableBaseMSM::msm(&G::normalize_batch(&commitment.row_commitments), None, &L)?; let product_commitment = VariableBaseMSM::msm( - &G::normalize_batch(&pedersen_generators.generators[..R_size]), + &pedersen_generators.generators[..R_size], + pedersen_generators + .gpu_generators + .as_ref() + .map(|g| &g[..R_size]), &self.vector_matrix_product, - ) - .unwrap(); + )?; let dot_product = compute_dotproduct(&self.vector_matrix_product, &R); @@ -367,14 +380,14 @@ where } #[derive(Debug, CanonicalSerialize, CanonicalDeserialize)] -pub struct BatchedHyraxOpeningProof { +pub struct BatchedHyraxOpeningProof { pub joint_proof: HyraxOpeningProof, pub ratio: usize, _marker: PhantomData, } /// See Section 16.1 of Thaler's Proofs, Arguments, and Zero-Knowledge -impl, ProofTranscript: Transcript> +impl + Icicle, ProofTranscript: Transcript> BatchedHyraxOpeningProof { #[tracing::instrument(skip_all, name = "BatchedHyraxOpeningProof::prove")] @@ -537,7 +550,7 @@ mod tests { fn check_polynomial_commit_helper< F: JoltField, - G: CurveGroup, + G: CurveGroup + Icicle, const RATIO: usize, >() { let Z = vec![ diff --git a/jolt-core/src/poly/commitment/kzg.rs b/jolt-core/src/poly/commitment/kzg.rs index 6aa77ec36..dc66b3927 100644 --- a/jolt-core/src/poly/commitment/kzg.rs +++ b/jolt-core/src/poly/commitment/kzg.rs @@ -1,5 
+1,6 @@ use crate::field::JoltField; -use crate::msm::VariableBaseMSM; +use crate::msm::{GpuBaseType, Icicle, VariableBaseMSM}; +use crate::optimal_iter; use crate::poly::unipoly::UniPoly; use crate::utils::errors::ProofVerifyError; use ark_ec::scalar_mul::fixed_base::FixedBase; @@ -12,18 +13,29 @@ use std::marker::PhantomData; use std::sync::Arc; #[derive(Clone, Debug)] -pub struct SRS { +pub struct SRS +where + P::G1: Icicle, +{ pub g1_powers: Vec, pub g2_powers: Vec, pub g_products: Vec, + // g1_powers in icicle's GPU types + pub gpu_g1: Option>>, } -impl SRS

{ +impl SRS

+where + P::G1: Icicle, +{ pub fn setup( mut rng: &mut R, num_g1_powers: usize, num_g2_powers: usize, - ) -> Self { + ) -> Self + where + P::ScalarField: JoltField, + { let beta = P::ScalarField::rand(&mut rng); let g1 = P::G1::rand(&mut rng); let g2 = P::G2::rand(&mut rng); @@ -69,16 +81,31 @@ impl SRS

{ let powers_of_2 = (0..num_powers).into_par_iter().map(|i| 1usize << i); let g_products = powers_of_2 .map(|power| { - ::msm(&g1_powers[..power], &all_ones_coeffs[..power]) - .unwrap() - .into_affine() + ::msm( + &g1_powers[..power], + None, + &all_ones_coeffs[..power], + ) + .unwrap() + .into_affine() }) .collect(); + #[cfg(feature = "icicle")] + let gpu_g1 = Some( + g1_powers + .par_iter() + .map(::from_ark_affine) + .collect::>(), + ); + #[cfg(not(feature = "icicle"))] + let gpu_g1 = None; + Self { g1_powers, g2_powers, g_products, + gpu_g1, } } @@ -98,7 +125,10 @@ impl SRS

{ } #[derive(Clone, Debug)] -pub struct KZGProverKey { +pub struct KZGProverKey +where + P::G1: Icicle, +{ srs: Arc>, // offset to read into SRS offset: usize, @@ -106,7 +136,10 @@ pub struct KZGProverKey { supported_size: usize, } -impl KZGProverKey

{ +impl KZGProverKey

+where + P::G1: Icicle, +{ pub fn new(srs: Arc>, offset: usize, supported_size: usize) -> Self { assert!( srs.g1_powers.len() >= offset + supported_size, @@ -125,6 +158,13 @@ impl KZGProverKey

{ pub fn g1_powers(&self) -> &[P::G1Affine] { &self.srs.g1_powers[self.offset..self.offset + self.supported_size] } + + pub fn gpu_g1(&self) -> Option<&[GpuBaseType]> { + self.srs + .gpu_g1 + .as_ref() + .map(|gpu_g1| &gpu_g1[self.offset..self.offset + self.supported_size]) + } } #[derive(Clone, Copy, Debug)] @@ -150,8 +190,111 @@ pub struct UnivariateKZG { impl UnivariateKZG

where -

::ScalarField: JoltField, + P::ScalarField: JoltField, + P::G1: Icicle, { + #[tracing::instrument(skip_all, name = "KZG::commit_batch")] + pub fn commit_batch( + pk: &KZGProverKey

, + coeffs: &[&[P::ScalarField]], + ) -> Result, ProofVerifyError> { + Self::commit_batch_with_mode(pk, coeffs, CommitMode::Default) + } + + #[tracing::instrument(skip_all, name = "KZG::commit_batch_with_mode")] + pub fn commit_batch_with_mode( + pk: &KZGProverKey

, + batches: &[&[P::ScalarField]], + mode: CommitMode, + ) -> Result, ProofVerifyError> { + let g1_powers = &pk.g1_powers(); + let gpu_g1 = pk.gpu_g1(); + + // batch commit requires all batches to have the same length + assert!(batches.par_iter().all(|s| s.len() == batches[0].len())); + assert!(batches[0].len() <= g1_powers.len()); + + if let Some(invalid) = batches.iter().find(|coeffs| coeffs.len() > g1_powers.len()) { + return Err(ProofVerifyError::KeyLengthError( + g1_powers.len(), + invalid.len(), + )); + } + + let batch_size = batches[0].len(); + match mode { + CommitMode::Default => { + let commitments = ::batch_msm( + &g1_powers[..batch_size], + gpu_g1.map(|g| &g[..batch_size]), + batches, + ); + Ok(commitments.into_iter().map(|c| c.into_affine()).collect()) + } + CommitMode::GrandProduct => { + // Commit to the non-1 coefficients first then combine them with the G commitment (all-1s vector) in the SRS + let (non_one_coeffs, (non_one_bases, non_one_gpu_bases)): ( + Vec<_>, + (Vec<_>, Vec<_>), + ) = batches + .par_iter() + .map(|coeff| { + let (coeffs, (bases, gpu_bases)): (Vec<_>, (Vec<_>, Vec<_>)) = coeff + .par_iter() + .enumerate() + .filter_map(|(i, coeff)| { + if *coeff != P::ScalarField::one() { + let gpu_base = gpu_g1.map(|g| g[i]); + // Subtract 1 from the coeff because we already have a commitment to all the 1s + Some((*coeff - P::ScalarField::one(), (g1_powers[i], gpu_base))) + } else { + None + } + }) + .unzip(); + let gpu_bases: Option> = gpu_bases.into_par_iter().collect(); + (coeffs, (bases, gpu_bases)) + }) + .unzip(); + + // Perform MSM for the non-1 coefficients + assert_eq!(non_one_bases.len(), non_one_coeffs.len()); + //TODO(sagar) batch msm this + let commitments = optimal_iter!(non_one_coeffs) + .enumerate() + .map(|(i, coeffs)| { + let non_one_commitment = if !coeffs.is_empty() { + ::msm( + &non_one_bases[i], + non_one_gpu_bases[i].as_deref(), + coeffs, + ) + .unwrap() + } else { + P::G1::zero() + }; + + // find the right 
precomputed g_product to use + let num_powers = (coeffs.len() as f64).log2(); + assert_ne!( + num_powers.fract(), + 0.0, + "Invalid key length: {}", + coeffs.len() + ); + let num_powers = num_powers.floor() as usize; + + // Combine G * H: Multiply the precomputed G commitment with the non-1 commitment (H) + let final_commitment = pk.srs.g_products[num_powers] + non_one_commitment; + final_commitment.into_affine() + }) + .collect(); + + Ok(commitments) + } + } + } + #[tracing::instrument(skip_all, name = "KZG::commit_offset")] pub fn commit_offset( pk: &KZGProverKey

, @@ -214,20 +357,32 @@ where CommitMode::Default => { let c = ::msm( &pk.g1_powers()[offset..coeffs.len()], + pk.gpu_g1().map(|g| &g[offset..coeffs.len()]), &coeffs[offset..], - ) - .unwrap(); + )?; Ok(c.into_affine()) } CommitMode::GrandProduct => { let g1_powers = &pk.g1_powers()[offset..coeffs.len()]; + let gpu_g1 = pk.gpu_g1().map(|g| &g[offset..coeffs.len()]); let coeffs = &coeffs[offset..]; + let mut non_one_gpu_bases = if gpu_g1.is_some() { + Some(Vec::new()) + } else { + None + }; + // Commit to the non-1 coefficients first then combine them with the G commitment (all-1s vector) in the SRS let (non_one_coeffs, non_one_bases): (Vec<_>, Vec<_>) = coeffs .iter() .enumerate() .filter_map(|(i, coeff)| { if *coeff != P::ScalarField::one() { + if let Some(gpu_g1) = gpu_g1 { + if let Some(v) = non_one_gpu_bases.as_mut() { + v.push(gpu_g1[i]) + } + } // Subtract 1 from the coeff because we already have a commitment to all the 1s Some((*coeff - P::ScalarField::one(), g1_powers[i])) } else { @@ -238,7 +393,11 @@ where // Perform MSM for the non-1 coefficients let non_one_commitment = if !non_one_coeffs.is_empty() { - ::msm(&non_one_bases, &non_one_coeffs).unwrap() + ::msm( + &non_one_bases, + non_one_gpu_bases.as_deref(), + &non_one_coeffs, + )? 
} else { P::G1::zero() }; @@ -270,9 +429,9 @@ where let (witness_poly, _) = poly.divide_with_remainder(&divisor).unwrap(); let proof = ::msm( &pk.g1_powers()[..witness_poly.coeffs.len()], + pk.gpu_g1().map(|g| &g[..witness_poly.coeffs.len()]), witness_poly.coeffs.as_slice(), - ) - .unwrap(); + )?; let evaluation = poly.evaluate(point); Ok((proof.into_affine(), evaluation)) } diff --git a/jolt-core/src/poly/commitment/pedersen.rs b/jolt-core/src/poly/commitment/pedersen.rs index d7405d6a4..59782ca09 100644 --- a/jolt-core/src/poly/commitment/pedersen.rs +++ b/jolt-core/src/poly/commitment/pedersen.rs @@ -1,19 +1,25 @@ +use crate::msm::Icicle; +use crate::msm::{GpuBaseType, VariableBaseMSM}; use ark_ec::CurveGroup; -use ark_serialize::{CanonicalDeserialize, CanonicalSerialize}; +use ark_serialize::{ + CanonicalDeserialize, CanonicalSerialize, Compress, SerializationError, Valid, Validate, +}; use ark_std::rand::SeedableRng; +use ark_std::UniformRand; use rand_chacha::ChaCha20Rng; +#[cfg(feature = "icicle")] +use rayon::prelude::*; use sha3::digest::{ExtendableOutput, Update}; use sha3::Shake256; -use std::io::Read; +use std::io::{Read, Write}; -use crate::msm::VariableBaseMSM; - -#[derive(Clone, CanonicalSerialize, CanonicalDeserialize)] -pub struct PedersenGenerators { - pub generators: Vec, +#[derive(Clone)] +pub struct PedersenGenerators { + pub generators: Vec, + pub gpu_generators: Option>>, } -impl PedersenGenerators { +impl PedersenGenerators { #[tracing::instrument(skip_all, name = "PedersenGenerators::new")] pub fn new(len: usize, label: &[u8]) -> Self { let mut shake = Shake256::default(); @@ -27,12 +33,25 @@ impl PedersenGenerators { reader.read_exact(&mut seed).unwrap(); let mut rng = ChaCha20Rng::from_seed(seed); - let mut generators: Vec = Vec::new(); + let mut generators: Vec = Vec::new(); for _ in 0..len { - generators.push(G::rand(&mut rng)); + generators.push(G::Affine::rand(&mut rng)); } - Self { generators } + #[cfg(feature = "icicle")] + let 
gpu_generators = Some( + generators + .par_iter() + .map(::from_ark_affine) + .collect::>(), + ); + #[cfg(not(feature = "icicle"))] + let gpu_generators = None; + + Self { + generators, + gpu_generators, + } } pub fn clone_n(&self, n: usize) -> PedersenGenerators { @@ -45,24 +64,81 @@ impl PedersenGenerators { let slice = &self.generators[..n]; PedersenGenerators { generators: slice.into(), + gpu_generators: self + .gpu_generators + .as_ref() + .map(|gpu_slice| gpu_slice[..n].into()), } } } -pub trait PedersenCommitment: Sized { +pub trait PedersenCommitment: Sized { fn commit(&self, gens: &PedersenGenerators) -> G; - fn commit_vector(inputs: &[Self], bases: &[G::Affine]) -> G; + fn commit_vector( + inputs: &[Self], + bases: &[G::Affine], + gpu_bases: Option<&[GpuBaseType]>, + ) -> G; } -impl PedersenCommitment for G::ScalarField { +impl PedersenCommitment for G::ScalarField { #[tracing::instrument(skip_all, name = "PedersenCommitment::commit")] fn commit(&self, gens: &PedersenGenerators) -> G { assert_eq!(gens.generators.len(), 1); gens.generators[0] * self } - fn commit_vector(inputs: &[Self], bases: &[G::Affine]) -> G { + #[tracing::instrument(skip_all, name = "PedersenCommitment::commit_vector")] + fn commit_vector( + inputs: &[Self], + bases: &[G::Affine], + gpu_bases: Option<&[GpuBaseType]>, + ) -> G { assert_eq!(bases.len(), inputs.len()); - VariableBaseMSM::msm(bases, inputs).unwrap() + VariableBaseMSM::msm(bases, gpu_bases, inputs).unwrap() + } +} + +impl CanonicalSerialize for PedersenGenerators { + fn serialize_with_mode( + &self, + writer: W, + compress: Compress, + ) -> Result<(), SerializationError> { + self.generators.serialize_with_mode(writer, compress) + } + + fn serialized_size(&self, compress: Compress) -> usize { + self.generators.serialized_size(compress) + } +} + +impl Valid for PedersenGenerators { + fn check(&self) -> Result<(), SerializationError> { + self.generators.check() + } +} + +impl CanonicalDeserialize for PedersenGenerators { + fn 
deserialize_with_mode( + reader: R, + compress: Compress, + validate: Validate, + ) -> Result { + let generators = Vec::::deserialize_with_mode(reader, compress, validate)?; + #[cfg(feature = "icicle")] + let gpu_generators = Some( + generators + .par_iter() + .map(::from_ark_affine) + .collect::>(), + ); + #[cfg(not(feature = "icicle"))] + let gpu_generators = None; + + Ok(Self { + generators, + gpu_generators, + }) } } diff --git a/jolt-core/src/poly/commitment/zeromorph.rs b/jolt-core/src/poly/commitment/zeromorph.rs index 505bd629c..fd0021a2c 100644 --- a/jolt-core/src/poly/commitment/zeromorph.rs +++ b/jolt-core/src/poly/commitment/zeromorph.rs @@ -1,10 +1,7 @@ #![allow(clippy::too_many_arguments)] #![allow(clippy::type_complexity)] -use std::{iter, marker::PhantomData}; - -use crate::field; -use crate::msm::VariableBaseMSM; +use crate::msm::{Icicle, VariableBaseMSM}; use crate::poly::{dense_mlpoly::DensePolynomial, unipoly::UniPoly}; use crate::utils::mul_0_1_optimized; use crate::utils::thread::unsafe_allocate_zero_vec; @@ -13,26 +10,36 @@ use crate::utils::{ transcript::{AppendToTranscript, Transcript}, }; use ark_ec::{pairing::Pairing, AffineRepr, CurveGroup}; -use ark_ff::{batch_inversion, Field}; +use ark_ff::batch_inversion; use ark_serialize::{CanonicalDeserialize, CanonicalSerialize}; use ark_std::{One, Zero}; use itertools::izip; use rand_chacha::{rand_core::SeedableRng, ChaCha20Rng}; use rand_core::{CryptoRng, RngCore}; use std::sync::Arc; +use std::{iter, marker::PhantomData}; use tracing::trace_span; -use rayon::prelude::*; - use super::{ commitment_scheme::{BatchType, CommitShape, CommitmentScheme}, kzg::{KZGProverKey, KZGVerifierKey, UnivariateKZG, SRS}, }; +use crate::field::JoltField; +use crate::optimal_iter; +use rayon::prelude::*; -pub struct ZeromorphSRS(Arc>); +pub struct ZeromorphSRS(Arc>) +where + P::G1: Icicle; -impl ZeromorphSRS

{ - pub fn setup(rng: &mut R, max_degree: usize) -> Self { +impl ZeromorphSRS

+where + P::G1: Icicle, +{ + pub fn setup(rng: &mut R, max_degree: usize) -> Self + where + P::ScalarField: JoltField, + { Self(Arc::new(SRS::setup(rng, max_degree, max_degree))) } @@ -53,7 +60,10 @@ impl ZeromorphSRS

{ //TODO: adapt interface to have prover and verifier key #[derive(Clone, Debug)] -pub struct ZeromorphProverKey { +pub struct ZeromorphProverKey +where + P::G1: Icicle, +{ pub commit_pp: KZGProverKey

, pub open_pp: KZGProverKey

, } @@ -91,7 +101,7 @@ fn compute_multilinear_quotients( point: &[P::ScalarField], ) -> (Vec>, P::ScalarField) where -

::ScalarField: field::JoltField, +

::ScalarField: JoltField, { let num_var = poly.get_num_vars(); assert_eq!(num_var, point.len()); @@ -134,7 +144,7 @@ fn compute_batched_lifted_degree_quotient( y_challenge: &P::ScalarField, ) -> (UniPoly, usize) where -

::ScalarField: field::JoltField, +

::ScalarField: JoltField, { let num_vars = quotients.len(); @@ -165,14 +175,15 @@ fn eval_and_quotient_scalars( challenges: &[P::ScalarField], ) -> (P::ScalarField, (Vec, Vec)) where -

::ScalarField: field::JoltField, +

::ScalarField: JoltField, { let num_vars = challenges.len(); // squares of x = [x, x^2, .. x^{2^k}, .. x^{2^num_vars}] - let squares_of_x: Vec<_> = iter::successors(Some(x_challenge), |&x| Some(x.square())) - .take(num_vars + 1) - .collect(); + let squares_of_x: Vec<_> = + iter::successors(Some(x_challenge), |&x| Some(JoltField::square(&x))) + .take(num_vars + 1) + .collect(); let offsets_of_x = { let mut offsets_of_x = squares_of_x @@ -228,7 +239,8 @@ pub struct Zeromorph { impl Zeromorph where -

::ScalarField: field::JoltField, +

::ScalarField: JoltField, +

::G1: Icicle, P: Pairing, ProofTranscript: Transcript, { @@ -277,12 +289,18 @@ where assert_eq!(quotients.len(), poly.get_num_vars()); assert_eq!(remainder, *eval); - // Compute the multilinear quotients q_k = q_k(X_0, ..., X_{k-1}) - let q_k_com: Vec = quotients - .par_iter() + // TODO(sagar): support variable_batch msms - or decide not to support them altogether + let q_k_com: Vec = optimal_iter!(quotients) .map(|q| UnivariateKZG::commit(&pp.commit_pp, q).unwrap()) .collect(); let q_comms: Vec = q_k_com.par_iter().map(|c| c.into_group()).collect(); + // Compute the multilinear quotients q_k = q_k(X_0, ..., X_{k-1}) + // let quotient_slices: Vec<&[P::ScalarField]> = + // quotients.iter().map(|q| q.coeffs.as_slice()).collect(); + // let q_k_com = UnivariateKZG::commit_batch(&pp.commit_pp, "ient_slices)?; + // let q_comms: Vec = q_k_com.par_iter().map(|c| c.into_group()).collect(); + // let quotient_max_len = quotient_slices.iter().map(|s| s.len()).max().unwrap(); + q_comms.iter().for_each(|c| transcript.append_point(c)); // Sample challenge y @@ -459,9 +477,7 @@ where proof.q_k_com.clone(), ] .concat(); - let zeta_z_com = ::msm(&bases, &scalars) - .unwrap() - .into_affine(); + let zeta_z_com = ::msm(&bases, None, &scalars)?.into_affine(); // e(pi, [tau]_2 - x * [1]_2) == e(C_{\zeta,Z}, -[X^(N_max - 2^n - 1)]_2) <==> e(C_{\zeta,Z} - x * pi, [X^{N_max - 2^n - 1}]_2) * e(-pi, [tau_2]) == 1 let pairing = P::multi_pairing( @@ -482,7 +498,8 @@ where impl CommitmentScheme for Zeromorph where -

::ScalarField: field::JoltField, +

::ScalarField: JoltField, +

::G1: Icicle, { type Field = P::ScalarField; type Setup = (ZeromorphProverKey

, ZeromorphVerifierKey

); @@ -490,7 +507,11 @@ where type Proof = ZeromorphProof

; type BatchedProof = ZeromorphProof

; - fn setup(shapes: &[CommitShape]) -> Self::Setup { + fn setup(shapes: &[CommitShape]) -> Self::Setup + where + P::ScalarField: JoltField, + P::G1: Icicle, + { let max_len = shapes.iter().map(|shape| shape.input_length).max().unwrap(); ZeromorphSRS(Arc::new(SRS::setup( @@ -519,22 +540,11 @@ where gens: &Self::Setup, _batch_type: BatchType, ) -> Vec { - // TODO: assert lengths are valid - evals - .par_iter() - .map(|evals| { - assert!( - gens.0.commit_pp.g1_powers().len() > evals.len(), - "COMMIT KEY LENGTH ERROR {}, {}", - gens.0.commit_pp.g1_powers().len(), - evals.len() - ); - ZeromorphCommitment( - UnivariateKZG::commit(&gens.0.commit_pp, &UniPoly::from_coeff(evals.to_vec())) - .unwrap(), - ) - }) - .collect::>() + UnivariateKZG::commit_batch(&gens.0.commit_pp, evals) + .unwrap() + .into_iter() + .map(|c| ZeromorphCommitment(c)) + .collect() } fn commit_slice(evals: &[Self::Field], setup: &Self::Setup) -> Self::Commitment { @@ -631,7 +641,7 @@ mod test { use crate::utils::math::Math; use crate::utils::transcript::{KeccakTranscript, Transcript}; use ark_bn254::{Bn254, Fr}; - use ark_ff::{BigInt, Zero}; + use ark_ff::{BigInt, Field, Zero}; use ark_std::{test_rng, UniformRand}; use rand_core::SeedableRng; diff --git a/jolt-core/src/poly/unipoly.rs b/jolt-core/src/poly/unipoly.rs index 85ba65a87..0f3bad8d2 100644 --- a/jolt-core/src/poly/unipoly.rs +++ b/jolt-core/src/poly/unipoly.rs @@ -7,7 +7,7 @@ use crate::utils::gaussian_elimination::gaussian_elimination; use crate::utils::transcript::{AppendToTranscript, Transcript}; use ark_serialize::*; use rand_core::{CryptoRng, RngCore}; -use rayon::iter::{IntoParallelIterator, IntoParallelRefMutIterator, ParallelIterator}; +use rayon::prelude::*; // ax^2 + bx + c stored as vec![c,b,a] // ax^3 + bx^2 + cx + d stored as vec![d,c,b,a] @@ -79,6 +79,7 @@ impl UniPoly { for (i, div_coeff) in divisor.coeffs.iter().enumerate() { remainder.coeffs[cur_q_degree + i] -= cur_q_coeff * *div_coeff; } + while let Some(true) = 
remainder.coeffs.last().map(|c| c == &F::zero()) { remainder.coeffs.pop(); } diff --git a/jolt-core/src/utils/errors.rs b/jolt-core/src/utils/errors.rs index 566b29651..0521dcd7b 100644 --- a/jolt-core/src/utils/errors.rs +++ b/jolt-core/src/utils/errors.rs @@ -14,7 +14,7 @@ pub enum ProofVerifyError { DecompressionError([u8; 32]), #[error("R1CS proof verification failed: {0}")] SpartanError(String), - #[error("Length Error: SRS Length: {0}, Key Length: {0}")] + #[error("Length Error: SRS Length: {0}, Key Length: {1}")] KeyLengthError(usize, usize), #[error("Invalid key length: {0}, expected power of 2")] InvalidKeyLength(usize), diff --git a/jolt-core/src/utils/mod.rs b/jolt-core/src/utils/mod.rs index 8a0e30cbd..78a1288fa 100644 --- a/jolt-core/src/utils/mod.rs +++ b/jolt-core/src/utils/mod.rs @@ -13,6 +13,67 @@ pub mod sol_types; pub mod thread; pub mod transcript; +/// Macros that determine the optimal iterator type based on the feature flags. +/// +/// For some cases (e.g. offloading to GPU), we may not want to use a parallel iterator. +/// Specifically when icicle is enabled we want to be careful to use serial iteration in the right places. +/// Based on observations, multiple calls into icicle_msm functions can dramatically slow down GPU performance. +#[macro_export] +macro_rules! optimal_iter { + ($T:expr) => {{ + #[cfg(feature = "icicle")] + { + $T.iter() + } + #[cfg(not(feature = "icicle"))] + { + $T.par_iter() + } + }}; +} + +#[macro_export] +macro_rules! into_optimal_iter { + ($T:expr) => {{ + #[cfg(feature = "icicle")] + { + $T.into_iter() + } + #[cfg(not(feature = "icicle"))] + { + $T.into_par_iter() + } + }}; +} + +#[macro_export] +macro_rules! optimal_iter_mut { + ($T:expr) => {{ + #[cfg(feature = "icicle")] + { + $T.iter_mut() + } + #[cfg(not(feature = "icicle"))] + { + $T.par_iter_mut() + } + }}; +} + +#[macro_export] +macro_rules! 
join_conditional { + ($f1:expr, $f2:expr) => {{ + #[cfg(feature = "icicle")] + { + ($f1(), $f2()) + } + #[cfg(not(feature = "icicle"))] + { + rayon::join($f1, $f2) + } + }}; +} + /// Converts an integer value to a bitvector (all values {0,1}) of field elements. /// Note: ordering has the MSB in the highest index. All of the following represent the integer 1: /// - [1] diff --git a/jolt-evm-verifier/script/Cargo.lock b/jolt-evm-verifier/script/Cargo.lock index 6b6e46adf..afa6c574d 100644 --- a/jolt-evm-verifier/script/Cargo.lock +++ b/jolt-evm-verifier/script/Cargo.lock @@ -1575,6 +1575,7 @@ dependencies = [ "itertools 0.10.5", "memory-stats", "num-integer", + "once_cell", "postcard", "rand 0.7.3", "rand_chacha 0.3.1", @@ -1585,6 +1586,7 @@ dependencies = [ "sha3", "strum", "strum_macros", + "sys-info", "target-lexicon", "thiserror", "tokio", @@ -2568,9 +2570,9 @@ dependencies = [ [[package]] name = "security-framework-sys" -version = "2.12.0" +version = "2.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea4a292869320c0272d7bc55a5a6aafaff59b4f63404a003887b679a2e05b4b6" +checksum = "fa39c7303dc58b5543c94d22c1766b0d31f2ee58306363ea622b10bbc075eaa2" dependencies = [ "core-foundation-sys", "libc", @@ -2850,6 +2852,16 @@ dependencies = [ "syn 2.0.87", ] +[[package]] +name = "sys-info" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b3a0d0aba8bf96a0e1ddfdc352fc53b3df7f39318c71854910c3c4b024ae52c" +dependencies = [ + "cc", + "libc", +] + [[package]] name = "system-configuration" version = "0.6.1" @@ -2957,9 +2969,9 @@ dependencies = [ [[package]] name = "tokio" -version = "1.41.0" +version = "1.41.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "145f3413504347a2be84393cc8a7d2fb4d863b375909ea59f2158261aa258bbb" +checksum = "22cfb5bee7a6a52939ca9224d6ac897bb669134078daa8735560897f69de4d33" dependencies = [ "backtrace", "bytes", diff --git 
a/jolt-sdk/Cargo.toml b/jolt-sdk/Cargo.toml index f34ad6f7c..bd25f46c9 100644 --- a/jolt-sdk/Cargo.toml +++ b/jolt-sdk/Cargo.toml @@ -21,11 +21,11 @@ host = [ "dep:ark-bn254", "postcard/use-std", ] - guest-std = [ "postcard/use-std", "jolt-sdk-macros/guest-std", ] +icicle = ["host", "jolt-core?/icicle"] [dependencies] postcard = { version = "1.0.8", default-features = false } diff --git a/src/main.rs b/src/main.rs index da0218fd7..81ea27e1c 100644 --- a/src/main.rs +++ b/src/main.rs @@ -189,6 +189,9 @@ lto = "fat" jolt = { package = "jolt-sdk", git = "https://github.com/a16z/jolt", features = ["host"] } guest = { path = "./guest" } +[features] +icicle = ["jolt-sdk/icicle"] + [patch.crates-io] ark-ff = { git = "https://github.com/a16z/arkworks-algebra", branch = "optimize/field-from-u64" } ark-ec = { git = "https://github.com/a16z/arkworks-algebra", branch = "optimize/field-from-u64" } From f65a0c7ccc28a1dd22189c61364e048837b42c7c Mon Sep 17 00:00:00 2001 From: Sagar Dhawan Date: Mon, 16 Dec 2024 08:47:38 -0800 Subject: [PATCH 2/3] use icicle-jolt to compile cuda deps --- Cargo.lock | 17 +++++++++-------- README.md | 23 +++++++++++++++++++++++ jolt-core/Cargo.toml | 6 +++--- 3 files changed, 35 insertions(+), 11 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 18184b0b5..aefc84ab4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1557,8 +1557,8 @@ dependencies = [ [[package]] name = "icicle-bn254" -version = "3.1.0" -source = "git+https://github.com/ingonyama-zk/icicle.git?tag=v3.1.0#38712a95af4a118a124321c81383daa93f59f1e4" +version = "3.2.0" +source = "git+https://github.com/ingonyama-zk/icicle-jolt.git?rev=ed93e21#ed93e21cbb405822b0aa1b58b5dc6c7837a04108" dependencies = [ "cmake", "icicle-core", @@ -1568,8 +1568,8 @@ dependencies = [ [[package]] name = "icicle-core" -version = "3.1.0" -source = "git+https://github.com/ingonyama-zk/icicle.git?tag=v3.1.0#38712a95af4a118a124321c81383daa93f59f1e4" +version = "3.2.0" +source = 
"git+https://github.com/ingonyama-zk/icicle-jolt.git?rev=ed93e21#ed93e21cbb405822b0aa1b58b5dc6c7837a04108" dependencies = [ "hex", "icicle-runtime", @@ -1580,8 +1580,8 @@ dependencies = [ [[package]] name = "icicle-hash" -version = "3.1.0" -source = "git+https://github.com/ingonyama-zk/icicle.git?tag=v3.1.0#38712a95af4a118a124321c81383daa93f59f1e4" +version = "3.2.0" +source = "git+https://github.com/ingonyama-zk/icicle-jolt.git?rev=ed93e21#ed93e21cbb405822b0aa1b58b5dc6c7837a04108" dependencies = [ "cmake", "icicle-core", @@ -1591,10 +1591,11 @@ dependencies = [ [[package]] name = "icicle-runtime" -version = "3.1.0" -source = "git+https://github.com/ingonyama-zk/icicle.git?tag=v3.1.0#38712a95af4a118a124321c81383daa93f59f1e4" +version = "3.2.0" +source = "git+https://github.com/ingonyama-zk/icicle-jolt.git?rev=ed93e21#ed93e21cbb405822b0aa1b58b5dc6c7837a04108" dependencies = [ "cmake", + "once_cell", ] [[package]] diff --git a/README.md b/README.md index 4765c81f9..a76463f8d 100644 --- a/README.md +++ b/README.md @@ -68,6 +68,29 @@ Examples in the [`examples`](./examples/) directory can be run using e.g. ```cargo run --release -p sha2-chain``` +## CUDA Support + +JOLT supports CUDA acceleration via [icicle](https://github.com/ingonyama-zk/icicle-jolt). + +Dependencies: +1. Install [CUDA Toolkit](https://developer.nvidia.com/cuda-toolkit) +2. Install [CMake](https://cmake.org/) + +Now you may build Jolt with CUDA acceleration using the `--features icicle` flag. + +### Build + +```cargo build -p jolt-core --features icicle``` + +### Bench + +``` +# Set the icicle backend path - this won't be needed in the future +export ICICLE_BACKEND_INSTALL_DIR=$(pwd)/target/debug/deps/icicle/lib/backend +cargo bench --bench msm_batch --no-fail-fast -p jolt-core --features icicle +``` + +Note - NVIDIA doesn't support cross compilation on MacOS. Only Windows or Linux. 
## Performance profiling diff --git a/jolt-core/Cargo.toml b/jolt-core/Cargo.toml index 7f58c5c97..0153b9dde 100644 --- a/jolt-core/Cargo.toml +++ b/jolt-core/Cargo.toml @@ -119,9 +119,9 @@ name = "jolt_core" path = "src/lib.rs" [target.'cfg(not(target_arch = "wasm32"))'.dependencies] -icicle-runtime = { git = "https://github.com/ingonyama-zk/icicle.git", tag = "v3.1.0", optional = true } -icicle-core = { git = "https://github.com/ingonyama-zk/icicle.git", tag = "v3.1.0", optional = true } -icicle-bn254 = { git = "https://github.com/ingonyama-zk/icicle.git", tag = "v3.1.0", optional = true } +icicle-runtime = { git = "https://github.com/ingonyama-zk/icicle-jolt.git", features = ["cuda_backend"], rev = "ed93e21", optional = true } +icicle-core = { git = "https://github.com/ingonyama-zk/icicle-jolt.git", rev = "ed93e21", optional = true } +icicle-bn254 = { git = "https://github.com/ingonyama-zk/icicle-jolt.git", features = ["cuda_backend"], rev = "ed93e21", optional = true } memory-stats = "1.0.0" sys-info = "0.9.1" tokio = { version = "1.38.0", optional = true, features = ["rt-multi-thread"] } From 146b4b8e849ca5693e9bd01f60eb9a737a9023d0 Mon Sep 17 00:00:00 2001 From: Sagar Dhawan Date: Mon, 16 Dec 2024 08:50:33 -0800 Subject: [PATCH 3/3] remove icicle from CI --- .github/workflows/rust.yml | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 8de965792..37f6f528e 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -40,11 +40,6 @@ jobs: with: command: clippy args: --all - - name: cargo clippy icicle - uses: actions-rs/cargo@v1 - with: - command: clippy - args: --all --features icicle machete: runs-on: ubuntu-latest @@ -82,23 +77,6 @@ jobs: - name: Run jolt-core tests run: cargo nextest run --release -p jolt-core - test-icicle: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - - uses: actions-rust-lang/setup-rust-toolchain@v1 - - name: Cache Jolt 
RISC-V Rust toolchain - uses: actions/cache@v4 - with: - key: jolt-rust-toolchain-${{hashFiles('guest-toolchain-tag')}} - path: ~/.jolt - - name: Install Jolt RISC-V Rust toolchain - run: cargo run install-toolchain - - name: Install nextest - uses: taiki-e/install-action@nextest - - name: Run jolt-core tests - run: cargo nextest run --release -p jolt-core --features icicle - on-chain: name: Onchain Verifier Tests runs-on: ubuntu-latest