From e2630b28203e874bc24c26fd50acaddc6301f0ba Mon Sep 17 00:00:00 2001
From: ingo_deploy
Date: Mon, 1 May 2023 09:15:37 +0300
Subject: [PATCH 1/3] Danksharding flow based on the new ICICLE API

---
 .gitmodules                                |   3 +-
 fast-danksharding/Cargo.toml               |   4 +-
 fast-danksharding/src/cuda/lib.cu          |  42 +++++
 fast-danksharding/src/fast_danksharding.rs | 182 ++++++++++++++++++++-
 fast-danksharding/src/lib.rs               |  44 +++++
 5 files changed, 272 insertions(+), 3 deletions(-)

diff --git a/.gitmodules b/.gitmodules
index 02c6dce..8d994ce 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +1,4 @@
 [submodule "icicle"]
 	path = icicle
-	url = https://github.com/ingonyama-zk/icicle.git
+	url = https://github.com/DmytroTym/icicle/
+	branch = new_api
diff --git a/fast-danksharding/Cargo.toml b/fast-danksharding/Cargo.toml
index 852a638..2b3693f 100644
--- a/fast-danksharding/Cargo.toml
+++ b/fast-danksharding/Cargo.toml
@@ -8,13 +8,15 @@ homepage = "https://www.ingonyama.com"
 repository = "https://github.com/ingonyama-zk/fast-danksharding"
 
 [dependencies]
-icicle-utils = { git = "https://github.com/ingonyama-zk/icicle.git" }
+icicle-utils = { git = "https://github.com/DmytroTym/icicle.git", branch = "new_api" }
 hex="0.4.3"
 ark-std = "0.3.0"
 ark-ff = "0.3.0"
 ark-poly = "0.3.0"
 ark-ec = { version = "0.3.0", features = [ "parallel" ] }
 ark-bls12-381 = { version = "0.3.0", optional = true }
+rustacuda = "0.1"
+rustacuda_core = "0.1"
 
 [build-dependencies]
 cc = { version = "1.0", features = ["parallel"] }
diff --git a/fast-danksharding/src/cuda/lib.cu b/fast-danksharding/src/cuda/lib.cu
index 9c9342d..a9626e2 100644
--- a/fast-danksharding/src/cuda/lib.cu
+++ b/fast-danksharding/src/cuda/lib.cu
@@ -1,6 +1,9 @@
 #include "../../../icicle/icicle/curves/curve_config.cuh"
 #include <stdio.h>
 
+const int TILE_DIM = 32;
+const int BLOCK_ROWS = 8;
+
 template <typename P>
 void point_sum(P *h_outputs, P *h_inputs, unsigned nof_rows, unsigned nof_cols, unsigned l);
@@ -63,3 +66,42 @@ extern "C" int sum_of_points(projective_t *out, projective_t in[], size_t nof_rows, size_t nof_cols, size_t l, size_t device_id = 0)
     // out->z = 0; //TODO: .set_infinity()
   }
 }
+
+// the shared-memory version of matrix transpose taken from here: https://developer.nvidia.com/blog/efficient-matrix-transpose-cuda-cc/
+template <typename T>
+__global__ void transpose_kernel(T *odata, const T *idata)
+{
+    __shared__ T tile[TILE_DIM][TILE_DIM+1];
+
+    int x = blockIdx.x * TILE_DIM + threadIdx.x;
+    int y = blockIdx.y * TILE_DIM + threadIdx.y;
+    int width = gridDim.x * TILE_DIM;
+    int height = gridDim.y * TILE_DIM;
+
+    for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
+        tile[threadIdx.y+j][threadIdx.x] = idata[(y+j)*width + x];
+
+    __syncthreads();
+
+    x = blockIdx.y * TILE_DIM + threadIdx.x; // transpose block offset
+    y = blockIdx.x * TILE_DIM + threadIdx.y;
+
+    for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
+        odata[(y+j)*height + x] = tile[threadIdx.x][threadIdx.y+j];
+}
+
+extern "C" int transpose_matrix(scalar_field_t *out, scalar_field_t *in, size_t nof_rows, size_t nof_cols, size_t device_id = 0)
+{
+  try
+  {
+    dim3 dimGrid(nof_rows / TILE_DIM, nof_cols / TILE_DIM, 1);
+    dim3 dimBlock(TILE_DIM, BLOCK_ROWS, 1);
+    transpose_kernel <<<dimGrid, dimBlock>>> (out, in);
+
+    return CUDA_SUCCESS;
+  }
+  catch (const std::runtime_error &ex)
+  {
+    printf("error %s", ex.what()); // TODO: error code and message
+  }
+}
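Two assumptions in transpose_matrix above are worth making explicit: dimGrid is computed by integer division, so the launch only covers the whole matrix when nof_rows and nof_cols are multiples of TILE_DIM = 32, and nothing at the call site checks this. A minimal guarded wrapper on the Rust side could look like the sketch below; checked_transpose is a hypothetical helper, not part of this patch, while transpose_scalar_matrix is the binding this commit adds to lib.rs:

    use rustacuda::memory::{DeviceBuffer, DevicePointer};

    /// Hypothetical guard for the FFI transpose: refuses launches whose grid
    /// would silently drop remainder rows/columns. Assumes the CUDA-side
    /// TILE_DIM of 32.
    pub fn checked_transpose(
        output: &mut DevicePointer<ScalarField>,
        input: &mut DeviceBuffer<ScalarField>,
        nof_rows: usize,
        nof_cols: usize,
    ) -> Option<i32> {
        const TILE_DIM: usize = 32;
        if nof_rows % TILE_DIM != 0 || nof_cols % TILE_DIM != 0 {
            return None; // the grid would not tile the matrix exactly
        }
        Some(transpose_scalar_matrix(output, input, nof_rows, nof_cols))
    }

In this flow the check always passes, since every dimension is a power of two of at least 256, but the sketch makes the contract visible.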
diff --git a/fast-danksharding/src/fast_danksharding.rs b/fast-danksharding/src/fast_danksharding.rs
index efcd20f..306e22d 100644
--- a/fast-danksharding/src/fast_danksharding.rs
+++ b/fast-danksharding/src/fast_danksharding.rs
@@ -1,5 +1,7 @@
 use std::time::Instant;
 
+use rustacuda::prelude::*;
+use rustacuda::memory::DevicePointer;
 use icicle_utils::{field::Point, *};
 
 use crate::{matrix::*, utils::*, *};
@@ -8,9 +10,11 @@ pub const FLOW_SIZE: usize = 1 << 12; //4096 //prod flow size
 pub const TEST_SIZE_DIV: usize = 1; //TODO: Prod size / test size for speedup
 pub const TEST_SIZE: usize = FLOW_SIZE / TEST_SIZE_DIV; //test flow size
 pub const M_POINTS: usize = TEST_SIZE;
+pub const LOG_M_POINTS: usize = 12;
 pub const SRS_SIZE: usize = M_POINTS;
 pub const S_GROUP_SIZE: usize = 2 * M_POINTS;
 pub const N_ROWS: usize = 256 / TEST_SIZE_DIV;
+pub const LOG_N_ROWS: usize = 8;
 pub const FOLD_SIZE: usize = 512 / TEST_SIZE_DIV;
 
 //TODO: the casing is to match diagram
@@ -255,12 +259,188 @@ pub fn main_flow() {
     println!("success !!!",);
 }
 
+#[allow(non_snake_case)]
+#[allow(non_upper_case_globals)]
+pub fn alternate_flow() {
+    let D_in_host = get_debug_data_scalar_field_vec("D_in.csv");
+    let tf_u = &get_debug_data_scalars("roots_u.csv", 1, N_ROWS)[0];
+    let SRS_host = get_debug_data_points_proj_xy1_vec("SRS.csv", M_POINTS);
+    let roots_w = get_debug_data_scalars("roots_w.csv", M_POINTS, 1);
+    let tf_w = rows_to_cols(&roots_w)[0].to_vec().to_vec();
+    //TODO: now S is preprocessed, copy preprocessing here
+    let S = get_debug_data_points_proj_xy1_vec("S.csv", 2 * M_POINTS);
+
+    let mut q_ = Vec::<Vec<Point>>::new();
+    const l: usize = 16;
+    println!("loaded test data, processing...");
+
+    let pre_time = Instant::now();
+    // set up the device
+    let _ctx = rustacuda::quick_init();
+    // build domains (i.e. compute twiddle factors)
+    let mut interpolate_row_domain = build_domain(M_POINTS, LOG_M_POINTS, true);
+    let mut evaluate_row_domain = build_domain(M_POINTS, LOG_M_POINTS, false);
+    let mut interpolate_column_domain = build_domain(N_ROWS, LOG_N_ROWS, true);
+    let mut evaluate_column_domain = build_domain(N_ROWS, LOG_N_ROWS, false);
+    // build cosets (i.e. powers of roots of unity `w` and `v`)
+    let mut row_coset = build_domain(M_POINTS, LOG_M_POINTS + 1, false);
+    let mut column_coset = build_domain(N_ROWS, LOG_N_ROWS + 1, false);
+    // transfer `D_in` into device memory
+    let mut D_in = DeviceBuffer::from_slice(&D_in_host[..]).unwrap();
+    // transfer the SRS into device memory
+    debug_assert!(SRS_host[0].to_ark_affine().is_on_curve());
+    let s_affine: Vec<_> = vec![SRS_host.iter().map(|p| p.to_xy_strip_z()).collect::<Vec<_>>(); N_ROWS].concat();
+    let mut SRS = DeviceBuffer::from_slice(&s_affine[..]).unwrap();
+
+    println!("pre-computation {:0.3?}", pre_time.elapsed());
+
+    //C_rows = INTT_rows(D_in)
+    let mut C_rows = interpolate_scalars_batch(&mut D_in, &mut interpolate_row_domain, N_ROWS);
+
+    println!("pre-branch {:0.3?}", pre_time.elapsed());
+
+    ////////////////////////////////
+    println!("Branch 1");
+    ////////////////////////////////
+    let br1_time = Instant::now();
+
+    // K0 = MSM_rows(C_rows) (256x1)
+    let mut K0 = commit_batch(&mut SRS, &mut C_rows, N_ROWS);
+    println!("K0 {:0.3?}", br1_time.elapsed());
+
+    // B0 = ECINTT_col(K0) N_POINTS x 1 (256x1)
+    let mut B0 = interpolate_points(&mut K0, &mut interpolate_column_domain);
+    println!("B0 {:0.3?}", br1_time.elapsed());
+
+    // K1 = ECNTT_col(MUL_col(B0, [1 u u^2 ...])) N_POINTS x 1 (256x1)
+    let K1 = evaluate_points_on_coset(&mut B0, &mut evaluate_column_domain, &mut column_coset);
+    println!("K1 {:0.3?}", br1_time.elapsed());
+
+    // K = [K0, K1] // 2*N_POINTS x 1 (512x1 commitments)
+    let mut K: Vec<Point> = (0..2 * N_ROWS).map(|_| Point::zero()).collect();
+    K0.copy_to(&mut K[..N_ROWS]).unwrap();
+    K1.copy_to(&mut K[N_ROWS..]).unwrap();
+    println!("K {:0.3?}", br1_time.elapsed());
+
+    println!("Branch1 {:0.3?}", br1_time.elapsed());
+    assert_eq!(K, get_debug_data_points_proj_xy1_vec("K.csv", 2 * N_ROWS));
+
+    ////////////////////////////////
+    println!("Branch 2");
+    ////////////////////////////////
+    let br2_time = Instant::now();
+
+    let mut D_rows = evaluate_scalars_on_coset_batch(&mut C_rows, &mut evaluate_row_domain, N_ROWS, &mut row_coset);
+    println!("D_both {:0.3?}", br2_time.elapsed());
+
+    let mut D_transposed = unsafe { DeviceBuffer::uninitialized(2 * N_ROWS * M_POINTS).unwrap() };
+    transpose_scalar_matrix(&mut D_transposed.as_device_ptr(), &mut D_in, M_POINTS, N_ROWS);
+    transpose_scalar_matrix(&mut D_transposed.as_device_ptr().wrapping_offset((N_ROWS * M_POINTS) as isize), &mut D_rows, M_POINTS, N_ROWS);
+
+    let mut C0 = interpolate_scalars_batch(&mut D_transposed, &mut interpolate_column_domain, 2 * M_POINTS);
+    let mut D_cols = evaluate_scalars_on_coset_batch(&mut C0, &mut evaluate_column_domain, 2 * M_POINTS, &mut column_coset);
+
+    let mut D = unsafe { DeviceBuffer::uninitialized(4 * N_ROWS * M_POINTS).unwrap() };
+    transpose_scalar_matrix(&mut D.as_device_ptr(), &mut D_transposed, N_ROWS, 2 * M_POINTS);
+    transpose_scalar_matrix(&mut D.as_device_ptr().wrapping_offset((2 * N_ROWS * M_POINTS) as isize), &mut D_cols, N_ROWS, 2 * M_POINTS);
+
+    let mut D_host_flat: Vec<ScalarField> = (0..4 * N_ROWS * FLOW_SIZE).map(|_| ScalarField::zero()).collect();
+    D.copy_to(&mut D_host_flat[..]).unwrap();
+    let D_host_flat = D_host_flat.into_iter().map(|x| { Scalar { s: x } }).collect::<Vec<_>>();
+    let D_host = D_host_flat.chunks(2 * M_POINTS).collect::<Vec<_>>();
+    assert_eq!(D_host, get_debug_data_scalars("D.csv", 2 * N_ROWS, 2 * M_POINTS));
+
+    println!("Branch2 {:0.3?}", br2_time.elapsed());
+    let D_b4rbo = D_host.clone();
+
+    ////////////////////////////////
+    println!("Branch 3");
+    ////////////////////////////////
+    let br3_time = Instant::now();
+
+    //d0 = MUL_row(d[mu], [S]) 1x8192
+    let d0: Vec<_> = (0..2 * N_ROWS)
+        .map(|i| {
+            let mut s = S.clone();
+            multp_vec(&mut s, &D_b4rbo[i], 0);
+            s
+        })
+        .collect();
+    debug_assert_eq!(
+        d0,
+        get_debug_data_points_xy1("d0.csv", 2 * N_ROWS, 2 * M_POINTS)
+    );
+
+    let mut d1 = vec![Point::infinity(); (2 * N_ROWS) * (2 * M_POINTS / l)];
+    let d0: Vec<_> = d0.into_iter().flatten().collect();
+
+    addp_vec(&mut d1, &d0, 2 * N_ROWS, 2 * M_POINTS, l, 0);
+
+    let d1 = split_vec_to_matrix(&d1, 2 * N_ROWS).clone();
+    debug_assert_eq!(
+        d1,
+        get_debug_data_points_xy1("d1.csv", 2 * N_ROWS, 2 * N_ROWS)
+    );
+
+    let mut delta0: Vec<_> = d1.into_iter().flatten().collect();
+    println!("iecntt batch for delta0");
+    //delta0 = ECINTT_row(d1) 1x512
+    iecntt_batch(&mut delta0, 2 * N_ROWS, 0);
+    debug_assert_eq!(
+        delta0,
+        get_debug_data_points_proj_xy1_vec("delta0.csv", 2 * N_ROWS * 2 * N_ROWS)
+    );
+
+    delta0.chunks_mut(2 * N_ROWS).for_each(|delta0_i| {
+        // delta1 = delta0 << 256 1x512
+        let delta1_i = [&delta0_i[N_ROWS..], &vec![Point::infinity(); N_ROWS]].concat();
+        q_.push(delta1_i);
+    });
+
+    let mut delta1: Vec<_> = q_.into_iter().flatten().collect();
+
+    println!("ecntt batch for delta1");
+    //q[mu] = ECNTT_row(delta1) 1x512
+    ecntt_batch(&mut delta1, 2 * N_ROWS, 0);
+
+    let q_ = split_vec_to_matrix(&delta1, 2 * N_ROWS).clone();
+
+    debug_assert_eq!(
+        q_,
+        get_debug_data_points_xy1("q.csv", 2 * N_ROWS, 2 * N_ROWS)
+    );
+
+    println!("final check");
+
+    let P = q_
+        .iter()
+        .map(|row| list_to_reverse_bit_order(&row.clone()))
+        .collect::<Vec<_>>()
+        .to_vec();
+
+    //final assertion
+    println!("Branch3 {:0.3?}", br3_time.elapsed());
+
+    assert_eq!(
+        P,
+        get_debug_data_points_xy1("P.csv", 2 * N_ROWS, 2 * N_ROWS)
+    );
+
+    assert_ne!(P[12][23], Point::zero()); //dummy check
+    println!("success !!!",);
+}
+
 #[cfg(test)]
 mod tests {
-    use super::main_flow;
+    use super::{main_flow, alternate_flow};
 
     #[test]
     fn test_main_flow() {
         main_flow();
     }
+
+    #[test]
+    fn test_alternate_flow() {
+        alternate_flow();
+    }
 }
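Branch 2 in alternate_flow never runs a column NTT directly: the batched ICICLE kernels operate on rows that are contiguous in memory, so each column operation is phrased as transpose, batched row operation, transpose back. A host-side reference of the row-major transpose this relies on, useful for checking small inputs against the GPU path (illustrative only):

    // Row-major reference transpose: `input` is nof_rows x nof_cols,
    // the result is nof_cols x nof_rows. Mirrors what the transpose_matrix
    // kernel computes on the device.
    fn transpose_ref<T: Copy>(input: &[T], nof_rows: usize, nof_cols: usize) -> Vec<T> {
        let mut out = Vec::with_capacity(input.len());
        for c in 0..nof_cols {
            for r in 0..nof_rows {
                out.push(input[r * nof_cols + c]);
            }
        }
        out
    }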
diff --git a/fast-danksharding/src/lib.rs b/fast-danksharding/src/lib.rs
index a14bb3e..61c1dfe 100644
--- a/fast-danksharding/src/lib.rs
+++ b/fast-danksharding/src/lib.rs
@@ -4,6 +4,9 @@ pub mod utils;
 
 use std::ffi::c_int;
 
+use rustacuda::memory::DevicePointer;
+use rustacuda::prelude::DeviceBuffer;
+
 use icicle_utils::field::*;
 
 use crate::{
@@ -21,6 +24,14 @@ extern "C" {
         l: usize,
         device_id: usize,
     ) -> c_int;
+
+    fn transpose_matrix(
+        out: DevicePointer<ScalarField>,
+        input: DevicePointer<ScalarField>,
+        nof_rows: usize,
+        nof_cols: usize,
+        device_id: usize,
+    ) -> c_int;
 }
 
 pub fn addp_vec(
@@ -43,6 +54,23 @@
     }
 }
 
+pub fn transpose_scalar_matrix(
+    output: &mut DevicePointer<ScalarField>,
+    input: &mut DeviceBuffer<ScalarField>,
+    nof_rows: usize,
+    nof_cols: usize,
+) -> i32 {
+    unsafe {
+        transpose_matrix(
+            *output,
+            input.as_device_ptr(),
+            nof_rows,
+            nof_cols,
+            0,
+        )
+    }
+}
+
 fn get_debug_data_scalars(filename: &str, height: usize, lenght: usize) -> Vec<Vec<Scalar>> {
     let from_limbs = get_debug_data_scalar_vec(filename);
     let result = split_vec_to_matrix(&from_limbs, lenght);
@@ -61,6 +89,22 @@ fn get_debug_data_scalar_vec(filename: &str) -> Vec<Scalar> {
     from_limbs
 }
 
+fn get_debug_data_scalar_field_vec(filename: &str) -> Vec<ScalarField> {
+    let limbs = csv_be_to_u32_be_limbs(
+        &format!("{}{}", get_test_set_path(), filename),
+        SCALAR_LIMBS,
+    );
+
+    fn field_from_limbs_be(a: &[u32]) -> ScalarField {
+        let mut a_mut = a.to_vec();
+        a_mut.reverse();
+        ScalarField::from_limbs(&a_mut)
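All of the extern "C" entry points return a raw status code (CUDA_SUCCESS in the .cu file), and the Rust wrappers pass it through as an i32 that the call sites in alternate_flow currently drop. A small adapter that surfaces failures instead — cuda_check is a sketch, not part of the patch, and 0 is assumed to be CUDA_SUCCESS as in the CUDA headers:

    /// Hypothetical helper: turn a raw status code from the FFI layer into a
    /// Result so kernel failures are not silently ignored.
    fn cuda_check(code: i32) -> Result<(), i32> {
        if code == 0 { Ok(()) } else { Err(code) }
    }

    // usage (sketch):
    // cuda_check(transpose_scalar_matrix(&mut out_ptr, &mut input, rows, cols)).unwrap();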
+    }
+
+    let from_limbs = from_limbs(limbs, SCALAR_LIMBS, field_from_limbs_be);
+    from_limbs
+}
+
 fn get_test_set_path() -> String {
     #[cfg(test)]
     let data_root_path = format!("../test_vectors/{}x{}/", N_ROWS, M_POINTS);

From 311c960d348c24073a2dc8ffcf8bf3cc207d278e Mon Sep 17 00:00:00 2001
From: ingo_deploy
Date: Thu, 4 May 2023 00:35:06 +0300
Subject: [PATCH 2/3] Adapted new commits to ICICLE

---
 fast-danksharding/Cargo.toml               |  2 +-
 fast-danksharding/src/fast_danksharding.rs | 36 +++++++++++++---------
 fast-danksharding/src/lib.rs               | 36 ++++++++--------------
 3 files changed, 35 insertions(+), 39 deletions(-)

diff --git a/fast-danksharding/Cargo.toml b/fast-danksharding/Cargo.toml
index 2b3693f..fa08e48 100644
--- a/fast-danksharding/Cargo.toml
+++ b/fast-danksharding/Cargo.toml
@@ -8,7 +8,7 @@ homepage = "https://www.ingonyama.com"
 repository = "https://github.com/ingonyama-zk/fast-danksharding"
 
 [dependencies]
-icicle-utils = { git = "https://github.com/DmytroTym/icicle.git", branch = "new_api" }
+icicle-utils = { git = "https://github.com/DmytroTym/icicle.git", rev = "ae4e696" }
 hex="0.4.3"
 ark-std = "0.3.0"
 ark-ff = "0.3.0"
diff --git a/fast-danksharding/src/fast_danksharding.rs b/fast-danksharding/src/fast_danksharding.rs
index 306e22d..7ea9228 100644
--- a/fast-danksharding/src/fast_danksharding.rs
+++ b/fast-danksharding/src/fast_danksharding.rs
@@ -1,7 +1,6 @@
 use std::time::Instant;
 
 use rustacuda::prelude::*;
-use rustacuda::memory::DevicePointer;
 use icicle_utils::{field::Point, *};
 
 use crate::{matrix::*, utils::*, *};
@@ -262,11 +261,8 @@ pub fn main_flow() {
 #[allow(non_snake_case)]
 #[allow(non_upper_case_globals)]
 pub fn alternate_flow() {
-    let D_in_host = get_debug_data_scalar_field_vec("D_in.csv");
-    let tf_u = &get_debug_data_scalars("roots_u.csv", 1, N_ROWS)[0];
+    let D_in_host = get_debug_data_scalar_vec("D_in.csv");
     let SRS_host = get_debug_data_points_proj_xy1_vec("SRS.csv", M_POINTS);
-    let roots_w = get_debug_data_scalars("roots_w.csv", M_POINTS, 1);
-    let tf_w = rows_to_cols(&roots_w)[0].to_vec().to_vec();
     //TODO: now S is preprocessed, copy preprocessing here
     let S = get_debug_data_points_proj_xy1_vec("S.csv", 2 * M_POINTS);
@@ -294,6 +290,7 @@ pub fn alternate_flow() {
 
     println!("pre-computation {:0.3?}", pre_time.elapsed());
 
+    reverse_order_scalars_batch(&mut D_in, N_ROWS);
     //C_rows = INTT_rows(D_in)
     let mut C_rows = interpolate_scalars_batch(&mut D_in, &mut interpolate_row_domain, N_ROWS);
@@ -305,25 +302,27 @@ pub fn alternate_flow() {
     let br1_time = Instant::now();
 
     // K0 = MSM_rows(C_rows) (256x1)
-    let mut K0 = commit_batch(&mut SRS, &mut C_rows, N_ROWS);
+    let mut K0 = commit_batch(&mut SRS, &mut C_rows, N_ROWS);
+    let mut K: Vec<Point> = (0..2 * N_ROWS).map(|_| Point::zero()).collect();
+    K0.copy_to(&mut K[..N_ROWS]).unwrap();
     println!("K0 {:0.3?}", br1_time.elapsed());
 
+    reverse_order_points(&mut K0);
     // B0 = ECINTT_col(K0) N_POINTS x 1 (256x1)
     let mut B0 = interpolate_points(&mut K0, &mut interpolate_column_domain);
     println!("B0 {:0.3?}", br1_time.elapsed());
 
     // K1 = ECNTT_col(MUL_col(B0, [1 u u^2 ...])) N_POINTS x 1 (256x1)
-    let K1 = evaluate_points_on_coset(&mut B0, &mut evaluate_column_domain, &mut column_coset);
+    let mut K1 = evaluate_points_on_coset(&mut B0, &mut evaluate_column_domain, &mut column_coset);
     println!("K1 {:0.3?}", br1_time.elapsed());
 
+    reverse_order_points(&mut K1);
     // K = [K0, K1] // 2*N_POINTS x 1 (512x1 commitments)
-    let mut K: Vec<Point> = (0..2 * N_ROWS).map(|_| Point::zero()).collect();
-    K0.copy_to(&mut K[..N_ROWS]).unwrap();
     K1.copy_to(&mut K[N_ROWS..]).unwrap();
     println!("K {:0.3?}", br1_time.elapsed());
 
-    println!("Branch1 {:0.3?}", br1_time.elapsed());
     assert_eq!(K, get_debug_data_points_proj_xy1_vec("K.csv", 2 * N_ROWS));
+    println!("Branch1 {:0.3?}", br1_time.elapsed());
 
     ////////////////////////////////
     println!("Branch 2");
     ////////////////////////////////
@@ -337,21 +336,28 @@ pub fn alternate_flow() {
     transpose_scalar_matrix(&mut D_transposed.as_device_ptr(), &mut D_in, M_POINTS, N_ROWS);
     transpose_scalar_matrix(&mut D_transposed.as_device_ptr().wrapping_offset((N_ROWS * M_POINTS) as isize), &mut D_rows, M_POINTS, N_ROWS);
 
+    let mut D = unsafe { DeviceBuffer::uninitialized(4 * N_ROWS * M_POINTS).unwrap() };
+    transpose_scalar_matrix(&mut D.as_device_ptr(), &mut D_transposed, N_ROWS, 2 * M_POINTS);
+
+    reverse_order_scalars_batch(&mut D_transposed, 2 * M_POINTS);
     let mut C0 = interpolate_scalars_batch(&mut D_transposed, &mut interpolate_column_domain, 2 * M_POINTS);
     let mut D_cols = evaluate_scalars_on_coset_batch(&mut C0, &mut evaluate_column_domain, 2 * M_POINTS, &mut column_coset);
+    reverse_order_scalars_batch(&mut D_cols, 2 * M_POINTS);
 
-    let mut D = unsafe { DeviceBuffer::uninitialized(4 * N_ROWS * M_POINTS).unwrap() };
-    transpose_scalar_matrix(&mut D.as_device_ptr(), &mut D_transposed, N_ROWS, 2 * M_POINTS);
     transpose_scalar_matrix(&mut D.as_device_ptr().wrapping_offset((2 * N_ROWS * M_POINTS) as isize), &mut D_cols, N_ROWS, 2 * M_POINTS);
 
     let mut D_host_flat: Vec<ScalarField> = (0..4 * N_ROWS * FLOW_SIZE).map(|_| ScalarField::zero()).collect();
     D.copy_to(&mut D_host_flat[..]).unwrap();
-    let D_host_flat = D_host_flat.into_iter().map(|x| { Scalar { s: x } }).collect::<Vec<_>>();
     let D_host = D_host_flat.chunks(2 * M_POINTS).collect::<Vec<_>>();
-    assert_eq!(D_host, get_debug_data_scalars("D.csv", 2 * N_ROWS, 2 * M_POINTS));
 
     println!("Branch2 {:0.3?}", br2_time.elapsed());
-    let D_b4rbo = D_host.clone();
+    debug_assert_eq!(D_host, get_debug_data_scalars("D.csv", 2 * N_ROWS, 2 * M_POINTS));
+
+    reverse_order_scalars_batch(&mut D, 2 * N_ROWS);
+    let mut D_b4rbo_flat: Vec<ScalarField> = (0..4 * N_ROWS * FLOW_SIZE).map(|_| ScalarField::zero()).collect();
+    D.copy_to(&mut D_b4rbo_flat[..]).unwrap();
+    let D_b4rbo = D_b4rbo_flat.chunks(2 * M_POINTS).collect::<Vec<_>>();
+    debug_assert_eq!(D_b4rbo, get_debug_data_scalars("D_b4rbo.csv", 2 * N_ROWS, 2 * M_POINTS));
 
     ////////////////////////////////
     println!("Branch 3");
diff --git a/fast-danksharding/src/lib.rs b/fast-danksharding/src/lib.rs
index 61c1dfe..3140e02 100644
--- a/fast-danksharding/src/lib.rs
+++ b/fast-danksharding/src/lib.rs
@@ -71,7 +71,7 @@ pub fn transpose_scalar_matrix(
     }
 }
 
-fn get_debug_data_scalars(filename: &str, height: usize, lenght: usize) -> Vec<Vec<Scalar>> {
+fn get_debug_data_scalars(filename: &str, height: usize, lenght: usize) -> Vec<Vec<ScalarField>> {
     let from_limbs = get_debug_data_scalar_vec(filename);
     let result = split_vec_to_matrix(&from_limbs, lenght);
     assert_eq!(result.len(), height);
@@ -80,27 +80,17 @@ fn get_debug_data_scalars(filename: &str, height: usize, lenght: usize) -> Vec<Vec<Scalar>> {
 
-fn get_debug_data_scalar_vec(filename: &str) -> Vec<Scalar> {
-    let limbs = csv_be_to_u32_be_limbs(
-        &format!("{}{}", get_test_set_path(), filename),
-        SCALAR_LIMBS,
-    );
-    let from_limbs = from_limbs(limbs, SCALAR_LIMBS, Scalar::from_limbs_be);
-    from_limbs
+fn field_from_limbs_be(a: &[u32]) -> ScalarField {
+    let mut a_mut = a.to_vec();
+    a_mut.reverse();
+    ScalarField::from_limbs(&a_mut)
 }
 
-fn get_debug_data_scalar_field_vec(filename: &str) -> Vec<ScalarField> {
+fn get_debug_data_scalar_vec(filename: &str) -> Vec<ScalarField> {
     let limbs = csv_be_to_u32_be_limbs(
         &format!("{}{}", get_test_set_path(), filename),
         SCALAR_LIMBS,
     );
-
-    fn field_from_limbs_be(a: &[u32]) -> ScalarField {
-        let mut a_mut = a.to_vec();
-        a_mut.reverse();
-        ScalarField::from_limbs(&a_mut)
-    }
-
     let from_limbs = from_limbs(limbs, SCALAR_LIMBS, field_from_limbs_be);
     from_limbs
 }
@@ -198,8 +188,8 @@ mod tests {
         *,
     };
 
-    fn scalar_to_ark_mod_p(sc: &Scalar) -> Fr {
-        let sc_ark = BigInteger256::new(u32_vec_to_u64_vec(&sc.limbs()).try_into().unwrap());
+    fn scalar_to_ark_mod_p(sc: &ScalarField) -> Fr {
+        let sc_ark = BigInteger256::new(u32_vec_to_u64_vec(&sc.limbs().to_vec()).try_into().unwrap());
         Fr::new(sc_ark)
     }
 
@@ -276,7 +266,7 @@ mod tests {
 
         let limbs = hex_be_to_padded_u32_le_vec(py_str, SCALAR_LIMBS);
 
-        let sc = Scalar::from_limbs_le(&limbs);
+        let sc = ScalarField::from_limbs(&limbs);
 
         let modulus = BigInteger256::new([
             0xffffffff00000001,
@@ -305,7 +295,7 @@ mod tests {
         let limbs = hex_be_to_padded_u32_be_vec(sample_be, SCALAR_LIMBS);
         assert_eq!(limbs.len(), SCALAR_LIMBS);
 
-        let scalar = Scalar::from_limbs_be(&limbs);
+        let scalar = field_from_limbs_be(&limbs);
 
         let sample_u32le = [
             0x3f7d4adeu32,
@@ -324,8 +314,8 @@ mod tests {
     #[test]
     #[allow(non_snake_case)]
     fn test_vec_saclar_mul() {
-        let mut intoo = [Scalar::one(), Scalar::one(), Scalar::zero()];
-        let expected = [Scalar::one(), Scalar::zero(), Scalar::zero()];
+        let mut intoo = [ScalarField::one(), ScalarField::one(), ScalarField::zero()];
+        let expected = [ScalarField::one(), ScalarField::zero(), ScalarField::zero()];
         mult_sc_vec(&mut intoo, &expected, 0);
         assert_eq!(intoo, expected);
     }
@@ -356,7 +346,7 @@ mod tests {
     };
 
     let mut inout = [dummy_one, dummy_one, Point::zero()];
-    let scalars = [Scalar::one(), Scalar::zero(), Scalar::zero()];
+    let scalars = [ScalarField::one(), ScalarField::zero(), ScalarField::zero()];
     let expected = [
         Point::zero(),
         Point {
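A recurring theme in this commit is the pairing of reverse_order_scalars_batch / reverse_order_points with the interpolate and evaluate kernels. This is consistent with an NTT whose input or output is in bit-reversed order, as in standard decimation-in-time FFTs: data has to be permuted on one side of each transform before the result is meaningful in natural order. For reference, a host-side bit-reversal permutation for power-of-two lengths (illustrative; the ICICLE helpers above do the equivalent on device buffers):

    // Reorder `data` (length 2^log_n) so the element at index i moves to the
    // index whose log_n-bit binary representation is reversed.
    fn reverse_bit_order<T: Copy>(data: &mut [T], log_n: u32) {
        let n = data.len();
        debug_assert_eq!(n, 1usize << log_n);
        if log_n == 0 {
            return;
        }
        for i in 0..n {
            let j = ((i as u32).reverse_bits() >> (32 - log_n)) as usize;
            if i < j {
                data.swap(i, j); // swap once per pair
            }
        }
    }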
From 6a5753c34ad38e8579faf3f7773826636018d072 Mon Sep 17 00:00:00 2001
From: ingo_deploy
Date: Fri, 5 May 2023 16:05:11 +0300
Subject: [PATCH 3/3] Third stage now fully utilizing new ICICLE API

---
 fast-danksharding/src/cuda/lib.cu          |  64 +++++++---
 fast-danksharding/src/fast_danksharding.rs | 112 +++++++--------------
 fast-danksharding/src/lib.rs               |  25 ++++-
 kzg_data_availability/tests.py             |   4 +-
 4 files changed, 105 insertions(+), 100 deletions(-)

diff --git a/fast-danksharding/src/cuda/lib.cu b/fast-danksharding/src/cuda/lib.cu
index a9626e2..297910f 100644
--- a/fast-danksharding/src/cuda/lib.cu
+++ b/fast-danksharding/src/cuda/lib.cu
@@ -3,6 +3,7 @@
 
 const int TILE_DIM = 32;
 const int BLOCK_ROWS = 8;
+const int MAX_THREAD_NUM = 256;
 
 template <typename P>
 void point_sum(P *h_outputs, P *h_inputs, unsigned nof_rows, unsigned nof_cols, unsigned l);
@@ -52,7 +53,7 @@ void point_sum(P* h_outputs, P* h_inputs, unsigned nof_rows, unsigned nof_cols, unsigned l)
   cudaFree(d_outputs);
 }
 
-extern "C" int sum_of_points(projective_t *out, projective_t in[], size_t nof_rows, size_t nof_cols, size_t l, size_t device_id = 0)
+extern "C" int sum_of_points(projective_t *out, projective_t in[], size_t nof_rows, size_t nof_cols, size_t l)
 {
   try
   {
@@ -64,6 +65,38 @@ extern "C" int sum_of_points(projective_t *out, projective_t in[], size_t nof_rows, size_t nof_cols, size_t l)
   {
     printf("error %s", ex.what()); // TODO: error code and message
     // out->z = 0; //TODO: .set_infinity()
+    return -1;
   }
 }
+
+template <typename T>
+__global__ void shift_kernel(T *arr, unsigned nof_rows, unsigned nof_cols_div_2)
+{
+  // printf("block id: %d; thread id: %d \n", blockIdx.x, threadIdx.x);
+  unsigned id = blockIdx.x * MAX_THREAD_NUM + threadIdx.x;
+
+  if (id < nof_rows * nof_cols_div_2) {
+    unsigned col_id = id % nof_cols_div_2;
+    unsigned row_id = id / nof_cols_div_2;
+    arr[row_id * 2 * nof_cols_div_2 + col_id] = arr[(2 * row_id + 1) * nof_cols_div_2 + col_id];
+    arr[(2 * row_id + 1) * nof_cols_div_2 + col_id] = T::zero();
+  }
+}
+
+extern "C" int shift_batch(projective_t *arr, size_t nof_rows, size_t nof_cols_div_2)
+{
+  try
+  {
+    int thread_num = MAX_THREAD_NUM;
+    int block_num = (nof_rows * nof_cols_div_2) / thread_num;
+    shift_kernel <<<block_num, thread_num>>> (arr, nof_rows, nof_cols_div_2);
+
+    return CUDA_SUCCESS;
+  }
+  catch (const std::runtime_error &ex)
+  {
+    printf("error %s", ex.what()); // TODO: error code and message
+    return -1;
+  }
+}
 
@@ -71,26 +104,26 @@
 // the shared-memory version of matrix transpose taken from here: https://developer.nvidia.com/blog/efficient-matrix-transpose-cuda-cc/
 template <typename T>
 __global__ void transpose_kernel(T *odata, const T *idata)
 {
-    __shared__ T tile[TILE_DIM][TILE_DIM+1];
-
-    int x = blockIdx.x * TILE_DIM + threadIdx.x;
-    int y = blockIdx.y * TILE_DIM + threadIdx.y;
-    int width = gridDim.x * TILE_DIM;
-    int height = gridDim.y * TILE_DIM;
+  __shared__ T tile[TILE_DIM][TILE_DIM+1];
+
+  int x = blockIdx.x * TILE_DIM + threadIdx.x;
+  int y = blockIdx.y * TILE_DIM + threadIdx.y;
+  int width = gridDim.x * TILE_DIM;
+  int height = gridDim.y * TILE_DIM;
 
-    for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
-        tile[threadIdx.y+j][threadIdx.x] = idata[(y+j)*width + x];
+  for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
+    tile[threadIdx.y+j][threadIdx.x] = idata[(y+j)*width + x];
 
-    __syncthreads();
+  __syncthreads();
 
-    x = blockIdx.y * TILE_DIM + threadIdx.x; // transpose block offset
-    y = blockIdx.x * TILE_DIM + threadIdx.y;
+  x = blockIdx.y * TILE_DIM + threadIdx.x; // transpose block offset
+  y = blockIdx.x * TILE_DIM + threadIdx.y;
 
-    for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
-        odata[(y+j)*height + x] = tile[threadIdx.x][threadIdx.y+j];
+  for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
+    odata[(y+j)*height + x] = tile[threadIdx.x][threadIdx.y+j];
 }
 
-extern "C" int transpose_matrix(scalar_field_t *out, scalar_field_t *in, size_t nof_rows, size_t nof_cols, size_t device_id = 0)
+extern "C" int transpose_matrix(scalar_field_t *out, scalar_field_t *in, size_t nof_rows, size_t nof_cols)
 {
   try
   {
@@ -103,5 +136,6 @@
   catch (const std::runtime_error &ex)
   {
     printf("error %s", ex.what()); // TODO: error code and message
+    return -1;
   }
 }
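shift_batch treats arr as nof_rows rows of length 2 * nof_cols_div_2 and, within each row, moves the second half into the first half and zeroes the second half in place — the "delta0 << 256" step of the flow, previously done on the host, now done on the GPU. Note that block_num is computed by integer division, so the launch assumes nof_rows * nof_cols_div_2 is a multiple of MAX_THREAD_NUM = 256, which holds for the power-of-two sizes used here. A host-side reference of the same permutation (illustrative):

    // Per row of length 2*half: first half := second half; second half := zero.
    // Mirrors shift_kernel, with the zero element passed in explicitly.
    fn shift_ref<T: Copy>(arr: &mut [T], nof_rows: usize, half: usize, zero: T) {
        debug_assert_eq!(arr.len(), nof_rows * 2 * half);
        for row in 0..nof_rows {
            let base = row * 2 * half;
            for col in 0..half {
                arr[base + col] = arr[base + half + col];
                arr[base + half + col] = zero;
            }
        }
    }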
diff --git a/fast-danksharding/src/fast_danksharding.rs b/fast-danksharding/src/fast_danksharding.rs
index 7ea9228..d543b80 100644
--- a/fast-danksharding/src/fast_danksharding.rs
+++ b/fast-danksharding/src/fast_danksharding.rs
@@ -6,14 +6,14 @@ use icicle_utils::{field::Point, *};
 use crate::{matrix::*, utils::*, *};
 
 pub const FLOW_SIZE: usize = 1 << 12; //4096 //prod flow size
-pub const TEST_SIZE_DIV: usize = 1; //TODO: Prod size / test size for speedup
-pub const TEST_SIZE: usize = FLOW_SIZE / TEST_SIZE_DIV; //test flow size
-pub const M_POINTS: usize = TEST_SIZE;
-pub const LOG_M_POINTS: usize = 12;
+pub const LOG_TEST_SIZE_DIV: usize = 3; //TODO: Prod size / test size for speedup
+pub const TEST_SIZE_DIV: usize = 1 << LOG_TEST_SIZE_DIV; //TODO: Prod size / test size for speedup
+pub const M_POINTS: usize = FLOW_SIZE / TEST_SIZE_DIV; //test flow size
+pub const LOG_M_POINTS: usize = 12 - LOG_TEST_SIZE_DIV;
 pub const SRS_SIZE: usize = M_POINTS;
 pub const S_GROUP_SIZE: usize = 2 * M_POINTS;
 pub const N_ROWS: usize = 256 / TEST_SIZE_DIV;
-pub const LOG_N_ROWS: usize = 8;
+pub const LOG_N_ROWS: usize = 8 - LOG_TEST_SIZE_DIV;
 pub const FOLD_SIZE: usize = 512 / TEST_SIZE_DIV;
 
 //TODO: the casing is to match diagram
@@ -255,7 +255,7 @@ pub fn main_flow() {
     );
 
     assert_ne!(P[12][23], Point::zero()); //dummy check
-    println!("success !!!",);
+    println!("success !!!");
 }
 
 #[allow(non_snake_case)]
 #[allow(non_upper_case_globals)]
 pub fn alternate_flow() {
     let D_in_host = get_debug_data_scalar_vec("D_in.csv");
     let SRS_host = get_debug_data_points_proj_xy1_vec("SRS.csv", M_POINTS);
     //TODO: now S is preprocessed, copy preprocessing here
-    let S = get_debug_data_points_proj_xy1_vec("S.csv", 2 * M_POINTS);
+    let S_host = get_debug_data_points_proj_xy1_vec("S.csv", 2 * M_POINTS);
 
-    let mut q_ = Vec::<Vec<Point>>::new();
     const l: usize = 16;
     println!("loaded test data, processing...");
@@ -278,6 +277,8 @@ pub fn alternate_flow() {
     let mut evaluate_row_domain = build_domain(M_POINTS, LOG_M_POINTS, false);
     let mut interpolate_column_domain = build_domain(N_ROWS, LOG_N_ROWS, true);
     let mut evaluate_column_domain = build_domain(N_ROWS, LOG_N_ROWS, false);
+    let mut interpolate_column_large_domain = build_domain(2 * N_ROWS, LOG_N_ROWS + 1, true);
+    let mut evaluate_column_large_domain = build_domain(2 * N_ROWS, LOG_N_ROWS + 1, false);
     // build cosets (i.e. powers of roots of unity `w` and `v`)
     let mut row_coset = build_domain(M_POINTS, LOG_M_POINTS + 1, false);
     let mut column_coset = build_domain(N_ROWS, LOG_N_ROWS + 1, false);
@@ -285,13 +286,17 @@ pub fn alternate_flow() {
     let mut D_in = DeviceBuffer::from_slice(&D_in_host[..]).unwrap();
     // transfer the SRS into device memory
     debug_assert!(SRS_host[0].to_ark_affine().is_on_curve());
-    let s_affine: Vec<_> = vec![SRS_host.iter().map(|p| p.to_xy_strip_z()).collect::<Vec<_>>(); N_ROWS].concat();
-    let mut SRS = DeviceBuffer::from_slice(&s_affine[..]).unwrap();
+    let SRS_affine: Vec<_> = vec![SRS_host.iter().map(|p| p.to_xy_strip_z()).collect::<Vec<_>>(); N_ROWS].concat();
+    let mut SRS = DeviceBuffer::from_slice(&SRS_affine[..]).unwrap();
+    // transfer S into device memory after suitable bit-reversal
+    let S_host_rbo = list_to_reverse_bit_order(&S_host[..])[..].chunks(l).map(|chunk| list_to_reverse_bit_order(chunk)).collect::<Vec<_>>().concat();
+    let S_affine: Vec<_> = vec![S_host_rbo.iter().map(|p| p.to_xy_strip_z()).collect::<Vec<_>>(); 2 * N_ROWS].concat();
+    let mut S = DeviceBuffer::from_slice(&S_affine[..]).unwrap();
 
     println!("pre-computation {:0.3?}", pre_time.elapsed());
 
-    reverse_order_scalars_batch(&mut D_in, N_ROWS);
     //C_rows = INTT_rows(D_in)
+    reverse_order_scalars_batch(&mut D_in, N_ROWS);
     let mut C_rows = interpolate_scalars_batch(&mut D_in, &mut interpolate_row_domain, N_ROWS);
@@ -303,7 +308,7 @@ pub fn alternate_flow() {
 
     // K0 = MSM_rows(C_rows) (256x1)
     let mut K0 = commit_batch(&mut SRS, &mut C_rows, N_ROWS);
-    let mut K: Vec<Point> = (0..2 * N_ROWS).map(|_| Point::zero()).collect();
+    let mut K = vec![Point::zero(); 2 * N_ROWS];
     K0.copy_to(&mut K[..N_ROWS]).unwrap();
     println!("K0 {:0.3?}", br1_time.elapsed());
@@ -346,94 +351,45 @@ pub fn alternate_flow() {
     transpose_scalar_matrix(&mut D.as_device_ptr().wrapping_offset((2 * N_ROWS * M_POINTS) as isize), &mut D_cols, N_ROWS, 2 * M_POINTS);
 
-    let mut D_host_flat: Vec<ScalarField> = (0..4 * N_ROWS * FLOW_SIZE).map(|_| ScalarField::zero()).collect();
+    let mut D_host_flat = vec![ScalarField::zero(); 4 * N_ROWS * M_POINTS];
     D.copy_to(&mut D_host_flat[..]).unwrap();
     let D_host = D_host_flat.chunks(2 * M_POINTS).collect::<Vec<_>>();
 
     println!("Branch2 {:0.3?}", br2_time.elapsed());
     debug_assert_eq!(D_host, get_debug_data_scalars("D.csv", 2 * N_ROWS, 2 * M_POINTS));
 
-    reverse_order_scalars_batch(&mut D, 2 * N_ROWS);
-    let mut D_b4rbo_flat: Vec<ScalarField> = (0..4 * N_ROWS * FLOW_SIZE).map(|_| ScalarField::zero()).collect();
-    D.copy_to(&mut D_b4rbo_flat[..]).unwrap();
-    let D_b4rbo = D_b4rbo_flat.chunks(2 * M_POINTS).collect::<Vec<_>>();
-    debug_assert_eq!(D_b4rbo, get_debug_data_scalars("D_b4rbo.csv", 2 * N_ROWS, 2 * M_POINTS));
-
     ////////////////////////////////
     println!("Branch 3");
     ////////////////////////////////
     let br3_time = Instant::now();
 
-    //d0 = MUL_row(d[mu], [S]) 1x8192
-    let d0: Vec<_> = (0..2 * N_ROWS)
-        .map(|i| {
-            let mut s = S.clone();
-            multp_vec(&mut s, &D_b4rbo[i], 0);
-            s
-        })
-        .collect();
-    debug_assert_eq!(
-        d0,
-        get_debug_data_points_xy1("d0.csv", 2 * N_ROWS, 2 * M_POINTS)
-    );
+    //d1 = MSM_batch(D[i], [S], l) 1x8192
+    reverse_order_scalars_batch(&mut D, (4 * M_POINTS * N_ROWS) / l);
+    let mut d1 = commit_batch(&mut S, &mut D, (4 * M_POINTS * N_ROWS) / l);
 
-    let mut d1 = vec![Point::infinity(); (2 * N_ROWS) * (2 * M_POINTS / l)];
-    let d0: Vec<_> = d0.into_iter().flatten().collect();
-
-    addp_vec(&mut d1, &d0, 2 * N_ROWS, 2 * M_POINTS, l, 0);
-
-    let d1 = split_vec_to_matrix(&d1, 2 * N_ROWS).clone();
-    debug_assert_eq!(
-        d1,
-        get_debug_data_points_xy1("d1.csv", 2 * N_ROWS, 2 * N_ROWS)
-    );
-
-    let mut delta0: Vec<_> = d1.into_iter().flatten().collect();
-    println!("iecntt batch for delta0");
     //delta0 = ECINTT_row(d1) 1x512
-    iecntt_batch(&mut delta0, 2 * N_ROWS, 0);
-    debug_assert_eq!(
-        delta0,
-        get_debug_data_points_proj_xy1_vec("delta0.csv", 2 * N_ROWS * 2 * N_ROWS)
-    );
-
-    delta0.chunks_mut(2 * N_ROWS).for_each(|delta0_i| {
-        // delta1 = delta0 << 256 1x512
-        let delta1_i = [&delta0_i[N_ROWS..], &vec![Point::infinity(); N_ROWS]].concat();
-        q_.push(delta1_i);
-    });
+    let mut delta0 = interpolate_points_batch(&mut d1, &mut interpolate_column_large_domain, 2 * N_ROWS);
 
-    let mut delta1: Vec<_> = q_.into_iter().flatten().collect();
-
-    println!("ecntt batch for delta1");
-    //q[mu] = ECNTT_row(delta1) 1x512
-    ecntt_batch(&mut delta1, 2 * N_ROWS, 0);
+    // delta0 = delta0 << 256 1x512
+    shift_points_batch(&mut delta0, 2 * N_ROWS);
 
-    let q_ = split_vec_to_matrix(&delta1, 2 * N_ROWS).clone();
-
+    //q[mu] = ECNTT_row(delta0) 1x512
+    let P = evaluate_points_batch(&mut delta0, &mut evaluate_column_large_domain, 2 * N_ROWS);
+    let mut P_host_flat: Vec<Point> = (0..(4 * M_POINTS * N_ROWS) / l).map(|_| Point::zero()).collect();
+    P.copy_to(&mut P_host_flat[..]).unwrap();
+    let P_host = split_vec_to_matrix(&P_host_flat, 2 * N_ROWS).clone();
+
+    //final assertion
     debug_assert_eq!(
-        q_,
-        get_debug_data_points_xy1("q.csv", 2 * N_ROWS, 2 * N_ROWS)
+        P_host,
+        get_debug_data_points_xy1("P.csv", 2 * N_ROWS, 2 * N_ROWS)
     );
 
-    println!("final check");
-
-    let P = q_
-        .iter()
-        .map(|row| list_to_reverse_bit_order(&row.clone()))
-        .collect::<Vec<_>>()
-        .to_vec();
-
-    //final assertion
     println!("Branch3 {:0.3?}", br3_time.elapsed());
 
-    assert_eq!(
-        P,
-        get_debug_data_points_xy1("P.csv", 2 * N_ROWS, 2 * N_ROWS)
-    );
-
-    assert_ne!(P[12][23], Point::zero()); //dummy check
-    println!("success !!!",);
+    assert_ne!(P_host[12][23], Point::zero()); //dummy check
+    println!("success !!!");
 }
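Branch 3 now folds what patch 1 did with multp_vec plus addp_vec into a single batched MSM: D is consumed as (4 * M_POINTS * N_ROWS) / l groups of l consecutive scalars (after batched bit-reversal), each paired with l of the pre-permuted S points, so every output of commit_batch is one size-l MSM — which is why S was stored bit-reversed per row and then per chunk of l during pre-computation. A naive host-side reference of that folding (illustrative; the curve operations are abstracted behind a closure so the sketch does not assume a particular point API):

    // Reference for a batched size-l MSM: out[k] = sum over j < l of
    // scalars[k*l + j] * bases[k*l + j]. `mul_add` maps (acc, s, b) to
    // acc + s*b using whatever scalar/point types the caller supplies.
    fn msm_batch_ref<S: Copy, P: Copy>(
        scalars: &[S],
        bases: &[P],
        l: usize,
        zero: P,
        mul_add: impl Fn(P, S, P) -> P,
    ) -> Vec<P> {
        assert_eq!(scalars.len(), bases.len());
        scalars
            .chunks(l)
            .zip(bases.chunks(l))
            .map(|(sc, bp)| sc.iter().zip(bp).fold(zero, |acc, (s, b)| mul_add(acc, *s, *b)))
            .collect()
    }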
diff --git a/fast-danksharding/src/lib.rs b/fast-danksharding/src/lib.rs
index 3140e02..6107a91 100644
--- a/fast-danksharding/src/lib.rs
+++ b/fast-danksharding/src/lib.rs
@@ -22,7 +22,6 @@ extern "C" {
         nof_rows: usize,
         nof_cols: usize,
         l: usize,
-        device_id: usize,
     ) -> c_int;
 
     fn transpose_matrix(
         out: DevicePointer<ScalarField>,
         input: DevicePointer<ScalarField>,
         nof_rows: usize,
         nof_cols: usize,
-        device_id: usize,
+    ) -> c_int;
+
+    fn shift_batch(
+        arr: DevicePointer<Point>,
+        nof_rows: usize,
+        nof_cols_div_2: usize,
     ) -> c_int;
 }
 
 pub fn addp_vec(
@@ -40,7 +44,7 @@ pub fn addp_vec(
     nof_rows: usize,
     nof_cols: usize,
     l: usize,
-    device_id: usize,
+    _device_id: usize,
 ) -> i32 {
     unsafe {
         sum_of_points(
@@ -49,7 +53,6 @@ pub fn addp_vec(
             nof_rows,
             nof_cols,
             l,
-            device_id,
         )
     }
 }
@@ -66,7 +69,19 @@ pub fn transpose_scalar_matrix(
             input.as_device_ptr(),
             nof_rows,
             nof_cols,
-            0,
+        )
+    }
+}
+
+pub fn shift_points_batch(
+    arr: &mut DeviceBuffer<Point>,
+    batch_size: usize,
+) -> i32 {
+    unsafe {
+        shift_batch(
+            arr.as_device_ptr(),
+            batch_size,
+            arr.len() / (2 * batch_size),
         )
     }
 }
diff --git a/kzg_data_availability/tests.py b/kzg_data_availability/tests.py
index a8cc2d7..ade1569 100644
--- a/kzg_data_availability/tests.py
+++ b/kzg_data_availability/tests.py
@@ -25,8 +25,8 @@ import csv
 
 # 8x reduced size for testing, for full set use n = 512 and m = 4096
-n = 32*8
-m = 512*8
+n = 32
+m = 512
 
 PRIMITIVE_ROOT = 5
 MAX_DEGREE_POLY = MODULUS-1
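The reduced Python sizes line up with the Rust constants introduced earlier in this patch: with LOG_TEST_SIZE_DIV = 3, TEST_SIZE_DIV = 8, so N_ROWS = 256 / 8 = 32 = n and M_POINTS = 4096 / 8 = 512 = m. A compile-time guard along these lines would keep the two test-vector sets in sync (a sketch, not part of the patch; requires a toolchain where assert! is usable in const context, i.e. Rust 1.57+):

    // Fails the build if the Rust test sizes drift from the Python test vectors.
    const _: () = assert!(N_ROWS * TEST_SIZE_DIV == 256);
    const _: () = assert!(M_POINTS * TEST_SIZE_DIV == 4096);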