New Danksharding flow #17

Open · wants to merge 3 commits into main

Changes from 1 commit
3 changes: 2 additions & 1 deletion .gitmodules
@@ -1,3 +1,4 @@
[submodule "icicle"]
path = icicle
-url = https://github.com/ingonyama-zk/icicle.git
+url = https://github.com/DmytroTym/icicle/
+branch = new_api
4 changes: 3 additions & 1 deletion fast-danksharding/Cargo.toml
@@ -8,13 +8,15 @@ homepage = "https://www.ingonyama.com"
repository = "https://github.com/ingonyama-zk/fast-danksharding"

[dependencies]
-icicle-utils = { git = "https://github.com/ingonyama-zk/icicle.git" }
+icicle-utils = { git = "https://github.com/DmytroTym/icicle.git", branch = "new_api" }
hex="0.4.3"
ark-std = "0.3.0"
ark-ff = "0.3.0"
ark-poly = "0.3.0"
ark-ec = { version = "0.3.0", features = [ "parallel" ] }
ark-bls12-381 = { version = "0.3.0", optional = true }
+rustacuda = "0.1"
+rustacuda_core = "0.1"

[build-dependencies]
cc = { version = "1.0", features = ["parallel"] }
42 changes: 42 additions & 0 deletions fast-danksharding/src/cuda/lib.cu
@@ -1,6 +1,9 @@
#include "../../../icicle/icicle/curves/curve_config.cuh"
#include <cuda.h>

const int TILE_DIM = 32;
const int BLOCK_ROWS = 8;

template <typename P>
void point_sum(P *h_outputs, P *h_inputs, unsigned nof_rows, unsigned nof_cols, unsigned l);

@@ -63,3 +66,42 @@ extern "C" int sum_of_points(projective_t *out, projective_t in[], size_t nof_ro
// out->z = 0; //TODO: .set_infinity()
}
}

// the shared-memory version of matrix transpose taken from here: https://developer.nvidia.com/blog/efficient-matrix-transpose-cuda-cc/
template <typename T>
__global__ void transpose_kernel(T *odata, const T *idata)
{
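// the +1 padding on the tile's second dimension avoids shared-memory bank conflicts (as explained in the NVIDIA post above)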
__shared__ T tile[TILE_DIM][TILE_DIM+1];

int x = blockIdx.x * TILE_DIM + threadIdx.x;
int y = blockIdx.y * TILE_DIM + threadIdx.y;
int width = gridDim.x * TILE_DIM;
int height = gridDim.y * TILE_DIM;

for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
tile[threadIdx.y+j][threadIdx.x] = idata[(y+j)*width + x];

__syncthreads();

x = blockIdx.y * TILE_DIM + threadIdx.x; // transpose block offset
y = blockIdx.x * TILE_DIM + threadIdx.y;

for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
odata[(y+j)*height + x] = tile[threadIdx.x][threadIdx.y+j];
}

extern "C" int transpose_matrix(scalar_field_t *out, scalar_field_t *in, size_t nof_rows, size_t nof_cols, size_t device_id = 0)
{
try
{
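// note: this grid sizing assumes nof_rows and nof_cols are multiples of TILE_DIM; the kernel does no bounds checking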
dim3 dimGrid(nof_rows / TILE_DIM, nof_cols / TILE_DIM, 1);
dim3 dimBlock(TILE_DIM, BLOCK_ROWS, 1);
transpose_kernel <scalar_t> <<<dimGrid, dimBlock>>> (out, in);
Contributor commented on this line:
> scalar_t ?

Author (DmytroTym) replied:
> Right, changed to scalar_field_t, although why do we have two separate names for the same thing?


return CUDA_SUCCESS;
}
catch (const std::runtime_error &ex)
{
printf("error %s", ex.what()); // TODO: error code and message
}
}
182 changes: 181 additions & 1 deletion fast-danksharding/src/fast_danksharding.rs
@@ -1,5 +1,7 @@
use std::time::Instant;

use rustacuda::prelude::*;
use rustacuda::memory::DevicePointer;
use icicle_utils::{field::Point, *};

use crate::{matrix::*, utils::*, *};
@@ -8,9 +10,11 @@ pub const FLOW_SIZE: usize = 1 << 12; //4096 //prod flow size
pub const TEST_SIZE_DIV: usize = 1; //TODO: Prod size / test size for speedup
pub const TEST_SIZE: usize = FLOW_SIZE / TEST_SIZE_DIV; //test flow size
pub const M_POINTS: usize = TEST_SIZE;
pub const LOG_M_POINTS: usize = 12;
Contributor commented on LOG_M_POINTS:
> seems it won't work for reduced test vector sizes

pub const SRS_SIZE: usize = M_POINTS;
pub const S_GROUP_SIZE: usize = 2 * M_POINTS;
pub const N_ROWS: usize = 256 / TEST_SIZE_DIV;
pub const LOG_N_ROWS: usize = 8;
Contributor commented on LOG_N_ROWS:
> and here

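A minimal sketch of one possible fix for both constants (hypothetical, not in this PR; assumes the sizes remain powers of two):

// derive the logs from the sizes so reduced test vector sizes stay consistent
pub const LOG_M_POINTS: usize = M_POINTS.trailing_zeros() as usize;
pub const LOG_N_ROWS: usize = N_ROWS.trailing_zeros() as usize;
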
pub const FOLD_SIZE: usize = 512 / TEST_SIZE_DIV;

//TODO: the casing is to match diagram
@@ -255,12 +259,188 @@ pub fn main_flow() {
println!("success !!!",);
}

#[allow(non_snake_case)]
#[allow(non_upper_case_globals)]
pub fn alternate_flow() {
let D_in_host = get_debug_data_scalar_field_vec("D_in.csv");
let tf_u = &get_debug_data_scalars("roots_u.csv", 1, N_ROWS)[0];
Contributor commented:
> pls review for unused vars

let SRS_host = get_debug_data_points_proj_xy1_vec("SRS.csv", M_POINTS);
let roots_w = get_debug_data_scalars("roots_w.csv", M_POINTS, 1);
let tf_w = rows_to_cols(&roots_w)[0].to_vec().to_vec();
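// note: tf_u and tf_w are not referenced again in this flow (see the review comment above)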
//TODO: now S is preprocessed, copy preprocessing here
let S = get_debug_data_points_proj_xy1_vec("S.csv", 2 * M_POINTS);

let mut q_ = Vec::<Vec<Point>>::new();
const l: usize = 16;
println!("loaded test data, processing...");

let pre_time = Instant::now();
// set up the device
let _ctx = rustacuda::quick_init();
// build domains (i.e. compute twiddle factors)
let mut interpolate_row_domain = build_domain(M_POINTS, LOG_M_POINTS, true);
let mut evaluate_row_domain = build_domain(M_POINTS, LOG_M_POINTS, false);
let mut interpolate_column_domain = build_domain(N_ROWS, LOG_N_ROWS, true);
let mut evaluate_column_domain = build_domain(N_ROWS, LOG_N_ROWS, false);
// build cosets (i.e. powers of roots of unity `w` and `v`)
let mut row_coset = build_domain(M_POINTS, LOG_M_POINTS + 1, false);
let mut column_coset = build_domain(N_ROWS, LOG_N_ROWS + 1, false);
// transfer `D_in` into device memory
let mut D_in = DeviceBuffer::from_slice(&D_in_host[..]).unwrap();
// transfer the SRS into device memory
debug_assert!(SRS_host[0].to_ark_affine().is_on_curve());
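// the affine SRS is replicated N_ROWS times so that commit_batch below can compute one MSM per row in a single batched call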
let s_affine: Vec<_> = vec![SRS_host.iter().map(|p| p.to_xy_strip_z()).collect::<Vec<_>>(); N_ROWS].concat();
let mut SRS = DeviceBuffer::from_slice(&s_affine[..]).unwrap();

println!("pre-computation {:0.3?}", pre_time.elapsed());

//C_rows = INTT_rows(D_in)
let mut C_rows = interpolate_scalars_batch(&mut D_in, &mut interpolate_row_domain, N_ROWS);

println!("pre-branch {:0.3?}", pre_time.elapsed());

////////////////////////////////
println!("Branch 1");
////////////////////////////////
let br1_time = Instant::now();

// K0 = MSM_rows(C_rows) (256x1)
let mut K0 = commit_batch(&mut SRS, &mut C_rows, N_ROWS);
println!("K0 {:0.3?}", br1_time.elapsed());

// B0 = ECINTT_col(K0) N_POINTS x 1 (256x1)
let mut B0 = interpolate_points(&mut K0, &mut interpolate_column_domain);
println!("B0 {:0.3?}", br1_time.elapsed());

// K1 = ECNTT_col(MUL_col(B0, [1 u u^2 ...])) N_POINTS x 1 (256x1)
let K1 = evaluate_points_on_coset(&mut B0, &mut evaluate_column_domain, &mut column_coset);
println!("K1 {:0.3?}", br1_time.elapsed());

// K = [K0, K1] // 2*N_POINTS x 1 (512x1 commitments)
let mut K: Vec<Point> = (0..2 * N_ROWS).map(|_| Point::zero()).collect();
K0.copy_to(&mut K[..N_ROWS]).unwrap();
K1.copy_to(&mut K[N_ROWS..]).unwrap();
println!("K {:0.3?}", br1_time.elapsed());

println!("Branch1 {:0.3?}", br1_time.elapsed());
assert_eq!(K, get_debug_data_points_proj_xy1_vec("K.csv", 2 * N_ROWS));

////////////////////////////////
println!("Branch 2");
////////////////////////////////
let br2_time = Instant::now();

let mut D_rows = evaluate_scalars_on_coset_batch(&mut C_rows, &mut evaluate_row_domain, N_ROWS, &mut row_coset);
println!("D_both {:0.3?}", br2_time.elapsed());

let mut D_transposed = unsafe { DeviceBuffer::uninitialized(2 * N_ROWS * M_POINTS).unwrap() };
transpose_scalar_matrix(&mut D_transposed.as_device_ptr(), &mut D_in, M_POINTS, N_ROWS);
transpose_scalar_matrix(&mut D_transposed.as_device_ptr().wrapping_offset((N_ROWS * M_POINTS) as isize), &mut D_rows, M_POINTS, N_ROWS);
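// D_transposed now holds transpose(D_in) stacked on transpose(D_rows): 2*M_POINTS rows of length N_ROWS (the columns of D_in, then of D_rows), ready for the batched column INTT below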

let mut C0 = interpolate_scalars_batch(&mut D_transposed, &mut interpolate_column_domain, 2 * M_POINTS);
let mut D_cols = evaluate_scalars_on_coset_batch(&mut C0, &mut evaluate_column_domain, 2 * M_POINTS, &mut column_coset);

let mut D = unsafe { DeviceBuffer::uninitialized(4 * N_ROWS * M_POINTS).unwrap() };
transpose_scalar_matrix(&mut D.as_device_ptr(), &mut D_transposed, N_ROWS, 2 * M_POINTS);
transpose_scalar_matrix(&mut D.as_device_ptr().wrapping_offset((2 * N_ROWS * M_POINTS) as isize), &mut D_cols, N_ROWS, 2 * M_POINTS);
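// D is now the full 2*N_ROWS x 2*M_POINTS extended matrix: the top half is [D_in | D_rows], the bottom half their column-coset extension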

let mut D_host_flat: Vec<ScalarField> = (0..4 * N_ROWS * FLOW_SIZE).map(|_| ScalarField::zero()).collect();
D.copy_to(&mut D_host_flat[..]).unwrap();
let D_host_flat = D_host_flat.into_iter().map(|x| { Scalar { s: x } }).collect::<Vec<_>>();
let D_host = D_host_flat.chunks(2 * M_POINTS).collect::<Vec<_>>();
assert_eq!(D_host, get_debug_data_scalars("D.csv", 2 * N_ROWS, 2 * M_POINTS));

println!("Branch2 {:0.3?}", br2_time.elapsed());
let D_b4rbo = D_host.clone();

////////////////////////////////
println!("Branch 3");
////////////////////////////////
let br3_time = Instant::now();

//d0 = MUL_row(d[mu], [S]) 1x8192
let d0: Vec<_> = (0..2 * N_ROWS)
Contributor commented:
> actually a "batch" version of this loop should be faster, pls look at
>     multp_vec(&mut d0, &D_b4rbo, 0);
> and
>     let mut d0 = vec![S; 2 * N_ROWS].concat();
> an on-device equivalent should be faster

Author (DmytroTym) replied on May 5, 2023:
> The third phase was unfinished in that commit, improved now

(A sketch of this batched variant appears after the d0 loop below.)

.map(|i| {
let mut s = S.clone();
multp_vec(&mut s, &D_b4rbo[i], 0);
s
})
.collect();
debug_assert_eq!(
d0,
get_debug_data_points_xy1("d0.csv", 2 * N_ROWS, 2 * M_POINTS)
);
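
A minimal sketch of the batched variant the reviewer suggests above (hypothetical; assumes multp_vec is purely element-wise, so one call over the flattened data matches the per-row loop):

// hypothetical batched replacement for the d0 loop:
// replicate S once, flatten the scalar matrix, and issue a single multp_vec call
let mut d0_batched = vec![S.clone(); 2 * N_ROWS].concat();
let scalars_flat = D_b4rbo.concat();
multp_vec(&mut d0_batched, &scalars_flat, 0);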

let mut d1 = vec![Point::infinity(); (2 * N_ROWS) * (2 * M_POINTS / l)];
let d0: Vec<_> = d0.into_iter().flatten().collect();

addp_vec(&mut d1, &d0, 2 * N_ROWS, 2 * M_POINTS, l, 0);

let d1 = split_vec_to_matrix(&d1, 2 * N_ROWS).clone();
debug_assert_eq!(
d1,
get_debug_data_points_xy1("d1.csv", 2 * N_ROWS, 2 * N_ROWS)
);

let mut delta0: Vec<_> = d1.into_iter().flatten().collect();
println!("iecntt batch for delta0");
//delta0 = ECINTT_row(d1) 1x512
iecntt_batch(&mut delta0, 2 * N_ROWS, 0);
debug_assert_eq!(
delta0,
get_debug_data_points_proj_xy1_vec("delta0.csv", 2 * N_ROWS * 2 * N_ROWS)
);

delta0.chunks_mut(2 * N_ROWS).for_each(|delta0_i| {
// delta1 = delta0 << 256 1x512
let delta1_i = [&delta0_i[N_ROWS..], &vec![Point::infinity(); N_ROWS]].concat();
q_.push(delta1_i);
});

let mut delta1: Vec<_> = q_.into_iter().flatten().collect();

println!("ecntt batch for delta1");
//q[mu] = ECNTT_row(delta1) 1x512
ecntt_batch(&mut delta1, 2 * N_ROWS, 0);

let q_ = split_vec_to_matrix(&delta1, 2 * N_ROWS).clone();

debug_assert_eq!(
q_,
get_debug_data_points_xy1("q.csv", 2 * N_ROWS, 2 * N_ROWS)
);

println!("final check");

let P = q_
.iter()
.map(|row| list_to_reverse_bit_order(&row.clone()))
.collect::<Vec<_>>()
.to_vec();

//final assertion
println!("Branch3 {:0.3?}", br3_time.elapsed());

assert_eq!(
P,
get_debug_data_points_xy1("P.csv", 2 * N_ROWS, 2 * N_ROWS)
);

assert_ne!(P[12][23], Point::zero()); //dummy check
Contributor commented:
> what about icicle #55, probably a quick fix?

println!("success !!!",);
}

#[cfg(test)]
mod tests {
-use super::main_flow;
+use super::{main_flow, alternate_flow};

#[test]
fn test_main_flow() {
Contributor commented on test_main_flow:
> not sure the current "main_flow" is worth keeping if the new API is correct and faster

Author (DmytroTym) replied on May 5, 2023:
> We can remove the current main_flow if everyone's happy with the new one; I left it for the sake of easier comparison for now


main_flow();
}

#[test]
fn test_alternate_flow() {
alternate_flow();
}
}
44 changes: 44 additions & 0 deletions fast-danksharding/src/lib.rs
Original file line number Diff line number Diff line change
@@ -4,6 +4,9 @@ pub mod utils;

use std::ffi::c_int;

use rustacuda::memory::DevicePointer;
use rustacuda::prelude::DeviceBuffer;

use icicle_utils::field::*;

use crate::{
@@ -21,6 +24,14 @@ extern "C" {
l: usize,
device_id: usize,
) -> c_int;

fn transpose_matrix(
out: DevicePointer<ScalarField>,
input: DevicePointer<ScalarField>,
nof_rows: usize,
nof_cols: usize,
device_id: usize,
) -> c_int;
}

pub fn addp_vec(
@@ -43,6 +54,23 @@
}
}

pub fn transpose_scalar_matrix(
output: &mut DevicePointer<ScalarField>,
input: &mut DeviceBuffer<ScalarField>,
nof_rows: usize,
nof_cols: usize,
) -> i32 {
unsafe {
transpose_matrix(
*output,
input.as_device_ptr(),
nof_rows,
nof_cols,
0,
)
}
}
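
A minimal usage sketch for the new wrapper (hypothetical, not part of this diff; assumes an initialized CUDA context and the ScalarField API used elsewhere in this crate):

// hypothetical example: transpose a 64x32 scalar matrix held in device memory
let _ctx = rustacuda::quick_init().unwrap();
let host: Vec<ScalarField> = (0..64 * 32).map(|_| ScalarField::zero()).collect();
let mut input = DeviceBuffer::from_slice(&host[..]).unwrap();
let mut output = unsafe { DeviceBuffer::<ScalarField>::uninitialized(64 * 32).unwrap() };
let err = transpose_scalar_matrix(&mut output.as_device_ptr(), &mut input, 64, 32);
assert_eq!(err, 0); // 0 == CUDA_SUCCESS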

fn get_debug_data_scalars(filename: &str, height: usize, lenght: usize) -> Vec<Vec<Scalar>> {
let from_limbs = get_debug_data_scalar_vec(filename);
let result = split_vec_to_matrix(&from_limbs, lenght);
@@ -61,6 +89,22 @@ fn get_debug_data_scalar_vec(filename: &str) -> Vec<Scalar> {
from_limbs
}

fn get_debug_data_scalar_field_vec(filename: &str) -> Vec<ScalarField> {
let limbs = csv_be_to_u32_be_limbs(
&format!("{}{}", get_test_set_path(), filename),
SCALAR_LIMBS,
);

fn field_from_limbs_be(a: &[u32]) -> ScalarField {
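// the CSV loader yields big-endian limbs; reverse into the little-endian order ScalarField::from_limbs appears to expect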
let mut a_mut = a.to_vec();
a_mut.reverse();
ScalarField::from_limbs(&a_mut)
}

let from_limbs = from_limbs(limbs, SCALAR_LIMBS, field_from_limbs_be);
from_limbs
}

fn get_test_set_path() -> String {
#[cfg(test)]
let data_root_path = format!("../test_vectors/{}x{}/", N_ROWS, M_POINTS);