Skip to content

Commit

Permalink
multiplication benchmark
Browse files Browse the repository at this point in the history
  • Loading branch information
svpolonsky committed Dec 19, 2023
1 parent 2c41756 commit eb1fc17
Show file tree
Hide file tree
Showing 7 changed files with 405 additions and 0 deletions.
42 changes: 42 additions & 0 deletions benchmarks/multiply/.devcontainer/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# Development image for the Icicle "multiply" benchmark.
# Make sure NVIDIA Container Toolkit is installed on your host

# Use the specified base image
FROM nvidia/cuda:12.2.0-devel-ubuntu22.04

# Update and install dependencies.
# DEBIAN_FRONTEND=noninteractive is set for this command only, so that package
# configuration prompts (e.g. tzdata) cannot stall an unattended `docker build`.
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \
    nsight-systems-12.2 \
    cmake \
    protobuf-compiler \
    curl \
    build-essential \
    git \
    libboost-all-dev \
    jq \
    postgresql-client \
    python3-pip \
    && rm -rf /var/lib/apt/lists/*

# apt-get install cuda-nsight-systems-12-2

# Clone Icicle from a GitHub repository.
# The benchmark sources include Icicle headers by absolute path under /icicle.
RUN git clone https://github.com/ingonyama-zk/icicle.git /icicle

# Benchmarking in C++: build Google Benchmark from source and install it.
RUN git clone https://github.com/google/benchmark.git /opt/benchmark \
    && cd /opt/benchmark \
    && cmake -E make_directory "build" \
    && cmake -DCMAKE_BUILD_TYPE=Release -DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON -S . -B "build" \
    && cmake --build "build" --config Release \
    && cmake --build "build" --config Release --target install


# Set the working directory in the container
WORKDIR /icicle-benchmark
# COPY . .
# RUN mkdir -p build && \
# cmake -S . -B build && \
# cmake --build build

# Specify the default command for the container
CMD ["/bin/bash"]
25 changes: 25 additions & 0 deletions benchmarks/multiply/.devcontainer/devcontainer.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
{
"name": "Icicle Benchmarks - multiply",
"build": {
"dockerfile": "Dockerfile"
},
"workspaceMount": "source=${localWorkspaceFolder}/.,target=/icicle-benchmark,type=bind",
"workspaceFolder": "/icicle-benchmark",
"runArgs": [
"--gpus",
"all"
],
"postCreateCommand": [
"nvidia-smi"
],
"customizations": {
"vscode": {
"extensions": [
"ms-vscode.cmake-tools",
"ms-python.python",
"ms-azuretools.vscode-docker",
"ms-vscode.cpptools-extension-pack"
]
}
}
}
29 changes: 29 additions & 0 deletions benchmarks/multiply/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Build script for the Icicle "multiply" benchmark executable.
cmake_minimum_required(VERSION 3.18)
# Require C++17 for both host and CUDA translation units.
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
# Select the GPU architecture before project() enables the CUDA language.
# CUDA_ARCH is not defined in this file; on CMake older than 3.24 it has to be
# supplied by the caller (e.g. -DCUDA_ARCH=<arch>).
if (${CMAKE_VERSION} VERSION_LESS "3.24.0")
set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH})
else()
set(CMAKE_CUDA_ARCHITECTURES native) # "native" is understood by CMake 3.24+ only; older versions ignore it and pass no target
endif ()
project(zk-benchmarks LANGUAGES CUDA CXX)

# Optional CUDA flag overrides, kept for reference.
#set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
#set(CMAKE_CUDA_FLAGS_RELEASE "")
#set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")

# Absolute include roots; these match the locations used by the dev-container image.
include_directories("/icicle")
include_directories("/opt/benchmark/include")

# The benchmark is a single CUDA translation unit.
add_executable(
benchmark
benchmark.cu
)

# Locate Google Benchmark and NVML (the latter searched in the CUDA toolkit stubs directory).
find_library(BENCHMARK_LIBRARY benchmark PATHS /usr/local/lib)
find_library(NVML_LIBRARY nvidia-ml PATHS /usr/local/cuda/targets/x86_64-linux/lib/stubs/ )
target_link_libraries(benchmark ${BENCHMARK_LIBRARY} ${NVML_LIBRARY})
set_target_properties(benchmark PROPERTIES CUDA_SEPARABLE_COMPILATION ON)

22 changes: 22 additions & 0 deletions benchmarks/multiply/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Icicle benchmark: multiply operation

The benchmark measures the runtime of the vector operation $c[i] = a[i] \cdot b[i]^n$, where $n$ is chosen large enough that memory access time is negligible compared with the time spent on multiplications.

## Best-Practices

We recommend running the benchmarks in [ZK-containers](../ZK-containers.md) to save time and setup effort.

## Targets

We designed the benchmark to estimate how many operations per second a given GPU can sustain.

## Run benchmark

Inside the container,

```sh
./compile.sh
./run.sh
```


145 changes: 145 additions & 0 deletions benchmarks/multiply/benchmark.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
#define CURVE_BN254 1
#define CURVE_BLS12_381 2
#define CURVE_BLS12_377 3

#define CURVE CURVE_BLS12_377

#include <stdio.h>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <string>
#include <cuda_runtime.h>
#include <nvml.h>
#include </opt/benchmark/include/benchmark/benchmark.h>
#include "/icicle/icicle/primitives/field.cuh"

#if CURVE == CURVE_BN254

#include "/icicle/icicle/curves/bn254/curve_config.cuh"
using namespace BN254;
const std::string curve = "BN254";

#elif CURVE == CURVE_BLS12_381

#include "/icicle/icicle/curves/bls12_381/curve_config.cuh"
using namespace BLS12_381;
const std::string curve = "BLS12-381";

#elif CURVE == CURVE_BLS12_377

#include "/icicle/icicle/curves/bls12_377/curve_config.cuh"
using namespace BLS12_377;
const std::string curve = "BLS12-377";

#endif


#include "/icicle/icicle/appUtils/vector_manipulation/ve_mod_mult.cuh"

/**
 * Element-wise kernel: vec_r[i] = vec_a[i] * vec_b[i]^N.
 *
 * Each thread handles one element. It loads a[i] and b[i] once, performs N
 * dependent multiplications by b[i], and stores the result once.
 *
 * Launch layout: 1-D grid of 1-D blocks with
 * gridDim.x * blockDim.x >= n_elments; surplus threads do nothing.
 * No shared memory is used.
 *
 * @tparam S  Element type; must provide operator* usable in device code.
 * @tparam N  Number of multiplications by vec_b[i] applied to vec_a[i].
 * @param vec_a     Input vector of starting values.
 * @param vec_b     Input vector of repeated multipliers.
 * @param vec_r     Output vector.
 * @param n_elments Number of elements in each vector.
 */
template <typename S, int N>
__global__ void vectorMult(S *vec_a, S *vec_b, S *vec_r, size_t n_elments)
{
  // Widen to size_t before multiplying: blockDim.x * blockIdx.x is a 32-bit
  // product that wraps for very large grids, and a signed int index would turn
  // negative before being compared with the unsigned element count.
  const size_t tid = (size_t)blockDim.x * blockIdx.x + threadIdx.x;
  if (tid < n_elments)
  {
    const S b = vec_b[tid];
    S r = vec_a[tid];
    // #pragma unroll
    for (int i = 0; i < N; i++)
      r = r * b;
    vec_r[tid] = r;
  }
}

/**
 * Host wrapper that launches vectorMult to compute
 * vec_result[i] = vec_a[i] * vec_b[i]^N on the default stream.
 *
 * The launch is asynchronous; the caller must synchronize before reading
 * vec_result or timing the kernel.
 *
 * The first pointer is the starting value and the second the repeated
 * multiplier. The original signature named them in the opposite order, so the
 * operands reached the kernel swapped.
 *
 * @tparam S  Element type (see vectorMult).
 * @tparam N  Number of multiplications per element (default 10).
 * @param vec_a      Device pointer to the starting values.
 * @param vec_b      Device pointer to the repeated multipliers.
 * @param vec_result Device pointer to the output vector.
 * @param n_elments  Number of elements in each vector.
 * @return 0 if the kernel was launched (or there was nothing to do),
 *         non-zero if the launch configuration was rejected.
 */
template <typename S, int N = 10>
int vector_mult(S *vec_a, S *vec_b, S *vec_result, size_t n_elments)
{
  // A zero-block grid is an invalid launch configuration; nothing to compute anyway.
  if (n_elments == 0) return 0;

  // Set the grid and block dimensions.
  // Integer ceil-division: a float-based ceil() loses precision once the
  // element count exceeds 2^24 and can yield one block too few.
  const size_t threads_per_block = MAX_THREADS_PER_BLOCK;
  const size_t num_blocks = (n_elments + threads_per_block - 1) / threads_per_block;

  // gridDim.x is limited to 2^31 - 1 blocks; refuse rather than truncate.
  if (num_blocks > 0x7fffffffu) return 1;

  // Call the kernel to perform element-wise modular multiplication
  vectorMult<S, N><<<(unsigned)num_blocks, (unsigned)threads_per_block>>>(vec_a, vec_b, vec_result, n_elments);

  // Launch-configuration errors are only visible through cudaGetLastError().
  return cudaGetLastError() == cudaSuccess ? 0 : 1;
}

// Field element type exercised by the benchmark.
// Alternative: using S = scalar_t;
using S = point_field_t;

// Multiplications per element; passed to the kernel as its template argument N.
const unsigned nof_mult = 100;
// Number of elements in each vector.
unsigned nof_elements = 1 << 25;

// Host-side input vectors.
S *vec_a;
S *vec_b;

// Device-side input vectors and output vector.
S *d_vec_a;
S *d_vec_b;
S *d_result;

// NVML handle of the GPU queried for power and temperature.
nvmlDevice_t device;

/**
 * Google Benchmark body: each iteration launches the multiply kernel over the
 * global device vectors and waits for it to finish, so the measured time is
 * the kernel's wall-clock duration.
 *
 * After the timed loop, the current GPU power draw and temperature are read
 * through NVML and attached as counters. A counter is only reported when its
 * NVML query succeeds, so a failed query never publishes an uninitialised value.
 *
 * If the launch or the synchronization fails, the benchmark is marked as
 * errored instead of silently timing a kernel that never ran.
 */
static void BM_mult(benchmark::State& state) {
  for (auto _ : state) {
    const int launch_status = vector_mult<S, nof_mult>(d_vec_a, d_vec_b, d_result, nof_elements);
    // Pick up launch-configuration errors first, then asynchronous execution errors.
    cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess) err = cudaDeviceSynchronize();
    if (launch_status != 0 || err != cudaSuccess) {
      state.SkipWithError(err != cudaSuccess ? cudaGetErrorString(err) : "kernel launch failed");
      break;  // required after SkipWithError in a range-based state loop
    }
  }

  unsigned int power = 0;
  if (nvmlDeviceGetPowerUsage(device, &power) == NVML_SUCCESS) {
    // NVML reports milliwatts; publish whole watts.
    state.counters["PowerUsage"] = int(1.0e-3 * power);
  }

  unsigned int temperature = 0;
  if (nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &temperature) == NVML_SUCCESS) {
    state.counters["Temperature"] = int(temperature);
  }
}

// Run for at least 60 seconds so the measurement reflects sustained throughput.
BENCHMARK(BM_mult)->MinTime(60.);

namespace {

// Print a diagnostic and terminate the process if a CUDA runtime call failed.
void bench_require_cuda(cudaError_t status, const char *what)
{
  if (status == cudaSuccess) return;
  fprintf(stderr, "CUDA failure in %s: %s\n", what, cudaGetErrorString(status));
  std::exit(EXIT_FAILURE);
}

// Fill vec[prefix .. n) by repeating vec[0 .. prefix), including a final partial chunk.
void bench_replicate_prefix(S *vec, size_t prefix, size_t n)
{
  for (size_t offset = prefix; offset < n; offset += prefix) {
    const size_t remaining = n - offset;
    const size_t count = remaining < prefix ? remaining : prefix;
    memcpy((void *)(vec + offset), (void *)vec, count * sizeof(S));
  }
}

} // namespace

/**
 * Benchmark driver.
 *
 * Parses Google Benchmark flags, queries GPU 0, prepares host and device
 * input vectors, runs the registered benchmarks, and releases all resources.
 *
 * @return 0 on success, 1 on unrecognized arguments or host allocation
 *         failure. CUDA failures terminate the process with EXIT_FAILURE.
 */
int main(int argc, char** argv) {
  // Parse flags first so --help or a bad argument exits before large buffers are allocated.
  ::benchmark::Initialize(&argc, argv);
  if (::benchmark::ReportUnrecognizedArguments(argc, argv)) return 1;

  bench_require_cuda(cudaDeviceReset(), "cudaDeviceReset");
  cudaDeviceProp deviceProperties;
  const int deviceId = 0;
  bench_require_cuda(cudaGetDeviceProperties(&deviceProperties, deviceId), "cudaGetDeviceProperties");
  const std::string gpu_name = deviceProperties.name;
  std::cout << gpu_name << std::endl;
  const int gpu_clock_mhz = deviceProperties.clockRate / 1000;  // clockRate is divided by 1000 to label it as MHz

  // NVML only feeds the power/temperature counters, so a failure here is not fatal.
  const bool nvml_up = (nvmlInit() == NVML_SUCCESS);
  if (!nvml_up || nvmlDeviceGetHandleByIndex(0, &device) != NVML_SUCCESS) { // for GPU 0
    std::cerr << "Warning: NVML unavailable; power and temperature readings may be missing" << std::endl;
  }

  std::cout << "Setting host data" << std::endl;

  vec_a = (S*)malloc(sizeof(S) * nof_elements);
  vec_b = (S*)malloc(sizeof(S) * nof_elements);
  if (vec_a == nullptr || vec_b == nullptr) {
    fprintf(stderr, "Host allocation of %u elements failed\n", nof_elements);
    free(vec_a);
    free(vec_b);
    if (nvml_up) nvmlShutdown();
    return 1;
  }

  // Random generation is slow on the host: draw at most 1024 random elements
  // per vector and tile them over the rest. Benchmark timing does not depend
  // on the values being distinct.
  const size_t total = nof_elements;
  const size_t prefix = total < 1024 ? total : 1024;
  for (size_t i = 0; i < prefix; i++) {
    vec_a[i] = S::rand_host();
    vec_b[i] = S::rand_host();
  }
  bench_replicate_prefix(vec_a, prefix, total);
  bench_replicate_prefix(vec_b, prefix, total);

  // Allocate memory on the device for the input vectors and the output vector
  std::cout << "Moving data to device" << std::endl;
  const size_t bytes = total * sizeof(S);
  bench_require_cuda(cudaMalloc(&d_vec_a, bytes), "cudaMalloc(d_vec_a)");
  bench_require_cuda(cudaMalloc(&d_vec_b, bytes), "cudaMalloc(d_vec_b)");
  bench_require_cuda(cudaMalloc(&d_result, bytes), "cudaMalloc(d_result)");

  // Copy the input vectors from the host to the device
  bench_require_cuda(cudaMemcpy(d_vec_a, vec_a, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_vec_a)");
  bench_require_cuda(cudaMemcpy(d_vec_b, vec_b, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_vec_b)");
  std::cout << "Running benchmark" << std::endl;

  // Run all benchmarks
  ::benchmark::AddCustomContext("team", "Ingonyama");
  ::benchmark::AddCustomContext("project", "Icicle");
  ::benchmark::AddCustomContext("runs_on", gpu_name);
  ::benchmark::AddCustomContext("frequency_MHz", std::to_string(gpu_clock_mhz));
  ::benchmark::AddCustomContext("uses", curve);
  ::benchmark::AddCustomContext("comment", "on-device API");
  ::benchmark::AddCustomContext("operation_factor", std::to_string(nof_mult));
  ::benchmark::AddCustomContext("vector_size", std::to_string(nof_elements));
  ::benchmark::RunSpecifiedBenchmarks();

  cudaFree(d_vec_a);
  cudaFree(d_vec_b);
  cudaFree(d_result);
  free(vec_a);
  free(vec_b);
  if (nvml_up) nvmlShutdown();
  return 0;
}
11 changes: 11 additions & 0 deletions benchmarks/multiply/compile.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#!/bin/bash

# Configure and build the benchmark from scratch into ./build next to this script.

# Exit immediately on error
set -e

# Work from the script's own directory so that `rm -rf build` cannot remove an
# unrelated `build` directory when the script is invoked from somewhere else.
cd "$(dirname "${BASH_SOURCE[0]}")"

rm -rf build
mkdir -p build
cmake -S . -B build
cmake --build build


Loading

0 comments on commit eb1fc17

Please sign in to comment.