multiplication benchmark

ingonyama-zk · Dec 19, 2023 · eb1fc17 · eb1fc17
1 parent 2c41756
commit eb1fc17
Show file tree

Hide file tree

Showing 7 changed files with 405 additions and 0 deletions.
diff --git a/benchmarks/multiply/.devcontainer/Dockerfile b/benchmarks/multiply/.devcontainer/Dockerfile
@@ -0,0 +1,42 @@
+# Make sure NVIDIA Container Toolkit is installed on your host
+
+# Use the specified base image
+FROM nvidia/cuda:12.2.0-devel-ubuntu22.04
+
+# Update and install dependencies
+RUN apt-get update && apt-get install -y \
+    nsight-systems-12.2 \
+    cmake \
+    protobuf-compiler \
+    curl \
+    build-essential \
+    git \
+    libboost-all-dev \
+    jq \
+    postgresql-client \
+    python3-pip \    
+    && rm -rf /var/lib/apt/lists/*
+
+# apt-get install cuda-nsight-systems-12-2
+
+# Clone Icicle from a GitHub repository
+RUN git clone https://github.com/ingonyama-zk/icicle.git  /icicle
+
+# Benchmarking in C++
+RUN git clone https://github.com/google/benchmark.git /opt/benchmark \
+    && cd /opt/benchmark \
+    && cmake -E make_directory "build" \
+    && cmake -DCMAKE_BUILD_TYPE=Release -DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON -S . -B "build" \
+    && cmake --build "build" --config Release \
+    && cmake --build "build" --config Release --target install
+
+
+# Set the working directory in the container
+WORKDIR /icicle-benchmark
+# COPY . .
+# RUN mkdir -p build && \
+#     cmake -S . -B build && \
+#     cmake --build build
+
+# Specify the default command for the container
+CMD ["/bin/bash"]
diff --git a/benchmarks/multiply/.devcontainer/devcontainer.json b/benchmarks/multiply/.devcontainer/devcontainer.json
@@ -0,0 +1,25 @@
+{
+    "name": "Icicle Benchmarks - multiply",
+    "build": {
+        "dockerfile": "Dockerfile"
+    },
+    "workspaceMount": "source=${localWorkspaceFolder}/.,target=/icicle-benchmark,type=bind",
+    "workspaceFolder": "/icicle-benchmark",
+    "runArgs": [
+        "--gpus",
+        "all"
+    ],
+    "postCreateCommand": [
+        "nvidia-smi"
+	],
+	"customizations": {
+		"vscode": {
+			"extensions": [
+				"ms-vscode.cmake-tools",
+				"ms-python.python",
+				"ms-azuretools.vscode-docker",
+				"ms-vscode.cpptools-extension-pack"
+			]
+		}
+	}
+}
diff --git a/benchmarks/multiply/CMakeLists.txt b/benchmarks/multiply/CMakeLists.txt
@@ -0,0 +1,29 @@
+cmake_minimum_required(VERSION 3.18)
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CUDA_STANDARD 17)
+set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
+set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
+if (${CMAKE_VERSION} VERSION_LESS "3.24.0")
+    set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH})
+else()
+    set(CMAKE_CUDA_ARCHITECTURES native) # on 3.24+, on earlier it is ignored, and the target is not passed
+endif ()
+project(zk-benchmarks LANGUAGES CUDA CXX)
+
+#set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
+#set(CMAKE_CUDA_FLAGS_RELEASE "")
+#set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")
+
+include_directories("/icicle")
+include_directories("/opt/benchmark/include")
+
+add_executable(
+  benchmark
+  benchmark.cu
+)
+
+find_library(BENCHMARK_LIBRARY benchmark PATHS /usr/local/lib)
+find_library(NVML_LIBRARY nvidia-ml PATHS /usr/local/cuda/targets/x86_64-linux/lib/stubs/ )
+target_link_libraries(benchmark ${BENCHMARK_LIBRARY} ${NVML_LIBRARY})
+set_target_properties(benchmark PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+
diff --git a/benchmarks/multiply/README.md b/benchmarks/multiply/README.md
@@ -0,0 +1,22 @@
+# Icicle benchmark: multiply operation
+
+The benchmark measures the runtime of the vector operation $c[i] = a[i] * b[i]^n$, where $n$ is sufficiently large and we can ignore the memory access times.
+
+## Best-Practices
+
+We recommend to run the benchmarks in [ZK-containers](../ZK-containers.md) to save your time and mental energy.
+
+## Targets
+
+We designed the benchmark to estimate how many operations per second a given GPU can sustain.
+
+## Run benchmark
+
+Inside the container,
+
+```sh
+./compile.sh
+./run.sh
+```
+
+
diff --git a/benchmarks/multiply/benchmark.cu b/benchmarks/multiply/benchmark.cu
@@ -0,0 +1,145 @@
+#define CURVE_BN254     1
+#define CURVE_BLS12_381 2
+#define CURVE_BLS12_377 3
+
+#define CURVE CURVE_BLS12_377
+
+#include <stdio.h>
+#include <iostream>
+#include <string>
+#include <cuda_runtime.h>
+#include <nvml.h>
+#include </opt/benchmark/include/benchmark/benchmark.h>
+#include "/icicle/icicle/primitives/field.cuh"
+
+#if CURVE == CURVE_BN254
+
+#include "/icicle/icicle/curves/bn254/curve_config.cuh"    
+using namespace BN254;
+const std::string curve = "BN254";
+
+#elif CURVE == CURVE_BLS12_381
+
+#include "/icicle/icicle/curves/bls12_381/curve_config.cuh"
+using namespace BLS12_381;
+const std::string curve = "BLS12-381";
+
+#elif CURVE == CURVE_BLS12_377
+
+#include "/icicle/icicle/curves/bls12_377/curve_config.cuh"
+using namespace BLS12_377;
+const std::string curve = "BLS12-377";
+
+#endif
+
+
+#include "/icicle/icicle/appUtils/vector_manipulation/ve_mod_mult.cuh"
+
+template <typename S, int N>
+__global__ void vectorMult(S *vec_a, S *vec_b, S *vec_r, size_t n_elments)
+{
+    int tid = blockDim.x * blockIdx.x + threadIdx.x;
+    if (tid < n_elments)
+    {
+        const S b = vec_b[tid];
+        S r = vec_a[tid];
+        // #pragma unroll
+        for (int i = 0; i < N; i++)
+            r = r * b;
+        vec_r[tid] = r;
+    }
+}
+
+template <typename S, int N = 10>
+int vector_mult(S *vec_b, S *vec_a, S *vec_result, size_t n_elments)
+{
+    // Set the grid and block dimensions
+    int num_blocks = (int)ceil((float)n_elments / MAX_THREADS_PER_BLOCK);
+    int threads_per_block = MAX_THREADS_PER_BLOCK;
+
+    // Call the kernel to perform element-wise modular multiplication
+    vectorMult<S, N><<<num_blocks, threads_per_block>>>(vec_a, vec_b, vec_result, n_elments);
+    return 0;
+}
+
+// typedef scalar_t S;
+typedef point_field_t S;
+
+const unsigned nof_mult = 100;
+unsigned nof_elements = 1 << 25;  
+S *vec_a;
+S *vec_b;
+S *d_vec_b;
+S *d_vec_a, *d_result;
+nvmlDevice_t device;
+
+static void BM_mult(benchmark::State& state) {  
+  for (auto _ : state) {
+    vector_mult<S, nof_mult>(d_vec_a, d_vec_b, d_result, nof_elements);
+    cudaDeviceSynchronize();
+  }
+  unsigned int power;
+  nvmlDeviceGetPowerUsage(device, &power);
+  state.counters["PowerUsage"] = int(1.0e-3 * power);
+  unsigned int temperature;
+  nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &temperature);
+  state.counters["Temperature"] = int(temperature);
+}
+
+BENCHMARK(BM_mult)->MinTime(60.);
+
+int main(int argc, char** argv) {
+  cudaDeviceReset();
+  cudaDeviceProp deviceProperties;
+  int deviceId=0;
+  cudaGetDeviceProperties(&deviceProperties, deviceId);
+  std::string gpu_full_name = deviceProperties.name;
+  std::cout << gpu_full_name << std::endl;
+  std::string gpu_name = gpu_full_name;
+  int gpu_clock_mhz = deviceProperties.clockRate/1000.;
+
+  nvmlInit();
+  nvmlDeviceGetHandleByIndex(0, &device);  // for GPU 0
+
+  std::cout << "Setting host data" << std::endl;
+
+  vec_a = (S*)malloc(sizeof(S) * nof_elements);
+  vec_b = (S*)malloc(sizeof(S) * nof_elements);
+  for (unsigned i = 0; i < (1 << 10); i++) {
+    vec_a[i] = S::rand_host();
+    vec_b[i] = S::rand_host();
+  }
+  for (unsigned i = 1; i < (nof_elements >> 10); i++) {
+    memcpy((void *)(vec_a + (i << 10)), (void *)(vec_a + ((i-1) << 10)), sizeof(S) << 10);
+    memcpy((void *)(vec_b + (i << 10)), (void *)(vec_b + ((i-1) << 10)), sizeof(S) << 10);
+  }
+  // Allocate memory on the device for the input vectors, the output vector, and the modulus
+  std::cout << "Moving data to device" << std::endl;
+  cudaMalloc(&d_vec_a, nof_elements * sizeof(S));
+  cudaMalloc(&d_vec_b, nof_elements * sizeof(S));
+  cudaMalloc(&d_result, nof_elements * sizeof(S));
+
+  // Copy the input vectors and the modulus from the host to the device
+  cudaMemcpy(d_vec_a, vec_a, nof_elements * sizeof(S), cudaMemcpyHostToDevice);
+  cudaMemcpy(d_vec_b, vec_b, nof_elements * sizeof(S), cudaMemcpyHostToDevice);
+  std::cout << "Running benchmark" << std::endl;
+
+  // Run all benchmarks 
+  ::benchmark::Initialize(&argc, argv);
+  if (::benchmark::ReportUnrecognizedArguments(argc, argv)) return 1;
+  ::benchmark::AddCustomContext("team", "Ingonyama");
+  ::benchmark::AddCustomContext("project", "Icicle");
+  ::benchmark::AddCustomContext("runs_on", gpu_name);
+  ::benchmark::AddCustomContext("frequency_MHz", std::to_string(gpu_clock_mhz));
+  ::benchmark::AddCustomContext("uses", curve);
+  ::benchmark::AddCustomContext("comment", "on-device API");
+  ::benchmark::AddCustomContext("operation_factor", std::to_string(nof_mult));
+  ::benchmark::AddCustomContext("vector_size", std::to_string(nof_elements));
+  ::benchmark::RunSpecifiedBenchmarks();
+
+  cudaFree(d_vec_a);
+  cudaFree(d_vec_b);
+  cudaFree(d_result);
+  free(vec_a);
+  free(vec_b);
+}
diff --git a/benchmarks/multiply/compile.sh b/benchmarks/multiply/compile.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+# Exit immediately on error
+set -e
+
+rm -rf build
+mkdir -p build
+cmake -S . -B build
+cmake --build build
+
+