From eb1fc17f7b936fe2b987c3ca5beed9127aa17201 Mon Sep 17 00:00:00 2001
From: stas <spolonsky@icloud.com>
Date: Tue, 19 Dec 2023 18:43:28 -0500
Subject: [PATCH] multiplication benchmark

---
 benchmarks/multiply/.devcontainer/Dockerfile  |  42 +++++
 .../multiply/.devcontainer/devcontainer.json  |  25 +++
 benchmarks/multiply/CMakeLists.txt            |  29 ++++
 benchmarks/multiply/README.md                 |  22 +++
 benchmarks/multiply/benchmark.cu              | 145 ++++++++++++++++++
 benchmarks/multiply/compile.sh                |  11 ++
 benchmarks/multiply/run.sh                    | 131 ++++++++++++++++
 7 files changed, 405 insertions(+)
 create mode 100644 benchmarks/multiply/.devcontainer/Dockerfile
 create mode 100644 benchmarks/multiply/.devcontainer/devcontainer.json
 create mode 100644 benchmarks/multiply/CMakeLists.txt
 create mode 100644 benchmarks/multiply/README.md
 create mode 100644 benchmarks/multiply/benchmark.cu
 create mode 100755 benchmarks/multiply/compile.sh
 create mode 100755 benchmarks/multiply/run.sh

diff --git a/benchmarks/multiply/.devcontainer/Dockerfile b/benchmarks/multiply/.devcontainer/Dockerfile
new file mode 100644
index 0000000..00002ec
--- /dev/null
+++ b/benchmarks/multiply/.devcontainer/Dockerfile
@@ -0,0 +1,42 @@
+# Make sure NVIDIA Container Toolkit is installed on your host
+
+# Use the specified base image
+FROM nvidia/cuda:12.2.0-devel-ubuntu22.04
+
+# Update and install dependencies
+RUN apt-get update && apt-get install -y \
+    nsight-systems-12.2 \
+    cmake \
+    protobuf-compiler \
+    curl \
+    build-essential \
+    git \
+    libboost-all-dev \
+    jq \
+    postgresql-client \
+    python3-pip \    
+    && rm -rf /var/lib/apt/lists/*
+
+# apt-get install cuda-nsight-systems-12-2
+
+# Clone Icicle from a GitHub repository
+RUN git clone https://github.com/ingonyama-zk/icicle.git  /icicle
+
+# Benchmarking in C++
+RUN git clone https://github.com/google/benchmark.git /opt/benchmark \
+    && cd /opt/benchmark \
+    && cmake -E make_directory "build" \
+    && cmake -DCMAKE_BUILD_TYPE=Release -DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON -S . -B "build" \
+    && cmake --build "build" --config Release \
+    && cmake --build "build" --config Release --target install
+
+
+# Set the working directory in the container
+WORKDIR /icicle-benchmark
+# COPY . .
+# RUN mkdir -p build && \
+#     cmake -S . -B build && \
+#     cmake --build build
+
+# Specify the default command for the container
+CMD ["/bin/bash"]
diff --git a/benchmarks/multiply/.devcontainer/devcontainer.json b/benchmarks/multiply/.devcontainer/devcontainer.json
new file mode 100644
index 0000000..eef8eff
--- /dev/null
+++ b/benchmarks/multiply/.devcontainer/devcontainer.json
@@ -0,0 +1,25 @@
+{
+    "name": "Icicle Benchmarks - multiply",
+    "build": {
+        "dockerfile": "Dockerfile"
+    },
+    "workspaceMount": "source=${localWorkspaceFolder}/.,target=/icicle-benchmark,type=bind",
+    "workspaceFolder": "/icicle-benchmark",
+    "runArgs": [
+        "--gpus",
+        "all"
+    ],
+    "postCreateCommand": [
+        "nvidia-smi"
+	],
+	"customizations": {
+		"vscode": {
+			"extensions": [
+				"ms-vscode.cmake-tools",
+				"ms-python.python",
+				"ms-azuretools.vscode-docker",
+				"ms-vscode.cpptools-extension-pack"
+			]
+		}
+	}
+}
diff --git a/benchmarks/multiply/CMakeLists.txt b/benchmarks/multiply/CMakeLists.txt
new file mode 100644
index 0000000..4e0963f
--- /dev/null
+++ b/benchmarks/multiply/CMakeLists.txt
@@ -0,0 +1,29 @@
+cmake_minimum_required(VERSION 3.18)
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CUDA_STANDARD 17)
+set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
+set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
+if (${CMAKE_VERSION} VERSION_LESS "3.24.0")
+    set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH})
+else()
+    set(CMAKE_CUDA_ARCHITECTURES native) # on 3.24+, on earlier it is ignored, and the target is not passed
+endif ()
+project(zk-benchmarks LANGUAGES CUDA CXX)
+
+#set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
+#set(CMAKE_CUDA_FLAGS_RELEASE "")
+#set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")
+
+include_directories("/icicle")
+include_directories("/opt/benchmark/include")
+
+add_executable(
+  benchmark
+  benchmark.cu
+)
+
+find_library(BENCHMARK_LIBRARY benchmark PATHS /usr/local/lib)
+find_library(NVML_LIBRARY nvidia-ml PATHS /usr/local/cuda/targets/x86_64-linux/lib/stubs/ )
+target_link_libraries(benchmark ${BENCHMARK_LIBRARY} ${NVML_LIBRARY})
+set_target_properties(benchmark PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+
diff --git a/benchmarks/multiply/README.md b/benchmarks/multiply/README.md
new file mode 100644
index 0000000..c43582b
--- /dev/null
+++ b/benchmarks/multiply/README.md
@@ -0,0 +1,22 @@
+# Icicle benchmark: multiply operation
+
+The benchmark measures the runtime of the vector operation $c[i] = a[i] * b[i]^n$, where $n$ is sufficiently large and we can ignore the memory access times.
+
+## Best-Practices
+
+We recommend to run the benchmarks in [ZK-containers](../ZK-containers.md) to save your time and mental energy.
+
+## Targets
+
+We designed the benchmark to estimate how many operations per second a given GPU can sustain.
+
+## Run benchmark
+
+Inside the container,
+
+```sh
+./compile.sh
+./run.sh
+```
+
+
diff --git a/benchmarks/multiply/benchmark.cu b/benchmarks/multiply/benchmark.cu
new file mode 100644
index 0000000..6e59612
--- /dev/null
+++ b/benchmarks/multiply/benchmark.cu
@@ -0,0 +1,145 @@
+#define CURVE_BN254     1
+#define CURVE_BLS12_381 2
+#define CURVE_BLS12_377 3
+
+#define CURVE CURVE_BLS12_377
+
+#include <stdio.h>
+#include <iostream>
+#include <string>
+#include <cuda_runtime.h>
+#include <nvml.h>
+#include </opt/benchmark/include/benchmark/benchmark.h>
+#include "/icicle/icicle/primitives/field.cuh"
+
+#if CURVE == CURVE_BN254
+
+#include "/icicle/icicle/curves/bn254/curve_config.cuh"    
+using namespace BN254;
+const std::string curve = "BN254";
+
+#elif CURVE == CURVE_BLS12_381
+
+#include "/icicle/icicle/curves/bls12_381/curve_config.cuh"
+using namespace BLS12_381;
+const std::string curve = "BLS12-381";
+
+#elif CURVE == CURVE_BLS12_377
+
+#include "/icicle/icicle/curves/bls12_377/curve_config.cuh"
+using namespace BLS12_377;
+const std::string curve = "BLS12-377";
+    
+#endif
+
+
+#include "/icicle/icicle/appUtils/vector_manipulation/ve_mod_mult.cuh"
+
+template <typename S, int N>
+__global__ void vectorMult(S *vec_a, S *vec_b, S *vec_r, size_t n_elments)
+{
+    int tid = blockDim.x * blockIdx.x + threadIdx.x;
+    if (tid < n_elments)
+    {
+        const S b = vec_b[tid];
+        S r = vec_a[tid];
+        // #pragma unroll
+        for (int i = 0; i < N; i++)
+            r = r * b;
+        vec_r[tid] = r;
+    }
+}
+
+template <typename S, int N = 10>
+int vector_mult(S *vec_b, S *vec_a, S *vec_result, size_t n_elments)
+{
+    // Set the grid and block dimensions
+    int num_blocks = (int)ceil((float)n_elments / MAX_THREADS_PER_BLOCK);
+    int threads_per_block = MAX_THREADS_PER_BLOCK;
+
+    // Call the kernel to perform element-wise modular multiplication
+    vectorMult<S, N><<<num_blocks, threads_per_block>>>(vec_a, vec_b, vec_result, n_elments);
+    return 0;
+}
+
+// typedef scalar_t S;
+typedef point_field_t S;
+
+const unsigned nof_mult = 100;
+unsigned nof_elements = 1 << 25;  
+S *vec_a;
+S *vec_b;
+S *d_vec_b;
+S *d_vec_a, *d_result;
+nvmlDevice_t device;
+
+static void BM_mult(benchmark::State& state) {  
+  for (auto _ : state) {
+    vector_mult<S, nof_mult>(d_vec_a, d_vec_b, d_result, nof_elements);
+    cudaDeviceSynchronize();
+  }
+  unsigned int power;
+  nvmlDeviceGetPowerUsage(device, &power);
+  state.counters["PowerUsage"] = int(1.0e-3 * power);
+  unsigned int temperature;
+  nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &temperature);
+  state.counters["Temperature"] = int(temperature);
+}
+
+BENCHMARK(BM_mult)->MinTime(60.);
+
+int main(int argc, char** argv) {
+  cudaDeviceReset();
+  cudaDeviceProp deviceProperties;
+  int deviceId=0;
+  cudaGetDeviceProperties(&deviceProperties, deviceId);
+  std::string gpu_full_name = deviceProperties.name;
+  std::cout << gpu_full_name << std::endl;
+  std::string gpu_name = gpu_full_name;
+  int gpu_clock_mhz = deviceProperties.clockRate/1000.;
+
+  nvmlInit();
+  nvmlDeviceGetHandleByIndex(0, &device);  // for GPU 0
+
+  std::cout << "Setting host data" << std::endl;
+  
+  vec_a = (S*)malloc(sizeof(S) * nof_elements);
+  vec_b = (S*)malloc(sizeof(S) * nof_elements);
+  for (unsigned i = 0; i < (1 << 10); i++) {
+    vec_a[i] = S::rand_host();
+    vec_b[i] = S::rand_host();
+  }
+  for (unsigned i = 1; i < (nof_elements >> 10); i++) {
+    memcpy((void *)(vec_a + (i << 10)), (void *)(vec_a + ((i-1) << 10)), sizeof(S) << 10);
+    memcpy((void *)(vec_b + (i << 10)), (void *)(vec_b + ((i-1) << 10)), sizeof(S) << 10);
+  }
+  // Allocate memory on the device for the input vectors, the output vector, and the modulus
+  std::cout << "Moving data to device" << std::endl;
+  cudaMalloc(&d_vec_a, nof_elements * sizeof(S));
+  cudaMalloc(&d_vec_b, nof_elements * sizeof(S));
+  cudaMalloc(&d_result, nof_elements * sizeof(S));
+
+  // Copy the input vectors and the modulus from the host to the device
+  cudaMemcpy(d_vec_a, vec_a, nof_elements * sizeof(S), cudaMemcpyHostToDevice);
+  cudaMemcpy(d_vec_b, vec_b, nof_elements * sizeof(S), cudaMemcpyHostToDevice);
+  std::cout << "Running benchmark" << std::endl;
+
+  // Run all benchmarks 
+  ::benchmark::Initialize(&argc, argv);
+  if (::benchmark::ReportUnrecognizedArguments(argc, argv)) return 1;
+  ::benchmark::AddCustomContext("team", "Ingonyama");
+  ::benchmark::AddCustomContext("project", "Icicle");
+  ::benchmark::AddCustomContext("runs_on", gpu_name);
+  ::benchmark::AddCustomContext("frequency_MHz", std::to_string(gpu_clock_mhz));
+  ::benchmark::AddCustomContext("uses", curve);
+  ::benchmark::AddCustomContext("comment", "on-device API");
+  ::benchmark::AddCustomContext("operation_factor", std::to_string(nof_mult));
+  ::benchmark::AddCustomContext("vector_size", std::to_string(nof_elements));
+  ::benchmark::RunSpecifiedBenchmarks();
+
+  cudaFree(d_vec_a);
+  cudaFree(d_vec_b);
+  cudaFree(d_result);
+  free(vec_a);
+  free(vec_b);
+}
diff --git a/benchmarks/multiply/compile.sh b/benchmarks/multiply/compile.sh
new file mode 100755
index 0000000..a7ba162
--- /dev/null
+++ b/benchmarks/multiply/compile.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+# Exit immediately on error
+set -e
+
+rm -rf build
+mkdir -p build
+cmake -S . -B build
+cmake --build build
+
+
diff --git a/benchmarks/multiply/run.sh b/benchmarks/multiply/run.sh
new file mode 100755
index 0000000..252350e
--- /dev/null
+++ b/benchmarks/multiply/run.sh
@@ -0,0 +1,131 @@
+#!/bin/bash
+
+# Exit immediately on error
+set -e
+
+DB_HOST=${INGO_BENCHMARKS_DB_HOST}
+DB_PORT=${INGO_BENCHMARKS_DB_PORT}
+DB_NAME=${INGO_BENCHMARKS_DB_NAME}
+DB_USER=${INGO_BENCHMARKS_DB_USER}
+DB_PASS=${INGO_BENCHMARKS_DB_PASS}
+
+
+# testing this Icicle version
+git_id=$(cd /icicle && git rev-parse --short HEAD)
+echo "Icicle GitID: $git_id"
+
+echo "Running the benchmarks and capturing the output in the file benchmark.json"
+/icicle-benchmark/build/benchmark --benchmark_time_unit=s  --benchmark_out_format=json --benchmark_out=benchmark.json
+
+json_data=$(<benchmark.json)
+#echo $json_data
+
+echo "Extracting context..."
+
+team=$(jq -r '.context.team' benchmark.json)
+echo "Team: $team"
+
+project=$(jq -r '.context.project' benchmark.json)
+echo "Project: $project"
+
+runs_on=$(jq -r '.context.runs_on' benchmark.json)
+echo "Runs on: $runs_on"
+
+frequency_MHz=$(jq -r '.context.frequency_MHz' benchmark.json)
+echo "Frequency_MHz: $frequency_MHz"
+
+uses=$(jq -r '.context.uses' benchmark.json)
+echo "Uses: $uses"
+
+operation_factor=$(jq -r '.context.operation_factor' benchmark.json)
+echo "Operation/Memory factor: $operation_factor"
+
+vector_size=$(jq -r '.context.vector_size' benchmark.json)
+echo "Vector Size: $vector_size"
+
+comment=$(jq -r '.context.comment' benchmark.json)
+echo "Comment: $comment"
+
+test_timestamp=$(jq -r '.context.date' benchmark.json)
+#test_timestamp=$(date +"%Y-%m-%d %H:%M:%S")
+echo "Timestamp: $test_timestamp"
+
+echo "Extracting benchmarks..."
+
+nof_benchmarks=$(echo "$json_data" | jq '.benchmarks | length')
+echo "Number of benchmarks: $nof_benchmarks"
+
+for ((nof_benchmark = 0; nof_benchmark < nof_benchmarks; nof_benchmark++)); do
+  echo "Benchmark $nof_benchmark"
+  
+  benchmark=$(jq ".benchmarks[$nof_benchmark]" benchmark.json)
+  # echo $benchmark
+
+  benchmark_name=$(echo "$benchmark" | jq '.name')
+  echo "Complete benchmark name: $benchmark_name"
+
+  # Arguments: the complete benchmark name ends with "/"-separated arguments
+
+    
+  runtime_sec=$(echo "$benchmark" | jq '.real_time')
+  echo "Run time (sec): $runtime_sec"
+    
+  batch_size=$(echo "$benchmark" | jq '.iterations')
+  echo "Batch size: $batch_size"
+
+  power_Watt=$(echo "$benchmark" | jq '.PowerUsage')
+  echo "Power (Watt): $power_Watt"
+
+  chip_temp_C=$(echo "$benchmark" | jq '.Temperature')
+  echo "Chip temperature (C): $chip_temp_C"
+
+  QUERY="INSERT INTO add_benchmark (
+  team,
+  project,
+  test_timestamp,
+  git_id,
+  frequency_MHz,
+  vector_size,
+  operation_factor,
+  batch_size,
+  runtime_sec,
+  power_Watt,
+  chip_temp_C,
+  comment,
+  runs_on,
+  uses
+)
+VALUES (
+  '$team',
+  '$project',
+  '$test_timestamp',
+  '$git_id',
+  $frequency_MHz,
+  $vector_size,
+  $operation_factor,
+  $batch_size,
+  $runtime_sec,
+  $power_Watt,
+  $chip_temp_C,
+  '$comment',
+  (SELECT id FROM hw_platform WHERE device = '$runs_on'),
+  (SELECT id FROM finite_field WHERE name = '$uses')
+);
+"
+  echo "$QUERY" > benchmark.sql
+
+  # Execute the psql command to insert the row into the add_benchmark table
+  # PGPASSWORD=$DB_PASS psql -h $DB_HOST -p $DB_PORT -U $DB_USER -d $DB_NAME -c $QUERY
+
+done
+
+
+#QUERY="SELECT * FROM add_benchmark ORDER BY id DESC LIMIT 10;"
+#PGPASSWORD=$DB_PASS psql -h $DB_HOST -p $DB_PORT -U $DB_USER -d $DB_NAME -c "$QUERY"
+
+
+exit
+
+
+
+