From eb1fc17f7b936fe2b987c3ca5beed9127aa17201 Mon Sep 17 00:00:00 2001 From: stas Date: Tue, 19 Dec 2023 18:43:28 -0500 Subject: [PATCH] multiplication benchmark --- benchmarks/multiply/.devcontainer/Dockerfile | 42 +++++ .../multiply/.devcontainer/devcontainer.json | 25 +++ benchmarks/multiply/CMakeLists.txt | 29 ++++ benchmarks/multiply/README.md | 22 +++ benchmarks/multiply/benchmark.cu | 145 ++++++++++++++++++ benchmarks/multiply/compile.sh | 11 ++ benchmarks/multiply/run.sh | 131 ++++++++++++++++ 7 files changed, 405 insertions(+) create mode 100644 benchmarks/multiply/.devcontainer/Dockerfile create mode 100644 benchmarks/multiply/.devcontainer/devcontainer.json create mode 100644 benchmarks/multiply/CMakeLists.txt create mode 100644 benchmarks/multiply/README.md create mode 100644 benchmarks/multiply/benchmark.cu create mode 100755 benchmarks/multiply/compile.sh create mode 100755 benchmarks/multiply/run.sh diff --git a/benchmarks/multiply/.devcontainer/Dockerfile b/benchmarks/multiply/.devcontainer/Dockerfile new file mode 100644 index 0000000..00002ec --- /dev/null +++ b/benchmarks/multiply/.devcontainer/Dockerfile @@ -0,0 +1,42 @@ +# Make sure NVIDIA Container Toolkit is installed on your host + +# Use the specified base image +FROM nvidia/cuda:12.2.0-devel-ubuntu22.04 + +# Update and install dependencies +RUN apt-get update && apt-get install -y \ + nsight-systems-12.2 \ + cmake \ + protobuf-compiler \ + curl \ + build-essential \ + git \ + libboost-all-dev \ + jq \ + postgresql-client \ + python3-pip \ + && rm -rf /var/lib/apt/lists/* + +# apt-get install cuda-nsight-systems-12-2 + +# Clone Icicle from a GitHub repository +RUN git clone https://github.com/ingonyama-zk/icicle.git /icicle + +# Benchmarking in C++ +RUN git clone https://github.com/google/benchmark.git /opt/benchmark \ + && cd /opt/benchmark \ + && cmake -E make_directory "build" \ + && cmake -DCMAKE_BUILD_TYPE=Release -DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON -S . -B "build" \ + && cmake --build "build" --config Release \ + && cmake --build "build" --config Release --target install + + +# Set the working directory in the container +WORKDIR /icicle-benchmark +# COPY . . +# RUN mkdir -p build && \ +# cmake -S . -B build && \ +# cmake --build build + +# Specify the default command for the container +CMD ["/bin/bash"] diff --git a/benchmarks/multiply/.devcontainer/devcontainer.json b/benchmarks/multiply/.devcontainer/devcontainer.json new file mode 100644 index 0000000..eef8eff --- /dev/null +++ b/benchmarks/multiply/.devcontainer/devcontainer.json @@ -0,0 +1,25 @@ +{ + "name": "Icicle Benchmarks - multiply", + "build": { + "dockerfile": "Dockerfile" + }, + "workspaceMount": "source=${localWorkspaceFolder}/.,target=/icicle-benchmark,type=bind", + "workspaceFolder": "/icicle-benchmark", + "runArgs": [ + "--gpus", + "all" + ], + "postCreateCommand": [ + "nvidia-smi" + ], + "customizations": { + "vscode": { + "extensions": [ + "ms-vscode.cmake-tools", + "ms-python.python", + "ms-azuretools.vscode-docker", + "ms-vscode.cpptools-extension-pack" + ] + } + } +} diff --git a/benchmarks/multiply/CMakeLists.txt b/benchmarks/multiply/CMakeLists.txt new file mode 100644 index 0000000..4e0963f --- /dev/null +++ b/benchmarks/multiply/CMakeLists.txt @@ -0,0 +1,29 @@ +cmake_minimum_required(VERSION 3.18) +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CUDA_STANDARD 17) +set(CMAKE_CUDA_STANDARD_REQUIRED TRUE) +set(CMAKE_CXX_STANDARD_REQUIRED TRUE) +if (${CMAKE_VERSION} VERSION_LESS "3.24.0") + set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH}) +else() + set(CMAKE_CUDA_ARCHITECTURES native) # on 3.24+, on earlier it is ignored, and the target is not passed +endif () +project(zk-benchmarks LANGUAGES CUDA CXX) + +#set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr") +#set(CMAKE_CUDA_FLAGS_RELEASE "") +#set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0") + +include_directories("/icicle") +include_directories("/opt/benchmark/include") + +add_executable( + benchmark + benchmark.cu +) + +find_library(BENCHMARK_LIBRARY benchmark PATHS /usr/local/lib) +find_library(NVML_LIBRARY nvidia-ml PATHS /usr/local/cuda/targets/x86_64-linux/lib/stubs/ ) +target_link_libraries(benchmark ${BENCHMARK_LIBRARY} ${NVML_LIBRARY}) +set_target_properties(benchmark PROPERTIES CUDA_SEPARABLE_COMPILATION ON) + diff --git a/benchmarks/multiply/README.md b/benchmarks/multiply/README.md new file mode 100644 index 0000000..c43582b --- /dev/null +++ b/benchmarks/multiply/README.md @@ -0,0 +1,22 @@ +# Icicle benchmark: multiply operation + +The benchmark measures the runtime of the vector operation $c[i] = a[i] * b[i]^n$, where $n$ is sufficiently large and we can ignore the memory access times. + +## Best-Practices + +We recommend to run the benchmarks in [ZK-containers](../ZK-containers.md) to save your time and mental energy. + +## Targets + +We designed the benchmark to estimate how many operations per second a given GPU can sustain. + +## Run benchmark + +Inside the container, + +```sh +./compile.sh +./run.sh +``` + + diff --git a/benchmarks/multiply/benchmark.cu b/benchmarks/multiply/benchmark.cu new file mode 100644 index 0000000..6e59612 --- /dev/null +++ b/benchmarks/multiply/benchmark.cu @@ -0,0 +1,145 @@ +#define CURVE_BN254 1 +#define CURVE_BLS12_381 2 +#define CURVE_BLS12_377 3 + +#define CURVE CURVE_BLS12_377 + +#include +#include +#include +#include +#include +#include +#include "/icicle/icicle/primitives/field.cuh" + +#if CURVE == CURVE_BN254 + +#include "/icicle/icicle/curves/bn254/curve_config.cuh" +using namespace BN254; +const std::string curve = "BN254"; + +#elif CURVE == CURVE_BLS12_381 + +#include "/icicle/icicle/curves/bls12_381/curve_config.cuh" +using namespace BLS12_381; +const std::string curve = "BLS12-381"; + +#elif CURVE == CURVE_BLS12_377 + +#include "/icicle/icicle/curves/bls12_377/curve_config.cuh" +using namespace BLS12_377; +const std::string curve = "BLS12-377"; + +#endif + + +#include "/icicle/icicle/appUtils/vector_manipulation/ve_mod_mult.cuh" + +template +__global__ void vectorMult(S *vec_a, S *vec_b, S *vec_r, size_t n_elments) +{ + int tid = blockDim.x * blockIdx.x + threadIdx.x; + if (tid < n_elments) + { + const S b = vec_b[tid]; + S r = vec_a[tid]; + // #pragma unroll + for (int i = 0; i < N; i++) + r = r * b; + vec_r[tid] = r; + } +} + +template +int vector_mult(S *vec_b, S *vec_a, S *vec_result, size_t n_elments) +{ + // Set the grid and block dimensions + int num_blocks = (int)ceil((float)n_elments / MAX_THREADS_PER_BLOCK); + int threads_per_block = MAX_THREADS_PER_BLOCK; + + // Call the kernel to perform element-wise modular multiplication + vectorMult<<>>(vec_a, vec_b, vec_result, n_elments); + return 0; +} + +// typedef scalar_t S; +typedef point_field_t S; + +const unsigned nof_mult = 100; +unsigned nof_elements = 1 << 25; +S *vec_a; +S *vec_b; +S *d_vec_b; +S *d_vec_a, *d_result; +nvmlDevice_t device; + +static void BM_mult(benchmark::State& state) { + for (auto _ : state) { + vector_mult(d_vec_a, d_vec_b, d_result, nof_elements); + cudaDeviceSynchronize(); + } + unsigned int power; + nvmlDeviceGetPowerUsage(device, &power); + state.counters["PowerUsage"] = int(1.0e-3 * power); + unsigned int temperature; + nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &temperature); + state.counters["Temperature"] = int(temperature); +} + +BENCHMARK(BM_mult)->MinTime(60.); + +int main(int argc, char** argv) { + cudaDeviceReset(); + cudaDeviceProp deviceProperties; + int deviceId=0; + cudaGetDeviceProperties(&deviceProperties, deviceId); + std::string gpu_full_name = deviceProperties.name; + std::cout << gpu_full_name << std::endl; + std::string gpu_name = gpu_full_name; + int gpu_clock_mhz = deviceProperties.clockRate/1000.; + + nvmlInit(); + nvmlDeviceGetHandleByIndex(0, &device); // for GPU 0 + + std::cout << "Setting host data" << std::endl; + + vec_a = (S*)malloc(sizeof(S) * nof_elements); + vec_b = (S*)malloc(sizeof(S) * nof_elements); + for (unsigned i = 0; i < (1 << 10); i++) { + vec_a[i] = S::rand_host(); + vec_b[i] = S::rand_host(); + } + for (unsigned i = 1; i < (nof_elements >> 10); i++) { + memcpy((void *)(vec_a + (i << 10)), (void *)(vec_a + ((i-1) << 10)), sizeof(S) << 10); + memcpy((void *)(vec_b + (i << 10)), (void *)(vec_b + ((i-1) << 10)), sizeof(S) << 10); + } + // Allocate memory on the device for the input vectors, the output vector, and the modulus + std::cout << "Moving data to device" << std::endl; + cudaMalloc(&d_vec_a, nof_elements * sizeof(S)); + cudaMalloc(&d_vec_b, nof_elements * sizeof(S)); + cudaMalloc(&d_result, nof_elements * sizeof(S)); + + // Copy the input vectors and the modulus from the host to the device + cudaMemcpy(d_vec_a, vec_a, nof_elements * sizeof(S), cudaMemcpyHostToDevice); + cudaMemcpy(d_vec_b, vec_b, nof_elements * sizeof(S), cudaMemcpyHostToDevice); + std::cout << "Running benchmark" << std::endl; + + // Run all benchmarks + ::benchmark::Initialize(&argc, argv); + if (::benchmark::ReportUnrecognizedArguments(argc, argv)) return 1; + ::benchmark::AddCustomContext("team", "Ingonyama"); + ::benchmark::AddCustomContext("project", "Icicle"); + ::benchmark::AddCustomContext("runs_on", gpu_name); + ::benchmark::AddCustomContext("frequency_MHz", std::to_string(gpu_clock_mhz)); + ::benchmark::AddCustomContext("uses", curve); + ::benchmark::AddCustomContext("comment", "on-device API"); + ::benchmark::AddCustomContext("operation_factor", std::to_string(nof_mult)); + ::benchmark::AddCustomContext("vector_size", std::to_string(nof_elements)); + ::benchmark::RunSpecifiedBenchmarks(); + + cudaFree(d_vec_a); + cudaFree(d_vec_b); + cudaFree(d_result); + free(vec_a); + free(vec_b); +} diff --git a/benchmarks/multiply/compile.sh b/benchmarks/multiply/compile.sh new file mode 100755 index 0000000..a7ba162 --- /dev/null +++ b/benchmarks/multiply/compile.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +# Exit immediately on error +set -e + +rm -rf build +mkdir -p build +cmake -S . -B build +cmake --build build + + diff --git a/benchmarks/multiply/run.sh b/benchmarks/multiply/run.sh new file mode 100755 index 0000000..252350e --- /dev/null +++ b/benchmarks/multiply/run.sh @@ -0,0 +1,131 @@ +#!/bin/bash + +# Exit immediately on error +set -e + +DB_HOST=${INGO_BENCHMARKS_DB_HOST} +DB_PORT=${INGO_BENCHMARKS_DB_PORT} +DB_NAME=${INGO_BENCHMARKS_DB_NAME} +DB_USER=${INGO_BENCHMARKS_DB_USER} +DB_PASS=${INGO_BENCHMARKS_DB_PASS} + + +# testing this Icicle version +git_id=$(cd /icicle && git rev-parse --short HEAD) +echo "Icicle GitID: $git_id" + +echo "Running the benchmarks and capturing the output in the file benchmark.json" +/icicle-benchmark/build/benchmark --benchmark_time_unit=s --benchmark_out_format=json --benchmark_out=benchmark.json + +json_data=$( benchmark.sql + + # Execute the psql command to insert the row into the add_benchmark table + # PGPASSWORD=$DB_PASS psql -h $DB_HOST -p $DB_PORT -U $DB_USER -d $DB_NAME -c $QUERY + +done + + +#QUERY="SELECT * FROM add_benchmark ORDER BY id DESC LIMIT 10;" +#PGPASSWORD=$DB_PASS psql -h $DB_HOST -p $DB_PORT -U $DB_USER -d $DB_NAME -c "$QUERY" + + +exit + + + +