Skip to content

Commit

Permalink
add cuda backend for msm
Browse files Browse the repository at this point in the history
  • Loading branch information
ChickenLover committed Dec 12, 2024
1 parent da16082 commit ee0aee5
Show file tree
Hide file tree
Showing 21 changed files with 4,380 additions and 1 deletion.
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,3 @@
**/wrappers/rust/icicle-cuda-runtime/src/bindings.rs
**/build/*
**tar.gz
icicle/backend/cuda
39 changes: 39 additions & 0 deletions icicle/backend/cuda/.clang-format
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# clang-format configuration for the icicle CUDA backend sources.
# 2-space indent, 120-column limit, C++17, Linux-style pointer alignment.
Language: Cpp
AlignAfterOpenBracket: AlwaysBreak
AlignConsecutiveMacros: true
AlignTrailingComments: true
AllowAllParametersOfDeclarationOnNextLine: true
AllowShortBlocksOnASingleLine: true
AllowShortCaseLabelsOnASingleLine: false
AllowShortFunctionsOnASingleLine: All
AllowShortIfStatementsOnASingleLine: true
AlwaysBreakTemplateDeclarations: true
BinPackArguments: true
BinPackParameters: false
BreakBeforeBraces: Custom
BraceWrapping:
AfterClass: true
AfterFunction: true
BreakBeforeBinaryOperators: false
BreakBeforeTernaryOperators: true
ColumnLimit: 120
ContinuationIndentWidth: 2
Cpp11BracedListStyle: true
DisableFormat: false
IndentFunctionDeclarationAfterType: false
IndentWidth: 2
KeepEmptyLinesAtTheStartOfBlocks: false
MaxEmptyLinesToKeep: 1
NamespaceIndentation: All
PointerAlignment: Left
SortIncludes: false
SpaceBeforeAssignmentOperators: true
SpaceBeforeParens: ControlStatements
SpaceInEmptyParentheses: false
SpacesBeforeTrailingComments: 1
SpacesInAngles: false
SpacesInContainerLiterals: false
SpacesInCStyleCastParentheses: false
SpacesInParentheses: false
Standard: c++17
UseTab: Never
35 changes: 35 additions & 0 deletions icicle/backend/cuda/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# Prerequisites
*.d

# Compiled Object files
*.slo
*.lo
*.o
*.obj

# Precompiled Headers
*.gch
*.pch

# Compiled Dynamic libraries
*.so
*.dylib
*.dll

# Fortran module files
*.mod
*.smod

# Compiled Static libraries
*.lai
*.la
*.a
*.lib

# Executables
*.exe
*.out
*.app

# VScode configs
**/.vscode
78 changes: 78 additions & 0 deletions icicle/backend/cuda/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
cmake_minimum_required(VERSION 3.18)

include(cmake/Common.cmake)
# Locate nvcc before project() so enabling the CUDA language succeeds.
find_cuda_compiler()

project(icicle_cuda_backend LANGUAGES CUDA CXX)

set_env()
set_gpu_env()

find_package(CUDAToolkit REQUIRED)
find_package(Threads REQUIRED) # portable replacement for a bare `pthread` link

# Strips a shared library after build in Release configuration to shrink the
# shipped binaries. No-op for other build types.
function(icicle_strip_in_release target)
  if(CMAKE_BUILD_TYPE STREQUAL "Release")
    add_custom_command(TARGET ${target} POST_BUILD
      COMMAND ${CMAKE_STRIP} --strip-unneeded $<TARGET_FILE:${target}>
    )
  endif()
endfunction()

# NOTE: install DESTINATIONs below are relative on purpose — relative paths are
# resolved against the prefix chosen at *install* time (cmake --install
# --prefix, DESTDIR), whereas ${CMAKE_INSTALL_PREFIX}-prefixed absolute paths
# would be frozen at configure time.

# device API library
add_library(icicle_backend_cuda_device SHARED src/cuda_device_api.cu)
target_include_directories(icicle_backend_cuda_device PRIVATE include)
target_include_directories(icicle_backend_cuda_device PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
target_link_libraries(icicle_backend_cuda_device PUBLIC Threads::Threads)

install(TARGETS icicle_backend_cuda_device
  RUNTIME DESTINATION lib/backend/cuda
  LIBRARY DESTINATION lib/backend/cuda
  ARCHIVE DESTINATION lib/backend/cuda)

icicle_strip_in_release(icicle_backend_cuda_device)

# field API library
if(FIELD)
  add_library(icicle_cuda_field SHARED
    src/field/cuda_mont.cu
  )
  target_include_directories(icicle_cuda_field PRIVATE include)
  set_target_properties(icicle_cuda_field PROPERTIES OUTPUT_NAME "icicle_backend_cuda_field_${FIELD}")
  target_include_directories(icicle_cuda_field PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
  # CUDA::cudart is the imported target provided by find_package(CUDAToolkit);
  # the legacy ${CUDA_LIBRARIES} variable comes from the deprecated FindCUDA
  # module and is empty under CUDAToolkit.
  target_link_libraries(icicle_cuda_field PRIVATE icicle_field CUDA::cudart Threads::Threads)

  install(TARGETS icicle_cuda_field
    RUNTIME DESTINATION "lib/backend/${FIELD}/cuda"
    LIBRARY DESTINATION "lib/backend/${FIELD}/cuda"
    ARCHIVE DESTINATION "lib/backend/${FIELD}/cuda")

  icicle_strip_in_release(icicle_cuda_field)
endif() # FIELD

# curve API library
if(CURVE)
  add_library(icicle_cuda_curve SHARED
    src/curve/cuda_mont.cu
  )
  if(MSM)
    target_sources(icicle_cuda_curve PRIVATE src/msm/cuda_msm.cu)
  endif()
  if(G2)
    target_sources(icicle_cuda_curve PRIVATE src/msm/cuda_msm_g2.cu)
  endif()
  target_include_directories(icicle_cuda_curve PRIVATE include)
  set_target_properties(icicle_cuda_curve PROPERTIES OUTPUT_NAME "icicle_backend_cuda_curve_${CURVE}")
  target_include_directories(icicle_cuda_curve PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
  target_link_libraries(icicle_cuda_curve PRIVATE icicle_cuda_field icicle_curve CUDA::cudart Threads::Threads)

  install(TARGETS icicle_cuda_curve
    RUNTIME DESTINATION "lib/backend/${CURVE}/cuda"
    LIBRARY DESTINATION "lib/backend/${CURVE}/cuda"
    ARCHIVE DESTINATION "lib/backend/${CURVE}/cuda")

  icicle_strip_in_release(icicle_cuda_curve)
endif()
75 changes: 75 additions & 0 deletions icicle/backend/cuda/cmake/Common.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
# Sets C++/CUDA language standards and position-independent code for the
# calling scope. PIC is on by default; it can be disabled by setting the
# ICICLE_PIC cache variable or environment variable to "OFF".
function(set_env)
  set(CMAKE_CXX_STANDARD 17 PARENT_SCOPE)
  set(CMAKE_CUDA_STANDARD 17 PARENT_SCOPE)
  set(CMAKE_CUDA_STANDARD_REQUIRED TRUE PARENT_SCOPE)
  set(CMAKE_CXX_STANDARD_REQUIRED TRUE PARENT_SCOPE)

  # Quote both operands of STREQUAL: an unquoted variable whose value happens
  # to name another variable would be dereferenced again (CMP0054 footgun).
  if("$ENV{ICICLE_PIC}" STREQUAL "OFF" OR "${ICICLE_PIC}" STREQUAL "OFF")
    message(WARNING "Note that PIC (position-independent code) is disabled.")
  else()
    set(CMAKE_POSITION_INDEPENDENT_CODE ON PARENT_SCOPE)
  endif()
endfunction()

# Chooses the target CUDA architectures and common CUDA compile flags for the
# calling scope. Architecture priority: user-supplied CUDA_ARCH > CMake
# "native" detection (CMake >= 3.24) > nvidia-smi query > safe fallback (50).
# Each additional architecture increases compilation time and output file size.
function(set_gpu_env)
  if(DEFINED CUDA_ARCH) # user defined arch takes priority
    set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH} PARENT_SCOPE)
  elseif(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.24.0") # otherwise, use native to detect GPU arch
    set(CMAKE_CUDA_ARCHITECTURES native PARENT_SCOPE)
  else()
    find_program(_nvidia_smi "nvidia-smi")

    if(_nvidia_smi)
      # Query one compute capability per installed GPU, e.g. "8.6\n7.5".
      execute_process(
        COMMAND ${_nvidia_smi} --query-gpu=compute_cap --format=csv,noheader
        OUTPUT_VARIABLE GPU_COMPUTE_CAPABILITIES
        OUTPUT_STRIP_TRAILING_WHITESPACE
      )
      # Turn the newline-separated output into a CMake list.
      string(REPLACE "\n" ";" GPU_COMPUTE_CAPABILITIES_LIST "${GPU_COMPUTE_CAPABILITIES}")

      set(CUDA_ARCHITECTURES "")
      foreach(CAPABILITY IN LISTS GPU_COMPUTE_CAPABILITIES_LIST)
        # "8.6" -> "86" to match the CMAKE_CUDA_ARCHITECTURES format.
        string(REPLACE "." "" CAPABILITY "${CAPABILITY}")
        list(APPEND CUDA_ARCHITECTURES "${CAPABILITY}")
      endforeach()

      message(STATUS "Setting CMAKE_CUDA_ARCHITECTURES to: ${CUDA_ARCHITECTURES}")
      set(CMAKE_CUDA_ARCHITECTURES "${CUDA_ARCHITECTURES}" PARENT_SCOPE)
    else()
      # no GPUs found, like on Github CI runners
      message(STATUS "Setting CMAKE_CUDA_ARCHITECTURES to: 50")
      set(CMAKE_CUDA_ARCHITECTURES 50 PARENT_SCOPE) # some safe value
    endif()
  endif()

  # Build up CMAKE_CUDA_FLAGS locally and publish it to the parent ONCE at the
  # end. The original set it with PARENT_SCOPE in two places; since PARENT_SCOPE
  # does not update the local value, the second set silently discarded the
  # --split-compile flag added by the first.
  if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "12.2")
    message(STATUS "Using multi-threaded CUDA compilation.")
    string(APPEND CMAKE_CUDA_FLAGS " --split-compile 0")
  else()
    message(STATUS "Can't use multi-threaded CUDA compilation.")
  endif()
  string(APPEND CMAKE_CUDA_FLAGS " --expt-relaxed-constexpr")
  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS}" PARENT_SCOPE)
  set(CMAKE_CUDA_FLAGS_RELEASE "" PARENT_SCOPE)
  set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -lineinfo" PARENT_SCOPE)
endfunction()

# Locates nvcc and exports it as CMAKE_CUDA_COMPILER in the calling scope.
# Must be called before project()/enable_language(CUDA).
function(find_cuda_compiler)
  # find_program is portable (no dependency on a POSIX `which` binary, works on
  # Windows) and caches the result. The original `which nvcc` also set
  # CMAKE_CUDA_COMPILER to an empty string when nvcc was absent; here we only
  # set it when nvcc was actually found, letting CMake fall back to its own
  # compiler search otherwise.
  find_program(CUDA_COMPILER_PATH nvcc)

  if(CUDA_COMPILER_PATH)
    set(CMAKE_CUDA_COMPILER ${CUDA_COMPILER_PATH} PARENT_SCOPE)
  endif()
endfunction()
55 changes: 55 additions & 0 deletions icicle/backend/cuda/include/cuda_mont.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
#pragma once
#include <cuda.h>
#include <stdexcept>

#include "icicle/errors.h"
#include "icicle/vec_ops.h"
#include "gpu-utils/error_handler.h"

namespace montgomery {
#define MAX_THREADS_PER_BLOCK 256

// Element-wise Montgomery-form conversion: one thread per element.
// The compile-time flag is_into selects to-Montgomery vs from-Montgomery.
template <typename E, bool is_into>
__global__ void MontgomeryKernel(const E* input, int n, E* output)
{
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx >= n) return; // threads past the end of the array do nothing
  output[idx] = is_into ? E::to_montgomery(input[idx]) : E::from_montgomery(input[idx]);
}

// Converts n elements to/from Montgomery form on the GPU, honoring the buffer
// location flags in config. Host-side inputs/outputs are staged through
// temporary device buffers allocated and freed in stream order on
// config.stream. Returns cudaSuccess or the first CUDA error encountered.
// NOTE(review): config.is_async presumably means "caller will synchronize the
// stream later" — confirm against the VecOpsConfig contract.
template <typename E, bool is_into>
cudaError_t ConvertMontgomery(const E* input, size_t n, const VecOpsConfig& config, E* output)
{
  cudaStream_t cuda_stream = reinterpret_cast<cudaStream_t>(config.stream);

  // d_alloc_in/d_alloc_out track buffers allocated here (and owed a free);
  // d_in/d_out always point at the buffers the kernel actually uses.
  E *d_alloc_out = nullptr, *d_alloc_in = nullptr, *d_out;
  const E* d_in;
  if (!config.is_a_on_device) {
    // Input is on the host: stage it into a stream-ordered device buffer.
    CHK_IF_RETURN(cudaMallocAsync(&d_alloc_in, n * sizeof(E), cuda_stream));
    CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_in, input, n * sizeof(E), cudaMemcpyHostToDevice, cuda_stream));
    d_in = d_alloc_in;
  } else {
    d_in = input;
  }

  if (!config.is_result_on_device) {
    // Result is wanted on the host: compute into a scratch device buffer.
    CHK_IF_RETURN(cudaMallocAsync(&d_alloc_out, n * sizeof(E), cuda_stream));
    d_out = d_alloc_out;
  } else {
    d_out = output;
  }

  // One thread per element, rounding the block count up.
  int num_threads = MAX_THREADS_PER_BLOCK;
  int num_blocks = (n + num_threads - 1) / num_threads;
  MontgomeryKernel<E, is_into><<<num_blocks, num_threads, 0, cuda_stream>>>(d_in, n, d_out);

  // Cleanup is queued on the same stream, so it runs only after the kernel:
  // free the staged input, then copy the result back to the host and free
  // the scratch output buffer if either was allocated above.
  if (d_alloc_in) { CHK_IF_RETURN(cudaFreeAsync(d_alloc_in, cuda_stream)); }
  if (d_alloc_out) {
    CHK_IF_RETURN(cudaMemcpyAsync(output, d_out, n * sizeof(E), cudaMemcpyDeviceToHost, cuda_stream));
    CHK_IF_RETURN(cudaFreeAsync(d_out, cuda_stream));
  }
  // Synchronous mode: block until all work queued on the stream has finished.
  if (!config.is_async) return CHK_STICKY(cudaStreamSynchronize(cuda_stream));

  return CHK_LAST();
}

} // namespace montgomery
33 changes: 33 additions & 0 deletions icicle/backend/cuda/include/error_translation.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#pragma once

#include "icicle/errors.h"
#include "cuda_runtime.h"

// Maps a CUDA runtime error code onto the closest eIcicleError value.
// Any code without an explicit mapping collapses to UNKNOWN_ERROR.
static eIcicleError translateCudaError(cudaError_t cudaErr)
{
  if (cudaErr == cudaSuccess) return eIcicleError::SUCCESS;
  if (cudaErr == cudaErrorInvalidDevice) return eIcicleError::INVALID_DEVICE;
  if (cudaErr == cudaErrorMemoryAllocation) return eIcicleError::OUT_OF_MEMORY;
  if (cudaErr == cudaErrorInvalidDevicePointer || cudaErr == cudaErrorInvalidHostPointer)
    return eIcicleError::INVALID_POINTER;
  if (cudaErr == cudaErrorInitializationError || cudaErr == cudaErrorInvalidResourceHandle)
    return eIcicleError::ALLOCATION_FAILED;
  if (cudaErr == cudaErrorInvalidMemcpyDirection) return eIcicleError::COPY_FAILED;
  if (
    cudaErr == cudaErrorSyncDepthExceeded || cudaErr == cudaErrorLaunchTimeout ||
    cudaErr == cudaErrorLaunchIncompatibleTexturing || cudaErr == cudaErrorLaunchFailure)
    return eIcicleError::SYNCHRONIZATION_FAILED;
  if (cudaErr == cudaErrorInvalidValue) return eIcicleError::INVALID_ARGUMENT;
  return eIcicleError::UNKNOWN_ERROR;
}
35 changes: 35 additions & 0 deletions icicle/backend/cuda/include/gpu-utils/device_context.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#pragma once
#ifndef DEVICE_CONTEXT_H
#define DEVICE_CONTEXT_H

#include <cstddef> // std::size_t — do not rely on cuda_runtime.h including it transitively
#include <cuda_runtime.h>

namespace device_context {

// Upper bound on the number of GPUs addressable in one process.
constexpr std::size_t MAX_DEVICES = 32;

/**
 * Properties of the device used in icicle functions.
 */
struct DeviceContext {
  cudaStream_t& stream;  /**< Stream to use. Default value: 0. */
  std::size_t device_id; /**< Index of the currently used GPU. Default value: 0. */
  cudaMemPool_t mempool; /**< Mempool to use. Default value: 0. */
};

/**
 * Return default device context that corresponds to using the default stream
 * (0) of the first GPU, with no explicit mempool. The referenced stream has
 * static storage duration, so the reference held by the returned context
 * stays valid for the lifetime of the program.
 */
inline DeviceContext get_default_device_context() // TODO: naming convention ?
{
  static cudaStream_t default_stream = (cudaStream_t)0;
  return DeviceContext{
    (cudaStream_t&)default_stream, // stream
    0,                             // device_id
    0,                             // mempool
  };
}

} // namespace device_context

#endif
Loading

0 comments on commit ee0aee5

Please sign in to comment.