Skip to content

Commit

Permalink
add cuda backend for msm
Browse files Browse the repository at this point in the history
  • Loading branch information
ChickenLover committed Dec 12, 2024
1 parent da16082 commit ee0aee5
Show file tree
Hide file tree
Showing 21 changed files with 4,380 additions and 1 deletion.
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,3 @@
**/wrappers/rust/icicle-cuda-runtime/src/bindings.rs
**/build/*
**tar.gz
icicle/backend/cuda
39 changes: 39 additions & 0 deletions icicle/backend/cuda/.clang-format
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# clang-format configuration for the icicle CUDA backend sources.
# 2-space indent, 120-column limit, C++17, Linux-style pointer alignment.
Language: Cpp
AlignAfterOpenBracket: AlwaysBreak
AlignConsecutiveMacros: true
AlignTrailingComments: true
AllowAllParametersOfDeclarationOnNextLine: true
AllowShortBlocksOnASingleLine: true
AllowShortCaseLabelsOnASingleLine: false
AllowShortFunctionsOnASingleLine: All
AllowShortIfStatementsOnASingleLine: true
AlwaysBreakTemplateDeclarations: true
BinPackArguments: true
BinPackParameters: false
BreakBeforeBraces: Custom
BraceWrapping:
AfterClass: true
AfterFunction: true
BreakBeforeBinaryOperators: false
BreakBeforeTernaryOperators: true
ColumnLimit: 120
ContinuationIndentWidth: 2
Cpp11BracedListStyle: true
DisableFormat: false
IndentFunctionDeclarationAfterType: false
IndentWidth: 2
KeepEmptyLinesAtTheStartOfBlocks: false
MaxEmptyLinesToKeep: 1
NamespaceIndentation: All
PointerAlignment: Left
SortIncludes: false
SpaceBeforeAssignmentOperators: true
SpaceBeforeParens: ControlStatements
SpaceInEmptyParentheses: false
SpacesBeforeTrailingComments: 1
SpacesInAngles: false
SpacesInContainerLiterals: false
SpacesInCStyleCastParentheses: false
SpacesInParentheses: false
Standard: c++17
UseTab: Never
35 changes: 35 additions & 0 deletions icicle/backend/cuda/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# Prerequisites
*.d

# Compiled Object files
*.slo
*.lo
*.o
*.obj

# Precompiled Headers
*.gch
*.pch

# Compiled Dynamic libraries
*.so
*.dylib
*.dll

# Fortran module files
*.mod
*.smod

# Compiled Static libraries
*.lai
*.la
*.a
*.lib

# Executables
*.exe
*.out
*.app

# VScode configs
**/.vscode
78 changes: 78 additions & 0 deletions icicle/backend/cuda/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
cmake_minimum_required(VERSION 3.18)

include(cmake/Common.cmake)
# Locate nvcc before project() so enabling the CUDA language succeeds.
find_cuda_compiler()

project(icicle_cuda_backend LANGUAGES CUDA CXX)

set_env()
set_gpu_env()

find_package(CUDAToolkit REQUIRED)
find_package(Threads REQUIRED) # portable replacement for a bare `pthread` link

# Strips a shared library after build in Release configuration to shrink the
# shipped binaries. No-op for other build types.
function(icicle_strip_in_release target)
  if(CMAKE_BUILD_TYPE STREQUAL "Release")
    add_custom_command(TARGET ${target} POST_BUILD
      COMMAND ${CMAKE_STRIP} --strip-unneeded $<TARGET_FILE:${target}>
    )
  endif()
endfunction()

# NOTE: install DESTINATIONs below are relative on purpose — relative paths are
# resolved against the prefix chosen at *install* time (cmake --install
# --prefix, DESTDIR), whereas ${CMAKE_INSTALL_PREFIX}-prefixed absolute paths
# would be frozen at configure time.

# device API library
add_library(icicle_backend_cuda_device SHARED src/cuda_device_api.cu)
target_include_directories(icicle_backend_cuda_device PRIVATE include)
target_include_directories(icicle_backend_cuda_device PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
target_link_libraries(icicle_backend_cuda_device PUBLIC Threads::Threads)

install(TARGETS icicle_backend_cuda_device
  RUNTIME DESTINATION lib/backend/cuda
  LIBRARY DESTINATION lib/backend/cuda
  ARCHIVE DESTINATION lib/backend/cuda)

icicle_strip_in_release(icicle_backend_cuda_device)

# field API library
if(FIELD)
  add_library(icicle_cuda_field SHARED
    src/field/cuda_mont.cu
  )
  target_include_directories(icicle_cuda_field PRIVATE include)
  set_target_properties(icicle_cuda_field PROPERTIES OUTPUT_NAME "icicle_backend_cuda_field_${FIELD}")
  target_include_directories(icicle_cuda_field PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
  # CUDA::cudart is the imported target provided by find_package(CUDAToolkit);
  # the legacy ${CUDA_LIBRARIES} variable comes from the deprecated FindCUDA
  # module and is empty under CUDAToolkit.
  target_link_libraries(icicle_cuda_field PRIVATE icicle_field CUDA::cudart Threads::Threads)

  install(TARGETS icicle_cuda_field
    RUNTIME DESTINATION "lib/backend/${FIELD}/cuda"
    LIBRARY DESTINATION "lib/backend/${FIELD}/cuda"
    ARCHIVE DESTINATION "lib/backend/${FIELD}/cuda")

  icicle_strip_in_release(icicle_cuda_field)
endif() # FIELD

# curve API library
if(CURVE)
  add_library(icicle_cuda_curve SHARED
    src/curve/cuda_mont.cu
  )
  if(MSM)
    target_sources(icicle_cuda_curve PRIVATE src/msm/cuda_msm.cu)
  endif()
  if(G2)
    target_sources(icicle_cuda_curve PRIVATE src/msm/cuda_msm_g2.cu)
  endif()
  target_include_directories(icicle_cuda_curve PRIVATE include)
  set_target_properties(icicle_cuda_curve PROPERTIES OUTPUT_NAME "icicle_backend_cuda_curve_${CURVE}")
  target_include_directories(icicle_cuda_curve PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
  target_link_libraries(icicle_cuda_curve PRIVATE icicle_cuda_field icicle_curve CUDA::cudart Threads::Threads)

  install(TARGETS icicle_cuda_curve
    RUNTIME DESTINATION "lib/backend/${CURVE}/cuda"
    LIBRARY DESTINATION "lib/backend/${CURVE}/cuda"
    ARCHIVE DESTINATION "lib/backend/${CURVE}/cuda")

  icicle_strip_in_release(icicle_cuda_curve)
endif()
75 changes: 75 additions & 0 deletions icicle/backend/cuda/cmake/Common.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
# Sets C++/CUDA language standards and position-independent code for the
# calling scope. PIC is on by default; it can be disabled by setting the
# ICICLE_PIC cache variable or environment variable to "OFF".
function(set_env)
  set(CMAKE_CXX_STANDARD 17 PARENT_SCOPE)
  set(CMAKE_CUDA_STANDARD 17 PARENT_SCOPE)
  set(CMAKE_CUDA_STANDARD_REQUIRED TRUE PARENT_SCOPE)
  set(CMAKE_CXX_STANDARD_REQUIRED TRUE PARENT_SCOPE)

  # Quote both operands of STREQUAL: an unquoted variable whose value happens
  # to name another variable would be dereferenced again (CMP0054 footgun).
  if("$ENV{ICICLE_PIC}" STREQUAL "OFF" OR "${ICICLE_PIC}" STREQUAL "OFF")
    message(WARNING "Note that PIC (position-independent code) is disabled.")
  else()
    set(CMAKE_POSITION_INDEPENDENT_CODE ON PARENT_SCOPE)
  endif()
endfunction()

# Chooses the target CUDA architectures and common CUDA compile flags for the
# calling scope. Architecture priority: user-supplied CUDA_ARCH > CMake
# "native" detection (CMake >= 3.24) > nvidia-smi query > safe fallback (50).
# Each additional architecture increases compilation time and output file size.
function(set_gpu_env)
  if(DEFINED CUDA_ARCH) # user defined arch takes priority
    set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH} PARENT_SCOPE)
  elseif(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.24.0") # otherwise, use native to detect GPU arch
    set(CMAKE_CUDA_ARCHITECTURES native PARENT_SCOPE)
  else()
    find_program(_nvidia_smi "nvidia-smi")

    if(_nvidia_smi)
      # Query one compute capability per installed GPU, e.g. "8.6\n7.5".
      execute_process(
        COMMAND ${_nvidia_smi} --query-gpu=compute_cap --format=csv,noheader
        OUTPUT_VARIABLE GPU_COMPUTE_CAPABILITIES
        OUTPUT_STRIP_TRAILING_WHITESPACE
      )
      # Turn the newline-separated output into a CMake list.
      string(REPLACE "\n" ";" GPU_COMPUTE_CAPABILITIES_LIST "${GPU_COMPUTE_CAPABILITIES}")

      set(CUDA_ARCHITECTURES "")
      foreach(CAPABILITY IN LISTS GPU_COMPUTE_CAPABILITIES_LIST)
        # "8.6" -> "86" to match the CMAKE_CUDA_ARCHITECTURES format.
        string(REPLACE "." "" CAPABILITY "${CAPABILITY}")
        list(APPEND CUDA_ARCHITECTURES "${CAPABILITY}")
      endforeach()

      message(STATUS "Setting CMAKE_CUDA_ARCHITECTURES to: ${CUDA_ARCHITECTURES}")
      set(CMAKE_CUDA_ARCHITECTURES "${CUDA_ARCHITECTURES}" PARENT_SCOPE)
    else()
      # no GPUs found, like on Github CI runners
      message(STATUS "Setting CMAKE_CUDA_ARCHITECTURES to: 50")
      set(CMAKE_CUDA_ARCHITECTURES 50 PARENT_SCOPE) # some safe value
    endif()
  endif()

  # Build up CMAKE_CUDA_FLAGS locally and publish it to the parent ONCE at the
  # end. The original set it with PARENT_SCOPE in two places; since PARENT_SCOPE
  # does not update the local value, the second set silently discarded the
  # --split-compile flag added by the first.
  if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "12.2")
    message(STATUS "Using multi-threaded CUDA compilation.")
    string(APPEND CMAKE_CUDA_FLAGS " --split-compile 0")
  else()
    message(STATUS "Can't use multi-threaded CUDA compilation.")
  endif()
  string(APPEND CMAKE_CUDA_FLAGS " --expt-relaxed-constexpr")
  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS}" PARENT_SCOPE)
  set(CMAKE_CUDA_FLAGS_RELEASE "" PARENT_SCOPE)
  set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -lineinfo" PARENT_SCOPE)
endfunction()

# Locates nvcc and exports it as CMAKE_CUDA_COMPILER in the calling scope.
# Must be called before project()/enable_language(CUDA).
function(find_cuda_compiler)
  # find_program is portable (no dependency on a POSIX `which` binary, works on
  # Windows) and caches the result. The original `which nvcc` also set
  # CMAKE_CUDA_COMPILER to an empty string when nvcc was absent; here we only
  # set it when nvcc was actually found, letting CMake fall back to its own
  # compiler search otherwise.
  find_program(CUDA_COMPILER_PATH nvcc)

  if(CUDA_COMPILER_PATH)
    set(CMAKE_CUDA_COMPILER ${CUDA_COMPILER_PATH} PARENT_SCOPE)
  endif()
endfunction()
55 changes: 55 additions & 0 deletions icicle/backend/cuda/include/cuda_mont.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
#pragma once
#include <cuda.h>
#include <stdexcept>

#include "icicle/errors.h"
#include "icicle/vec_ops.h"
#include "gpu-utils/error_handler.h"

namespace montgomery {
#define MAX_THREADS_PER_BLOCK 256

// Element-wise Montgomery-form conversion: one thread per element.
// The compile-time flag is_into selects to-Montgomery vs from-Montgomery.
template <typename E, bool is_into>
__global__ void MontgomeryKernel(const E* input, int n, E* output)
{
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx >= n) return; // threads past the end of the array do nothing
  output[idx] = is_into ? E::to_montgomery(input[idx]) : E::from_montgomery(input[idx]);
}

// Converts n elements to/from Montgomery form on the GPU, honoring the buffer
// location flags in config. Host-side inputs/outputs are staged through
// temporary device buffers allocated and freed in stream order on
// config.stream. Returns cudaSuccess or the first CUDA error encountered.
// NOTE(review): config.is_async presumably means "caller will synchronize the
// stream later" — confirm against the VecOpsConfig contract.
template <typename E, bool is_into>
cudaError_t ConvertMontgomery(const E* input, size_t n, const VecOpsConfig& config, E* output)
{
  cudaStream_t cuda_stream = reinterpret_cast<cudaStream_t>(config.stream);

  // d_alloc_in/d_alloc_out track buffers allocated here (and owed a free);
  // d_in/d_out always point at the buffers the kernel actually uses.
  E *d_alloc_out = nullptr, *d_alloc_in = nullptr, *d_out;
  const E* d_in;
  if (!config.is_a_on_device) {
    // Input is on the host: stage it into a stream-ordered device buffer.
    CHK_IF_RETURN(cudaMallocAsync(&d_alloc_in, n * sizeof(E), cuda_stream));
    CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_in, input, n * sizeof(E), cudaMemcpyHostToDevice, cuda_stream));
    d_in = d_alloc_in;
  } else {
    d_in = input;
  }

  if (!config.is_result_on_device) {
    // Result is wanted on the host: compute into a scratch device buffer.
    CHK_IF_RETURN(cudaMallocAsync(&d_alloc_out, n * sizeof(E), cuda_stream));
    d_out = d_alloc_out;
  } else {
    d_out = output;
  }

  // One thread per element, rounding the block count up.
  int num_threads = MAX_THREADS_PER_BLOCK;
  int num_blocks = (n + num_threads - 1) / num_threads;
  MontgomeryKernel<E, is_into><<<num_blocks, num_threads, 0, cuda_stream>>>(d_in, n, d_out);

  // Cleanup is queued on the same stream, so it runs only after the kernel:
  // free the staged input, then copy the result back to the host and free
  // the scratch output buffer if either was allocated above.
  if (d_alloc_in) { CHK_IF_RETURN(cudaFreeAsync(d_alloc_in, cuda_stream)); }
  if (d_alloc_out) {
    CHK_IF_RETURN(cudaMemcpyAsync(output, d_out, n * sizeof(E), cudaMemcpyDeviceToHost, cuda_stream));
    CHK_IF_RETURN(cudaFreeAsync(d_out, cuda_stream));
  }
  // Synchronous mode: block until all work queued on the stream has finished.
  if (!config.is_async) return CHK_STICKY(cudaStreamSynchronize(cuda_stream));

  return CHK_LAST();
}

} // namespace montgomery
33 changes: 33 additions & 0 deletions icicle/backend/cuda/include/error_translation.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#pragma once

#include "icicle/errors.h"
#include "cuda_runtime.h"

// Maps a CUDA runtime error code onto the closest eIcicleError value.
// Any code without an explicit mapping collapses to UNKNOWN_ERROR.
static eIcicleError translateCudaError(cudaError_t cudaErr)
{
  if (cudaErr == cudaSuccess) return eIcicleError::SUCCESS;
  if (cudaErr == cudaErrorInvalidDevice) return eIcicleError::INVALID_DEVICE;
  if (cudaErr == cudaErrorMemoryAllocation) return eIcicleError::OUT_OF_MEMORY;
  if (cudaErr == cudaErrorInvalidDevicePointer || cudaErr == cudaErrorInvalidHostPointer)
    return eIcicleError::INVALID_POINTER;
  if (cudaErr == cudaErrorInitializationError || cudaErr == cudaErrorInvalidResourceHandle)
    return eIcicleError::ALLOCATION_FAILED;
  if (cudaErr == cudaErrorInvalidMemcpyDirection) return eIcicleError::COPY_FAILED;
  if (
    cudaErr == cudaErrorSyncDepthExceeded || cudaErr == cudaErrorLaunchTimeout ||
    cudaErr == cudaErrorLaunchIncompatibleTexturing || cudaErr == cudaErrorLaunchFailure)
    return eIcicleError::SYNCHRONIZATION_FAILED;
  if (cudaErr == cudaErrorInvalidValue) return eIcicleError::INVALID_ARGUMENT;
  return eIcicleError::UNKNOWN_ERROR;
}
35 changes: 35 additions & 0 deletions icicle/backend/cuda/include/gpu-utils/device_context.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#pragma once
#ifndef DEVICE_CONTEXT_H
#define DEVICE_CONTEXT_H

#include <cstddef> // std::size_t — do not rely on cuda_runtime.h including it transitively
#include <cuda_runtime.h>

namespace device_context {

// Upper bound on the number of GPUs addressable in one process.
constexpr std::size_t MAX_DEVICES = 32;

/**
 * Properties of the device used in icicle functions.
 */
struct DeviceContext {
  cudaStream_t& stream;  /**< Stream to use. Default value: 0. */
  std::size_t device_id; /**< Index of the currently used GPU. Default value: 0. */
  cudaMemPool_t mempool; /**< Mempool to use. Default value: 0. */
};

/**
 * Return default device context that corresponds to using the default stream
 * (0) of the first GPU, with no explicit mempool. The referenced stream has
 * static storage duration, so the reference held by the returned context
 * stays valid for the lifetime of the program.
 */
inline DeviceContext get_default_device_context() // TODO: naming convention ?
{
  static cudaStream_t default_stream = (cudaStream_t)0;
  return DeviceContext{
    (cudaStream_t&)default_stream, // stream
    0,                             // device_id
    0,                             // mempool
  };
}

} // namespace device_context

#endif
Loading

0 comments on commit ee0aee5

Please sign in to comment.