-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
da16082
commit ee0aee5
Showing
21 changed files
with
4,380 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -19,4 +19,3 @@ | |
**/wrappers/rust/icicle-cuda-runtime/src/bindings.rs | ||
**/build/* | ||
**tar.gz | ||
icicle/backend/cuda |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
Language: Cpp | ||
AlignAfterOpenBracket: AlwaysBreak | ||
AlignConsecutiveMacros: true | ||
AlignTrailingComments: true | ||
AllowAllParametersOfDeclarationOnNextLine: true | ||
AllowShortBlocksOnASingleLine: true | ||
AllowShortCaseLabelsOnASingleLine: false | ||
AllowShortFunctionsOnASingleLine: All | ||
AllowShortIfStatementsOnASingleLine: true | ||
AlwaysBreakTemplateDeclarations: true | ||
BinPackArguments: true | ||
BinPackParameters: false | ||
BreakBeforeBraces: Custom | ||
BraceWrapping: | ||
AfterClass: true | ||
AfterFunction: true | ||
BreakBeforeBinaryOperators: false | ||
BreakBeforeTernaryOperators: true | ||
ColumnLimit: 120 | ||
ContinuationIndentWidth: 2 | ||
Cpp11BracedListStyle: true | ||
DisableFormat: false | ||
IndentFunctionDeclarationAfterType: false | ||
IndentWidth: 2 | ||
KeepEmptyLinesAtTheStartOfBlocks: false | ||
MaxEmptyLinesToKeep: 1 | ||
NamespaceIndentation: All | ||
PointerAlignment: Left | ||
SortIncludes: false | ||
SpaceBeforeAssignmentOperators: true | ||
SpaceBeforeParens: ControlStatements | ||
SpaceInEmptyParentheses: false | ||
SpacesBeforeTrailingComments: 1 | ||
SpacesInAngles: false | ||
SpacesInContainerLiterals: false | ||
SpacesInCStyleCastParentheses: false | ||
SpacesInParentheses: false | ||
Standard: c++17 | ||
UseTab: Never |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
# Prerequisites | ||
*.d | ||
|
||
# Compiled Object files | ||
*.slo | ||
*.lo | ||
*.o | ||
*.obj | ||
|
||
# Precompiled Headers | ||
*.gch | ||
*.pch | ||
|
||
# Compiled Dynamic libraries | ||
*.so | ||
*.dylib | ||
*.dll | ||
|
||
# Fortran module files | ||
*.mod | ||
*.smod | ||
|
||
# Compiled Static libraries | ||
*.lai | ||
*.la | ||
*.a | ||
*.lib | ||
|
||
# Executables | ||
*.exe | ||
*.out | ||
*.app | ||
|
||
# VScode configs | ||
**/.vscode |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
cmake_minimum_required(VERSION 3.18) | ||
|
||
include(cmake/Common.cmake) | ||
find_cuda_compiler() | ||
|
||
project(icicle_cuda_backend LANGUAGES CUDA CXX) | ||
|
||
set_env() | ||
set_gpu_env() | ||
|
||
find_package(CUDAToolkit REQUIRED) | ||
|
||
# device API library | ||
add_library(icicle_backend_cuda_device SHARED src/cuda_device_api.cu) | ||
target_include_directories(icicle_backend_cuda_device PRIVATE include) | ||
target_include_directories(icicle_backend_cuda_device PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) | ||
target_link_libraries(icicle_backend_cuda_device PUBLIC pthread) | ||
|
||
install(TARGETS icicle_backend_cuda_device | ||
RUNTIME DESTINATION ${CMAKE_INSTALL_PREFIX}/lib/backend/cuda | ||
LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/lib/backend/cuda | ||
ARCHIVE DESTINATION ${CMAKE_INSTALL_PREFIX}/lib/backend/cuda) | ||
|
||
if(CMAKE_BUILD_TYPE STREQUAL "Release") | ||
add_custom_command(TARGET icicle_backend_cuda_device POST_BUILD | ||
COMMAND ${CMAKE_STRIP} --strip-unneeded $<TARGET_FILE:icicle_backend_cuda_device> | ||
) | ||
endif() | ||
|
||
# field API library | ||
if (FIELD) | ||
add_library(icicle_cuda_field SHARED | ||
src/field/cuda_mont.cu | ||
) | ||
target_include_directories(icicle_cuda_field PRIVATE include) | ||
set_target_properties(icicle_cuda_field PROPERTIES OUTPUT_NAME "icicle_backend_cuda_field_${FIELD}") | ||
target_include_directories(icicle_cuda_field PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) | ||
target_link_libraries(icicle_cuda_field PRIVATE icicle_field ${CUDA_LIBRARIES} pthread) # Link to CUDA | ||
|
||
install(TARGETS icicle_cuda_field | ||
RUNTIME DESTINATION "${CMAKE_INSTALL_PREFIX}/lib/backend/${FIELD}/cuda" | ||
LIBRARY DESTINATION "${CMAKE_INSTALL_PREFIX}/lib/backend/${FIELD}/cuda" | ||
ARCHIVE DESTINATION "${CMAKE_INSTALL_PREFIX}/lib/backend/${FIELD}/cuda") | ||
|
||
if(CMAKE_BUILD_TYPE STREQUAL "Release") | ||
add_custom_command(TARGET icicle_cuda_field POST_BUILD | ||
COMMAND ${CMAKE_STRIP} --strip-unneeded $<TARGET_FILE:icicle_cuda_field> | ||
) | ||
endif() | ||
endif() # FIELD | ||
|
||
# curve API library | ||
if (CURVE) | ||
add_library(icicle_cuda_curve SHARED | ||
src/curve/cuda_mont.cu | ||
) | ||
if(MSM) | ||
target_sources(icicle_cuda_curve PRIVATE src/msm/cuda_msm.cu) | ||
endif() | ||
if(G2) | ||
target_sources(icicle_cuda_curve PRIVATE src/msm/cuda_msm_g2.cu) | ||
endif() | ||
target_include_directories(icicle_cuda_curve PRIVATE include) | ||
set_target_properties(icicle_cuda_curve PROPERTIES OUTPUT_NAME "icicle_backend_cuda_curve_${CURVE}") | ||
target_include_directories(icicle_cuda_curve PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) | ||
target_link_libraries(icicle_cuda_curve PRIVATE icicle_cuda_field icicle_curve ${CUDA_LIBRARIES} pthread) # Link to CUDA | ||
|
||
install(TARGETS icicle_cuda_curve | ||
RUNTIME DESTINATION "${CMAKE_INSTALL_PREFIX}/lib/backend/${CURVE}/cuda" | ||
LIBRARY DESTINATION "${CMAKE_INSTALL_PREFIX}/lib/backend/${CURVE}/cuda" | ||
ARCHIVE DESTINATION "${CMAKE_INSTALL_PREFIX}/lib/backend/${CURVE}/cuda") | ||
|
||
if(CMAKE_BUILD_TYPE STREQUAL "Release") | ||
add_custom_command(TARGET icicle_cuda_curve POST_BUILD | ||
COMMAND ${CMAKE_STRIP} --strip-unneeded $<TARGET_FILE:icicle_cuda_curve> | ||
) | ||
endif() | ||
endif() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
function(set_env) | ||
set(CMAKE_CXX_STANDARD 17 PARENT_SCOPE) | ||
set(CMAKE_CUDA_STANDARD 17 PARENT_SCOPE) | ||
set(CMAKE_CUDA_STANDARD_REQUIRED TRUE PARENT_SCOPE) | ||
set(CMAKE_CXX_STANDARD_REQUIRED TRUE PARENT_SCOPE) | ||
|
||
if("$ENV{ICICLE_PIC}" STREQUAL "OFF" OR ICICLE_PIC STREQUAL "OFF") | ||
message(WARNING "Note that PIC (position-independent code) is disabled.") | ||
else() | ||
set(CMAKE_POSITION_INDEPENDENT_CODE ON PARENT_SCOPE) | ||
endif() | ||
endfunction() | ||
|
||
function(set_gpu_env) | ||
# add the target cuda architectures | ||
# each additional architecture increases the compilation time and output file size | ||
if(DEFINED CUDA_ARCH) # user defined arch takes priority | ||
set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH} PARENT_SCOPE) | ||
elseif(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.24.0") # otherwise, use native to detect GPU arch | ||
set(CMAKE_CUDA_ARCHITECTURES native PARENT_SCOPE) | ||
else() | ||
find_program(_nvidia_smi "nvidia-smi") | ||
|
||
if(_nvidia_smi) | ||
execute_process( | ||
COMMAND ${_nvidia_smi} --query-gpu=compute_cap --format=csv,noheader | ||
OUTPUT_VARIABLE GPU_COMPUTE_CAPABILITIES | ||
OUTPUT_STRIP_TRAILING_WHITESPACE | ||
) | ||
# Process the output to form the CUDA architectures string | ||
string(REPLACE "\n" ";" GPU_COMPUTE_CAPABILITIES_LIST "${GPU_COMPUTE_CAPABILITIES}") | ||
|
||
set(CUDA_ARCHITECTURES "") | ||
foreach(CAPABILITY ${GPU_COMPUTE_CAPABILITIES_LIST}) | ||
# Remove the dot in compute capability to match CMake format | ||
string(REPLACE "." "" CAPABILITY "${CAPABILITY}") | ||
if(CUDA_ARCHITECTURES) | ||
set(CUDA_ARCHITECTURES "${CUDA_ARCHITECTURES};${CAPABILITY}") | ||
else() | ||
set(CUDA_ARCHITECTURES "${CAPABILITY}") | ||
endif() | ||
endforeach() | ||
|
||
message("Setting CMAKE_CUDA_ARCHITECTURES to: ${CUDA_ARCHITECTURES}") | ||
set(CMAKE_CUDA_ARCHITECTURES "${CUDA_ARCHITECTURES}" PARENT_SCOPE) | ||
else() | ||
# no GPUs found, like on Github CI runners | ||
message("Setting CMAKE_CUDA_ARCHITECTURES to: 50") | ||
set(CMAKE_CUDA_ARCHITECTURES 50 PARENT_SCOPE) # some safe value | ||
endif() | ||
endif() | ||
|
||
# Check CUDA version and, if possible, enable multi-threaded compilation | ||
if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "12.2") | ||
message(STATUS "Using multi-threaded CUDA compilation.") | ||
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --split-compile 0" PARENT_SCOPE) | ||
else() | ||
message(STATUS "Can't use multi-threaded CUDA compilation.") | ||
endif() | ||
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr" PARENT_SCOPE) | ||
set(CMAKE_CUDA_FLAGS_RELEASE "" PARENT_SCOPE) | ||
set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -lineinfo" PARENT_SCOPE) | ||
endfunction() | ||
|
||
function(find_cuda_compiler) | ||
# Find the CUDA compiler | ||
execute_process( | ||
COMMAND which nvcc | ||
OUTPUT_VARIABLE CUDA_COMPILER_PATH | ||
OUTPUT_STRIP_TRAILING_WHITESPACE | ||
) | ||
|
||
# Set the CUDA compiler | ||
set(CMAKE_CUDA_COMPILER ${CUDA_COMPILER_PATH} PARENT_SCOPE) | ||
endfunction() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
#pragma once | ||
#include <cuda.h> | ||
#include <stdexcept> | ||
|
||
#include "icicle/errors.h" | ||
#include "icicle/vec_ops.h" | ||
#include "gpu-utils/error_handler.h" | ||
|
||
namespace montgomery { | ||
#define MAX_THREADS_PER_BLOCK 256 | ||
|
||
template <typename E, bool is_into> | ||
__global__ void MontgomeryKernel(const E* input, int n, E* output) | ||
{ | ||
int tid = blockIdx.x * blockDim.x + threadIdx.x; | ||
if (tid < n) { output[tid] = is_into ? E::to_montgomery(input[tid]) : E::from_montgomery(input[tid]); } | ||
} | ||
|
||
template <typename E, bool is_into> | ||
cudaError_t ConvertMontgomery(const E* input, size_t n, const VecOpsConfig& config, E* output) | ||
{ | ||
cudaStream_t cuda_stream = reinterpret_cast<cudaStream_t>(config.stream); | ||
|
||
E *d_alloc_out = nullptr, *d_alloc_in = nullptr, *d_out; | ||
const E* d_in; | ||
if (!config.is_a_on_device) { | ||
CHK_IF_RETURN(cudaMallocAsync(&d_alloc_in, n * sizeof(E), cuda_stream)); | ||
CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_in, input, n * sizeof(E), cudaMemcpyHostToDevice, cuda_stream)); | ||
d_in = d_alloc_in; | ||
} else { | ||
d_in = input; | ||
} | ||
|
||
if (!config.is_result_on_device) { | ||
CHK_IF_RETURN(cudaMallocAsync(&d_alloc_out, n * sizeof(E), cuda_stream)); | ||
d_out = d_alloc_out; | ||
} else { | ||
d_out = output; | ||
} | ||
|
||
int num_threads = MAX_THREADS_PER_BLOCK; | ||
int num_blocks = (n + num_threads - 1) / num_threads; | ||
MontgomeryKernel<E, is_into><<<num_blocks, num_threads, 0, cuda_stream>>>(d_in, n, d_out); | ||
|
||
if (d_alloc_in) { CHK_IF_RETURN(cudaFreeAsync(d_alloc_in, cuda_stream)); } | ||
if (d_alloc_out) { | ||
CHK_IF_RETURN(cudaMemcpyAsync(output, d_out, n * sizeof(E), cudaMemcpyDeviceToHost, cuda_stream)); | ||
CHK_IF_RETURN(cudaFreeAsync(d_out, cuda_stream)); | ||
} | ||
if (!config.is_async) return CHK_STICKY(cudaStreamSynchronize(cuda_stream)); | ||
|
||
return CHK_LAST(); | ||
} | ||
|
||
} // namespace montgomery |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
#pragma once | ||
|
||
#include "icicle/errors.h" | ||
#include "cuda_runtime.h" | ||
|
||
static eIcicleError translateCudaError(cudaError_t cudaErr) | ||
{ | ||
switch (cudaErr) { | ||
case cudaSuccess: | ||
return eIcicleError::SUCCESS; | ||
case cudaErrorInvalidDevice: | ||
return eIcicleError::INVALID_DEVICE; | ||
case cudaErrorMemoryAllocation: | ||
return eIcicleError::OUT_OF_MEMORY; | ||
case cudaErrorInvalidDevicePointer: | ||
case cudaErrorInvalidHostPointer: | ||
return eIcicleError::INVALID_POINTER; | ||
case cudaErrorInitializationError: | ||
case cudaErrorInvalidResourceHandle: | ||
return eIcicleError::ALLOCATION_FAILED; | ||
case cudaErrorInvalidMemcpyDirection: | ||
return eIcicleError::COPY_FAILED; | ||
case cudaErrorSyncDepthExceeded: | ||
case cudaErrorLaunchTimeout: | ||
case cudaErrorLaunchIncompatibleTexturing: | ||
case cudaErrorLaunchFailure: | ||
return eIcicleError::SYNCHRONIZATION_FAILED; | ||
case cudaErrorInvalidValue: | ||
return eIcicleError::INVALID_ARGUMENT; | ||
default: | ||
return eIcicleError::UNKNOWN_ERROR; | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
#pragma once | ||
#ifndef DEVICE_CONTEXT_H | ||
#define DEVICE_CONTEXT_H | ||
|
||
#include <cuda_runtime.h> | ||
|
||
namespace device_context { | ||
|
||
constexpr std::size_t MAX_DEVICES = 32; | ||
|
||
/** | ||
* Properties of the device used in icicle functions. | ||
*/ | ||
struct DeviceContext { | ||
cudaStream_t& stream; /**< Stream to use. Default value: 0. */ | ||
std::size_t device_id; /**< Index of the currently used GPU. Default value: 0. */ | ||
cudaMemPool_t mempool; /**< Mempool to use. Default value: 0. */ | ||
}; | ||
|
||
/** | ||
* Return default device context that corresponds to using the default stream of the first GPU | ||
*/ | ||
inline DeviceContext get_default_device_context() // TODO: naming convention ? | ||
{ | ||
static cudaStream_t default_stream = (cudaStream_t)0; | ||
return DeviceContext{ | ||
(cudaStream_t&)default_stream, // stream | ||
0, // device_id | ||
0, // mempool | ||
}; | ||
} | ||
|
||
} // namespace device_context | ||
|
||
#endif |
Oops, something went wrong.