From c86afcc2d54de00375d38abf23acd05f3595fb8a Mon Sep 17 00:00:00 2001 From: jack198704 Date: Wed, 5 Jun 2024 14:33:14 +0800 Subject: [PATCH] project: init --- .clang-format | 168 ++++++++ .gitignore | 3 + CMakeLists.txt | 73 ++++ LICENSE | 21 + README.md | 54 +++ build.sh | 6 + include/spacemesh_cuda/spacemesh.h | 35 ++ rust/Cargo.lock | 34 ++ rust/Cargo.toml | 12 + rust/LICENSE.md | 0 rust/build.rs | 25 ++ rust/src/lib.rs | 37 ++ src/device/common.cuh | 70 +++ src/device/kernel.cuh | 146 +++++++ src/device/pbkdf2.cuh | 668 +++++++++++++++++++++++++++++ src/device/romix.cuh | 505 ++++++++++++++++++++++ src/spacemesh.cu | 135 ++++++ src/utils.hpp | 545 +++++++++++++++++++++++ test/perf.cu | 182 ++++++++ test/test_kernel.cu | 463 ++++++++++++++++++++ 20 files changed, 3182 insertions(+) create mode 100644 .clang-format create mode 100644 .gitignore create mode 100644 CMakeLists.txt create mode 100644 LICENSE create mode 100644 README.md create mode 100755 build.sh create mode 100644 include/spacemesh_cuda/spacemesh.h create mode 100644 rust/Cargo.lock create mode 100644 rust/Cargo.toml create mode 100644 rust/LICENSE.md create mode 100644 rust/build.rs create mode 100644 rust/src/lib.rs create mode 100644 src/device/common.cuh create mode 100644 src/device/kernel.cuh create mode 100644 src/device/pbkdf2.cuh create mode 100644 src/device/romix.cuh create mode 100644 src/spacemesh.cu create mode 100644 src/utils.hpp create mode 100644 test/perf.cu create mode 100644 test/test_kernel.cu diff --git a/.clang-format b/.clang-format new file mode 100644 index 0000000..f98a359 --- /dev/null +++ b/.clang-format @@ -0,0 +1,168 @@ +--- +Language: Cpp +# BasedOnStyle: Google +AccessModifierOffset: -2 +AlignAfterOpenBracket: Align +AlignConsecutiveMacros: false +AlignConsecutiveAssignments: false +AlignConsecutiveDeclarations: false +AlignEscapedNewlines: Left +AlignOperands: true +AlignTrailingComments: true +AllowAllArgumentsOnNextLine: true +AllowAllConstructorInitializersOnNextLine: true +AllowAllParametersOfDeclarationOnNextLine: true +AllowShortBlocksOnASingleLine: Never +AllowShortCaseLabelsOnASingleLine: false +AllowShortFunctionsOnASingleLine: All +AllowShortLambdasOnASingleLine: All +AllowShortIfStatementsOnASingleLine: WithoutElse +AllowShortLoopsOnASingleLine: true +AlwaysBreakAfterDefinitionReturnType: None +AlwaysBreakAfterReturnType: None +AlwaysBreakBeforeMultilineStrings: true +AlwaysBreakTemplateDeclarations: Yes +BinPackArguments: true +BinPackParameters: true +BraceWrapping: + AfterCaseLabel: false + AfterClass: false + AfterControlStatement: false + AfterEnum: false + AfterFunction: false + AfterNamespace: false + AfterObjCDeclaration: false + AfterStruct: false + AfterUnion: false + AfterExternBlock: false + BeforeCatch: false + BeforeElse: false + IndentBraces: false + SplitEmptyFunction: true + SplitEmptyRecord: true + SplitEmptyNamespace: true +BreakBeforeBinaryOperators: None +BreakBeforeBraces: Attach +BreakBeforeInheritanceComma: false +BreakInheritanceList: BeforeColon +BreakBeforeTernaryOperators: true +BreakConstructorInitializersBeforeComma: false +BreakConstructorInitializers: BeforeColon +BreakAfterJavaFieldAnnotations: false +BreakStringLiterals: true +ColumnLimit: 80 +CommentPragmas: '^ IWYU pragma:' +CompactNamespaces: false +ConstructorInitializerAllOnOneLineOrOnePerLine: true +ConstructorInitializerIndentWidth: 4 +ContinuationIndentWidth: 4 +Cpp11BracedListStyle: true +DeriveLineEnding: true +DerivePointerAlignment: true +DisableFormat: false +ExperimentalAutoDetectBinPacking: false +FixNamespaceComments: true +ForEachMacros: + - foreach + - Q_FOREACH + - BOOST_FOREACH +IncludeBlocks: Regroup +IncludeCategories: + - Regex: '^' + Priority: 2 + SortPriority: 0 + - Regex: '^<.*\.h>' + Priority: 1 + SortPriority: 0 + - Regex: '^<.*' + Priority: 2 + SortPriority: 0 + - Regex: '.*' + Priority: 3 + SortPriority: 0 +IncludeIsMainRegex: '([-_](test|unittest))?$' +IncludeIsMainSourceRegex: '' +IndentCaseLabels: true +IndentGotoLabels: true +IndentPPDirectives: None +IndentWidth: 2 +IndentWrappedFunctionNames: false +JavaScriptQuotes: Leave +JavaScriptWrapImports: true +KeepEmptyLinesAtTheStartOfBlocks: false +MacroBlockBegin: '' +MacroBlockEnd: '' +MaxEmptyLinesToKeep: 1 +NamespaceIndentation: None +ObjCBinPackProtocolList: Never +ObjCBlockIndentWidth: 2 +ObjCSpaceAfterProperty: false +ObjCSpaceBeforeProtocolList: true +PenaltyBreakAssignment: 2 +PenaltyBreakBeforeFirstCallParameter: 1 +PenaltyBreakComment: 300 +PenaltyBreakFirstLessLess: 120 +PenaltyBreakString: 1000 +PenaltyBreakTemplateDeclaration: 10 +PenaltyExcessCharacter: 1000000 +PenaltyReturnTypeOnItsOwnLine: 200 +PointerAlignment: Left +RawStringFormats: + - Language: Cpp + Delimiters: + - cc + - CC + - cpp + - Cpp + - CPP + - 'c++' + - 'C++' + CanonicalDelimiter: '' + BasedOnStyle: google + - Language: TextProto + Delimiters: + - pb + - PB + - proto + - PROTO + EnclosingFunctions: + - EqualsProto + - EquivToProto + - PARSE_PARTIAL_TEXT_PROTO + - PARSE_TEST_PROTO + - PARSE_TEXT_PROTO + - ParseTextOrDie + - ParseTextProtoOrDie + CanonicalDelimiter: '' + BasedOnStyle: google +ReflowComments: true +SortIncludes: true +SortUsingDeclarations: true +SpaceAfterCStyleCast: false +SpaceAfterLogicalNot: false +SpaceAfterTemplateKeyword: true +SpaceBeforeAssignmentOperators: true +SpaceBeforeCpp11BracedList: false +SpaceBeforeCtorInitializerColon: true +SpaceBeforeInheritanceColon: true +SpaceBeforeParens: ControlStatements +SpaceBeforeRangeBasedForLoopColon: true +SpaceInEmptyBlock: false +SpaceInEmptyParentheses: false +SpacesBeforeTrailingComments: 2 +SpacesInAngles: false +SpacesInConditionalStatement: false +SpacesInContainerLiterals: true +SpacesInCStyleCastParentheses: false +SpacesInParentheses: false +SpacesInSquareBrackets: false +SpaceBeforeSquareBrackets: false +Standard: Auto +StatementMacros: + - Q_UNUSED + - QT_REQUIRE_VERSION +TabWidth: 8 +UseCRLF: false +UseTab: Never +... + diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..8bc8609 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +build +.vscode +rust/target diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..4e0f7bf --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,73 @@ +cmake_minimum_required(VERSION 3.24 FATAL_ERROR) + +if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES) + set(CMAKE_CUDA_ARCHITECTURES "all") +endif() + +if (DEFINED CMAKE_BUILD_TYPE AND NOT CMAKE_BUILD_TYPE STREQUAL "") + string(TOUPPER ${CMAKE_BUILD_TYPE} CMAKE_BUILD_TYPE) +else() + set(CMAKE_BUILD_TYPE RELEASE) +endif() + +project(spacemesh-cuda LANGUAGES CXX CUDA) +add_compile_options(-fPIC) + +include(FetchContent) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +file(GLOB_RECURSE src_files LIST_DIRECTORIES false src/*.cu src/*.cpp src/*.c) + +include_directories(include) +include_directories(src) + +add_library(${PROJECT_NAME} ${src_files}) + +install(TARGETS ${PROJECT_NAME}) + +option(WITH_TEST "with test" OFF) + +if(${WITH_TEST}) + enable_testing() + set(TEST_EXE ${PROJECT_NAME}-test) + + if (DEFINED CMAKE_BUILD_TYPE AND NOT CMAKE_BUILD_TYPE STREQUAL "") + string(TOUPPER ${CMAKE_BUILD_TYPE} CMAKE_BUILD_TYPE) + else() + set(CMAKE_BUILD_TYPE RELEASE) + endif() + + if(${CMAKE_BUILD_TYPE} STREQUAL "RELEASE") + set(NVCC_FLAGS -O3 -Wno-deprecated-gpu-targets --Werror all-warnings) + message(STATUS "${TEST_EXE} compile cuda code in release mode") + else() + set(NVCC_FLAGS -G -Wno-deprecated-gpu-targets --Werror all-warnings) + message(STATUS "${TEST_EXE} compile cuda code in debug mode") + endif() + + #Google Unit Test + option(BUILD_GMOCK "" OFF) + option(INSTALL_GTEST "" OFF) + FetchContent_Declare( + googletest + GIT_REPOSITORY https://github.com/google/googletest.git + GIT_TAG release-1.12.1 + ) + FetchContent_MakeAvailable(googletest) + + file(GLOB_RECURSE test_files LIST_DIRECTORIES false test/*.cpp test/*.c test/*.cu) + add_executable(${TEST_EXE} ${test_files}) + set_target_properties(${TEST_EXE} PROPERTIES CUDA_ARCHITECTURES "native") + target_link_libraries(${TEST_EXE} + PRIVATE ${PROJECT_NAME} + PRIVATE gtest + PRIVATE gtest_main) + include(GoogleTest) + set(NVCC_FLAGS ${NVCC_FLAGS})# "SHELL:-t 0" "SHELL:-split-compile 0") + target_compile_options(${TEST_EXE} PRIVATE $<$:--forward-unknown-opts -Wall ${NVCC_FLAGS}> + PRIVATE $<$:-Wall>) + add_test(NAME test_corr COMMAND ${TEST_EXE} --gtest_filter=*CheckResult*) + add_test(NAME test_perf COMMAND ${TEST_EXE} --gtest_filter=*Perf*) +endif() diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..c970acf --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2024 ZakuroLab + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..bc79088 --- /dev/null +++ b/README.md @@ -0,0 +1,54 @@ +

Spacemesh-cuda is a library for plot acceleration using CUDA-enabled GPUs.

+ +[![license](https://img.shields.io/packagist/l/doctrine/orm.svg)](https://github.com/ZakuroLab/spacemesh-cuda/blob/master/LICENSE) +[![release](https://img.shields.io/github/v/release/ZakuroLab/spacemesh-cuda?include_prereleases)](https://github.com/ZakuroLab/spacemesh-cuda/releases) +![platform](https://img.shields.io/badge/platform-%20linux--64-lightgrey.svg) +[![open help wanted issues](https://img.shields.io/github/issues-raw/ZakuroLab/spacemesh-cuda/help%20wanted?logo=github)](https://github.com/ZakuroLab/spacemesh-cuda/issues?q=is%3Aissue+is%3Aopen+label%3A%22help+wanted%22) +[![made by](https://img.shields.io/badge/madeby-ZakuroLab-blue.svg)](https://github.com/ZakuroLab) + +## Table of Contents + +* [1. Overview](#1-overview) +* [2. Performance](#2-performance) +* [3. Build & Integration Guide](#3-build-&-integration-guide) + * [3.1 From source](#31-From-source) + * [3.2 From binary](#32-From-binary) +* [4. License](#4-license) + +## 1. Overview + +__spacemesh-cuda__ is a cuda library for plot acceleration for [spacemesh](https://github.com/spacemeshos/go-spacemesh). +This library optimizes memory access, calculation parallelism, etc. Compared with the official program, the library improved by **86.6%**. + +## 2. Performance +| GPU\Library | Official | spacemesh-cuda | +| ---- | --- | ---- | +| RTX3080 | 3.2MB/s | 5.97MB/s | + +## 3. Build & Integration Guide + +### 3.1 From source +```shell +# build libpost.so +git clone https://github.com/ZakuroLab/post-rs.git && cd post-rs/ffi +cargo build --release +cd ../../ + +# get postcli +wget https://github.com/spacemeshos/post/releases/download/v0.12.5/postcli-Linux.zip +unzip -d postcli ./postcli-Linux.zip +cd postcli && mv ../post-rs/target/release/libpost.so ./ +``` + +### 3.2 From binary +```shell +mkdir postcli && cd postcli +wget https://github.com/ZakuroLab/spacemesh-cuda/releases/download/v0.0.1/libpost.so +wget https://github.com/ZakuroLab/spacemesh-cuda/releases/download/v0.0.1/postcli +``` + +## 4. License + +We welcome all contributions to `spacemesh-cuda`. Please refer to the [license](#4-license) for the terms of contributions. + +[![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](./LICENSE.md) diff --git a/build.sh b/build.sh new file mode 100755 index 0000000..1c0a9a5 --- /dev/null +++ b/build.sh @@ -0,0 +1,6 @@ +#!/bin/bash +HOME_DIR=`pwd` +rm -rf ${HOME_DIR}/build +mkdir ${HOME_DIR}/build +cmake -DCMAKE_BUILD_TYPE=Release -S ${HOME_DIR} -B ${HOME_DIR}/build -DWITH_TEST=ON +cmake --build ${HOME_DIR}/build -j diff --git a/include/spacemesh_cuda/spacemesh.h b/include/spacemesh_cuda/spacemesh.h new file mode 100644 index 0000000..0861051 --- /dev/null +++ b/include/spacemesh_cuda/spacemesh.h @@ -0,0 +1,35 @@ +#ifndef SPACEMESH_CUDA_SPACEMESH_H +#define SPACEMESH_CUDA_SPACEMESH_H + +#include + +extern "C" { +/** + * Obtain the number of devices (CUDA GPU) + * + * @returns A uint32 + */ +uint32_t spacemesh_get_device_num(); + +/** + * Obtain the maximum number of tasks of device @p device_id + * + * @returns A uint32 + */ +uint32_t spacemesh_get_max_task_num(uint32_t device_id); + +/** + * Execution + * + * @param device_id The index of device + * @param starting_index + * @param input Input CPU memory + * @param task_num The number of tasks + * @param output Output CPU memory + */ +void spacemesh_scrypt(uint32_t device_id, const uint64_t starting_index, + const uint32_t* input, const uint32_t task_num, + uint32_t* output); +} + +#endif // SPACEMESH_CUDA_SPACEMESH_H diff --git a/rust/Cargo.lock b/rust/Cargo.lock new file mode 100644 index 0000000..4fee3fb --- /dev/null +++ b/rust/Cargo.lock @@ -0,0 +1,34 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "cc" +version = "1.0.83" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1174fb0b6ec23863f8b971027804a42614e347eafb0a95bf0b12cdae21fc4d0" +dependencies = [ + "libc", +] + +[[package]] +name = "cmake" +version = "0.1.50" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a31c789563b815f77f4250caee12365734369f942439b7defd71e18a48197130" +dependencies = [ + "cc", +] + +[[package]] +name = "libc" +version = "0.2.150" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "89d92a4743f9a61002fae18374ed11e7973f530cb3a3255fb354818118b2203c" + +[[package]] +name = "spacemesh-cuda" +version = "0.0.1" +dependencies = [ + "cmake", +] diff --git a/rust/Cargo.toml b/rust/Cargo.toml new file mode 100644 index 0000000..23ed63d --- /dev/null +++ b/rust/Cargo.toml @@ -0,0 +1,12 @@ +[package] +authors = ["The 6block Team"] +edition = "2021" +include = ["Cargo.toml", "src", "LICENSE.md"] +license = "GPL-3.0" +name = "spacemesh-cuda" +version = "0.0.1" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[build-dependencies] +cmake = {version = "0.1.0"} diff --git a/rust/LICENSE.md b/rust/LICENSE.md new file mode 100644 index 0000000..e69de29 diff --git a/rust/build.rs b/rust/build.rs new file mode 100644 index 0000000..11a0b60 --- /dev/null +++ b/rust/build.rs @@ -0,0 +1,25 @@ +use cmake::Config; + +fn main() { + let profile = std::env::var("PROFILE").unwrap(); + let profile = match profile.as_str() { + "debug" => "Debug", + "release" => "Release", + _ => "Release", + }; + let mut dst = Config::new("..") + .define("CMAKE_BUILD_TYPE", profile) + .define("WITH_TEST", "OFF") + .cxxflag("-O3") + .build(); + dst.push("lib"); + println!("cargo:rustc-link-search=native={}", dst.display()); + + let default_cuda_lib_path = "/usr/local/cuda/targets/x86_64-linux/lib/"; + let default_boost_path = "/usr/local/lib/"; + println!("cargo:rustc-link-search=native={}", default_cuda_lib_path); + println!("cargo:rustc-link-search=native={}", default_boost_path); + println!("cargo:rustc-link-lib=static=cudart_static"); + println!("cargo:rustc-link-lib=stdc++"); + println!("cargo:rustc-link-lib=static=spacemesh-cuda"); +} diff --git a/rust/src/lib.rs b/rust/src/lib.rs new file mode 100644 index 0000000..c48a25f --- /dev/null +++ b/rust/src/lib.rs @@ -0,0 +1,37 @@ +extern "C" { + fn spacemesh_get_device_num() -> u32; + fn spacemesh_get_max_task_num(device_idx: u32) -> u32; + fn spacemesh_scrypt( + device_id: u32, + starting_index: u64, + input: *const u32, + task_num: u32, + output: *mut u32, + ); +} + +pub fn get_device_num() -> u32 { + unsafe { spacemesh_get_device_num() } +} + +pub fn get_max_task_num(device_id: u32) -> u32 { + unsafe { spacemesh_get_max_task_num(device_id) } +} + +pub fn scrypt( + device_id: u32, + starting_index: u64, + input: &Vec, + task_num: u32, + output: &mut Vec, +) { + unsafe { + spacemesh_scrypt( + device_id, + starting_index, + input.as_ptr(), + task_num, + output.as_mut_ptr() as *mut u32, + ); + } +} diff --git a/src/device/common.cuh b/src/device/common.cuh new file mode 100644 index 0000000..265176f --- /dev/null +++ b/src/device/common.cuh @@ -0,0 +1,70 @@ +#ifndef DEVICE_COMMON_CUH +#define DEVICE_COMMON_CUH + +#include + +#include + +#define UINT32_NUM_BITS sizeof(uint32_t) * 8 +#define UINT64_NUM_BITS sizeof(uint64_t) * 8 + +#define UNUSED(X) (void)(X) + +template +__device__ uint32_t rotl32(uint32_t x) { + return (x << d) | (x >> (UINT32_NUM_BITS - d)); +} + +template +__device__ uint2 rotl64(uint32_t x, uint32_t y) { + uint64_t c = uint64_t(x) | (uint64_t(y) << UINT32_NUM_BITS); + c = (c << d) | (c >> (UINT64_NUM_BITS - d)); + return make_uint2(uint32_t(c), c >> UINT32_NUM_BITS); +} +template +__device__ uint2 rotl64(uint2 v) { + return rotl64(v.x, v.y); +} + +inline __device__ uint2 operator^(const uint2 &t0, const uint2 &t1) { + return {t0.x ^ t1.x, t0.y ^ t1.y}; +} +inline __device__ uint2 &operator^=(uint2 &t0, const uint2 &t1) { + t0.x ^= t1.x; + t0.y ^= t1.y; + return t0; +} + +inline __device__ uint4 operator^(const uint4 &t0, const uint4 &t1) { + return {t0.x ^ t1.x, t0.y ^ t1.y, t0.z ^ t1.z, t0.w ^ t1.w}; +} + +inline __device__ uint4 &operator^=(uint4 &t0, const uint4 &t1) { + t0.x ^= t1.x; + t0.y ^= t1.y; + t0.z ^= t1.z; + t0.w ^= t1.w; + return t0; +} + +inline __device__ uint4 &operator+=(uint4 &t0, const uint4 &t1) { + t0.x += t1.x; + t0.y += t1.y; + t0.z += t1.z; + t0.w += t1.w; + return t0; +} + +template +inline __device__ T make_zero(); + +template <> +inline __device__ uint4 make_zero() { + return {0, 0, 0, 0}; +} +template <> +inline __device__ uint2 make_zero() { + return {0, 0}; +} + +#endif diff --git a/src/device/kernel.cuh b/src/device/kernel.cuh new file mode 100644 index 0000000..1b5df13 --- /dev/null +++ b/src/device/kernel.cuh @@ -0,0 +1,146 @@ +#ifndef DEVICE_KERNEL_CUH +#define DEVICE_KERNEL_CUH + +#include "device/common.cuh" +#include "device/pbkdf2.cuh" +#include "device/romix.cuh" + +/** + * Mainly modified thread mapping relationships, LOOKUP_GAP, Block, etc., + * Throughtput: 4.16MB/s + */ +template +__global__ void scrypt_org(const uint64_t starting_index, + const uint32_t num_tasks, const uint4 input_1, + const uint4 input_2, + uint4 *const __restrict__ padcache, + uint4 *const __restrict__ output) { + uint32_t tid = blockDim.x * blockIdx.x + threadIdx.x; + uint32_t tnum = gridDim.x * blockDim.x; + + uint4 password[5]; + uint4 X[8]; + for (uint32_t t = tid; t < num_tasks; t += tnum) { + const uint64_t index = starting_index + t; + + password[0] = input_1; + password[1] = input_2; + password[2].x = uint32_t(index & 0xFFFFFFFF); + password[2].y = uint32_t((index >> 32) & 0xFFFFFFFF); + password[2].z = 0; + password[2].w = 0; + password[3] = make_zero(); + password[4] = make_zero(); + + scrypt_pbkdf2_128B(password, X); + scrypt_ROMix_org(X, padcache, tnum, tid); + scrypt_pbkdf2_32B(password, X, &output[t * 2]); + } +} + +/** + * Optimize memory access and collaborate with Warp to complete data copying + * Throughtput: 5.34MB/s + */ +template +__global__ void scrypt_coalesce_access_v1(const uint64_t starting_index, + const uint32_t num_tasks, + const uint4 input_1, + const uint4 input_2, + uint32_t *const __restrict__ padcache, + uint4 *const __restrict__ output) { + uint32_t tid = blockDim.x * blockIdx.x + threadIdx.x; + uint32_t tnum = gridDim.x * blockDim.x; + + uint4 password[5]; + uint4 X[8]; + for (uint32_t t = tid; t < num_tasks; t += tnum) { + const uint64_t index = starting_index + t; + + password[0] = input_1; + password[1] = input_2; + password[2].x = uint32_t(index & 0xFFFFFFFF); + password[2].y = uint32_t((index >> 32) & 0xFFFFFFFF); + password[2].z = 0; + password[2].w = 0; + password[3] = make_zero(); + password[4] = make_zero(); + + scrypt_pbkdf2_128B(password, X); + scrypt_ROMix_coalesce_access_v1(X, padcache, tnum, tid); + scrypt_pbkdf2_32B(password, X, &output[t * 2]); + } +} + +/** + * Optimize memory access, with 16 threads forming a sub warp, each thread + * performing read and write operations in units of 8 bytes (uint64_t) + * Throughtput: 5.94MB/s + */ +template +__global__ void scrypt_coalesce_access_v2(const uint64_t starting_index, + const uint32_t num_tasks, + const uint4 input_1, + const uint4 input_2, + uint64_t *const __restrict__ padcache, + uint4 *const __restrict__ output) { + uint32_t tid = blockDim.x * blockIdx.x + threadIdx.x; + uint32_t tnum = gridDim.x * blockDim.x; + + uint4 password[5]; + uint4 X[8]; + for (uint32_t t = tid; t < num_tasks; t += tnum) { + const uint64_t index = starting_index + t; + + password[0] = input_1; + password[1] = input_2; + password[2].x = uint32_t(index & 0xFFFFFFFF); + password[2].y = uint32_t((index >> 32) & 0xFFFFFFFF); + password[2].z = 0; + password[2].w = 0; + password[3] = make_zero(); + password[4] = make_zero(); + + scrypt_pbkdf2_128B(password, X); + scrypt_ROMix_coalesce_access_v2(X, padcache, tnum, tid); + scrypt_pbkdf2_32B(password, X, &output[t * 2]); + } +} + +/** + * Optimize memory access, with 8 threads forming a sub warp, each thread + * performing read and write operations in units of 8 bytes (uint4) + global + * memory -> shared memory asynchronous reads + * Throughtput: 6.03MB/s + */ +template +__global__ void scrypt_coalesce_access_v3(const uint64_t starting_index, + const uint32_t num_tasks, + const uint4 input_1, + const uint4 input_2, + uint4 *const __restrict__ padcache, + uint4 *const __restrict__ output) { + uint32_t tid = blockDim.x * blockIdx.x + threadIdx.x; + uint32_t tnum = gridDim.x * blockDim.x; + + uint4 password[5]; + uint4 X[8]; + for (uint32_t t = tid; t < num_tasks; t += tnum) { + const uint64_t index = starting_index + t; + + password[0] = input_1; + password[1] = input_2; + password[2].x = uint32_t(index & 0xFFFFFFFF); + password[2].y = uint32_t((index >> 32) & 0xFFFFFFFF); + password[2].z = 0; + password[2].w = 0; + password[3] = make_zero(); + password[4] = make_zero(); + + scrypt_pbkdf2_128B(password, X); + scrypt_ROMix_coalesce_access_v3(X, padcache, tnum, tid); + scrypt_pbkdf2_32B(password, X, &output[t * 2]); + } +} + +#endif diff --git a/src/device/pbkdf2.cuh b/src/device/pbkdf2.cuh new file mode 100644 index 0000000..4f7f85e --- /dev/null +++ b/src/device/pbkdf2.cuh @@ -0,0 +1,668 @@ +#ifndef DEVICE_PBKDF2_CUH +#define DEVICE_PBKDF2_CUH +#include + +#include "device/common.cuh" + +#define SCRYPT_HASH_DIGEST_SIZE 64 +#define SCRYPT_KECCAK_F 1600 +#define SCRYPT_HASH_BLOCK_SIZE 72 +#define SCRYPT_BLOCK_BYTES 128 + +typedef struct scrypt_hash_state_t { + uint4 state4[(SCRYPT_KECCAK_F + 127) / 128]; // 8 bytes of extra + uint4 buffer4[(SCRYPT_HASH_BLOCK_SIZE + 15) / 16]; // 8 bytes of extra + // uint leftover; +} scrypt_hash_state; + +typedef struct scrypt_hmac_state_t { + scrypt_hash_state inner; + scrypt_hash_state outer; +} scrypt_hmac_state; + +__constant__ uint64_t keccak_round_constants[24]{ + 0x0000000000000001UL, 0x0000000000008082UL, 0x800000000000808aUL, + 0x8000000080008000UL, 0x000000000000808bUL, 0x0000000080000001UL, + 0x8000000080008081UL, 0x8000000000008009UL, 0x000000000000008aUL, + 0x0000000000000088UL, 0x0000000080008009UL, 0x000000008000000aUL, + 0x000000008000808bUL, 0x800000000000008bUL, 0x8000000000008089UL, + 0x8000000000008003UL, 0x8000000000008002UL, 0x8000000000000080UL, + 0x000000000000800aUL, 0x800000008000000aUL, 0x8000000080008081UL, + 0x8000000000008080UL, 0x0000000080000001UL, 0x8000000080008008UL}; + +inline __device__ void keccak_block_core(scrypt_hash_state &S) { + uint2 t[5]; + uint2 u[5]; + uint2 v; + uint2 w; + uint4 *s4 = S.state4; + + for (uint i = 0; i < 24; i++) { +/* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */ +#define CASE(D, S00, S01, S10, S11, S20, S21, S30, S31, S40, S41) \ + { \ + D.x = S00 ^ S10 ^ S20 ^ S30 ^ S40; \ + D.y = S01 ^ S11 ^ S21 ^ S31 ^ S41; \ + } + // t[0] = s4[0].xy ^ s4[2].zw ^ s4[5].xy ^ s4[7].zw ^ s4[10].xy; + CASE(t[0], s4[0].x, s4[0].y, s4[2].z, s4[2].w, s4[5].x, s4[5].y, s4[7].z, + s4[7].w, s4[10].x, s4[10].y); + // t[1] = s4[0].zw ^ s4[3].xy ^ s4[5].zw ^ s4[8].xy ^ s4[10].zw; + CASE(t[1], s4[0].z, s4[0].w, s4[3].x, s4[3].y, s4[5].z, s4[5].w, s4[8].x, + s4[8].y, s4[10].z, s4[10].w); + // t[2] = s4[1].xy ^ s4[3].zw ^ s4[6].xy ^ s4[8].zw ^ s4[11].xy; + CASE(t[2], s4[1].x, s4[1].y, s4[3].z, s4[3].w, s4[6].x, s4[6].y, s4[8].z, + s4[8].w, s4[11].x, s4[11].y); + // t[3] = s4[1].zw ^ s4[4].xy ^ s4[6].zw ^ s4[9].xy ^ s4[11].zw; + CASE(t[3], s4[1].z, s4[1].w, s4[4].x, s4[4].y, s4[6].z, s4[6].w, s4[9].x, + s4[9].y, s4[11].z, s4[11].w); + // t[4] = s4[2].xy ^ s4[4].zw ^ s4[7].xy ^ s4[9].zw ^ s4[12].xy; + CASE(t[4], s4[2].x, s4[2].y, s4[4].z, s4[4].w, s4[7].x, s4[7].y, s4[9].z, + s4[9].w, s4[12].x, s4[12].y); +#undef CASE + + /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */ + u[0] = t[4] ^ rotl64<1>(t[1]); + u[1] = t[0] ^ rotl64<1>(t[2]); + u[2] = t[1] ^ rotl64<1>(t[3]); + u[3] = t[2] ^ rotl64<1>(t[4]); + u[4] = t[3] ^ rotl64<1>(t[0]); + +/* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */ +#define CASE(D0, D1, S) \ + { \ + D0 ^= S.x; \ + D1 ^= S.y; \ + } + + // s4[0].xy ^= u[0]; + CASE(s4[0].x, s4[0].y, u[0]); + // s4[2].zw ^= u[0]; + CASE(s4[2].z, s4[2].w, u[0]); + // s4[5].xy ^= u[0]; + CASE(s4[5].x, s4[5].y, u[0]); + // s4[7].zw ^= u[0]; + CASE(s4[7].z, s4[7].w, u[0]); + // s4[10].xy ^= u[0]; + CASE(s4[10].x, s4[10].y, u[0]); + // s4[0].zw ^= u[1]; + CASE(s4[0].z, s4[0].w, u[1]); + // s4[3].xy ^= u[1]; + CASE(s4[3].x, s4[3].y, u[1]); + // s4[5].zw ^= u[1]; + CASE(s4[5].z, s4[5].w, u[1]); + // s4[8].xy ^= u[1]; + CASE(s4[8].x, s4[8].y, u[1]); + // s4[10].zw ^= u[1]; + CASE(s4[10].z, s4[10].w, u[1]); + // s4[1].xy ^= u[2]; + CASE(s4[1].x, s4[1].y, u[2]); + // s4[3].zw ^= u[2]; + CASE(s4[3].z, s4[3].w, u[2]); + // s4[6].xy ^= u[2]; + CASE(s4[6].x, s4[6].y, u[2]); + // s4[8].zw ^= u[2]; + CASE(s4[8].z, s4[8].w, u[2]); + // s4[11].xy ^= u[2]; + CASE(s4[11].x, s4[11].y, u[2]); + // s4[1].zw ^= u[3]; + CASE(s4[1].z, s4[1].w, u[3]); + // s4[4].xy ^= u[3]; + CASE(s4[4].x, s4[4].y, u[3]); + // s4[6].zw ^= u[3]; + CASE(s4[6].z, s4[6].w, u[3]); + // s4[9].xy ^= u[3]; + CASE(s4[9].x, s4[9].y, u[3]); + // s4[11].zw ^= u[3]; + CASE(s4[11].z, s4[11].w, u[3]); + // s4[2].xy ^= u[4]; + CASE(s4[2].x, s4[2].y, u[4]); + // s4[4].zw ^= u[4]; + CASE(s4[4].z, s4[4].w, u[4]); + // s4[7].xy ^= u[4]; + CASE(s4[7].x, s4[7].y, u[4]); + // s4[9].zw ^= u[4]; + CASE(s4[9].z, s4[9].w, u[4]); + // s4[12].xy ^= u[4]; + CASE(s4[12].x, s4[12].y, u[4]); +#undef CASE + + /* rho pi: b[..] = rotl(a[..], ..) */ + // v = s4[0].zw; + v = make_uint2(s4[0].z, s4[0].w); +#define CASE(D0, D1, S0, S1, M) \ + { \ + const uint2 &tmp = rotl64(S0, S1); \ + D0 = tmp.x; \ + D1 = tmp.y; \ + } + + // s4[0].zw = ROTL64(s4[3].xy, 44UL); + CASE(s4[0].z, s4[0].w, s4[3].x, s4[3].y, 44); + // s4[3].xy = ROTL64(s4[4].zw, 20UL); + CASE(s4[3].x, s4[3].y, s4[4].z, s4[4].w, 20); + // s4[4].zw = ROTL64(s4[11].xy, 61UL); + CASE(s4[4].z, s4[4].w, s4[11].x, s4[11].y, 61); + // s4[11].xy = ROTL64(s4[7].xy, 39UL); + CASE(s4[11].x, s4[11].y, s4[7].x, s4[7].y, 39); + // s4[7].xy = ROTL64(s4[10].xy, 18UL); + CASE(s4[7].x, s4[7].y, s4[10].x, s4[10].y, 18); + // s4[10].xy = ROTL64(s4[1].xy, 62UL); + CASE(s4[10].x, s4[10].y, s4[1].x, s4[1].y, 62); + // s4[1].xy = ROTL64(s4[6].xy, 43UL); + CASE(s4[1].x, s4[1].y, s4[6].x, s4[6].y, 43); + // s4[6].xy = ROTL64(s4[6].zw, 25UL); + CASE(s4[6].x, s4[6].y, s4[6].z, s4[6].w, 25); + // s4[6].zw = ROTL64(s4[9].zw, 8UL); + CASE(s4[6].z, s4[6].w, s4[9].z, s4[9].w, 8); + // s4[9].zw = ROTL64(s4[11].zw, 56UL); + CASE(s4[9].z, s4[9].w, s4[11].z, s4[11].w, 56); + // s4[11].zw = ROTL64(s4[7].zw, 41UL); + CASE(s4[11].z, s4[11].w, s4[7].z, s4[7].w, 41); + // s4[7].zw = ROTL64(s4[2].xy, 27UL); + CASE(s4[7].z, s4[7].w, s4[2].x, s4[2].y, 27); + // s4[2].xy = ROTL64(s4[12].xy, 14UL); + CASE(s4[2].x, s4[2].y, s4[12].x, s4[12].y, 14); + // s4[12].xy = ROTL64(s4[10].zw, 2UL); + CASE(s4[12].x, s4[12].y, s4[10].z, s4[10].w, 2); + // s4[10].zw = ROTL64(s4[4].xy, 55UL); + CASE(s4[10].z, s4[10].w, s4[4].x, s4[4].y, 55); + // s4[4].xy = ROTL64(s4[8].xy, 45UL); + CASE(s4[4].x, s4[4].y, s4[8].x, s4[8].y, 45); + // s4[8].xy = ROTL64(s4[2].zw, 36UL); + CASE(s4[8].x, s4[8].y, s4[2].z, s4[2].w, 36); + // s4[2].zw = ROTL64(s4[1].zw, 28UL); + CASE(s4[2].z, s4[2].w, s4[1].z, s4[1].w, 28); + // s4[1].zw = ROTL64(s4[9].xy, 21UL); + CASE(s4[1].z, s4[1].w, s4[9].x, s4[9].y, 21); + // s4[9].xy = ROTL64(s4[8].zw, 15UL); + CASE(s4[9].x, s4[9].y, s4[8].z, s4[8].w, 15); + // s4[8].zw = ROTL64(s4[5].zw, 10UL); + CASE(s4[8].z, s4[8].w, s4[5].z, s4[5].w, 10); + // s4[5].zw = ROTL64(s4[3].zw, 6UL); + CASE(s4[5].z, s4[5].w, s4[3].z, s4[3].w, 6); + // s4[3].zw = ROTL64(s4[5].xy, 3UL); + CASE(s4[3].z, s4[3].w, s4[5].x, s4[5].y, 3); + // s4[5].xy = ROTL64(v, 1UL); + CASE(s4[5].x, s4[5].y, v.x, v.y, 1); +#undef CASE + + /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ + // v = s4[0].xy; + v = make_uint2(s4[0].x, s4[0].y); + // w = s4[0].zw; + w = make_uint2(s4[0].z, s4[0].w); + +#define CASE(D0, D1, S00, S01, S10, S11) \ + { \ + D0 ^= (~S00) & S10; \ + D1 ^= (~S01) & S11; \ + } + + // s4[0].xy ^= (~w) & s4[1].xy; + CASE(s4[0].x, s4[0].y, w.x, w.y, s4[1].x, s4[1].y); + // s4[0].zw ^= (~s4[1].xy) & s4[1].zw; + CASE(s4[0].z, s4[0].w, s4[1].x, s4[1].y, s4[1].z, s4[1].w); + // s4[1].xy ^= (~s4[1].zw) & s4[2].xy; + CASE(s4[1].x, s4[1].y, s4[1].z, s4[1].w, s4[2].x, s4[2].y); + // s4[1].zw ^= (~s4[2].xy) & v; + CASE(s4[1].z, s4[1].w, s4[2].x, s4[2].y, v.x, v.y); + // s4[2].xy ^= (~v) & w; + CASE(s4[2].x, s4[2].y, v.x, v.y, w.x, w.y); + // v = s4[2].zw; + v = make_uint2(s4[2].z, s4[2].w); + // w = s4[3].xy; + w = make_uint2(s4[3].x, s4[3].y); + // s4[2].zw ^= (~w) & s4[3].zw; + CASE(s4[2].z, s4[2].w, w.x, w.y, s4[3].z, s4[3].w); + // s4[3].xy ^= (~s4[3].zw) & s4[4].xy; + CASE(s4[3].x, s4[3].y, s4[3].z, s4[3].w, s4[4].x, s4[4].y); + // s4[3].zw ^= (~s4[4].xy) & s4[4].zw; + CASE(s4[3].z, s4[3].w, s4[4].x, s4[4].y, s4[4].z, s4[4].w); + // s4[4].xy ^= (~s4[4].zw) & v; + CASE(s4[4].x, s4[4].y, s4[4].z, s4[4].w, v.x, v.y); + // s4[4].zw ^= (~v) & w; + CASE(s4[4].z, s4[4].w, v.x, v.y, w.x, w.y); + // v = s4[5].xy; + v = make_uint2(s4[5].x, s4[5].y); + // w = s4[5].zw; + w = make_uint2(s4[5].z, s4[5].w); + // s4[5].xy ^= (~w) & s4[6].xy; + CASE(s4[5].x, s4[5].y, w.x, w.y, s4[6].x, s4[6].y); + // s4[5].zw ^= (~s4[6].xy) & s4[6].zw; + CASE(s4[5].z, s4[5].w, s4[6].x, s4[6].y, s4[6].z, s4[6].w); + // s4[6].xy ^= (~s4[6].zw) & s4[7].xy; + CASE(s4[6].x, s4[6].y, s4[6].z, s4[6].w, s4[7].x, s4[7].y); + // s4[6].zw ^= (~s4[7].xy) & v; + CASE(s4[6].z, s4[6].w, s4[7].x, s4[7].y, v.x, v.y); + // s4[7].xy ^= (~v) & w; + CASE(s4[7].x, s4[7].y, v.x, v.y, w.x, w.y); + // v = s4[7].zw; + v = make_uint2(s4[7].z, s4[7].w); + // w = s4[8].xy; + w = make_uint2(s4[8].x, s4[8].y); + // s4[7].zw ^= (~w) & s4[8].zw; + CASE(s4[7].z, s4[7].w, w.x, w.y, s4[8].z, s4[8].w); + // s4[8].xy ^= (~s4[8].zw) & s4[9].xy; + CASE(s4[8].x, s4[8].y, s4[8].z, s4[8].w, s4[9].x, s4[9].y); + // s4[8].zw ^= (~s4[9].xy) & s4[9].zw; + CASE(s4[8].z, s4[8].w, s4[9].x, s4[9].y, s4[9].z, s4[9].w); + // s4[9].xy ^= (~s4[9].zw) & v; + CASE(s4[9].x, s4[9].y, s4[9].z, s4[9].w, v.x, v.y); + // s4[9].zw ^= (~v) & w; + CASE(s4[9].z, s4[9].w, v.x, v.y, w.x, w.y); + // v = s4[10].xy; + v = make_uint2(s4[10].x, s4[10].y); + // w = s4[10].zw; + w = make_uint2(s4[10].z, s4[10].w); + // s4[10].xy ^= (~w) & s4[11].xy; + CASE(s4[10].x, s4[10].y, w.x, w.y, s4[11].x, s4[11].y); + // s4[10].zw ^= (~s4[11].xy) & s4[11].zw; + CASE(s4[10].z, s4[10].w, s4[11].x, s4[11].y, s4[11].z, s4[11].w); + // s4[11].xy ^= (~s4[11].zw) & s4[12].xy; + CASE(s4[11].x, s4[11].y, s4[11].z, s4[11].w, s4[12].x, s4[12].y); + // s4[11].zw ^= (~s4[12].xy) & v; + CASE(s4[11].z, s4[11].w, s4[12].x, s4[12].y, v.x, v.y); + // s4[12].xy ^= (~v) & w; + CASE(s4[12].x, s4[12].y, v.x, v.y, w.x, w.y); +#undef CASE + + /* iota: a[0,0] ^= round constant */ + // s4[0].xy ^= as_uint2(keccak_round_constants[i]); + s4[0].x ^= uint32_t(keccak_round_constants[i]); + s4[0].y ^= uint32_t(keccak_round_constants[i] >> UINT32_NUM_BITS); + } +} + +inline __device__ void keccak_block(scrypt_hash_state &S, const uint4 *in4) { + uint4 *s4 = S.state4; + uint i; + + /* absorb input */ + for (i = 0; i < 4; i++) { + s4[i].x ^= in4[i].x; + s4[i].y ^= in4[i].y; + s4[i].z ^= in4[i].z; + s4[i].w ^= in4[i].w; + } + + s4[4].x ^= in4[4].x; + s4[4].y ^= in4[4].y; + + keccak_block_core(S); +} + +inline __device__ void keccak_block_zero(scrypt_hash_state &S, + const uint4 *in4) { + uint4 *s4 = S.state4; + uint i; + + /* absorb input */ + for (i = 0; i < 4; i++) { + s4[i] = in4[i]; + } + // s4[4].xyzw = (uint4)(in4[4].xy, 0, 0); + s4[4] = make_uint4(in4[4].x, in4[4].y, 0, 0); + + for (i = 5; i < 12; i++) { + // s4[i] = ZERO; + s4[i] = make_zero(); + } + // s4[12].xy = ZERO_UINT2; + s4[12].x = 0; + s4[12].y = 0; + + keccak_block_core(S); +} + +inline __device__ void scrypt_hash_update_72(scrypt_hash_state &S, + const uint4 *in4) { + /* handle the current data */ + keccak_block_zero(S, in4); +} + +inline __device__ void scrypt_hash_update_80(scrypt_hash_state &S, + const uint4 *in4) { + const uchar1 *in = (const uchar1 *)in4; + // uint i; + + /* handle the current data */ + keccak_block(S, in4); + in += SCRYPT_HASH_BLOCK_SIZE; + + /* handle leftover data */ + // S->leftover = 2; + + { + const uint2 *in2 = (const uint2 *)in; + + // S->buffer4[0].xy = int2[0].xy; + S.buffer4[0].x = in2[0].x; + S.buffer4[0].y = in2[0].y; + } +} + +inline __device__ void scrypt_hash_update_128(scrypt_hash_state &S, + const uint4 *in4) { + const uchar1 *in = (const uchar1 *)in4; + // uint i; + + /* handle the current data */ + keccak_block(S, in4); + in += SCRYPT_HASH_BLOCK_SIZE; + + /* handle leftover data */ + // S->leftover = 14; + + { + const uint2 *in2 = (const uint2 *)in; + + for (uint i = 0; i < 3; i++) { + S.buffer4[i] = make_uint4(in2[2 * i].x, in2[2 * i].y, in2[2 * i + 1].x, + in2[2 * i + 1].y); + } + // S->buffer4[3].xy = int2[6].xy; + S.buffer4[3].x = in2[6].x; + S.buffer4[3].y = in2[6].y; + } +} + +inline __device__ void scrypt_hash_update_4_after_72(scrypt_hash_state &S, + uint in) { + S.buffer4[0] = make_uint4(in, 0x01, 0, 0); +} + +inline __device__ void scrypt_hash_update_4_after_80(scrypt_hash_state &S, + uint in) { + // assume that leftover = 2 + /* handle the previous data */ + // S->buffer4[0].zw = (uint2)(in, 0x01); + S.buffer4[0].z = in; + S.buffer4[0].w = 0x01; + // S->leftover += 1; +} + +inline __device__ void scrypt_hash_update_4_after_128(scrypt_hash_state &S, + uint in) { + // leftover = 14 + /* handle the previous data */ + // S->buffer4[3].zw = (uint2)(in, 0x01); + S.buffer4[3].z = in; + S.buffer4[3].w = 0x01; + // S->leftover += 1; +} + +inline __device__ void scrypt_hash_update_64(scrypt_hash_state &S, + const uint4 *in4) { + /* handle leftover data */ + // S->leftover = 16; + for (uint32_t i = 0; i < 4; i++) { + S.buffer4[i] = in4[i]; + } +} + +inline __device__ void scrypt_hash_finish_80_after_64(scrypt_hash_state &S, + uint4 *hash4) { + // assume that leftover = 16 + // S->buffer4[4].xy = (uint2)(0x01, 0x80000000); + S.buffer4[4].x = 0x01; + S.buffer4[4].y = 0x80000000; + + keccak_block(S, S.buffer4); + + for (uint i = 0; i < 4; i++) { + hash4[i] = S.state4[i]; + } +} + +inline __device__ void scrypt_hash_finish_80_after_80_4(scrypt_hash_state &S, + uint4 *hash4) { + // assume that leftover = 3 + // S->buffer4[0].w = 0x01; // done already in scrypt_hash_update_4_after_80 + for (uint i = 1; i < 4; i++) { + S.buffer4[i] = make_zero(); + } + // S->buffer4[4].xy = (uint2)(0, 0x80000000); + S.buffer4[4].x = 0; + S.buffer4[4].y = 0x80000000; + + keccak_block(S, S.buffer4); + + for (uint i = 0; i < 4; i++) { + hash4[i] = S.state4[i]; + } +} + +inline __device__ void scrypt_hash_finish_80_after_128_4(scrypt_hash_state &S, + uint4 *hash4) { + // leftover = 15 + // S->buffer4[3].w = 0x01; // done already in scrypt_hash_update_4_after_128 + // S->buffer4[4].xy = (uint2)(0, 0x80000000); + S.buffer4[4].x = 0; + S.buffer4[4].y = 0x80000000; + + keccak_block(S, S.buffer4); + + for (uint i = 0; i < 4; i++) { + hash4[i] = S.state4[i]; + } +} + +inline __device__ void scrypt_hash_72(uint4 *hash4, const uint4 *m) { + for (uint i = 0; i < 4; i++) { + hash4[i] = m[i]; + } + // hash4[4].xy = m[4].xy; + hash4[4].x = m[4].x; + hash4[4].y = m[4].y; +} + +inline __device__ void scrypt_hash_80(uint4 *hash4, const uint4 *m) { + const uchar1 *in = (const uchar1 *)m; + scrypt_hash_state st; + + /* handle the current data */ + keccak_block_zero(st, m); + in += SCRYPT_HASH_BLOCK_SIZE; + + { + const uint2 *in2 = (const uint2 *)in; + // st.buffer4[0].xyzw = (uint4)(in2[0].xy, 0x01, 0); + st.buffer4[0] = make_uint4(in2[0].x, in2[0].y, 0x01, 0); + } + + for (uint i = 1; i < 4; i++) { + st.buffer4[i] = make_zero(); + } + // st.buffer4[4].xyzw = (uint4)(0, 0x80000000, 0, 0); + st.buffer4[4] = make_uint4(0, 0x80000000, 0, 0); + + keccak_block(st, st.buffer4); + + for (uint i = 0; i < 4; i++) { + hash4[i] = st.state4[i]; + } +} + +/* hmac */ +constexpr uint KEY_0X36 = 0x36363636; +constexpr uint KEY_0X36_XOR_0X5C = 0x6A6A6A6A; + +inline __device__ void scrypt_hmac_init(scrypt_hmac_state &st, + const uint4 *key) { + uint4 pad4[SCRYPT_HASH_BLOCK_SIZE / 16 + 1]; + + scrypt_hash_72(pad4, key); + + /* inner = (key ^ 0x36) */ + /* h(inner || ...) */ + for (uint i = 0; i < 4; i++) { + pad4[i].x ^= KEY_0X36; + pad4[i].y ^= KEY_0X36; + pad4[i].z ^= KEY_0X36; + pad4[i].w ^= KEY_0X36; + } + // pad4[4].xy ^= KEY_0X36_2; + pad4[4].x ^= KEY_0X36; + pad4[4].y ^= KEY_0X36; + + scrypt_hash_update_72(st.inner, pad4); + + /* outer = (key ^ 0x5c) */ + /* h(outer || ...) */ + for (uint i = 0; i < 4; i++) { + pad4[i].x ^= KEY_0X36_XOR_0X5C; + pad4[i].y ^= KEY_0X36_XOR_0X5C; + pad4[i].z ^= KEY_0X36_XOR_0X5C; + pad4[i].w ^= KEY_0X36_XOR_0X5C; + } + // pad4[4].xy ^= KEY_0X36_XOR_0X5C_2; + pad4[4].x ^= KEY_0X36_XOR_0X5C; + pad4[4].y ^= KEY_0X36_XOR_0X5C; + + scrypt_hash_update_72(st.outer, pad4); +} + +inline __device__ void scrypt_hmac_update_80(scrypt_hmac_state &st, + const uint4 *m) { + /* h(inner || m...) */ + scrypt_hash_update_80(st.inner, m); +} + +inline __device__ void scrypt_hmac_update_72(scrypt_hmac_state &st, + const uint4 *m) { + /* h(inner || m...) */ + scrypt_hash_update_72(st.inner, m); +} + +inline __device__ void scrypt_hmac_update_128(scrypt_hmac_state &st, + const uint4 *m) { + /* h(inner || m...) */ + scrypt_hash_update_128(st.inner, m); +} + +inline __device__ void scrypt_hmac_update_4_after_72(scrypt_hmac_state &st, + uint m) { + /* h(inner || m...) */ + scrypt_hash_update_4_after_72(st.inner, m); +} + +inline __device__ void scrypt_hmac_update_4_after_80(scrypt_hmac_state &st, + uint m) { + /* h(inner || m...) */ + scrypt_hash_update_4_after_80(st.inner, m); +} + +inline __device__ void scrypt_hmac_update_4_after_128(scrypt_hmac_state &st, + uint m) { + /* h(inner || m...) */ + scrypt_hash_update_4_after_128(st.inner, m); +} + +inline __device__ void scrypt_hmac_finish_128B(scrypt_hmac_state &st, + uint4 *mac) { + /* h(inner || m) */ + uint4 innerhash[4]; + scrypt_hash_finish_80_after_80_4(st.inner, innerhash); + + /* h(outer || h(inner || m)) */ + scrypt_hash_update_64(st.outer, innerhash); + scrypt_hash_finish_80_after_64(st.outer, mac); +} + +inline __device__ void scrypt_hmac_finish_32B(scrypt_hmac_state &st, + uint4 *mac) { + /* h(inner || m) */ + uint4 innerhash[4]; + scrypt_hash_finish_80_after_128_4(st.inner, innerhash); + + /* h(outer || h(inner || m)) */ + scrypt_hash_update_64(st.outer, innerhash); + scrypt_hash_finish_80_after_64(st.outer, mac); +} + +inline __device__ void scrypt_copy_hmac_state_128B( + scrypt_hmac_state &dest, const scrypt_hmac_state &src) { + for (uint i = 0; i < 12; i++) { + dest.inner.state4[i] = src.inner.state4[i]; + } + // dest->inner.state4[12].xy = src->inner.state4[12].xy; + dest.inner.state4[12].x = src.inner.state4[12].x; + dest.inner.state4[12].y = src.inner.state4[12].y; + + // dest->inner.buffer4[0].xy = src->inner.buffer4[0].xy; + dest.inner.buffer4[0].x = src.inner.buffer4[0].x; + dest.inner.buffer4[0].y = src.inner.buffer4[0].y; + + for (uint i = 0; i < 12; i++) { + dest.outer.state4[i] = src.outer.state4[i]; + } + // dest->outer.state4[12].xy = src->outer.state4[12].xy; + dest.outer.state4[12].x = src.outer.state4[12].x; + dest.outer.state4[12].y = src.outer.state4[12].y; +} + +constexpr uint be1 = 0x01000000; +constexpr uint be2 = 0x02000000; + +inline __device__ void scrypt_pbkdf2_128B(const uint4 *password, uint4 *out4) { + scrypt_hmac_state hmac_pw, work; + uint4 ti4[4]; + + /* bytes must be <= (0xffffffff - (SCRYPT_HASH_DIGEST_SIZE - 1)), which they + * will always be under scrypt */ + + /* hmac(password, ...) */ + scrypt_hmac_init(hmac_pw, password); + + /* hmac(password, salt...) */ + // Skip salt + // scrypt_hmac_update_80(&hmac_pw, salt); + + /* U1 = hmac(password, salt || be(i)) */ + /* U32TO8_BE(be, i); */ + // work = hmac_pw; + scrypt_copy_hmac_state_128B(work, hmac_pw); + scrypt_hmac_update_4_after_72(work, be1); + scrypt_hmac_finish_128B(work, ti4); + + for (uint i = 0; i < 4; i++) { + out4[i] = ti4[i]; + } + + /* U1 = hmac(password, salt || be(i)) */ + /* U32TO8_BE(be, i); */ + // work = hmac_pw; + scrypt_hmac_update_4_after_72(hmac_pw, be2); + scrypt_hmac_finish_128B(hmac_pw, ti4); + + for (uint i = 0; i < 4; i++) { + out4[i + 4] = ti4[i]; + } +} + +inline __device__ void scrypt_pbkdf2_32B(const uint4 *password, + const uint4 *salt, + uint4 *__restrict__ out4) { + scrypt_hmac_state hmac_pw; + uint4 ti4[4]; + + /* bytes must be <= (0xffffffff - (SCRYPT_HASH_DIGEST_SIZE - 1)), which they + * will always be under scrypt */ + + /* hmac(password, ...) */ + scrypt_hmac_init(hmac_pw, password); + + /* hmac(password, salt...) */ + scrypt_hmac_update_128(hmac_pw, salt); + + /* U1 = hmac(password, salt || be(i)) */ + /* U32TO8_BE(be, i); */ + scrypt_hmac_update_4_after_128(hmac_pw, be1); + scrypt_hmac_finish_32B(hmac_pw, ti4); + + for (uint i = 0; i < 2; i++) { + out4[i] = ti4[i]; + } +} +#endif diff --git a/src/device/romix.cuh b/src/device/romix.cuh new file mode 100644 index 0000000..b897b84 --- /dev/null +++ b/src/device/romix.cuh @@ -0,0 +1,505 @@ +#ifndef DEVICE_ROMIX_CUH +#define DEVICE_ROMIX_CUH + +#include +#include + +#include "device/common.cuh" + +constexpr uint4 MASK_2{1, 2, 3, 0}, MASK_3{2, 3, 0, 1}, MASK_4{3, 0, 1, 2}, + ROTATE_16{16, 16, 16, 16}, ROTATE_12{12, 12, 12, 12}, ROTATE_8{8, 8, 8, 8}, + ROTATE_7{7, 7, 7, 7}; + +inline __device__ void chacha_core(uint4 *__restrict__ state) { + uint4 x[4]; + uint4 t; + + x[0] = state[0]; + x[1] = state[1]; + x[2] = state[2]; + x[3] = state[3]; + +#define CASE(D_x, D_y, D_z, D_w, S_x, S_y, S_z, S_w, R) \ + D_x = rotl32(S_x); \ + D_y = rotl32(S_y); \ + D_z = rotl32(S_z); \ + D_w = rotl32(S_w); + + for (uint32_t rounds = 0; rounds < 4; rounds++) { + x[0] += x[1]; + t = x[3] ^ x[0]; + // x[3] = ROTL32(t, ROTATE_16); + CASE(x[3].x, x[3].y, x[3].z, x[3].w, t.x, t.y, t.z, t.w, ROTATE_16); + x[2] += x[3]; + t = x[1] ^ x[2]; + // x[1] = ROTL32(t, ROTATE_12); + CASE(x[1].x, x[1].y, x[1].z, x[1].w, t.x, t.y, t.z, t.w, ROTATE_12); + x[0] += x[1]; + t = x[3] ^ x[0]; + // x[3] = ROTL32(t, ROTATE_8); + CASE(x[3].x, x[3].y, x[3].z, x[3].w, t.x, t.y, t.z, t.w, ROTATE_8); + x[2] += x[3]; + t = x[1] ^ x[2]; + // x[1] = ROTL32(t, ROTATE_7); + CASE(x[1].x, x[1].y, x[1].z, x[1].w, t.x, t.y, t.z, t.w, ROTATE_7); + + // x[1] = shuffle(x[1], MASK_2); + // x[2] = shuffle(x[2], MASK_3); + // x[3] = shuffle(x[3], MASK_4); + + // x[0] += x[1].yzwx; + x[0] += make_uint4(x[1].y, x[1].z, x[1].w, x[1].x); + // t = x[3].wxyz ^ x[0]; + t = make_uint4(x[3].w, x[3].x, x[3].y, x[3].z) ^ x[0]; + // x[3].wxyz = ROTL32(t, ROTATE_16); + CASE(x[3].w, x[3].x, x[3].y, x[3].z, t.x, t.y, t.z, t.w, ROTATE_16); + // x[2].zwxy += x[3].wxyz; + x[2] += make_uint4(x[3].y, x[3].z, x[3].w, x[3].x); + // t = x[1].yzwx ^ x[2].zwxy; + t = make_uint4(x[1].y, x[1].z, x[1].w, x[1].x) ^ + make_uint4(x[2].z, x[2].w, x[2].x, x[2].y); + // x[1].yzwx = ROTL32(t, ROTATE_12); + CASE(x[1].y, x[1].z, x[1].w, x[1].x, t.x, t.y, t.z, t.w, ROTATE_12); + // x[0] += x[1].yzwx; + x[0] += make_uint4(x[1].y, x[1].z, x[1].w, x[1].x); + // t = x[3].wxyz ^ x[0]; + t = make_uint4(x[3].w, x[3].x, x[3].y, x[3].z) ^ x[0]; + // x[3].wxyz = ROTL32(t, ROTATE_8); + CASE(x[3].w, x[3].x, x[3].y, x[3].z, t.x, t.y, t.z, t.w, ROTATE_8); + // x[2].zwxy += x[3].wxyz; + x[2] += make_uint4(x[3].y, x[3].z, x[3].w, x[3].x); + // t = x[1].yzwx ^ x[2].zwxy; + t = make_uint4(x[1].y, x[1].z, x[1].w, x[1].x) ^ + make_uint4(x[2].z, x[2].w, x[2].x, x[2].y); + // x[1].yzwx = ROTL32(t, ROTATE_7); + CASE(x[1].y, x[1].z, x[1].w, x[1].x, t.x, t.y, t.z, t.w, ROTATE_7); + + // x[1] = shuffle(x[1], MASK_4); + // x[2] = shuffle(x[2], MASK_3); + // x[3] = shuffle(x[3], MASK_2); + } +#undef CASE + + state[0] += x[0]; + state[1] += x[1]; + state[2] += x[2]; + state[3] += x[3]; +} + +inline __device__ void scrypt_ChunkMix_inplace_Bxor_local( + uint4 *__restrict__ B /*[chunkWords]*/, + uint4 *__restrict__ Bxor /*[chunkWords]*/) { + /* 1: X = B_{2r - 1} */ + + /* 2: for i = 0 to 2r - 1 do */ + /* 3: X = H(X ^ B_i) */ + B[0] ^= B[4] ^ Bxor[4] ^ Bxor[0]; + B[1] ^= B[5] ^ Bxor[5] ^ Bxor[1]; + B[2] ^= B[6] ^ Bxor[6] ^ Bxor[2]; + B[3] ^= B[7] ^ Bxor[7] ^ Bxor[3]; + + /* SCRYPT_MIX_FN */ + chacha_core(B); + + /* 4: Y_i = X */ + /* 6: B'[0..r-1] = Y_even */ + /* 6: B'[r..2r-1] = Y_odd */ + + /* 3: X = H(X ^ B_i) */ + B[4] ^= B[0] ^ Bxor[4]; + B[5] ^= B[1] ^ Bxor[5]; + B[6] ^= B[2] ^ Bxor[6]; + B[7] ^= B[3] ^ Bxor[7]; + + /* SCRYPT_MIX_FN */ + chacha_core(B + 4); + + /* 4: Y_i = X */ + /* 6: B'[0..r-1] = Y_even */ + /* 6: B'[r..2r-1] = Y_odd */ +} + +inline __device__ void scrypt_ChunkMix_inplace_local( + uint4 *__restrict__ B /*[chunkWords]*/) { + /* 1: X = B_{2r - 1} */ + + /* 2: for i = 0 to 2r - 1 do */ + /* 3: X = H(X ^ B_i) */ + B[0] ^= B[4]; + B[1] ^= B[5]; + B[2] ^= B[6]; + B[3] ^= B[7]; + + /* SCRYPT_MIX_FN */ + chacha_core(B); + + /* 4: Y_i = X */ + /* 6: B'[0..r-1] = Y_even */ + /* 6: B'[r..2r-1] = Y_odd */ + + /* 3: X = H(X ^ B_i) */ + B[4] ^= B[0]; + B[5] ^= B[1]; + B[6] ^= B[2]; + B[7] ^= B[3]; + + /* SCRYPT_MIX_FN */ + chacha_core(B + 4); + + /* 4: Y_i = X */ + /* 6: B'[0..r-1] = Y_even */ + /* 6: B'[r..2r-1] = Y_odd */ +} + +#define Coord(x, y, z) x + y *(x##SIZE) + z *(y##SIZE) * (x##SIZE) +#define CO Coord(z, x, y) + +template +inline __device__ void scrypt_ROMix_org(uint4 *__restrict__ X, + uint4 *__restrict__ lookup, + uint32_t tnum, uint32_t tid) { + constexpr uint32_t N = 8 * 1024; + const uint32_t zSIZE = 8; + const uint32_t ySIZE = (N / LOOKUP_GAP + (N % LOOKUP_GAP > 0)); + UNUSED(ySIZE); + const uint32_t xSIZE = tnum; + const uint32_t x = tid % xSIZE; + uint32_t i = 0, j = 0, y = 0, z = 0; + uint4 W[8]; + + /* 1: X = B */ + /* implicit */ + + /* 2: for i = 0 to N - 1 do */ + for (y = 0; y < N / LOOKUP_GAP; y++) { + /* 3: V_i = X */ + for (z = 0; z < zSIZE; z++) { + lookup[CO] = X[z]; + } + + for (j = 0; j < LOOKUP_GAP; j++) { + /* 4: X = H(X) */ + scrypt_ChunkMix_inplace_local(X); + } + } + + /* 6: for i = 0 to N - 1 do */ + for (i = 0; i < N; i++) { + /* 7: j = Integerify(X) % N */ + j = X[4].x & (N - 1); + y = j / LOOKUP_GAP; + + for (z = 0; z < zSIZE; z++) { + W[z] = lookup[CO]; + } + + if constexpr (LOOKUP_GAP == 2) { + if (j & 1) { + scrypt_ChunkMix_inplace_local(W); + } + } + + if constexpr (LOOKUP_GAP > 2) { + uint c = j % LOOKUP_GAP; + for (uint k = 0; k < c; k++) { + scrypt_ChunkMix_inplace_local(W); + } + } + + /* 8: X = H(X ^ V_j) */ + scrypt_ChunkMix_inplace_Bxor_local(X, W); + } + + /* 10: B' = X */ + /* implicit */ +} + +template +inline __device__ void scrypt_ROMix_coalesce_access_v1( + uint4 *__restrict__ X, uint32_t *__restrict__ lookup, uint32_t tnum, + uint32_t tid) { + extern __shared__ uint32_t smem[]; + const uint32_t row_length = warpSize + 1; + uint32_t warp_work_space_size = row_length * warpSize; + uint32_t warp_id = threadIdx.x / warpSize; + uint32_t lane_id = threadIdx.x % warpSize; + const uint32_t smem_base_offset = warp_work_space_size * warp_id; + + constexpr uint32_t N = 8 * 1024; + const uint32_t zSIZE = 32; + const uint32_t ySIZE = (N / LOOKUP_GAP + (N % LOOKUP_GAP > 0)); + UNUSED(ySIZE); + const uint32_t xSIZE = tnum; + const uint32_t x = tid % xSIZE; + uint4 W[8]; + + /* 1: X = B */ + /* implicit */ + + /* 2: for i = 0 to N - 1 do */ + for (uint32_t y = 0; y < N / LOOKUP_GAP; y++) { + uint32_t smem_offset = smem_base_offset + row_length * lane_id; + for (uint32_t z = 0; z < zSIZE / 4; z++) { + smem[smem_offset++] = X[z].x; + smem[smem_offset++] = X[z].y; + smem[smem_offset++] = X[z].z; + smem[smem_offset++] = X[z].w; + } + __syncwarp(); + uint32_t offset = x * zSIZE + y * xSIZE * zSIZE; + for (uint32_t k = 0; k < 128 / sizeof(uint32_t); k++) { + uint32_t cur_offset = __shfl_sync(0xffffffff, offset, k); + lookup[cur_offset + lane_id] = + smem[smem_base_offset + k * row_length + lane_id]; + } + __syncwarp(); + + for (uint32_t j = 0; j < LOOKUP_GAP; j++) { + /* 4: X = H(X) */ + scrypt_ChunkMix_inplace_local(X); + } + } + + /* 6: for i = 0 to N - 1 do */ + for (uint32_t i = 0; i < N; i++) { + /* 7: j = Integerify(X) % N */ + uint32_t j = X[4].x & (N - 1); + uint32_t y = j / LOOKUP_GAP; + + uint32_t offset = x * zSIZE + y * xSIZE * zSIZE; + for (uint32_t k = 0; k < 128 / sizeof(uint32_t); k++) { + uint32_t cur_offset = __shfl_sync(0xffffffff, offset, k); + smem[smem_base_offset + k * row_length + lane_id] = + lookup[cur_offset + lane_id]; + } + uint32_t smem_offset = smem_base_offset + row_length * lane_id; + for (uint32_t z = 0; z < zSIZE / 4; z++) { + W[z].x = smem[smem_offset++]; + W[z].y = smem[smem_offset++]; + W[z].z = smem[smem_offset++]; + W[z].w = smem[smem_offset++]; + } + __syncwarp(); + + if constexpr (LOOKUP_GAP == 2) { + if (j & 1) { + scrypt_ChunkMix_inplace_local(W); + } + } + + if constexpr (LOOKUP_GAP > 2) { + uint c = j % LOOKUP_GAP; + for (uint k = 0; k < c; k++) { + scrypt_ChunkMix_inplace_local(W); + } + } + + /* 8: X = H(X ^ V_j) */ + scrypt_ChunkMix_inplace_Bxor_local(X, W); + } + + /* 10: B' = X */ + /* implicit */ +} + +template +__device__ void scrypt_ROMix_coalesce_access_v2(uint4 *__restrict__ X, + uint64_t *__restrict__ lookup, + uint32_t tnum, uint32_t tid) { + extern __shared__ uint64_t smem_v2[]; + const uint32_t sub_warp_size = 16; + const uint32_t row_length = warpSize + 1; + uint32_t warp_work_space_size = row_length * sub_warp_size; + uint32_t warp_id = threadIdx.x / warpSize; + uint32_t sub_warp_id = threadIdx.x / sub_warp_size % 2; + uint32_t lane_id = threadIdx.x % warpSize; + uint32_t sub_lane_id = threadIdx.x % sub_warp_size; + const uint32_t smem_base_offset = warp_work_space_size * warp_id; + + constexpr uint32_t N = 8 * 1024; + const uint32_t zSIZE = 16; + const uint32_t ySIZE = (N / LOOKUP_GAP + (N % LOOKUP_GAP > 0)); + UNUSED(ySIZE); + const uint32_t xSIZE = tnum; + const uint32_t x = tid % xSIZE; + uint4 W[8]; + + /* 1: X = B */ + /* implicit */ + + /* 2: for i = 0 to N - 1 do */ + for (uint32_t y = 0; y < N / LOOKUP_GAP; y++) { + uint32_t smem_offset = smem_base_offset + row_length * sub_lane_id + + sub_warp_id * sub_warp_size; + for (uint32_t z = 0; z < 128 / sizeof(uint4); z++) { + smem_v2[smem_offset++] = (uint64_t)X[z].x | ((uint64_t)X[z].y << 32); + smem_v2[smem_offset++] = (uint64_t)X[z].z | ((uint64_t)X[z].w << 32); + } + __syncwarp(); + uint32_t offset = x * zSIZE + y * xSIZE * zSIZE; + for (uint32_t k = 0; k < 128 / sizeof(uint64_t); k++) { + uint32_t cur_offset = __shfl_sync(0xffffffff, offset, k, 16); + lookup[cur_offset + sub_lane_id] = + smem_v2[smem_base_offset + k * row_length + lane_id]; + } + __syncwarp(); + + for (uint32_t j = 0; j < LOOKUP_GAP; j++) { + /* 4: X = H(X) */ + scrypt_ChunkMix_inplace_local(X); + } + } + + /* 6: for i = 0 to N - 1 do */ + for (uint32_t i = 0; i < N; i++) { + /* 7: j = Integerify(X) % N */ + uint32_t j = X[4].x & (N - 1); + uint32_t y = j / LOOKUP_GAP; + + uint32_t offset = x * zSIZE + y * xSIZE * zSIZE; + uint64_t tmp[16]; + for (uint32_t k = 0; k < 128 / sizeof(uint64_t); k++) { + uint32_t cur_offset = __shfl_sync(0xffffffff, offset, k, 16); + tmp[k] = lookup[cur_offset + sub_lane_id]; + } + __syncwarp(); + for (uint32_t k = 0; k < 128 / sizeof(uint64_t); k++) { + smem_v2[smem_base_offset + k * row_length + lane_id] = tmp[k]; + } + __syncwarp(); + + uint32_t smem_offset = smem_base_offset + row_length * sub_lane_id + + sub_warp_id * sub_warp_size; + for (uint32_t z = 0; z < 128 / sizeof(uint4); z++) { + auto t = smem_v2[smem_offset++]; + W[z].x = t; + W[z].y = t >> 32; + t = smem_v2[smem_offset++]; + W[z].z = t; + W[z].w = t >> 32; + } + __syncwarp(); + + if constexpr (LOOKUP_GAP == 2) { + if (j & 1) { + scrypt_ChunkMix_inplace_local(W); + } + } + + if constexpr (LOOKUP_GAP > 2) { + uint c = j % LOOKUP_GAP; + for (uint k = 0; k < c; k++) { + scrypt_ChunkMix_inplace_local(W); + } + } + + /* 8: X = H(X ^ V_j) */ + scrypt_ChunkMix_inplace_Bxor_local(X, W); + } + + /* 10: B' = X */ + /* implicit */ +} + +template +__device__ void scrypt_ROMix_coalesce_access_v3(uint4 *__restrict__ X, + uint4 *__restrict__ lookup, + uint32_t tnum, uint32_t tid) { + namespace cg = cooperative_groups; + extern __shared__ uint4 smem_v4[]; + const uint32_t sub_warp_size = 8; + const uint32_t row_length = warpSize + 1; + auto tb = cg::this_thread_block(); + auto tile = cg::tiled_partition(tb); // wrapSize=32 + + uint32_t warp_work_space_size = row_length * sub_warp_size; + uint32_t warp_id = threadIdx.x / warpSize; + uint32_t lane_id = threadIdx.x % warpSize; + uint32_t sub_warp_id = lane_id / sub_warp_size; + uint32_t sub_lane_id = threadIdx.x % sub_warp_size; + const uint32_t smem_base_offset = warp_work_space_size * warp_id; + + constexpr uint32_t N = 8 * 1024; + const uint32_t zSIZE = 8; + const uint32_t ySIZE = (N / LOOKUP_GAP + (N % LOOKUP_GAP > 0)); + UNUSED(ySIZE); + const uint32_t xSIZE = tnum; + const uint32_t x = tid % xSIZE; + uint4 W[8]; + + /* 1: X = B */ + /* implicit */ + + /* 2: for i = 0 to N - 1 do */ + for (uint32_t y = 0; y < N / LOOKUP_GAP; y++) { + uint32_t smem_offset = smem_base_offset + row_length * sub_lane_id + + sub_warp_id * sub_warp_size; + for (uint32_t z = 0; z < 128 / sizeof(uint4); z++) { + smem_v4[smem_offset++] = X[z]; + } + __syncwarp(); + uint32_t offset = x * zSIZE + y * xSIZE * zSIZE; + for (uint32_t k = 0; k < 128 / sizeof(uint4); k++) { + uint32_t cur_offset = __shfl_sync(0xffffffff, offset, k, 8); + lookup[cur_offset + sub_lane_id] = + smem_v4[smem_base_offset + k * row_length + lane_id]; + } + __syncwarp(); + + for (uint32_t j = 0; j < LOOKUP_GAP; j++) { + /* 4: X = H(X) */ + scrypt_ChunkMix_inplace_local(X); + } + } + + /* 6: for i = 0 to N - 1 do */ + for (uint32_t i = 0; i < N; i++) { + /* 7: j = Integerify(X) % N */ + uint32_t j = X[4].x & (N - 1); + uint32_t y = j / LOOKUP_GAP; + + uint32_t offset = x * zSIZE + y * xSIZE * zSIZE; + // uint4 tmp[8]; + for (uint32_t k = 0; k < 128 / sizeof(uint4); k++) { + uint32_t cur_offset = __shfl_sync(0xffffffff, offset, k, 8); + // tmp[k] = lookup[cur_offset + sub_lane_id]; + cg::memcpy_async(tile, + &smem_v4[smem_base_offset + k * row_length + + sub_warp_id * sub_warp_size], + sub_warp_size, &lookup[cur_offset], sub_warp_size); + } + // __syncwarp(); + // for (uint32_t k = 0; k < 128 / sizeof(uint4); k++) { + // smem_v4[smem_base_offset + k * row_length + lane_id] = tmp[k]; + // } + // __syncwarp(); + cg::sync(tile); + cg::wait(tile); + uint32_t smem_offset = smem_base_offset + row_length * sub_lane_id + + sub_warp_id * sub_warp_size; + for (uint32_t z = 0; z < 128 / sizeof(uint4); z++) { + W[z] = smem_v4[smem_offset++]; + } + __syncwarp(); + + if constexpr (LOOKUP_GAP == 2) { + if (j & 1) { + scrypt_ChunkMix_inplace_local(W); + } + } + + if constexpr (LOOKUP_GAP > 2) { + uint c = j % LOOKUP_GAP; + for (uint k = 0; k < c; k++) { + scrypt_ChunkMix_inplace_local(W); + } + } + /* 8: X = H(X ^ V_j) */ + scrypt_ChunkMix_inplace_Bxor_local(X, W); + } + + /* 10: B' = X */ + /* implicit */ +} + +#endif diff --git a/src/spacemesh.cu b/src/spacemesh.cu new file mode 100644 index 0000000..27b2a2b --- /dev/null +++ b/src/spacemesh.cu @@ -0,0 +1,135 @@ +#include +#include + +#include "device/kernel.cuh" +#include "spacemesh_cuda/spacemesh.h" +#include "utils.hpp" + +constexpr uint32_t LOOKUP_GAP = 2; +constexpr uint32_t N = 8192; +constexpr uint32_t LOOKUP_MEM_FOR_ONE_TASK = 128 * N; +constexpr uint32_t OUTPUT_MEM_FOR_ONE_TASK = 32; + +struct DeviceContext { + size_t block_dim; + size_t block_num; + size_t max_task_num; +}; + +class Config { +public: + DeviceContext& GetDeviceContext(size_t device_id) { + if (device_id >= device_num_) { + throw std::invalid_argument("device_id is invalid!"); + } + + return device_contexts_.at(device_id); + } + + size_t GetDeviceNum() const { return device_num_; } + + static Config& GetDefault() { + static Config conf; + return conf; + }; + +private: + Config(); + Config(const Config&) = delete; + Config(Config&&) = delete; + Config& operator=(const Config&) = delete; + Config& operator=(Config&&) = delete; + + size_t device_num_; + std::vector device_contexts_; +}; + +inline Config::Config() { + { + int count; + CHECK(cudaGetDeviceCount(&count)); + if (count <= 0) { + throw "cuda device not found!"; + } + device_num_ = count; + } + + for (size_t di = 0; di < device_num_; ++di) { + GPUContextSwitcher switcher(di); + UNUSED(switcher); + + size_t free_mem; + size_t total; + CHECK(cudaMemGetInfo(&free_mem, &total)); + UNUSED(total); + auto device_prop = GetDeviceProp(di); + + size_t block_num = device_prop.multiProcessorCount; + size_t smsp_num = GetSMSPNum(device_prop.major, device_prop.minor); + size_t block_dim = device_prop.warpSize * smsp_num; + + size_t use_mem = + block_num * block_dim * + (OUTPUT_MEM_FOR_ONE_TASK + LOOKUP_MEM_FOR_ONE_TASK / LOOKUP_GAP); + + while (use_mem * 2 < free_mem) { + block_num *= 2; + use_mem = + block_num * block_dim * + (OUTPUT_MEM_FOR_ONE_TASK + LOOKUP_MEM_FOR_ONE_TASK / LOOKUP_GAP); + } + size_t max_thread_num = block_num * block_dim; + size_t max_task_num = block_dim * block_num; + + while (true) { + use_mem = max_task_num * 2 * OUTPUT_MEM_FOR_ONE_TASK + + max_thread_num * LOOKUP_MEM_FOR_ONE_TASK / LOOKUP_GAP; + if (use_mem >= free_mem || max_task_num > 1024 * 1024 * 8) { + break; + } else { + max_task_num *= 2; + } + } + + device_contexts_.push_back({}); + device_contexts_.back().block_dim = block_dim; + device_contexts_.back().block_num = block_num; + device_contexts_.back().max_task_num = max_task_num; + } +} + +uint32_t spacemesh_get_device_num() { + return Config::GetDefault().GetDeviceNum(); +} + +uint32_t spacemesh_get_max_task_num(uint32_t device_idx) { + return Config::GetDefault().GetDeviceContext(device_idx).max_task_num; +} + +void spacemesh_scrypt(uint32_t device_idx, const uint64_t starting_index, + const uint32_t* input, const uint32_t task_num, + uint32_t* output) { + auto& ctx = Config::GetDefault().GetDeviceContext(device_idx); + GPUContextSwitcher switcher(device_idx); + UNUSED(switcher); + + if (task_num > ctx.max_task_num) { + throw std::invalid_argument("task_num must less " + + std::to_string(ctx.max_task_num)); + } + + CudaDeviceMem d_output(task_num * 2); + CudaDeviceMem d_lookup(ctx.block_dim * ctx.block_num * 8 * N / + LOOKUP_GAP); + + uint4 input_1 = make_uint4(input[0], input[1], input[2], input[3]); + uint4 input_2 = make_uint4(input[4], input[5], input[6], input[7]); + + scrypt_coalesce_access_v3 + <<>>( + starting_index, task_num, input_1, input_2, d_lookup.Ptr(), + d_output.Ptr()); + CHECK(cudaMemcpy(output, d_output.Ptr(), task_num * OUTPUT_MEM_FOR_ONE_TASK, + cudaMemcpyDeviceToHost)); +} diff --git a/src/utils.hpp b/src/utils.hpp new file mode 100644 index 0000000..2c716f4 --- /dev/null +++ b/src/utils.hpp @@ -0,0 +1,545 @@ +#ifndef UTILS_HPP +#define UTILS_HPP + +#include + +#include +#include +#include + +#define CHECK(call) \ + do { \ + const cudaError_t error_code = call; \ + if (error_code != cudaSuccess) { \ + printf("CUDA Error:\n"); \ + printf(" File: %s\n", __FILE__); \ + printf(" Line: %d\n", __LINE__); \ + printf(" Error code: %d\n", error_code); \ + printf(" Error text: %s\n", cudaGetErrorString(error_code)); \ + exit(1); \ + } \ + } while (0); + +class GPUContextSwitcher { +public: + /** + * Use CUDA GPU with index "gpu_index". + * @param gpu_index The index of cuda capable GPU. + */ + GPUContextSwitcher(uint32_t gpu_index) { + CHECK(cudaGetDevice(&old_gpu_index_)); + CHECK(cudaSetDevice(gpu_index)); + } + + /** + * Restore the GPU context that was used before creating the + * GpuContextSwitcher if Restore() has not been called. + */ + ~GPUContextSwitcher() { Restore(); } + + /** + * Restore the GPU context that was used before creating the + * GpuContextSwitcher if Restore() has not been called. + */ + void Restore() noexcept { + if (old_gpu_index_ != -1) { + cudaSetDevice(old_gpu_index_); + old_gpu_index_ = -1; + } + } + +private: + int old_gpu_index_; +}; + +static inline cudaDeviceProp GetDeviceProp(int device_id) { + cudaDeviceProp prop; + CHECK(cudaGetDeviceProperties(&prop, device_id)); + return prop; +} + +static inline uint32_t GetSMSPNum(int major, int minor) { + const auto& invalid_msg = [](int major, int minor) -> std::string { + char buf[64]; + sprintf(buf, "Invalid argument (major=%d, minor=%d) of compute capability", + major, minor); + return buf; + }; + + switch (major) { + case 1: { + switch (minor) { + case 0: + case 1: + case 2: + case 3: + return 1; + default: + throw std::invalid_argument(invalid_msg(major, minor)); + } + } + case 2: { + switch (minor) { + case 0: + case 1: + return 1; + default: + throw std::invalid_argument(invalid_msg(major, minor)); + } + } + case 3: { + switch (minor) { + case 0: + case 2: + case 5: + case 7: + return 1; + default: + throw std::invalid_argument(invalid_msg(major, minor)); + } + } + case 5: { + switch (minor) { + case 0: + case 2: + case 3: + return 4; + default: + throw std::invalid_argument(invalid_msg(major, minor)); + } + } + case 6: { + switch (minor) { + case 0: { + return 2; + } + case 1: + case 2: { + return 4; + } + default: + throw std::invalid_argument(invalid_msg(major, minor)); + } + } + case 7: { + switch (minor) { + case 0: + case 2: + case 5: + return 4; + default: + throw std::invalid_argument(invalid_msg(major, minor)); + } + } + case 8: { + switch (minor) { + case 0: + case 6: + case 7: + case 9: + return 4; + default: + throw std::invalid_argument(invalid_msg(major, minor)); + } + } + case 9: { + switch (minor) { + case 0: + return 4; + default: + throw std::invalid_argument(invalid_msg(major, minor)); + } + } + default: + throw std::invalid_argument(invalid_msg(major, minor)); + } +} + +/* + * An RAII class of cuda device memory. + * Represent a one-dimensional array stored on the device. + * @tparam E The data type of the elements in the array. + */ +template +class CudaDeviceMem { +public: + /** + * Default constructor. + * Doesn't allocate array on the device. + */ + CudaDeviceMem() : CudaDeviceMem(0) {} + + /** + * Allocates a one-dimensional array capable of accommodating element_num + * elements on the device. + * @param element_num + */ + CudaDeviceMem(size_t element_num) : d_ptr_(nullptr), element_num_(0) { + Resize(element_num); + } + + /** + * Allocate a device-side array of the same size as "other" and copy all the + * data from "other" to this array. + * @param other + */ + CudaDeviceMem(const CudaDeviceMem& other) : CudaDeviceMem() { *this = other; } + + /** + * Move constructor. + * Move the device-side array stored in "other" into this object. + * @param other + */ + CudaDeviceMem(CudaDeviceMem&& other) noexcept : CudaDeviceMem() { + *this = std::move(other); + } + + /** + * Destructor. + */ + ~CudaDeviceMem() noexcept { + try { + Clear(); + } catch (...) { + // Do nothing + } + } + + /** + * Get the pointer to the device-side array. + * @return The pointer to the device-side array. + */ + E* Ptr() noexcept { return d_ptr_; } + + /** + * Get the pointer to the device-side array. + * @return The pointer to the device-side array. + */ + const E* Ptr() const noexcept { return d_ptr_; } + + /** + * Type conversion operator. + * @return The pointer to the device-side array. + */ + operator E*() noexcept { return d_ptr_; } + + /** + * Type conversion operator. + * @return The pointer to the device-side array. + */ + operator const E*() const noexcept { return d_ptr_; } + + /** + * Get the size of the array on the device side in bytes. + * @return The size of the array on the device side, in bytes. + */ + size_t SizeInBytes() const noexcept { return element_num_ * sizeof(E); } + + /** + * Get the size of the array on the device side in terms of the number of + * elements. + * @return The size of the array on the device side in terms of the number of + * elements. + */ + size_t Num() const noexcept { return element_num_; } + + /** + * Resize the array on the device side. + * If the new array size is the same as the old array size, then simply return + * directly. + * @param element_num New size of the array on the device side in terms of the + * number of elements. + */ + void Resize(size_t element_num) { + if (element_num_ != element_num) { + Clear(); + if (element_num > 0) { + CHECK(cudaMalloc(&d_ptr_, element_num * sizeof(E))); + } + element_num_ = element_num; + } + } + + /** + * Free the array on the device side. + */ + void Clear() { + if (d_ptr_ != nullptr) { + CHECK(cudaFree(d_ptr_)); + d_ptr_ = nullptr; + element_num_ = 0; + } + } + + /** + * Copy assignment operator. + * Resize this array and copy all data from other to this array. + * @param other + * @return Reference to this object. + */ + CudaDeviceMem& operator=(const CudaDeviceMem& other) { + if (this == &other) { + return *this; + } + + if (other.d_ptr_ != nullptr) { + if (element_num_ != other.element_num_) { + Resize(other.element_num_); + } + CHECK(cudaMemcpy(d_ptr_, other.d_ptr_, other.SizeInBytes(), + cudaMemcpyDeviceToDevice)); + } else { + Clear(); + } + return *this; + } + + /** + * Move assignment operator. + * @param other + * @return Reference to this object. + */ + CudaDeviceMem& operator=(CudaDeviceMem&& other) { + if (this == &other) { + return *this; + } + Clear(); + d_ptr_ = other.d_ptr_; + other.d_ptr_ = nullptr; + + element_num_ = other.element_num_; + other.element_num_ = 0; + return *this; + } + +private: + E* d_ptr_; + size_t element_num_; +}; + +/** + * An RAII class of cuda host memory. + * Represent a one-dimensional array stored on the host side. + * @tparam E The data type of the elements in the array. + */ +template +class CudaHostMem { +public: + /** + * Default constructor. + * Doesn't allocate array on the host side. + */ + CudaHostMem() : CudaHostMem(0) {} + + /** + * Allocates a one-dimensional array capable of accommodating element_num + * elements with flag cudaHostAllocDefault on the host side. + * @param element_num + */ + CudaHostMem(size_t element_num) + : d_ptr_(nullptr), + h_ptr_(nullptr), + element_num_(0), + flags_(cudaHostAllocDefault) { + Resize(element_num); + } + + /** + * Allocates a one-dimensional array capable of accommodating element_num + * elements with the specified flags on the host side. + * @param element_num + * @param flags The flags used when allocate host-side memory. The value of + * the flags parameter needs to be one of the flags that the cudaHostAlloc() + * function can accept. + */ + CudaHostMem(size_t element_num, unsigned int flags) + : d_ptr_(nullptr), + h_ptr_(nullptr), + element_num_(0), + flags_(cudaHostAllocDefault) { + Resize(element_num, flags); + } + + /** + * Copy constructor. + * Allocate a host-side array of the same size and the same flags as "other" + * and copy all the data from "other" to this array. + * @param other + */ + CudaHostMem(const CudaHostMem& other) : CudaHostMem() { *this = other; } + + /** + * Move constructor. + * Move the host-side array stored in "other" into this object. + * @param other + */ + CudaHostMem(CudaHostMem&& other) noexcept : CudaHostMem() { + *this = std::move(other); + } + + /** + * Destructor. + */ + ~CudaHostMem() noexcept { + try { + Clear(); + } catch (...) { + // Do nothing. + } + } + + /** + * Get the host-side pointer to the array. + * @return The host-side pointer to the array. + */ + E* HPtr() noexcept { return h_ptr_; } + + /** + * Get the host-side pointer to the array. + * @return The host-side pointer to the array. + */ + const E* HPtr() const noexcept { return h_ptr_; } + + /** + * Get the mapped device-side pointer to the array. + * @return The mapped device-side pointer to the array. If the array is not + * allocated with flag cudaHostAllocMapped, nullptr is returned. + */ + E* DPtr() noexcept { return d_ptr_; } + + /** + * Get the mapped device-side pointer to the array. + * @return The mapped device-side pointer to the array. If the array is not + * allocated with flag cudaHostAllocMapped, nullptr is returned. + */ + const E* DPtr() const noexcept { return d_ptr_; } + + /** + * Get the size of the array on the host side in bytes. + * @return The size of the array on the host side, in bytes. + */ + size_t SizeInBytes() const noexcept { return element_num_ * sizeof(E); } + + /** + * Get the size of the array on the host side in terms of the number of + * elements. + * @return The size of the array on the host side in terms of the number of + * elements. + */ + size_t Num() const noexcept { return element_num_; } + + /** + * Resize the array on the host side. + * If the new array size is the same as the old array size, then simply return + * directly. + * @param element_num New size of the array on the host side in terms of the + * number of elements. + */ + void Resize(size_t element_num) { Resize(element_num, flags_); } + + /** + * Resize the array on the host side with new flags. + * If the new array size and the new flags is the same as the old array then + * simply return directly. + * @param element_num New size of the array on the host side in terms of the + * number of elements. + * @param flags The flags used when allocate host-side memory. The value of + * the flags parameter needs to be one of the flags that the cudaHostAlloc() + * function can accept. + */ + void Resize(size_t element_num, unsigned int flags) { + if (element_num == element_num_ && flags == flags_) { + return; + } + Clear(); + if (element_num > 0) { + CHECK(cudaHostAlloc(&h_ptr_, element_num * sizeof(E), flags)); + if ((cudaHostAllocMapped & flags) != 0) { + try { + CHECK(cudaHostGetDevicePointer(&d_ptr_, h_ptr_, 0)); + } catch (std::exception& e) { + Clear(); + throw e; + } + } + } + element_num_ = element_num; + flags_ = flags; + } + + /** + * Free the host-side array. + */ + void Clear() { + if (h_ptr_ != nullptr) { + CHECK(cudaFreeHost(h_ptr_)); + } + h_ptr_ = d_ptr_ = nullptr; + element_num_ = 0; + } + + /** + * Copy assignment operator. + * Resize the array managed in this object with the flags used when create the + * array managed in the other object. Then copy all data from other to the new + * array. + * @param other + * @return Reference to this object. + */ + CudaHostMem& operator=(const CudaHostMem& other) { + if (this == &other) { + return *this; + } + + if (other.h_ptr_ != nullptr) { + if (element_num_ != other.element_num_ || flags_ != other.flags_) { + Resize(other.element_num_, other.flags_); + } + memcpy(h_ptr_, other.h_ptr_, other.SizeInBytes()); + } else { + Clear(); + flags_ = other.flags_; + } + return *this; + } + + /** + * Move assignment operator. + * @param other + * @return Reference to this object. + */ + CudaHostMem& operator=(CudaHostMem&& other) { + if (this == &other) { + return *this; + } + Clear(); + flags_ = other.flags_; + other.flags_ = cudaHostAllocDefault; + + h_ptr_ = other.h_ptr_; + other.h_ptr_ = nullptr; + + d_ptr_ = other.d_ptr_; + other.d_ptr_ = nullptr; + + element_num_ = other.element_num_; + other.element_num_ = 0; + return *this; + } + + /** + * Obtaining the flags used when creating a one-dimensional array on the host + * side. + * @return The flags used when creating a one-dimensional array on the host + * side. + */ + unsigned int GetFlags() const noexcept { return flags_; } + +private: + E* d_ptr_; + E* h_ptr_; + size_t element_num_; + unsigned int flags_; +}; + +#endif diff --git a/test/perf.cu b/test/perf.cu new file mode 100644 index 0000000..85515bb --- /dev/null +++ b/test/perf.cu @@ -0,0 +1,182 @@ +#include + +#include + +#include "device/kernel.cuh" +#include "gtest/gtest.h" +#include "utils.hpp" + +TEST(Spacemesh, PerfCoalesceOrg) { + constexpr uint32_t LOOKUP_GAP = 2; + constexpr uint32_t task_num_per_thread = 128; + + uint4 in0{2839345266U, 42009750U, 875455879U, 2217211394U}; + uint4 in1{3438177526U, 2734532412U, 2819254414U, 1408356118U}; + + const uint32_t N = 8 * 1024; + uint64_t starting_index = 0; + + auto device_prop = GetDeviceProp(0); + uint32_t smsp_num = GetSMSPNum(device_prop.major, device_prop.minor); + uint32_t core_num = + device_prop.warpSize * smsp_num * device_prop.multiProcessorCount; + + uint32_t thread_num = core_num * LOOKUP_GAP; + uint32_t task_num = thread_num * task_num_per_thread; + + CudaDeviceMem d_out(task_num * 2); + CudaDeviceMem d_lookup(N / LOOKUP_GAP * 32 * thread_num); + CudaHostMem h_out(task_num * 2); + + constexpr size_t block_dim = 256; + size_t block_num = thread_num / block_dim; + size_t iter = 1; + + auto st = std::chrono::steady_clock::now(); + for (size_t i = 0; i < iter; ++i) { + scrypt_org<<>>( + starting_index, task_num, in0, in1, + reinterpret_cast(d_lookup.Ptr()), d_out.Ptr()); + } + CHECK(cudaMemcpy(h_out.HPtr(), d_out.Ptr(), d_out.SizeInBytes(), + cudaMemcpyDeviceToHost)); + auto ed = std::chrono::steady_clock::now(); + + double d = + std::chrono::duration_cast(ed - st).count(); + std::cout << "[org] block_num: " << block_num << ", block_dim: " << block_dim + << ", time: " << d / iter << "ms, throughput: " + << task_num * iter / d * 1000 * 16.0 / 1024 / 1024 << "MB/s\n"; +} + +TEST(Spacemesh, PerfCoalesceV1) { + constexpr uint32_t LOOKUP_GAP = 2; + constexpr uint32_t task_num_per_thread = 128; + + uint4 in0{2839345266U, 42009750U, 875455879U, 2217211394U}; + uint4 in1{3438177526U, 2734532412U, 2819254414U, 1408356118U}; + + const uint32_t N = 8 * 1024; + uint64_t starting_index = 0; + + auto device_prop = GetDeviceProp(0); + uint32_t smsp_num = GetSMSPNum(device_prop.major, device_prop.minor); + uint32_t core_num = + device_prop.warpSize * smsp_num * device_prop.multiProcessorCount; + + uint32_t thread_num = core_num * LOOKUP_GAP; + uint32_t task_num = thread_num * task_num_per_thread; + + CudaDeviceMem d_out(task_num * 2); + CudaDeviceMem d_lookup(N / LOOKUP_GAP * 32 * thread_num); + CudaHostMem h_out(task_num * 2); + + constexpr size_t block_dim = 256; + size_t block_num = thread_num / block_dim; + size_t iter = 1; + + auto st = std::chrono::steady_clock::now(); + for (size_t i = 0; i < iter; ++i) { + scrypt_coalesce_access_v1 + <<>>( + starting_index, task_num, in0, in1, + reinterpret_cast(d_lookup.Ptr()), d_out.Ptr()); + } + CHECK(cudaMemcpy(h_out.HPtr(), d_out.Ptr(), d_out.SizeInBytes(), + cudaMemcpyDeviceToHost)); + auto ed = std::chrono::steady_clock::now(); + + double d = + std::chrono::duration_cast(ed - st).count(); + std::cout << "[v1] block_num: " << block_num << ", block_dim: " << block_dim + << ", time: " << d / iter << "ms, throughput: " + << task_num * iter / d * 1000 * 16.0 / 1024 / 1024 << "MB/s\n"; +} + +TEST(Spacemesh, PerfCoalesceV2) { + constexpr uint32_t LOOKUP_GAP = 2; + constexpr uint32_t task_num_per_thread = 512; + + uint4 in0{2839345266U, 42009750U, 875455879U, 2217211394U}; + uint4 in1{3438177526U, 2734532412U, 2819254414U, 1408356118U}; + + const uint32_t N = 8 * 1024; + uint64_t starting_index = 0; + + auto device_prop = GetDeviceProp(0); + uint32_t smsp_num = GetSMSPNum(device_prop.major, device_prop.minor); + uint32_t core_num = + device_prop.warpSize * smsp_num * device_prop.multiProcessorCount; + + uint32_t thread_num = core_num * LOOKUP_GAP; + uint32_t task_num = thread_num * task_num_per_thread; + + CudaDeviceMem d_out(task_num * 2); + CudaDeviceMem d_lookup(N / LOOKUP_GAP * 32 * thread_num); + CudaHostMem h_out(task_num * 2); + + constexpr size_t block_dim = 256; + size_t block_num = thread_num / block_dim; + size_t iter = 1; + + auto st = std::chrono::steady_clock::now(); + for (size_t i = 0; i < iter; ++i) { + scrypt_coalesce_access_v2 + <<>>( + starting_index, task_num, in0, in1, + reinterpret_cast(d_lookup.Ptr()), d_out.Ptr()); + } + CHECK(cudaMemcpy(h_out.HPtr(), d_out.Ptr(), d_out.SizeInBytes(), + cudaMemcpyDeviceToHost)); + auto ed = std::chrono::steady_clock::now(); + + double d = + std::chrono::duration_cast(ed - st).count(); + std::cout << "[v2] block_num: " << block_num << ", block_dim: " << block_dim + << ", time: " << d / iter << "ms, throughput: " + << task_num * iter / d * 1000 * 16.0 / 1024 / 1024 << "MB/s\n"; +} + +TEST(Spacemesh, PerfCoalesceV3) { + constexpr uint32_t LOOKUP_GAP = 2; + constexpr uint32_t task_num_per_thread = 512; + + uint4 in0{2839345266U, 42009750U, 875455879U, 2217211394U}; + uint4 in1{3438177526U, 2734532412U, 2819254414U, 1408356118U}; + + const uint32_t N = 8 * 1024; + uint64_t starting_index = 0; + + auto device_prop = GetDeviceProp(0); + uint32_t smsp_num = GetSMSPNum(device_prop.major, device_prop.minor); + uint32_t core_num = + device_prop.warpSize * smsp_num * device_prop.multiProcessorCount; + + uint32_t thread_num = core_num * LOOKUP_GAP; + uint32_t task_num = thread_num * task_num_per_thread; + + CudaDeviceMem d_out(task_num * 2); + CudaDeviceMem d_lookup(N / LOOKUP_GAP * 32 * thread_num); + CudaHostMem h_out(task_num * 2); + + constexpr size_t block_dim = 256; + size_t block_num = thread_num / block_dim; + size_t iter = 1; + + auto st = std::chrono::steady_clock::now(); + for (size_t i = 0; i < iter; ++i) { + scrypt_coalesce_access_v3 + <<>>( + starting_index, task_num, in0, in1, + reinterpret_cast(d_lookup.Ptr()), d_out.Ptr()); + } + CHECK(cudaMemcpy(h_out.HPtr(), d_out.Ptr(), d_out.SizeInBytes(), + cudaMemcpyDeviceToHost)); + auto ed = std::chrono::steady_clock::now(); + + double d = + std::chrono::duration_cast(ed - st).count(); + std::cout << "[v3] block_num: " << block_num << ", block_dim: " << block_dim + << ", time: " << d / iter << "ms, throughput: " + << task_num * iter / d * 1000 * 16.0 / 1024 / 1024 << "MB/s\n"; +} diff --git a/test/test_kernel.cu b/test/test_kernel.cu new file mode 100644 index 0000000..079d33d --- /dev/null +++ b/test/test_kernel.cu @@ -0,0 +1,463 @@ +#include + +#include "device/kernel.cuh" +#include "device/pbkdf2.cuh" +#include "device/romix.cuh" +#include "gtest/gtest.h" +#include "spacemesh_cuda/spacemesh.h" +#include "utils.hpp" + +static bool operator==(const uint4 &u0, const uint4 &u1) { + return u0.x == u1.x && u0.y == u1.y && u0.z == u1.z && u0.w == u1.w; +} + +static __global__ void pbkdf2_128b(const uint N, const ulong starting_index, + const uint4 *const __restrict__ input, + uint4 *const __restrict__ output, + const uint num_tasks) { + uint32_t tid = blockDim.x * blockIdx.x + threadIdx.x; + uint32_t tnum = gridDim.x * blockDim.x; + + uint4 password[5]; + uint4 X[8]; + for (uint32_t t = tid; t < num_tasks; t += tnum) { + const uint64_t index = starting_index + t; + + password[0] = input[0]; + password[1] = input[1]; + password[2].x = uint32_t(index & 0xFFFFFFFF); + password[2].y = uint32_t((index >> 32) & 0xFFFFFFFF); + password[2].z = 0; + password[2].w = 0; + password[3] = make_zero(); + password[4] = make_zero(); + /* 1: X = PBKDF2(password, salt) */ + scrypt_pbkdf2_128B(password, X); + for (uint32_t i = 0; i < 8; ++i) { + output[t * 8 + i] = X[i]; + } + } +} + +TEST(PBKDF2_128B, CheckResult) { + const uint32_t N = 8192; + const uint32_t num_tasks = 8704 * 2; + const uint64_t starting_index = 0UL; + CudaDeviceMem input(2); + CudaDeviceMem output(8 * num_tasks); + uint4 h_input[2]{2839345266U, 42009750U, 875455879U, 2217211394U, + 3438177526U, 2734532412U, 2819254414U, 1408356118U}; + // clang-format off + // {x[0]xyzw, x[7].xyzw} + std::vector> h_output_ref{ + {{2243168157, 324902921, 784369288, 4178555589}, {594727500, 3520078779, 3153430745, 1486369834}}, + {{2223702091, 3135234577, 351746947, 628596597}, {1974692412, 2751762247, 1815359819, 1220784090}}, + {{1757856292, 3796090370, 105343294, 740218899}, {862746119, 1267304388, 4212448263, 3102108417}}, + {{17321531, 3055860495, 2259029015, 3918725981}, {3736254989, 3761189418, 149153817, 3819126153}}, + {{1834041421, 3427652492, 3278849906, 3382042170}, {3393761384, 947759528, 1750308469, 1815762229}}, + {{3313951662, 1644567330, 866636170, 1422164638}, {1774972657, 1616289065, 2116049991, 2906510373}}, + {{3199039070, 694981869, 1336937698, 1163541043}, {806088862, 1536940888, 2821292057, 915496211}}, + {{4078128437, 2379231243, 1604075742, 2325245807}, {360554255, 875207183, 1516458558, 421131869}}, + {{1087253935, 2208644287, 2756603925, 3971895705}, {1895637869, 3354544041, 2252449461, 218427034}}, + {{1967339238, 1146502695, 3362372873, 2541765279}, {3706574425, 429585357, 9437500, 679403288}}, + {{3457721097, 2061161947, 4289243029, 2170079478}, {2636111503, 3688484586, 4694583, 120348073}}, + {{2852852322, 159818758, 310172246, 2704348751}, {233655115, 1516867167, 2442547836, 936759168}}, + {{3722947891, 3394662328, 1171702661, 3885525270}, {3053353333, 1393137549, 2858266450, 954570086}}, + {{4270365148, 3284942429, 639193268, 1452194571}, {1161264254, 3727720264, 2913344122, 3397135746}}, + {{596580228, 1392092807, 843646682, 3541695628}, {2590893734, 2198545995, 237641943, 602469347}}, + {{2455879077, 2319967660, 995689268, 1471734608}, {992029320, 1234415286, 3489799059, 2101928371}} + }; + // clang-format on + CudaHostMem h_output(8 * num_tasks); + cudaMemcpy(input.Ptr(), h_input, input.SizeInBytes(), cudaMemcpyHostToDevice); + const uint32_t BLOCK_DIM = 32; + const uint32_t GRID_DIM = (num_tasks - BLOCK_DIM - 1) / BLOCK_DIM / 2; + pbkdf2_128b<<>>(N, starting_index, input.Ptr(), + output.Ptr(), num_tasks); + cudaMemcpy(h_output.HPtr(), output.Ptr(), h_output.SizeInBytes(), + cudaMemcpyDeviceToHost); + auto *p = h_output.HPtr(); + for (size_t i = 0; i < h_output_ref.size(); ++i) { + uint4 st = p[i * 8]; + uint4 ed = p[i * 8 + 7]; + uint4 st_ref = h_output_ref[i].first; + uint4 ed_ref = h_output_ref[i].second; + EXPECT_TRUE(st == st_ref && ed == ed_ref); + } +} + +static __global__ void pbkdf2_32b(const ulong starting_index, + const uint4 *const __restrict__ input, + uint4 *const __restrict__ output, + const uint num_tasks) { + uint32_t tid = blockDim.x * blockIdx.x + threadIdx.x; + uint32_t tnum = gridDim.x * blockDim.x; + + uint4 password[5]; + uint4 X[8]; + for (uint32_t t = tid; t < num_tasks; t += tnum) { + const uint64_t index = starting_index + t; + + password[0] = input[0]; + password[1] = input[1]; + password[2].x = uint32_t(index & 0xFFFFFFFF); + password[2].y = uint32_t((index >> 32) & 0xFFFFFFFF); + password[2].z = 0; + password[2].w = 0; + password[3] = make_zero(); + password[4] = make_zero(); + + scrypt_pbkdf2_128B(password, X); + scrypt_pbkdf2_32B(password, X, &output[t * 2]); + } +} + +TEST(PBKDF2_32B, CheckResult) { + // clang-format off + std::vector output_ref{ + {1541574387, 488907923, 795739296, 3660924057}, {1595429963, 1442715467, 3454537610, 303168103}, + {2832999825, 1792965534, 3851604374, 1543672901}, {2252171238, 2489374518, 4196827066, 1925643828}, + {3776545153, 2493253923, 1983841114, 4189265163}, {3106590296, 3126490259, 4292680934, 3487255118}, + {1138967569, 3223815245, 3999001665, 2833458212}, {525278724, 204474235, 1590998285, 1126296421}, + {1712137169, 3649550485, 2728357207, 2572560430}, {629927608, 1494199002, 426129028, 2296958300}, + {612931869, 2928911829, 999195935, 3894654826}, {711808285, 773108236, 1884424028, 405027227}, + {3496895468, 4098261784, 1205595361, 987182193}, {1092867524, 3407967604, 1630381730, 850280901}, + {2344968929, 1230032780, 533878510, 782288479}, {3426145836, 2789001978, 2013751058, 3920530103}, + {1004256693, 2820654651, 3302594902, 1895517683}, {1715443604, 1822444432, 3642638638, 4172159742}, + {361838609, 2572724440, 3232663262, 1640532158}, {3156298490, 285734263, 2107779922, 124293624}, + {2875742700, 1740468830, 292959372, 1165028846}, {1034497710, 841845972, 3869241330, 279539866}, + {1684353161, 2791287652, 639938640, 696103378}, {1098941359, 3059816824, 80312190, 1352994082}, + {491432485, 1977606223, 568579903, 2848969311}, {856678766, 694736051, 1544114686, 3391276793}, + {528722383, 4221448681, 1427708755, 3435410113}, {108301243, 3032419668, 401350649, 1497139475}, + {1633560854, 1259258950, 2231332965, 3069488567}, {3422183957, 1262157060, 2082726213, 1043984063}, + {1659437647, 2520492023, 1526642929, 2037984020}, {3943654391, 2575523191, 3898673117, 1543536477}, + }; + // clang-format on + + uint32_t num_tasks = 8704 * 2; + uint64_t starting_index = 0UL; + CudaDeviceMem input(2); + uint4 h_input[2]{2839345266U, 42009750U, 875455879U, 2217211394U, + 3438177526U, 2734532412U, 2819254414U, 1408356118U}; + cudaMemcpy(input.Ptr(), h_input, input.SizeInBytes(), cudaMemcpyHostToDevice); + CudaDeviceMem output(num_tasks * 2); + CudaHostMem h_out(num_tasks * 2); + + uint32_t BLOCK_DIM = 32; + uint32_t GRID_DIM = (num_tasks - BLOCK_DIM - 1) / BLOCK_DIM / 2; + pbkdf2_32b<<>>(starting_index, input.Ptr(), output.Ptr(), + num_tasks); + cudaMemcpy(h_out.HPtr(), output.Ptr(), output.SizeInBytes(), + cudaMemcpyDeviceToHost); + uint4 *p = h_out.HPtr(); + for (size_t i = 0; i < output_ref.size(); ++i) { + EXPECT_TRUE(p[i] == output_ref[i]); + } +} + +template +static __global__ void romix(const uint32_t N, const ulong starting_index, + const uint4 *const __restrict__ input, + uint4 *const __restrict__ padcache, + uint4 *const __restrict__ output, + const uint num_tasks) { + uint32_t tid = blockDim.x * blockIdx.x + threadIdx.x; + uint32_t tnum = gridDim.x * blockDim.x; + + uint4 password[5]; + uint4 X[8]; + for (uint32_t t = tid; t < num_tasks; t += tnum) { + const uint64_t index = starting_index + t; + + password[0] = input[0]; + password[1] = input[1]; + password[2].x = uint32_t(index & 0xFFFFFFFF); + password[2].y = uint32_t((index >> 32) & 0xFFFFFFFF); + password[2].z = 0; + password[2].w = 0; + password[3] = make_zero(); + password[4] = make_zero(); + + scrypt_pbkdf2_128B(password, X); + scrypt_ROMix_org(X, padcache, tnum, tid); + output[t * 2] = X[0]; + output[t * 2 + 1] = X[7]; + } +} + +TEST(Romix, CheckResult) { + // clang-format off + std::vector> output_ref{ + {{177095052, 2769478868, 1267860214, 3840882696}, {4209226529, 2478377306, 3668441954, 661024389}}, + {{675217897, 1210703173, 2821523351, 395671908}, {1164792708, 2377829658, 1357012668, 3086536117}}, + {{2122569980, 1521965530, 428608715, 1170231827}, {711050857, 3859067996, 201759563, 1563548156}}, + {{1157318318, 2940549691, 1127296292, 3378566105}, {4262566715, 148588802, 1200061494, 186445103}}, + {{1910618740, 3668591534, 543383426, 3111805377}, {1064989078, 4060066605, 1451448625, 391663587}}, + {{4214846473, 2831090917, 2414213101, 1567041697}, {4072116936, 1223198630, 1381754999, 3832720526}}, + {{1999710917, 3830843826, 2091719392, 3506640524}, {1612283548, 4084973170, 2005582528, 4160760954}}, + {{3701562884, 1369415191, 231608195, 3440200853}, {1936641932, 3560076262, 2893706191, 348983257}}, + {{1714814708, 806504637, 1185624757, 1309360160}, {1093527150, 3638059477, 3970998707, 2784271355}}, + {{1640097667, 3808618373, 1719302163, 3224052072}, {2464420809, 2492362386, 2238290668, 1350255901}}, + {{3593000297, 2800731264, 510712390, 3370323384}, {607950715, 1260312177, 248473348, 3713416381}}, + {{4278573785, 4219952570, 252971591, 1295396640}, {1479543003, 1443275236, 1565620974, 4172569527}}, + {{895020402, 4109313948, 184256163, 1338271264}, {2928529152, 1810204067, 3068405352, 4239813782}}, + {{4014920583, 1109162161, 4257725846, 444189643}, {1724450670, 2872480592, 4016779893, 878820321}}, + {{715651626, 3257863402, 2715797466, 2730055762}, {2101471733, 1562788948, 1551284046, 2392879513}}, + {{3992784225, 60299832, 649066318, 1383234528}, {3861971681, 1804071335, 1551305386, 1951189750}}, + }; + // clang-format on + + const uint32_t N = 8192; + uint32_t num_tasks = 8704; + uint64_t starting_index = 0UL; + const uint32_t loopup_gap = 1; + + CudaDeviceMem input(2); + uint4 h_input[2]{2839345266U, 42009750U, 875455879U, 2217211394U, + 3438177526U, 2734532412U, 2819254414U, 1408356118U}; + cudaMemcpy(input.Ptr(), h_input, input.SizeInBytes(), cudaMemcpyHostToDevice); + CudaDeviceMem output(num_tasks * 2); + CudaHostMem h_out(num_tasks * 2); + + uint32_t BLOCK_DIM = 32; + uint32_t GRID_DIM = (num_tasks - BLOCK_DIM - 1) / BLOCK_DIM; + const size_t global_size = GRID_DIM * BLOCK_DIM; + CudaDeviceMem loopup(N / loopup_gap * 8 * global_size); + + romix<<>>( + N, starting_index, input.Ptr(), loopup.Ptr(), output.Ptr(), num_tasks); + + cudaMemcpy(h_out.HPtr(), output.Ptr(), output.SizeInBytes(), + cudaMemcpyDeviceToHost); + uint4 *p = h_out.HPtr(); + for (size_t i = 0; i < output_ref.size(); ++i) { + EXPECT_TRUE(p[i * 2] == output_ref[i].first && + p[i * 2 + 1] == output_ref[i].second); + } +} + +TEST(SpacemeshOrg, CheckResult) { + // clang-format off + std::vector> output_ref{ + {{4048330317, 4093720124, 3952305695, 861738752}, {1207399841, 1252004780, 3769355734, 605176832}}, + {{2755521322, 1773634345, 3570559863, 3528040384}, {1426306995, 1828125550, 2362351051, 592285497}}, + {{794272509, 1869538505, 1508520130, 3189115413}, {3371395123, 664595614, 942134631, 2595808448}}, + {{2966305566, 139203859, 3236026452, 971490366}, {406197471, 2107380474, 1647245437, 1398227647}}, + {{4248325324, 2135553756, 3121138058, 2606063366}, {3712984664, 405355870, 1786205915, 2338023431}}, + {{2223254193, 320690588, 1750003793, 1980189572}, {2430931520, 2358257771, 857129483, 4174911228}}, + {{3824389675, 3900375118, 885509409, 2541713504}, {3965109822, 4266067977, 233389698, 429691579}}, + {{617654257, 287857505, 44437086, 3091373715}, {1579913930, 42500443, 741315169, 2504110009}}, + {{4185094906, 11838281, 788328873, 2513580847}, {3475313515, 1738120748, 1410721087, 3731153976}}, + {{3786242664, 575105985, 1815246426, 2602042915}, {1355784688, 2576224732, 3988570599, 858796312}}, + {{2700358536, 2813184983, 1596688001, 1714337525}, {1078584113, 3032059087, 2777935765, 1318855605}}, + {{1541547040, 1772543483, 3001615835, 2646093592}, {764031266, 303589574, 363686698, 2135502111}}, + {{1152844501, 106430079, 2715838609, 2711284980}, {3437796150, 1500472695, 2708109779, 889703778}}, + {{1770568223, 4026398494, 3075729081, 1584205826}, {2457749057, 155442015, 202804510, 2647478225}}, + {{3703726525, 394584141, 117082576, 699663196}, {1831918362, 2038557743, 2876631409, 3154481933}}, + {{2148001307, 979677799, 520145623, 64552000}, {4006002196, 3736832885, 543708505, 2041849618}}, + }; + // clang-format on + + const uint32_t N = 8192; + uint32_t num_tasks = 8704 * 2; + uint64_t starting_index = 0UL; + const uint32_t loopup_gap = 2; + + uint4 h_input[2]{2839345266U, 42009750U, 875455879U, 2217211394U, + 3438177526U, 2734532412U, 2819254414U, 1408356118U}; + CudaDeviceMem output(num_tasks * 2); + CudaHostMem h_out(num_tasks * 2); + + uint32_t BLOCK_DIM = 32; + uint32_t GRID_DIM = (num_tasks - BLOCK_DIM - 1) / BLOCK_DIM / 2; + const size_t global_size = GRID_DIM * BLOCK_DIM; + CudaDeviceMem loopup(N / loopup_gap * 8 * global_size); + + scrypt_org<<>>(starting_index, num_tasks, + h_input[0], h_input[1], + loopup.Ptr(), output.Ptr()); + + cudaMemcpy(h_out.HPtr(), output.Ptr(), output.SizeInBytes(), + cudaMemcpyDeviceToHost); + uint4 *p = h_out.HPtr(); + for (size_t i = 0; i < output_ref.size(); ++i) { + EXPECT_TRUE(p[i * 2] == output_ref[i].first && + p[i * 2 + 1] == output_ref[i].second); + } +} + +TEST(SpacemeshAPI, CheckResult) { + // clang-format off + std::vector> output_ref{ + {{4048330317, 4093720124, 3952305695, 861738752}, {1207399841, 1252004780, 3769355734, 605176832}}, + {{2755521322, 1773634345, 3570559863, 3528040384}, {1426306995, 1828125550, 2362351051, 592285497}}, + {{794272509, 1869538505, 1508520130, 3189115413}, {3371395123, 664595614, 942134631, 2595808448}}, + {{2966305566, 139203859, 3236026452, 971490366}, {406197471, 2107380474, 1647245437, 1398227647}}, + {{4248325324, 2135553756, 3121138058, 2606063366}, {3712984664, 405355870, 1786205915, 2338023431}}, + {{2223254193, 320690588, 1750003793, 1980189572}, {2430931520, 2358257771, 857129483, 4174911228}}, + {{3824389675, 3900375118, 885509409, 2541713504}, {3965109822, 4266067977, 233389698, 429691579}}, + {{617654257, 287857505, 44437086, 3091373715}, {1579913930, 42500443, 741315169, 2504110009}}, + {{4185094906, 11838281, 788328873, 2513580847}, {3475313515, 1738120748, 1410721087, 3731153976}}, + {{3786242664, 575105985, 1815246426, 2602042915}, {1355784688, 2576224732, 3988570599, 858796312}}, + {{2700358536, 2813184983, 1596688001, 1714337525}, {1078584113, 3032059087, 2777935765, 1318855605}}, + {{1541547040, 1772543483, 3001615835, 2646093592}, {764031266, 303589574, 363686698, 2135502111}}, + {{1152844501, 106430079, 2715838609, 2711284980}, {3437796150, 1500472695, 2708109779, 889703778}}, + {{1770568223, 4026398494, 3075729081, 1584205826}, {2457749057, 155442015, 202804510, 2647478225}}, + {{3703726525, 394584141, 117082576, 699663196}, {1831918362, 2038557743, 2876631409, 3154481933}}, + {{2148001307, 979677799, 520145623, 64552000}, {4006002196, 3736832885, 543708505, 2041849618}}, + }; + // clang-format on + + uint32_t num_tasks = 8704 * 2; + uint64_t starting_index = 0UL; + + std::vector h_input{2839345266U, 42009750U, 875455879U, + 2217211394U, 3438177526U, 2734532412U, + 2819254414U, 1408356118U}; + std::vector h_out(num_tasks * 8, 0); + + spacemesh_scrypt(0, starting_index, h_input.data(), num_tasks, h_out.data()); + uint4 *p = reinterpret_cast(h_out.data()); + + for (size_t i = 0; i < output_ref.size(); ++i) { + EXPECT_TRUE(p[i * 2] == output_ref[i].first && + p[i * 2 + 1] == output_ref[i].second); + } +} +TEST(SpacemeshCoalesceV1, CheckResult) { + auto device_prop = GetDeviceProp(0); + uint32_t block_num = device_prop.multiProcessorCount; + uint32_t smsp_num = GetSMSPNum(device_prop.major, device_prop.minor); + uint32_t block_dim = device_prop.warpSize * smsp_num; + + const uint64_t starting_index = 0; + uint4 in0{2839345266U, 42009750U, 875455879U, 2217211394U}; + uint4 in1{3438177526U, 2734532412U, 2819254414U, 1408356118U}; + + const uint32_t LOOKUP_GAP = 2; + const uint32_t thread_num = block_num * block_dim * LOOKUP_GAP; + block_dim = 32; + block_num = thread_num / block_dim; + + constexpr uint32_t TASK_PER_THREAD = 1; + const uint32_t task_num = thread_num * TASK_PER_THREAD; + + CudaDeviceMem d0_out(task_num * 2); + CudaDeviceMem d1_out(task_num * 2); + + const uint32_t N = 8 * 1024; + CudaDeviceMem d_lookup(N / LOOKUP_GAP * 32 * thread_num); + + std::unique_ptr h0_out(new uint4[task_num * 2]); + std::unique_ptr h1_out(new uint4[task_num * 2]); + + scrypt_org<<>>( + starting_index, task_num, in0, in1, + reinterpret_cast(d_lookup.Ptr()), d0_out.Ptr()); + + scrypt_coalesce_access_v1 + <<>>( + starting_index, task_num, in0, in1, + reinterpret_cast(d_lookup.Ptr()), d1_out.Ptr()); + + CHECK(cudaMemcpy(h0_out.get(), d0_out.Ptr(), d0_out.SizeInBytes(), + cudaMemcpyDeviceToHost)); + CHECK(cudaMemcpy(h1_out.get(), d1_out.Ptr(), d1_out.SizeInBytes(), + cudaMemcpyDeviceToHost)); + + for (size_t i = 0; i < task_num * 2; ++i) { + EXPECT_EQ(h0_out[i], h1_out[i]); + } +} + +TEST(SpacemeshCoalesceV2, CheckResult) { + auto device_prop = GetDeviceProp(0); + uint32_t block_num = device_prop.multiProcessorCount; + uint32_t smsp_num = GetSMSPNum(device_prop.major, device_prop.minor); + uint32_t block_dim = device_prop.warpSize * smsp_num; + + const uint64_t starting_index = 0; + uint4 in0{2839345266U, 42009750U, 875455879U, 2217211394U}; + uint4 in1{3438177526U, 2734532412U, 2819254414U, 1408356118U}; + + const uint32_t LOOKUP_GAP = 2; + const uint32_t thread_num = block_num * block_dim * LOOKUP_GAP; + block_dim = 32; + block_num = thread_num / block_dim; + + constexpr uint32_t TASK_PER_THREAD = 1; + const uint32_t task_num = thread_num * TASK_PER_THREAD; + + CudaDeviceMem d0_out(task_num * 2); + CudaDeviceMem d1_out(task_num * 2); + + const uint32_t N = 8 * 1024; + CudaDeviceMem d_lookup(N / LOOKUP_GAP * 32 * thread_num); + + std::unique_ptr h0_out(new uint4[task_num * 2]); + std::unique_ptr h1_out(new uint4[task_num * 2]); + + scrypt_org<<>>( + starting_index, task_num, in0, in1, + reinterpret_cast(d_lookup.Ptr()), d0_out.Ptr()); + + scrypt_coalesce_access_v2 + <<>>( + starting_index, task_num, in0, in1, + reinterpret_cast(d_lookup.Ptr()), d1_out.Ptr()); + + CHECK(cudaMemcpy(h0_out.get(), d0_out.Ptr(), d0_out.SizeInBytes(), + cudaMemcpyDeviceToHost)); + CHECK(cudaMemcpy(h1_out.get(), d1_out.Ptr(), d1_out.SizeInBytes(), + cudaMemcpyDeviceToHost)); + + for (size_t i = 0; i < task_num * 2; ++i) { + EXPECT_EQ(h0_out[i], h1_out[i]); + } +} + +TEST(SpacemeshCoalesceV3, CheckResult) { + auto device_prop = GetDeviceProp(0); + uint32_t block_num = device_prop.multiProcessorCount; + uint32_t smsp_num = GetSMSPNum(device_prop.major, device_prop.minor); + uint32_t block_dim = device_prop.warpSize * smsp_num; + + const uint64_t starting_index = 0; + uint4 in0{2839345266U, 42009750U, 875455879U, 2217211394U}; + uint4 in1{3438177526U, 2734532412U, 2819254414U, 1408356118U}; + + const uint32_t LOOKUP_GAP = 2; + const uint32_t thread_num = block_num * block_dim * LOOKUP_GAP; + block_dim = 32; + block_num = thread_num / block_dim; + + constexpr uint32_t TASK_PER_THREAD = 1; + const uint32_t task_num = thread_num * TASK_PER_THREAD; + + CudaDeviceMem d0_out(task_num * 2); + CudaDeviceMem d1_out(task_num * 2); + + const uint32_t N = 8 * 1024; + CudaDeviceMem d_lookup(N / LOOKUP_GAP * 32 * thread_num); + + std::unique_ptr h0_out(new uint4[task_num * 2]); + std::unique_ptr h1_out(new uint4[task_num * 2]); + + scrypt_org<<>>( + starting_index, task_num, in0, in1, + reinterpret_cast(d_lookup.Ptr()), d0_out.Ptr()); + + scrypt_coalesce_access_v3 + <<>>( + starting_index, task_num, in0, in1, + reinterpret_cast(d_lookup.Ptr()), d1_out.Ptr()); + + CHECK(cudaMemcpy(h0_out.get(), d0_out.Ptr(), d0_out.SizeInBytes(), + cudaMemcpyDeviceToHost)); + CHECK(cudaMemcpy(h1_out.get(), d1_out.Ptr(), d1_out.SizeInBytes(), + cudaMemcpyDeviceToHost)); + + for (size_t i = 0; i < task_num * 2; ++i) { + EXPECT_EQ(h0_out[i], h1_out[i]); + } +}