From c86afcc2d54de00375d38abf23acd05f3595fb8a Mon Sep 17 00:00:00 2001
From: jack198704 <qq1993414600@gmail.com>
Date: Wed, 5 Jun 2024 14:33:14 +0800
Subject: [PATCH] project: init

---
 .clang-format                      | 168 ++++++++
 .gitignore                         |   3 +
 CMakeLists.txt                     |  73 ++++
 LICENSE                            |  21 +
 README.md                          |  54 +++
 build.sh                           |   6 +
 include/spacemesh_cuda/spacemesh.h |  35 ++
 rust/Cargo.lock                    |  34 ++
 rust/Cargo.toml                    |  12 +
 rust/LICENSE.md                    |   0
 rust/build.rs                      |  25 ++
 rust/src/lib.rs                    |  37 ++
 src/device/common.cuh              |  70 +++
 src/device/kernel.cuh              | 146 +++++++
 src/device/pbkdf2.cuh              | 668 +++++++++++++++++++++++++++++
 src/device/romix.cuh               | 505 ++++++++++++++++++++++
 src/spacemesh.cu                   | 135 ++++++
 src/utils.hpp                      | 545 +++++++++++++++++++++++
 test/perf.cu                       | 182 ++++++++
 test/test_kernel.cu                | 463 ++++++++++++++++++++
 20 files changed, 3182 insertions(+)
 create mode 100644 .clang-format
 create mode 100644 .gitignore
 create mode 100644 CMakeLists.txt
 create mode 100644 LICENSE
 create mode 100644 README.md
 create mode 100755 build.sh
 create mode 100644 include/spacemesh_cuda/spacemesh.h
 create mode 100644 rust/Cargo.lock
 create mode 100644 rust/Cargo.toml
 create mode 100644 rust/LICENSE.md
 create mode 100644 rust/build.rs
 create mode 100644 rust/src/lib.rs
 create mode 100644 src/device/common.cuh
 create mode 100644 src/device/kernel.cuh
 create mode 100644 src/device/pbkdf2.cuh
 create mode 100644 src/device/romix.cuh
 create mode 100644 src/spacemesh.cu
 create mode 100644 src/utils.hpp
 create mode 100644 test/perf.cu
 create mode 100644 test/test_kernel.cu

diff --git a/.clang-format b/.clang-format
new file mode 100644
index 0000000..f98a359
--- /dev/null
+++ b/.clang-format
@@ -0,0 +1,168 @@
+---
+Language:        Cpp
+# BasedOnStyle:  Google
+AccessModifierOffset: -2
+AlignAfterOpenBracket: Align
+AlignConsecutiveMacros: false
+AlignConsecutiveAssignments: false
+AlignConsecutiveDeclarations: false
+AlignEscapedNewlines: Left
+AlignOperands:   true
+AlignTrailingComments: true
+AllowAllArgumentsOnNextLine: true
+AllowAllConstructorInitializersOnNextLine: true
+AllowAllParametersOfDeclarationOnNextLine: true
+AllowShortBlocksOnASingleLine: Never
+AllowShortCaseLabelsOnASingleLine: false
+AllowShortFunctionsOnASingleLine: All
+AllowShortLambdasOnASingleLine: All
+AllowShortIfStatementsOnASingleLine: WithoutElse
+AllowShortLoopsOnASingleLine: true
+AlwaysBreakAfterDefinitionReturnType: None
+AlwaysBreakAfterReturnType: None
+AlwaysBreakBeforeMultilineStrings: true
+AlwaysBreakTemplateDeclarations: Yes
+BinPackArguments: true
+BinPackParameters: true
+BraceWrapping:
+  AfterCaseLabel:  false
+  AfterClass:      false
+  AfterControlStatement: false
+  AfterEnum:       false
+  AfterFunction:   false
+  AfterNamespace:  false
+  AfterObjCDeclaration: false
+  AfterStruct:     false
+  AfterUnion:      false
+  AfterExternBlock: false
+  BeforeCatch:     false
+  BeforeElse:      false
+  IndentBraces:    false
+  SplitEmptyFunction: true
+  SplitEmptyRecord: true
+  SplitEmptyNamespace: true
+BreakBeforeBinaryOperators: None
+BreakBeforeBraces: Attach
+BreakBeforeInheritanceComma: false
+BreakInheritanceList: BeforeColon
+BreakBeforeTernaryOperators: true
+BreakConstructorInitializersBeforeComma: false
+BreakConstructorInitializers: BeforeColon
+BreakAfterJavaFieldAnnotations: false
+BreakStringLiterals: true
+ColumnLimit:     80
+CommentPragmas:  '^ IWYU pragma:'
+CompactNamespaces: false
+ConstructorInitializerAllOnOneLineOrOnePerLine: true
+ConstructorInitializerIndentWidth: 4
+ContinuationIndentWidth: 4
+Cpp11BracedListStyle: true
+DeriveLineEnding: true
+DerivePointerAlignment: true
+DisableFormat:   false
+ExperimentalAutoDetectBinPacking: false
+FixNamespaceComments: true
+ForEachMacros:
+  - foreach
+  - Q_FOREACH
+  - BOOST_FOREACH
+IncludeBlocks:   Regroup
+IncludeCategories:
+  - Regex:           '^<ext/.*\.h>'
+    Priority:        2
+    SortPriority:    0
+  - Regex:           '^<.*\.h>'
+    Priority:        1
+    SortPriority:    0
+  - Regex:           '^<.*'
+    Priority:        2
+    SortPriority:    0
+  - Regex:           '.*'
+    Priority:        3
+    SortPriority:    0
+IncludeIsMainRegex: '([-_](test|unittest))?$'
+IncludeIsMainSourceRegex: ''
+IndentCaseLabels: true
+IndentGotoLabels: true
+IndentPPDirectives: None
+IndentWidth:     2
+IndentWrappedFunctionNames: false
+JavaScriptQuotes: Leave
+JavaScriptWrapImports: true
+KeepEmptyLinesAtTheStartOfBlocks: false
+MacroBlockBegin: ''
+MacroBlockEnd:   ''
+MaxEmptyLinesToKeep: 1
+NamespaceIndentation: None
+ObjCBinPackProtocolList: Never
+ObjCBlockIndentWidth: 2
+ObjCSpaceAfterProperty: false
+ObjCSpaceBeforeProtocolList: true
+PenaltyBreakAssignment: 2
+PenaltyBreakBeforeFirstCallParameter: 1
+PenaltyBreakComment: 300
+PenaltyBreakFirstLessLess: 120
+PenaltyBreakString: 1000
+PenaltyBreakTemplateDeclaration: 10
+PenaltyExcessCharacter: 1000000
+PenaltyReturnTypeOnItsOwnLine: 200
+PointerAlignment: Left
+RawStringFormats:
+  - Language:        Cpp
+    Delimiters:
+      - cc
+      - CC
+      - cpp
+      - Cpp
+      - CPP
+      - 'c++'
+      - 'C++'
+    CanonicalDelimiter: ''
+    BasedOnStyle:    google
+  - Language:        TextProto
+    Delimiters:
+      - pb
+      - PB
+      - proto
+      - PROTO
+    EnclosingFunctions:
+      - EqualsProto
+      - EquivToProto
+      - PARSE_PARTIAL_TEXT_PROTO
+      - PARSE_TEST_PROTO
+      - PARSE_TEXT_PROTO
+      - ParseTextOrDie
+      - ParseTextProtoOrDie
+    CanonicalDelimiter: ''
+    BasedOnStyle:    google
+ReflowComments:  true
+SortIncludes:    true
+SortUsingDeclarations: true
+SpaceAfterCStyleCast: false
+SpaceAfterLogicalNot: false
+SpaceAfterTemplateKeyword: true
+SpaceBeforeAssignmentOperators: true
+SpaceBeforeCpp11BracedList: false
+SpaceBeforeCtorInitializerColon: true
+SpaceBeforeInheritanceColon: true
+SpaceBeforeParens: ControlStatements
+SpaceBeforeRangeBasedForLoopColon: true
+SpaceInEmptyBlock: false
+SpaceInEmptyParentheses: false
+SpacesBeforeTrailingComments: 2
+SpacesInAngles:  false
+SpacesInConditionalStatement: false
+SpacesInContainerLiterals: true
+SpacesInCStyleCastParentheses: false
+SpacesInParentheses: false
+SpacesInSquareBrackets: false
+SpaceBeforeSquareBrackets: false
+Standard:        Auto
+StatementMacros:
+  - Q_UNUSED
+  - QT_REQUIRE_VERSION
+TabWidth:        8
+UseCRLF:         false
+UseTab:          Never
+...
+
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..8bc8609
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+build
+.vscode
+rust/target
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..4e0f7bf
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,73 @@
+cmake_minimum_required(VERSION 3.24 FATAL_ERROR)
+
+if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
+    set(CMAKE_CUDA_ARCHITECTURES "all")
+endif()
+
+if (DEFINED CMAKE_BUILD_TYPE AND NOT CMAKE_BUILD_TYPE STREQUAL "")
+    string(TOUPPER ${CMAKE_BUILD_TYPE} CMAKE_BUILD_TYPE)
+else()
+    set(CMAKE_BUILD_TYPE RELEASE)
+endif()
+
+project(spacemesh-cuda LANGUAGES CXX CUDA)
+add_compile_options(-fPIC)
+
+include(FetchContent)
+
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+file(GLOB_RECURSE src_files LIST_DIRECTORIES false src/*.cu src/*.cpp src/*.c)
+
+include_directories(include)
+include_directories(src)
+
+add_library(${PROJECT_NAME} ${src_files})
+
+install(TARGETS ${PROJECT_NAME})
+
+option(WITH_TEST "with test" OFF)
+
+if(${WITH_TEST})
+    enable_testing()
+    set(TEST_EXE ${PROJECT_NAME}-test)
+
+    if (DEFINED CMAKE_BUILD_TYPE AND NOT CMAKE_BUILD_TYPE STREQUAL "")
+        string(TOUPPER ${CMAKE_BUILD_TYPE} CMAKE_BUILD_TYPE)
+    else()
+        set(CMAKE_BUILD_TYPE RELEASE)
+    endif()
+
+    if(${CMAKE_BUILD_TYPE} STREQUAL "RELEASE")
+        set(NVCC_FLAGS -O3 -Wno-deprecated-gpu-targets --Werror all-warnings)
+        message(STATUS "${TEST_EXE} compile cuda code in release mode")
+    else()
+        set(NVCC_FLAGS -G -Wno-deprecated-gpu-targets --Werror all-warnings)
+        message(STATUS "${TEST_EXE} compile cuda code in debug mode")
+    endif()
+
+    #Google Unit Test
+    option(BUILD_GMOCK "" OFF)
+    option(INSTALL_GTEST "" OFF)
+    FetchContent_Declare(
+            googletest
+            GIT_REPOSITORY https://github.com/google/googletest.git
+            GIT_TAG release-1.12.1
+    )
+    FetchContent_MakeAvailable(googletest)
+
+    file(GLOB_RECURSE test_files LIST_DIRECTORIES false test/*.cpp test/*.c test/*.cu)
+    add_executable(${TEST_EXE} ${test_files})
+    set_target_properties(${TEST_EXE} PROPERTIES CUDA_ARCHITECTURES "native")
+    target_link_libraries(${TEST_EXE}
+            PRIVATE ${PROJECT_NAME}
+            PRIVATE gtest
+            PRIVATE gtest_main)
+    include(GoogleTest)
+    set(NVCC_FLAGS ${NVCC_FLAGS})# "SHELL:-t 0" "SHELL:-split-compile 0")
+    target_compile_options(${TEST_EXE} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:--forward-unknown-opts -Wall ${NVCC_FLAGS}>
+            PRIVATE $<$<COMPILE_LANGUAGE:CXX>:-Wall>)
+    add_test(NAME test_corr COMMAND ${TEST_EXE} --gtest_filter=*CheckResult*)
+    add_test(NAME test_perf COMMAND ${TEST_EXE} --gtest_filter=*Perf*)
+endif()
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..c970acf
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2024  ZakuroLab
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..bc79088
--- /dev/null
+++ b/README.md
@@ -0,0 +1,54 @@
+<h1 align="center">Spacemesh-cuda is a library for plot acceleration using CUDA-enabled GPUs.</h1>
+
+[![license](https://img.shields.io/packagist/l/doctrine/orm.svg)](https://github.com/ZakuroLab/spacemesh-cuda/blob/master/LICENSE)
+[![release](https://img.shields.io/github/v/release/ZakuroLab/spacemesh-cuda?include_prereleases)](https://github.com/ZakuroLab/spacemesh-cuda/releases)
+![platform](https://img.shields.io/badge/platform-%20linux--64-lightgrey.svg)
+[![open help wanted issues](https://img.shields.io/github/issues-raw/ZakuroLab/spacemesh-cuda/help%20wanted?logo=github)](https://github.com/ZakuroLab/spacemesh-cuda/issues?q=is%3Aissue+is%3Aopen+label%3A%22help+wanted%22)
+[![made by](https://img.shields.io/badge/madeby-ZakuroLab-blue.svg)](https://github.com/ZakuroLab)
+
+## <a name='TableofContents'></a>Table of Contents
+
+* [1. Overview](#1-overview)
+* [2. Performance](#2-performance)
+* [3. Build & Integration Guide](#3-build-&-integration-guide)
+  * [3.1 From source](#31-From-source)
+  * [3.2 From binary](#32-From-binary)
+* [4. License](#4-license)
+
+## 1. Overview
+
+__spacemesh-cuda__ is a cuda library for plot acceleration for [spacemesh](https://github.com/spacemeshos/go-spacemesh). 
+This library optimizes memory access, calculation parallelism, etc. Compared with the official program, the library improved by **86.6%**.
+
+## 2. Performance
+| GPU\Library | Official | spacemesh-cuda |
+| ---- |  --- | ---- |
+| RTX3080 | 3.2MB/s | 5.97MB/s |
+
+## 3. Build & Integration Guide
+
+### 3.1 From source
+```shell
+# build libpost.so
+git clone https://github.com/ZakuroLab/post-rs.git && cd post-rs/ffi
+cargo build --release
+cd ../../
+
+# get postcli
+wget https://github.com/spacemeshos/post/releases/download/v0.12.5/postcli-Linux.zip
+unzip -d postcli ./postcli-Linux.zip
+cd postcli && mv ../post-rs/target/release/libpost.so ./
+```
+
+### 3.2 From binary
+```shell
+mkdir postcli && cd postcli
+wget https://github.com/ZakuroLab/spacemesh-cuda/releases/download/v0.0.1/libpost.so
+wget https://github.com/ZakuroLab/spacemesh-cuda/releases/download/v0.0.1/postcli
+```
+
+## 4. License
+
+We welcome all contributions to `spacemesh-cuda`. Please refer to the [license](#4-license) for the terms of contributions.
+
+[![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](./LICENSE.md)
diff --git a/build.sh b/build.sh
new file mode 100755
index 0000000..1c0a9a5
--- /dev/null
+++ b/build.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+HOME_DIR=`pwd`
+rm -rf ${HOME_DIR}/build
+mkdir ${HOME_DIR}/build
+cmake -DCMAKE_BUILD_TYPE=Release -S ${HOME_DIR} -B ${HOME_DIR}/build -DWITH_TEST=ON
+cmake --build ${HOME_DIR}/build -j
diff --git a/include/spacemesh_cuda/spacemesh.h b/include/spacemesh_cuda/spacemesh.h
new file mode 100644
index 0000000..0861051
--- /dev/null
+++ b/include/spacemesh_cuda/spacemesh.h
@@ -0,0 +1,35 @@
+#ifndef SPACEMESH_CUDA_SPACEMESH_H
+#define SPACEMESH_CUDA_SPACEMESH_H
+
+#include <stdint.h>
+
+extern "C" {
+/**
+ * Obtain the number of devices (CUDA GPU)
+ *
+ * @returns A uint32
+ */
+uint32_t spacemesh_get_device_num();
+
+/**
+ * Obtain the maximum number of tasks of device @p device_id
+ *
+ * @returns A uint32
+ */
+uint32_t spacemesh_get_max_task_num(uint32_t device_id);
+
+/**
+ * Execution
+ *
+ * @param device_id The index of device
+ * @param starting_index
+ * @param input Input CPU memory
+ * @param task_num The number of tasks
+ * @param output Output CPU memory
+ */
+void spacemesh_scrypt(uint32_t device_id, const uint64_t starting_index,
+                      const uint32_t* input, const uint32_t task_num,
+                      uint32_t* output);
+}
+
+#endif  // SPACEMESH_CUDA_SPACEMESH_H
diff --git a/rust/Cargo.lock b/rust/Cargo.lock
new file mode 100644
index 0000000..4fee3fb
--- /dev/null
+++ b/rust/Cargo.lock
@@ -0,0 +1,34 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 3
+
+[[package]]
+name = "cc"
+version = "1.0.83"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f1174fb0b6ec23863f8b971027804a42614e347eafb0a95bf0b12cdae21fc4d0"
+dependencies = [
+ "libc",
+]
+
+[[package]]
+name = "cmake"
+version = "0.1.50"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a31c789563b815f77f4250caee12365734369f942439b7defd71e18a48197130"
+dependencies = [
+ "cc",
+]
+
+[[package]]
+name = "libc"
+version = "0.2.150"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "89d92a4743f9a61002fae18374ed11e7973f530cb3a3255fb354818118b2203c"
+
+[[package]]
+name = "spacemesh-cuda"
+version = "0.0.1"
+dependencies = [
+ "cmake",
+]
diff --git a/rust/Cargo.toml b/rust/Cargo.toml
new file mode 100644
index 0000000..23ed63d
--- /dev/null
+++ b/rust/Cargo.toml
@@ -0,0 +1,12 @@
+[package]
+authors = ["The 6block Team"]
+edition = "2021"
+include = ["Cargo.toml", "src", "LICENSE.md"]
+license = "GPL-3.0"
+name = "spacemesh-cuda"
+version = "0.0.1"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[build-dependencies]
+cmake = {version = "0.1.0"}
diff --git a/rust/LICENSE.md b/rust/LICENSE.md
new file mode 100644
index 0000000..e69de29
diff --git a/rust/build.rs b/rust/build.rs
new file mode 100644
index 0000000..11a0b60
--- /dev/null
+++ b/rust/build.rs
@@ -0,0 +1,25 @@
+use cmake::Config;
+
+fn main() {
+    let profile = std::env::var("PROFILE").unwrap();
+    let profile = match profile.as_str() {
+        "debug" => "Debug",
+        "release" => "Release",
+        _ => "Release",
+    };
+    let mut dst = Config::new("..")
+        .define("CMAKE_BUILD_TYPE", profile)
+        .define("WITH_TEST", "OFF")
+        .cxxflag("-O3")
+        .build();
+    dst.push("lib");
+    println!("cargo:rustc-link-search=native={}", dst.display());
+
+    let default_cuda_lib_path = "/usr/local/cuda/targets/x86_64-linux/lib/";
+    let default_boost_path = "/usr/local/lib/";
+    println!("cargo:rustc-link-search=native={}", default_cuda_lib_path);
+    println!("cargo:rustc-link-search=native={}", default_boost_path);
+    println!("cargo:rustc-link-lib=static=cudart_static");
+    println!("cargo:rustc-link-lib=stdc++");
+    println!("cargo:rustc-link-lib=static=spacemesh-cuda");
+}
diff --git a/rust/src/lib.rs b/rust/src/lib.rs
new file mode 100644
index 0000000..c48a25f
--- /dev/null
+++ b/rust/src/lib.rs
@@ -0,0 +1,37 @@
+extern "C" {
+    fn spacemesh_get_device_num() -> u32;
+    fn spacemesh_get_max_task_num(device_idx: u32) -> u32;
+    fn spacemesh_scrypt(
+        device_id: u32,
+        starting_index: u64,
+        input: *const u32,
+        task_num: u32,
+        output: *mut u32,
+    );
+}
+
+pub fn get_device_num() -> u32 {
+    unsafe { spacemesh_get_device_num() }
+}
+
+pub fn get_max_task_num(device_id: u32) -> u32 {
+    unsafe { spacemesh_get_max_task_num(device_id) }
+}
+
+pub fn scrypt(
+    device_id: u32,
+    starting_index: u64,
+    input: &Vec<u32>,
+    task_num: u32,
+    output: &mut Vec<u8>,
+) {
+    unsafe {
+        spacemesh_scrypt(
+            device_id,
+            starting_index,
+            input.as_ptr(),
+            task_num,
+            output.as_mut_ptr() as *mut u32,
+        );
+    }
+}
diff --git a/src/device/common.cuh b/src/device/common.cuh
new file mode 100644
index 0000000..265176f
--- /dev/null
+++ b/src/device/common.cuh
@@ -0,0 +1,70 @@
+#ifndef DEVICE_COMMON_CUH
+#define DEVICE_COMMON_CUH
+
+#include <cuda_runtime.h>
+
+#include <cstdint>
+
+#define UINT32_NUM_BITS sizeof(uint32_t) * 8
+#define UINT64_NUM_BITS sizeof(uint64_t) * 8
+
+#define UNUSED(X) (void)(X)
+
+template <uint32_t d>
+__device__ uint32_t rotl32(uint32_t x) {
+  return (x << d) | (x >> (UINT32_NUM_BITS - d));
+}
+
+template <uint32_t d>
+__device__ uint2 rotl64(uint32_t x, uint32_t y) {
+  uint64_t c = uint64_t(x) | (uint64_t(y) << UINT32_NUM_BITS);
+  c = (c << d) | (c >> (UINT64_NUM_BITS - d));
+  return make_uint2(uint32_t(c), c >> UINT32_NUM_BITS);
+}
+template <uint32_t d>
+__device__ uint2 rotl64(uint2 v) {
+  return rotl64<d>(v.x, v.y);
+}
+
+inline __device__ uint2 operator^(const uint2 &t0, const uint2 &t1) {
+  return {t0.x ^ t1.x, t0.y ^ t1.y};
+}
+inline __device__ uint2 &operator^=(uint2 &t0, const uint2 &t1) {
+  t0.x ^= t1.x;
+  t0.y ^= t1.y;
+  return t0;
+}
+
+inline __device__ uint4 operator^(const uint4 &t0, const uint4 &t1) {
+  return {t0.x ^ t1.x, t0.y ^ t1.y, t0.z ^ t1.z, t0.w ^ t1.w};
+}
+
+inline __device__ uint4 &operator^=(uint4 &t0, const uint4 &t1) {
+  t0.x ^= t1.x;
+  t0.y ^= t1.y;
+  t0.z ^= t1.z;
+  t0.w ^= t1.w;
+  return t0;
+}
+
+inline __device__ uint4 &operator+=(uint4 &t0, const uint4 &t1) {
+  t0.x += t1.x;
+  t0.y += t1.y;
+  t0.z += t1.z;
+  t0.w += t1.w;
+  return t0;
+}
+
+template <typename T>
+inline __device__ T make_zero();
+
+template <>
+inline __device__ uint4 make_zero<uint4>() {
+  return {0, 0, 0, 0};
+}
+template <>
+inline __device__ uint2 make_zero<uint2>() {
+  return {0, 0};
+}
+
+#endif
diff --git a/src/device/kernel.cuh b/src/device/kernel.cuh
new file mode 100644
index 0000000..1b5df13
--- /dev/null
+++ b/src/device/kernel.cuh
@@ -0,0 +1,146 @@
+#ifndef DEVICE_KERNEL_CUH
+#define DEVICE_KERNEL_CUH
+
+#include "device/common.cuh"
+#include "device/pbkdf2.cuh"
+#include "device/romix.cuh"
+
+/**
+ * Mainly modified thread mapping relationships, LOOKUP_GAP, Block, etc.,
+ * Throughtput: 4.16MB/s
+ */
+template <uint32_t LOOKUP_GAP = 1>
+__global__ void scrypt_org(const uint64_t starting_index,
+                           const uint32_t num_tasks, const uint4 input_1,
+                           const uint4 input_2,
+                           uint4 *const __restrict__ padcache,
+                           uint4 *const __restrict__ output) {
+  uint32_t tid = blockDim.x * blockIdx.x + threadIdx.x;
+  uint32_t tnum = gridDim.x * blockDim.x;
+
+  uint4 password[5];
+  uint4 X[8];
+  for (uint32_t t = tid; t < num_tasks; t += tnum) {
+    const uint64_t index = starting_index + t;
+
+    password[0] = input_1;
+    password[1] = input_2;
+    password[2].x = uint32_t(index & 0xFFFFFFFF);
+    password[2].y = uint32_t((index >> 32) & 0xFFFFFFFF);
+    password[2].z = 0;
+    password[2].w = 0;
+    password[3] = make_zero<uint4>();
+    password[4] = make_zero<uint4>();
+
+    scrypt_pbkdf2_128B(password, X);
+    scrypt_ROMix_org<LOOKUP_GAP>(X, padcache, tnum, tid);
+    scrypt_pbkdf2_32B(password, X, &output[t * 2]);
+  }
+}
+
+/**
+ * Optimize memory access and collaborate with Warp to complete data copying
+ * Throughtput: 5.34MB/s
+ */
+template <uint32_t LOOKUP_GAP = 1>
+__global__ void scrypt_coalesce_access_v1(const uint64_t starting_index,
+                                          const uint32_t num_tasks,
+                                          const uint4 input_1,
+                                          const uint4 input_2,
+                                          uint32_t *const __restrict__ padcache,
+                                          uint4 *const __restrict__ output) {
+  uint32_t tid = blockDim.x * blockIdx.x + threadIdx.x;
+  uint32_t tnum = gridDim.x * blockDim.x;
+
+  uint4 password[5];
+  uint4 X[8];
+  for (uint32_t t = tid; t < num_tasks; t += tnum) {
+    const uint64_t index = starting_index + t;
+
+    password[0] = input_1;
+    password[1] = input_2;
+    password[2].x = uint32_t(index & 0xFFFFFFFF);
+    password[2].y = uint32_t((index >> 32) & 0xFFFFFFFF);
+    password[2].z = 0;
+    password[2].w = 0;
+    password[3] = make_zero<uint4>();
+    password[4] = make_zero<uint4>();
+
+    scrypt_pbkdf2_128B(password, X);
+    scrypt_ROMix_coalesce_access_v1<LOOKUP_GAP>(X, padcache, tnum, tid);
+    scrypt_pbkdf2_32B(password, X, &output[t * 2]);
+  }
+}
+
+/**
+ * Optimize memory access, with 16 threads forming a sub warp, each thread
+ * performing read and write operations in units of 8 bytes (uint64_t)
+ * Throughtput: 5.94MB/s
+ */
+template <uint32_t LOOKUP_GAP = 1>
+__global__ void scrypt_coalesce_access_v2(const uint64_t starting_index,
+                                          const uint32_t num_tasks,
+                                          const uint4 input_1,
+                                          const uint4 input_2,
+                                          uint64_t *const __restrict__ padcache,
+                                          uint4 *const __restrict__ output) {
+  uint32_t tid = blockDim.x * blockIdx.x + threadIdx.x;
+  uint32_t tnum = gridDim.x * blockDim.x;
+
+  uint4 password[5];
+  uint4 X[8];
+  for (uint32_t t = tid; t < num_tasks; t += tnum) {
+    const uint64_t index = starting_index + t;
+
+    password[0] = input_1;
+    password[1] = input_2;
+    password[2].x = uint32_t(index & 0xFFFFFFFF);
+    password[2].y = uint32_t((index >> 32) & 0xFFFFFFFF);
+    password[2].z = 0;
+    password[2].w = 0;
+    password[3] = make_zero<uint4>();
+    password[4] = make_zero<uint4>();
+
+    scrypt_pbkdf2_128B(password, X);
+    scrypt_ROMix_coalesce_access_v2<LOOKUP_GAP>(X, padcache, tnum, tid);
+    scrypt_pbkdf2_32B(password, X, &output[t * 2]);
+  }
+}
+
+/**
+ * Optimize memory access, with 8 threads forming a sub warp, each thread
+ * performing read and write operations in units of 8 bytes (uint4) + global
+ * memory -> shared memory asynchronous reads
+ * Throughtput: 6.03MB/s
+ */
+template <uint32_t LOOKUP_GAP = 1>
+__global__ void scrypt_coalesce_access_v3(const uint64_t starting_index,
+                                          const uint32_t num_tasks,
+                                          const uint4 input_1,
+                                          const uint4 input_2,
+                                          uint4 *const __restrict__ padcache,
+                                          uint4 *const __restrict__ output) {
+  uint32_t tid = blockDim.x * blockIdx.x + threadIdx.x;
+  uint32_t tnum = gridDim.x * blockDim.x;
+
+  uint4 password[5];
+  uint4 X[8];
+  for (uint32_t t = tid; t < num_tasks; t += tnum) {
+    const uint64_t index = starting_index + t;
+
+    password[0] = input_1;
+    password[1] = input_2;
+    password[2].x = uint32_t(index & 0xFFFFFFFF);
+    password[2].y = uint32_t((index >> 32) & 0xFFFFFFFF);
+    password[2].z = 0;
+    password[2].w = 0;
+    password[3] = make_zero<uint4>();
+    password[4] = make_zero<uint4>();
+
+    scrypt_pbkdf2_128B(password, X);
+    scrypt_ROMix_coalesce_access_v3<LOOKUP_GAP>(X, padcache, tnum, tid);
+    scrypt_pbkdf2_32B(password, X, &output[t * 2]);
+  }
+}
+
+#endif
diff --git a/src/device/pbkdf2.cuh b/src/device/pbkdf2.cuh
new file mode 100644
index 0000000..4f7f85e
--- /dev/null
+++ b/src/device/pbkdf2.cuh
@@ -0,0 +1,668 @@
+#ifndef DEVICE_PBKDF2_CUH
+#define DEVICE_PBKDF2_CUH
+#include <cuda_runtime.h>
+
+#include "device/common.cuh"
+
+#define SCRYPT_HASH_DIGEST_SIZE 64
+#define SCRYPT_KECCAK_F 1600
+#define SCRYPT_HASH_BLOCK_SIZE 72
+#define SCRYPT_BLOCK_BYTES 128
+
+typedef struct scrypt_hash_state_t {
+  uint4 state4[(SCRYPT_KECCAK_F + 127) / 128];        // 8 bytes of extra
+  uint4 buffer4[(SCRYPT_HASH_BLOCK_SIZE + 15) / 16];  // 8 bytes of extra
+                                                      // uint leftover;
+} scrypt_hash_state;
+
+typedef struct scrypt_hmac_state_t {
+  scrypt_hash_state inner;
+  scrypt_hash_state outer;
+} scrypt_hmac_state;
+
+__constant__ uint64_t keccak_round_constants[24]{
+  0x0000000000000001UL, 0x0000000000008082UL, 0x800000000000808aUL,
+  0x8000000080008000UL, 0x000000000000808bUL, 0x0000000080000001UL,
+  0x8000000080008081UL, 0x8000000000008009UL, 0x000000000000008aUL,
+  0x0000000000000088UL, 0x0000000080008009UL, 0x000000008000000aUL,
+  0x000000008000808bUL, 0x800000000000008bUL, 0x8000000000008089UL,
+  0x8000000000008003UL, 0x8000000000008002UL, 0x8000000000000080UL,
+  0x000000000000800aUL, 0x800000008000000aUL, 0x8000000080008081UL,
+  0x8000000000008080UL, 0x0000000080000001UL, 0x8000000080008008UL};
+
+inline __device__ void keccak_block_core(scrypt_hash_state &S) {
+  uint2 t[5];
+  uint2 u[5];
+  uint2 v;
+  uint2 w;
+  uint4 *s4 = S.state4;
+
+  for (uint i = 0; i < 24; i++) {
+/* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */
+#define CASE(D, S00, S01, S10, S11, S20, S21, S30, S31, S40, S41) \
+  {                                                               \
+    D.x = S00 ^ S10 ^ S20 ^ S30 ^ S40;                            \
+    D.y = S01 ^ S11 ^ S21 ^ S31 ^ S41;                            \
+  }
+    // t[0] = s4[0].xy ^ s4[2].zw ^ s4[5].xy ^ s4[7].zw ^ s4[10].xy;
+    CASE(t[0], s4[0].x, s4[0].y, s4[2].z, s4[2].w, s4[5].x, s4[5].y, s4[7].z,
+         s4[7].w, s4[10].x, s4[10].y);
+    // t[1] = s4[0].zw ^ s4[3].xy ^ s4[5].zw ^ s4[8].xy ^ s4[10].zw;
+    CASE(t[1], s4[0].z, s4[0].w, s4[3].x, s4[3].y, s4[5].z, s4[5].w, s4[8].x,
+         s4[8].y, s4[10].z, s4[10].w);
+    // t[2] = s4[1].xy ^ s4[3].zw ^ s4[6].xy ^ s4[8].zw ^ s4[11].xy;
+    CASE(t[2], s4[1].x, s4[1].y, s4[3].z, s4[3].w, s4[6].x, s4[6].y, s4[8].z,
+         s4[8].w, s4[11].x, s4[11].y);
+    // t[3] = s4[1].zw ^ s4[4].xy ^ s4[6].zw ^ s4[9].xy ^ s4[11].zw;
+    CASE(t[3], s4[1].z, s4[1].w, s4[4].x, s4[4].y, s4[6].z, s4[6].w, s4[9].x,
+         s4[9].y, s4[11].z, s4[11].w);
+    // t[4] = s4[2].xy ^ s4[4].zw ^ s4[7].xy ^ s4[9].zw ^ s4[12].xy;
+    CASE(t[4], s4[2].x, s4[2].y, s4[4].z, s4[4].w, s4[7].x, s4[7].y, s4[9].z,
+         s4[9].w, s4[12].x, s4[12].y);
+#undef CASE
+
+    /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */
+    u[0] = t[4] ^ rotl64<1>(t[1]);
+    u[1] = t[0] ^ rotl64<1>(t[2]);
+    u[2] = t[1] ^ rotl64<1>(t[3]);
+    u[3] = t[2] ^ rotl64<1>(t[4]);
+    u[4] = t[3] ^ rotl64<1>(t[0]);
+
+/* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */
+#define CASE(D0, D1, S) \
+  {                     \
+    D0 ^= S.x;          \
+    D1 ^= S.y;          \
+  }
+
+    // s4[0].xy ^= u[0];
+    CASE(s4[0].x, s4[0].y, u[0]);
+    // s4[2].zw ^= u[0];
+    CASE(s4[2].z, s4[2].w, u[0]);
+    // s4[5].xy ^= u[0];
+    CASE(s4[5].x, s4[5].y, u[0]);
+    // s4[7].zw ^= u[0];
+    CASE(s4[7].z, s4[7].w, u[0]);
+    // s4[10].xy ^= u[0];
+    CASE(s4[10].x, s4[10].y, u[0]);
+    // s4[0].zw ^= u[1];
+    CASE(s4[0].z, s4[0].w, u[1]);
+    // s4[3].xy ^= u[1];
+    CASE(s4[3].x, s4[3].y, u[1]);
+    // s4[5].zw ^= u[1];
+    CASE(s4[5].z, s4[5].w, u[1]);
+    // s4[8].xy ^= u[1];
+    CASE(s4[8].x, s4[8].y, u[1]);
+    // s4[10].zw ^= u[1];
+    CASE(s4[10].z, s4[10].w, u[1]);
+    // s4[1].xy ^= u[2];
+    CASE(s4[1].x, s4[1].y, u[2]);
+    // s4[3].zw ^= u[2];
+    CASE(s4[3].z, s4[3].w, u[2]);
+    // s4[6].xy ^= u[2];
+    CASE(s4[6].x, s4[6].y, u[2]);
+    // s4[8].zw ^= u[2];
+    CASE(s4[8].z, s4[8].w, u[2]);
+    // s4[11].xy ^= u[2];
+    CASE(s4[11].x, s4[11].y, u[2]);
+    // s4[1].zw ^= u[3];
+    CASE(s4[1].z, s4[1].w, u[3]);
+    // s4[4].xy ^= u[3];
+    CASE(s4[4].x, s4[4].y, u[3]);
+    // s4[6].zw ^= u[3];
+    CASE(s4[6].z, s4[6].w, u[3]);
+    // s4[9].xy ^= u[3];
+    CASE(s4[9].x, s4[9].y, u[3]);
+    // s4[11].zw ^= u[3];
+    CASE(s4[11].z, s4[11].w, u[3]);
+    // s4[2].xy ^= u[4];
+    CASE(s4[2].x, s4[2].y, u[4]);
+    // s4[4].zw ^= u[4];
+    CASE(s4[4].z, s4[4].w, u[4]);
+    // s4[7].xy ^= u[4];
+    CASE(s4[7].x, s4[7].y, u[4]);
+    // s4[9].zw ^= u[4];
+    CASE(s4[9].z, s4[9].w, u[4]);
+    // s4[12].xy ^= u[4];
+    CASE(s4[12].x, s4[12].y, u[4]);
+#undef CASE
+
+    /* rho pi: b[..] = rotl(a[..], ..) */
+    // v = s4[0].zw;
+    v = make_uint2(s4[0].z, s4[0].w);
+#define CASE(D0, D1, S0, S1, M)           \
+  {                                       \
+    const uint2 &tmp = rotl64<M>(S0, S1); \
+    D0 = tmp.x;                           \
+    D1 = tmp.y;                           \
+  }
+
+    // s4[0].zw = ROTL64(s4[3].xy, 44UL);
+    CASE(s4[0].z, s4[0].w, s4[3].x, s4[3].y, 44);
+    // s4[3].xy = ROTL64(s4[4].zw, 20UL);
+    CASE(s4[3].x, s4[3].y, s4[4].z, s4[4].w, 20);
+    // s4[4].zw = ROTL64(s4[11].xy, 61UL);
+    CASE(s4[4].z, s4[4].w, s4[11].x, s4[11].y, 61);
+    // s4[11].xy = ROTL64(s4[7].xy, 39UL);
+    CASE(s4[11].x, s4[11].y, s4[7].x, s4[7].y, 39);
+    // s4[7].xy = ROTL64(s4[10].xy, 18UL);
+    CASE(s4[7].x, s4[7].y, s4[10].x, s4[10].y, 18);
+    // s4[10].xy = ROTL64(s4[1].xy, 62UL);
+    CASE(s4[10].x, s4[10].y, s4[1].x, s4[1].y, 62);
+    // s4[1].xy = ROTL64(s4[6].xy, 43UL);
+    CASE(s4[1].x, s4[1].y, s4[6].x, s4[6].y, 43);
+    // s4[6].xy = ROTL64(s4[6].zw, 25UL);
+    CASE(s4[6].x, s4[6].y, s4[6].z, s4[6].w, 25);
+    // s4[6].zw = ROTL64(s4[9].zw, 8UL);
+    CASE(s4[6].z, s4[6].w, s4[9].z, s4[9].w, 8);
+    // s4[9].zw = ROTL64(s4[11].zw, 56UL);
+    CASE(s4[9].z, s4[9].w, s4[11].z, s4[11].w, 56);
+    // s4[11].zw = ROTL64(s4[7].zw, 41UL);
+    CASE(s4[11].z, s4[11].w, s4[7].z, s4[7].w, 41);
+    // s4[7].zw = ROTL64(s4[2].xy, 27UL);
+    CASE(s4[7].z, s4[7].w, s4[2].x, s4[2].y, 27);
+    // s4[2].xy = ROTL64(s4[12].xy, 14UL);
+    CASE(s4[2].x, s4[2].y, s4[12].x, s4[12].y, 14);
+    // s4[12].xy = ROTL64(s4[10].zw, 2UL);
+    CASE(s4[12].x, s4[12].y, s4[10].z, s4[10].w, 2);
+    // s4[10].zw = ROTL64(s4[4].xy, 55UL);
+    CASE(s4[10].z, s4[10].w, s4[4].x, s4[4].y, 55);
+    // s4[4].xy = ROTL64(s4[8].xy, 45UL);
+    CASE(s4[4].x, s4[4].y, s4[8].x, s4[8].y, 45);
+    // s4[8].xy = ROTL64(s4[2].zw, 36UL);
+    CASE(s4[8].x, s4[8].y, s4[2].z, s4[2].w, 36);
+    // s4[2].zw = ROTL64(s4[1].zw, 28UL);
+    CASE(s4[2].z, s4[2].w, s4[1].z, s4[1].w, 28);
+    // s4[1].zw = ROTL64(s4[9].xy, 21UL);
+    CASE(s4[1].z, s4[1].w, s4[9].x, s4[9].y, 21);
+    // s4[9].xy = ROTL64(s4[8].zw, 15UL);
+    CASE(s4[9].x, s4[9].y, s4[8].z, s4[8].w, 15);
+    // s4[8].zw = ROTL64(s4[5].zw, 10UL);
+    CASE(s4[8].z, s4[8].w, s4[5].z, s4[5].w, 10);
+    // s4[5].zw = ROTL64(s4[3].zw, 6UL);
+    CASE(s4[5].z, s4[5].w, s4[3].z, s4[3].w, 6);
+    // s4[3].zw = ROTL64(s4[5].xy, 3UL);
+    CASE(s4[3].z, s4[3].w, s4[5].x, s4[5].y, 3);
+    // s4[5].xy = ROTL64(v, 1UL);
+    CASE(s4[5].x, s4[5].y, v.x, v.y, 1);
+#undef CASE
+
+    /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */
+    // v = s4[0].xy;
+    v = make_uint2(s4[0].x, s4[0].y);
+    // w = s4[0].zw;
+    w = make_uint2(s4[0].z, s4[0].w);
+
+#define CASE(D0, D1, S00, S01, S10, S11) \
+  {                                      \
+    D0 ^= (~S00) & S10;                  \
+    D1 ^= (~S01) & S11;                  \
+  }
+
+    // s4[0].xy ^= (~w) & s4[1].xy;
+    CASE(s4[0].x, s4[0].y, w.x, w.y, s4[1].x, s4[1].y);
+    // s4[0].zw ^= (~s4[1].xy) & s4[1].zw;
+    CASE(s4[0].z, s4[0].w, s4[1].x, s4[1].y, s4[1].z, s4[1].w);
+    // s4[1].xy ^= (~s4[1].zw) & s4[2].xy;
+    CASE(s4[1].x, s4[1].y, s4[1].z, s4[1].w, s4[2].x, s4[2].y);
+    // s4[1].zw ^= (~s4[2].xy) & v;
+    CASE(s4[1].z, s4[1].w, s4[2].x, s4[2].y, v.x, v.y);
+    // s4[2].xy ^= (~v) & w;
+    CASE(s4[2].x, s4[2].y, v.x, v.y, w.x, w.y);
+    // v = s4[2].zw;
+    v = make_uint2(s4[2].z, s4[2].w);
+    // w = s4[3].xy;
+    w = make_uint2(s4[3].x, s4[3].y);
+    // s4[2].zw ^= (~w) & s4[3].zw;
+    CASE(s4[2].z, s4[2].w, w.x, w.y, s4[3].z, s4[3].w);
+    // s4[3].xy ^= (~s4[3].zw) & s4[4].xy;
+    CASE(s4[3].x, s4[3].y, s4[3].z, s4[3].w, s4[4].x, s4[4].y);
+    // s4[3].zw ^= (~s4[4].xy) & s4[4].zw;
+    CASE(s4[3].z, s4[3].w, s4[4].x, s4[4].y, s4[4].z, s4[4].w);
+    // s4[4].xy ^= (~s4[4].zw) & v;
+    CASE(s4[4].x, s4[4].y, s4[4].z, s4[4].w, v.x, v.y);
+    // s4[4].zw ^= (~v) & w;
+    CASE(s4[4].z, s4[4].w, v.x, v.y, w.x, w.y);
+    // v = s4[5].xy;
+    v = make_uint2(s4[5].x, s4[5].y);
+    // w = s4[5].zw;
+    w = make_uint2(s4[5].z, s4[5].w);
+    // s4[5].xy ^= (~w) & s4[6].xy;
+    CASE(s4[5].x, s4[5].y, w.x, w.y, s4[6].x, s4[6].y);
+    // s4[5].zw ^= (~s4[6].xy) & s4[6].zw;
+    CASE(s4[5].z, s4[5].w, s4[6].x, s4[6].y, s4[6].z, s4[6].w);
+    // s4[6].xy ^= (~s4[6].zw) & s4[7].xy;
+    CASE(s4[6].x, s4[6].y, s4[6].z, s4[6].w, s4[7].x, s4[7].y);
+    // s4[6].zw ^= (~s4[7].xy) & v;
+    CASE(s4[6].z, s4[6].w, s4[7].x, s4[7].y, v.x, v.y);
+    // s4[7].xy ^= (~v) & w;
+    CASE(s4[7].x, s4[7].y, v.x, v.y, w.x, w.y);
+    // v = s4[7].zw;
+    v = make_uint2(s4[7].z, s4[7].w);
+    // w = s4[8].xy;
+    w = make_uint2(s4[8].x, s4[8].y);
+    // s4[7].zw ^= (~w) & s4[8].zw;
+    CASE(s4[7].z, s4[7].w, w.x, w.y, s4[8].z, s4[8].w);
+    // s4[8].xy ^= (~s4[8].zw) & s4[9].xy;
+    CASE(s4[8].x, s4[8].y, s4[8].z, s4[8].w, s4[9].x, s4[9].y);
+    // s4[8].zw ^= (~s4[9].xy) & s4[9].zw;
+    CASE(s4[8].z, s4[8].w, s4[9].x, s4[9].y, s4[9].z, s4[9].w);
+    // s4[9].xy ^= (~s4[9].zw) & v;
+    CASE(s4[9].x, s4[9].y, s4[9].z, s4[9].w, v.x, v.y);
+    // s4[9].zw ^= (~v) & w;
+    CASE(s4[9].z, s4[9].w, v.x, v.y, w.x, w.y);
+    // v = s4[10].xy;
+    v = make_uint2(s4[10].x, s4[10].y);
+    // w = s4[10].zw;
+    w = make_uint2(s4[10].z, s4[10].w);
+    // s4[10].xy ^= (~w) & s4[11].xy;
+    CASE(s4[10].x, s4[10].y, w.x, w.y, s4[11].x, s4[11].y);
+    // s4[10].zw ^= (~s4[11].xy) & s4[11].zw;
+    CASE(s4[10].z, s4[10].w, s4[11].x, s4[11].y, s4[11].z, s4[11].w);
+    // s4[11].xy ^= (~s4[11].zw) & s4[12].xy;
+    CASE(s4[11].x, s4[11].y, s4[11].z, s4[11].w, s4[12].x, s4[12].y);
+    // s4[11].zw ^= (~s4[12].xy) & v;
+    CASE(s4[11].z, s4[11].w, s4[12].x, s4[12].y, v.x, v.y);
+    // s4[12].xy ^= (~v) & w;
+    CASE(s4[12].x, s4[12].y, v.x, v.y, w.x, w.y);
+#undef CASE
+
+    /* iota: a[0,0] ^= round constant */
+    // s4[0].xy ^= as_uint2(keccak_round_constants[i]);
+    s4[0].x ^= uint32_t(keccak_round_constants[i]);
+    s4[0].y ^= uint32_t(keccak_round_constants[i] >> UINT32_NUM_BITS);
+  }
+}
+
+inline __device__ void keccak_block(scrypt_hash_state &S, const uint4 *in4) {
+  uint4 *s4 = S.state4;
+  uint i;
+
+  /* absorb input */
+  for (i = 0; i < 4; i++) {
+    s4[i].x ^= in4[i].x;
+    s4[i].y ^= in4[i].y;
+    s4[i].z ^= in4[i].z;
+    s4[i].w ^= in4[i].w;
+  }
+
+  s4[4].x ^= in4[4].x;
+  s4[4].y ^= in4[4].y;
+
+  keccak_block_core(S);
+}
+
+inline __device__ void keccak_block_zero(scrypt_hash_state &S,
+                                         const uint4 *in4) {
+  uint4 *s4 = S.state4;
+  uint i;
+
+  /* absorb input */
+  for (i = 0; i < 4; i++) {
+    s4[i] = in4[i];
+  }
+  // s4[4].xyzw = (uint4)(in4[4].xy, 0, 0);
+  s4[4] = make_uint4(in4[4].x, in4[4].y, 0, 0);
+
+  for (i = 5; i < 12; i++) {
+    // s4[i] = ZERO;
+    s4[i] = make_zero<uint4>();
+  }
+  // s4[12].xy = ZERO_UINT2;
+  s4[12].x = 0;
+  s4[12].y = 0;
+
+  keccak_block_core(S);
+}
+
+inline __device__ void scrypt_hash_update_72(scrypt_hash_state &S,
+                                             const uint4 *in4) {
+  /* handle the current data */
+  keccak_block_zero(S, in4);
+}
+
+inline __device__ void scrypt_hash_update_80(scrypt_hash_state &S,
+                                             const uint4 *in4) {
+  const uchar1 *in = (const uchar1 *)in4;
+  // uint i;
+
+  /* handle the current data */
+  keccak_block(S, in4);
+  in += SCRYPT_HASH_BLOCK_SIZE;
+
+  /* handle leftover data */
+  // S->leftover = 2;
+
+  {
+    const uint2 *in2 = (const uint2 *)in;
+
+    // S->buffer4[0].xy = int2[0].xy;
+    S.buffer4[0].x = in2[0].x;
+    S.buffer4[0].y = in2[0].y;
+  }
+}
+
+inline __device__ void scrypt_hash_update_128(scrypt_hash_state &S,
+                                              const uint4 *in4) {
+  const uchar1 *in = (const uchar1 *)in4;
+  // uint i;
+
+  /* handle the current data */
+  keccak_block(S, in4);
+  in += SCRYPT_HASH_BLOCK_SIZE;
+
+  /* handle leftover data */
+  // S->leftover = 14;
+
+  {
+    const uint2 *in2 = (const uint2 *)in;
+
+    for (uint i = 0; i < 3; i++) {
+      S.buffer4[i] = make_uint4(in2[2 * i].x, in2[2 * i].y, in2[2 * i + 1].x,
+                                in2[2 * i + 1].y);
+    }
+    // S->buffer4[3].xy = int2[6].xy;
+    S.buffer4[3].x = in2[6].x;
+    S.buffer4[3].y = in2[6].y;
+  }
+}
+
+inline __device__ void scrypt_hash_update_4_after_72(scrypt_hash_state &S,
+                                                     uint in) {
+  S.buffer4[0] = make_uint4(in, 0x01, 0, 0);
+}
+
+inline __device__ void scrypt_hash_update_4_after_80(scrypt_hash_state &S,
+                                                     uint in) {
+  // assume that leftover = 2
+  /* handle the previous data */
+  // S->buffer4[0].zw = (uint2)(in, 0x01);
+  S.buffer4[0].z = in;
+  S.buffer4[0].w = 0x01;
+  // S->leftover += 1;
+}
+
+inline __device__ void scrypt_hash_update_4_after_128(scrypt_hash_state &S,
+                                                      uint in) {
+  // leftover = 14
+  /* handle the previous data */
+  // S->buffer4[3].zw = (uint2)(in, 0x01);
+  S.buffer4[3].z = in;
+  S.buffer4[3].w = 0x01;
+  // S->leftover += 1;
+}
+
+inline __device__ void scrypt_hash_update_64(scrypt_hash_state &S,
+                                             const uint4 *in4) {
+  /* handle leftover data */
+  // S->leftover = 16;
+  for (uint32_t i = 0; i < 4; i++) {
+    S.buffer4[i] = in4[i];
+  }
+}
+
+inline __device__ void scrypt_hash_finish_80_after_64(scrypt_hash_state &S,
+                                                      uint4 *hash4) {
+  // assume that leftover = 16
+  // S->buffer4[4].xy = (uint2)(0x01, 0x80000000);
+  S.buffer4[4].x = 0x01;
+  S.buffer4[4].y = 0x80000000;
+
+  keccak_block(S, S.buffer4);
+
+  for (uint i = 0; i < 4; i++) {
+    hash4[i] = S.state4[i];
+  }
+}
+
+inline __device__ void scrypt_hash_finish_80_after_80_4(scrypt_hash_state &S,
+                                                        uint4 *hash4) {
+  // assume that leftover = 3
+  // S->buffer4[0].w = 0x01; // done already in scrypt_hash_update_4_after_80
+  for (uint i = 1; i < 4; i++) {
+    S.buffer4[i] = make_zero<uint4>();
+  }
+  // S->buffer4[4].xy = (uint2)(0, 0x80000000);
+  S.buffer4[4].x = 0;
+  S.buffer4[4].y = 0x80000000;
+
+  keccak_block(S, S.buffer4);
+
+  for (uint i = 0; i < 4; i++) {
+    hash4[i] = S.state4[i];
+  }
+}
+
+inline __device__ void scrypt_hash_finish_80_after_128_4(scrypt_hash_state &S,
+                                                         uint4 *hash4) {
+  // leftover = 15
+  // S->buffer4[3].w = 0x01; // done already in scrypt_hash_update_4_after_128
+  // S->buffer4[4].xy = (uint2)(0, 0x80000000);
+  S.buffer4[4].x = 0;
+  S.buffer4[4].y = 0x80000000;
+
+  keccak_block(S, S.buffer4);
+
+  for (uint i = 0; i < 4; i++) {
+    hash4[i] = S.state4[i];
+  }
+}
+
+inline __device__ void scrypt_hash_72(uint4 *hash4, const uint4 *m) {
+  for (uint i = 0; i < 4; i++) {
+    hash4[i] = m[i];
+  }
+  // hash4[4].xy = m[4].xy;
+  hash4[4].x = m[4].x;
+  hash4[4].y = m[4].y;
+}
+
+inline __device__ void scrypt_hash_80(uint4 *hash4, const uint4 *m) {
+  const uchar1 *in = (const uchar1 *)m;
+  scrypt_hash_state st;
+
+  /* handle the current data */
+  keccak_block_zero(st, m);
+  in += SCRYPT_HASH_BLOCK_SIZE;
+
+  {
+    const uint2 *in2 = (const uint2 *)in;
+    // st.buffer4[0].xyzw = (uint4)(in2[0].xy, 0x01, 0);
+    st.buffer4[0] = make_uint4(in2[0].x, in2[0].y, 0x01, 0);
+  }
+
+  for (uint i = 1; i < 4; i++) {
+    st.buffer4[i] = make_zero<uint4>();
+  }
+  // st.buffer4[4].xyzw = (uint4)(0, 0x80000000, 0, 0);
+  st.buffer4[4] = make_uint4(0, 0x80000000, 0, 0);
+
+  keccak_block(st, st.buffer4);
+
+  for (uint i = 0; i < 4; i++) {
+    hash4[i] = st.state4[i];
+  }
+}
+
+/* hmac */
+constexpr uint KEY_0X36 = 0x36363636;
+constexpr uint KEY_0X36_XOR_0X5C = 0x6A6A6A6A;
+
+inline __device__ void scrypt_hmac_init(scrypt_hmac_state &st,
+                                        const uint4 *key) {
+  uint4 pad4[SCRYPT_HASH_BLOCK_SIZE / 16 + 1];
+
+  scrypt_hash_72(pad4, key);
+
+  /* inner = (key ^ 0x36) */
+  /* h(inner || ...) */
+  for (uint i = 0; i < 4; i++) {
+    pad4[i].x ^= KEY_0X36;
+    pad4[i].y ^= KEY_0X36;
+    pad4[i].z ^= KEY_0X36;
+    pad4[i].w ^= KEY_0X36;
+  }
+  // pad4[4].xy ^= KEY_0X36_2;
+  pad4[4].x ^= KEY_0X36;
+  pad4[4].y ^= KEY_0X36;
+
+  scrypt_hash_update_72(st.inner, pad4);
+
+  /* outer = (key ^ 0x5c) */
+  /* h(outer || ...) */
+  for (uint i = 0; i < 4; i++) {
+    pad4[i].x ^= KEY_0X36_XOR_0X5C;
+    pad4[i].y ^= KEY_0X36_XOR_0X5C;
+    pad4[i].z ^= KEY_0X36_XOR_0X5C;
+    pad4[i].w ^= KEY_0X36_XOR_0X5C;
+  }
+  // pad4[4].xy ^= KEY_0X36_XOR_0X5C_2;
+  pad4[4].x ^= KEY_0X36_XOR_0X5C;
+  pad4[4].y ^= KEY_0X36_XOR_0X5C;
+
+  scrypt_hash_update_72(st.outer, pad4);
+}
+
+inline __device__ void scrypt_hmac_update_80(scrypt_hmac_state &st,
+                                             const uint4 *m) {
+  /* h(inner || m...) */
+  scrypt_hash_update_80(st.inner, m);
+}
+
+inline __device__ void scrypt_hmac_update_72(scrypt_hmac_state &st,
+                                             const uint4 *m) {
+  /* h(inner || m...) */
+  scrypt_hash_update_72(st.inner, m);
+}
+
+inline __device__ void scrypt_hmac_update_128(scrypt_hmac_state &st,
+                                              const uint4 *m) {
+  /* h(inner || m...) */
+  scrypt_hash_update_128(st.inner, m);
+}
+
+inline __device__ void scrypt_hmac_update_4_after_72(scrypt_hmac_state &st,
+                                                     uint m) {
+  /* h(inner || m...) */
+  scrypt_hash_update_4_after_72(st.inner, m);
+}
+
+inline __device__ void scrypt_hmac_update_4_after_80(scrypt_hmac_state &st,
+                                                     uint m) {
+  /* h(inner || m...) */
+  scrypt_hash_update_4_after_80(st.inner, m);
+}
+
+inline __device__ void scrypt_hmac_update_4_after_128(scrypt_hmac_state &st,
+                                                      uint m) {
+  /* h(inner || m...) */
+  scrypt_hash_update_4_after_128(st.inner, m);
+}
+
+inline __device__ void scrypt_hmac_finish_128B(scrypt_hmac_state &st,
+                                               uint4 *mac) {
+  /* h(inner || m) */
+  uint4 innerhash[4];
+  scrypt_hash_finish_80_after_80_4(st.inner, innerhash);
+
+  /* h(outer || h(inner || m)) */
+  scrypt_hash_update_64(st.outer, innerhash);
+  scrypt_hash_finish_80_after_64(st.outer, mac);
+}
+
+inline __device__ void scrypt_hmac_finish_32B(scrypt_hmac_state &st,
+                                              uint4 *mac) {
+  /* h(inner || m) */
+  uint4 innerhash[4];
+  scrypt_hash_finish_80_after_128_4(st.inner, innerhash);
+
+  /* h(outer || h(inner || m)) */
+  scrypt_hash_update_64(st.outer, innerhash);
+  scrypt_hash_finish_80_after_64(st.outer, mac);
+}
+
+inline __device__ void scrypt_copy_hmac_state_128B(
+  scrypt_hmac_state &dest, const scrypt_hmac_state &src) {
+  for (uint i = 0; i < 12; i++) {
+    dest.inner.state4[i] = src.inner.state4[i];
+  }
+  // dest->inner.state4[12].xy = src->inner.state4[12].xy;
+  dest.inner.state4[12].x = src.inner.state4[12].x;
+  dest.inner.state4[12].y = src.inner.state4[12].y;
+
+  // dest->inner.buffer4[0].xy = src->inner.buffer4[0].xy;
+  dest.inner.buffer4[0].x = src.inner.buffer4[0].x;
+  dest.inner.buffer4[0].y = src.inner.buffer4[0].y;
+
+  for (uint i = 0; i < 12; i++) {
+    dest.outer.state4[i] = src.outer.state4[i];
+  }
+  // dest->outer.state4[12].xy = src->outer.state4[12].xy;
+  dest.outer.state4[12].x = src.outer.state4[12].x;
+  dest.outer.state4[12].y = src.outer.state4[12].y;
+}
+
+constexpr uint be1 = 0x01000000;
+constexpr uint be2 = 0x02000000;
+
+inline __device__ void scrypt_pbkdf2_128B(const uint4 *password, uint4 *out4) {
+  scrypt_hmac_state hmac_pw, work;
+  uint4 ti4[4];
+
+  /* bytes must be <= (0xffffffff - (SCRYPT_HASH_DIGEST_SIZE - 1)), which they
+   * will always be under scrypt */
+
+  /* hmac(password, ...) */
+  scrypt_hmac_init(hmac_pw, password);
+
+  /* hmac(password, salt...) */
+  // Skip salt
+  // scrypt_hmac_update_80(&hmac_pw, salt);
+
+  /* U1 = hmac(password, salt || be(i)) */
+  /* U32TO8_BE(be, i); */
+  // work = hmac_pw;
+  scrypt_copy_hmac_state_128B(work, hmac_pw);
+  scrypt_hmac_update_4_after_72(work, be1);
+  scrypt_hmac_finish_128B(work, ti4);
+
+  for (uint i = 0; i < 4; i++) {
+    out4[i] = ti4[i];
+  }
+
+  /* U1 = hmac(password, salt || be(i)) */
+  /* U32TO8_BE(be, i); */
+  // work = hmac_pw;
+  scrypt_hmac_update_4_after_72(hmac_pw, be2);
+  scrypt_hmac_finish_128B(hmac_pw, ti4);
+
+  for (uint i = 0; i < 4; i++) {
+    out4[i + 4] = ti4[i];
+  }
+}
+
+inline __device__ void scrypt_pbkdf2_32B(const uint4 *password,
+                                         const uint4 *salt,
+                                         uint4 *__restrict__ out4) {
+  scrypt_hmac_state hmac_pw;
+  uint4 ti4[4];
+
+  /* bytes must be <= (0xffffffff - (SCRYPT_HASH_DIGEST_SIZE - 1)), which they
+   * will always be under scrypt */
+
+  /* hmac(password, ...) */
+  scrypt_hmac_init(hmac_pw, password);
+
+  /* hmac(password, salt...) */
+  scrypt_hmac_update_128(hmac_pw, salt);
+
+  /* U1 = hmac(password, salt || be(i)) */
+  /* U32TO8_BE(be, i); */
+  scrypt_hmac_update_4_after_128(hmac_pw, be1);
+  scrypt_hmac_finish_32B(hmac_pw, ti4);
+
+  for (uint i = 0; i < 2; i++) {
+    out4[i] = ti4[i];
+  }
+}
+#endif
diff --git a/src/device/romix.cuh b/src/device/romix.cuh
new file mode 100644
index 0000000..b897b84
--- /dev/null
+++ b/src/device/romix.cuh
@@ -0,0 +1,505 @@
+#ifndef DEVICE_ROMIX_CUH
+#define DEVICE_ROMIX_CUH
+
+#include <cooperative_groups.h>
+#include <cooperative_groups/memcpy_async.h>
+
+#include "device/common.cuh"
+
+constexpr uint4 MASK_2{1, 2, 3, 0}, MASK_3{2, 3, 0, 1}, MASK_4{3, 0, 1, 2},
+    ROTATE_16{16, 16, 16, 16}, ROTATE_12{12, 12, 12, 12}, ROTATE_8{8, 8, 8, 8},
+    ROTATE_7{7, 7, 7, 7};
+
+inline __device__ void chacha_core(uint4 *__restrict__ state) {
+  uint4 x[4];
+  uint4 t;
+
+  x[0] = state[0];
+  x[1] = state[1];
+  x[2] = state[2];
+  x[3] = state[3];
+
+#define CASE(D_x, D_y, D_z, D_w, S_x, S_y, S_z, S_w, R) \
+  D_x = rotl32<R.x>(S_x);                               \
+  D_y = rotl32<R.y>(S_y);                               \
+  D_z = rotl32<R.z>(S_z);                               \
+  D_w = rotl32<R.w>(S_w);
+
+  for (uint32_t rounds = 0; rounds < 4; rounds++) {
+    x[0] += x[1];
+    t = x[3] ^ x[0];
+    // x[3] = ROTL32(t, ROTATE_16);
+    CASE(x[3].x, x[3].y, x[3].z, x[3].w, t.x, t.y, t.z, t.w, ROTATE_16);
+    x[2] += x[3];
+    t = x[1] ^ x[2];
+    // x[1] = ROTL32(t, ROTATE_12);
+    CASE(x[1].x, x[1].y, x[1].z, x[1].w, t.x, t.y, t.z, t.w, ROTATE_12);
+    x[0] += x[1];
+    t = x[3] ^ x[0];
+    // x[3] = ROTL32(t, ROTATE_8);
+    CASE(x[3].x, x[3].y, x[3].z, x[3].w, t.x, t.y, t.z, t.w, ROTATE_8);
+    x[2] += x[3];
+    t = x[1] ^ x[2];
+    // x[1] = ROTL32(t, ROTATE_7);
+    CASE(x[1].x, x[1].y, x[1].z, x[1].w, t.x, t.y, t.z, t.w, ROTATE_7);
+
+    // x[1] = shuffle(x[1], MASK_2);
+    // x[2] = shuffle(x[2], MASK_3);
+    // x[3] = shuffle(x[3], MASK_4);
+
+    // x[0] += x[1].yzwx;
+    x[0] += make_uint4(x[1].y, x[1].z, x[1].w, x[1].x);
+    // t = x[3].wxyz ^ x[0];
+    t = make_uint4(x[3].w, x[3].x, x[3].y, x[3].z) ^ x[0];
+    // x[3].wxyz = ROTL32(t, ROTATE_16);
+    CASE(x[3].w, x[3].x, x[3].y, x[3].z, t.x, t.y, t.z, t.w, ROTATE_16);
+    // x[2].zwxy += x[3].wxyz;
+    x[2] += make_uint4(x[3].y, x[3].z, x[3].w, x[3].x);
+    // t = x[1].yzwx ^ x[2].zwxy;
+    t = make_uint4(x[1].y, x[1].z, x[1].w, x[1].x) ^
+        make_uint4(x[2].z, x[2].w, x[2].x, x[2].y);
+    // x[1].yzwx = ROTL32(t, ROTATE_12);
+    CASE(x[1].y, x[1].z, x[1].w, x[1].x, t.x, t.y, t.z, t.w, ROTATE_12);
+    // x[0] += x[1].yzwx;
+    x[0] += make_uint4(x[1].y, x[1].z, x[1].w, x[1].x);
+    // t = x[3].wxyz ^ x[0];
+    t = make_uint4(x[3].w, x[3].x, x[3].y, x[3].z) ^ x[0];
+    // x[3].wxyz = ROTL32(t, ROTATE_8);
+    CASE(x[3].w, x[3].x, x[3].y, x[3].z, t.x, t.y, t.z, t.w, ROTATE_8);
+    // x[2].zwxy += x[3].wxyz;
+    x[2] += make_uint4(x[3].y, x[3].z, x[3].w, x[3].x);
+    // t = x[1].yzwx ^ x[2].zwxy;
+    t = make_uint4(x[1].y, x[1].z, x[1].w, x[1].x) ^
+        make_uint4(x[2].z, x[2].w, x[2].x, x[2].y);
+    // x[1].yzwx = ROTL32(t, ROTATE_7);
+    CASE(x[1].y, x[1].z, x[1].w, x[1].x, t.x, t.y, t.z, t.w, ROTATE_7);
+
+    // x[1] = shuffle(x[1], MASK_4);
+    // x[2] = shuffle(x[2], MASK_3);
+    // x[3] = shuffle(x[3], MASK_2);
+  }
+#undef CASE
+
+  state[0] += x[0];
+  state[1] += x[1];
+  state[2] += x[2];
+  state[3] += x[3];
+}
+
+inline __device__ void scrypt_ChunkMix_inplace_Bxor_local(
+    uint4 *__restrict__ B /*[chunkWords]*/,
+    uint4 *__restrict__ Bxor /*[chunkWords]*/) {
+  /* 1: X = B_{2r - 1} */
+
+  /* 2: for i = 0 to 2r - 1 do */
+  /* 3: X = H(X ^ B_i) */
+  B[0] ^= B[4] ^ Bxor[4] ^ Bxor[0];
+  B[1] ^= B[5] ^ Bxor[5] ^ Bxor[1];
+  B[2] ^= B[6] ^ Bxor[6] ^ Bxor[2];
+  B[3] ^= B[7] ^ Bxor[7] ^ Bxor[3];
+
+  /* SCRYPT_MIX_FN */
+  chacha_core(B);
+
+  /* 4: Y_i = X */
+  /* 6: B'[0..r-1] = Y_even */
+  /* 6: B'[r..2r-1] = Y_odd */
+
+  /* 3: X = H(X ^ B_i) */
+  B[4] ^= B[0] ^ Bxor[4];
+  B[5] ^= B[1] ^ Bxor[5];
+  B[6] ^= B[2] ^ Bxor[6];
+  B[7] ^= B[3] ^ Bxor[7];
+
+  /* SCRYPT_MIX_FN */
+  chacha_core(B + 4);
+
+  /* 4: Y_i = X */
+  /* 6: B'[0..r-1] = Y_even */
+  /* 6: B'[r..2r-1] = Y_odd */
+}
+
+inline __device__ void scrypt_ChunkMix_inplace_local(
+    uint4 *__restrict__ B /*[chunkWords]*/) {
+  /* 1: X = B_{2r - 1} */
+
+  /* 2: for i = 0 to 2r - 1 do */
+  /* 3: X = H(X ^ B_i) */
+  B[0] ^= B[4];
+  B[1] ^= B[5];
+  B[2] ^= B[6];
+  B[3] ^= B[7];
+
+  /* SCRYPT_MIX_FN */
+  chacha_core(B);
+
+  /* 4: Y_i = X */
+  /* 6: B'[0..r-1] = Y_even */
+  /* 6: B'[r..2r-1] = Y_odd */
+
+  /* 3: X = H(X ^ B_i) */
+  B[4] ^= B[0];
+  B[5] ^= B[1];
+  B[6] ^= B[2];
+  B[7] ^= B[3];
+
+  /* SCRYPT_MIX_FN */
+  chacha_core(B + 4);
+
+  /* 4: Y_i = X */
+  /* 6: B'[0..r-1] = Y_even */
+  /* 6: B'[r..2r-1] = Y_odd */
+}
+
+#define Coord(x, y, z) x + y *(x##SIZE) + z *(y##SIZE) * (x##SIZE)
+#define CO Coord(z, x, y)
+
+template <uint32_t LOOKUP_GAP>
+inline __device__ void scrypt_ROMix_org(uint4 *__restrict__ X,
+                                        uint4 *__restrict__ lookup,
+                                        uint32_t tnum, uint32_t tid) {
+  constexpr uint32_t N = 8 * 1024;
+  const uint32_t zSIZE = 8;
+  const uint32_t ySIZE = (N / LOOKUP_GAP + (N % LOOKUP_GAP > 0));
+  UNUSED(ySIZE);
+  const uint32_t xSIZE = tnum;
+  const uint32_t x = tid % xSIZE;
+  uint32_t i = 0, j = 0, y = 0, z = 0;
+  uint4 W[8];
+
+  /* 1: X = B */
+  /* implicit */
+
+  /* 2: for i = 0 to N - 1 do */
+  for (y = 0; y < N / LOOKUP_GAP; y++) {
+    /* 3: V_i = X */
+    for (z = 0; z < zSIZE; z++) {
+      lookup[CO] = X[z];
+    }
+
+    for (j = 0; j < LOOKUP_GAP; j++) {
+      /* 4: X = H(X) */
+      scrypt_ChunkMix_inplace_local(X);
+    }
+  }
+
+  /* 6: for i = 0 to N - 1 do */
+  for (i = 0; i < N; i++) {
+    /* 7: j = Integerify(X) % N */
+    j = X[4].x & (N - 1);
+    y = j / LOOKUP_GAP;
+
+    for (z = 0; z < zSIZE; z++) {
+      W[z] = lookup[CO];
+    }
+
+    if constexpr (LOOKUP_GAP == 2) {
+      if (j & 1) {
+        scrypt_ChunkMix_inplace_local(W);
+      }
+    }
+
+    if constexpr (LOOKUP_GAP > 2) {
+      uint c = j % LOOKUP_GAP;
+      for (uint k = 0; k < c; k++) {
+        scrypt_ChunkMix_inplace_local(W);
+      }
+    }
+
+    /* 8: X = H(X ^ V_j) */
+    scrypt_ChunkMix_inplace_Bxor_local(X, W);
+  }
+
+  /* 10: B' = X */
+  /* implicit */
+}
+
+template <uint32_t LOOKUP_GAP>
+inline __device__ void scrypt_ROMix_coalesce_access_v1(
+    uint4 *__restrict__ X, uint32_t *__restrict__ lookup, uint32_t tnum,
+    uint32_t tid) {
+  extern __shared__ uint32_t smem[];
+  const uint32_t row_length = warpSize + 1;
+  uint32_t warp_work_space_size = row_length * warpSize;
+  uint32_t warp_id = threadIdx.x / warpSize;
+  uint32_t lane_id = threadIdx.x % warpSize;
+  const uint32_t smem_base_offset = warp_work_space_size * warp_id;
+
+  constexpr uint32_t N = 8 * 1024;
+  const uint32_t zSIZE = 32;
+  const uint32_t ySIZE = (N / LOOKUP_GAP + (N % LOOKUP_GAP > 0));
+  UNUSED(ySIZE);
+  const uint32_t xSIZE = tnum;
+  const uint32_t x = tid % xSIZE;
+  uint4 W[8];
+
+  /* 1: X = B */
+  /* implicit */
+
+  /* 2: for i = 0 to N - 1 do */
+  for (uint32_t y = 0; y < N / LOOKUP_GAP; y++) {
+    uint32_t smem_offset = smem_base_offset + row_length * lane_id;
+    for (uint32_t z = 0; z < zSIZE / 4; z++) {
+      smem[smem_offset++] = X[z].x;
+      smem[smem_offset++] = X[z].y;
+      smem[smem_offset++] = X[z].z;
+      smem[smem_offset++] = X[z].w;
+    }
+    __syncwarp();
+    uint32_t offset = x * zSIZE + y * xSIZE * zSIZE;
+    for (uint32_t k = 0; k < 128 / sizeof(uint32_t); k++) {
+      uint32_t cur_offset = __shfl_sync(0xffffffff, offset, k);
+      lookup[cur_offset + lane_id] =
+          smem[smem_base_offset + k * row_length + lane_id];
+    }
+    __syncwarp();
+
+    for (uint32_t j = 0; j < LOOKUP_GAP; j++) {
+      /* 4: X = H(X) */
+      scrypt_ChunkMix_inplace_local(X);
+    }
+  }
+
+  /* 6: for i = 0 to N - 1 do */
+  for (uint32_t i = 0; i < N; i++) {
+    /* 7: j = Integerify(X) % N */
+    uint32_t j = X[4].x & (N - 1);
+    uint32_t y = j / LOOKUP_GAP;
+
+    uint32_t offset = x * zSIZE + y * xSIZE * zSIZE;
+    for (uint32_t k = 0; k < 128 / sizeof(uint32_t); k++) {
+      uint32_t cur_offset = __shfl_sync(0xffffffff, offset, k);
+      smem[smem_base_offset + k * row_length + lane_id] =
+          lookup[cur_offset + lane_id];
+    }
+    uint32_t smem_offset = smem_base_offset + row_length * lane_id;
+    for (uint32_t z = 0; z < zSIZE / 4; z++) {
+      W[z].x = smem[smem_offset++];
+      W[z].y = smem[smem_offset++];
+      W[z].z = smem[smem_offset++];
+      W[z].w = smem[smem_offset++];
+    }
+    __syncwarp();
+
+    if constexpr (LOOKUP_GAP == 2) {
+      if (j & 1) {
+        scrypt_ChunkMix_inplace_local(W);
+      }
+    }
+
+    if constexpr (LOOKUP_GAP > 2) {
+      uint c = j % LOOKUP_GAP;
+      for (uint k = 0; k < c; k++) {
+        scrypt_ChunkMix_inplace_local(W);
+      }
+    }
+
+    /* 8: X = H(X ^ V_j) */
+    scrypt_ChunkMix_inplace_Bxor_local(X, W);
+  }
+
+  /* 10: B' = X */
+  /* implicit */
+}
+
+template <uint32_t LOOKUP_GAP>
+__device__ void scrypt_ROMix_coalesce_access_v2(uint4 *__restrict__ X,
+                                                uint64_t *__restrict__ lookup,
+                                                uint32_t tnum, uint32_t tid) {
+  extern __shared__ uint64_t smem_v2[];
+  const uint32_t sub_warp_size = 16;
+  const uint32_t row_length = warpSize + 1;
+  uint32_t warp_work_space_size = row_length * sub_warp_size;
+  uint32_t warp_id = threadIdx.x / warpSize;
+  uint32_t sub_warp_id = threadIdx.x / sub_warp_size % 2;
+  uint32_t lane_id = threadIdx.x % warpSize;
+  uint32_t sub_lane_id = threadIdx.x % sub_warp_size;
+  const uint32_t smem_base_offset = warp_work_space_size * warp_id;
+
+  constexpr uint32_t N = 8 * 1024;
+  const uint32_t zSIZE = 16;
+  const uint32_t ySIZE = (N / LOOKUP_GAP + (N % LOOKUP_GAP > 0));
+  UNUSED(ySIZE);
+  const uint32_t xSIZE = tnum;
+  const uint32_t x = tid % xSIZE;
+  uint4 W[8];
+
+  /* 1: X = B */
+  /* implicit */
+
+  /* 2: for i = 0 to N - 1 do */
+  for (uint32_t y = 0; y < N / LOOKUP_GAP; y++) {
+    uint32_t smem_offset = smem_base_offset + row_length * sub_lane_id +
+                           sub_warp_id * sub_warp_size;
+    for (uint32_t z = 0; z < 128 / sizeof(uint4); z++) {
+      smem_v2[smem_offset++] = (uint64_t)X[z].x | ((uint64_t)X[z].y << 32);
+      smem_v2[smem_offset++] = (uint64_t)X[z].z | ((uint64_t)X[z].w << 32);
+    }
+    __syncwarp();
+    uint32_t offset = x * zSIZE + y * xSIZE * zSIZE;
+    for (uint32_t k = 0; k < 128 / sizeof(uint64_t); k++) {
+      uint32_t cur_offset = __shfl_sync(0xffffffff, offset, k, 16);
+      lookup[cur_offset + sub_lane_id] =
+          smem_v2[smem_base_offset + k * row_length + lane_id];
+    }
+    __syncwarp();
+
+    for (uint32_t j = 0; j < LOOKUP_GAP; j++) {
+      /* 4: X = H(X) */
+      scrypt_ChunkMix_inplace_local(X);
+    }
+  }
+
+  /* 6: for i = 0 to N - 1 do */
+  for (uint32_t i = 0; i < N; i++) {
+    /* 7: j = Integerify(X) % N */
+    uint32_t j = X[4].x & (N - 1);
+    uint32_t y = j / LOOKUP_GAP;
+
+    uint32_t offset = x * zSIZE + y * xSIZE * zSIZE;
+    uint64_t tmp[16];
+    for (uint32_t k = 0; k < 128 / sizeof(uint64_t); k++) {
+      uint32_t cur_offset = __shfl_sync(0xffffffff, offset, k, 16);
+      tmp[k] = lookup[cur_offset + sub_lane_id];
+    }
+    __syncwarp();
+    for (uint32_t k = 0; k < 128 / sizeof(uint64_t); k++) {
+      smem_v2[smem_base_offset + k * row_length + lane_id] = tmp[k];
+    }
+    __syncwarp();
+
+    uint32_t smem_offset = smem_base_offset + row_length * sub_lane_id +
+                           sub_warp_id * sub_warp_size;
+    for (uint32_t z = 0; z < 128 / sizeof(uint4); z++) {
+      auto t = smem_v2[smem_offset++];
+      W[z].x = t;
+      W[z].y = t >> 32;
+      t = smem_v2[smem_offset++];
+      W[z].z = t;
+      W[z].w = t >> 32;
+    }
+    __syncwarp();
+
+    if constexpr (LOOKUP_GAP == 2) {
+      if (j & 1) {
+        scrypt_ChunkMix_inplace_local(W);
+      }
+    }
+
+    if constexpr (LOOKUP_GAP > 2) {
+      uint c = j % LOOKUP_GAP;
+      for (uint k = 0; k < c; k++) {
+        scrypt_ChunkMix_inplace_local(W);
+      }
+    }
+
+    /* 8: X = H(X ^ V_j) */
+    scrypt_ChunkMix_inplace_Bxor_local(X, W);
+  }
+
+  /* 10: B' = X */
+  /* implicit */
+}
+
+template <uint32_t LOOKUP_GAP>
+__device__ void scrypt_ROMix_coalesce_access_v3(uint4 *__restrict__ X,
+                                                uint4 *__restrict__ lookup,
+                                                uint32_t tnum, uint32_t tid) {
+  namespace cg = cooperative_groups;
+  extern __shared__ uint4 smem_v4[];
+  const uint32_t sub_warp_size = 8;
+  const uint32_t row_length = warpSize + 1;
+  auto tb = cg::this_thread_block();
+  auto tile = cg::tiled_partition<sub_warp_size>(tb);  // wrapSize=32
+
+  uint32_t warp_work_space_size = row_length * sub_warp_size;
+  uint32_t warp_id = threadIdx.x / warpSize;
+  uint32_t lane_id = threadIdx.x % warpSize;
+  uint32_t sub_warp_id = lane_id / sub_warp_size;
+  uint32_t sub_lane_id = threadIdx.x % sub_warp_size;
+  const uint32_t smem_base_offset = warp_work_space_size * warp_id;
+
+  constexpr uint32_t N = 8 * 1024;
+  const uint32_t zSIZE = 8;
+  const uint32_t ySIZE = (N / LOOKUP_GAP + (N % LOOKUP_GAP > 0));
+  UNUSED(ySIZE);
+  const uint32_t xSIZE = tnum;
+  const uint32_t x = tid % xSIZE;
+  uint4 W[8];
+
+  /* 1: X = B */
+  /* implicit */
+
+  /* 2: for i = 0 to N - 1 do */
+  for (uint32_t y = 0; y < N / LOOKUP_GAP; y++) {
+    uint32_t smem_offset = smem_base_offset + row_length * sub_lane_id +
+                           sub_warp_id * sub_warp_size;
+    for (uint32_t z = 0; z < 128 / sizeof(uint4); z++) {
+      smem_v4[smem_offset++] = X[z];
+    }
+    __syncwarp();
+    uint32_t offset = x * zSIZE + y * xSIZE * zSIZE;
+    for (uint32_t k = 0; k < 128 / sizeof(uint4); k++) {
+      uint32_t cur_offset = __shfl_sync(0xffffffff, offset, k, 8);
+      lookup[cur_offset + sub_lane_id] =
+          smem_v4[smem_base_offset + k * row_length + lane_id];
+    }
+    __syncwarp();
+
+    for (uint32_t j = 0; j < LOOKUP_GAP; j++) {
+      /* 4: X = H(X) */
+      scrypt_ChunkMix_inplace_local(X);
+    }
+  }
+
+  /* 6: for i = 0 to N - 1 do */
+  for (uint32_t i = 0; i < N; i++) {
+    /* 7: j = Integerify(X) % N */
+    uint32_t j = X[4].x & (N - 1);
+    uint32_t y = j / LOOKUP_GAP;
+
+    uint32_t offset = x * zSIZE + y * xSIZE * zSIZE;
+    // uint4 tmp[8];
+    for (uint32_t k = 0; k < 128 / sizeof(uint4); k++) {
+      uint32_t cur_offset = __shfl_sync(0xffffffff, offset, k, 8);
+      // tmp[k] = lookup[cur_offset + sub_lane_id];
+      cg::memcpy_async(tile,
+                       &smem_v4[smem_base_offset + k * row_length +
+                                sub_warp_id * sub_warp_size],
+                       sub_warp_size, &lookup[cur_offset], sub_warp_size);
+    }
+    // __syncwarp();
+    // for (uint32_t k = 0; k < 128 / sizeof(uint4); k++) {
+    //   smem_v4[smem_base_offset + k * row_length + lane_id] = tmp[k];
+    // }
+    // __syncwarp();
+    cg::sync(tile);
+    cg::wait(tile);
+    uint32_t smem_offset = smem_base_offset + row_length * sub_lane_id +
+                           sub_warp_id * sub_warp_size;
+    for (uint32_t z = 0; z < 128 / sizeof(uint4); z++) {
+      W[z] = smem_v4[smem_offset++];
+    }
+    __syncwarp();
+
+    if constexpr (LOOKUP_GAP == 2) {
+      if (j & 1) {
+        scrypt_ChunkMix_inplace_local(W);
+      }
+    }
+
+    if constexpr (LOOKUP_GAP > 2) {
+      uint c = j % LOOKUP_GAP;
+      for (uint k = 0; k < c; k++) {
+        scrypt_ChunkMix_inplace_local(W);
+      }
+    }
+    /* 8: X = H(X ^ V_j) */
+    scrypt_ChunkMix_inplace_Bxor_local(X, W);
+  }
+
+  /* 10: B' = X */
+  /* implicit */
+}
+
+#endif
diff --git a/src/spacemesh.cu b/src/spacemesh.cu
new file mode 100644
index 0000000..27b2a2b
--- /dev/null
+++ b/src/spacemesh.cu
@@ -0,0 +1,135 @@
+#include <stdexcept>
+#include <vector>
+
+#include "device/kernel.cuh"
+#include "spacemesh_cuda/spacemesh.h"
+#include "utils.hpp"
+
+constexpr uint32_t LOOKUP_GAP = 2;
+constexpr uint32_t N = 8192;
+constexpr uint32_t LOOKUP_MEM_FOR_ONE_TASK = 128 * N;
+constexpr uint32_t OUTPUT_MEM_FOR_ONE_TASK = 32;
+
+struct DeviceContext {
+  size_t block_dim;
+  size_t block_num;
+  size_t max_task_num;
+};
+
+class Config {
+public:
+  DeviceContext& GetDeviceContext(size_t device_id) {
+    if (device_id >= device_num_) {
+      throw std::invalid_argument("device_id is invalid!");
+    }
+
+    return device_contexts_.at(device_id);
+  }
+
+  size_t GetDeviceNum() const { return device_num_; }
+
+  static Config& GetDefault() {
+    static Config conf;
+    return conf;
+  };
+
+private:
+  Config();
+  Config(const Config&) = delete;
+  Config(Config&&) = delete;
+  Config& operator=(const Config&) = delete;
+  Config& operator=(Config&&) = delete;
+
+  size_t device_num_;
+  std::vector<DeviceContext> device_contexts_;
+};
+
+inline Config::Config() {
+  {
+    int count;
+    CHECK(cudaGetDeviceCount(&count));
+    if (count <= 0) {
+      throw "cuda device not found!";
+    }
+    device_num_ = count;
+  }
+
+  for (size_t di = 0; di < device_num_; ++di) {
+    GPUContextSwitcher switcher(di);
+    UNUSED(switcher);
+
+    size_t free_mem;
+    size_t total;
+    CHECK(cudaMemGetInfo(&free_mem, &total));
+    UNUSED(total);
+    auto device_prop = GetDeviceProp(di);
+
+    size_t block_num = device_prop.multiProcessorCount;
+    size_t smsp_num = GetSMSPNum(device_prop.major, device_prop.minor);
+    size_t block_dim = device_prop.warpSize * smsp_num;
+
+    size_t use_mem =
+        block_num * block_dim *
+        (OUTPUT_MEM_FOR_ONE_TASK + LOOKUP_MEM_FOR_ONE_TASK / LOOKUP_GAP);
+
+    while (use_mem * 2 < free_mem) {
+      block_num *= 2;
+      use_mem =
+          block_num * block_dim *
+          (OUTPUT_MEM_FOR_ONE_TASK + LOOKUP_MEM_FOR_ONE_TASK / LOOKUP_GAP);
+    }
+    size_t max_thread_num = block_num * block_dim;
+    size_t max_task_num = block_dim * block_num;
+
+    while (true) {
+      use_mem = max_task_num * 2 * OUTPUT_MEM_FOR_ONE_TASK +
+                max_thread_num * LOOKUP_MEM_FOR_ONE_TASK / LOOKUP_GAP;
+      if (use_mem >= free_mem || max_task_num > 1024 * 1024 * 8) {
+        break;
+      } else {
+        max_task_num *= 2;
+      }
+    }
+
+    device_contexts_.push_back({});
+    device_contexts_.back().block_dim = block_dim;
+    device_contexts_.back().block_num = block_num;
+    device_contexts_.back().max_task_num = max_task_num;
+  }
+}
+
+uint32_t spacemesh_get_device_num() {
+  return Config::GetDefault().GetDeviceNum();
+}
+
+uint32_t spacemesh_get_max_task_num(uint32_t device_idx) {
+  return Config::GetDefault().GetDeviceContext(device_idx).max_task_num;
+}
+
+void spacemesh_scrypt(uint32_t device_idx, const uint64_t starting_index,
+                      const uint32_t* input, const uint32_t task_num,
+                      uint32_t* output) {
+  auto& ctx = Config::GetDefault().GetDeviceContext(device_idx);
+  GPUContextSwitcher switcher(device_idx);
+  UNUSED(switcher);
+
+  if (task_num > ctx.max_task_num) {
+    throw std::invalid_argument("task_num must less " +
+                                std::to_string(ctx.max_task_num));
+  }
+
+  CudaDeviceMem<uint4> d_output(task_num * 2);
+  CudaDeviceMem<uint4> d_lookup(ctx.block_dim * ctx.block_num * 8 * N /
+                                LOOKUP_GAP);
+
+  uint4 input_1 = make_uint4(input[0], input[1], input[2], input[3]);
+  uint4 input_2 = make_uint4(input[4], input[5], input[6], input[7]);
+
+  scrypt_coalesce_access_v3<LOOKUP_GAP>
+      <<<ctx.block_num, ctx.block_dim,
+         33ULL * sizeof(uint4) / 4 * ctx.block_dim>>>(
+          starting_index, task_num, input_1, input_2, d_lookup.Ptr(),
+          d_output.Ptr());
+  CHECK(cudaMemcpy(output, d_output.Ptr(), task_num * OUTPUT_MEM_FOR_ONE_TASK,
+                   cudaMemcpyDeviceToHost));
+}
diff --git a/src/utils.hpp b/src/utils.hpp
new file mode 100644
index 0000000..2c716f4
--- /dev/null
+++ b/src/utils.hpp
@@ -0,0 +1,545 @@
+#ifndef UTILS_HPP
+#define UTILS_HPP
+
+#include <cuda_runtime.h>
+
+#include <cstdint>
+#include <stdexcept>
+#include <string>
+
+#define CHECK(call)                                                   \
+  do {                                                                \
+    const cudaError_t error_code = call;                              \
+    if (error_code != cudaSuccess) {                                  \
+      printf("CUDA Error:\n");                                        \
+      printf("    File:   %s\n", __FILE__);                           \
+      printf("    Line:   %d\n", __LINE__);                           \
+      printf("    Error code: %d\n", error_code);                     \
+      printf("    Error text: %s\n", cudaGetErrorString(error_code)); \
+      exit(1);                                                        \
+    }                                                                 \
+  } while (0);
+
+class GPUContextSwitcher {
+public:
+  /**
+   * Use CUDA GPU with index "gpu_index".
+   * @param gpu_index The index of cuda capable GPU.
+   */
+  GPUContextSwitcher(uint32_t gpu_index) {
+    CHECK(cudaGetDevice(&old_gpu_index_));
+    CHECK(cudaSetDevice(gpu_index));
+  }
+
+  /**
+   * Restore the GPU context that was used before creating the
+   * GpuContextSwitcher if Restore() has not been called.
+   */
+  ~GPUContextSwitcher() { Restore(); }
+
+  /**
+   * Restore the GPU context that was used before creating the
+   * GpuContextSwitcher if Restore() has not been called.
+   */
+  void Restore() noexcept {
+    if (old_gpu_index_ != -1) {
+      cudaSetDevice(old_gpu_index_);
+      old_gpu_index_ = -1;
+    }
+  }
+
+private:
+  int old_gpu_index_;
+};
+
+static inline cudaDeviceProp GetDeviceProp(int device_id) {
+  cudaDeviceProp prop;
+  CHECK(cudaGetDeviceProperties(&prop, device_id));
+  return prop;
+}
+
+static inline uint32_t GetSMSPNum(int major, int minor) {
+  const auto& invalid_msg = [](int major, int minor) -> std::string {
+    char buf[64];
+    sprintf(buf, "Invalid argument (major=%d, minor=%d) of compute capability",
+            major, minor);
+    return buf;
+  };
+
+  switch (major) {
+    case 1: {
+      switch (minor) {
+        case 0:
+        case 1:
+        case 2:
+        case 3:
+          return 1;
+        default:
+          throw std::invalid_argument(invalid_msg(major, minor));
+      }
+    }
+    case 2: {
+      switch (minor) {
+        case 0:
+        case 1:
+          return 1;
+        default:
+          throw std::invalid_argument(invalid_msg(major, minor));
+      }
+    }
+    case 3: {
+      switch (minor) {
+        case 0:
+        case 2:
+        case 5:
+        case 7:
+          return 1;
+        default:
+          throw std::invalid_argument(invalid_msg(major, minor));
+      }
+    }
+    case 5: {
+      switch (minor) {
+        case 0:
+        case 2:
+        case 3:
+          return 4;
+        default:
+          throw std::invalid_argument(invalid_msg(major, minor));
+      }
+    }
+    case 6: {
+      switch (minor) {
+        case 0: {
+          return 2;
+        }
+        case 1:
+        case 2: {
+          return 4;
+        }
+        default:
+          throw std::invalid_argument(invalid_msg(major, minor));
+      }
+    }
+    case 7: {
+      switch (minor) {
+        case 0:
+        case 2:
+        case 5:
+          return 4;
+        default:
+          throw std::invalid_argument(invalid_msg(major, minor));
+      }
+    }
+    case 8: {
+      switch (minor) {
+        case 0:
+        case 6:
+        case 7:
+        case 9:
+          return 4;
+        default:
+          throw std::invalid_argument(invalid_msg(major, minor));
+      }
+    }
+    case 9: {
+      switch (minor) {
+        case 0:
+          return 4;
+        default:
+          throw std::invalid_argument(invalid_msg(major, minor));
+      }
+    }
+    default:
+      throw std::invalid_argument(invalid_msg(major, minor));
+  }
+}
+
+/*
+ * An RAII class of cuda device memory.
+ * Represent a one-dimensional array stored on the device.
+ * @tparam E The data type of the elements in the array.
+ */
+template <typename E>
+class CudaDeviceMem {
+public:
+  /**
+   * Default constructor.
+   *  Doesn't allocate array on the device.
+   */
+  CudaDeviceMem() : CudaDeviceMem(0) {}
+
+  /**
+   * Allocates a one-dimensional array capable of accommodating element_num
+   * elements on the device.
+   * @param element_num
+   */
+  CudaDeviceMem(size_t element_num) : d_ptr_(nullptr), element_num_(0) {
+    Resize(element_num);
+  }
+
+  /**
+   * Allocate a device-side array of the same size as "other" and copy all the
+   * data from "other" to this array.
+   * @param other
+   */
+  CudaDeviceMem(const CudaDeviceMem& other) : CudaDeviceMem() { *this = other; }
+
+  /**
+   * Move constructor.
+   * Move the device-side array stored in "other" into this object.
+   * @param other
+   */
+  CudaDeviceMem(CudaDeviceMem&& other) noexcept : CudaDeviceMem() {
+    *this = std::move(other);
+  }
+
+  /**
+   * Destructor.
+   */
+  ~CudaDeviceMem() noexcept {
+    try {
+      Clear();
+    } catch (...) {
+      // Do nothing
+    }
+  }
+
+  /**
+   * Get the pointer to the device-side array.
+   * @return The pointer to the device-side array.
+   */
+  E* Ptr() noexcept { return d_ptr_; }
+
+  /**
+   * Get the pointer to the device-side array.
+   * @return The pointer to the device-side array.
+   */
+  const E* Ptr() const noexcept { return d_ptr_; }
+
+  /**
+   * Type conversion operator.
+   * @return The pointer to the device-side array.
+   */
+  operator E*() noexcept { return d_ptr_; }
+
+  /**
+   * Type conversion operator.
+   * @return The pointer to the device-side array.
+   */
+  operator const E*() const noexcept { return d_ptr_; }
+
+  /**
+   * Get the size of the array on the device side in bytes.
+   * @return The size of the array on the device side, in bytes.
+   */
+  size_t SizeInBytes() const noexcept { return element_num_ * sizeof(E); }
+
+  /**
+   * Get the size of the array on the device side in terms of the number of
+   * elements.
+   * @return The size of the array on the device side in terms of the number of
+   * elements.
+   */
+  size_t Num() const noexcept { return element_num_; }
+
+  /**
+   * Resize the array on the device side.
+   * If the new array size is the same as the old array size, then simply return
+   * directly.
+   * @param element_num New size of the array on the device side in terms of the
+   * number of elements.
+   */
+  void Resize(size_t element_num) {
+    if (element_num_ != element_num) {
+      Clear();
+      if (element_num > 0) {
+        CHECK(cudaMalloc(&d_ptr_, element_num * sizeof(E)));
+      }
+      element_num_ = element_num;
+    }
+  }
+
+  /**
+   * Free the array on the device side.
+   */
+  void Clear() {
+    if (d_ptr_ != nullptr) {
+      CHECK(cudaFree(d_ptr_));
+      d_ptr_ = nullptr;
+      element_num_ = 0;
+    }
+  }
+
+  /**
+   * Copy assignment operator.
+   * Resize this array and copy all data from other to this array.
+   * @param other
+   * @return Reference to this object.
+   */
+  CudaDeviceMem& operator=(const CudaDeviceMem& other) {
+    if (this == &other) {
+      return *this;
+    }
+
+    if (other.d_ptr_ != nullptr) {
+      if (element_num_ != other.element_num_) {
+        Resize(other.element_num_);
+      }
+      CHECK(cudaMemcpy(d_ptr_, other.d_ptr_, other.SizeInBytes(),
+                       cudaMemcpyDeviceToDevice));
+    } else {
+      Clear();
+    }
+    return *this;
+  }
+
+  /**
+   * Move assignment operator.
+   * @param other
+   * @return Reference to this object.
+   */
+  CudaDeviceMem& operator=(CudaDeviceMem&& other) {
+    if (this == &other) {
+      return *this;
+    }
+    Clear();
+    d_ptr_ = other.d_ptr_;
+    other.d_ptr_ = nullptr;
+
+    element_num_ = other.element_num_;
+    other.element_num_ = 0;
+    return *this;
+  }
+
+private:
+  E* d_ptr_;
+  size_t element_num_;
+};
+
+/**
+ * An RAII class of cuda host memory.
+ * Represent a one-dimensional array stored on the host side.
+ * @tparam E The data type of the elements in the array.
+ */
+template <typename E>
+class CudaHostMem {
+public:
+  /**
+   * Default constructor.
+   * Doesn't allocate array on the host side.
+   */
+  CudaHostMem() : CudaHostMem(0) {}
+
+  /**
+   * Allocates a one-dimensional array capable of accommodating element_num
+   * elements with flag cudaHostAllocDefault on the host side.
+   * @param element_num
+   */
+  CudaHostMem(size_t element_num)
+      : d_ptr_(nullptr),
+        h_ptr_(nullptr),
+        element_num_(0),
+        flags_(cudaHostAllocDefault) {
+    Resize(element_num);
+  }
+
+  /**
+   * Allocates a one-dimensional array capable of accommodating element_num
+   * elements with the specified flags on the host side.
+   * @param element_num
+   * @param flags The flags used when allocate host-side memory. The value of
+   * the flags parameter needs to be one of the flags that the cudaHostAlloc()
+   * function can accept.
+   */
+  CudaHostMem(size_t element_num, unsigned int flags)
+      : d_ptr_(nullptr),
+        h_ptr_(nullptr),
+        element_num_(0),
+        flags_(cudaHostAllocDefault) {
+    Resize(element_num, flags);
+  }
+
+  /**
+   * Copy constructor.
+   * Allocate a host-side array of the same size and the same flags as "other"
+   * and copy all the data from "other" to this array.
+   * @param other
+   */
+  CudaHostMem(const CudaHostMem& other) : CudaHostMem() { *this = other; }
+
+  /**
+   * Move constructor.
+   * Move the host-side array stored in "other" into this object.
+   * @param other
+   */
+  CudaHostMem(CudaHostMem&& other) noexcept : CudaHostMem() {
+    *this = std::move(other);
+  }
+
+  /**
+   * Destructor.
+   */
+  ~CudaHostMem() noexcept {
+    try {
+      Clear();
+    } catch (...) {
+      // Do nothing.
+    }
+  }
+
+  /**
+   * Get the host-side pointer to the array.
+   * @return The host-side pointer to the array.
+   */
+  E* HPtr() noexcept { return h_ptr_; }
+
+  /**
+   * Get the host-side pointer to the array.
+   * @return The host-side pointer to the array.
+   */
+  const E* HPtr() const noexcept { return h_ptr_; }
+
+  /**
+   * Get the mapped device-side pointer to the array.
+   * @return The mapped device-side pointer to the array. If the array is not
+   * allocated with flag cudaHostAllocMapped, nullptr is returned.
+   */
+  E* DPtr() noexcept { return d_ptr_; }
+
+  /**
+   * Get the mapped device-side pointer to the array.
+   * @return The mapped device-side pointer to the array. If the array is not
+   * allocated with flag cudaHostAllocMapped, nullptr is returned.
+   */
+  const E* DPtr() const noexcept { return d_ptr_; }
+
+  /**
+   * Get the size of the array on the host side in bytes.
+   * @return The size of the array on the host side, in bytes.
+   */
+  size_t SizeInBytes() const noexcept { return element_num_ * sizeof(E); }
+
+  /**
+   * Get the size of the array on the host side in terms of the number of
+   * elements.
+   * @return The size of the array on the host side in terms of the number of
+   * elements.
+   */
+  size_t Num() const noexcept { return element_num_; }
+
+  /**
+   * Resize the array on the host side.
+   * If the new array size is the same as the old array size, then simply return
+   * directly.
+   * @param element_num New size of the array on the host side in terms of the
+   * number of elements.
+   */
+  void Resize(size_t element_num) { Resize(element_num, flags_); }
+
+  /**
+   * Resize the array on the host side with new flags.
+   * If the new array size and the new flags is the same as the old array then
+   * simply return directly.
+   * @param element_num New size of the array on the host side in terms of the
+   * number of elements.
+   * @param flags The flags used when allocate host-side memory. The value of
+   * the flags parameter needs to be one of the flags that the cudaHostAlloc()
+   * function can accept.
+   */
+  void Resize(size_t element_num, unsigned int flags) {
+    if (element_num == element_num_ && flags == flags_) {
+      return;
+    }
+    Clear();
+    if (element_num > 0) {
+      CHECK(cudaHostAlloc(&h_ptr_, element_num * sizeof(E), flags));
+      if ((cudaHostAllocMapped & flags) != 0) {
+        try {
+          CHECK(cudaHostGetDevicePointer(&d_ptr_, h_ptr_, 0));
+        } catch (std::exception& e) {
+          Clear();
+          throw e;
+        }
+      }
+    }
+    element_num_ = element_num;
+    flags_ = flags;
+  }
+
+  /**
+   * Free the host-side array.
+   */
+  void Clear() {
+    if (h_ptr_ != nullptr) {
+      CHECK(cudaFreeHost(h_ptr_));
+    }
+    h_ptr_ = d_ptr_ = nullptr;
+    element_num_ = 0;
+  }
+
+  /**
+   * Copy assignment operator.
+   * Resize the array managed in this object with the flags used when create the
+   * array managed in the other object. Then copy all data from other to the new
+   * array.
+   * @param other
+   * @return Reference to this object.
+   */
+  CudaHostMem& operator=(const CudaHostMem& other) {
+    if (this == &other) {
+      return *this;
+    }
+
+    if (other.h_ptr_ != nullptr) {
+      if (element_num_ != other.element_num_ || flags_ != other.flags_) {
+        Resize(other.element_num_, other.flags_);
+      }
+      memcpy(h_ptr_, other.h_ptr_, other.SizeInBytes());
+    } else {
+      Clear();
+      flags_ = other.flags_;
+    }
+    return *this;
+  }
+
+  /**
+   * Move assignment operator.
+   * @param other
+   * @return Reference to this object.
+   */
+  CudaHostMem& operator=(CudaHostMem&& other) {
+    if (this == &other) {
+      return *this;
+    }
+    Clear();
+    flags_ = other.flags_;
+    other.flags_ = cudaHostAllocDefault;
+
+    h_ptr_ = other.h_ptr_;
+    other.h_ptr_ = nullptr;
+
+    d_ptr_ = other.d_ptr_;
+    other.d_ptr_ = nullptr;
+
+    element_num_ = other.element_num_;
+    other.element_num_ = 0;
+    return *this;
+  }
+
+  /**
+   * Obtaining the flags used when creating a one-dimensional array on the host
+   * side.
+   * @return The flags used when creating a one-dimensional array on the host
+   * side.
+   */
+  unsigned int GetFlags() const noexcept { return flags_; }
+
+private:
+  E* d_ptr_;
+  E* h_ptr_;
+  size_t element_num_;
+  unsigned int flags_;
+};
+
+#endif
diff --git a/test/perf.cu b/test/perf.cu
new file mode 100644
index 0000000..85515bb
--- /dev/null
+++ b/test/perf.cu
@@ -0,0 +1,182 @@
+#include <cuda_runtime.h>
+
+#include <chrono>
+
+#include "device/kernel.cuh"
+#include "gtest/gtest.h"
+#include "utils.hpp"
+
+TEST(Spacemesh, PerfCoalesceOrg) {
+  constexpr uint32_t LOOKUP_GAP = 2;
+  constexpr uint32_t task_num_per_thread = 128;
+
+  uint4 in0{2839345266U, 42009750U, 875455879U, 2217211394U};
+  uint4 in1{3438177526U, 2734532412U, 2819254414U, 1408356118U};
+
+  const uint32_t N = 8 * 1024;
+  uint64_t starting_index = 0;
+
+  auto device_prop = GetDeviceProp(0);
+  uint32_t smsp_num = GetSMSPNum(device_prop.major, device_prop.minor);
+  uint32_t core_num =
+      device_prop.warpSize * smsp_num * device_prop.multiProcessorCount;
+
+  uint32_t thread_num = core_num * LOOKUP_GAP;
+  uint32_t task_num = thread_num * task_num_per_thread;
+
+  CudaDeviceMem<uint4> d_out(task_num * 2);
+  CudaDeviceMem<uint32_t> d_lookup(N / LOOKUP_GAP * 32 * thread_num);
+  CudaHostMem<uint4> h_out(task_num * 2);
+
+  constexpr size_t block_dim = 256;
+  size_t block_num = thread_num / block_dim;
+  size_t iter = 1;
+
+  auto st = std::chrono::steady_clock::now();
+  for (size_t i = 0; i < iter; ++i) {
+    scrypt_org<LOOKUP_GAP><<<block_num, block_dim>>>(
+        starting_index, task_num, in0, in1,
+        reinterpret_cast<uint4 *>(d_lookup.Ptr()), d_out.Ptr());
+  }
+  CHECK(cudaMemcpy(h_out.HPtr(), d_out.Ptr(), d_out.SizeInBytes(),
+                   cudaMemcpyDeviceToHost));
+  auto ed = std::chrono::steady_clock::now();
+
+  double d =
+      std::chrono::duration_cast<std::chrono::milliseconds>(ed - st).count();
+  std::cout << "[org] block_num: " << block_num << ", block_dim: " << block_dim
+            << ", time: " << d / iter << "ms, throughput: "
+            << task_num * iter / d * 1000 * 16.0 / 1024 / 1024 << "MB/s\n";
+}
+
+TEST(Spacemesh, PerfCoalesceV1) {
+  constexpr uint32_t LOOKUP_GAP = 2;
+  constexpr uint32_t task_num_per_thread = 128;
+
+  uint4 in0{2839345266U, 42009750U, 875455879U, 2217211394U};
+  uint4 in1{3438177526U, 2734532412U, 2819254414U, 1408356118U};
+
+  const uint32_t N = 8 * 1024;
+  uint64_t starting_index = 0;
+
+  auto device_prop = GetDeviceProp(0);
+  uint32_t smsp_num = GetSMSPNum(device_prop.major, device_prop.minor);
+  uint32_t core_num =
+      device_prop.warpSize * smsp_num * device_prop.multiProcessorCount;
+
+  uint32_t thread_num = core_num * LOOKUP_GAP;
+  uint32_t task_num = thread_num * task_num_per_thread;
+
+  CudaDeviceMem<uint4> d_out(task_num * 2);
+  CudaDeviceMem<uint32_t> d_lookup(N / LOOKUP_GAP * 32 * thread_num);
+  CudaHostMem<uint4> h_out(task_num * 2);
+
+  constexpr size_t block_dim = 256;
+  size_t block_num = thread_num / block_dim;
+  size_t iter = 1;
+
+  auto st = std::chrono::steady_clock::now();
+  for (size_t i = 0; i < iter; ++i) {
+    scrypt_coalesce_access_v1<LOOKUP_GAP>
+        <<<block_num, block_dim, 33 * sizeof(uint32_t) * block_dim>>>(
+            starting_index, task_num, in0, in1,
+            reinterpret_cast<uint32_t *>(d_lookup.Ptr()), d_out.Ptr());
+  }
+  CHECK(cudaMemcpy(h_out.HPtr(), d_out.Ptr(), d_out.SizeInBytes(),
+                   cudaMemcpyDeviceToHost));
+  auto ed = std::chrono::steady_clock::now();
+
+  double d =
+      std::chrono::duration_cast<std::chrono::milliseconds>(ed - st).count();
+  std::cout << "[v1] block_num: " << block_num << ", block_dim: " << block_dim
+            << ", time: " << d / iter << "ms, throughput: "
+            << task_num * iter / d * 1000 * 16.0 / 1024 / 1024 << "MB/s\n";
+}
+
+TEST(Spacemesh, PerfCoalesceV2) {
+  constexpr uint32_t LOOKUP_GAP = 2;
+  constexpr uint32_t task_num_per_thread = 512;
+
+  uint4 in0{2839345266U, 42009750U, 875455879U, 2217211394U};
+  uint4 in1{3438177526U, 2734532412U, 2819254414U, 1408356118U};
+
+  const uint32_t N = 8 * 1024;
+  uint64_t starting_index = 0;
+
+  auto device_prop = GetDeviceProp(0);
+  uint32_t smsp_num = GetSMSPNum(device_prop.major, device_prop.minor);
+  uint32_t core_num =
+      device_prop.warpSize * smsp_num * device_prop.multiProcessorCount;
+
+  uint32_t thread_num = core_num * LOOKUP_GAP;
+  uint32_t task_num = thread_num * task_num_per_thread;
+
+  CudaDeviceMem<uint4> d_out(task_num * 2);
+  CudaDeviceMem<uint32_t> d_lookup(N / LOOKUP_GAP * 32 * thread_num);
+  CudaHostMem<uint4> h_out(task_num * 2);
+
+  constexpr size_t block_dim = 256;
+  size_t block_num = thread_num / block_dim;
+  size_t iter = 1;
+
+  auto st = std::chrono::steady_clock::now();
+  for (size_t i = 0; i < iter; ++i) {
+    scrypt_coalesce_access_v2<LOOKUP_GAP>
+        <<<block_num, block_dim, 33 * sizeof(uint64_t) * block_dim / 2>>>(
+            starting_index, task_num, in0, in1,
+            reinterpret_cast<uint64_t *>(d_lookup.Ptr()), d_out.Ptr());
+  }
+  CHECK(cudaMemcpy(h_out.HPtr(), d_out.Ptr(), d_out.SizeInBytes(),
+                   cudaMemcpyDeviceToHost));
+  auto ed = std::chrono::steady_clock::now();
+
+  double d =
+      std::chrono::duration_cast<std::chrono::milliseconds>(ed - st).count();
+  std::cout << "[v2] block_num: " << block_num << ", block_dim: " << block_dim
+            << ", time: " << d / iter << "ms, throughput: "
+            << task_num * iter / d * 1000 * 16.0 / 1024 / 1024 << "MB/s\n";
+}
+
+TEST(Spacemesh, PerfCoalesceV3) {
+  constexpr uint32_t LOOKUP_GAP = 2;
+  constexpr uint32_t task_num_per_thread = 512;
+
+  uint4 in0{2839345266U, 42009750U, 875455879U, 2217211394U};
+  uint4 in1{3438177526U, 2734532412U, 2819254414U, 1408356118U};
+
+  const uint32_t N = 8 * 1024;
+  uint64_t starting_index = 0;
+
+  auto device_prop = GetDeviceProp(0);
+  uint32_t smsp_num = GetSMSPNum(device_prop.major, device_prop.minor);
+  uint32_t core_num =
+      device_prop.warpSize * smsp_num * device_prop.multiProcessorCount;
+
+  uint32_t thread_num = core_num * LOOKUP_GAP;
+  uint32_t task_num = thread_num * task_num_per_thread;
+
+  CudaDeviceMem<uint4> d_out(task_num * 2);
+  CudaDeviceMem<uint32_t> d_lookup(N / LOOKUP_GAP * 32 * thread_num);
+  CudaHostMem<uint4> h_out(task_num * 2);
+
+  constexpr size_t block_dim = 256;
+  size_t block_num = thread_num / block_dim;
+  size_t iter = 1;
+
+  auto st = std::chrono::steady_clock::now();
+  for (size_t i = 0; i < iter; ++i) {
+    scrypt_coalesce_access_v3<LOOKUP_GAP>
+        <<<block_num, block_dim, 33 * sizeof(uint4) * block_dim / 4>>>(
+            starting_index, task_num, in0, in1,
+            reinterpret_cast<uint4 *>(d_lookup.Ptr()), d_out.Ptr());
+  }
+  CHECK(cudaMemcpy(h_out.HPtr(), d_out.Ptr(), d_out.SizeInBytes(),
+                   cudaMemcpyDeviceToHost));
+  auto ed = std::chrono::steady_clock::now();
+
+  double d =
+      std::chrono::duration_cast<std::chrono::milliseconds>(ed - st).count();
+  std::cout << "[v3] block_num: " << block_num << ", block_dim: " << block_dim
+            << ", time: " << d / iter << "ms, throughput: "
+            << task_num * iter / d * 1000 * 16.0 / 1024 / 1024 << "MB/s\n";
+}
diff --git a/test/test_kernel.cu b/test/test_kernel.cu
new file mode 100644
index 0000000..079d33d
--- /dev/null
+++ b/test/test_kernel.cu
@@ -0,0 +1,463 @@
+#include <cuda_runtime.h>
+
+#include "device/kernel.cuh"
+#include "device/pbkdf2.cuh"
+#include "device/romix.cuh"
+#include "gtest/gtest.h"
+#include "spacemesh_cuda/spacemesh.h"
+#include "utils.hpp"
+
+static bool operator==(const uint4 &u0, const uint4 &u1) {
+  return u0.x == u1.x && u0.y == u1.y && u0.z == u1.z && u0.w == u1.w;
+}
+
+static __global__ void pbkdf2_128b(const uint N, const ulong starting_index,
+                                   const uint4 *const __restrict__ input,
+                                   uint4 *const __restrict__ output,
+                                   const uint num_tasks) {
+  uint32_t tid = blockDim.x * blockIdx.x + threadIdx.x;
+  uint32_t tnum = gridDim.x * blockDim.x;
+
+  uint4 password[5];
+  uint4 X[8];
+  for (uint32_t t = tid; t < num_tasks; t += tnum) {
+    const uint64_t index = starting_index + t;
+
+    password[0] = input[0];
+    password[1] = input[1];
+    password[2].x = uint32_t(index & 0xFFFFFFFF);
+    password[2].y = uint32_t((index >> 32) & 0xFFFFFFFF);
+    password[2].z = 0;
+    password[2].w = 0;
+    password[3] = make_zero<uint4>();
+    password[4] = make_zero<uint4>();
+    /* 1: X = PBKDF2(password, salt) */
+    scrypt_pbkdf2_128B(password, X);
+    for (uint32_t i = 0; i < 8; ++i) {
+      output[t * 8 + i] = X[i];
+    }
+  }
+}
+
+TEST(PBKDF2_128B, CheckResult) {
+  const uint32_t N = 8192;
+  const uint32_t num_tasks = 8704 * 2;
+  const uint64_t starting_index = 0UL;
+  CudaDeviceMem<uint4> input(2);
+  CudaDeviceMem<uint4> output(8 * num_tasks);
+  uint4 h_input[2]{2839345266U, 42009750U,   875455879U,  2217211394U,
+                   3438177526U, 2734532412U, 2819254414U, 1408356118U};
+  // clang-format off
+  // {x[0]xyzw, x[7].xyzw}
+  std::vector<std::pair<uint4,uint4>> h_output_ref{
+    {{2243168157, 324902921, 784369288, 4178555589}, {594727500, 3520078779, 3153430745, 1486369834}},
+    {{2223702091, 3135234577, 351746947, 628596597}, {1974692412, 2751762247, 1815359819, 1220784090}},
+    {{1757856292, 3796090370, 105343294, 740218899}, {862746119, 1267304388, 4212448263, 3102108417}},
+    {{17321531, 3055860495, 2259029015, 3918725981}, {3736254989, 3761189418, 149153817, 3819126153}},
+    {{1834041421, 3427652492, 3278849906, 3382042170}, {3393761384, 947759528, 1750308469, 1815762229}},
+    {{3313951662, 1644567330, 866636170, 1422164638}, {1774972657, 1616289065, 2116049991, 2906510373}},
+    {{3199039070, 694981869, 1336937698, 1163541043}, {806088862, 1536940888, 2821292057, 915496211}},
+    {{4078128437, 2379231243, 1604075742, 2325245807}, {360554255, 875207183, 1516458558, 421131869}},
+    {{1087253935, 2208644287, 2756603925, 3971895705}, {1895637869, 3354544041, 2252449461, 218427034}},
+    {{1967339238, 1146502695, 3362372873, 2541765279}, {3706574425, 429585357, 9437500, 679403288}},
+    {{3457721097, 2061161947, 4289243029, 2170079478}, {2636111503, 3688484586, 4694583, 120348073}},
+    {{2852852322, 159818758, 310172246, 2704348751}, {233655115, 1516867167, 2442547836, 936759168}},
+    {{3722947891, 3394662328, 1171702661, 3885525270}, {3053353333, 1393137549, 2858266450, 954570086}},
+    {{4270365148, 3284942429, 639193268, 1452194571}, {1161264254, 3727720264, 2913344122, 3397135746}},
+    {{596580228, 1392092807, 843646682, 3541695628}, {2590893734, 2198545995, 237641943, 602469347}},
+    {{2455879077, 2319967660, 995689268, 1471734608}, {992029320, 1234415286, 3489799059, 2101928371}}
+  };
+  // clang-format on
+  CudaHostMem<uint4> h_output(8 * num_tasks);
+  cudaMemcpy(input.Ptr(), h_input, input.SizeInBytes(), cudaMemcpyHostToDevice);
+  const uint32_t BLOCK_DIM = 32;
+  const uint32_t GRID_DIM = (num_tasks - BLOCK_DIM - 1) / BLOCK_DIM / 2;
+  pbkdf2_128b<<<GRID_DIM, BLOCK_DIM>>>(N, starting_index, input.Ptr(),
+                                       output.Ptr(), num_tasks);
+  cudaMemcpy(h_output.HPtr(), output.Ptr(), h_output.SizeInBytes(),
+             cudaMemcpyDeviceToHost);
+  auto *p = h_output.HPtr();
+  for (size_t i = 0; i < h_output_ref.size(); ++i) {
+    uint4 st = p[i * 8];
+    uint4 ed = p[i * 8 + 7];
+    uint4 st_ref = h_output_ref[i].first;
+    uint4 ed_ref = h_output_ref[i].second;
+    EXPECT_TRUE(st == st_ref && ed == ed_ref);
+  }
+}
+
+static __global__ void pbkdf2_32b(const ulong starting_index,
+                                  const uint4 *const __restrict__ input,
+                                  uint4 *const __restrict__ output,
+                                  const uint num_tasks) {
+  uint32_t tid = blockDim.x * blockIdx.x + threadIdx.x;
+  uint32_t tnum = gridDim.x * blockDim.x;
+
+  uint4 password[5];
+  uint4 X[8];
+  for (uint32_t t = tid; t < num_tasks; t += tnum) {
+    const uint64_t index = starting_index + t;
+
+    password[0] = input[0];
+    password[1] = input[1];
+    password[2].x = uint32_t(index & 0xFFFFFFFF);
+    password[2].y = uint32_t((index >> 32) & 0xFFFFFFFF);
+    password[2].z = 0;
+    password[2].w = 0;
+    password[3] = make_zero<uint4>();
+    password[4] = make_zero<uint4>();
+
+    scrypt_pbkdf2_128B(password, X);
+    scrypt_pbkdf2_32B(password, X, &output[t * 2]);
+  }
+}
+
+TEST(PBKDF2_32B, CheckResult) {
+  // clang-format off
+  std::vector<uint4> output_ref{
+    {1541574387, 488907923, 795739296, 3660924057}, {1595429963, 1442715467, 3454537610, 303168103},
+    {2832999825, 1792965534, 3851604374, 1543672901}, {2252171238, 2489374518, 4196827066, 1925643828},
+    {3776545153, 2493253923, 1983841114, 4189265163}, {3106590296, 3126490259, 4292680934, 3487255118},
+    {1138967569, 3223815245, 3999001665, 2833458212}, {525278724, 204474235, 1590998285, 1126296421},
+    {1712137169, 3649550485, 2728357207, 2572560430}, {629927608, 1494199002, 426129028, 2296958300},
+    {612931869, 2928911829, 999195935, 3894654826}, {711808285, 773108236, 1884424028, 405027227},
+    {3496895468, 4098261784, 1205595361, 987182193}, {1092867524, 3407967604, 1630381730, 850280901},
+    {2344968929, 1230032780, 533878510, 782288479}, {3426145836, 2789001978, 2013751058, 3920530103},
+    {1004256693, 2820654651, 3302594902, 1895517683}, {1715443604, 1822444432, 3642638638, 4172159742},
+    {361838609, 2572724440, 3232663262, 1640532158}, {3156298490, 285734263, 2107779922, 124293624},
+    {2875742700, 1740468830, 292959372, 1165028846}, {1034497710, 841845972, 3869241330, 279539866},
+    {1684353161, 2791287652, 639938640, 696103378}, {1098941359, 3059816824, 80312190, 1352994082},
+    {491432485, 1977606223, 568579903, 2848969311}, {856678766, 694736051, 1544114686, 3391276793},
+    {528722383, 4221448681, 1427708755, 3435410113}, {108301243, 3032419668, 401350649, 1497139475},
+    {1633560854, 1259258950, 2231332965, 3069488567}, {3422183957, 1262157060, 2082726213, 1043984063},
+    {1659437647, 2520492023, 1526642929, 2037984020}, {3943654391, 2575523191, 3898673117, 1543536477},
+  };
+  // clang-format on
+
+  uint32_t num_tasks = 8704 * 2;
+  uint64_t starting_index = 0UL;
+  CudaDeviceMem<uint4> input(2);
+  uint4 h_input[2]{2839345266U, 42009750U,   875455879U,  2217211394U,
+                   3438177526U, 2734532412U, 2819254414U, 1408356118U};
+  cudaMemcpy(input.Ptr(), h_input, input.SizeInBytes(), cudaMemcpyHostToDevice);
+  CudaDeviceMem<uint4> output(num_tasks * 2);
+  CudaHostMem<uint4> h_out(num_tasks * 2);
+
+  uint32_t BLOCK_DIM = 32;
+  uint32_t GRID_DIM = (num_tasks - BLOCK_DIM - 1) / BLOCK_DIM / 2;
+  pbkdf2_32b<<<GRID_DIM, BLOCK_DIM>>>(starting_index, input.Ptr(), output.Ptr(),
+                                      num_tasks);
+  cudaMemcpy(h_out.HPtr(), output.Ptr(), output.SizeInBytes(),
+             cudaMemcpyDeviceToHost);
+  uint4 *p = h_out.HPtr();
+  for (size_t i = 0; i < output_ref.size(); ++i) {
+    EXPECT_TRUE(p[i] == output_ref[i]);
+  }
+}
+
+template <uint32_t LOOPUP_GAP = 2>
+static __global__ void romix(const uint32_t N, const ulong starting_index,
+                             const uint4 *const __restrict__ input,
+                             uint4 *const __restrict__ padcache,
+                             uint4 *const __restrict__ output,
+                             const uint num_tasks) {
+  uint32_t tid = blockDim.x * blockIdx.x + threadIdx.x;
+  uint32_t tnum = gridDim.x * blockDim.x;
+
+  uint4 password[5];
+  uint4 X[8];
+  for (uint32_t t = tid; t < num_tasks; t += tnum) {
+    const uint64_t index = starting_index + t;
+
+    password[0] = input[0];
+    password[1] = input[1];
+    password[2].x = uint32_t(index & 0xFFFFFFFF);
+    password[2].y = uint32_t((index >> 32) & 0xFFFFFFFF);
+    password[2].z = 0;
+    password[2].w = 0;
+    password[3] = make_zero<uint4>();
+    password[4] = make_zero<uint4>();
+
+    scrypt_pbkdf2_128B(password, X);
+    scrypt_ROMix_org<LOOPUP_GAP>(X, padcache, tnum, tid);
+    output[t * 2] = X[0];
+    output[t * 2 + 1] = X[7];
+  }
+}
+
+TEST(Romix, CheckResult) {
+  // clang-format off
+  std::vector<std::pair<uint4, uint4>> output_ref{
+    {{177095052, 2769478868, 1267860214, 3840882696}, {4209226529, 2478377306, 3668441954, 661024389}},
+    {{675217897, 1210703173, 2821523351, 395671908}, {1164792708, 2377829658, 1357012668, 3086536117}},
+    {{2122569980, 1521965530, 428608715, 1170231827}, {711050857, 3859067996, 201759563, 1563548156}},
+    {{1157318318, 2940549691, 1127296292, 3378566105}, {4262566715, 148588802, 1200061494, 186445103}},
+    {{1910618740, 3668591534, 543383426, 3111805377}, {1064989078, 4060066605, 1451448625, 391663587}},
+    {{4214846473, 2831090917, 2414213101, 1567041697}, {4072116936, 1223198630, 1381754999, 3832720526}},
+    {{1999710917, 3830843826, 2091719392, 3506640524}, {1612283548, 4084973170, 2005582528, 4160760954}},
+    {{3701562884, 1369415191, 231608195, 3440200853}, {1936641932, 3560076262, 2893706191, 348983257}},
+    {{1714814708, 806504637, 1185624757, 1309360160}, {1093527150, 3638059477, 3970998707, 2784271355}},
+    {{1640097667, 3808618373, 1719302163, 3224052072}, {2464420809, 2492362386, 2238290668, 1350255901}},
+    {{3593000297, 2800731264, 510712390, 3370323384}, {607950715, 1260312177, 248473348, 3713416381}},
+    {{4278573785, 4219952570, 252971591, 1295396640}, {1479543003, 1443275236, 1565620974, 4172569527}},
+    {{895020402, 4109313948, 184256163, 1338271264}, {2928529152, 1810204067, 3068405352, 4239813782}},
+    {{4014920583, 1109162161, 4257725846, 444189643}, {1724450670, 2872480592, 4016779893, 878820321}},
+    {{715651626, 3257863402, 2715797466, 2730055762}, {2101471733, 1562788948, 1551284046, 2392879513}},
+    {{3992784225, 60299832, 649066318, 1383234528}, {3861971681, 1804071335, 1551305386, 1951189750}},
+  };
+  // clang-format on
+
+  const uint32_t N = 8192;
+  uint32_t num_tasks = 8704;
+  uint64_t starting_index = 0UL;
+  const uint32_t loopup_gap = 1;
+
+  CudaDeviceMem<uint4> input(2);
+  uint4 h_input[2]{2839345266U, 42009750U,   875455879U,  2217211394U,
+                   3438177526U, 2734532412U, 2819254414U, 1408356118U};
+  cudaMemcpy(input.Ptr(), h_input, input.SizeInBytes(), cudaMemcpyHostToDevice);
+  CudaDeviceMem<uint4> output(num_tasks * 2);
+  CudaHostMem<uint4> h_out(num_tasks * 2);
+
+  uint32_t BLOCK_DIM = 32;
+  uint32_t GRID_DIM = (num_tasks - BLOCK_DIM - 1) / BLOCK_DIM;
+  const size_t global_size = GRID_DIM * BLOCK_DIM;
+  CudaDeviceMem<uint4> loopup(N / loopup_gap * 8 * global_size);
+
+  romix<loopup_gap><<<GRID_DIM, BLOCK_DIM>>>(
+      N, starting_index, input.Ptr(), loopup.Ptr(), output.Ptr(), num_tasks);
+
+  cudaMemcpy(h_out.HPtr(), output.Ptr(), output.SizeInBytes(),
+             cudaMemcpyDeviceToHost);
+  uint4 *p = h_out.HPtr();
+  for (size_t i = 0; i < output_ref.size(); ++i) {
+    EXPECT_TRUE(p[i * 2] == output_ref[i].first &&
+                p[i * 2 + 1] == output_ref[i].second);
+  }
+}
+
+TEST(SpacemeshOrg, CheckResult) {
+  // clang-format off
+  std::vector<std::pair<uint4, uint4>> output_ref{
+    {{4048330317, 4093720124, 3952305695, 861738752}, {1207399841, 1252004780, 3769355734, 605176832}},
+    {{2755521322, 1773634345, 3570559863, 3528040384}, {1426306995, 1828125550, 2362351051, 592285497}},
+    {{794272509, 1869538505, 1508520130, 3189115413}, {3371395123, 664595614, 942134631, 2595808448}},
+    {{2966305566, 139203859, 3236026452, 971490366}, {406197471, 2107380474, 1647245437, 1398227647}},
+    {{4248325324, 2135553756, 3121138058, 2606063366}, {3712984664, 405355870, 1786205915, 2338023431}},
+    {{2223254193, 320690588, 1750003793, 1980189572}, {2430931520, 2358257771, 857129483, 4174911228}},
+    {{3824389675, 3900375118, 885509409, 2541713504}, {3965109822, 4266067977, 233389698, 429691579}},
+    {{617654257, 287857505, 44437086, 3091373715}, {1579913930, 42500443, 741315169, 2504110009}},
+    {{4185094906, 11838281, 788328873, 2513580847}, {3475313515, 1738120748, 1410721087, 3731153976}},
+    {{3786242664, 575105985, 1815246426, 2602042915}, {1355784688, 2576224732, 3988570599, 858796312}},
+    {{2700358536, 2813184983, 1596688001, 1714337525}, {1078584113, 3032059087, 2777935765, 1318855605}},
+    {{1541547040, 1772543483, 3001615835, 2646093592}, {764031266, 303589574, 363686698, 2135502111}},
+    {{1152844501, 106430079, 2715838609, 2711284980}, {3437796150, 1500472695, 2708109779, 889703778}},
+    {{1770568223, 4026398494, 3075729081, 1584205826}, {2457749057, 155442015, 202804510, 2647478225}},
+    {{3703726525, 394584141, 117082576, 699663196}, {1831918362, 2038557743, 2876631409, 3154481933}},
+    {{2148001307, 979677799, 520145623, 64552000}, {4006002196, 3736832885, 543708505, 2041849618}},
+  };
+  // clang-format on
+
+  const uint32_t N = 8192;
+  uint32_t num_tasks = 8704 * 2;
+  uint64_t starting_index = 0UL;
+  const uint32_t loopup_gap = 2;
+
+  uint4 h_input[2]{2839345266U, 42009750U,   875455879U,  2217211394U,
+                   3438177526U, 2734532412U, 2819254414U, 1408356118U};
+  CudaDeviceMem<uint4> output(num_tasks * 2);
+  CudaHostMem<uint4> h_out(num_tasks * 2);
+
+  uint32_t BLOCK_DIM = 32;
+  uint32_t GRID_DIM = (num_tasks - BLOCK_DIM - 1) / BLOCK_DIM / 2;
+  const size_t global_size = GRID_DIM * BLOCK_DIM;
+  CudaDeviceMem<uint4> loopup(N / loopup_gap * 8 * global_size);
+
+  scrypt_org<loopup_gap><<<GRID_DIM, BLOCK_DIM>>>(starting_index, num_tasks,
+                                                  h_input[0], h_input[1],
+                                                  loopup.Ptr(), output.Ptr());
+
+  cudaMemcpy(h_out.HPtr(), output.Ptr(), output.SizeInBytes(),
+             cudaMemcpyDeviceToHost);
+  uint4 *p = h_out.HPtr();
+  for (size_t i = 0; i < output_ref.size(); ++i) {
+    EXPECT_TRUE(p[i * 2] == output_ref[i].first &&
+                p[i * 2 + 1] == output_ref[i].second);
+  }
+}
+
+TEST(SpacemeshAPI, CheckResult) {
+  // clang-format off
+  std::vector<std::pair<uint4, uint4>> output_ref{
+    {{4048330317, 4093720124, 3952305695, 861738752}, {1207399841, 1252004780, 3769355734, 605176832}},
+    {{2755521322, 1773634345, 3570559863, 3528040384}, {1426306995, 1828125550, 2362351051, 592285497}},
+    {{794272509, 1869538505, 1508520130, 3189115413}, {3371395123, 664595614, 942134631, 2595808448}},
+    {{2966305566, 139203859, 3236026452, 971490366}, {406197471, 2107380474, 1647245437, 1398227647}},
+    {{4248325324, 2135553756, 3121138058, 2606063366}, {3712984664, 405355870, 1786205915, 2338023431}},
+    {{2223254193, 320690588, 1750003793, 1980189572}, {2430931520, 2358257771, 857129483, 4174911228}},
+    {{3824389675, 3900375118, 885509409, 2541713504}, {3965109822, 4266067977, 233389698, 429691579}},
+    {{617654257, 287857505, 44437086, 3091373715}, {1579913930, 42500443, 741315169, 2504110009}},
+    {{4185094906, 11838281, 788328873, 2513580847}, {3475313515, 1738120748, 1410721087, 3731153976}},
+    {{3786242664, 575105985, 1815246426, 2602042915}, {1355784688, 2576224732, 3988570599, 858796312}},
+    {{2700358536, 2813184983, 1596688001, 1714337525}, {1078584113, 3032059087, 2777935765, 1318855605}},
+    {{1541547040, 1772543483, 3001615835, 2646093592}, {764031266, 303589574, 363686698, 2135502111}},
+    {{1152844501, 106430079, 2715838609, 2711284980}, {3437796150, 1500472695, 2708109779, 889703778}},
+    {{1770568223, 4026398494, 3075729081, 1584205826}, {2457749057, 155442015, 202804510, 2647478225}},
+    {{3703726525, 394584141, 117082576, 699663196}, {1831918362, 2038557743, 2876631409, 3154481933}},
+    {{2148001307, 979677799, 520145623, 64552000}, {4006002196, 3736832885, 543708505, 2041849618}},
+  };
+  // clang-format on
+
+  uint32_t num_tasks = 8704 * 2;
+  uint64_t starting_index = 0UL;
+
+  std::vector<uint32_t> h_input{2839345266U, 42009750U,   875455879U,
+                                2217211394U, 3438177526U, 2734532412U,
+                                2819254414U, 1408356118U};
+  std::vector<uint32_t> h_out(num_tasks * 8, 0);
+
+  spacemesh_scrypt(0, starting_index, h_input.data(), num_tasks, h_out.data());
+  uint4 *p = reinterpret_cast<uint4 *>(h_out.data());
+
+  for (size_t i = 0; i < output_ref.size(); ++i) {
+    EXPECT_TRUE(p[i * 2] == output_ref[i].first &&
+                p[i * 2 + 1] == output_ref[i].second);
+  }
+}
+TEST(SpacemeshCoalesceV1, CheckResult) {
+  auto device_prop = GetDeviceProp(0);
+  uint32_t block_num = device_prop.multiProcessorCount;
+  uint32_t smsp_num = GetSMSPNum(device_prop.major, device_prop.minor);
+  uint32_t block_dim = device_prop.warpSize * smsp_num;
+
+  const uint64_t starting_index = 0;
+  uint4 in0{2839345266U, 42009750U, 875455879U, 2217211394U};
+  uint4 in1{3438177526U, 2734532412U, 2819254414U, 1408356118U};
+
+  const uint32_t LOOKUP_GAP = 2;
+  const uint32_t thread_num = block_num * block_dim * LOOKUP_GAP;
+  block_dim = 32;
+  block_num = thread_num / block_dim;
+
+  constexpr uint32_t TASK_PER_THREAD = 1;
+  const uint32_t task_num = thread_num * TASK_PER_THREAD;
+
+  CudaDeviceMem<uint4> d0_out(task_num * 2);
+  CudaDeviceMem<uint4> d1_out(task_num * 2);
+
+  const uint32_t N = 8 * 1024;
+  CudaDeviceMem<uint32_t> d_lookup(N / LOOKUP_GAP * 32 * thread_num);
+
+  std::unique_ptr<uint4[]> h0_out(new uint4[task_num * 2]);
+  std::unique_ptr<uint4[]> h1_out(new uint4[task_num * 2]);
+
+  scrypt_org<LOOKUP_GAP><<<block_num, block_dim>>>(
+      starting_index, task_num, in0, in1,
+      reinterpret_cast<uint4 *>(d_lookup.Ptr()), d0_out.Ptr());
+
+  scrypt_coalesce_access_v1<LOOKUP_GAP>
+      <<<block_num, block_dim, 33 * sizeof(uint32_t) * block_dim>>>(
+          starting_index, task_num, in0, in1,
+          reinterpret_cast<uint32_t *>(d_lookup.Ptr()), d1_out.Ptr());
+
+  CHECK(cudaMemcpy(h0_out.get(), d0_out.Ptr(), d0_out.SizeInBytes(),
+                   cudaMemcpyDeviceToHost));
+  CHECK(cudaMemcpy(h1_out.get(), d1_out.Ptr(), d1_out.SizeInBytes(),
+                   cudaMemcpyDeviceToHost));
+
+  for (size_t i = 0; i < task_num * 2; ++i) {
+    EXPECT_EQ(h0_out[i], h1_out[i]);
+  }
+}
+
+TEST(SpacemeshCoalesceV2, CheckResult) {
+  auto device_prop = GetDeviceProp(0);
+  uint32_t block_num = device_prop.multiProcessorCount;
+  uint32_t smsp_num = GetSMSPNum(device_prop.major, device_prop.minor);
+  uint32_t block_dim = device_prop.warpSize * smsp_num;
+
+  const uint64_t starting_index = 0;
+  uint4 in0{2839345266U, 42009750U, 875455879U, 2217211394U};
+  uint4 in1{3438177526U, 2734532412U, 2819254414U, 1408356118U};
+
+  const uint32_t LOOKUP_GAP = 2;
+  const uint32_t thread_num = block_num * block_dim * LOOKUP_GAP;
+  block_dim = 32;
+  block_num = thread_num / block_dim;
+
+  constexpr uint32_t TASK_PER_THREAD = 1;
+  const uint32_t task_num = thread_num * TASK_PER_THREAD;
+
+  CudaDeviceMem<uint4> d0_out(task_num * 2);
+  CudaDeviceMem<uint4> d1_out(task_num * 2);
+
+  const uint32_t N = 8 * 1024;
+  CudaDeviceMem<uint32_t> d_lookup(N / LOOKUP_GAP * 32 * thread_num);
+
+  std::unique_ptr<uint4[]> h0_out(new uint4[task_num * 2]);
+  std::unique_ptr<uint4[]> h1_out(new uint4[task_num * 2]);
+
+  scrypt_org<LOOKUP_GAP><<<block_num, block_dim>>>(
+      starting_index, task_num, in0, in1,
+      reinterpret_cast<uint4 *>(d_lookup.Ptr()), d0_out.Ptr());
+
+  scrypt_coalesce_access_v2<LOOKUP_GAP>
+      <<<block_num, block_dim, 33 * sizeof(uint64_t) * block_dim / 2>>>(
+          starting_index, task_num, in0, in1,
+          reinterpret_cast<uint64_t *>(d_lookup.Ptr()), d1_out.Ptr());
+
+  CHECK(cudaMemcpy(h0_out.get(), d0_out.Ptr(), d0_out.SizeInBytes(),
+                   cudaMemcpyDeviceToHost));
+  CHECK(cudaMemcpy(h1_out.get(), d1_out.Ptr(), d1_out.SizeInBytes(),
+                   cudaMemcpyDeviceToHost));
+
+  for (size_t i = 0; i < task_num * 2; ++i) {
+    EXPECT_EQ(h0_out[i], h1_out[i]);
+  }
+}
+
+TEST(SpacemeshCoalesceV3, CheckResult) {
+  auto device_prop = GetDeviceProp(0);
+  uint32_t block_num = device_prop.multiProcessorCount;
+  uint32_t smsp_num = GetSMSPNum(device_prop.major, device_prop.minor);
+  uint32_t block_dim = device_prop.warpSize * smsp_num;
+
+  const uint64_t starting_index = 0;
+  uint4 in0{2839345266U, 42009750U, 875455879U, 2217211394U};
+  uint4 in1{3438177526U, 2734532412U, 2819254414U, 1408356118U};
+
+  const uint32_t LOOKUP_GAP = 2;
+  const uint32_t thread_num = block_num * block_dim * LOOKUP_GAP;
+  block_dim = 32;
+  block_num = thread_num / block_dim;
+
+  constexpr uint32_t TASK_PER_THREAD = 1;
+  const uint32_t task_num = thread_num * TASK_PER_THREAD;
+
+  CudaDeviceMem<uint4> d0_out(task_num * 2);
+  CudaDeviceMem<uint4> d1_out(task_num * 2);
+
+  const uint32_t N = 8 * 1024;
+  CudaDeviceMem<uint32_t> d_lookup(N / LOOKUP_GAP * 32 * thread_num);
+
+  std::unique_ptr<uint4[]> h0_out(new uint4[task_num * 2]);
+  std::unique_ptr<uint4[]> h1_out(new uint4[task_num * 2]);
+
+  scrypt_org<LOOKUP_GAP><<<block_num, block_dim>>>(
+      starting_index, task_num, in0, in1,
+      reinterpret_cast<uint4 *>(d_lookup.Ptr()), d0_out.Ptr());
+
+  scrypt_coalesce_access_v3<LOOKUP_GAP>
+      <<<block_num, block_dim, 33 * sizeof(uint4) * block_dim / 4>>>(
+          starting_index, task_num, in0, in1,
+          reinterpret_cast<uint4 *>(d_lookup.Ptr()), d1_out.Ptr());
+
+  CHECK(cudaMemcpy(h0_out.get(), d0_out.Ptr(), d0_out.SizeInBytes(),
+                   cudaMemcpyDeviceToHost));
+  CHECK(cudaMemcpy(h1_out.get(), d1_out.Ptr(), d1_out.SizeInBytes(),
+                   cudaMemcpyDeviceToHost));
+
+  for (size_t i = 0; i < task_num * 2; ++i) {
+    EXPECT_EQ(h0_out[i], h1_out[i]);
+  }
+}