Merge pull request #1309 from msimberg/cuda-docs

Expand and add more CUDA/HIP documentation
pika-org · Nov 26, 2024 · 89bef2d · 89bef2d
2 parents 1e5ce1d + c0ca2c9
commit 89bef2d
Show file tree

Hide file tree

Showing 16 changed files with 669 additions and 73 deletions.
diff --git a/docs/Doxyfile b/docs/Doxyfile
@@ -7,7 +7,9 @@ GENERATE_RTF           = NO
 GENERATE_HTML          = NO
 QUIET                  = NO
 OUTPUT_DIRECTORY       = "$(PIKA_DOCS_DOXYGEN_OUTPUT_DIRECTORY)"
-INPUT                  = "$(PIKA_DOCS_DOXYGEN_INPUT_ROOT)/libs/pika/init_runtime" \
+INPUT                  = "$(PIKA_DOCS_DOXYGEN_INPUT_ROOT)/libs/pika/async_cuda" \
+                         "$(PIKA_DOCS_DOXYGEN_INPUT_ROOT)/libs/pika/async_cuda_base" \
+                         "$(PIKA_DOCS_DOXYGEN_INPUT_ROOT)/libs/pika/init_runtime" \
                          "$(PIKA_DOCS_DOXYGEN_INPUT_ROOT)/libs/pika/runtime" \
                          "$(PIKA_DOCS_DOXYGEN_INPUT_ROOT)/libs/pika/execution"
 FILE_PATTERNS          = *.cpp *.hpp *.cu
@@ -18,4 +20,4 @@ EXTRACT_ALL            = YES
 ENABLE_PREPPROCESSING  = YES
 MACRO_EXPANSION        = YES
 EXPAND_ONLY_PREDEF     = YES
-PREDEFINED             = PIKA_EXPORT=
+PREDEFINED             = PIKA_EXPORT= PIKA_NVCC_PRAGMA_HD_WARNING_DISABLE= "PIKA_STATIC_CALL_OPERATOR(...)=operator()(__VA_ARGS__) const" PIKA_FORCEINLINE=
diff --git a/docs/api.rst b/docs/api.rst
@@ -5,6 +5,8 @@
     Distributed under the Boost Software License, Version 1.0. (See accompanying
     file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
 
+:tocdepth: 3
+
 .. _api:
 
 =============
@@ -32,7 +34,6 @@ These headers are part of the public API, but are currently undocumented.
 - ``pika/async_rw_mutex.hpp``
 - ``pika/barrier.hpp``
 - ``pika/condition_variable.hpp``
-- ``pika/cuda.hpp``
 - ``pika/latch.hpp``
 - ``pika/mpi.hpp``
 - ``pika/mutex.hpp``
@@ -47,8 +48,8 @@ is to stabilize those APIs over time.
 
 .. _header_pika_init:
 
-``pika/init.hpp``
-=================
+Runtime management (``pika/init.hpp``)
+======================================
 
 The ``pika/init.hpp`` header provides functionality to manage the pika runtime.
 
@@ -71,8 +72,8 @@ The ``pika/init.hpp`` header provides functionality to manage the pika runtime.
 
 .. _header_pika_execution:
 
-``pika/execution.hpp``
-======================
+``std::execution`` support (``pika/execution.hpp``)
+===================================================
 
 The ``pika/execution.hpp`` header provides functionality related to ``std::execution``.
 ``std::execution`` functionality, including extensions provided by pika, is defined in the
@@ -136,3 +137,91 @@ All sender adaptors are `customization point objects (CPOs)
 .. literalinclude:: ../examples/documentation/when_all_vector_documentation.cpp
    :language: c++
    :start-at: #include
+
+.. _header_pika_cuda:
+
+CUDA/HIP support (``pika/cuda.hpp``)
+====================================
+
+The ``pika/cuda.hpp`` header provides functionality related to CUDA and HIP. All functionality is
+under the ``pika::cuda::experimental`` namespace and class and function names contain ``cuda``, even
+when HIP support is enabled. CUDA and HIP functionality can be enabled with the CMake options
+``PIKA_WITH_CUDA`` and ``PIKA_WITH_HIP``, respectively, but they are mutually exclusive. In the
+following, whenever CUDA is mentioned, it refers to to CUDA and HIP interchangeably.
+
+.. note::
+   https://github.com/pika-org/pika/issues/116 tracks a potential renaming of the functionality
+   to avoid using ``cuda`` even when HIP is enabled. If you have feedback on a rename or just want
+   to follow along, please see that issue.
+
+.. note::
+   pika uses `whip <https://github.com/eth-cscs/whip>`__ internally for portability between CUDA and
+   HIP. However, users of pika are not forced to use whip as whip only creates aliases for CUDA/HIP
+   types and enumerations. whip is thus compatible with directly using the types and enumerations
+   provided by CUDA/HIP. For cuBLAS, cuSOLVER, rocBLAS, and rocSOLVER support pika does not use a
+   portability library, but simply uses the appropriate types depending on if CUDA or HIP support is
+   enabled.
+
+.. warning::
+   At the moment, ``nvcc`` can not compile stdexec headers. Of the CUDA compilers, only ``nvc++`` is
+   able to compile stdexec headers. If you have stdexec support enabled in pika, either ensure that
+   ``.cu`` files do not include stdexec headers, or use ``nvc++`` to compile your application.
+   However, ``nvc++`` does not officially support compiling device code. Use at your own risk.
+
+   For HIP there are no known restrictions.
+
+The CUDA support in pika relies on four major components:
+
+1. A pool of CUDA streams as well as cuBLAS and cuSOLVER handles
+   (:cpp:class:`pika::cuda::experimental::cuda_pool`). These streams and handles are used in a
+   round-robin fashion by various sender adaptors.
+2. A CUDA scheduler, in the ``std::execution`` sense
+   (:cpp:class:`pika::cuda::experimental::cuda_scheduler`). This uses the CUDA pool to schedule work
+   on a GPU.
+3. Sender adaptors (:cpp:var:`pika::cuda::experimental::then_with_stream` etc.). A few
+   special-purpose sender adaptors, as well as customizations of a few ``std::execution`` adaptors
+   are provided to help schedule different types of work on a GPU.
+4. Polling of CUDA events integrated into the pika scheduling loop
+   (:cpp:class:`pika::cuda::experimental::enable_user_polling`). This integration is essential to
+   avoid calling e.g. ``cudaStreamSynchronize`` on a pika task, which would block the underlying
+   worker thread and thus block progress of other work.
+
+The following example gives an overview of using the above CUDA functionalities in pika:
+
+.. literalinclude:: ../examples/documentation/cuda_overview_documentation.cu
+   :language: c++
+   :start-at: #include
+
+While :cpp:class:`pika::cuda::experimental::cuda_pool` gives direct access to streams and handles,
+the recommended way to access them is through the
+:cpp:class:`pika::cuda::experimental::cuda_scheduler` and the sender adaptors available below.
+
+.. doxygenclass:: pika::cuda::experimental::cuda_scheduler
+.. doxygenstruct:: pika::cuda::experimental::then_with_stream_t
+.. doxygenvariable:: pika::cuda::experimental::then_with_stream
+
+.. literalinclude:: ../examples/documentation/then_with_stream_documentation.cu
+    :language: c++
+    :start-at: #include
+
+.. doxygenstruct:: pika::cuda::experimental::then_with_cublas_t
+.. doxygenvariable:: pika::cuda::experimental::then_with_cublas
+
+.. literalinclude:: ../examples/documentation/then_with_cublas_documentation.cu
+    :language: c++
+    :start-at: #include
+
+.. doxygenstruct:: pika::cuda::experimental::then_with_cusolver_t
+.. doxygenvariable:: pika::cuda::experimental::then_with_cusolver
+
+See :cpp:var:`pika::cuda::experimental::then_with_cublas` for an example of what can be done with
+:cpp:var:`pika::cuda::experimental::then_with_cusolver`. The interfaces are identical except for the
+type of handle passed to the callable.
+
+.. doxygenclass:: pika::cuda::experimental::enable_user_polling
+.. doxygenclass:: pika::cuda::experimental::cuda_pool
+.. doxygenclass:: pika::cuda::experimental::cuda_stream
+.. doxygenclass:: pika::cuda::experimental::cublas_handle
+.. doxygenclass:: pika::cuda::experimental::locked_cublas_handle
+.. doxygenclass:: pika::cuda::experimental::cusolver_handle
+.. doxygenclass:: pika::cuda::experimental::locked_cusolver_handle
diff --git a/docs/usage.rst b/docs/usage.rst
@@ -67,6 +67,7 @@ pika optionally depends on:
   ``nvc++``.
 * `HIP <https://rocmdocs.amd.com/en/latest/index.html>`__ 5.2.0 or greater. HIP support can be
   enabled with ``PIKA_WITH_HIP=ON``.
+* `whip <https://github.com/eth-cscs/whip>`__  when CUDA or HIP support is enabled.
 * `MPI <https://www.mpi-forum.org/>`__. MPI support can be enabled with ``PIKA_WITH_MPI=ON``.
 * `Boost.Context <https://boost.org>`__ on macOS or exotic platforms which are not supported by the
   default user-level thread implementations in pika. This can be enabled with

diff --git a/examples/documentation/CMakeLists.txt b/examples/documentation/CMakeLists.txt
@@ -15,8 +15,24 @@ set(example_programs
     when_all_vector_documentation
 )
 
+if(PIKA_WITH_GPU_SUPPORT)
+  list(APPEND example_programs cuda_overview_documentation then_with_cublas_documentation
+       then_with_stream_documentation
+  )
+  set(cuda_overview_documentation_GPU ON)
+  set(then_with_stream_documentation_GPU ON)
+  set(then_with_cublas_documentation_GPU ON)
+  set(cuda_overview_documentation_PARAMETERS RUN_SERIAL)
+  set(then_with_stream_documentation_PARAMETERS RUN_SERIAL)
+  set(then_with_cublas_documentation_PARAMETERS RUN_SERIAL)
+endif()
+
 foreach(example_program ${example_programs})
-  set(sources ${example_program}.cpp)
+  if(${${example_program}_GPU})
+    set(sources ${example_program}.cu)
+  else()
+    set(sources ${example_program}.cpp)
+  endif()
 
   source_group("Source Files" FILES ${sources})
 

diff --git a/examples/documentation/cuda_overview_documentation.cu b/examples/documentation/cuda_overview_documentation.cu
@@ -0,0 +1,59 @@
+//  Copyright (c) 2024 ETH Zurich
+//
+//  SPDX-License-Identifier: BSL-1.0
+//  Distributed under the Boost Software License, Version 1.0. (See accompanying
+//  file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <pika/cuda.hpp>
+#include <pika/execution.hpp>
+#include <pika/init.hpp>
+
+#include <whip.hpp>
+
+#include <cstdio>
+#include <utility>
+
+__global__ void kernel()
+{
+    printf(
+        "Hello from kernel! threadIdx.x: %d\n", static_cast<int>(threadIdx.x));
+}
+
+int main(int argc, char* argv[])
+{
+    namespace cu = pika::cuda::experimental;
+    namespace ex = pika::execution::experimental;
+    namespace tt = pika::this_thread::experimental;
+
+    pika::start(argc, argv);
+    ex::thread_pool_scheduler cpu_sched{};
+
+    // Create a pool of CUDA streams and cuBLAS/SOLVER handles, and a scheduler
+    // that uses the pool.
+    cu::cuda_pool pool{};
+    cu::cuda_scheduler cuda_sched{pool};
+
+    {
+        // Enable polling of CUDA events on the default pool. This is required
+        // to allow the adaptors below to signal completion of kernels.
+        cu::enable_user_polling p{};
+
+        // The work created by the adaptors below will all be scheduled on the
+        // same stream from the pool since the work is sequential.
+        //
+        // Note that error checking is omitted below.
+        auto s = ex::just(42) | ex::continues_on(cuda_sched) |
+            // CUDA kernel through a lambda.
+            ex::then([](int x) { printf("Hello from the GPU! x: %d\n", x); }) |
+            // Explicitly launch a CUDA kernel with a stream (see
+            // https://github.com/eth-cscs/whip for details about whip)
+            cu::then_with_stream(
+                [](whip::stream_t stream) { kernel<<<1, 32, 0, stream>>>(); });
+        tt::sync_wait(std::move(s));
+    }
+
+    pika::finalize();
+    pika::stop();
+
+    return 0;
+}
diff --git a/examples/documentation/then_with_cublas_documentation.cu b/examples/documentation/then_with_cublas_documentation.cu
@@ -0,0 +1,126 @@
+//  Copyright (c) 2024 ETH Zurich
+//
+//  SPDX-License-Identifier: BSL-1.0
+//  Distributed under the Boost Software License, Version 1.0. (See accompanying
+//  file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <pika/cuda.hpp>
+#include <pika/execution.hpp>
+#include <pika/init.hpp>
+
+#include <fmt/printf.h>
+#include <whip.hpp>
+
+#include <cstddef>
+#include <tuple>
+#include <utility>
+
+#if defined(PIKA_HAVE_CUDA)
+# include <cublas_v2.h>
+using blas_handle_t = cublasHandle_t;
+auto* blas_gemm = &cublasDgemm;
+auto blas_pointer_mode = CUBLAS_POINTER_MODE_HOST;
+auto blas_op_n = CUBLAS_OP_N;
+#elif defined(PIKA_HAVE_HIP)
+# include <rocblas/rocblas.h>
+using blas_handle_t = rocblas_handle;
+# define CUBLAS_POINTER_MODE_HOST rocblas_pointer_mode_host
+auto* blas_gemm = &rocblas_dgemm;
+auto blas_pointer_mode = rocblas_pointer_mode_host;
+auto blas_op_n = rocblas_operation_none;
+#endif
+
+// Owning wrapper for GPU-allocated memory.
+class gpu_data
+{
+    double* p{nullptr};
+    std::size_t n{0};
+
+public:
+    // Note that blocking functions such as cudaMalloc will block the underlying
+    // operating system thread instead of yielding the pika task. Consider using
+    // e.g. a pool of GPU memory to avoid blocking the thread for too long.
+    gpu_data(std::size_t n)
+      : n(n)
+    {
+        whip::malloc(&p, sizeof(double) * n);
+    }
+    gpu_data(gpu_data&& other) noexcept
+      : p(std::exchange(other.p, nullptr))
+      , n(std::exchange(other.n, 0))
+    {
+    }
+    gpu_data& operator=(gpu_data&& other) noexcept
+    {
+        p = std::exchange(other.p, nullptr);
+        n = std::exchange(other.n, 0);
+        return *this;
+    }
+    gpu_data(gpu_data const&) = delete;
+    gpu_data& operator=(gpu_data const&) = delete;
+    ~gpu_data() { whip::free(p); }
+
+    std::size_t size() const { return n; }
+    double* get() const { return p; }
+};
+
+__global__ void init(double* a, double* b, double* c, std::size_t n)
+{
+    std::size_t i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i < n)
+    {
+        a[i] = 1.0;
+        b[i] = 2.0;
+        c[i] = 3.0;
+    }
+}
+
+int main(int argc, char* argv[])
+{
+    namespace cu = pika::cuda::experimental;
+    namespace ex = pika::execution::experimental;
+    namespace tt = pika::this_thread::experimental;
+
+    pika::start(argc, argv);
+    ex::thread_pool_scheduler cpu_sched{};
+    cu::cuda_pool pool{};
+    cu::cuda_scheduler cuda_sched{pool};
+
+    {
+        cu::enable_user_polling p{};
+
+        constexpr std::size_t n = 2048;
+        gpu_data a{n * n};
+        gpu_data b{n * n};
+        gpu_data c{n * n};
+        double alpha = 1.0;
+        double beta = 1.0;
+
+        auto s = ex::just(std::move(a), std::move(b), std::move(c)) |
+            ex::continues_on(cuda_sched) |
+            cu::then_with_stream(
+                [](auto& a, auto& b, auto& c, whip::stream_t stream) {
+                    init<<<n * n / 256, 256, 0, stream>>>(
+                        a.get(), b.get(), c.get(), n * n);
+                    return std::make_tuple(
+                        std::move(a), std::move(b), std::move(c));
+                }) |
+            ex::unpack() |
+            // a, b, and c will be kept alive by the then_with_cublas operation
+            // state at least until the GPU kernels complete.  Values sent by
+            // the predecessor sender are passed as the last arguments after the
+            // handle.
+            cu::then_with_cublas(
+                [&](blas_handle_t handle, auto& a, auto& b, auto& c) {
+                    blas_gemm(handle, blas_op_n, blas_op_n, n, n, n, &alpha,
+                        a.get(), n, b.get(), n, &beta, c.get(), n);
+                },
+                blas_pointer_mode);
+        tt::sync_wait(std::move(s));
+    }
+
+    pika::finalize();
+    pika::stop();
+
+    return 0;
+}