diff --git a/docs/Doxyfile b/docs/Doxyfile
index e2bb88eb4..81190100b 100644
--- a/docs/Doxyfile
+++ b/docs/Doxyfile
@@ -7,7 +7,9 @@ GENERATE_RTF = NO
 GENERATE_HTML = NO
 QUIET = NO
 OUTPUT_DIRECTORY = "$(PIKA_DOCS_DOXYGEN_OUTPUT_DIRECTORY)"
-INPUT = "$(PIKA_DOCS_DOXYGEN_INPUT_ROOT)/libs/pika/init_runtime" \
+INPUT = "$(PIKA_DOCS_DOXYGEN_INPUT_ROOT)/libs/pika/async_cuda" \
+        "$(PIKA_DOCS_DOXYGEN_INPUT_ROOT)/libs/pika/async_cuda_base" \
+        "$(PIKA_DOCS_DOXYGEN_INPUT_ROOT)/libs/pika/init_runtime" \
         "$(PIKA_DOCS_DOXYGEN_INPUT_ROOT)/libs/pika/runtime" \
         "$(PIKA_DOCS_DOXYGEN_INPUT_ROOT)/libs/pika/execution"
 FILE_PATTERNS = *.cpp *.hpp *.cu
@@ -18,4 +20,4 @@ EXTRACT_ALL = YES
 ENABLE_PREPPROCESSING = YES
 MACRO_EXPANSION = YES
 EXPAND_ONLY_PREDEF = YES
-PREDEFINED = PIKA_EXPORT=
+PREDEFINED = PIKA_EXPORT= PIKA_NVCC_PRAGMA_HD_WARNING_DISABLE= "PIKA_STATIC_CALL_OPERATOR(...)=operator()(__VA_ARGS__) const" PIKA_FORCEINLINE=
diff --git a/docs/api.rst b/docs/api.rst
index 0b1282bd1..bfbffcd5a 100644
--- a/docs/api.rst
+++ b/docs/api.rst
@@ -5,6 +5,8 @@
    Distributed under the Boost Software License, Version 1.0. (See accompanying
    file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
 
+:tocdepth: 3
+
 .. _api:
 
 =============
@@ -32,7 +34,6 @@ These headers are part of the public API, but are currently undocumented.
 - ``pika/async_rw_mutex.hpp``
 - ``pika/barrier.hpp``
 - ``pika/condition_variable.hpp``
-- ``pika/cuda.hpp``
 - ``pika/latch.hpp``
 - ``pika/mpi.hpp``
 - ``pika/mutex.hpp``
@@ -47,8 +48,8 @@ is to stabilize those APIs over time.
 
 .. _header_pika_init:
 
-``pika/init.hpp``
-=================
+Runtime management (``pika/init.hpp``)
+======================================
 
 The ``pika/init.hpp`` header provides functionality to manage the pika runtime.
@@ -71,8 +72,8 @@ The ``pika/init.hpp`` header provides functionality to manage the pika runtime.
 
 .. _header_pika_execution:
 
-``pika/execution.hpp``
-======================
+``std::execution`` support (``pika/execution.hpp``)
+===================================================
 
 The ``pika/execution.hpp`` header provides functionality related to ``std::execution``.
 ``std::execution`` functionality, including extensions provided by pika, is defined in the
@@ -136,3 +137,91 @@ All sender adaptors are `customization point objects (CPOs)
 .. literalinclude:: ../examples/documentation/when_all_vector_documentation.cpp
    :language: c++
    :start-at: #include
+
+.. _header_pika_cuda:
+
+CUDA/HIP support (``pika/cuda.hpp``)
+====================================
+
+The ``pika/cuda.hpp`` header provides functionality related to CUDA and HIP. All functionality is
+under the ``pika::cuda::experimental`` namespace and class and function names contain ``cuda``, even
+when HIP support is enabled. CUDA and HIP functionality can be enabled with the CMake options
+``PIKA_WITH_CUDA`` and ``PIKA_WITH_HIP``, respectively, but they are mutually exclusive. In the
+following, whenever CUDA is mentioned, it refers to CUDA and HIP interchangeably.
+
+.. note::
+   https://github.com/pika-org/pika/issues/116 tracks a potential renaming of the functionality
+   to avoid using ``cuda`` even when HIP is enabled. If you have feedback on a rename or just want
+   to follow along, please see that issue.
+
+.. note::
+   pika uses `whip <https://github.com/eth-cscs/whip>`__ internally for portability between CUDA
+   and HIP. However, users of pika are not forced to use whip as whip only creates aliases for
+   CUDA/HIP types and enumerations. whip is thus compatible with directly using the types and
+   enumerations provided by CUDA/HIP. For cuBLAS, cuSOLVER, rocBLAS, and rocSOLVER support, pika
+   does not use a portability library, but simply uses the appropriate types depending on whether
+   CUDA or HIP support is enabled.
+
+.. warning::
+   At the moment, ``nvcc`` cannot compile stdexec headers. Of the CUDA compilers, only ``nvc++`` is
+   able to compile stdexec headers. If you have stdexec support enabled in pika, either ensure that
+   ``.cu`` files do not include stdexec headers, or use ``nvc++`` to compile your application.
+   However, ``nvc++`` does not officially support compiling device code. Use at your own risk.
+
+   For HIP there are no known restrictions.
+
+The CUDA support in pika relies on four major components:
+
+1. A pool of CUDA streams as well as cuBLAS and cuSOLVER handles
+   (:cpp:class:`pika::cuda::experimental::cuda_pool`). These streams and handles are used in a
+   round-robin fashion by various sender adaptors.
+2. A CUDA scheduler, in the ``std::execution`` sense
+   (:cpp:class:`pika::cuda::experimental::cuda_scheduler`). This uses the CUDA pool to schedule work
+   on a GPU.
+3. Sender adaptors (:cpp:var:`pika::cuda::experimental::then_with_stream` etc.). A few
+   special-purpose sender adaptors, as well as customizations of a few ``std::execution`` adaptors,
+   are provided to help schedule different types of work on a GPU.
+4. Polling of CUDA events integrated into the pika scheduling loop
+   (:cpp:class:`pika::cuda::experimental::enable_user_polling`). This integration is essential to
+   avoid calling e.g. ``cudaStreamSynchronize`` on a pika task, which would block the underlying
+   worker thread and thus block progress of other work.
+
+The following example gives an overview of using the above CUDA functionalities in pika:
+
+.. literalinclude:: ../examples/documentation/cuda_overview_documentation.cu
+   :language: c++
+   :start-at: #include
+
+While :cpp:class:`pika::cuda::experimental::cuda_pool` gives direct access to streams and handles,
+the recommended way to access them is through the
+:cpp:class:`pika::cuda::experimental::cuda_scheduler` and the sender adaptors available below.
+
+.. doxygenclass:: pika::cuda::experimental::cuda_scheduler
+.. doxygenstruct:: pika::cuda::experimental::then_with_stream_t
+.. doxygenvariable:: pika::cuda::experimental::then_with_stream
+
+.. literalinclude:: ../examples/documentation/then_with_stream_documentation.cu
+   :language: c++
+   :start-at: #include
+
+.. doxygenstruct:: pika::cuda::experimental::then_with_cublas_t
+.. doxygenvariable:: pika::cuda::experimental::then_with_cublas
+
+.. literalinclude:: ../examples/documentation/then_with_cublas_documentation.cu
+   :language: c++
+   :start-at: #include
+
+.. doxygenstruct:: pika::cuda::experimental::then_with_cusolver_t
+.. doxygenvariable:: pika::cuda::experimental::then_with_cusolver
+
+See :cpp:var:`pika::cuda::experimental::then_with_cublas` for an example of what can be done with
+:cpp:var:`pika::cuda::experimental::then_with_cusolver`. The interfaces are identical except for the
+type of handle passed to the callable.
+
+.. doxygenclass:: pika::cuda::experimental::enable_user_polling
+.. doxygenclass:: pika::cuda::experimental::cuda_pool
+.. doxygenclass:: pika::cuda::experimental::cuda_stream
+.. doxygenclass:: pika::cuda::experimental::cublas_handle
+.. doxygenclass:: pika::cuda::experimental::locked_cublas_handle
+.. doxygenclass:: pika::cuda::experimental::cusolver_handle
+.. doxygenclass:: pika::cuda::experimental::locked_cusolver_handle
diff --git a/docs/usage.rst b/docs/usage.rst
index dffac840b..f3fd2a8c8 100644
--- a/docs/usage.rst
+++ b/docs/usage.rst
@@ -67,6 +67,7 @@ pika optionally depends on:
   ``nvc++``.
 * `HIP `__ 5.2.0 or greater. HIP support can be enabled with
   ``PIKA_WITH_HIP=ON``.
+* `whip <https://github.com/eth-cscs/whip>`__ when CUDA or HIP support is enabled.
 * `MPI `__. MPI support can be enabled with ``PIKA_WITH_MPI=ON``.
 * `Boost.Context `__ on macOS or exotic platforms which are not
   supported by the default user-level thread implementations in pika. This can be enabled with
diff --git a/examples/documentation/CMakeLists.txt b/examples/documentation/CMakeLists.txt
index 31a3e0771..1f443339c 100644
--- a/examples/documentation/CMakeLists.txt
+++ b/examples/documentation/CMakeLists.txt
@@ -15,8 +15,24 @@ set(example_programs
   when_all_vector_documentation
 )
 
+if(PIKA_WITH_GPU_SUPPORT)
+  list(APPEND example_programs cuda_overview_documentation then_with_cublas_documentation
+       then_with_stream_documentation
+  )
+  set(cuda_overview_documentation_GPU ON)
+  set(then_with_stream_documentation_GPU ON)
+  set(then_with_cublas_documentation_GPU ON)
+  set(cuda_overview_documentation_PARAMETERS RUN_SERIAL)
+  set(then_with_stream_documentation_PARAMETERS RUN_SERIAL)
+  set(then_with_cublas_documentation_PARAMETERS RUN_SERIAL)
+endif()
+
 foreach(example_program ${example_programs})
-  set(sources ${example_program}.cpp)
+  if(${${example_program}_GPU})
+    set(sources ${example_program}.cu)
+  else()
+    set(sources ${example_program}.cpp)
+  endif()
 
   source_group("Source Files" FILES ${sources})
diff --git a/examples/documentation/cuda_overview_documentation.cu b/examples/documentation/cuda_overview_documentation.cu
new file mode 100644
index 000000000..1c552df3c
--- /dev/null
+++ b/examples/documentation/cuda_overview_documentation.cu
@@ -0,0 +1,59 @@
+// Copyright (c) 2024 ETH Zurich
+//
+// SPDX-License-Identifier: BSL-1.0
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <pika/cuda.hpp>
+#include <pika/execution.hpp>
+#include <pika/init.hpp>
+
+#include <whip.hpp>
+
+#include <cstdio>
+#include <utility>
+
+__global__ void kernel()
+{
+    printf(
+        "Hello from kernel! threadIdx.x: %d\n", static_cast<int>(threadIdx.x));
+}
+
+int main(int argc, char* argv[])
+{
+    namespace cu = pika::cuda::experimental;
+    namespace ex = pika::execution::experimental;
+    namespace tt = pika::this_thread::experimental;
+
+    pika::start(argc, argv);
+    ex::thread_pool_scheduler cpu_sched{};
+
+    // Create a pool of CUDA streams and cuBLAS/SOLVER handles, and a scheduler
+    // that uses the pool.
+    cu::cuda_pool pool{};
+    cu::cuda_scheduler cuda_sched{pool};
+
+    {
+        // Enable polling of CUDA events on the default pool. This is required
+        // to allow the adaptors below to signal completion of kernels.
+        cu::enable_user_polling p{};
+
+        // The work created by the adaptors below will all be scheduled on the
+        // same stream from the pool since the work is sequential.
+        //
+        // Note that error checking is omitted below.
+        auto s = ex::just(42) | ex::continues_on(cuda_sched) |
+            // CUDA kernel through a lambda.
+            ex::then([](int x) { printf("Hello from the GPU! x: %d\n", x); }) |
+            // Explicitly launch a CUDA kernel with a stream (see
+            // https://github.com/eth-cscs/whip for details about whip)
+            cu::then_with_stream(
+                [](whip::stream_t stream) { kernel<<<1, 32, 0, stream>>>(); });
+        tt::sync_wait(std::move(s));
+    }
+
+    pika::finalize();
+    pika::stop();
+
+    return 0;
+}
diff --git a/examples/documentation/then_with_cublas_documentation.cu b/examples/documentation/then_with_cublas_documentation.cu
new file mode 100644
index 000000000..ececa311c
--- /dev/null
+++ b/examples/documentation/then_with_cublas_documentation.cu
@@ -0,0 +1,126 @@
+// Copyright (c) 2024 ETH Zurich
+//
+// SPDX-License-Identifier: BSL-1.0
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <pika/cuda.hpp>
+#include <pika/execution.hpp>
+#include <pika/init.hpp>
+
+#include
+#include <whip.hpp>
+
+#include <cstddef>
+#include <tuple>
+#include <utility>
+
+#if defined(PIKA_HAVE_CUDA)
+# include <cublas_v2.h>
+using blas_handle_t = cublasHandle_t;
+auto* blas_gemm = &cublasDgemm;
+auto blas_pointer_mode = CUBLAS_POINTER_MODE_HOST;
+auto blas_op_n = CUBLAS_OP_N;
+#elif defined(PIKA_HAVE_HIP)
+# include
+using blas_handle_t = rocblas_handle;
+# define CUBLAS_POINTER_MODE_HOST rocblas_pointer_mode_host
+auto* blas_gemm = &rocblas_dgemm;
+auto blas_pointer_mode = rocblas_pointer_mode_host;
+auto blas_op_n = rocblas_operation_none;
+#endif
+
+// Owning wrapper for GPU-allocated memory.
+class gpu_data
+{
+    double* p{nullptr};
+    std::size_t n{0};
+
+public:
+    // Note that blocking functions such as cudaMalloc will block the underlying
+    // operating system thread instead of yielding the pika task. Consider using
+    // e.g. a pool of GPU memory to avoid blocking the thread for too long.
+    gpu_data(std::size_t n)
+      : n(n)
+    {
+        whip::malloc(&p, sizeof(double) * n);
+    }
+    gpu_data(gpu_data&& other) noexcept
+      : p(std::exchange(other.p, nullptr))
+      , n(std::exchange(other.n, 0))
+    {
+    }
+    gpu_data& operator=(gpu_data&& other) noexcept
+    {
+        p = std::exchange(other.p, nullptr);
+        n = std::exchange(other.n, 0);
+        return *this;
+    }
+    gpu_data(gpu_data const&) = delete;
+    gpu_data& operator=(gpu_data const&) = delete;
+    ~gpu_data() { whip::free(p); }
+
+    std::size_t size() const { return n; }
+    double* get() const { return p; }
+};
+
+__global__ void init(double* a, double* b, double* c, std::size_t n)
+{
+    std::size_t i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i < n)
+    {
+        a[i] = 1.0;
+        b[i] = 2.0;
+        c[i] = 3.0;
+    }
+}
+
+int main(int argc, char* argv[])
+{
+    namespace cu = pika::cuda::experimental;
+    namespace ex = pika::execution::experimental;
+    namespace tt = pika::this_thread::experimental;
+
+    pika::start(argc, argv);
+    ex::thread_pool_scheduler cpu_sched{};
+    cu::cuda_pool pool{};
+    cu::cuda_scheduler cuda_sched{pool};
+
+    {
+        cu::enable_user_polling p{};
+
+        constexpr std::size_t n = 2048;
+        gpu_data a{n * n};
+        gpu_data b{n * n};
+        gpu_data c{n * n};
+        double alpha = 1.0;
+        double beta = 1.0;
+
+        auto s = ex::just(std::move(a), std::move(b), std::move(c)) |
+            ex::continues_on(cuda_sched) |
+            cu::then_with_stream(
+                [](auto& a, auto& b, auto& c, whip::stream_t stream) {
+                    init<<<n * n / 256 + 1, 256, 0, stream>>>(
+                        a.get(), b.get(), c.get(), n * n);
+                    return std::make_tuple(
+                        std::move(a), std::move(b), std::move(c));
+                }) |
+            ex::unpack() |
+            // a, b, and c will be kept alive by the then_with_cublas operation
+            // state at least until the GPU kernels complete. Values sent by
+            // the predecessor sender are passed as the last arguments after the
+            // handle.
+            cu::then_with_cublas(
+                [&](blas_handle_t handle, auto& a, auto& b, auto& c) {
+                    blas_gemm(handle, blas_op_n, blas_op_n, n, n, n, &alpha,
+                        a.get(), n, b.get(), n, &beta, c.get(), n);
+                },
+                blas_pointer_mode);
+        tt::sync_wait(std::move(s));
+    }
+
+    pika::finalize();
+    pika::stop();
+
+    return 0;
+}
diff --git a/examples/documentation/then_with_stream_documentation.cu b/examples/documentation/then_with_stream_documentation.cu
new file mode 100644
index 000000000..432b079ff
--- /dev/null
+++ b/examples/documentation/then_with_stream_documentation.cu
@@ -0,0 +1,70 @@
+// Copyright (c) 2024 ETH Zurich
+//
+// SPDX-License-Identifier: BSL-1.0
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <pika/cuda.hpp>
+#include <pika/execution.hpp>
+#include <pika/init.hpp>
+
+#include <whip.hpp>
+
+#include <cstddef>
+#include <cstdio>
+#include <utility>
+
+__global__ void kernel(int* p, int offset)
+{
+    printf(
+        "Hello from kernel! threadIdx.x: %d\n", static_cast<int>(threadIdx.x));
+    p[threadIdx.x] = threadIdx.x * threadIdx.x + offset;
+}
+
+int main(int argc, char* argv[])
+{
+    namespace cu = pika::cuda::experimental;
+    namespace ex = pika::execution::experimental;
+    namespace tt = pika::this_thread::experimental;
+
+    pika::start(argc, argv);
+    ex::thread_pool_scheduler cpu_sched{};
+    cu::cuda_pool pool{};
+    cu::cuda_scheduler cuda_sched{pool};
+
+    {
+        cu::enable_user_polling p{};
+
+        constexpr std::size_t n = 32;
+        int* a = nullptr;
+
+        // whip::malloc_async wraps cudaMallocAsync/hipMallocAsync. Using the
+        // sender adaptors, the allocation, work, and deallocation can all be
+        // scheduled onto the same stream.
+        auto s = ex::just(&a, n * sizeof(int)) | ex::continues_on(cuda_sched) |
+            cu::then_with_stream(whip::malloc_async) |
+            // The then_with_stream callable accepts values sent by the
+            // predecessor. They will be passed by reference before the stream.
+            // This allows e.g. whip::malloc_async to be used above with values
+            // sent by the just sender. The values are passed by reference and
+            // will be kept alive until the work submitted to the stream is done.
+            cu::then_with_stream(
+                [&a](
+                    /* other values by reference here */ whip::stream_t
+                        stream) {
+                    kernel<<<1, n, 0, stream>>>(a, 17);
+                    // Even though the function returns here, the sync_wait below
+                    // will wait for the kernel to finish. Values returned are
+                    // passed on to continuations.
+                    return a;
+                }) |
+            cu::then_with_stream(whip::free_async);
+
+        tt::sync_wait(std::move(s));
+    }
+
+    pika::finalize();
+    pika::stop();
+
+    return 0;
+}
diff --git a/libs/pika/async_cuda/include/pika/async_cuda/cuda_polling_helper.hpp b/libs/pika/async_cuda/include/pika/async_cuda/cuda_polling_helper.hpp
index 88679f593..10def061d 100644
--- a/libs/pika/async_cuda/include/pika/async_cuda/cuda_polling_helper.hpp
+++ b/libs/pika/async_cuda/include/pika/async_cuda/cuda_polling_helper.hpp
@@ -21,10 +21,21 @@ namespace pika::cuda::experimental {
     PIKA_EXPORT std::string const& get_pool_name();
     PIKA_EXPORT void set_pool_name(std::string const&);
 
-    // -----------------------------------------------------------------
-    // This RAII helper class enables polling for a scoped block
-    struct [[nodiscard]] enable_user_polling
+    /// \brief Enable CUDA polling on the given thread pool.
+    ///
+    /// RAII helper class to enable and disable polling of CUDA events on the given pool. Enabling
+    /// polling is a requirement to signal completion of work submitted to the \ref cuda_scheduler.
+ /// + /// There is no detection of whether polling is already enabled or disabled, or if \ref + /// enable_user_polling is nested. The constructor and destructor will unconditionally register + /// and unregister polling, respectively. + class [[nodiscard]] enable_user_polling { + public: + /// \brief Start polling for CUDA events on the given thread pool. + /// + /// \param pool_name The name of the thread pool to enable polling on. The default is to use + /// the default thread pool. enable_user_polling(std::string const& pool_name = "") : pool_name_(pool_name) { @@ -41,6 +52,10 @@ namespace pika::cuda::experimental { } } + /// \brief Stop polling for CUDA events. + /// + /// The destructor will not wait for work submitted to a \ref cuda_scheduler to complete. + /// The user must ensure that work completes before disabling polling. ~enable_user_polling() { if (pool_name_.empty()) diff --git a/libs/pika/async_cuda/include/pika/async_cuda/cuda_pool.hpp b/libs/pika/async_cuda/include/pika/async_cuda/cuda_pool.hpp index bff585d84..c794d2b85 100644 --- a/libs/pika/async_cuda/include/pika/async_cuda/cuda_pool.hpp +++ b/libs/pika/async_cuda/include/pika/async_cuda/cuda_pool.hpp @@ -22,6 +22,13 @@ #include namespace pika::cuda::experimental { + /// \brief A locked cuBLAS handle. + /// + /// A handle that provides thread-safe access to a \ref cublas_handle. The locked handle is + /// immovable. + /// + /// \note The recommended way to access a handle is through sender adaptors using \ref + /// cuda_scheduler. class locked_cublas_handle { cublas_handle& handle; @@ -35,9 +42,20 @@ namespace pika::cuda::experimental { locked_cublas_handle& operator=(locked_cublas_handle&&) = delete; locked_cublas_handle& operator=(locked_cublas_handle const&) = delete; + /// \brief Access the underlying cuBLAS handle. + /// + /// \return a reference to the \ref cublas_handle. The returned handle is not thread-safe + /// and must be used within the lifetime of the \ref locked_cublas_handle. PIKA_EXPORT cublas_handle const& get() noexcept; }; + /// \brief A locked cuSOLVER handle. + /// + /// A handle that provides thread-safe access to a \ref cusolver_handle. The locked handle is + /// immovable. + /// + /// \note The recommended way to access a handle is through sender adaptors using \ref + /// cuda_scheduler. class locked_cusolver_handle { cusolver_handle& handle; @@ -51,15 +69,26 @@ namespace pika::cuda::experimental { locked_cusolver_handle& operator=(locked_cusolver_handle&&) = delete; locked_cusolver_handle& operator=(locked_cusolver_handle const&) = delete; + /// \brief Access the underlying cuSOLVER handle. + /// + /// \return a reference to the \ref cusolver_handle. The returned handle is not thread-safe + /// and must be used within the lifetime of the \ref locked_cusolver_handle. PIKA_EXPORT cusolver_handle const& get() noexcept; }; - /// A pool of CUDA streams, used for scheduling work on a CUDA device. + /// \brief A pool of CUDA streams, used for scheduling work on a CUDA device. + /// + /// The pool initializes a set of CUDA streams on construction and provides access to the + /// streams in a round-robin fashion. The pool also gives access to cuBLAS and cuSOLVER handles. + /// + /// The pool is movable and copyable with reference semantics. Copies of a pool still refer to + /// the original pool of streams. A moved-from pool can't be used, except to check if it is + /// valid with \ref valid(). 
/// - /// The pool initializes a set of CUDA streams on construction and - /// provides access to the streams in a round-robin fashion. The pool is - /// movable and copyable with reference semantics. Copies of a pool still - /// refer to the original pool of streams. + /// The pool is equality comparable and formattable. + /// + /// \note The recommended way to access streams and handles from the \ref cuda_pool is through + /// sender adaptors using \ref cuda_scheduler. class cuda_pool { private: @@ -129,9 +158,34 @@ namespace pika::cuda::experimental { std::shared_ptr data; public: + /// \brief Construct a pool of CUDA streams and handles. + /// + /// \param device the CUDA device used for scheduling work + /// \param num_normal_priority_streams the number of normal priority streams + /// \param num_high_priority_streams the number of high priority streams + /// \param flags flags used to construct CUDA streams + /// \param num_cublas_handles the number of cuBLAS handles to create for the whole pool + /// \param num_cusolver_handles the number of cuSOLVER handles to create for the whole pool + /// + /// \note The default values of \p num_normal_priority_streams, \p + /// num_high_priority_streams, \p num_cublas_handles, and \p num_cusolver_handles have been + /// chosen to easily allow saturating most GPUs without creating unnecessarily many streams. + /// In individual situations more streams (e.g. launching many small kernels) or fewer + /// streams (e.g. the GPU does not support more concurrency, or [slows down when using too + /// many streams](https://github.com/ROCm/HIP/issues/3366)) may be more appropriate. Each + /// cuBLAS and cuSOLVER handle may require a significant amount of GPU memory, which is why + /// the default values are lower than the number of streams. The default values have proven + /// to work well e.g. in [DLA-Future](https://github.com/eth-cscs/DLA-Future). + /// + /// \warning Up to and including version 0.30.X the number of streams parameters denoted the + /// number of streams *per worker thread*. From 0.31.0 onwards the parameters denote the + /// total number of streams to create in the pool. The default values were adjusted + /// accordingly, but if you are not using the default values, please verify the values you + /// are passing to the \ref cuda_pool constructor are still reasonable with 0.31.0. PIKA_EXPORT explicit cuda_pool(int device = 0, std::size_t num_normal_priority_streams = 32, std::size_t num_high_priority_streams = 32, unsigned int flags = 0, std::size_t num_cublas_handles = 16, std::size_t num_cusolver_handles = 16); + PIKA_NVCC_PRAGMA_HD_WARNING_DISABLE cuda_pool(cuda_pool&&) = default; PIKA_NVCC_PRAGMA_HD_WARNING_DISABLE @@ -141,12 +195,42 @@ namespace pika::cuda::experimental { PIKA_NVCC_PRAGMA_HD_WARNING_DISABLE cuda_pool& operator=(cuda_pool const&) = default; + /// \brief Check if the pool is valid. + /// + /// \return true if the pool refers to a valid pool, false otherwise (e.g. if the pool has + /// been moved out from). PIKA_EXPORT bool valid() const noexcept; + + /// \brief Check if the pool is valid. + /// + /// See \ref valid(). PIKA_EXPORT explicit operator bool() noexcept; + + /// \brief Get a reference to the next CUDA stream. + /// + /// \note The recommended way to access a stream is through a \ref cuda_scheduler. + /// + /// \param priority the priority of the stream. 
PIKA_EXPORT cuda_stream const& get_next_stream( pika::execution::thread_priority priority = pika::execution::thread_priority::normal); + + /// \brief Get a locked cuBLAS handle. + /// + /// \note The recommended way to access a handle is through a \ref cuda_scheduler. + /// + /// \param stream the CUDA stream to use with the cuBLAS handle. + /// + /// \return a locked cuBLAS handle, which is released for reuse on destruction. PIKA_EXPORT locked_cublas_handle get_cublas_handle( cuda_stream const& stream, cublasPointerMode_t pointer_mode); + + /// \brief Get a locked cuSOLVER handle. + /// + /// \note The recommended way to access a handle is through a \ref cuda_scheduler. + /// + /// \param stream the CUDA stream to use with the cuSOLVER handle. + /// + /// \return a locked cuSOLVER handle, which is released for reuse on destruction. PIKA_EXPORT locked_cusolver_handle get_cusolver_handle(cuda_stream const& stream); /// \cond NOINTERNAL diff --git a/libs/pika/async_cuda/include/pika/async_cuda/cuda_scheduler.hpp b/libs/pika/async_cuda/include/pika/async_cuda/cuda_scheduler.hpp index 9b3ecf39d..003e9b2f1 100644 --- a/libs/pika/async_cuda/include/pika/async_cuda/cuda_scheduler.hpp +++ b/libs/pika/async_cuda/include/pika/async_cuda/cuda_scheduler.hpp @@ -23,8 +23,17 @@ namespace pika::cuda::experimental { /// A scheduler for running work on a CUDA pool. /// - /// Provides access to scheduling work on a CUDA context represented by a - /// cuda_pool. + /// Provides access to scheduling work on a CUDA context represented by a \ref cuda_pool. Models + /// the [std::execution scheduler concept](https://eel.is/c++draft/exec.sched). + /// + /// Move and copy constructible. The scheduler has reference semantics with respect to the + /// associated CUDA pool. + /// + /// Equality comparable. + /// + /// \note The recommended way to access streams and handles from the \ref cuda_pool is through + /// the sender adaptors \ref then_with_stream, \ref then_with_cublas, and \ref + /// then_with_cusolver. class cuda_scheduler { private: @@ -32,18 +41,25 @@ namespace pika::cuda::experimental { pika::execution::thread_priority priority; public: - PIKA_EXPORT - cuda_scheduler(cuda_pool pool); + /// \brief Constructs a new \ref cuda_scheduler using the given \ref cuda_pool. + PIKA_EXPORT explicit cuda_scheduler(cuda_pool pool); cuda_scheduler(cuda_scheduler&&) = default; cuda_scheduler(cuda_scheduler const&) = default; cuda_scheduler& operator=(cuda_scheduler&&) = default; cuda_scheduler& operator=(cuda_scheduler const&) = default; ~cuda_scheduler(){}; + /// \brief Return the \ref cuda_pool associated with this scheduler. PIKA_EXPORT cuda_pool const& get_pool() const noexcept; + + /// \brief Return the next available CUDA stream from the pool. PIKA_EXPORT cuda_stream const& get_next_stream(); + + /// \brief Return the next available cuBLAS handle from the pool. PIKA_EXPORT locked_cublas_handle get_cublas_handle( cuda_stream const& stream, cublasPointerMode_t pointer_mode); + + /// \brief Return the next available cuSOLVER handle from the pool. 
         PIKA_EXPORT locked_cusolver_handle get_cusolver_handle(cuda_stream const& stream);
 
         /// \cond NOINTERNAL
@@ -56,7 +72,6 @@
         {
             return !(lhs == rhs);
         }
-        /// \endcond
 
         friend cuda_scheduler tag_invoke(pika::execution::experimental::with_priority_t,
             cuda_scheduler const& scheduler, pika::execution::thread_priority priority)
@@ -71,6 +86,7 @@
         {
             return scheduler.priority;
         }
+        /// \endcond
     };
 
     namespace detail {
diff --git a/libs/pika/async_cuda/include/pika/async_cuda/then_with_stream.hpp b/libs/pika/async_cuda/include/pika/async_cuda/then_with_stream.hpp
index b1940dfd3..c0f491971 100644
--- a/libs/pika/async_cuda/include/pika/async_cuda/then_with_stream.hpp
+++ b/libs/pika/async_cuda/include/pika/async_cuda/then_with_stream.hpp
@@ -595,16 +595,14 @@ namespace pika::cuda::experimental {
     // - this operation can only be used when the predecessor sender has
     //   cuda_scheduler as its completion scheduler
 
-    /// Attach a continuation to run f with an additional CUDA stream.
-    ///
-    /// Attaches a continuation to the given sender which will call f with the
-    /// arguments sent by the given sender with an additional cudaStream_t
-    /// argument as the last argument. This can only be called on a sender with
-    /// a completion scheduler that is cuda_scheduler. f does not have exclusive
-    /// access to the given stream and other calls may reuse the same stream
-    /// concurrently.
-    inline constexpr struct then_with_stream_t final
+    /// \brief The type of the \ref then_with_stream sender adaptor.
+    struct then_with_stream_t final
     {
+        /// \brief Create a \ref then_with_stream sender.
+        ///
+        /// \param sender The predecessor sender.
+        /// \param f Callable that will be passed a \p cudaStream_t as the last argument. Values
+        /// from \p sender are passed as references.
         template <typename Sender, typename F>
         constexpr PIKA_FORCEINLINE auto PIKA_STATIC_CALL_OPERATOR(Sender&& sender, F&& f)
         {
@@ -612,24 +610,36 @@ namespace pika::cuda::experimental {
                 then_with_stream_detail::cuda_stream_callable{std::forward<F>(f)});
         }
 
+        /// \brief Partially bound sender. Expects a sender to be supplied later.
         template <typename F>
        constexpr PIKA_FORCEINLINE auto PIKA_STATIC_CALL_OPERATOR(F&& f)
         {
             return pika::execution::experimental::detail::partial_algorithm{
                 std::forward<F>(f)};
         }
-    } then_with_stream{};
+    };
 
-    /// Attach a continuation to run f with an additional cuBLAS handle.
+    /// \brief Sender adaptor which calls \p f with a CUDA stream.
     ///
-    /// Attaches a continuation to the given sender which will call f with the
-    /// arguments sent by the given sender with an additional cublasHandle_t
-    /// argument as the first argument. This can only be called on a sender with
-    /// a completion scheduler that is cuda_scheduler. The handle is
-    /// thread-local and f may not yield a pika thread until after the handle
-    /// has been used the last time by f.
-    inline constexpr struct then_with_cublas_t final
+    /// When the predecessor sender completes, calls \p f with a CUDA stream as the last argument
+    /// after other values sent by the predecessor sender. This adaptor can only be used when the
+    /// completion scheduler is a \ref cuda_scheduler. Other work may be scheduled concurrently on
+    /// the stream passed to \p f. Values sent by the predecessor sender are passed as references to
+    /// \p f and kept alive until the work submitted by \p f to the stream is completed. \p f may
+    /// return as soon as work has been submitted, and a connected receiver will be signaled only
+    /// once the kernels submitted to the stream have completed.
+    inline constexpr then_with_stream_t then_with_stream{};
+
+    /// \brief The type of the \ref then_with_cublas sender adaptor.
+    struct then_with_cublas_t final
     {
+        /// \brief Create a \ref then_with_cublas sender.
+        ///
+        /// \param sender The predecessor sender.
+        /// \param f Callable that will be passed a \p cublasHandle_t as the first argument. Values
+        /// from \p sender are passed as references.
+        /// \param pointer_mode The \p cublasPointerMode_t used for the internal cuBLAS handle, or
+        /// the equivalent for rocBLAS.
         template <typename Sender, typename F>
         constexpr PIKA_FORCEINLINE auto
         PIKA_STATIC_CALL_OPERATOR(Sender&& sender, F&& f, cublasPointerMode_t pointer_mode)
         {
@@ -639,6 +649,7 @@
                     std::forward<F>(f), pointer_mode});
         }
 
+        /// \brief Partially bound sender. Expects a sender to be supplied later.
         template <typename F>
         constexpr PIKA_FORCEINLINE auto
         PIKA_STATIC_CALL_OPERATOR(F&& f, cublasPointerMode_t pointer_mode)
         {
             return pika::execution::experimental::detail::partial_algorithm{std::forward<F>(f), pointer_mode};
         }
-    } then_with_cublas{};
+    };
 
-    /// Attach a continuation to run f with an additional cuSOLVER handle.
+    /// \brief Sender adaptor which calls \p f with a cuBLAS handle.
+    ///
-    /// Attaches a continuation to the given sender which will call f with the
-    /// arguments sent by the given sender with an additional cusolverDnHandle_t
-    /// argument as the first argument. This can only be called on a sender with
-    /// a completion scheduler that is cuda_scheduler. The handle is
-    /// thread-local and f may not yield a pika thread until after the handle
-    /// has been used the last time by f.
-    inline constexpr struct then_with_cusolver_t final
+    /// This sender is intended to be used to submit work using a cuBLAS handle. The stream
+    /// associated with the handle may also be used to submit work. The handle is accessed through a
+    /// \ref locked_cublas_handle and \p f should return as quickly as possible to avoid blocking
+    /// other work from using the handle.
+    ///
+    /// The behaviour of synchronization and lifetimes is the same as for \ref then_with_stream,
+    /// except that the handle is passed as the first argument to match the typical function
+    /// signatures of cuBLAS functions.
+    inline constexpr then_with_cublas_t then_with_cublas{};
+
+    /// \brief The type of the \ref then_with_cusolver sender adaptor.
+    struct then_with_cusolver_t final
     {
+        /// \param sender The predecessor sender.
+        /// \param f Callable that will be passed a \p cusolverDnHandle_t as the first argument.
+        /// Values from \p sender are passed as references.
         template <typename Sender, typename F>
         constexpr PIKA_FORCEINLINE auto PIKA_STATIC_CALL_OPERATOR(Sender&& sender, F&& f)
         {
@@ -665,11 +684,24 @@
                 then_with_stream_detail::cusolver_handle_callable{std::forward<F>(f)});
         }
 
+        /// \brief Partially bound sender. Expects a sender to be supplied later.
         template <typename F>
         constexpr PIKA_FORCEINLINE auto PIKA_STATIC_CALL_OPERATOR(F&& f)
         {
             return pika::execution::experimental::detail::partial_algorithm{std::forward<F>(f)};
         }
-    } then_with_cusolver{};
+    };
+
+    /// \brief Sender adaptor which calls \p f with a cuSOLVER handle.
+    ///
+    /// This sender is intended to be used to submit work using a cuSOLVER handle. The stream
+    /// associated with the handle may also be used to submit work. The handle is accessed through a
+    /// \ref locked_cusolver_handle and \p f should return as quickly as possible to avoid blocking
+    /// other work from using the handle.
+    ///
+    /// The behaviour of synchronization and lifetimes is the same as for \ref then_with_stream,
+    /// except that the handle is passed as the first argument to match the typical function
+    /// signatures of cuSOLVER functions.
+    inline constexpr then_with_cusolver_t then_with_cusolver{};
 } // namespace pika::cuda::experimental
diff --git a/libs/pika/async_cuda_base/include/pika/async_cuda_base/cublas_handle.hpp b/libs/pika/async_cuda_base/include/pika/async_cuda_base/cublas_handle.hpp
index be6ff9590..d782210aa 100644
--- a/libs/pika/async_cuda_base/include/pika/async_cuda_base/cublas_handle.hpp
+++ b/libs/pika/async_cuda_base/include/pika/async_cuda_base/cublas_handle.hpp
@@ -16,7 +16,18 @@
 #include
 
 namespace pika::cuda::experimental {
-    /// RAII wrapper for a cuBLAS handle.
+    /// \brief RAII wrapper for a cuBLAS handle.
+    ///
+    /// An RAII wrapper for a cuBLAS handle which creates a handle on construction and destroys it
+    /// on destruction.
+    ///
+    /// The wrapper is movable and copyable. A moved-from handle cannot be used other than to check
+    /// for validity with \ref valid(). A copied handle uses the properties from the given handle
+    /// and creates a new handle.
+    ///
+    /// Equality comparable and formattable.
+    ///
+    /// \note The recommended way to access a handle is through a \ref cuda_scheduler.
     class cublas_handle
     {
     private:
@@ -27,7 +38,10 @@
         static PIKA_EXPORT cublasHandle_t create_handle(int device, whip::stream_t stream);
 
     public:
+        /// \brief Constructs a new cuBLAS handle with the default stream.
         PIKA_EXPORT cublas_handle();
+
+        /// \brief Constructs a new cuBLAS handle with the given stream.
         PIKA_EXPORT explicit cublas_handle(cuda_stream const& stream);
         PIKA_EXPORT ~cublas_handle();
         PIKA_EXPORT cublas_handle(cublas_handle&&) noexcept;
@@ -35,11 +49,30 @@
         PIKA_EXPORT cublas_handle(cublas_handle const&);
         PIKA_EXPORT cublas_handle& operator=(cublas_handle const&);
 
+        /// \brief Check if the handle is valid.
+        ///
+        /// \return true if the handle refers to a valid handle, false otherwise (e.g. if the handle
+        /// has been moved out from, or it has been default-constructed).
+        PIKA_EXPORT bool valid() const noexcept;
+
+        /// \brief Check if the handle is valid.
+        ///
+        /// See \ref valid().
+        PIKA_EXPORT explicit operator bool() const noexcept;
+
+        /// \brief Get the underlying cuBLAS handle.
         PIKA_EXPORT cublasHandle_t get() const noexcept;
+
+        /// \brief Get the device associated with the stream of the cuBLAS handle.
         PIKA_EXPORT int get_device() const noexcept;
+
+        /// \brief Get the stream associated with the cuBLAS handle.
         PIKA_EXPORT whip::stream_t get_stream() const noexcept;
 
+        /// \brief Set the stream associated with the cuBLAS handle.
         PIKA_EXPORT void set_stream(cuda_stream const& stream);
+
+        /// \brief Set the cuBLAS pointer mode of the handle.
         PIKA_EXPORT void set_pointer_mode(cublasPointerMode_t pointer_mode);
 
         /// \cond NOINTERNAL
diff --git a/libs/pika/async_cuda_base/include/pika/async_cuda_base/cuda_stream.hpp b/libs/pika/async_cuda_base/include/pika/async_cuda_base/cuda_stream.hpp
index 5d5220b18..507c872b1 100644
--- a/libs/pika/async_cuda_base/include/pika/async_cuda_base/cuda_stream.hpp
+++ b/libs/pika/async_cuda_base/include/pika/async_cuda_base/cuda_stream.hpp
@@ -15,12 +15,19 @@
 #include
 
 namespace pika::cuda::experimental {
-    /// RAII wrapper for a CUDA stream.
+    /// \brief RAII wrapper for a CUDA stream.
     ///
-    /// An RAII wrapper for a CUDA stream which creates a stream on construction
-    /// and destroys it on destruction. Is movable and copiable. A moved-from
-    /// stream holds the default stream. A copied stream uses the properties
-    /// from the given stream and creates a new stream.
+    /// An RAII wrapper for a CUDA stream which creates a stream on construction and destroys it on
+    /// destruction. It is movable and copyable. A moved-from stream holds the default stream. A
+    /// copied stream uses the properties from the given stream and creates a new stream.
+    ///
+    /// Equality comparable and formattable.
+    ///
+    /// When accessing the underlying stream, [whip](https://github.com/eth-cscs/whip) is used for
+    /// compatibility with CUDA and HIP.
+    ///
+    /// \note The recommended way to access a stream is through sender adaptors using \ref
+    /// cuda_scheduler.
     class cuda_stream
     {
     private:
@@ -40,6 +47,13 @@
             int device, pika::execution::thread_priority priority, unsigned int flags);
 
     public:
+        /// \brief Construct a new stream with the given device and priority.
+        ///
+        /// \param device The device to create the stream on.
+        /// \param priority The priority of the stream. The mapping from \ref thread_priority to
+        /// CUDA stream priorities is undefined, except that the order is preserved, allowing for
+        /// different \ref thread_priority to map to the same CUDA priority.
+        /// \param flags Flags to pass to the CUDA stream creation.
         PIKA_EXPORT explicit cuda_stream(int device = 0,
             pika::execution::thread_priority priority = pika::execution::thread_priority::default_,
             unsigned int flags = 0);
@@ -49,9 +63,18 @@
         PIKA_EXPORT cuda_stream(cuda_stream const&);
         PIKA_EXPORT cuda_stream& operator=(cuda_stream const&);
 
+        /// \brief Get the underlying stream.
+        ///
+        /// The stream is still owned by the \ref cuda_stream and must not be manually released.
         PIKA_EXPORT whip::stream_t get() const noexcept;
+
+        /// \brief Get the device of the stream.
        PIKA_EXPORT int get_device() const noexcept;
+
+        /// \brief Get the priority of the stream.
         PIKA_EXPORT pika::execution::thread_priority get_priority() const noexcept;
+
+        /// \brief Get the flags of the stream.
         PIKA_EXPORT unsigned int get_flags() const noexcept;
 
         /// \cond NOINTERNAL
diff --git a/libs/pika/async_cuda_base/include/pika/async_cuda_base/cusolver_handle.hpp b/libs/pika/async_cuda_base/include/pika/async_cuda_base/cusolver_handle.hpp
index bbd6a1883..5bea99f82 100644
--- a/libs/pika/async_cuda_base/include/pika/async_cuda_base/cusolver_handle.hpp
+++ b/libs/pika/async_cuda_base/include/pika/async_cuda_base/cusolver_handle.hpp
@@ -7,17 +7,28 @@
 #pragma once
 
 #include
-#if defined(PIKA_HAVE_GPU_SUPPORT)
-# include
-# include
+#include
+#include
 
-# include
-# include
+#include
+#include
 
-# include
+#include
 
 namespace pika::cuda::experimental {
-    /// RAII wrapper for a cuSOLVER handle.
+    /// \brief RAII wrapper for a cuSOLVER handle.
+    ///
+    /// An RAII wrapper for a cuSOLVER handle which creates a handle on construction and destroys it
+    /// on destruction.
+    ///
+    /// The wrapper is movable and copyable. A moved-from handle cannot be used other than to check
+    /// for validity with \ref valid(). A copied handle uses the properties from the given handle
+    /// and creates a new handle.
+    ///
+    /// Equality comparable and formattable.
+    ///
+    /// \note The recommended way to access a handle is through sender adaptors using \ref
+    /// cuda_scheduler.
     class cusolver_handle
     {
     private:
@@ -28,7 +39,10 @@
         static PIKA_EXPORT cusolverDnHandle_t create_handle(int device, whip::stream_t stream);
 
     public:
+        /// \brief Constructs a new cuSOLVER handle with the default stream.
         PIKA_EXPORT cusolver_handle();
+
+        /// \brief Constructs a new cuSOLVER handle with the given stream.
         PIKA_EXPORT explicit cusolver_handle(cuda_stream const& stream);
         PIKA_EXPORT ~cusolver_handle();
         PIKA_EXPORT cusolver_handle(cusolver_handle&&) noexcept;
@@ -36,10 +50,27 @@
         PIKA_EXPORT cusolver_handle(cusolver_handle const&);
         PIKA_EXPORT cusolver_handle& operator=(cusolver_handle const&);
 
+        /// \brief Check if the handle is valid.
+        ///
+        /// \return true if the handle refers to a valid handle, false otherwise (e.g. if the handle
+        /// has been moved out from, or it has been default-constructed).
+        PIKA_EXPORT bool valid() const noexcept;
+
+        /// \brief Check if the handle is valid.
+        ///
+        /// See \ref valid().
+        PIKA_EXPORT explicit operator bool() const noexcept;
+
+        /// \brief Get the underlying cuSOLVER handle.
         PIKA_EXPORT cusolverDnHandle_t get() const noexcept;
+
+        /// \brief Get the device associated with the cuSOLVER handle.
         PIKA_EXPORT int get_device() const noexcept;
+
+        /// \brief Get the stream associated with the cuSOLVER handle.
         PIKA_EXPORT whip::stream_t get_stream() const noexcept;
 
+        /// \brief Set the stream associated with the cuSOLVER handle.
         PIKA_EXPORT void set_stream(cuda_stream const& stream);
 
         /// \cond NOINTERNAL
@@ -66,4 +97,3 @@ struct fmt::formatter : fmt::formatte
             fmt::format("cusolver_handle({})", fmt::ptr(handle.get())), ctx);
     }
 };
-#endif
diff --git a/libs/pika/async_cuda_base/src/cublas_handle.cpp b/libs/pika/async_cuda_base/src/cublas_handle.cpp
index a2b51d408..a637ac9f6 100644
--- a/libs/pika/async_cuda_base/src/cublas_handle.cpp
+++ b/libs/pika/async_cuda_base/src/cublas_handle.cpp
@@ -65,7 +65,7 @@ cublas_handle::cublas_handle(cublas_handle const& other)
       : device(other.device)
       , stream(other.stream)
-      , handle(other.handle != 0 ? create_handle(device, stream) : 0)
+      , handle(other.valid() ? create_handle(device, stream) : 0)
     {
     }
 
@@ -73,20 +73,20 @@
     {
         device = other.device;
        stream = other.stream;
-        handle = other.handle != 0 ? create_handle(device, stream) : 0;
+        handle = other.valid() ?
create_handle(device, stream) : 0; return *this; } cublas_handle::~cublas_handle() { - if (handle != 0) { check_cublas_error(cublasDestroy(handle)); } + if (valid()) { check_cublas_error(cublasDestroy(handle)); } } + bool cublas_handle::valid() const noexcept { return bool(handle); } + cublas_handle::operator bool() const noexcept { return bool(handle); } cublasHandle_t cublas_handle::get() const noexcept { return handle; } - int cublas_handle::get_device() const noexcept { return device; } - whip::stream_t cublas_handle::get_stream() const noexcept { return stream; } void cublas_handle::set_stream(cuda_stream const& stream) diff --git a/libs/pika/async_cuda_base/src/cusolver_handle.cpp b/libs/pika/async_cuda_base/src/cusolver_handle.cpp index 7b54c0df1..f4c6e8384 100644 --- a/libs/pika/async_cuda_base/src/cusolver_handle.cpp +++ b/libs/pika/async_cuda_base/src/cusolver_handle.cpp @@ -65,7 +65,7 @@ namespace pika::cuda::experimental { cusolver_handle::cusolver_handle(cusolver_handle const& other) : device(other.device) , stream(other.stream) - , handle(other.handle != 0 ? create_handle(device, stream) : 0) + , handle(other.valid() ? create_handle(device, stream) : 0) { } @@ -73,20 +73,20 @@ namespace pika::cuda::experimental { { device = other.device; stream = other.stream; - handle = other.handle != 0 ? create_handle(device, stream) : 0; + handle = other.valid() ? create_handle(device, stream) : 0; return *this; } cusolver_handle::~cusolver_handle() { - if (handle != 0) { check_cusolver_error(cusolverDnDestroy(handle)); } + if (valid()) { check_cusolver_error(cusolverDnDestroy(handle)); } } + bool cusolver_handle::valid() const noexcept { return bool(handle); } + cusolver_handle::operator bool() const noexcept { return bool(handle); } cusolverDnHandle_t cusolver_handle::get() const noexcept { return handle; } - int cusolver_handle::get_device() const noexcept { return device; } - whip::stream_t cusolver_handle::get_stream() const noexcept { return stream; } void cusolver_handle::set_stream(cuda_stream const& stream)
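For completeness, since the documentation added above refers readers to the then_with_cublas
example for then_with_cusolver as well, here is a minimal, purely illustrative sketch of a
then_with_cusolver continuation. It is not part of this patch, is CUDA-only for brevity (no
rocSOLVER branch), omits error checking, and factorizes an identity matrix only to keep the
example self-contained; the cuSOLVER calls (cusolverDnGetStream, cusolverDnDpotrf_bufferSize,
cusolverDnDpotrf) and the whip allocation helpers are the only APIs assumed.

    // then_with_cusolver sketch (CUDA-only, illustrative): Cholesky-factorize an
    // identity matrix with cusolverDnDpotrf using the handle provided by the adaptor.

    #include <pika/cuda.hpp>
    #include <pika/execution.hpp>
    #include <pika/init.hpp>

    #include <whip.hpp>

    #include <cusolverDn.h>
    #include <utility>

    __global__ void init_identity(double* a, int n)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n * n) { a[i] = (i % (n + 1) == 0) ? 1.0 : 0.0; }
    }

    int main(int argc, char* argv[])
    {
        namespace cu = pika::cuda::experimental;
        namespace ex = pika::execution::experimental;
        namespace tt = pika::this_thread::experimental;

        pika::start(argc, argv);
        cu::cuda_pool pool{};
        cu::cuda_scheduler cuda_sched{pool};

        {
            cu::enable_user_polling p{};

            constexpr int n = 512;
            double* a = nullptr;
            int* info = nullptr;

            auto s = ex::just() | ex::continues_on(cuda_sched) |
                // Allocate and initialize the matrix on a stream from the pool.
                cu::then_with_stream([&](whip::stream_t stream) {
                    whip::malloc_async(&a, sizeof(double) * n * n, stream);
                    whip::malloc_async(&info, sizeof(int), stream);
                    init_identity<<<(n * n + 255) / 256, 256, 0, stream>>>(a, n);
                }) |
                // The cuSOLVER handle is passed as the first argument, like the
                // cuBLAS handle in the then_with_cublas example.
                cu::then_with_cusolver([&](cusolverDnHandle_t handle) {
                    whip::stream_t stream{};
                    cusolverDnGetStream(handle, &stream);

                    int lwork = 0;
                    cusolverDnDpotrf_bufferSize(
                        handle, CUBLAS_FILL_MODE_LOWER, n, a, n, &lwork);

                    // Keep the workspace allocation stream-ordered so that it is
                    // not released before the factorization kernel has run.
                    double* workspace = nullptr;
                    whip::malloc_async(&workspace, sizeof(double) * lwork, stream);
                    cusolverDnDpotrf(handle, CUBLAS_FILL_MODE_LOWER, n, a, n,
                        workspace, lwork, info);
                    whip::free_async(workspace, stream);
                }) |
                // Release the matrix and the info flag on the same stream.
                cu::then_with_stream([&](whip::stream_t stream) {
                    whip::free_async(a, stream);
                    whip::free_async(info, stream);
                });
            tt::sync_wait(std::move(s));
        }

        pika::finalize();
        pika::stop();

        return 0;
    }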