From 9f0b8bc77adce4dffe926f7a6de5d086d2f98b42 Mon Sep 17 00:00:00 2001
From: Mikael Simberg <mikael.simberg@iki.fi>
Date: Fri, 1 Nov 2024 13:02:57 +0100
Subject: [PATCH] Expand and add more CUDA/HIP documentation

Document cuda_pool, cuda_scheduler, cuda_stream, cublas_handle, cusolver_handle, as well as expose
these with CUDA sender adaptors in the documentation. Adds a high-level example of using CUDA functionality.
---
 docs/Doxyfile                                 |  6 +-
 docs/api.rst                                  | 98 ++++++++++++++++++-
 examples/documentation/CMakeLists.txt         |  2 +
 .../include/pika/async_cuda/cuda_pool.hpp     | 70 ++++++++++++-
 .../pika/async_cuda/cuda_scheduler.hpp        | 26 ++++-
 .../include/pika/async_cuda/then_on_host.hpp  | 25 ++---
 .../pika/async_cuda/then_with_stream.hpp      | 84 +++++++++-------
 .../pika/async_cuda_base/cublas_handle.hpp    | 18 +++-
 .../pika/async_cuda_base/cuda_stream.hpp      | 37 ++++++-
 .../pika/async_cuda_base/cusolver_handle.hpp  | 28 ++++--
 10 files changed, 320 insertions(+), 74 deletions(-)

diff --git a/docs/Doxyfile b/docs/Doxyfile
index e2bb88eb4..f0e49a117 100644
--- a/docs/Doxyfile
+++ b/docs/Doxyfile
@@ -7,7 +7,9 @@ GENERATE_RTF           = NO
 GENERATE_HTML          = NO
 QUIET                  = NO
 OUTPUT_DIRECTORY       = "$(PIKA_DOCS_DOXYGEN_OUTPUT_DIRECTORY)"
-INPUT                  = "$(PIKA_DOCS_DOXYGEN_INPUT_ROOT)/libs/pika/init_runtime" \
+INPUT                  = "$(PIKA_DOCS_DOXYGEN_INPUT_ROOT)/libs/pika/async_cuda" \
+                         "$(PIKA_DOCS_DOXYGEN_INPUT_ROOT)/libs/pika/async_cuda_base" \
+                         "$(PIKA_DOCS_DOXYGEN_INPUT_ROOT)/libs/pika/init_runtime" \
                          "$(PIKA_DOCS_DOXYGEN_INPUT_ROOT)/libs/pika/runtime" \
                          "$(PIKA_DOCS_DOXYGEN_INPUT_ROOT)/libs/pika/execution"
 FILE_PATTERNS          = *.cpp *.hpp *.cu
@@ -18,4 +20,4 @@ EXTRACT_ALL            = YES
 ENABLE_PREPPROCESSING  = YES
 MACRO_EXPANSION        = YES
 EXPAND_ONLY_PREDEF     = YES
-PREDEFINED             = PIKA_EXPORT=
+PREDEFINED             = PIKA_EXPORT= PIKA_NVCC_PRAGMA_HD_WARNING_DISABLE=
diff --git a/docs/api.rst b/docs/api.rst
index f461199c3..d0890c635 100644
--- a/docs/api.rst
+++ b/docs/api.rst
@@ -5,6 +5,8 @@
     Distributed under the Boost Software License, Version 1.0. (See accompanying
     file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
 
+:tocdepth: 3
+
 .. _api:
 
 =============
@@ -32,7 +34,6 @@ These headers are part of the public API, but are currently undocumented.
 - ``pika/async_rw_mutex.hpp``
 - ``pika/barrier.hpp``
 - ``pika/condition_variable.hpp``
-- ``pika/cuda.hpp``
 - ``pika/execution.hpp``
 - ``pika/latch.hpp``
 - ``pika/mpi.hpp``
@@ -104,3 +105,98 @@ The ``pika/execution.hpp`` header provides functionality related to ``std::execu
 .. literalinclude:: ../examples/documentation/when_all_vector_documentation.cpp
    :language: c++
    :start-at: #include
+
+.. _header_pika_cuda:
+
+``pika/cuda.hpp``
+=================
+
+The ``pika/cuda.hpp`` header provides functionality related to CUDA and HIP. All functionality is
+under the ``pika::cuda::experimental`` namespace and class and function names contain ``cuda``, even
+when HIP support is enabled. CUDA and HIP functionality can be enabled with the CMake options
+``PIKA_WITH_CUDA`` and ``PIKA_WITH_HIP``, respectively. In the following, whenever CUDA is
+mentioned, it refers to to CUDA and HIP interchangeably.
+
+.. note::
+   https://github.com/pika-org/pika/issues/116 tracks a potential renaming of the functionality
+   to avoid using ``cuda`` even when HIP is enabled. If you have feedback on a rename or just want
+   to follow along, please see that issue.
+
+.. warning::
+   At the moment, ``nvcc`` can not compile stdexec headers. Of the CUDA compilers, only ``nvc++`` is
+   able to compile stdexec headers. If you have stdexec support enabled in pika, either ensure that
+   ``.cu`` files do not include stdexec headers, or use ``nvc++`` to compile your application.
+   However, ``nvc++`` does not officially support compiling device code. Use at your own risk.
+
+   For HIP there are no known restrictions.
+
+The CUDA support in pika relies on four major components:
+
+1. A pool of CUDA streams as well as cuBLAS and cuSOLVER handles. These streams and handles are used
+   in a round-robin fashion by various sender adaptors.
+2. A CUDA scheduler, in the ``std::execution`` sense. This uses the CUDA pool to schedule work on a
+   GPU.
+3. Sender adaptors. A few special-purpose sender adaptors, as well as customizations of a few
+   ``std::execution`` adaptors are provided to help schedule different types of work on a GPU.
+4. Polling of CUDA events integrated into the pika scheduling loop. This integration is essential to
+   avoid calling e.g. ``cudaStreamSynchronize`` on a pika task, which would block the underlying
+   worker thread and thus block progress of other work.
+
+The following example gives an overview of using the above CUDA functionalities in pika:
+
+.. literalinclude:: ../examples/documentation/cuda_overview_documentation.cu
+   :language: c++
+   :start-at: #include
+
+TODO: Note that while cuda_pool gives direct access to streams and handles, the intended usage is to
+access them via the scheduler and sender adaptors.
+
+.. doxygenclass:: pika::cuda::experimental::cuda_pool
+.. doxygenclass:: pika::cuda::experimental::cuda_stream
+.. doxygenclass:: pika::cuda::experimental::cublas_handle
+.. doxygenclass:: pika::cuda::experimental::locked_cublas_handle
+.. doxygenclass:: pika::cuda::experimental::cusolver_handle
+.. doxygenclass:: pika::cuda::experimental::locked_cusolver_handle
+
+..
+   .. literalinclude:: ../examples/documentation/cuda_pool_documentation.cpp
+      :language: c++
+      :start-at: #include
+
+.. doxygenclass:: pika::cuda::experimental::cuda_scheduler
+
+..
+   .. literalinclude:: ../examples/documentation/cuda_scheduler_documentation.cpp
+      :language: c++
+      :start-at: #include
+
+.. doxygenvariable:: pika::cuda::experimental::then_with_stream
+
+.. literalinclude:: ../examples/documentation/then_with_stream_documentation.cu
+    :language: c++
+    :start-at: #include
+
+
+.. doxygenvariable:: pika::cuda::experimental::then_with_cublas
+
+..
+   .. literalinclude:: ../examples/documentation/then_with_cublas_documentation.cpp
+      :language: c++
+      :start-at: #include
+
+.. doxygenvariable:: pika::cuda::experimental::then_with_cusolver
+
+..
+   .. literalinclude:: ../examples/documentation/then_with_cusolver_documentation.cpp
+      :language: c++
+      :start-at: #include
+
+.. note::
+   TODO: Is this useful? Just transfer to host explicitly?
+
+.. doxygenvariable:: pika::cuda::experimental::then_on_host
+
+..
+   .. literalinclude:: ../examples/documentation/then_on_host_documentation.cpp
+      :language: c++
+      :start-at: #include
diff --git a/examples/documentation/CMakeLists.txt b/examples/documentation/CMakeLists.txt
index 80116999b..4578df557 100644
--- a/examples/documentation/CMakeLists.txt
+++ b/examples/documentation/CMakeLists.txt
@@ -5,11 +5,13 @@
 # file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
 
 set(example_programs
+    # cuda_overview_documentation # TODO
     drop_operation_state_documentation
     drop_value_documentation
     hello_world_documentation
     init_hpp_documentation
     split_tuple_documentation
+    # then_with_stream_documentation # TODO
     unpack_documentation
     when_all_vector_documentation
 )
diff --git a/libs/pika/async_cuda/include/pika/async_cuda/cuda_pool.hpp b/libs/pika/async_cuda/include/pika/async_cuda/cuda_pool.hpp
index 8dfcb5ec5..d65f3e5f0 100644
--- a/libs/pika/async_cuda/include/pika/async_cuda/cuda_pool.hpp
+++ b/libs/pika/async_cuda/include/pika/async_cuda/cuda_pool.hpp
@@ -23,6 +23,10 @@
 #include <vector>
 
 namespace pika::cuda::experimental {
+    /// \brief A locked cuBLAS handle.
+    ///
+    /// A handle that provides thread-safe access to a \ref cublas_handle. The locked handle is
+    /// immovable.
     class locked_cublas_handle
     {
         cublas_handle& handle;
@@ -36,9 +40,17 @@ namespace pika::cuda::experimental {
         locked_cublas_handle& operator=(locked_cublas_handle&&) = delete;
         locked_cublas_handle& operator=(locked_cublas_handle const&) = delete;
 
+        /// \brief Access the underlying cuBLAS handle.
+        ///
+        /// \return a reference to the \ref cublas_handle. The returned handle is not thread-safe
+        /// and must be used within the lifetime of the \ref locked_cublas_handle.
         PIKA_EXPORT cublas_handle const& get() noexcept;
     };
 
+    /// \brief A locked cuSOLVER handle.
+    ///
+    /// A handle that provides thread-safe access to a \ref cusolver_handle. The locked handle is
+    /// immovable.
     class locked_cusolver_handle
     {
         cusolver_handle& handle;
@@ -52,15 +64,24 @@ namespace pika::cuda::experimental {
         locked_cusolver_handle& operator=(locked_cusolver_handle&&) = delete;
         locked_cusolver_handle& operator=(locked_cusolver_handle const&) = delete;
 
+        /// \brief Access the underlying cuSOLVER handle.
+        ///
+        /// \return a reference to the \ref cusolver_handle. The returned handle is not thread-safe
+        /// and must be used within the lifetime of the \ref locked_cusolver_handle.
         PIKA_EXPORT cusolver_handle const& get() noexcept;
     };
 
-    /// A pool of CUDA streams, used for scheduling work on a CUDA device.
+    /// \brief A pool of CUDA streams, used for scheduling work on a CUDA device.
+    ///
+    /// The pool initializes a set of CUDA (thread-local) streams on construction and provides
+    /// access to the streams in a round-robin fashion. The pool also gives access to cuBLAS and
+    /// cuSOLVER handles.
     ///
-    /// The pool initializes a set of CUDA (thread-local) streams on
-    /// construction and provides access to the streams in a round-robin
-    /// fashion. The pool is movable and copyable with reference semantics.
-    /// Copies of a pool still refer to the original pool of streams.
+    /// The pool is movable and copyable with reference semantics.  Copies of a pool still refer to
+    /// the original pool of streams. A moved-from pool can't be used, except to check if it is
+    /// valid with \ref valid().
+    ///
+    /// The pool is equality comparable and formattable.
     class cuda_pool
     {
     private:
@@ -135,25 +156,64 @@ namespace pika::cuda::experimental {
         std::shared_ptr<pool_data> data;
 
     public:
+        /// \brief Construct a pool of CUDA streams and handles.
+        ///
+        /// \param device the CUDA device used for scheduling work
+        /// \param num_normal_priority_streams_per_thread the number of normal priority streams per
+        /// thread
+        /// \param num_high_priority_streams_per_thread the number of high priority streams per
+        /// thread
+        /// \param flags flags used to construct CUDA streams
+        /// \param num_cublas_handles the number of cuBLAS handles to create for the whole pool
+        /// \param num_cusolver_handles the number of cuSOLVER handles to create for the whole pool
         PIKA_EXPORT explicit cuda_pool(int device = 0,
             std::size_t num_normal_priority_streams_per_thread = 3,
             std::size_t num_high_priority_streams_per_thread = 3, unsigned int flags = 0,
             std::size_t num_cublas_handles = 16, std::size_t num_cusolver_handles = 16);
+
+        /// \brief Move constructor.
         PIKA_NVCC_PRAGMA_HD_WARNING_DISABLE
         cuda_pool(cuda_pool&&) = default;
+        /// \brief Copy constructor.
         PIKA_NVCC_PRAGMA_HD_WARNING_DISABLE
         cuda_pool(cuda_pool const&) = default;
+        /// \brief Move assignment operator.
         PIKA_NVCC_PRAGMA_HD_WARNING_DISABLE
         cuda_pool& operator=(cuda_pool&&) = default;
+        /// \brief Copy assignment operator.
         PIKA_NVCC_PRAGMA_HD_WARNING_DISABLE
         cuda_pool& operator=(cuda_pool const&) = default;
 
+        /// \brief Check if the pool is valid.
+        ///
+        /// \return true if the pool refers to a valid pool, false otherwise (e.g. if the pool has
+        /// been moved out from)
         PIKA_EXPORT bool valid() const noexcept;
+
+        /// \brief Check if the pool is valid.
+        ///
+        /// See \ref valid().
         PIKA_EXPORT explicit operator bool() noexcept;
+
+        /// \brief Get a reference to the next CUDA stream.
+        ///
+        /// \param priority the priority of the stream
         PIKA_EXPORT cuda_stream const& get_next_stream(
             pika::execution::thread_priority priority = pika::execution::thread_priority::normal);
+
+        /// \brief Get a locked cuBLAS handle.
+        ///
+        /// \param stream the CUDA stream to use with the cuBLAS handle
+        ///
+        /// \return a locked cuBLAS handle, which is released for reuse on destruction
         PIKA_EXPORT locked_cublas_handle get_cublas_handle(
             cuda_stream const& stream, cublasPointerMode_t pointer_mode);
+
+        /// \brief Get a locked cuSOLVER handle.
+        ///
+        /// \param stream the CUDA stream to use with the cuSOLVER handle
+        ///
+        /// \return a locked cuSOLVER handle, which is released for reuse on destruction
         PIKA_EXPORT locked_cusolver_handle get_cusolver_handle(cuda_stream const& stream);
 
         /// \cond NOINTERNAL
diff --git a/libs/pika/async_cuda/include/pika/async_cuda/cuda_scheduler.hpp b/libs/pika/async_cuda/include/pika/async_cuda/cuda_scheduler.hpp
index 6b6cb0153..0a283d392 100644
--- a/libs/pika/async_cuda/include/pika/async_cuda/cuda_scheduler.hpp
+++ b/libs/pika/async_cuda/include/pika/async_cuda/cuda_scheduler.hpp
@@ -21,8 +21,13 @@
 namespace pika::cuda::experimental {
     /// A scheduler for running work on a CUDA pool.
     ///
-    /// Provides access to scheduling work on a CUDA context represented by a
-    /// cuda_pool.
+    /// Provides access to scheduling work on a CUDA context represented by a \ref cuda_pool. Models
+    /// the `std::execution` scheduler concept.
+    ///
+    /// Move and copy constructible. The scheduler has reference semantics with respect to the
+    /// associated CUDA pool.
+    ///
+    /// Equality comparable.
     class cuda_scheduler
     {
     private:
@@ -30,18 +35,32 @@ namespace pika::cuda::experimental {
         pika::execution::thread_priority priority;
 
     public:
+        /// \brief TODO
         PIKA_EXPORT
         cuda_scheduler(cuda_pool pool);
         cuda_scheduler(cuda_scheduler&&) = default;
         cuda_scheduler(cuda_scheduler const&) = default;
         cuda_scheduler& operator=(cuda_scheduler&&) = default;
         cuda_scheduler& operator=(cuda_scheduler const&) = default;
-        ~cuda_scheduler(){};
+        ~cuda_scheduler() = default;    // TODO: Default?
 
+        /// \brief Return the \ref cuda_pool associated with this scheduler.
         PIKA_EXPORT cuda_pool const& get_pool() const noexcept;
+
+        /// \brief Return the next available CUDA stream from the pool.
+        ///
+        /// Equivalent to accessing the stream via \ref get_pool().
         PIKA_EXPORT cuda_stream const& get_next_stream();
+
+        /// \brief Return the next available cuBLAS handle from the pool.
+        ///
+        /// Equivalent to accessing the cuBLAS handle via \ref get_pool().
         PIKA_EXPORT locked_cublas_handle get_cublas_handle(
             cuda_stream const& stream, cublasPointerMode_t pointer_mode);
+
+        /// \brief Return the next available cuSOLVER handle from the pool.
+        ///
+        /// Equivalent to accessing the cuSOLVER handle via \ref get_pool().
         PIKA_EXPORT locked_cusolver_handle get_cusolver_handle(cuda_stream const& stream);
 
         /// \cond NOINTERNAL
@@ -56,6 +75,7 @@ namespace pika::cuda::experimental {
         }
         /// \endcond
 
+        /// TODO
         friend cuda_scheduler tag_invoke(pika::execution::experimental::with_priority_t,
             cuda_scheduler const& scheduler, pika::execution::thread_priority priority)
         {
diff --git a/libs/pika/async_cuda/include/pika/async_cuda/then_on_host.hpp b/libs/pika/async_cuda/include/pika/async_cuda/then_on_host.hpp
index 2fb5cedf2..0a79ef68c 100644
--- a/libs/pika/async_cuda/include/pika/async_cuda/then_on_host.hpp
+++ b/libs/pika/async_cuda/include/pika/async_cuda/then_on_host.hpp
@@ -212,17 +212,7 @@ namespace pika::cuda::experimental {
         };
     }    // namespace then_on_host_detail
 
-    // NOTE: This is not a customization of pika::execution::experimental::then.
-    // It retains the cuda_scheduler execution context from the predecessor
-    // sender, but does not run the continuation on a CUDA device. Instead, it
-    // runs the continuation in the polling thread used by the cuda_scheduler on
-    // the CPU. The continuation is run only after synchronizing all previous
-    // events scheduled on the cuda_scheduler. Blocking in the callable given to
-    // then_on_host blocks other work scheduled on cuda_scheduler from
-    // completing. Heavier work should be transferred to a host scheduler as
-    // soon as possible.
-    inline constexpr struct then_on_host_t final
-      : pika::functional::detail::tag_fallback<then_on_host_t>
+    struct then_on_host_t final : pika::functional::detail::tag_fallback<then_on_host_t>
     {
     private:
         template <typename Sender, typename F>
@@ -245,5 +235,16 @@ namespace pika::cuda::experimental {
             return pika::execution::experimental::detail::partial_algorithm<then_on_host_t, F>{
                 PIKA_FORWARD(F, f)};
         }
-    } then_on_host{};
+    };
+
+    /// NOTE: This is not a customization of pika::execution::experimental::then.
+    /// It retains the cuda_scheduler execution context from the predecessor
+    /// sender, but does not run the continuation on a CUDA device. Instead, it
+    /// runs the continuation in the polling thread used by the cuda_scheduler on
+    /// the CPU. The continuation is run only after synchronizing all previous
+    /// events scheduled on the cuda_scheduler. Blocking in the callable given to
+    /// then_on_host blocks other work scheduled on cuda_scheduler from
+    /// completing. Heavier work should be transferred to a host scheduler as
+    /// soon as possible.
+    inline constexpr then_on_host_t then_on_host{};
 }    // namespace pika::cuda::experimental
diff --git a/libs/pika/async_cuda/include/pika/async_cuda/then_with_stream.hpp b/libs/pika/async_cuda/include/pika/async_cuda/then_with_stream.hpp
index a0d7deda8..460bc338d 100644
--- a/libs/pika/async_cuda/include/pika/async_cuda/then_with_stream.hpp
+++ b/libs/pika/async_cuda/include/pika/async_cuda/then_with_stream.hpp
@@ -37,9 +37,9 @@
 namespace pika::cuda::experimental::then_with_stream_detail {
     template <typename F, typename... Ts>
     auto invoke_with_thread_local_cublas_handle(cuda_scheduler& sched, cuda_stream const& stream,
-        cublasPointerMode_t pointer_mode, F&& f, Ts&&... ts)
-        -> decltype(PIKA_INVOKE(
-            PIKA_FORWARD(F, f), std::declval<cublasHandle_t>(), PIKA_FORWARD(Ts, ts)...))
+        cublasPointerMode_t pointer_mode, F&& f,
+        Ts&&... ts) -> decltype(PIKA_INVOKE(PIKA_FORWARD(F, f), std::declval<cublasHandle_t>(),
+                        PIKA_FORWARD(Ts, ts)...))
     {
         auto locked_handle = sched.get_cublas_handle(stream, pointer_mode);
         return PIKA_INVOKE(PIKA_FORWARD(F, f), locked_handle.get().get(), PIKA_FORWARD(Ts, ts)...);
@@ -48,7 +48,7 @@ namespace pika::cuda::experimental::then_with_stream_detail {
     template <typename F, typename... Ts>
     auto invoke_with_thread_local_cusolver_handle(cuda_scheduler& sched, cuda_stream const& stream,
         F&& f, Ts&&... ts) -> decltype(PIKA_INVOKE(PIKA_FORWARD(F, f),
-        std::declval<cusolverDnHandle_t>(), PIKA_FORWARD(Ts, ts)...))
+                               std::declval<cusolverDnHandle_t>(), PIKA_FORWARD(Ts, ts)...))
     {
         auto locked_handle = sched.get_cusolver_handle(stream);
         return PIKA_INVOKE(PIKA_FORWARD(F, f), locked_handle.get().get(), PIKA_FORWARD(Ts, ts)...);
@@ -249,9 +249,10 @@ namespace pika::cuda::experimental::then_with_stream_detail {
                 }
 
                 template <typename... Ts>
-                auto set_value(Ts&&... ts) noexcept
-                    -> decltype(PIKA_INVOKE(PIKA_MOVE(f), op_state.sched, stream.value(), ts...),
-                        void())
+                auto
+                set_value(Ts&&... ts) noexcept -> decltype(PIKA_INVOKE(PIKA_MOVE(f), op_state.sched,
+                                                               stream.value(), ts...),
+                                                   void())
                 {
                     pika::detail::try_catch_exception_ptr(
                         [&]() mutable {
@@ -389,8 +390,8 @@ namespace pika::cuda::experimental::then_with_stream_detail {
             // matches the argument list").
             template <typename... Ts>
             friend auto tag_invoke(pika::execution::experimental::set_value_t,
-                then_with_cuda_stream_receiver&& r, Ts&&... ts) noexcept
-                -> decltype(r.set_value(PIKA_FORWARD(Ts, ts)...))
+                then_with_cuda_stream_receiver&& r,
+                Ts&&... ts) noexcept -> decltype(r.set_value(PIKA_FORWARD(Ts, ts)...))
             {
                 // nvcc fails to compile this with std::forward<Ts>(ts)...  or
                 // static_cast<Ts&&>(ts)... so we explicitly use
@@ -610,15 +611,7 @@ namespace pika::cuda::experimental {
     // - this operation can only be used when the predecessor sender has
     //   cuda_scheduler as its completion scheduler
 
-    /// Attach a continuation to run f with an additional CUDA stream.
-    ///
-    /// Attaches a continuation to the given sender which will call f with the
-    /// arguments sent by the given sender with an additional cudaStream_t
-    /// argument as the last argument. This can only be called on a sender with
-    /// a completion scheduler that is cuda_scheduler. f does not have exclusive
-    /// access to the given stream and other calls may reuse the same stream
-    /// concurrently.
-    inline constexpr struct then_with_stream_t final
+    struct then_with_stream_t final
     {
         template <typename Sender, typename F>
         constexpr PIKA_FORCEINLINE auto PIKA_STATIC_CALL_OPERATOR(Sender&& sender, F&& f)
@@ -633,17 +626,20 @@ namespace pika::cuda::experimental {
             return pika::execution::experimental::detail::partial_algorithm<then_with_stream_t, F>{
                 PIKA_FORWARD(F, f)};
         }
-    } then_with_stream{};
+    };
 
-    /// Attach a continuation to run f with an additional cuBLAS handle.
+    /// \brief Sender adaptor which calls \p f with CUDA stream.
     ///
-    /// Attaches a continuation to the given sender which will call f with the
-    /// arguments sent by the given sender with an additional cublasHandle_t
-    /// argument as the first argument. This can only be called on a sender with
-    /// a completion scheduler that is cuda_scheduler. The handle is
-    /// thread-local and f may not yield a pika thread until after the handle
-    /// has been used the last time by f.
-    inline constexpr struct then_with_cublas_t final
+    /// When the predecessor sender completes, calls \p f with a CUDA stream as the last argument
+    /// after other values sent by the predecessor sender. This adaptor can only be used when the
+    /// completion scheduler is a \ref cuda_scheduler. Other work may be scheduled concurrently on
+    /// the stream passed to \p f. Values sent by the predecessor sender are passed as references to
+    /// \p f and kept alive until the work submitted by \p f to the stream is completed. \p f may
+    /// return as soon as work has been submitted, and a connected receiver will be signaled only
+    /// once the kernels submitted to the stream have completed.
+    inline constexpr then_with_stream_t then_with_stream{};
+
+    struct then_with_cublas_t final
     {
         template <typename Sender, typename F>
         constexpr PIKA_FORCEINLINE auto
@@ -661,17 +657,19 @@ namespace pika::cuda::experimental {
             return pika::execution::experimental::detail::partial_algorithm<then_with_cublas_t, F,
                 cublasPointerMode_t>{PIKA_FORWARD(F, f), pointer_mode};
         }
-    } then_with_cublas{};
+    };
 
-    /// Attach a continuation to run f with an additional cuSOLVER handle.
+    /// \brief Sender adaptor which calls \p f with a cuBLAS handle.
     ///
-    /// Attaches a continuation to the given sender which will call f with the
-    /// arguments sent by the given sender with an additional cusolverDnHandle_t
-    /// argument as the first argument. This can only be called on a sender with
-    /// a completion scheduler that is cuda_scheduler. The handle is
-    /// thread-local and f may not yield a pika thread until after the handle
-    /// has been used the last time by f.
-    inline constexpr struct then_with_cusolver_t final
+    /// This sender is intended to be used to submit work using a cuBLAS handle. The stream
+    /// associated to the handle may also be used to submit work. The handle is accessed through a
+    /// \ref locked_cublas_handle and \p f should return as quickly as possible to avoid blocking
+    /// other work from using the handle.
+    ///
+    /// The behaviour of synchronization and lifetimes are the same as for \ref then_with_stream.
+    inline constexpr then_with_cublas_t then_with_cublas{};
+
+    struct then_with_cusolver_t final
     {
         template <typename Sender, typename F>
         constexpr PIKA_FORCEINLINE auto PIKA_STATIC_CALL_OPERATOR(Sender&& sender, F&& f)
@@ -686,5 +684,17 @@ namespace pika::cuda::experimental {
             return pika::execution::experimental::detail::partial_algorithm<then_with_cusolver_t,
                 F>{PIKA_FORWARD(F, f)};
         }
-    } then_with_cusolver{};
+    };
+
+    /// \brief Sender adaptor which calls \p f with a cuSOLVER handle.
+    ///
+    /// This sender is intended to be used to submit work using a cuSOLVER handle. The stream
+    /// associated to the handle may also be used to submit work. The handle is accessed through a
+    /// \ref locked_cusolver_handle and \p f should return as quickly as possible to avoid blocking
+    /// other work from using the handle.
+    ///
+    /// The behaviour of synchronization and lifetimes are the same as for \ref then_with_stream.
+    inline constexpr then_with_cusolver_t then_with_cusolver
+    {
+    };
 }    // namespace pika::cuda::experimental
diff --git a/libs/pika/async_cuda_base/include/pika/async_cuda_base/cublas_handle.hpp b/libs/pika/async_cuda_base/include/pika/async_cuda_base/cublas_handle.hpp
index be6ff9590..50fdc3093 100644
--- a/libs/pika/async_cuda_base/include/pika/async_cuda_base/cublas_handle.hpp
+++ b/libs/pika/async_cuda_base/include/pika/async_cuda_base/cublas_handle.hpp
@@ -16,7 +16,12 @@
 #include <string>
 
 namespace pika::cuda::experimental {
-    /// RAII wrapper for a cuBLAS handle.
+    /// \brief RAII wrapper for a cuBLAS handle.
+    ///
+    /// An RAII wrapper for a cuBLAS handle which creates a handle on construction and destroys it
+    /// on destruction. The wrapper is movable and copyable. A moved-from handle can not be used
+    /// other than to check for validity with \ref valid(). A copied stream uses the properties from
+    /// the given handle and creates a new handle.
     class cublas_handle
     {
     private:
@@ -27,6 +32,7 @@ namespace pika::cuda::experimental {
         static PIKA_EXPORT cublasHandle_t create_handle(int device, whip::stream_t stream);
 
     public:
+        /// TODO: How to best document constructor and other special member functions.
         PIKA_EXPORT cublas_handle();
         PIKA_EXPORT explicit cublas_handle(cuda_stream const& stream);
         PIKA_EXPORT ~cublas_handle();
@@ -35,11 +41,21 @@ namespace pika::cuda::experimental {
         PIKA_EXPORT cublas_handle(cublas_handle const&);
         PIKA_EXPORT cublas_handle& operator=(cublas_handle const&);
 
+        /// \brief Get the underlying cuBLAS handle.
         PIKA_EXPORT cublasHandle_t get() const noexcept;
+
+        /// \brief Get the device associated with the stream of the cuBLAS handle.
         PIKA_EXPORT int get_device() const noexcept;
+
+        /// \brief Get the stream associated with the cuBLAS handle.
         PIKA_EXPORT whip::stream_t get_stream() const noexcept;
 
+        // TODO: Add valid/operator bool() methods
+
+        /// \brief Set the stream associated with the cuBLAS handle.
         PIKA_EXPORT void set_stream(cuda_stream const& stream);
+
+        /// \brief Set the cuBLAS pointer mode of the handle.
         PIKA_EXPORT void set_pointer_mode(cublasPointerMode_t pointer_mode);
 
         /// \cond NOINTERNAL
diff --git a/libs/pika/async_cuda_base/include/pika/async_cuda_base/cuda_stream.hpp b/libs/pika/async_cuda_base/include/pika/async_cuda_base/cuda_stream.hpp
index 5d5220b18..0c3d4a7d6 100644
--- a/libs/pika/async_cuda_base/include/pika/async_cuda_base/cuda_stream.hpp
+++ b/libs/pika/async_cuda_base/include/pika/async_cuda_base/cuda_stream.hpp
@@ -15,12 +15,16 @@
 #include <string>
 
 namespace pika::cuda::experimental {
-    /// RAII wrapper for a CUDA stream.
+    /// \brief RAII wrapper for a CUDA stream.
     ///
-    /// An RAII wrapper for a CUDA stream which creates a stream on construction
-    /// and destroys it on destruction. Is movable and copiable. A moved-from
-    /// stream holds the default stream. A copied stream uses the properties
-    /// from the given stream and creates a new stream.
+    /// An RAII wrapper for a CUDA stream which creates a stream on construction and destroys it on
+    /// destruction. Is movable and copyable. A moved-from stream holds the default stream. A copied
+    /// stream uses the properties from the given stream and creates a new stream.
+    ///
+    /// Streams are equality comparable and formattable.
+    ///
+    /// When accessing the underlying stream, https://github.com/eth-cscs/whip is used for
+    /// compatibility with CUDA and HIP.
     class cuda_stream
     {
     private:
@@ -40,6 +44,13 @@ namespace pika::cuda::experimental {
             int device, pika::execution::thread_priority priority, unsigned int flags);
 
     public:
+        /// \brief Construct a new stream with the given device and priority.
+        ///
+        /// \param device The device to create the stream on.
+        /// \param priority The priority of the stream. The mapping from \ref thread_priority to
+        /// CUDA stream priorities is undefined, except that the order is preserved, allowing for
+        /// different \ref thread_priority to map to the same CUDA priority.
+        /// \param flags Flags to pass to the CUDA stream creation.
         PIKA_EXPORT explicit cuda_stream(int device = 0,
             pika::execution::thread_priority priority = pika::execution::thread_priority::default_,
             unsigned int flags = 0);
@@ -49,9 +60,25 @@ namespace pika::cuda::experimental {
         PIKA_EXPORT cuda_stream(cuda_stream const&);
         PIKA_EXPORT cuda_stream& operator=(cuda_stream const&);
 
+        /// \brief Get the underlying stream.
+        ///
+        /// \return the underlying stream. The stream is still owned by the \ref cuda_stream and
+        /// must not be manually released.
         PIKA_EXPORT whip::stream_t get() const noexcept;
+
+        /// \brief Get the device of the stream.
+        ///
+        /// \return the device of the stream.
         PIKA_EXPORT int get_device() const noexcept;
+
+        /// \brief Get the priority of the stream.
+        ///
+        /// \return the priority of the stream.
         PIKA_EXPORT pika::execution::thread_priority get_priority() const noexcept;
+
+        /// brief Get the flags of the stream.
+        ///
+        /// \return the flags of the stream.
         PIKA_EXPORT unsigned int get_flags() const noexcept;
 
         /// \cond NOINTERNAL
diff --git a/libs/pika/async_cuda_base/include/pika/async_cuda_base/cusolver_handle.hpp b/libs/pika/async_cuda_base/include/pika/async_cuda_base/cusolver_handle.hpp
index bbd6a1883..c0bea7e4c 100644
--- a/libs/pika/async_cuda_base/include/pika/async_cuda_base/cusolver_handle.hpp
+++ b/libs/pika/async_cuda_base/include/pika/async_cuda_base/cusolver_handle.hpp
@@ -7,17 +7,21 @@
 #pragma once
 
 #include <pika/config.hpp>
-#if defined(PIKA_HAVE_GPU_SUPPORT)
-# include <pika/async_cuda_base/cuda_stream.hpp>
-# include <pika/async_cuda_base/custom_lapack_api.hpp>
+#include <pika/async_cuda_base/cuda_stream.hpp>
+#include <pika/async_cuda_base/custom_lapack_api.hpp>
 
-# include <fmt/format.h>
-# include <whip.hpp>
+#include <fmt/format.h>
+#include <whip.hpp>
 
-# include <string>
+#include <string>
 
 namespace pika::cuda::experimental {
-    /// RAII wrapper for a cuSOLVER handle.
+    /// \brief RAII wrapper for a cuSOLVER handle.
+    ///
+    /// An RAII wrapper for a cuBLAS handle which creates a handle on construction and destroys it
+    /// on destruction. The wrapper is movable and copyable. A moved-from handle can not be used
+    /// other than to check for validity with \ref valid(). A copied stream uses the properties from
+    /// the given handle and creates a new handle.
     class cusolver_handle
     {
     private:
@@ -28,6 +32,7 @@ namespace pika::cuda::experimental {
         static PIKA_EXPORT cusolverDnHandle_t create_handle(int device, whip::stream_t stream);
 
     public:
+        /// TODO: How to best document constructor and other special member functions.
         PIKA_EXPORT cusolver_handle();
         PIKA_EXPORT explicit cusolver_handle(cuda_stream const& stream);
         PIKA_EXPORT ~cusolver_handle();
@@ -36,10 +41,18 @@ namespace pika::cuda::experimental {
         PIKA_EXPORT cusolver_handle(cusolver_handle const&);
         PIKA_EXPORT cusolver_handle& operator=(cusolver_handle const&);
 
+        /// \brief Get the underlying cuSOLVER handle.
         PIKA_EXPORT cusolverDnHandle_t get() const noexcept;
+
+        /// \brief Get the device associated with the cuSOLVER handle.
         PIKA_EXPORT int get_device() const noexcept;
+
+        /// \brief Get the stream associated with the cuSOLVER handle.
         PIKA_EXPORT whip::stream_t get_stream() const noexcept;
 
+        // TODO: Add valid/operator bool() methods
+
+        /// \brief Set the stream associated with the cuSOLVER handle.
         PIKA_EXPORT void set_stream(cuda_stream const& stream);
 
         /// \cond NOINTERNAL
@@ -66,4 +79,3 @@ struct fmt::formatter<pika::cuda::experimental::cusolver_handle> : fmt::formatte
             fmt::format("cusolver_handle({})", fmt::ptr(handle.get())), ctx);
     }
 };
-#endif