diff --git a/docs/Doxyfile b/docs/Doxyfile
index e2bb88eb4..81190100b 100644
--- a/docs/Doxyfile
+++ b/docs/Doxyfile
@@ -7,7 +7,9 @@ GENERATE_RTF = NO
 GENERATE_HTML = NO
 QUIET = NO
 OUTPUT_DIRECTORY = "$(PIKA_DOCS_DOXYGEN_OUTPUT_DIRECTORY)"
-INPUT = "$(PIKA_DOCS_DOXYGEN_INPUT_ROOT)/libs/pika/init_runtime" \
+INPUT = "$(PIKA_DOCS_DOXYGEN_INPUT_ROOT)/libs/pika/async_cuda" \
+        "$(PIKA_DOCS_DOXYGEN_INPUT_ROOT)/libs/pika/async_cuda_base" \
+        "$(PIKA_DOCS_DOXYGEN_INPUT_ROOT)/libs/pika/init_runtime" \
         "$(PIKA_DOCS_DOXYGEN_INPUT_ROOT)/libs/pika/runtime" \
         "$(PIKA_DOCS_DOXYGEN_INPUT_ROOT)/libs/pika/execution"
 FILE_PATTERNS = *.cpp *.hpp *.cu
@@ -18,4 +20,4 @@ EXTRACT_ALL = YES
 ENABLE_PREPPROCESSING = YES
 MACRO_EXPANSION = YES
 EXPAND_ONLY_PREDEF = YES
-PREDEFINED = PIKA_EXPORT=
+PREDEFINED = PIKA_EXPORT= PIKA_NVCC_PRAGMA_HD_WARNING_DISABLE= "PIKA_STATIC_CALL_OPERATOR(...)=operator()(__VA_ARGS__) const" PIKA_FORCEINLINE=
diff --git a/docs/api.rst b/docs/api.rst
index 0b1282bd1..bfbffcd5a 100644
--- a/docs/api.rst
+++ b/docs/api.rst
@@ -5,6 +5,8 @@
    Distributed under the Boost Software License, Version 1.0. (See accompanying
    file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
 
+:tocdepth: 3
+
 .. _api:
 
 =============
@@ -32,7 +34,6 @@ These headers are part of the public API, but are currently undocumented.
 - ``pika/async_rw_mutex.hpp``
 - ``pika/barrier.hpp``
 - ``pika/condition_variable.hpp``
-- ``pika/cuda.hpp``
 - ``pika/latch.hpp``
 - ``pika/mpi.hpp``
 - ``pika/mutex.hpp``
@@ -47,8 +48,8 @@ is to stabilize those APIs over time.
 
 .. _header_pika_init:
 
-``pika/init.hpp``
-=================
+Runtime management (``pika/init.hpp``)
+======================================
 
 The ``pika/init.hpp`` header provides functionality to manage the pika runtime.
@@ -71,8 +72,8 @@ The ``pika/init.hpp`` header provides functionality to manage the pika runtime.
 
 .. _header_pika_execution:
 
-``pika/execution.hpp``
-======================
+``std::execution`` support (``pika/execution.hpp``)
+===================================================
 
 The ``pika/execution.hpp`` header provides functionality related to ``std::execution``.
 ``std::execution`` functionality, including extensions provided by pika, is defined in the
@@ -136,3 +137,91 @@ All sender adaptors are `customization point objects (CPOs)
 .. literalinclude:: ../examples/documentation/when_all_vector_documentation.cpp
    :language: c++
    :start-at: #include
+
+.. _header_pika_cuda:
+
+CUDA/HIP support (``pika/cuda.hpp``)
+====================================
+
+The ``pika/cuda.hpp`` header provides functionality related to CUDA and HIP. All functionality is
+under the ``pika::cuda::experimental`` namespace and class and function names contain ``cuda``, even
+when HIP support is enabled. CUDA and HIP functionality can be enabled with the CMake options
+``PIKA_WITH_CUDA`` and ``PIKA_WITH_HIP``, respectively, but they are mutually exclusive. In the
+following, whenever CUDA is mentioned, it refers to CUDA and HIP interchangeably.
+
+.. note::
+   https://github.com/pika-org/pika/issues/116 tracks a potential renaming of the functionality
+   to avoid using ``cuda`` even when HIP is enabled. If you have feedback on a rename or just want
+   to follow along, please see that issue.
+
+.. note::
+   pika uses `whip <https://github.com/eth-cscs/whip>`__ internally for portability between CUDA
+   and HIP. However, users of pika are not forced to use whip as whip only creates aliases for
+   CUDA/HIP types and enumerations. whip is thus compatible with directly using the types and
+   enumerations provided by CUDA/HIP. For cuBLAS, cuSOLVER, rocBLAS, and rocSOLVER support, pika
+   does not use a portability library, but simply uses the appropriate types depending on whether
+   CUDA or HIP support is enabled.
+
+.. warning::
+   At the moment, ``nvcc`` cannot compile stdexec headers. Of the CUDA compilers, only ``nvc++`` is
+   able to compile stdexec headers. If you have stdexec support enabled in pika, either ensure that
+   ``.cu`` files do not include stdexec headers, or use ``nvc++`` to compile your application.
+   However, ``nvc++`` does not officially support compiling device code. Use at your own risk.
+
+   For HIP there are no known restrictions.
+
+The CUDA support in pika relies on four major components:
+
+1. A pool of CUDA streams as well as cuBLAS and cuSOLVER handles
+   (:cpp:class:`pika::cuda::experimental::cuda_pool`). These streams and handles are used in a
+   round-robin fashion by various sender adaptors.
+2. A CUDA scheduler, in the ``std::execution`` sense
+   (:cpp:class:`pika::cuda::experimental::cuda_scheduler`). This uses the CUDA pool to schedule work
+   on a GPU.
+3. Sender adaptors (:cpp:var:`pika::cuda::experimental::then_with_stream` etc.). A few
+   special-purpose sender adaptors, as well as customizations of a few ``std::execution`` adaptors,
+   are provided to help schedule different types of work on a GPU.
+4. Polling of CUDA events integrated into the pika scheduling loop
+   (:cpp:class:`pika::cuda::experimental::enable_user_polling`). This integration is essential to
+   avoid calling e.g. ``cudaStreamSynchronize`` on a pika task, which would block the underlying
+   worker thread and thus block progress of other work.
+
+The following example gives an overview of using the above CUDA functionalities in pika:
+
+.. literalinclude:: ../examples/documentation/cuda_overview_documentation.cu
+   :language: c++
+   :start-at: #include
+
+While :cpp:class:`pika::cuda::experimental::cuda_pool` gives direct access to streams and handles,
+the recommended way to access them is through the
+:cpp:class:`pika::cuda::experimental::cuda_scheduler` and the sender adaptors available below.
+
+.. doxygenclass:: pika::cuda::experimental::cuda_scheduler
+.. doxygenstruct:: pika::cuda::experimental::then_with_stream_t
+.. doxygenvariable:: pika::cuda::experimental::then_with_stream
+
+.. literalinclude:: ../examples/documentation/then_with_stream_documentation.cu
+   :language: c++
+   :start-at: #include
+
+.. doxygenstruct:: pika::cuda::experimental::then_with_cublas_t
+.. doxygenvariable:: pika::cuda::experimental::then_with_cublas
+
+.. literalinclude:: ../examples/documentation/then_with_cublas_documentation.cu
+   :language: c++
+   :start-at: #include
+
+.. doxygenstruct:: pika::cuda::experimental::then_with_cusolver_t
+.. doxygenvariable:: pika::cuda::experimental::then_with_cusolver
+
+See :cpp:var:`pika::cuda::experimental::then_with_cublas` for an example of what can be done with
+:cpp:var:`pika::cuda::experimental::then_with_cusolver`. The interfaces are identical except for the
+type of handle passed to the callable.
+
+.. doxygenclass:: pika::cuda::experimental::enable_user_polling
+.. doxygenclass:: pika::cuda::experimental::cuda_pool
+.. doxygenclass:: pika::cuda::experimental::cuda_stream
+.. doxygenclass:: pika::cuda::experimental::cublas_handle
+.. doxygenclass:: pika::cuda::experimental::locked_cublas_handle
+.. doxygenclass:: pika::cuda::experimental::cusolver_handle
+.. doxygenclass:: pika::cuda::experimental::locked_cusolver_handle
diff --git a/docs/usage.rst b/docs/usage.rst
index dffac840b..f3fd2a8c8 100644
--- a/docs/usage.rst
+++ b/docs/usage.rst
@@ -67,6 +67,7 @@ pika optionally depends on:
   ``nvc++``.
 * `HIP `__ 5.2.0 or greater. HIP support can be enabled with
   ``PIKA_WITH_HIP=ON``.
+* `whip <https://github.com/eth-cscs/whip>`__ when CUDA or HIP support is enabled.
 * `MPI `__. MPI support can be enabled with ``PIKA_WITH_MPI=ON``.
 * `Boost.Context `__ on macOS or exotic platforms which are not
   supported by the default user-level thread implementations in pika. This can be enabled with
diff --git a/examples/documentation/CMakeLists.txt b/examples/documentation/CMakeLists.txt
index 31a3e0771..1f443339c 100644
--- a/examples/documentation/CMakeLists.txt
+++ b/examples/documentation/CMakeLists.txt
@@ -15,8 +15,24 @@ set(example_programs
   when_all_vector_documentation
 )
 
+if(PIKA_WITH_GPU_SUPPORT)
+  list(APPEND example_programs cuda_overview_documentation then_with_cublas_documentation
+       then_with_stream_documentation
+  )
+  set(cuda_overview_documentation_GPU ON)
+  set(then_with_stream_documentation_GPU ON)
+  set(then_with_cublas_documentation_GPU ON)
+  set(cuda_overview_documentation_PARAMETERS RUN_SERIAL)
+  set(then_with_stream_documentation_PARAMETERS RUN_SERIAL)
+  set(then_with_cublas_documentation_PARAMETERS RUN_SERIAL)
+endif()
+
 foreach(example_program ${example_programs})
-  set(sources ${example_program}.cpp)
+  if(${${example_program}_GPU})
+    set(sources ${example_program}.cu)
+  else()
+    set(sources ${example_program}.cpp)
+  endif()
 
   source_group("Source Files" FILES ${sources})
diff --git a/examples/documentation/cuda_overview_documentation.cu b/examples/documentation/cuda_overview_documentation.cu
new file mode 100644
index 000000000..1c552df3c
--- /dev/null
+++ b/examples/documentation/cuda_overview_documentation.cu
@@ -0,0 +1,59 @@
+// Copyright (c) 2024 ETH Zurich
+//
+// SPDX-License-Identifier: BSL-1.0
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <pika/cuda.hpp>
+#include <pika/execution.hpp>
+#include <pika/init.hpp>
+
+#include <whip.hpp>
+
+#include <cstdio>
+#include <utility>
+
+__global__ void kernel()
+{
+    printf(
+        "Hello from kernel! threadIdx.x: %d\n", static_cast<int>(threadIdx.x));
+}
+
+int main(int argc, char* argv[])
+{
+    namespace cu = pika::cuda::experimental;
+    namespace ex = pika::execution::experimental;
+    namespace tt = pika::this_thread::experimental;
+
+    pika::start(argc, argv);
+    ex::thread_pool_scheduler cpu_sched{};
+
+    // Create a pool of CUDA streams and cuBLAS/SOLVER handles, and a scheduler
+    // that uses the pool.
+    cu::cuda_pool pool{};
+    cu::cuda_scheduler cuda_sched{pool};
+
+    {
+        // Enable polling of CUDA events on the default pool. This is required
+        // to allow the adaptors below to signal completion of kernels.
+        cu::enable_user_polling p{};
+
+        // The work created by the adaptors below will all be scheduled on the
+        // same stream from the pool since the work is sequential.
+        //
+        // Note that error checking is omitted below.
+        auto s = ex::just(42) | ex::continues_on(cuda_sched) |
+            // CUDA kernel through a lambda.
+            ex::then([](int x) { printf("Hello from the GPU! x: %d\n", x); }) |
+            // Explicitly launch a CUDA kernel with a stream (see
+            // https://github.com/eth-cscs/whip for details about whip)
+            cu::then_with_stream(
+                [](whip::stream_t stream) { kernel<<<1, 32, 0, stream>>>(); });
+        tt::sync_wait(std::move(s));
+    }
+
+    pika::finalize();
+    pika::stop();
+
+    return 0;
+}
diff --git a/examples/documentation/then_with_cublas_documentation.cu b/examples/documentation/then_with_cublas_documentation.cu
new file mode 100644
index 000000000..ececa311c
--- /dev/null
+++ b/examples/documentation/then_with_cublas_documentation.cu
@@ -0,0 +1,126 @@
+// Copyright (c) 2024 ETH Zurich
+//
+// SPDX-License-Identifier: BSL-1.0
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <pika/cuda.hpp>
+#include <pika/execution.hpp>
+#include <pika/init.hpp>
+
+#include
+#include <whip.hpp>
+
+#include <cstddef>
+#include <tuple>
+#include <utility>
+
+#if defined(PIKA_HAVE_CUDA)
+# include <cublas_v2.h>
+using blas_handle_t = cublasHandle_t;
+auto* blas_gemm = &cublasDgemm;
+auto blas_pointer_mode = CUBLAS_POINTER_MODE_HOST;
+auto blas_op_n = CUBLAS_OP_N;
+#elif defined(PIKA_HAVE_HIP)
+# include
+using blas_handle_t = rocblas_handle;
+# define CUBLAS_POINTER_MODE_HOST rocblas_pointer_mode_host
+auto* blas_gemm = &rocblas_dgemm;
+auto blas_pointer_mode = rocblas_pointer_mode_host;
+auto blas_op_n = rocblas_operation_none;
+#endif
+
+// Owning wrapper for GPU-allocated memory.
+class gpu_data
+{
+    double* p{nullptr};
+    std::size_t n{0};
+
+public:
+    // Note that blocking functions such as cudaMalloc will block the underlying
+    // operating system thread instead of yielding the pika task. Consider using
+    // e.g. a pool of GPU memory to avoid blocking the thread for too long.
+    gpu_data(std::size_t n)
+      : n(n)
+    {
+        whip::malloc(&p, sizeof(double) * n);
+    }
+    gpu_data(gpu_data&& other) noexcept
+      : p(std::exchange(other.p, nullptr))
+      , n(std::exchange(other.n, 0))
+    {
+    }
+    gpu_data& operator=(gpu_data&& other) noexcept
+    {
+        p = std::exchange(other.p, nullptr);
+        n = std::exchange(other.n, 0);
+        return *this;
+    }
+    gpu_data(gpu_data const&) = delete;
+    gpu_data& operator=(gpu_data const&) = delete;
+    ~gpu_data() { whip::free(p); }
+
+    std::size_t size() const { return n; }
+    double* get() const { return p; }
+};
+
+__global__ void init(double* a, double* b, double* c, std::size_t n)
+{
+    std::size_t i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i < n)
+    {
+        a[i] = 1.0;
+        b[i] = 2.0;
+        c[i] = 3.0;
+    }
+}
+
+int main(int argc, char* argv[])
+{
+    namespace cu = pika::cuda::experimental;
+    namespace ex = pika::execution::experimental;
+    namespace tt = pika::this_thread::experimental;
+
+    pika::start(argc, argv);
+    ex::thread_pool_scheduler cpu_sched{};
+    cu::cuda_pool pool{};
+    cu::cuda_scheduler cuda_sched{pool};
+
+    {
+        cu::enable_user_polling p{};
+
+        constexpr std::size_t n = 2048;
+        gpu_data a{n * n};
+        gpu_data b{n * n};
+        gpu_data c{n * n};
+        double alpha = 1.0;
+        double beta = 1.0;
+
+        auto s = ex::just(std::move(a), std::move(b), std::move(c)) |
+            ex::continues_on(cuda_sched) |
+            cu::then_with_stream(
+                [](auto& a, auto& b, auto& c, whip::stream_t stream) {
+                    init<<<n * n / 256 + 1, 256, 0, stream>>>(
+                        a.get(), b.get(), c.get(), n * n);
+                    return std::make_tuple(
+                        std::move(a), std::move(b), std::move(c));
+                }) |
+            ex::unpack() |
+            // a, b, and c will be kept alive by the then_with_cublas operation
+            // state at least until the GPU kernels complete. Values sent by
+            // the predecessor sender are passed as the last arguments after the
+            // handle.
+            cu::then_with_cublas(
+                [&](blas_handle_t handle, auto& a, auto& b, auto& c) {
+                    blas_gemm(handle, blas_op_n, blas_op_n, n, n, n, &alpha,
+                        a.get(), n, b.get(), n, &beta, c.get(), n);
+                },
+                blas_pointer_mode);
+        tt::sync_wait(std::move(s));
+    }
+
+    pika::finalize();
+    pika::stop();
+
+    return 0;
+}
diff --git a/examples/documentation/then_with_stream_documentation.cu b/examples/documentation/then_with_stream_documentation.cu
new file mode 100644
index 000000000..432b079ff
--- /dev/null
+++ b/examples/documentation/then_with_stream_documentation.cu
@@ -0,0 +1,70 @@
+// Copyright (c) 2024 ETH Zurich
+//
+// SPDX-License-Identifier: BSL-1.0
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <pika/cuda.hpp>
+#include <pika/execution.hpp>
+#include <pika/init.hpp>
+
+#include <whip.hpp>
+
+#include <cstddef>
+#include <cstdio>
+#include <utility>
+
+__global__ void kernel(int* p, int offset)
+{
+    printf(
+        "Hello from kernel! threadIdx.x: %d\n", static_cast<int>(threadIdx.x));
+    p[threadIdx.x] = threadIdx.x * threadIdx.x + offset;
+}
+
+int main(int argc, char* argv[])
+{
+    namespace cu = pika::cuda::experimental;
+    namespace ex = pika::execution::experimental;
+    namespace tt = pika::this_thread::experimental;
+
+    pika::start(argc, argv);
+    ex::thread_pool_scheduler cpu_sched{};
+    cu::cuda_pool pool{};
+    cu::cuda_scheduler cuda_sched{pool};
+
+    {
+        cu::enable_user_polling p{};
+
+        constexpr std::size_t n = 32;
+        int* a = nullptr;
+
+        // whip::malloc_async wraps cudaMallocAsync/hipMallocAsync. Using the
+        // sender adaptors, the allocation, work, and deallocation can all be
+        // scheduled onto the same stream.
+        auto s = ex::just(&a, n * sizeof(int)) | ex::continues_on(cuda_sched) |
+            cu::then_with_stream(whip::malloc_async) |
+            // The then_with_stream callable accepts values sent by the
+            // predecessor. They will be passed by reference before the stream.
+            // This allows e.g. whip::malloc_async to be used above with values
+            // sent by the just sender. The values are passed by reference and
+            // will be kept alive until the work submitted to the stream is done.
+            cu::then_with_stream(
+                [&a](
+                    /* other values by reference here */ whip::stream_t
+                        stream) {
+                    kernel<<<1, n, 0, stream>>>(a, 17);
+                    // Even though the function returns here, the sync_wait below
+                    // will wait for the kernel to finish. Values returned are
+                    // passed on to continuations.
+                    return a;
+                }) |
+            cu::then_with_stream(whip::free_async);
+
+        tt::sync_wait(std::move(s));
+    }
+
+    pika::finalize();
+    pika::stop();
+
+    return 0;
+}
diff --git a/libs/pika/async_cuda/include/pika/async_cuda/cuda_polling_helper.hpp b/libs/pika/async_cuda/include/pika/async_cuda/cuda_polling_helper.hpp
index 88679f593..10def061d 100644
--- a/libs/pika/async_cuda/include/pika/async_cuda/cuda_polling_helper.hpp
+++ b/libs/pika/async_cuda/include/pika/async_cuda/cuda_polling_helper.hpp
@@ -21,10 +21,21 @@ namespace pika::cuda::experimental {
     PIKA_EXPORT std::string const& get_pool_name();
     PIKA_EXPORT void set_pool_name(std::string const&);
 
-    // -----------------------------------------------------------------
-    // This RAII helper class enables polling for a scoped block
-    struct [[nodiscard]] enable_user_polling
+    /// \brief Enable CUDA polling on the given thread pool.
+    ///
+    /// RAII helper class to enable and disable polling of CUDA events on the given pool. Enabling
+    /// polling is a requirement to signal completion of work submitted to the \ref cuda_scheduler.
+ /// + /// There is no detection of whether polling is already enabled or disabled, or if \ref + /// enable_user_polling is nested. The constructor and destructor will unconditionally register + /// and unregister polling, respectively. + class [[nodiscard]] enable_user_polling { + public: + /// \brief Start polling for CUDA events on the given thread pool. + /// + /// \param pool_name The name of the thread pool to enable polling on. The default is to use + /// the default thread pool. enable_user_polling(std::string const& pool_name = "") : pool_name_(pool_name) { @@ -41,6 +52,10 @@ namespace pika::cuda::experimental { } } + /// \brief Stop polling for CUDA events. + /// + /// The destructor will not wait for work submitted to a \ref cuda_scheduler to complete. + /// The user must ensure that work completes before disabling polling. ~enable_user_polling() { if (pool_name_.empty()) diff --git a/libs/pika/async_cuda/include/pika/async_cuda/cuda_pool.hpp b/libs/pika/async_cuda/include/pika/async_cuda/cuda_pool.hpp index bff585d84..c794d2b85 100644 --- a/libs/pika/async_cuda/include/pika/async_cuda/cuda_pool.hpp +++ b/libs/pika/async_cuda/include/pika/async_cuda/cuda_pool.hpp @@ -22,6 +22,13 @@ #include namespace pika::cuda::experimental { + /// \brief A locked cuBLAS handle. + /// + /// A handle that provides thread-safe access to a \ref cublas_handle. The locked handle is + /// immovable. + /// + /// \note The recommended way to access a handle is through sender adaptors using \ref + /// cuda_scheduler. class locked_cublas_handle { cublas_handle& handle; @@ -35,9 +42,20 @@ namespace pika::cuda::experimental { locked_cublas_handle& operator=(locked_cublas_handle&&) = delete; locked_cublas_handle& operator=(locked_cublas_handle const&) = delete; + /// \brief Access the underlying cuBLAS handle. + /// + /// \return a reference to the \ref cublas_handle. The returned handle is not thread-safe + /// and must be used within the lifetime of the \ref locked_cublas_handle. PIKA_EXPORT cublas_handle const& get() noexcept; }; + /// \brief A locked cuSOLVER handle. + /// + /// A handle that provides thread-safe access to a \ref cusolver_handle. The locked handle is + /// immovable. + /// + /// \note The recommended way to access a handle is through sender adaptors using \ref + /// cuda_scheduler. class locked_cusolver_handle { cusolver_handle& handle; @@ -51,15 +69,26 @@ namespace pika::cuda::experimental { locked_cusolver_handle& operator=(locked_cusolver_handle&&) = delete; locked_cusolver_handle& operator=(locked_cusolver_handle const&) = delete; + /// \brief Access the underlying cuSOLVER handle. + /// + /// \return a reference to the \ref cusolver_handle. The returned handle is not thread-safe + /// and must be used within the lifetime of the \ref locked_cusolver_handle. PIKA_EXPORT cusolver_handle const& get() noexcept; }; - /// A pool of CUDA streams, used for scheduling work on a CUDA device. + /// \brief A pool of CUDA streams, used for scheduling work on a CUDA device. + /// + /// The pool initializes a set of CUDA streams on construction and provides access to the + /// streams in a round-robin fashion. The pool also gives access to cuBLAS and cuSOLVER handles. + /// + /// The pool is movable and copyable with reference semantics. Copies of a pool still refer to + /// the original pool of streams. A moved-from pool can't be used, except to check if it is + /// valid with \ref valid(). 
/// - /// The pool initializes a set of CUDA streams on construction and - /// provides access to the streams in a round-robin fashion. The pool is - /// movable and copyable with reference semantics. Copies of a pool still - /// refer to the original pool of streams. + /// The pool is equality comparable and formattable. + /// + /// \note The recommended way to access streams and handles from the \ref cuda_pool is through + /// sender adaptors using \ref cuda_scheduler. class cuda_pool { private: @@ -129,9 +158,34 @@ namespace pika::cuda::experimental { std::shared_ptr data; public: + /// \brief Construct a pool of CUDA streams and handles. + /// + /// \param device the CUDA device used for scheduling work + /// \param num_normal_priority_streams the number of normal priority streams + /// \param num_high_priority_streams the number of high priority streams + /// \param flags flags used to construct CUDA streams + /// \param num_cublas_handles the number of cuBLAS handles to create for the whole pool + /// \param num_cusolver_handles the number of cuSOLVER handles to create for the whole pool + /// + /// \note The default values of \p num_normal_priority_streams, \p + /// num_high_priority_streams, \p num_cublas_handles, and \p num_cusolver_handles have been + /// chosen to easily allow saturating most GPUs without creating unnecessarily many streams. + /// In individual situations more streams (e.g. launching many small kernels) or fewer + /// streams (e.g. the GPU does not support more concurrency, or [slows down when using too + /// many streams](https://github.com/ROCm/HIP/issues/3366)) may be more appropriate. Each + /// cuBLAS and cuSOLVER handle may require a significant amount of GPU memory, which is why + /// the default values are lower than the number of streams. The default values have proven + /// to work well e.g. in [DLA-Future](https://github.com/eth-cscs/DLA-Future). + /// + /// \warning Up to and including version 0.30.X the number of streams parameters denoted the + /// number of streams *per worker thread*. From 0.31.0 onwards the parameters denote the + /// total number of streams to create in the pool. The default values were adjusted + /// accordingly, but if you are not using the default values, please verify the values you + /// are passing to the \ref cuda_pool constructor are still reasonable with 0.31.0. PIKA_EXPORT explicit cuda_pool(int device = 0, std::size_t num_normal_priority_streams = 32, std::size_t num_high_priority_streams = 32, unsigned int flags = 0, std::size_t num_cublas_handles = 16, std::size_t num_cusolver_handles = 16); + PIKA_NVCC_PRAGMA_HD_WARNING_DISABLE cuda_pool(cuda_pool&&) = default; PIKA_NVCC_PRAGMA_HD_WARNING_DISABLE @@ -141,12 +195,42 @@ namespace pika::cuda::experimental { PIKA_NVCC_PRAGMA_HD_WARNING_DISABLE cuda_pool& operator=(cuda_pool const&) = default; + /// \brief Check if the pool is valid. + /// + /// \return true if the pool refers to a valid pool, false otherwise (e.g. if the pool has + /// been moved out from). PIKA_EXPORT bool valid() const noexcept; + + /// \brief Check if the pool is valid. + /// + /// See \ref valid(). PIKA_EXPORT explicit operator bool() noexcept; + + /// \brief Get a reference to the next CUDA stream. + /// + /// \note The recommended way to access a stream is through a \ref cuda_scheduler. + /// + /// \param priority the priority of the stream. 
PIKA_EXPORT cuda_stream const& get_next_stream( pika::execution::thread_priority priority = pika::execution::thread_priority::normal); + + /// \brief Get a locked cuBLAS handle. + /// + /// \note The recommended way to access a handle is through a \ref cuda_scheduler. + /// + /// \param stream the CUDA stream to use with the cuBLAS handle. + /// + /// \return a locked cuBLAS handle, which is released for reuse on destruction. PIKA_EXPORT locked_cublas_handle get_cublas_handle( cuda_stream const& stream, cublasPointerMode_t pointer_mode); + + /// \brief Get a locked cuSOLVER handle. + /// + /// \note The recommended way to access a handle is through a \ref cuda_scheduler. + /// + /// \param stream the CUDA stream to use with the cuSOLVER handle. + /// + /// \return a locked cuSOLVER handle, which is released for reuse on destruction. PIKA_EXPORT locked_cusolver_handle get_cusolver_handle(cuda_stream const& stream); /// \cond NOINTERNAL diff --git a/libs/pika/async_cuda/include/pika/async_cuda/cuda_scheduler.hpp b/libs/pika/async_cuda/include/pika/async_cuda/cuda_scheduler.hpp index 9b3ecf39d..003e9b2f1 100644 --- a/libs/pika/async_cuda/include/pika/async_cuda/cuda_scheduler.hpp +++ b/libs/pika/async_cuda/include/pika/async_cuda/cuda_scheduler.hpp @@ -23,8 +23,17 @@ namespace pika::cuda::experimental { /// A scheduler for running work on a CUDA pool. /// - /// Provides access to scheduling work on a CUDA context represented by a - /// cuda_pool. + /// Provides access to scheduling work on a CUDA context represented by a \ref cuda_pool. Models + /// the [std::execution scheduler concept](https://eel.is/c++draft/exec.sched). + /// + /// Move and copy constructible. The scheduler has reference semantics with respect to the + /// associated CUDA pool. + /// + /// Equality comparable. + /// + /// \note The recommended way to access streams and handles from the \ref cuda_pool is through + /// the sender adaptors \ref then_with_stream, \ref then_with_cublas, and \ref + /// then_with_cusolver. class cuda_scheduler { private: @@ -32,18 +41,25 @@ namespace pika::cuda::experimental { pika::execution::thread_priority priority; public: - PIKA_EXPORT - cuda_scheduler(cuda_pool pool); + /// \brief Constructs a new \ref cuda_scheduler using the given \ref cuda_pool. + PIKA_EXPORT explicit cuda_scheduler(cuda_pool pool); cuda_scheduler(cuda_scheduler&&) = default; cuda_scheduler(cuda_scheduler const&) = default; cuda_scheduler& operator=(cuda_scheduler&&) = default; cuda_scheduler& operator=(cuda_scheduler const&) = default; ~cuda_scheduler(){}; + /// \brief Return the \ref cuda_pool associated with this scheduler. PIKA_EXPORT cuda_pool const& get_pool() const noexcept; + + /// \brief Return the next available CUDA stream from the pool. PIKA_EXPORT cuda_stream const& get_next_stream(); + + /// \brief Return the next available cuBLAS handle from the pool. PIKA_EXPORT locked_cublas_handle get_cublas_handle( cuda_stream const& stream, cublasPointerMode_t pointer_mode); + + /// \brief Return the next available cuSOLVER handle from the pool. 
         PIKA_EXPORT locked_cusolver_handle get_cusolver_handle(cuda_stream const& stream);
 
         /// \cond NOINTERNAL
@@ -56,7 +72,6 @@
         {
             return !(lhs == rhs);
         }
-        /// \endcond
 
         friend cuda_scheduler tag_invoke(pika::execution::experimental::with_priority_t,
             cuda_scheduler const& scheduler, pika::execution::thread_priority priority)
@@ -71,6 +86,7 @@
         {
             return scheduler.priority;
         }
+        /// \endcond
     };
 
     namespace detail {
diff --git a/libs/pika/async_cuda/include/pika/async_cuda/then_with_stream.hpp b/libs/pika/async_cuda/include/pika/async_cuda/then_with_stream.hpp
index b1940dfd3..c0f491971 100644
--- a/libs/pika/async_cuda/include/pika/async_cuda/then_with_stream.hpp
+++ b/libs/pika/async_cuda/include/pika/async_cuda/then_with_stream.hpp
@@ -595,16 +595,14 @@ namespace pika::cuda::experimental {
     // - this operation can only be used when the predecessor sender has
     //   cuda_scheduler as its completion scheduler
 
-    /// Attach a continuation to run f with an additional CUDA stream.
-    ///
-    /// Attaches a continuation to the given sender which will call f with the
-    /// arguments sent by the given sender with an additional cudaStream_t
-    /// argument as the last argument. This can only be called on a sender with
-    /// a completion scheduler that is cuda_scheduler. f does not have exclusive
-    /// access to the given stream and other calls may reuse the same stream
-    /// concurrently.
-    inline constexpr struct then_with_stream_t final
+    /// \brief The type of the \ref then_with_stream sender adaptor.
+    struct then_with_stream_t final
     {
+        /// \brief Create a \ref then_with_stream sender.
+        ///
+        /// \param sender The predecessor sender.
+        /// \param f Callable that will be passed a \p cudaStream_t as the last argument. Values
+        /// from \p sender are passed as references.
         template <typename Sender, typename F>
         constexpr PIKA_FORCEINLINE auto PIKA_STATIC_CALL_OPERATOR(Sender&& sender, F&& f)
         {
@@ -612,24 +610,36 @@ namespace pika::cuda::experimental {
                 then_with_stream_detail::cuda_stream_callable{std::forward<F>(f)});
         }
 
+        /// \brief Partially bound sender. Expects a sender to be supplied later.
         template <typename F>
        constexpr PIKA_FORCEINLINE auto PIKA_STATIC_CALL_OPERATOR(F&& f)
         {
             return pika::execution::experimental::detail::partial_algorithm{
                 std::forward<F>(f)};
         }
-    } then_with_stream{};
+    };
 
-    /// Attach a continuation to run f with an additional cuBLAS handle.
+    /// \brief Sender adaptor which calls \p f with a CUDA stream.
     ///
-    /// Attaches a continuation to the given sender which will call f with the
-    /// arguments sent by the given sender with an additional cublasHandle_t
-    /// argument as the first argument. This can only be called on a sender with
-    /// a completion scheduler that is cuda_scheduler. The handle is
-    /// thread-local and f may not yield a pika thread until after the handle
-    /// has been used the last time by f.
-    inline constexpr struct then_with_cublas_t final
+    /// When the predecessor sender completes, calls \p f with a CUDA stream as the last argument
+    /// after other values sent by the predecessor sender. This adaptor can only be used when the
+    /// completion scheduler is a \ref cuda_scheduler. Other work may be scheduled concurrently on
+    /// the stream passed to \p f. Values sent by the predecessor sender are passed as references to
+    /// \p f and kept alive until the work submitted by \p f to the stream is completed. \p f may
+    /// return as soon as work has been submitted, and a connected receiver will be signaled only
+    /// once the kernels submitted to the stream have completed.
+    inline constexpr then_with_stream_t then_with_stream{};
+
+    /// \brief The type of the \ref then_with_cublas sender adaptor.
+    struct then_with_cublas_t final
     {
+        /// \brief Create a \ref then_with_cublas sender.
+        ///
+        /// \param sender The predecessor sender.
+        /// \param f Callable that will be passed a \p cublasHandle_t as the first argument. Values
+        /// from \p sender are passed as references.
+        /// \param pointer_mode The \p cublasPointerMode_t used for the internal cuBLAS handle, or
+        /// the equivalent for rocBLAS.
         template <typename Sender, typename F>
         constexpr PIKA_FORCEINLINE auto
         PIKA_STATIC_CALL_OPERATOR(Sender&& sender, F&& f, cublasPointerMode_t pointer_mode)
         {
@@ -639,6 +649,7 @@
                     std::forward<F>(f), pointer_mode});
         }
 
+        /// \brief Partially bound sender. Expects a sender to be supplied later.
         template <typename F>
         constexpr PIKA_FORCEINLINE auto
         PIKA_STATIC_CALL_OPERATOR(F&& f, cublasPointerMode_t pointer_mode)
         {
             return pika::execution::experimental::detail::partial_algorithm{std::forward<F>(f), pointer_mode};
         }
-    } then_with_cublas{};
+    };
 
-    /// Attach a continuation to run f with an additional cuSOLVER handle.
+    /// \brief Sender adaptor which calls \p f with a cuBLAS handle.
+    ///
-    /// Attaches a continuation to the given sender which will call f with the
-    /// arguments sent by the given sender with an additional cusolverDnHandle_t
-    /// argument as the first argument. This can only be called on a sender with
-    /// a completion scheduler that is cuda_scheduler. The handle is
-    /// thread-local and f may not yield a pika thread until after the handle
-    /// has been used the last time by f.
-    inline constexpr struct then_with_cusolver_t final
+    /// This sender is intended to be used to submit work using a cuBLAS handle. The stream
+    /// associated with the handle may also be used to submit work. The handle is accessed through a
+    /// \ref locked_cublas_handle and \p f should return as quickly as possible to avoid blocking
+    /// other work from using the handle.
+    ///
+    /// The behaviour of synchronization and lifetimes is the same as for \ref then_with_stream,
+    /// except that the handle is passed as the first argument to match the typical function
+    /// signatures of cuBLAS functions.
+    inline constexpr then_with_cublas_t then_with_cublas{};
+
+    /// \brief The type of the \ref then_with_cusolver sender adaptor.
+    struct then_with_cusolver_t final
     {
+        /// \param sender The predecessor sender.
+        /// \param f Callable that will be passed a \p cusolverDnHandle_t as the first argument.
+        /// Values from \p sender are passed as references.
         template <typename Sender, typename F>
         constexpr PIKA_FORCEINLINE auto PIKA_STATIC_CALL_OPERATOR(Sender&& sender, F&& f)
         {
@@ -665,11 +684,24 @@
                 then_with_stream_detail::cusolver_handle_callable{std::forward<F>(f)});
         }
 
+        /// \brief Partially bound sender. Expects a sender to be supplied later.
         template <typename F>
         constexpr PIKA_FORCEINLINE auto PIKA_STATIC_CALL_OPERATOR(F&& f)
         {
             return pika::execution::experimental::detail::partial_algorithm{std::forward<F>(f)};
         }
-    } then_with_cusolver{};
+    };
+
+    /// \brief Sender adaptor which calls \p f with a cuSOLVER handle.
+    ///
+    /// This sender is intended to be used to submit work using a cuSOLVER handle. The stream
+    /// associated with the handle may also be used to submit work. The handle is accessed through a
+    /// \ref locked_cusolver_handle and \p f should return as quickly as possible to avoid blocking
+    /// other work from using the handle.
+    ///
+    /// The behaviour of synchronization and lifetimes is the same as for \ref then_with_stream,
+    /// except that the handle is passed as the first argument to match the typical function
+    /// signatures of cuSOLVER functions.
+    inline constexpr then_with_cusolver_t then_with_cusolver{};
 } // namespace pika::cuda::experimental
diff --git a/libs/pika/async_cuda_base/include/pika/async_cuda_base/cublas_handle.hpp b/libs/pika/async_cuda_base/include/pika/async_cuda_base/cublas_handle.hpp
index be6ff9590..d782210aa 100644
--- a/libs/pika/async_cuda_base/include/pika/async_cuda_base/cublas_handle.hpp
+++ b/libs/pika/async_cuda_base/include/pika/async_cuda_base/cublas_handle.hpp
@@ -16,7 +16,18 @@
 #include
 
 namespace pika::cuda::experimental {
-    /// RAII wrapper for a cuBLAS handle.
+    /// \brief RAII wrapper for a cuBLAS handle.
+    ///
+    /// An RAII wrapper for a cuBLAS handle which creates a handle on construction and destroys it
+    /// on destruction.
+    ///
+    /// The wrapper is movable and copyable. A moved-from handle cannot be used other than to check
+    /// for validity with \ref valid(). A copied handle uses the properties from the given handle
+    /// and creates a new handle.
+    ///
+    /// Equality comparable and formattable.
+    ///
+    /// \note The recommended way to access a handle is through a \ref cuda_scheduler.
     class cublas_handle
     {
     private:
@@ -27,7 +38,10 @@
         static PIKA_EXPORT cublasHandle_t create_handle(int device, whip::stream_t stream);
 
     public:
+        /// \brief Constructs a new cuBLAS handle with the default stream.
         PIKA_EXPORT cublas_handle();
+
+        /// \brief Constructs a new cuBLAS handle with the given stream.
         PIKA_EXPORT explicit cublas_handle(cuda_stream const& stream);
         PIKA_EXPORT ~cublas_handle();
         PIKA_EXPORT cublas_handle(cublas_handle&&) noexcept;
@@ -35,11 +49,30 @@
         PIKA_EXPORT cublas_handle(cublas_handle const&);
         PIKA_EXPORT cublas_handle& operator=(cublas_handle const&);
 
+        /// \brief Check if the handle is valid.
+        ///
+        /// \return true if the handle refers to a valid handle, false otherwise (e.g. if the handle
+        /// has been moved out from, or it has been default-constructed).
+        PIKA_EXPORT bool valid() const noexcept;
+
+        /// \brief Check if the handle is valid.
+        ///
+        /// See \ref valid().
+        PIKA_EXPORT explicit operator bool() const noexcept;
+
+        /// \brief Get the underlying cuBLAS handle.
         PIKA_EXPORT cublasHandle_t get() const noexcept;
+
+        /// \brief Get the device associated with the stream of the cuBLAS handle.
         PIKA_EXPORT int get_device() const noexcept;
+
+        /// \brief Get the stream associated with the cuBLAS handle.
         PIKA_EXPORT whip::stream_t get_stream() const noexcept;
 
+        /// \brief Set the stream associated with the cuBLAS handle.
         PIKA_EXPORT void set_stream(cuda_stream const& stream);
+
+        /// \brief Set the cuBLAS pointer mode of the handle.
         PIKA_EXPORT void set_pointer_mode(cublasPointerMode_t pointer_mode);
 
         /// \cond NOINTERNAL
diff --git a/libs/pika/async_cuda_base/include/pika/async_cuda_base/cuda_stream.hpp b/libs/pika/async_cuda_base/include/pika/async_cuda_base/cuda_stream.hpp
index 5d5220b18..507c872b1 100644
--- a/libs/pika/async_cuda_base/include/pika/async_cuda_base/cuda_stream.hpp
+++ b/libs/pika/async_cuda_base/include/pika/async_cuda_base/cuda_stream.hpp
@@ -15,12 +15,19 @@
 #include
 
 namespace pika::cuda::experimental {
-    /// RAII wrapper for a CUDA stream.
+    /// \brief RAII wrapper for a CUDA stream.
     ///
-    /// An RAII wrapper for a CUDA stream which creates a stream on construction
-    /// and destroys it on destruction. Is movable and copiable. A moved-from
-    /// stream holds the default stream. A copied stream uses the properties
-    /// from the given stream and creates a new stream.
+    /// An RAII wrapper for a CUDA stream which creates a stream on construction and destroys it on
+    /// destruction. It is movable and copyable. A moved-from stream holds the default stream. A
+    /// copied stream uses the properties from the given stream and creates a new stream.
+    ///
+    /// Equality comparable and formattable.
+    ///
+    /// When accessing the underlying stream, [whip](https://github.com/eth-cscs/whip) is used for
+    /// compatibility with CUDA and HIP.
+    ///
+    /// \note The recommended way to access a stream is through sender adaptors using \ref
+    /// cuda_scheduler.
     class cuda_stream
     {
     private:
@@ -40,6 +47,13 @@
             int device, pika::execution::thread_priority priority, unsigned int flags);
 
     public:
+        /// \brief Construct a new stream with the given device and priority.
+        ///
+        /// \param device The device to create the stream on.
+        /// \param priority The priority of the stream. The mapping from \ref thread_priority to
+        /// CUDA stream priorities is undefined, except that the order is preserved, allowing for
+        /// different \ref thread_priority to map to the same CUDA priority.
+        /// \param flags Flags to pass to the CUDA stream creation.
         PIKA_EXPORT explicit cuda_stream(int device = 0,
             pika::execution::thread_priority priority = pika::execution::thread_priority::default_,
             unsigned int flags = 0);
@@ -49,9 +63,18 @@
         PIKA_EXPORT cuda_stream(cuda_stream const&);
         PIKA_EXPORT cuda_stream& operator=(cuda_stream const&);
 
+        /// \brief Get the underlying stream.
+        ///
+        /// The stream is still owned by the \ref cuda_stream and must not be manually released.
         PIKA_EXPORT whip::stream_t get() const noexcept;
+
+        /// \brief Get the device of the stream.
        PIKA_EXPORT int get_device() const noexcept;
+
+        /// \brief Get the priority of the stream.
         PIKA_EXPORT pika::execution::thread_priority get_priority() const noexcept;
+
+        /// \brief Get the flags of the stream.
         PIKA_EXPORT unsigned int get_flags() const noexcept;
 
         /// \cond NOINTERNAL
diff --git a/libs/pika/async_cuda_base/include/pika/async_cuda_base/cusolver_handle.hpp b/libs/pika/async_cuda_base/include/pika/async_cuda_base/cusolver_handle.hpp
index bbd6a1883..5bea99f82 100644
--- a/libs/pika/async_cuda_base/include/pika/async_cuda_base/cusolver_handle.hpp
+++ b/libs/pika/async_cuda_base/include/pika/async_cuda_base/cusolver_handle.hpp
@@ -7,17 +7,28 @@
 #pragma once
 
 #include
-#if defined(PIKA_HAVE_GPU_SUPPORT)
-# include
-# include
+#include
+#include
 
-# include
-# include
+#include
+#include
 
-# include
+#include
 
 namespace pika::cuda::experimental {
-    /// RAII wrapper for a cuSOLVER handle.
+    /// \brief RAII wrapper for a cuSOLVER handle.
+    ///
+    /// An RAII wrapper for a cuSOLVER handle which creates a handle on construction and destroys it
+    /// on destruction.
+    ///
+    /// The wrapper is movable and copyable. A moved-from handle cannot be used other than to check
+    /// for validity with \ref valid(). A copied handle uses the properties from the given handle
+    /// and creates a new handle.
+    ///
+    /// Equality comparable and formattable.
+    ///
+    /// \note The recommended way to access a handle is through sender adaptors using \ref
+    /// cuda_scheduler.
     class cusolver_handle
     {
     private:
@@ -28,7 +39,10 @@
         static PIKA_EXPORT cusolverDnHandle_t create_handle(int device, whip::stream_t stream);
 
     public:
+        /// \brief Constructs a new cuSOLVER handle with the default stream.
         PIKA_EXPORT cusolver_handle();
+
+        /// \brief Constructs a new cuSOLVER handle with the given stream.
         PIKA_EXPORT explicit cusolver_handle(cuda_stream const& stream);
         PIKA_EXPORT ~cusolver_handle();
         PIKA_EXPORT cusolver_handle(cusolver_handle&&) noexcept;
@@ -36,10 +50,27 @@
         PIKA_EXPORT cusolver_handle(cusolver_handle const&);
         PIKA_EXPORT cusolver_handle& operator=(cusolver_handle const&);
 
+        /// \brief Check if the handle is valid.
+        ///
+        /// \return true if the handle refers to a valid handle, false otherwise (e.g. if the handle
+        /// has been moved out from, or it has been default-constructed).
+        PIKA_EXPORT bool valid() const noexcept;
+
+        /// \brief Check if the handle is valid.
+        ///
+        /// See \ref valid().
+        PIKA_EXPORT explicit operator bool() const noexcept;
+
+        /// \brief Get the underlying cuSOLVER handle.
         PIKA_EXPORT cusolverDnHandle_t get() const noexcept;
+
+        /// \brief Get the device associated with the cuSOLVER handle.
         PIKA_EXPORT int get_device() const noexcept;
+
+        /// \brief Get the stream associated with the cuSOLVER handle.
         PIKA_EXPORT whip::stream_t get_stream() const noexcept;
 
+        /// \brief Set the stream associated with the cuSOLVER handle.
         PIKA_EXPORT void set_stream(cuda_stream const& stream);
 
         /// \cond NOINTERNAL
@@ -66,4 +97,3 @@ struct fmt::formatter : fmt::formatte
             fmt::format("cusolver_handle({})", fmt::ptr(handle.get())), ctx);
     }
 };
-#endif
diff --git a/libs/pika/async_cuda_base/src/cublas_handle.cpp b/libs/pika/async_cuda_base/src/cublas_handle.cpp
index a2b51d408..a637ac9f6 100644
--- a/libs/pika/async_cuda_base/src/cublas_handle.cpp
+++ b/libs/pika/async_cuda_base/src/cublas_handle.cpp
@@ -65,7 +65,7 @@ cublas_handle::cublas_handle(cublas_handle const& other)
       : device(other.device)
       , stream(other.stream)
-      , handle(other.handle != 0 ? create_handle(device, stream) : 0)
+      , handle(other.valid() ? create_handle(device, stream) : 0)
     {
     }
 
@@ -73,20 +73,20 @@
     {
         device = other.device;
        stream = other.stream;
-        handle = other.handle != 0 ? create_handle(device, stream) : 0;
+        handle = other.valid() ?
create_handle(device, stream) : 0; return *this; } cublas_handle::~cublas_handle() { - if (handle != 0) { check_cublas_error(cublasDestroy(handle)); } + if (valid()) { check_cublas_error(cublasDestroy(handle)); } } + bool cublas_handle::valid() const noexcept { return bool(handle); } + cublas_handle::operator bool() const noexcept { return bool(handle); } cublasHandle_t cublas_handle::get() const noexcept { return handle; } - int cublas_handle::get_device() const noexcept { return device; } - whip::stream_t cublas_handle::get_stream() const noexcept { return stream; } void cublas_handle::set_stream(cuda_stream const& stream) diff --git a/libs/pika/async_cuda_base/src/cusolver_handle.cpp b/libs/pika/async_cuda_base/src/cusolver_handle.cpp index 7b54c0df1..f4c6e8384 100644 --- a/libs/pika/async_cuda_base/src/cusolver_handle.cpp +++ b/libs/pika/async_cuda_base/src/cusolver_handle.cpp @@ -65,7 +65,7 @@ namespace pika::cuda::experimental { cusolver_handle::cusolver_handle(cusolver_handle const& other) : device(other.device) , stream(other.stream) - , handle(other.handle != 0 ? create_handle(device, stream) : 0) + , handle(other.valid() ? create_handle(device, stream) : 0) { } @@ -73,20 +73,20 @@ namespace pika::cuda::experimental { { device = other.device; stream = other.stream; - handle = other.handle != 0 ? create_handle(device, stream) : 0; + handle = other.valid() ? create_handle(device, stream) : 0; return *this; } cusolver_handle::~cusolver_handle() { - if (handle != 0) { check_cusolver_error(cusolverDnDestroy(handle)); } + if (valid()) { check_cusolver_error(cusolverDnDestroy(handle)); } } + bool cusolver_handle::valid() const noexcept { return bool(handle); } + cusolver_handle::operator bool() const noexcept { return bool(handle); } cusolverDnHandle_t cusolver_handle::get() const noexcept { return handle; } - int cusolver_handle::get_device() const noexcept { return device; } - whip::stream_t cusolver_handle::get_stream() const noexcept { return stream; } void cusolver_handle::set_stream(cuda_stream const& stream)
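For completeness, since the documentation added above refers readers to the then_with_cublas
example for then_with_cusolver as well, here is a minimal, purely illustrative sketch of a
then_with_cusolver continuation. It is not part of this patch, is CUDA-only for brevity (no
rocSOLVER branch), omits error checking, and factorizes an identity matrix only to keep the
example self-contained; the cuSOLVER calls (cusolverDnGetStream, cusolverDnDpotrf_bufferSize,
cusolverDnDpotrf) and the whip allocation helpers are the only APIs assumed.

    // then_with_cusolver sketch (CUDA-only, illustrative): Cholesky-factorize an
    // identity matrix with cusolverDnDpotrf using the handle provided by the adaptor.

    #include <pika/cuda.hpp>
    #include <pika/execution.hpp>
    #include <pika/init.hpp>

    #include <whip.hpp>

    #include <cusolverDn.h>
    #include <utility>

    __global__ void init_identity(double* a, int n)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n * n) { a[i] = (i % (n + 1) == 0) ? 1.0 : 0.0; }
    }

    int main(int argc, char* argv[])
    {
        namespace cu = pika::cuda::experimental;
        namespace ex = pika::execution::experimental;
        namespace tt = pika::this_thread::experimental;

        pika::start(argc, argv);
        cu::cuda_pool pool{};
        cu::cuda_scheduler cuda_sched{pool};

        {
            cu::enable_user_polling p{};

            constexpr int n = 512;
            double* a = nullptr;
            int* info = nullptr;

            auto s = ex::just() | ex::continues_on(cuda_sched) |
                // Allocate and initialize the matrix on a stream from the pool.
                cu::then_with_stream([&](whip::stream_t stream) {
                    whip::malloc_async(&a, sizeof(double) * n * n, stream);
                    whip::malloc_async(&info, sizeof(int), stream);
                    init_identity<<<(n * n + 255) / 256, 256, 0, stream>>>(a, n);
                }) |
                // The cuSOLVER handle is passed as the first argument, like the
                // cuBLAS handle in the then_with_cublas example.
                cu::then_with_cusolver([&](cusolverDnHandle_t handle) {
                    whip::stream_t stream{};
                    cusolverDnGetStream(handle, &stream);

                    int lwork = 0;
                    cusolverDnDpotrf_bufferSize(
                        handle, CUBLAS_FILL_MODE_LOWER, n, a, n, &lwork);

                    // Keep the workspace allocation stream-ordered so that it is
                    // not released before the factorization kernel has run.
                    double* workspace = nullptr;
                    whip::malloc_async(&workspace, sizeof(double) * lwork, stream);
                    cusolverDnDpotrf(handle, CUBLAS_FILL_MODE_LOWER, n, a, n,
                        workspace, lwork, info);
                    whip::free_async(workspace, stream);
                }) |
                // Release the matrix and the info flag on the same stream.
                cu::then_with_stream([&](whip::stream_t stream) {
                    whip::free_async(a, stream);
                    whip::free_async(info, stream);
                });
            tt::sync_wait(std::move(s));
        }

        pika::finalize();
        pika::stop();

        return 0;
    }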