From 9f0b8bc77adce4dffe926f7a6de5d086d2f98b42 Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Fri, 1 Nov 2024 13:02:57 +0100 Subject: [PATCH] Expand and add more CUDA/HIP documentation Document cuda_pool, cuda_scheduler, cuda_stream, cublas_handle, cusolver_handle, as well as expose these with CUDA sender adaptors in the documentation. Adds a high-level example of using CUDA functionality. --- docs/Doxyfile | 6 +- docs/api.rst | 98 ++++++++++++++++++- examples/documentation/CMakeLists.txt | 2 + .../include/pika/async_cuda/cuda_pool.hpp | 70 ++++++++++++- .../pika/async_cuda/cuda_scheduler.hpp | 26 ++++- .../include/pika/async_cuda/then_on_host.hpp | 25 ++--- .../pika/async_cuda/then_with_stream.hpp | 84 +++++++++------- .../pika/async_cuda_base/cublas_handle.hpp | 18 +++- .../pika/async_cuda_base/cuda_stream.hpp | 37 ++++++- .../pika/async_cuda_base/cusolver_handle.hpp | 28 ++++-- 10 files changed, 320 insertions(+), 74 deletions(-) diff --git a/docs/Doxyfile b/docs/Doxyfile index e2bb88eb4..f0e49a117 100644 --- a/docs/Doxyfile +++ b/docs/Doxyfile @@ -7,7 +7,9 @@ GENERATE_RTF = NO GENERATE_HTML = NO QUIET = NO OUTPUT_DIRECTORY = "$(PIKA_DOCS_DOXYGEN_OUTPUT_DIRECTORY)" -INPUT = "$(PIKA_DOCS_DOXYGEN_INPUT_ROOT)/libs/pika/init_runtime" \ +INPUT = "$(PIKA_DOCS_DOXYGEN_INPUT_ROOT)/libs/pika/async_cuda" \ + "$(PIKA_DOCS_DOXYGEN_INPUT_ROOT)/libs/pika/async_cuda_base" \ + "$(PIKA_DOCS_DOXYGEN_INPUT_ROOT)/libs/pika/init_runtime" \ "$(PIKA_DOCS_DOXYGEN_INPUT_ROOT)/libs/pika/runtime" \ "$(PIKA_DOCS_DOXYGEN_INPUT_ROOT)/libs/pika/execution" FILE_PATTERNS = *.cpp *.hpp *.cu @@ -18,4 +20,4 @@ EXTRACT_ALL = YES ENABLE_PREPPROCESSING = YES MACRO_EXPANSION = YES EXPAND_ONLY_PREDEF = YES -PREDEFINED = PIKA_EXPORT= +PREDEFINED = PIKA_EXPORT= PIKA_NVCC_PRAGMA_HD_WARNING_DISABLE= diff --git a/docs/api.rst b/docs/api.rst index f461199c3..d0890c635 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -5,6 +5,8 @@ Distributed under the Boost Software License, Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +:tocdepth: 3 + .. _api: ============= @@ -32,7 +34,6 @@ These headers are part of the public API, but are currently undocumented. - ``pika/async_rw_mutex.hpp`` - ``pika/barrier.hpp`` - ``pika/condition_variable.hpp`` -- ``pika/cuda.hpp`` - ``pika/execution.hpp`` - ``pika/latch.hpp`` - ``pika/mpi.hpp`` @@ -104,3 +105,98 @@ The ``pika/execution.hpp`` header provides functionality related to ``std::execu .. literalinclude:: ../examples/documentation/when_all_vector_documentation.cpp :language: c++ :start-at: #include + +.. _header_pika_cuda: + +``pika/cuda.hpp`` +================= + +The ``pika/cuda.hpp`` header provides functionality related to CUDA and HIP. All functionality is +under the ``pika::cuda::experimental`` namespace and class and function names contain ``cuda``, even +when HIP support is enabled. CUDA and HIP functionality can be enabled with the CMake options +``PIKA_WITH_CUDA`` and ``PIKA_WITH_HIP``, respectively. In the following, whenever CUDA is +mentioned, it refers to to CUDA and HIP interchangeably. + +.. note:: + https://github.com/pika-org/pika/issues/116 tracks a potential renaming of the functionality + to avoid using ``cuda`` even when HIP is enabled. If you have feedback on a rename or just want + to follow along, please see that issue. + +.. warning:: + At the moment, ``nvcc`` can not compile stdexec headers. Of the CUDA compilers, only ``nvc++`` is + able to compile stdexec headers. If you have stdexec support enabled in pika, either ensure that + ``.cu`` files do not include stdexec headers, or use ``nvc++`` to compile your application. + However, ``nvc++`` does not officially support compiling device code. Use at your own risk. + + For HIP there are no known restrictions. + +The CUDA support in pika relies on four major components: + +1. A pool of CUDA streams as well as cuBLAS and cuSOLVER handles. These streams and handles are used + in a round-robin fashion by various sender adaptors. +2. A CUDA scheduler, in the ``std::execution`` sense. This uses the CUDA pool to schedule work on a + GPU. +3. Sender adaptors. A few special-purpose sender adaptors, as well as customizations of a few + ``std::execution`` adaptors are provided to help schedule different types of work on a GPU. +4. Polling of CUDA events integrated into the pika scheduling loop. This integration is essential to + avoid calling e.g. ``cudaStreamSynchronize`` on a pika task, which would block the underlying + worker thread and thus block progress of other work. + +The following example gives an overview of using the above CUDA functionalities in pika: + +.. literalinclude:: ../examples/documentation/cuda_overview_documentation.cu + :language: c++ + :start-at: #include + +TODO: Note that while cuda_pool gives direct access to streams and handles, the intended usage is to +access them via the scheduler and sender adaptors. + +.. doxygenclass:: pika::cuda::experimental::cuda_pool +.. doxygenclass:: pika::cuda::experimental::cuda_stream +.. doxygenclass:: pika::cuda::experimental::cublas_handle +.. doxygenclass:: pika::cuda::experimental::locked_cublas_handle +.. doxygenclass:: pika::cuda::experimental::cusolver_handle +.. doxygenclass:: pika::cuda::experimental::locked_cusolver_handle + +.. + .. literalinclude:: ../examples/documentation/cuda_pool_documentation.cpp + :language: c++ + :start-at: #include + +.. doxygenclass:: pika::cuda::experimental::cuda_scheduler + +.. + .. literalinclude:: ../examples/documentation/cuda_scheduler_documentation.cpp + :language: c++ + :start-at: #include + +.. doxygenvariable:: pika::cuda::experimental::then_with_stream + +.. literalinclude:: ../examples/documentation/then_with_stream_documentation.cu + :language: c++ + :start-at: #include + + +.. doxygenvariable:: pika::cuda::experimental::then_with_cublas + +.. + .. literalinclude:: ../examples/documentation/then_with_cublas_documentation.cpp + :language: c++ + :start-at: #include + +.. doxygenvariable:: pika::cuda::experimental::then_with_cusolver + +.. + .. literalinclude:: ../examples/documentation/then_with_cusolver_documentation.cpp + :language: c++ + :start-at: #include + +.. note:: + TODO: Is this useful? Just transfer to host explicitly? + +.. doxygenvariable:: pika::cuda::experimental::then_on_host + +.. + .. literalinclude:: ../examples/documentation/then_on_host_documentation.cpp + :language: c++ + :start-at: #include diff --git a/examples/documentation/CMakeLists.txt b/examples/documentation/CMakeLists.txt index 80116999b..4578df557 100644 --- a/examples/documentation/CMakeLists.txt +++ b/examples/documentation/CMakeLists.txt @@ -5,11 +5,13 @@ # file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) set(example_programs + # cuda_overview_documentation # TODO drop_operation_state_documentation drop_value_documentation hello_world_documentation init_hpp_documentation split_tuple_documentation + # then_with_stream_documentation # TODO unpack_documentation when_all_vector_documentation ) diff --git a/libs/pika/async_cuda/include/pika/async_cuda/cuda_pool.hpp b/libs/pika/async_cuda/include/pika/async_cuda/cuda_pool.hpp index 8dfcb5ec5..d65f3e5f0 100644 --- a/libs/pika/async_cuda/include/pika/async_cuda/cuda_pool.hpp +++ b/libs/pika/async_cuda/include/pika/async_cuda/cuda_pool.hpp @@ -23,6 +23,10 @@ #include namespace pika::cuda::experimental { + /// \brief A locked cuBLAS handle. + /// + /// A handle that provides thread-safe access to a \ref cublas_handle. The locked handle is + /// immovable. class locked_cublas_handle { cublas_handle& handle; @@ -36,9 +40,17 @@ namespace pika::cuda::experimental { locked_cublas_handle& operator=(locked_cublas_handle&&) = delete; locked_cublas_handle& operator=(locked_cublas_handle const&) = delete; + /// \brief Access the underlying cuBLAS handle. + /// + /// \return a reference to the \ref cublas_handle. The returned handle is not thread-safe + /// and must be used within the lifetime of the \ref locked_cublas_handle. PIKA_EXPORT cublas_handle const& get() noexcept; }; + /// \brief A locked cuSOLVER handle. + /// + /// A handle that provides thread-safe access to a \ref cusolver_handle. The locked handle is + /// immovable. class locked_cusolver_handle { cusolver_handle& handle; @@ -52,15 +64,24 @@ namespace pika::cuda::experimental { locked_cusolver_handle& operator=(locked_cusolver_handle&&) = delete; locked_cusolver_handle& operator=(locked_cusolver_handle const&) = delete; + /// \brief Access the underlying cuSOLVER handle. + /// + /// \return a reference to the \ref cusolver_handle. The returned handle is not thread-safe + /// and must be used within the lifetime of the \ref locked_cusolver_handle. PIKA_EXPORT cusolver_handle const& get() noexcept; }; - /// A pool of CUDA streams, used for scheduling work on a CUDA device. + /// \brief A pool of CUDA streams, used for scheduling work on a CUDA device. + /// + /// The pool initializes a set of CUDA (thread-local) streams on construction and provides + /// access to the streams in a round-robin fashion. The pool also gives access to cuBLAS and + /// cuSOLVER handles. /// - /// The pool initializes a set of CUDA (thread-local) streams on - /// construction and provides access to the streams in a round-robin - /// fashion. The pool is movable and copyable with reference semantics. - /// Copies of a pool still refer to the original pool of streams. + /// The pool is movable and copyable with reference semantics. Copies of a pool still refer to + /// the original pool of streams. A moved-from pool can't be used, except to check if it is + /// valid with \ref valid(). + /// + /// The pool is equality comparable and formattable. class cuda_pool { private: @@ -135,25 +156,64 @@ namespace pika::cuda::experimental { std::shared_ptr data; public: + /// \brief Construct a pool of CUDA streams and handles. + /// + /// \param device the CUDA device used for scheduling work + /// \param num_normal_priority_streams_per_thread the number of normal priority streams per + /// thread + /// \param num_high_priority_streams_per_thread the number of high priority streams per + /// thread + /// \param flags flags used to construct CUDA streams + /// \param num_cublas_handles the number of cuBLAS handles to create for the whole pool + /// \param num_cusolver_handles the number of cuSOLVER handles to create for the whole pool PIKA_EXPORT explicit cuda_pool(int device = 0, std::size_t num_normal_priority_streams_per_thread = 3, std::size_t num_high_priority_streams_per_thread = 3, unsigned int flags = 0, std::size_t num_cublas_handles = 16, std::size_t num_cusolver_handles = 16); + + /// \brief Move constructor. PIKA_NVCC_PRAGMA_HD_WARNING_DISABLE cuda_pool(cuda_pool&&) = default; + /// \brief Copy constructor. PIKA_NVCC_PRAGMA_HD_WARNING_DISABLE cuda_pool(cuda_pool const&) = default; + /// \brief Move assignment operator. PIKA_NVCC_PRAGMA_HD_WARNING_DISABLE cuda_pool& operator=(cuda_pool&&) = default; + /// \brief Copy assignment operator. PIKA_NVCC_PRAGMA_HD_WARNING_DISABLE cuda_pool& operator=(cuda_pool const&) = default; + /// \brief Check if the pool is valid. + /// + /// \return true if the pool refers to a valid pool, false otherwise (e.g. if the pool has + /// been moved out from) PIKA_EXPORT bool valid() const noexcept; + + /// \brief Check if the pool is valid. + /// + /// See \ref valid(). PIKA_EXPORT explicit operator bool() noexcept; + + /// \brief Get a reference to the next CUDA stream. + /// + /// \param priority the priority of the stream PIKA_EXPORT cuda_stream const& get_next_stream( pika::execution::thread_priority priority = pika::execution::thread_priority::normal); + + /// \brief Get a locked cuBLAS handle. + /// + /// \param stream the CUDA stream to use with the cuBLAS handle + /// + /// \return a locked cuBLAS handle, which is released for reuse on destruction PIKA_EXPORT locked_cublas_handle get_cublas_handle( cuda_stream const& stream, cublasPointerMode_t pointer_mode); + + /// \brief Get a locked cuSOLVER handle. + /// + /// \param stream the CUDA stream to use with the cuSOLVER handle + /// + /// \return a locked cuSOLVER handle, which is released for reuse on destruction PIKA_EXPORT locked_cusolver_handle get_cusolver_handle(cuda_stream const& stream); /// \cond NOINTERNAL diff --git a/libs/pika/async_cuda/include/pika/async_cuda/cuda_scheduler.hpp b/libs/pika/async_cuda/include/pika/async_cuda/cuda_scheduler.hpp index 6b6cb0153..0a283d392 100644 --- a/libs/pika/async_cuda/include/pika/async_cuda/cuda_scheduler.hpp +++ b/libs/pika/async_cuda/include/pika/async_cuda/cuda_scheduler.hpp @@ -21,8 +21,13 @@ namespace pika::cuda::experimental { /// A scheduler for running work on a CUDA pool. /// - /// Provides access to scheduling work on a CUDA context represented by a - /// cuda_pool. + /// Provides access to scheduling work on a CUDA context represented by a \ref cuda_pool. Models + /// the `std::execution` scheduler concept. + /// + /// Move and copy constructible. The scheduler has reference semantics with respect to the + /// associated CUDA pool. + /// + /// Equality comparable. class cuda_scheduler { private: @@ -30,18 +35,32 @@ namespace pika::cuda::experimental { pika::execution::thread_priority priority; public: + /// \brief TODO PIKA_EXPORT cuda_scheduler(cuda_pool pool); cuda_scheduler(cuda_scheduler&&) = default; cuda_scheduler(cuda_scheduler const&) = default; cuda_scheduler& operator=(cuda_scheduler&&) = default; cuda_scheduler& operator=(cuda_scheduler const&) = default; - ~cuda_scheduler(){}; + ~cuda_scheduler() = default; // TODO: Default? + /// \brief Return the \ref cuda_pool associated with this scheduler. PIKA_EXPORT cuda_pool const& get_pool() const noexcept; + + /// \brief Return the next available CUDA stream from the pool. + /// + /// Equivalent to accessing the stream via \ref get_pool(). PIKA_EXPORT cuda_stream const& get_next_stream(); + + /// \brief Return the next available cuBLAS handle from the pool. + /// + /// Equivalent to accessing the cuBLAS handle via \ref get_pool(). PIKA_EXPORT locked_cublas_handle get_cublas_handle( cuda_stream const& stream, cublasPointerMode_t pointer_mode); + + /// \brief Return the next available cuSOLVER handle from the pool. + /// + /// Equivalent to accessing the cuSOLVER handle via \ref get_pool(). PIKA_EXPORT locked_cusolver_handle get_cusolver_handle(cuda_stream const& stream); /// \cond NOINTERNAL @@ -56,6 +75,7 @@ namespace pika::cuda::experimental { } /// \endcond + /// TODO friend cuda_scheduler tag_invoke(pika::execution::experimental::with_priority_t, cuda_scheduler const& scheduler, pika::execution::thread_priority priority) { diff --git a/libs/pika/async_cuda/include/pika/async_cuda/then_on_host.hpp b/libs/pika/async_cuda/include/pika/async_cuda/then_on_host.hpp index 2fb5cedf2..0a79ef68c 100644 --- a/libs/pika/async_cuda/include/pika/async_cuda/then_on_host.hpp +++ b/libs/pika/async_cuda/include/pika/async_cuda/then_on_host.hpp @@ -212,17 +212,7 @@ namespace pika::cuda::experimental { }; } // namespace then_on_host_detail - // NOTE: This is not a customization of pika::execution::experimental::then. - // It retains the cuda_scheduler execution context from the predecessor - // sender, but does not run the continuation on a CUDA device. Instead, it - // runs the continuation in the polling thread used by the cuda_scheduler on - // the CPU. The continuation is run only after synchronizing all previous - // events scheduled on the cuda_scheduler. Blocking in the callable given to - // then_on_host blocks other work scheduled on cuda_scheduler from - // completing. Heavier work should be transferred to a host scheduler as - // soon as possible. - inline constexpr struct then_on_host_t final - : pika::functional::detail::tag_fallback + struct then_on_host_t final : pika::functional::detail::tag_fallback { private: template @@ -245,5 +235,16 @@ namespace pika::cuda::experimental { return pika::execution::experimental::detail::partial_algorithm{ PIKA_FORWARD(F, f)}; } - } then_on_host{}; + }; + + /// NOTE: This is not a customization of pika::execution::experimental::then. + /// It retains the cuda_scheduler execution context from the predecessor + /// sender, but does not run the continuation on a CUDA device. Instead, it + /// runs the continuation in the polling thread used by the cuda_scheduler on + /// the CPU. The continuation is run only after synchronizing all previous + /// events scheduled on the cuda_scheduler. Blocking in the callable given to + /// then_on_host blocks other work scheduled on cuda_scheduler from + /// completing. Heavier work should be transferred to a host scheduler as + /// soon as possible. + inline constexpr then_on_host_t then_on_host{}; } // namespace pika::cuda::experimental diff --git a/libs/pika/async_cuda/include/pika/async_cuda/then_with_stream.hpp b/libs/pika/async_cuda/include/pika/async_cuda/then_with_stream.hpp index a0d7deda8..460bc338d 100644 --- a/libs/pika/async_cuda/include/pika/async_cuda/then_with_stream.hpp +++ b/libs/pika/async_cuda/include/pika/async_cuda/then_with_stream.hpp @@ -37,9 +37,9 @@ namespace pika::cuda::experimental::then_with_stream_detail { template auto invoke_with_thread_local_cublas_handle(cuda_scheduler& sched, cuda_stream const& stream, - cublasPointerMode_t pointer_mode, F&& f, Ts&&... ts) - -> decltype(PIKA_INVOKE( - PIKA_FORWARD(F, f), std::declval(), PIKA_FORWARD(Ts, ts)...)) + cublasPointerMode_t pointer_mode, F&& f, + Ts&&... ts) -> decltype(PIKA_INVOKE(PIKA_FORWARD(F, f), std::declval(), + PIKA_FORWARD(Ts, ts)...)) { auto locked_handle = sched.get_cublas_handle(stream, pointer_mode); return PIKA_INVOKE(PIKA_FORWARD(F, f), locked_handle.get().get(), PIKA_FORWARD(Ts, ts)...); @@ -48,7 +48,7 @@ namespace pika::cuda::experimental::then_with_stream_detail { template auto invoke_with_thread_local_cusolver_handle(cuda_scheduler& sched, cuda_stream const& stream, F&& f, Ts&&... ts) -> decltype(PIKA_INVOKE(PIKA_FORWARD(F, f), - std::declval(), PIKA_FORWARD(Ts, ts)...)) + std::declval(), PIKA_FORWARD(Ts, ts)...)) { auto locked_handle = sched.get_cusolver_handle(stream); return PIKA_INVOKE(PIKA_FORWARD(F, f), locked_handle.get().get(), PIKA_FORWARD(Ts, ts)...); @@ -249,9 +249,10 @@ namespace pika::cuda::experimental::then_with_stream_detail { } template - auto set_value(Ts&&... ts) noexcept - -> decltype(PIKA_INVOKE(PIKA_MOVE(f), op_state.sched, stream.value(), ts...), - void()) + auto + set_value(Ts&&... ts) noexcept -> decltype(PIKA_INVOKE(PIKA_MOVE(f), op_state.sched, + stream.value(), ts...), + void()) { pika::detail::try_catch_exception_ptr( [&]() mutable { @@ -389,8 +390,8 @@ namespace pika::cuda::experimental::then_with_stream_detail { // matches the argument list"). template friend auto tag_invoke(pika::execution::experimental::set_value_t, - then_with_cuda_stream_receiver&& r, Ts&&... ts) noexcept - -> decltype(r.set_value(PIKA_FORWARD(Ts, ts)...)) + then_with_cuda_stream_receiver&& r, + Ts&&... ts) noexcept -> decltype(r.set_value(PIKA_FORWARD(Ts, ts)...)) { // nvcc fails to compile this with std::forward(ts)... or // static_cast(ts)... so we explicitly use @@ -610,15 +611,7 @@ namespace pika::cuda::experimental { // - this operation can only be used when the predecessor sender has // cuda_scheduler as its completion scheduler - /// Attach a continuation to run f with an additional CUDA stream. - /// - /// Attaches a continuation to the given sender which will call f with the - /// arguments sent by the given sender with an additional cudaStream_t - /// argument as the last argument. This can only be called on a sender with - /// a completion scheduler that is cuda_scheduler. f does not have exclusive - /// access to the given stream and other calls may reuse the same stream - /// concurrently. - inline constexpr struct then_with_stream_t final + struct then_with_stream_t final { template constexpr PIKA_FORCEINLINE auto PIKA_STATIC_CALL_OPERATOR(Sender&& sender, F&& f) @@ -633,17 +626,20 @@ namespace pika::cuda::experimental { return pika::execution::experimental::detail::partial_algorithm{ PIKA_FORWARD(F, f)}; } - } then_with_stream{}; + }; - /// Attach a continuation to run f with an additional cuBLAS handle. + /// \brief Sender adaptor which calls \p f with CUDA stream. /// - /// Attaches a continuation to the given sender which will call f with the - /// arguments sent by the given sender with an additional cublasHandle_t - /// argument as the first argument. This can only be called on a sender with - /// a completion scheduler that is cuda_scheduler. The handle is - /// thread-local and f may not yield a pika thread until after the handle - /// has been used the last time by f. - inline constexpr struct then_with_cublas_t final + /// When the predecessor sender completes, calls \p f with a CUDA stream as the last argument + /// after other values sent by the predecessor sender. This adaptor can only be used when the + /// completion scheduler is a \ref cuda_scheduler. Other work may be scheduled concurrently on + /// the stream passed to \p f. Values sent by the predecessor sender are passed as references to + /// \p f and kept alive until the work submitted by \p f to the stream is completed. \p f may + /// return as soon as work has been submitted, and a connected receiver will be signaled only + /// once the kernels submitted to the stream have completed. + inline constexpr then_with_stream_t then_with_stream{}; + + struct then_with_cublas_t final { template constexpr PIKA_FORCEINLINE auto @@ -661,17 +657,19 @@ namespace pika::cuda::experimental { return pika::execution::experimental::detail::partial_algorithm{PIKA_FORWARD(F, f), pointer_mode}; } - } then_with_cublas{}; + }; - /// Attach a continuation to run f with an additional cuSOLVER handle. + /// \brief Sender adaptor which calls \p f with a cuBLAS handle. /// - /// Attaches a continuation to the given sender which will call f with the - /// arguments sent by the given sender with an additional cusolverDnHandle_t - /// argument as the first argument. This can only be called on a sender with - /// a completion scheduler that is cuda_scheduler. The handle is - /// thread-local and f may not yield a pika thread until after the handle - /// has been used the last time by f. - inline constexpr struct then_with_cusolver_t final + /// This sender is intended to be used to submit work using a cuBLAS handle. The stream + /// associated to the handle may also be used to submit work. The handle is accessed through a + /// \ref locked_cublas_handle and \p f should return as quickly as possible to avoid blocking + /// other work from using the handle. + /// + /// The behaviour of synchronization and lifetimes are the same as for \ref then_with_stream. + inline constexpr then_with_cublas_t then_with_cublas{}; + + struct then_with_cusolver_t final { template constexpr PIKA_FORCEINLINE auto PIKA_STATIC_CALL_OPERATOR(Sender&& sender, F&& f) @@ -686,5 +684,17 @@ namespace pika::cuda::experimental { return pika::execution::experimental::detail::partial_algorithm{PIKA_FORWARD(F, f)}; } - } then_with_cusolver{}; + }; + + /// \brief Sender adaptor which calls \p f with a cuSOLVER handle. + /// + /// This sender is intended to be used to submit work using a cuSOLVER handle. The stream + /// associated to the handle may also be used to submit work. The handle is accessed through a + /// \ref locked_cusolver_handle and \p f should return as quickly as possible to avoid blocking + /// other work from using the handle. + /// + /// The behaviour of synchronization and lifetimes are the same as for \ref then_with_stream. + inline constexpr then_with_cusolver_t then_with_cusolver + { + }; } // namespace pika::cuda::experimental diff --git a/libs/pika/async_cuda_base/include/pika/async_cuda_base/cublas_handle.hpp b/libs/pika/async_cuda_base/include/pika/async_cuda_base/cublas_handle.hpp index be6ff9590..50fdc3093 100644 --- a/libs/pika/async_cuda_base/include/pika/async_cuda_base/cublas_handle.hpp +++ b/libs/pika/async_cuda_base/include/pika/async_cuda_base/cublas_handle.hpp @@ -16,7 +16,12 @@ #include namespace pika::cuda::experimental { - /// RAII wrapper for a cuBLAS handle. + /// \brief RAII wrapper for a cuBLAS handle. + /// + /// An RAII wrapper for a cuBLAS handle which creates a handle on construction and destroys it + /// on destruction. The wrapper is movable and copyable. A moved-from handle can not be used + /// other than to check for validity with \ref valid(). A copied stream uses the properties from + /// the given handle and creates a new handle. class cublas_handle { private: @@ -27,6 +32,7 @@ namespace pika::cuda::experimental { static PIKA_EXPORT cublasHandle_t create_handle(int device, whip::stream_t stream); public: + /// TODO: How to best document constructor and other special member functions. PIKA_EXPORT cublas_handle(); PIKA_EXPORT explicit cublas_handle(cuda_stream const& stream); PIKA_EXPORT ~cublas_handle(); @@ -35,11 +41,21 @@ namespace pika::cuda::experimental { PIKA_EXPORT cublas_handle(cublas_handle const&); PIKA_EXPORT cublas_handle& operator=(cublas_handle const&); + /// \brief Get the underlying cuBLAS handle. PIKA_EXPORT cublasHandle_t get() const noexcept; + + /// \brief Get the device associated with the stream of the cuBLAS handle. PIKA_EXPORT int get_device() const noexcept; + + /// \brief Get the stream associated with the cuBLAS handle. PIKA_EXPORT whip::stream_t get_stream() const noexcept; + // TODO: Add valid/operator bool() methods + + /// \brief Set the stream associated with the cuBLAS handle. PIKA_EXPORT void set_stream(cuda_stream const& stream); + + /// \brief Set the cuBLAS pointer mode of the handle. PIKA_EXPORT void set_pointer_mode(cublasPointerMode_t pointer_mode); /// \cond NOINTERNAL diff --git a/libs/pika/async_cuda_base/include/pika/async_cuda_base/cuda_stream.hpp b/libs/pika/async_cuda_base/include/pika/async_cuda_base/cuda_stream.hpp index 5d5220b18..0c3d4a7d6 100644 --- a/libs/pika/async_cuda_base/include/pika/async_cuda_base/cuda_stream.hpp +++ b/libs/pika/async_cuda_base/include/pika/async_cuda_base/cuda_stream.hpp @@ -15,12 +15,16 @@ #include namespace pika::cuda::experimental { - /// RAII wrapper for a CUDA stream. + /// \brief RAII wrapper for a CUDA stream. /// - /// An RAII wrapper for a CUDA stream which creates a stream on construction - /// and destroys it on destruction. Is movable and copiable. A moved-from - /// stream holds the default stream. A copied stream uses the properties - /// from the given stream and creates a new stream. + /// An RAII wrapper for a CUDA stream which creates a stream on construction and destroys it on + /// destruction. Is movable and copyable. A moved-from stream holds the default stream. A copied + /// stream uses the properties from the given stream and creates a new stream. + /// + /// Streams are equality comparable and formattable. + /// + /// When accessing the underlying stream, https://github.com/eth-cscs/whip is used for + /// compatibility with CUDA and HIP. class cuda_stream { private: @@ -40,6 +44,13 @@ namespace pika::cuda::experimental { int device, pika::execution::thread_priority priority, unsigned int flags); public: + /// \brief Construct a new stream with the given device and priority. + /// + /// \param device The device to create the stream on. + /// \param priority The priority of the stream. The mapping from \ref thread_priority to + /// CUDA stream priorities is undefined, except that the order is preserved, allowing for + /// different \ref thread_priority to map to the same CUDA priority. + /// \param flags Flags to pass to the CUDA stream creation. PIKA_EXPORT explicit cuda_stream(int device = 0, pika::execution::thread_priority priority = pika::execution::thread_priority::default_, unsigned int flags = 0); @@ -49,9 +60,25 @@ namespace pika::cuda::experimental { PIKA_EXPORT cuda_stream(cuda_stream const&); PIKA_EXPORT cuda_stream& operator=(cuda_stream const&); + /// \brief Get the underlying stream. + /// + /// \return the underlying stream. The stream is still owned by the \ref cuda_stream and + /// must not be manually released. PIKA_EXPORT whip::stream_t get() const noexcept; + + /// \brief Get the device of the stream. + /// + /// \return the device of the stream. PIKA_EXPORT int get_device() const noexcept; + + /// \brief Get the priority of the stream. + /// + /// \return the priority of the stream. PIKA_EXPORT pika::execution::thread_priority get_priority() const noexcept; + + /// brief Get the flags of the stream. + /// + /// \return the flags of the stream. PIKA_EXPORT unsigned int get_flags() const noexcept; /// \cond NOINTERNAL diff --git a/libs/pika/async_cuda_base/include/pika/async_cuda_base/cusolver_handle.hpp b/libs/pika/async_cuda_base/include/pika/async_cuda_base/cusolver_handle.hpp index bbd6a1883..c0bea7e4c 100644 --- a/libs/pika/async_cuda_base/include/pika/async_cuda_base/cusolver_handle.hpp +++ b/libs/pika/async_cuda_base/include/pika/async_cuda_base/cusolver_handle.hpp @@ -7,17 +7,21 @@ #pragma once #include -#if defined(PIKA_HAVE_GPU_SUPPORT) -# include -# include +#include +#include -# include -# include +#include +#include -# include +#include namespace pika::cuda::experimental { - /// RAII wrapper for a cuSOLVER handle. + /// \brief RAII wrapper for a cuSOLVER handle. + /// + /// An RAII wrapper for a cuBLAS handle which creates a handle on construction and destroys it + /// on destruction. The wrapper is movable and copyable. A moved-from handle can not be used + /// other than to check for validity with \ref valid(). A copied stream uses the properties from + /// the given handle and creates a new handle. class cusolver_handle { private: @@ -28,6 +32,7 @@ namespace pika::cuda::experimental { static PIKA_EXPORT cusolverDnHandle_t create_handle(int device, whip::stream_t stream); public: + /// TODO: How to best document constructor and other special member functions. PIKA_EXPORT cusolver_handle(); PIKA_EXPORT explicit cusolver_handle(cuda_stream const& stream); PIKA_EXPORT ~cusolver_handle(); @@ -36,10 +41,18 @@ namespace pika::cuda::experimental { PIKA_EXPORT cusolver_handle(cusolver_handle const&); PIKA_EXPORT cusolver_handle& operator=(cusolver_handle const&); + /// \brief Get the underlying cuSOLVER handle. PIKA_EXPORT cusolverDnHandle_t get() const noexcept; + + /// \brief Get the device associated with the cuSOLVER handle. PIKA_EXPORT int get_device() const noexcept; + + /// \brief Get the stream associated with the cuSOLVER handle. PIKA_EXPORT whip::stream_t get_stream() const noexcept; + // TODO: Add valid/operator bool() methods + + /// \brief Set the stream associated with the cuSOLVER handle. PIKA_EXPORT void set_stream(cuda_stream const& stream); /// \cond NOINTERNAL @@ -66,4 +79,3 @@ struct fmt::formatter : fmt::formatte fmt::format("cusolver_handle({})", fmt::ptr(handle.get())), ctx); } }; -#endif