-
Notifications
You must be signed in to change notification settings - Fork 10
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Expand and add more CUDA/HIP documentation
Document cuda_pool, cuda_scheduler, cuda_stream, cublas_handle, cusolver_handle, as well as expose these with CUDA sender adaptors in the documentation. Adds a high-level example of using CUDA functionality.
- Loading branch information
Showing 16 changed files with 656 additions and 88 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
// Copyright (c) 2024 ETH Zurich | ||
// | ||
// SPDX-License-Identifier: BSL-1.0 | ||
// Distributed under the Boost Software License, Version 1.0. (See accompanying | ||
// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) | ||
|
||
#include <pika/cuda.hpp> | ||
#include <pika/execution.hpp> | ||
#include <pika/init.hpp> | ||
|
||
#include <fmt/printf.h> | ||
#include <whip.hpp> | ||
|
||
#include <iostream> | ||
#include <utility> | ||
|
||
// Select the BLAS backend matching the GPU runtime. These aliases let the
// example invoke a double-precision GEMM uniformly on CUDA and HIP builds.
#if defined(PIKA_HAVE_CUDA)
# include <cublas_v2.h>
using blas_handle_t = cublasHandle_t;
auto* blas_gemm = &cublasDgemm;
auto blas_pointer_mode = CUBLAS_POINTER_MODE_HOST;
auto blas_op_n = CUBLAS_OP_N;
#elif defined(PIKA_HAVE_HIP)
# include <rocblas/rocblas.h>
// rocblas_dgemm takes a rocblas_handle. hipblasHandle_t is a distinct opaque
// type from the separate hipBLAS library and is not declared by this header;
// similarly the rocBLAS enum values are lowercase (rocblas_pointer_mode_host).
using blas_handle_t = rocblas_handle;
auto* blas_gemm = &rocblas_dgemm;
auto blas_pointer_mode = rocblas_pointer_mode_host;
auto blas_op_n = rocblas_operation_none;
#endif
|
||
// Minimal device kernel that prints the index of each launched thread; used
// below to demonstrate launching a raw kernel through then_with_stream.
__global__ void kernel()
{
    printf("Hello from kernel! threadIdx.x: %d\n", threadIdx.x);
}
|
||
// Demonstrates scheduling GPU work with pika's CUDA/HIP sender adaptors:
// a host-callable lambda run by the scheduler, an explicit kernel launch on a
// stream, and a cuBLAS/rocBLAS GEMM — all sequenced on one stream of a
// cuda_pool. Returns 0 on success.
int main(int argc, char* argv[])
{
    namespace cu = pika::cuda::experimental;
    namespace ex = pika::execution::experimental;
    namespace tt = pika::this_thread::experimental;

    pika::start(argc, argv);

    // Create a pool of CUDA streams and cuBLAS/SOLVER handles, and a scheduler
    // that uses the pool.
    cu::cuda_pool pool{};
    cu::cuda_scheduler cuda_sched{pool};

    {
        // Enable polling of CUDA events on the default pool. This is required
        // to allow the adaptors below to signal completion of kernels.
        cu::enable_user_polling p{};

        constexpr std::size_t n = 2048;
        double* a = nullptr;
        double* b = nullptr;
        double* c = nullptr;
        double alpha = 1.0;
        double beta = 1.0;
        whip::malloc(&a, sizeof(double) * n * n);
        whip::malloc(&b, sizeof(double) * n * n);
        whip::malloc(&c, sizeof(double) * n * n);

        // BLAS APIs take int (rocblas_int) dimensions; convert the std::size_t
        // extent once instead of narrowing implicitly at the call site.
        int const ni = static_cast<int>(n);

        // The work created by the adaptors below will all be scheduled on the
        // same stream from the pool since the work is sequential.
        //
        // Note that error checking is omitted below.
        auto s = ex::just(42) | ex::continues_on(cuda_sched) |
            // CUDA kernel through a lambda.
            ex::then([](int x) { printf("Hello from the GPU! x: %d\n", x); }) |
            // Explicitly launch a CUDA kernel with a stream (see
            // https://github.com/eth-cscs/whip for details about whip)
            cu::then_with_stream(
                [](whip::stream_t stream) { kernel<<<1, 32, 0, stream>>>(); }) |
            // Launch a cuBLAS/cuSOLVER kernel.
            cu::then_with_cublas(
                [&](blas_handle_t handle) {
                    blas_gemm(handle, blas_op_n, blas_op_n, ni, ni, ni, &alpha,
                        a, ni, b, ni, &beta, c, ni);
                },
                blas_pointer_mode);
        tt::sync_wait(std::move(s));

        // We know that all work has completed so we can safely free the memory.
        whip::free(a);
        whip::free(b);
        whip::free(c);
    }

    pika::finalize();
    pika::stop();

    return 0;
}
examples/documentation/then_with_cublas_documentation.cpp: 103 additions, 0 deletions.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,103 @@ | ||
// Copyright (c) 2024 ETH Zurich | ||
// | ||
// SPDX-License-Identifier: BSL-1.0 | ||
// Distributed under the Boost Software License, Version 1.0. (See accompanying | ||
// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) | ||
|
||
#include <pika/cuda.hpp> | ||
#include <pika/execution.hpp> | ||
#include <pika/init.hpp> | ||
|
||
#include <fmt/printf.h> | ||
#include <whip.hpp> | ||
|
||
#include <iostream> | ||
#include <utility> | ||
|
||
// Select the BLAS backend matching the GPU runtime. These aliases let the
// example invoke a double-precision GEMM uniformly on CUDA and HIP builds.
#if defined(PIKA_HAVE_CUDA)
# include <cublas_v2.h>
using blas_handle_t = cublasHandle_t;
auto* blas_gemm = &cublasDgemm;
auto blas_pointer_mode = CUBLAS_POINTER_MODE_HOST;
auto blas_op_n = CUBLAS_OP_N;
#elif defined(PIKA_HAVE_HIP)
# include <rocblas/rocblas.h>
// rocblas_dgemm takes a rocblas_handle. hipblasHandle_t is a distinct opaque
// type from the separate hipBLAS library and is not declared by this header;
// similarly the rocBLAS enum values are lowercase (rocblas_pointer_mode_host).
using blas_handle_t = rocblas_handle;
auto* blas_gemm = &rocblas_dgemm;
auto blas_pointer_mode = rocblas_pointer_mode_host;
auto blas_op_n = rocblas_operation_none;
#endif
|
||
// Owning, move-only RAII wrapper for GPU-allocated memory (doubles).
class gpu_data
{
    double* p{nullptr};    // device pointer; not dereferenceable on the host
    std::size_t n{0};      // number of doubles allocated

public:
    // Note that blocking functions such as cudaMalloc will block the underlying
    // operating system thread instead of yielding the pika task. Consider using
    // e.g. a pool of GPU memory to avoid blocking the thread for too long.
    gpu_data(std::size_t n)
        : n(n)
    {
        whip::malloc(&p, sizeof(double) * n);
    }
    gpu_data(gpu_data&& other) noexcept
        : p(std::exchange(other.p, nullptr))
        , n(std::exchange(other.n, 0))
    {
    }
    gpu_data& operator=(gpu_data&& other) noexcept
    {
        // Swap rather than overwrite: any allocation this object currently
        // owns is handed to `other` and released when `other` is destroyed.
        // Overwriting p directly (as the previous version did) leaked the
        // existing allocation, and swapping also makes self-move harmless.
        std::swap(p, other.p);
        std::swap(n, other.n);
        return *this;
    }
    gpu_data(gpu_data const&) = delete;
    gpu_data& operator=(gpu_data const&) = delete;
    // Moved-from objects hold nullptr; assumes whip::free(nullptr) is a no-op
    // like cudaFree(nullptr) — TODO confirm against whip's documentation.
    ~gpu_data() { whip::free(p); }

    // Number of doubles in the allocation.
    std::size_t size() const { return n; }
    // Raw device pointer (remains owned by this object).
    double* get() const { return p; }
};
|
||
// Demonstrates passing owning GPU buffers through a sender chain: the
// gpu_data values are moved into the sender and kept alive by the
// then_with_cublas operation state until the GEMM completes. Returns 0.
int main(int argc, char* argv[])
{
    namespace cu = pika::cuda::experimental;
    namespace ex = pika::execution::experimental;
    namespace tt = pika::this_thread::experimental;

    pika::start(argc, argv);

    // A pool of CUDA streams and BLAS handles, and a scheduler using the pool.
    cu::cuda_pool pool{};
    cu::cuda_scheduler cuda_sched{pool};

    {
        // Event polling must be enabled so the adaptors can signal completion.
        cu::enable_user_polling p{};

        constexpr std::size_t n = 2048;
        gpu_data a{n * n};
        gpu_data b{n * n};
        gpu_data c{n * n};
        double alpha = 1.0;
        double beta = 1.0;

        // BLAS APIs take int (rocblas_int) dimensions; convert the std::size_t
        // extent once instead of narrowing implicitly at the call site.
        int const ni = static_cast<int>(n);

        auto s = ex::just(std::move(a), std::move(b), std::move(c)) |
            ex::continues_on(cuda_sched) |
            // a, b, and c will be kept alive by the then_with_cublas operation
            // state at least until the GPU kernels complete. Values sent by the
            // predecessor sender are passed as the last arguments after the
            // handle.
            cu::then_with_cublas(
                [&](blas_handle_t handle, auto& a, auto& b, auto& c) {
                    blas_gemm(handle, blas_op_n, blas_op_n, ni, ni, ni, &alpha,
                        a.get(), ni, b.get(), ni, &beta, c.get(), ni);
                },
                blas_pointer_mode);
        tt::sync_wait(std::move(s));
    }

    pika::finalize();
    pika::stop();

    return 0;
}
Oops, something went wrong.