From 2204a4175e93c6728a6eff3f8f2dee2e5278ad93 Mon Sep 17 00:00:00 2001 From: Artem Chirkin <9253178+achirkin@users.noreply.github.com> Date: Wed, 11 Dec 2024 02:02:18 -0800 Subject: [PATCH] Fix Grace-specific issues in CAGRA --- cpp/src/neighbors/detail/ann_utils.cuh | 11 +++++++++++ cpp/src/neighbors/detail/cagra/utils.hpp | 16 ++++++++++++++-- 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/cpp/src/neighbors/detail/ann_utils.cuh b/cpp/src/neighbors/detail/ann_utils.cuh index 652d41c85..529356351 100644 --- a/cpp/src/neighbors/detail/ann_utils.cuh +++ b/cpp/src/neighbors/detail/ann_utils.cuh @@ -403,6 +403,17 @@ struct batch_load_iterator { /** A single batch of data residing in device memory. */ struct batch { + ~batch() noexcept + { + /* + If there's no copy, there's no allocation owned by the batch. + If there's no allocation, there's no guarantee that the device pointer is stream-ordered. + If there's no stream order guarantee, we must synchronize with the stream before the batch is + destroyed to make sure all GPU operations in that stream finish earlier. + */ + if (!does_copy()) { RAFT_CUDA_TRY_NO_THROW(cudaStreamSynchronize(stream_)); } + } + /** Logical width of a single row in a batch, in elements of type `T`. */ [[nodiscard]] auto row_width() const -> size_type { return row_width_; } /** Logical offset of the batch, in rows (`row_width()`) */ diff --git a/cpp/src/neighbors/detail/cagra/utils.hpp b/cpp/src/neighbors/detail/cagra/utils.hpp index 0f8309328..9f95c5b1c 100644 --- a/cpp/src/neighbors/detail/cagra/utils.hpp +++ b/cpp/src/neighbors/detail/cagra/utils.hpp @@ -179,7 +179,7 @@ class device_matrix_view_from_host { public: device_matrix_view_from_host(raft::resources const& res, raft::host_matrix_view host_view) - : host_view_(host_view) + : res_(res), host_view_(host_view) { cudaPointerAttributes attr; RAFT_CUDA_TRY(cudaPointerGetAttributes(&attr, host_view.data_handle())); @@ -199,6 +199,17 @@ class device_matrix_view_from_host { } } + ~device_matrix_view_from_host() noexcept + { + /* + If there's no copy, there's no allocation owned by this struct. + If there's no allocation, there's no guarantee that the device pointer is stream-ordered. + If there's no stream order guarantee, we must synchronize with the stream before the struct is + destroyed to make sure all GPU operations in that stream finish earlier. + */ + if (!allocated_memory()) { raft::resource::sync_stream(res_); } + } + raft::device_matrix_view view() { return raft::make_device_matrix_view( @@ -207,9 +218,10 @@ class device_matrix_view_from_host { T* data_handle() { return device_ptr; } - bool allocated_memory() const { return device_mem_.has_value(); } + [[nodiscard]] bool allocated_memory() const { return device_mem_.has_value(); } private: + const raft::resources& res_; std::optional> device_mem_; raft::host_matrix_view host_view_; T* device_ptr;