From 2204a4175e93c6728a6eff3f8f2dee2e5278ad93 Mon Sep 17 00:00:00 2001
From: Artem Chirkin <9253178+achirkin@users.noreply.github.com>
Date: Wed, 11 Dec 2024 02:02:18 -0800
Subject: [PATCH] Fix Grace-specific issues in CAGRA

---
 cpp/src/neighbors/detail/ann_utils.cuh   | 11 +++++++++++
 cpp/src/neighbors/detail/cagra/utils.hpp | 16 ++++++++++++++--
 2 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/cpp/src/neighbors/detail/ann_utils.cuh b/cpp/src/neighbors/detail/ann_utils.cuh
index 652d41c85..529356351 100644
--- a/cpp/src/neighbors/detail/ann_utils.cuh
+++ b/cpp/src/neighbors/detail/ann_utils.cuh
@@ -403,6 +403,17 @@ struct batch_load_iterator {
 
   /** A single batch of data residing in device memory. */
   struct batch {
+    ~batch() noexcept
+    {
+      // A batch that does not copy owns no allocation, and without an allocation nothing
+      // guarantees that the device pointer is stream-ordered. In that case, wait for every GPU
+      // operation queued in the stream to complete before the batch is destroyed.
+      // The non-throwing variant of the CUDA check is used because this destructor is noexcept.
+      // A batch that does copy needs no synchronization here, so it returns early.
+      if (does_copy()) { return; }
+      RAFT_CUDA_TRY_NO_THROW(cudaStreamSynchronize(stream_));
+    }
+
     /** Logical width of a single row in a batch, in elements of type `T`. */
     [[nodiscard]] auto row_width() const -> size_type { return row_width_; }
     /** Logical offset of the batch, in rows (`row_width()`) */
diff --git a/cpp/src/neighbors/detail/cagra/utils.hpp b/cpp/src/neighbors/detail/cagra/utils.hpp
index 0f8309328..9f95c5b1c 100644
--- a/cpp/src/neighbors/detail/cagra/utils.hpp
+++ b/cpp/src/neighbors/detail/cagra/utils.hpp
@@ -179,7 +179,7 @@ class device_matrix_view_from_host {
  public:
   device_matrix_view_from_host(raft::resources const& res,
                                raft::host_matrix_view<T, IdxT> host_view)
-    : host_view_(host_view)
+    : res_(res), host_view_(host_view)
   {
     cudaPointerAttributes attr;
     RAFT_CUDA_TRY(cudaPointerGetAttributes(&attr, host_view.data_handle()));
@@ -199,6 +199,17 @@ class device_matrix_view_from_host {
     }
   }
 
+  ~device_matrix_view_from_host() noexcept
+  {
+    /*
+    Without a copy this struct owns no allocation, so nothing guarantees that the device pointer
+    is stream-ordered: wait for all GPU work in the stream to finish before the struct goes away.
+    The destructor is noexcept, so the sync must not throw (a throw would call std::terminate).
+    */
+    if (allocated_memory()) { return; }
+    RAFT_CUDA_TRY_NO_THROW(cudaStreamSynchronize(raft::resource::get_cuda_stream(res_)));
+  }
+
   raft::device_matrix_view<T, IdxT> view()
   {
     return raft::make_device_matrix_view<T, IdxT>(
@@ -207,9 +218,10 @@ class device_matrix_view_from_host {
 
   T* data_handle() { return device_ptr; }
 
-  bool allocated_memory() const { return device_mem_.has_value(); }
+  [[nodiscard]] bool allocated_memory() const { return device_mem_.has_value(); }
 
  private:
+  const raft::resources& res_;
   std::optional<raft::device_matrix<T, IdxT>> device_mem_;
   raft::host_matrix_view<T, IdxT> host_view_;
   T* device_ptr;