Make thrust_allocator deallocate safe in multi-device setting (#1533)
Previously, the user had to arrange that the device active when a
thrust_allocator object was created was also active when allocate and
deallocate was called. This is hard to manage if exceptions are
thrown. Instead, save the active device on construction and ensure
that it is active when calling allocate and deallocate. This means
that device_vector is safe to destruct with RAII semantics in a
multi-device setting.
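
As an illustration of the behavior this enables, here is a minimal sketch (not part of this change); `do_work_on_device_1` is a hypothetical helper that may throw:

```c++
try {
  RMM_CUDA_TRY(cudaSetDevice(0));
  rmm::device_vector<int> vec(16);  // the thrust_allocator records device 0
  RMM_CUDA_TRY(cudaSetDevice(1));
  do_work_on_device_1();            // hypothetical helper; may throw
} catch (...) {
  // ~vec already ran during stack unwinding while device 1 was active;
  // deallocation switched back to device 0 internally, so no manual
  // cudaSetDevice(0) was needed before the exception propagated.
}
```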

Add tests of this facility, and correct the parameterization usage in
the other thrust allocator tests such that we actually check the MRs
we're parameterizing over.

- Closes #1527

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - Rong Ou (https://github.com/rongou)
  - Mark Harris (https://github.com/harrism)

URL: #1533
wence- authored Apr 16, 2024
1 parent 7ed529f commit 588928f
Showing 3 changed files with 43 additions and 11 deletions.
26 changes: 16 additions & 10 deletions README.md
@@ -375,14 +375,16 @@ this code is correct:

#### Use of `rmm::device_vector` with multiple devices

> [!CAUTION]
> In contrast to the uninitialized `rmm::device_uvector`, `rmm::device_vector` **DOES
> NOT** store the active device during construction, and therefore cannot arrange for it to be
> active when the destructor runs. It is therefore the responsibility of the user to ensure the
> currently active device is correct.
`rmm::device_vector` uses an `rmm::mr::thrust_allocator` to enable `thrust::device_vector` to
allocate and deallocate memory using RMM. As such, the usual rules for usage of the backing memory
resource apply: the active device must match the active device at resource construction time. To
facilitate use in an RAII setting, `rmm::mr::thrust_allocator` records the active device at
construction time and ensures that device is active whenever it allocates or deallocates memory.
Usage of `rmm::device_vector` with multiple devices is therefore the same as `rmm::device_buffer`.
One must _create_ `device_vector`s with the correct device active, but it is safe to destroy them
with a different active device.

`rmm::device_vector` is therefore slightly less ergonomic to use in a multiple device setting since
the caller must arrange that active devices on allocation and deallocation match. Recapitulating the
previous example using `rmm::device_vector`:
For example, recapitulating the previous example using `rmm::device_vector`:

```c++
{
@@ -391,12 +393,16 @@ previous example using `rmm::device_vector`:
rmm::device_vector<int> vec(16, rmm::mr::thrust_allocator<int>(rmm::cuda_stream_default, &mr));
RMM_CUDA_TRY(cudaSetDevice(1));
...
// ERROR: ~vec runs with device 1 active, but needs device 0 to be active
// No need to switch back to device 0 before ~vec runs
}
```

A correct example adds a call to `cudaSetDevice(0)` at the line of the error comment, so that
the destructor `~vec` runs with device 0 active.
> [!NOTE]
> Although allocation and deallocation in the `thrust_allocator` run with the correct active device,
> modification of `rmm::device_vector` might necessitate a kernel launch, and this must run with the
> correct device active. For example, `.resize()` might both allocate _and_ launch a kernel to
> initialize new elements: the user must arrange for this kernel launch to occur with the correct
> device for the memory resource active.
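
A minimal sketch (not part of this diff) of the pattern the note describes, switching back to the resource's device before calling `resize`:

```c++
{
  RMM_CUDA_TRY(cudaSetDevice(0));
  auto mr = rmm::mr::cuda_memory_resource{};
  rmm::device_vector<int> vec(16, rmm::mr::thrust_allocator<int>(rmm::cuda_stream_default, &mr));
  RMM_CUDA_TRY(cudaSetDevice(1));
  // ... work on device 1 ...
  RMM_CUDA_TRY(cudaSetDevice(0));  // resize may launch a kernel to initialize the new elements
  vec.resize(32);
  RMM_CUDA_TRY(cudaSetDevice(1));
  // ...
}  // safe: ~vec deallocates with device 0 active internally
```
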
## `cuda_stream_view` and `cuda_stream`

9 changes: 8 additions & 1 deletion include/rmm/mr/device/thrust_allocator_adaptor.hpp
@@ -16,6 +16,7 @@

#pragma once

#include <rmm/cuda_device.hpp>
#include <rmm/detail/thrust_namespace.h>
#include <rmm/mr/device/per_device_resource.hpp>
#include <rmm/resource_ref.hpp>
@@ -39,6 +40,9 @@ namespace rmm::mr {
* allocate objects of a specific type `T`, but can be freely rebound to other
* types.
*
* The allocator records the current cuda device and may only be used with a backing
* `device_async_resource_ref` valid for the same device.
*
* @tparam T The type of the objects that will be allocated by this allocator
*/
template <typename T>
@@ -92,7 +96,7 @@ class thrust_allocator : public thrust::device_malloc_allocator<T> {
*/
template <typename U>
thrust_allocator(thrust_allocator<U> const& other)
: _mr(other.resource()), _stream{other.stream()}
: _mr(other.resource()), _stream{other.stream()}, _device{other._device}
{
}

@@ -104,6 +108,7 @@ class thrust_allocator : public thrust::device_malloc_allocator<T> {
*/
pointer allocate(size_type num)
{
cuda_set_device_raii dev{_device};
return thrust::device_pointer_cast(
static_cast<T*>(_mr.allocate_async(num * sizeof(T), _stream)));
}
@@ -117,6 +122,7 @@ class thrust_allocator : public thrust::device_malloc_allocator<T> {
*/
void deallocate(pointer ptr, size_type num)
{
cuda_set_device_raii dev{_device};
return _mr.deallocate_async(thrust::raw_pointer_cast(ptr), num * sizeof(T), _stream);
}

@@ -143,6 +149,7 @@ class thrust_allocator : public thrust::device_malloc_allocator<T> {
private:
cuda_stream_view _stream{};
rmm::device_async_resource_ref _mr{rmm::mr::get_current_device_resource()};
cuda_device_id _device{get_current_cuda_device()};
};
/** @} */ // end of group
} // namespace rmm::mr
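
The `cuda_set_device_raii` guard used in `allocate` and `deallocate` above follows the familiar save/set/restore pattern. The following is a rough, illustrative sketch of such a guard using the CUDA runtime API; the type and member names here are hypothetical and not RMM's actual implementation:

```c++
#include <cuda_runtime_api.h>

// Illustrative only: an RAII guard in the spirit of rmm::cuda_set_device_raii.
struct device_guard {
  explicit device_guard(int device)
  {
    cudaGetDevice(&previous_);                        // remember the caller's device
    needs_reset_ = (device != previous_);
    if (needs_reset_) { cudaSetDevice(device); }      // switch to the recorded device
  }
  ~device_guard()
  {
    if (needs_reset_) { cudaSetDevice(previous_); }   // restore on scope exit
  }
  device_guard(device_guard const&)            = delete;
  device_guard& operator=(device_guard const&) = delete;

 private:
  int previous_{};
  bool needs_reset_{false};
};
```

Because the previous device is restored on scope exit, `allocate` and `deallocate` can temporarily switch to the recorded device without disturbing the caller's active device.
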
19 changes: 19 additions & 0 deletions tests/mr/device/thrust_allocator_tests.cu
@@ -16,7 +16,9 @@

#include "mr_ref_test.hpp"

#include <rmm/cuda_device.hpp>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/detail/error.hpp>
#include <rmm/device_vector.hpp>
#include <rmm/mr/device/per_device_resource.hpp>
#include <rmm/mr/device/thrust_allocator_adaptor.hpp>
@@ -36,19 +38,36 @@ struct allocator_test : public mr_ref_test {};

TEST_P(allocator_test, first)
{
rmm::mr::set_current_device_resource(this->mr.get());
auto const num_ints{100};
rmm::device_vector<int> ints(num_ints, 1);
EXPECT_EQ(num_ints, thrust::reduce(ints.begin(), ints.end()));
}

TEST_P(allocator_test, defaults)
{
rmm::mr::set_current_device_resource(this->mr.get());
rmm::mr::thrust_allocator<int> allocator(rmm::cuda_stream_default);
EXPECT_EQ(allocator.stream(), rmm::cuda_stream_default);
EXPECT_EQ(allocator.get_upstream_resource(),
rmm::device_async_resource_ref{rmm::mr::get_current_device_resource()});
}

TEST_P(allocator_test, multi_device)
{
if (rmm::get_num_cuda_devices() < 2) { GTEST_SKIP() << "Needs at least two devices"; }
cuda_set_device_raii with_device{rmm::get_current_cuda_device()};
rmm::cuda_stream stream{};
// make allocator on device-0
rmm::mr::thrust_allocator<int> allocator(stream.view(), this->ref);
auto const size{100};
EXPECT_NO_THROW([&]() {
auto vec = rmm::device_vector<int>(size, allocator);
// Destruct with device-1 active
RMM_CUDA_TRY(cudaSetDevice(1));
}());
}

INSTANTIATE_TEST_CASE_P(ThrustAllocatorTests,
allocator_test,
::testing::Values(mr_factory{"CUDA", &make_cuda},