Make thrust_allocator deallocate safe in multi-device setting (#1533)
Previously, the user had to arrange that the device active when a
thrust_allocator object was created was also active when allocate and
deallocate was called. This is hard to manage if exceptions are
thrown. Instead, save the active device on construction and ensure
that it is active when calling allocate and deallocate. This means
that device_vector is safe to destruct with RAII semantics in a
multi-device setting.
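
As an illustration of the behavior this enables, here is a minimal sketch (not part of this change); `do_work_on_device_1` is a hypothetical helper that may throw:

```c++
try {
  RMM_CUDA_TRY(cudaSetDevice(0));
  rmm::device_vector<int> vec(16);  // the thrust_allocator records device 0
  RMM_CUDA_TRY(cudaSetDevice(1));
  do_work_on_device_1();            // hypothetical helper; may throw
} catch (...) {
  // ~vec already ran during stack unwinding while device 1 was active;
  // deallocation switched back to device 0 internally, so no manual
  // cudaSetDevice(0) was needed before the exception propagated.
}
```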

Add tests of this facility, and correct the parameterization usage in
the other thrust allocator tests such that we actually check the MRs
we're parameterizing over.

- Closes #1527

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - Rong Ou (https://github.com/rongou)
  - Mark Harris (https://github.com/harrism)

URL: #1533
wence- authored Apr 16, 2024
1 parent 7ed529f commit 588928f
Showing 3 changed files with 43 additions and 11 deletions.
26 changes: 16 additions & 10 deletions README.md
@@ -375,14 +375,16 @@ this code is correct:

#### Use of `rmm::device_vector` with multiple devices

> [!CAUTION]
> In contrast to the uninitialized `rmm::device_uvector`, `rmm::device_vector` **DOES
> NOT** store the active device during construction, and therefore cannot arrange for it to be
> active when the destructor runs. It is therefore the responsibility of the user to ensure the
> currently active device is correct.
`rmm::device_vector` uses an `rmm::mr::thrust_allocator` to enable `thrust::device_vector` to
allocate and deallocate memory using RMM. As such, the usual rules for usage of the backing memory
resource apply: the active device must match the active device at resource construction time. To
facilitate use in an RAII setting, `rmm::mr::thrust_allocator` records the active device at
construction time and ensures that device is active whenever it allocates or deallocates memory.
Usage of `rmm::device_vector` with multiple devices is therefore the same as `rmm::device_buffer`.
One must _create_ `device_vector`s with the correct device active, but it is safe to destroy them
with a different active device.

`rmm::device_vector` is therefore slightly less ergonomic to use in a multiple device setting since
the caller must arrange that active devices on allocation and deallocation match. Recapitulating the
previous example using `rmm::device_vector`:
For example, recapitulating the previous example using `rmm::device_vector`:

```c++
{
@@ -391,12 +393,16 @@ previous example using `rmm::device_vector`:
rmm::device_vector<int> vec(16, rmm::mr::thrust_allocator<int>(rmm::cuda_stream_default, &mr));
RMM_CUDA_TRY(cudaSetDevice(1));
...
// ERROR: ~vec runs with device 1 active, but needs device 0 to be active
// No need to switch back to device 0 before ~vec runs
}
```

A correct example adds a call to `cudaSetDevice(0)` at the line of the error comment, so that
the destructor `~vec` runs with device 0 active.
> [!NOTE]
> Although allocation and deallocation in the `thrust_allocator` run with the correct active device,
> modification of `rmm::device_vector` might necessitate a kernel launch, and this must run with the
> correct device active. For example, `.resize()` might both allocate _and_ launch a kernel to
> initialize new elements: the user must arrange for this kernel launch to occur with the correct
> device for the memory resource active.
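
A minimal sketch (not part of this diff) of the pattern the note describes, switching back to the resource's device before calling `resize`:

```c++
{
  RMM_CUDA_TRY(cudaSetDevice(0));
  auto mr = rmm::mr::cuda_memory_resource{};
  rmm::device_vector<int> vec(16, rmm::mr::thrust_allocator<int>(rmm::cuda_stream_default, &mr));
  RMM_CUDA_TRY(cudaSetDevice(1));
  // ... work on device 1 ...
  RMM_CUDA_TRY(cudaSetDevice(0));  // resize may launch a kernel to initialize the new elements
  vec.resize(32);
  RMM_CUDA_TRY(cudaSetDevice(1));
  // ...
}  // safe: ~vec deallocates with device 0 active internally
```
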
## `cuda_stream_view` and `cuda_stream`

9 changes: 8 additions & 1 deletion include/rmm/mr/device/thrust_allocator_adaptor.hpp
@@ -16,6 +16,7 @@

#pragma once

#include <rmm/cuda_device.hpp>
#include <rmm/detail/thrust_namespace.h>
#include <rmm/mr/device/per_device_resource.hpp>
#include <rmm/resource_ref.hpp>
@@ -39,6 +40,9 @@ namespace rmm::mr {
* allocate objects of a specific type `T`, but can be freely rebound to other
* types.
*
* The allocator records the current cuda device and may only be used with a backing
* `device_async_resource_ref` valid for the same device.
*
* @tparam T The type of the objects that will be allocated by this allocator
*/
template <typename T>
@@ -92,7 +96,7 @@ class thrust_allocator : public thrust::device_malloc_allocator<T> {
*/
template <typename U>
thrust_allocator(thrust_allocator<U> const& other)
: _mr(other.resource()), _stream{other.stream()}
: _mr(other.resource()), _stream{other.stream()}, _device{other._device}
{
}

@@ -104,6 +108,7 @@ class thrust_allocator : public thrust::device_malloc_allocator<T> {
*/
pointer allocate(size_type num)
{
cuda_set_device_raii dev{_device};
return thrust::device_pointer_cast(
static_cast<T*>(_mr.allocate_async(num * sizeof(T), _stream)));
}
@@ -117,6 +122,7 @@ class thrust_allocator : public thrust::device_malloc_allocator<T> {
*/
void deallocate(pointer ptr, size_type num)
{
cuda_set_device_raii dev{_device};
return _mr.deallocate_async(thrust::raw_pointer_cast(ptr), num * sizeof(T), _stream);
}

@@ -143,6 +149,7 @@ class thrust_allocator : public thrust::device_malloc_allocator<T> {
private:
cuda_stream_view _stream{};
rmm::device_async_resource_ref _mr{rmm::mr::get_current_device_resource()};
cuda_device_id _device{get_current_cuda_device()};
};
/** @} */ // end of group
} // namespace rmm::mr
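
The `cuda_set_device_raii` guard used in `allocate` and `deallocate` above follows the familiar save/set/restore pattern. The following is a rough, illustrative sketch of such a guard using the CUDA runtime API; the type and member names here are hypothetical and not RMM's actual implementation:

```c++
#include <cuda_runtime_api.h>

// Illustrative only: an RAII guard in the spirit of rmm::cuda_set_device_raii.
struct device_guard {
  explicit device_guard(int device)
  {
    cudaGetDevice(&previous_);                        // remember the caller's device
    needs_reset_ = (device != previous_);
    if (needs_reset_) { cudaSetDevice(device); }      // switch to the recorded device
  }
  ~device_guard()
  {
    if (needs_reset_) { cudaSetDevice(previous_); }   // restore on scope exit
  }
  device_guard(device_guard const&)            = delete;
  device_guard& operator=(device_guard const&) = delete;

 private:
  int previous_{};
  bool needs_reset_{false};
};
```

Because the previous device is restored on scope exit, `allocate` and `deallocate` can temporarily switch to the recorded device without disturbing the caller's active device.
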
19 changes: 19 additions & 0 deletions tests/mr/device/thrust_allocator_tests.cu
@@ -16,7 +16,9 @@

#include "mr_ref_test.hpp"

#include <rmm/cuda_device.hpp>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/detail/error.hpp>
#include <rmm/device_vector.hpp>
#include <rmm/mr/device/per_device_resource.hpp>
#include <rmm/mr/device/thrust_allocator_adaptor.hpp>
@@ -36,19 +38,36 @@ struct allocator_test : public mr_ref_test {};

TEST_P(allocator_test, first)
{
rmm::mr::set_current_device_resource(this->mr.get());
auto const num_ints{100};
rmm::device_vector<int> ints(num_ints, 1);
EXPECT_EQ(num_ints, thrust::reduce(ints.begin(), ints.end()));
}

TEST_P(allocator_test, defaults)
{
rmm::mr::set_current_device_resource(this->mr.get());
rmm::mr::thrust_allocator<int> allocator(rmm::cuda_stream_default);
EXPECT_EQ(allocator.stream(), rmm::cuda_stream_default);
EXPECT_EQ(allocator.get_upstream_resource(),
rmm::device_async_resource_ref{rmm::mr::get_current_device_resource()});
}

TEST_P(allocator_test, multi_device)
{
if (rmm::get_num_cuda_devices() < 2) { GTEST_SKIP() << "Needs at least two devices"; }
cuda_set_device_raii with_device{rmm::get_current_cuda_device()};
rmm::cuda_stream stream{};
// make allocator on device-0
rmm::mr::thrust_allocator<int> allocator(stream.view(), this->ref);
auto const size{100};
EXPECT_NO_THROW([&]() {
auto vec = rmm::device_vector<int>(size, allocator);
// Destruct with device-1 active
RMM_CUDA_TRY(cudaSetDevice(1));
}());
}

INSTANTIATE_TEST_CASE_P(ThrustAllocatorTests,
allocator_test,
::testing::Values(mr_factory{"CUDA", &make_cuda},