Add support for malloc pitch and 2d copy between host and device in cust

Rust-GPU · Mar 17, 2022 · 8331043 · 8331043
1 parent 3bfceac
commit 8331043
Show file tree

Hide file tree

Showing 3 changed files with 225 additions and 0 deletions.
diff --git a/crates/cust/CHANGELOG.md b/crates/cust/CHANGELOG.md
@@ -5,6 +5,7 @@ Notable changes to this project will be documented in this file.
 ## Unreleased
 
  - Add `memory::memcpy_dtoh` to allow copying from device to host.
+ - Add support in `memory` for pitched malloc and 2D memcpy between device and host.
 
 ## 0.3.2 - 2/16/22
 

diff --git a/crates/cust/src/memory/malloc.rs b/crates/cust/src/memory/malloc.rs
@@ -148,6 +148,60 @@ pub unsafe fn cuda_malloc_unified<T: DeviceCopy>(count: usize) -> CudaResult<Uni
     Ok(UnifiedPointer::wrap(ptr as *mut T))
 }
 
+/// Unsafe wrapper around the `cuMemAllocPitch` function, which allocates pitched (two-dimensional)
+/// device memory for `height` rows of `width` elements of `T`.
+///
+/// Returns a [`DevicePointer`](struct.DevicePointer.html), pointing to the allocated memory, and
+/// a `usize` containing the row pitch in *bytes*. The memory is not cleared.
+///
+/// Note that `width` is in units of `T`; thus a `width` of 3 requests rows of
+/// `3 * size_of::<T>()` bytes. `height` is the number of rows.
+///
+/// Memory buffers allocated using `cuda_malloc_pitched` must be freed using
+/// [`cuda_free`](fn.cuda_free.html).
+///
+/// # Errors
+///
+/// If allocating memory fails, returns the CUDA error value.
+/// Returns `InvalidMemoryAllocation` if the row width in bytes is zero (either because `width` is
+/// zero or because `T` is a zero-sized type), if `height` is zero, if the row width in bytes would
+/// overflow a `usize`, or if `size_of::<T>()` does not fit in a C `unsigned int`.
+///
+/// # Safety
+///
+/// Since the allocated memory is not initialized, the caller must ensure that it is initialized
+/// before copying it to the host in any way. Additionally, the caller must ensure that the memory
+/// allocated is freed using `cuda_free`, or the memory will be leaked.
+///
+/// # Examples
+///
+/// ```
+/// # let _context = cust::quick_init().unwrap();
+/// # fn foo() -> Result<(), cust::error::CudaError> {
+/// use cust::memory::*;
+/// unsafe {
+///     // Allocate space for a 3x3 matrix of f32s
+///     let (device_buffer, pitch) = cuda_malloc_pitched::<f32>(3, 3)?;
+///     cuda_free(device_buffer)?;
+/// }
+/// # Ok(())
+/// # }
+/// # foo().unwrap();
+/// ```
+pub unsafe fn cuda_malloc_pitched<T: DeviceCopy>(width: usize, height: usize) -> CudaResult<(DevicePointer<T>, usize)> {
+    // The element size is handed to the driver as a C `unsigned int`, so it must fit in one.
+    let element_size: std::os::raw::c_uint = std::mem::size_of::<T>()
+        .try_into()
+        .map_err(|_| CudaError::InvalidMemoryAllocation)?;
+
+    // An overflowing row width is folded into 0 so that it is rejected by the same check as an
+    // empty allocation.
+    let width_bytes = width.checked_mul(std::mem::size_of::<T>()).unwrap_or(0);
+    if width_bytes == 0 || height == 0 {
+        return Err(CudaError::InvalidMemoryAllocation);
+    }
+
+    // Out-parameters filled in by the driver call: the device address and the row pitch.
+    let mut ptr = 0;
+    let mut pitch = 0;
+    cuda::cuMemAllocPitch_v2(&mut ptr, &mut pitch, width_bytes, height, element_size).to_result()?;
+    Ok((DevicePointer::from_raw(ptr), pitch))
+}
+
 /// Free memory allocated with [`cuda_malloc`](fn.cuda_malloc.html).
 ///
 /// # Errors

diff --git a/crates/cust/src/memory/mod.rs b/crates/cust/src/memory/mod.rs
@@ -225,6 +225,176 @@ pub unsafe fn memcpy_dtoh(
     Ok(())
 }
 
+/// Similar to `cudaMemcpy2D` with `HostToDevice` copy type.
+///
+/// Copies `height` rows of `width` elements of `T` from the host pointer `src` to the device
+/// pointer `dst` by calling `cuMemcpy2D_v2`.
+///
+/// `dpitch`/`spitch` are the number of bytes between the starts of two consecutive rows in the
+/// destination/source respectively.
+/// `width` is the number of *elements* (not bytes) in a row.
+/// `height` is the total number of rows (not bytes).
+///
+/// # Errors
+///
+/// Returns `InvalidMemoryAllocation` if `width * size_of::<T>()` overflows a `usize`.
+/// If the underlying `cuMemcpy2D_v2` call fails, returns the CUDA error value.
+///
+/// # Safety
+///
+/// This function does not validate the pointers or the pitches; they are passed to the driver
+/// as given. The caller is presumably responsible for ensuring that `src` is valid for reads of
+/// `height` rows of `width` elements spaced `spitch` bytes apart, and that `dst` refers to a
+/// device allocation large enough for `height` rows spaced `dpitch` bytes apart (confirm the
+/// exact requirements against the CUDA driver documentation).
+///
+/// # Examples
+///
+/// ```
+/// # let _context = cust::quick_init().unwrap();
+/// # fn foo() -> Result<(), cust::error::CudaError> {
+/// use cust::memory::*;
+/// unsafe {
+///     // Allocate space for a 3x3 matrix of f32s
+///     let (device_buffer, pitch) = cuda_malloc_pitched::<f32>(3, 3)?;
+///
+///     let src_array: [f32; 9] = [
+///         1.0, 2.0, 3.0,
+///         4.0, 5.0, 6.0,
+///         7.0, 8.0, 9.0];
+///
+///     memcpy_2d_htod(
+///         device_buffer,
+///         pitch,
+///         src_array.as_slice().as_ptr(),
+///         3*std::mem::size_of::<f32>(),
+///         3,
+///         3
+///     )?;
+///
+///     let mut dst_array = [0.0f32; 9];
+///
+///     memcpy_2d_dtoh(
+///         dst_array.as_mut_slice().as_mut_ptr(),
+///         3*std::mem::size_of::<f32>(),
+///         device_buffer,
+///         pitch,
+///         3,
+///         3
+///     )?;
+///
+///     assert_eq!(dst_array, src_array);
+///     cuda_free(device_buffer)?;
+/// }
+/// # Ok(())
+/// # }
+/// # foo().unwrap();
+/// ```
+#[allow(clippy::missing_safety_doc)]
+pub unsafe fn memcpy_2d_htod<T: DeviceCopy>(
+    dst: DevicePointer<T>,
+    dpitch: usize,
+    src: *const T,
+    spitch: usize,
+    width: usize,
+    height: usize,
+) -> CudaResult<()> {
+    use cust_raw::CUmemorytype;
+
+    // The copy descriptor takes the row width in bytes, while the caller passes elements.
+    let width_in_bytes = width.checked_mul(std::mem::size_of::<T>())
+        .ok_or(CudaError::InvalidMemoryAllocation)?;
+
+    // Host source, device destination; the copy starts at offset (0, 0) on both sides.
+    let pcopy = cust_raw::CUDA_MEMCPY2D_st {
+        srcXInBytes: 0,
+        srcY: 0,
+        srcMemoryType: CUmemorytype::CU_MEMORYTYPE_HOST,
+        srcHost: src as *const c_void,
+        srcDevice: 0, // Ignored
+        srcArray: std::ptr::null_mut::<cust_raw::CUarray_st>(), // Ignored
+        srcPitch: spitch,
+        dstXInBytes: 0,
+        dstY: 0,
+        dstMemoryType: CUmemorytype::CU_MEMORYTYPE_DEVICE,
+        dstHost: std::ptr::null_mut::<c_void>(), // Ignored
+        dstDevice: dst.as_raw(),
+        dstArray: std::ptr::null_mut::<cust_raw::CUarray_st>(), // Ignored
+        dstPitch: dpitch,
+        WidthInBytes: width_in_bytes,
+        Height: height,
+    };
+
+    crate::sys::cuMemcpy2D_v2(&pcopy).to_result()?;
+    Ok(())
+}
+
+/// Similar to `cudaMemcpy2D` with `DeviceToHost` copy type.
+///
+/// Copies `height` rows of `width` elements of `T` from the device pointer `src` to the host
+/// pointer `dst` by calling `cuMemcpy2D_v2`.
+///
+/// `dpitch`/`spitch` are the number of bytes between the starts of two consecutive rows in the
+/// destination/source respectively.
+/// `width` is the number of *elements* (not bytes) in a row.
+/// `height` is the total number of rows (not bytes).
+///
+/// # Errors
+///
+/// Returns `InvalidMemoryAllocation` if `width * size_of::<T>()` overflows a `usize`.
+/// If the underlying `cuMemcpy2D_v2` call fails, returns the CUDA error value.
+///
+/// # Safety
+///
+/// This function does not validate the pointers or the pitches; they are passed to the driver
+/// as given. The caller is presumably responsible for ensuring that `dst` is valid for writes of
+/// `height` rows of `width` elements spaced `dpitch` bytes apart, and that `src` refers to a
+/// device allocation holding at least `height` rows spaced `spitch` bytes apart (confirm the
+/// exact requirements against the CUDA driver documentation).
+///
+/// # Examples
+///
+/// ```
+/// # let _context = cust::quick_init().unwrap();
+/// # fn foo() -> Result<(), cust::error::CudaError> {
+/// use cust::memory::*;
+/// unsafe {
+///     // Allocate space for a 3x3 matrix of f32s
+///     let (device_buffer, pitch) = cuda_malloc_pitched::<f32>(3, 3)?;
+///
+///     let src_array: [f32; 9] = [
+///         1.0, 2.0, 3.0,
+///         4.0, 5.0, 6.0,
+///         7.0, 8.0, 9.0];
+///
+///     memcpy_2d_htod(
+///         device_buffer,
+///         pitch,
+///         src_array.as_slice().as_ptr(),
+///         3*std::mem::size_of::<f32>(),
+///         3,
+///         3
+///     )?;
+///
+///     let mut dst_array = [0.0f32; 9];
+///
+///     memcpy_2d_dtoh(
+///         dst_array.as_mut_slice().as_mut_ptr(),
+///         3*std::mem::size_of::<f32>(),
+///         device_buffer,
+///         pitch,
+///         3,
+///         3
+///     )?;
+///
+///     assert_eq!(dst_array, src_array);
+///     cuda_free(device_buffer)?;
+/// }
+/// # Ok(())
+/// # }
+/// # foo().unwrap();
+/// ```
+#[allow(clippy::missing_safety_doc)]
+pub unsafe fn memcpy_2d_dtoh<T: DeviceCopy>(
+    dst: *mut T,
+    dpitch: usize,
+    src: DevicePointer<T>,
+    spitch: usize,
+    width: usize,
+    height: usize,
+) -> CudaResult<()> {
+    use cust_raw::CUmemorytype;
+
+    // The copy descriptor takes the row width in bytes, while the caller passes elements.
+    let width_in_bytes = width.checked_mul(std::mem::size_of::<T>())
+        .ok_or(CudaError::InvalidMemoryAllocation)?;
+
+    // Device source, host destination; the copy starts at offset (0, 0) on both sides.
+    let pcopy = cust_raw::CUDA_MEMCPY2D_st {
+        srcXInBytes: 0,
+        srcY: 0,
+        srcMemoryType: CUmemorytype::CU_MEMORYTYPE_DEVICE,
+        srcHost: std::ptr::null_mut::<c_void>(), // Ignored
+        srcDevice: src.as_raw(),
+        srcArray: std::ptr::null_mut::<cust_raw::CUarray_st>(), // Ignored
+        srcPitch: spitch,
+        dstXInBytes: 0,
+        dstY: 0,
+        dstMemoryType: CUmemorytype::CU_MEMORYTYPE_HOST,
+        dstHost: dst as *mut c_void, 
+        dstDevice: 0, // Ignored
+        dstArray: std::ptr::null_mut::<cust_raw::CUarray_st>(), // Ignored
+        dstPitch: dpitch,
+        WidthInBytes: width_in_bytes,
+        Height: height,
+    };
+
+    crate::sys::cuMemcpy2D_v2(&pcopy).to_result()?;
+    Ok(())
+}
+
 /// Get the current free and total memory.
 ///
 /// Returns in `.1` the total amount of memory available to the current context.