Skip to content

Commit

Permalink
Add support for malloc pitch and 2d copy between host and device in cust
Browse files Browse the repository at this point in the history
  • Loading branch information
kjetilkjeka authored and = committed Mar 17, 2022
1 parent 3bfceac commit 8331043
Show file tree
Hide file tree
Showing 3 changed files with 225 additions and 0 deletions.
1 change: 1 addition & 0 deletions crates/cust/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ Notable changes to this project will be documented in this file.
## Unreleased

- Add `memory::memcpy_dtoh` to allow copying from device to host.
- Add support in `memory` for pitched malloc and 2D memcpy between device and host.

## 0.3.2 - 2/16/22

Expand Down
54 changes: 54 additions & 0 deletions crates/cust/src/memory/malloc.rs
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,60 @@ pub unsafe fn cuda_malloc_unified<T: DeviceCopy>(count: usize) -> CudaResult<Uni
Ok(UnifiedPointer::wrap(ptr as *mut T))
}

/// Unsafe wrapper around the `cuMemAllocPitch` function, which allocates device memory in two
/// dimensions where rows are memory aligned to the containing datatype.
///
/// Returns a [`DevicePointer`](struct.DevicePointer.html), pointing to the allocated memory and
/// a `usize` containing the row pitch in *bytes*. The pitch is chosen by the driver and is what
/// must be used to step from one row to the next. The memory is not cleared.
///
/// Note that `width` is in units of `T`; thus a `width` of 3 requests rows with
/// `3 * size_of::<T>()` bytes of usable memory. `height` is the number of rows.
///
/// `size_of::<T>()` is forwarded to the driver as the element size.
/// NOTE(review): the CUDA driver documentation only lists 4, 8 and 16 as valid element sizes, so
/// other sizes of `T` are presumably rejected by the driver with an error - confirm.
///
/// Memory buffers allocated using `cuda_malloc_pitched` must be freed using
/// [`cuda_free`](fn.cuda_free.html).
///
/// # Errors
///
/// If allocating memory fails, returns the CUDA error value.
/// Returns `InvalidMemoryAllocation` without calling the driver if:
///
/// - the number of bytes in a row is zero (either because `width` is zero or because `T` is a
///   zero-sized type), or `height` is zero;
/// - `width * size_of::<T>()` would overflow a `usize`;
/// - `size_of::<T>()` does not fit in a C `unsigned int`.
///
/// # Safety
///
/// Since the allocated memory is not initialized, the caller must ensure that it is initialized
/// before copying it to the host in any way. Additionally, the caller must ensure that the memory
/// allocated is freed using `cuda_free`, or the memory will be leaked.
///
/// # Examples
///
/// ```
/// # let _context = cust::quick_init().unwrap();
/// # fn foo() -> Result<(), cust::error::CudaError> {
/// use cust::memory::*;
/// unsafe {
///     // Allocate space for a 3x3 matrix of f32s
///     let (device_buffer, pitch) = cuda_malloc_pitched::<f32>(3, 3)?;
///     cuda_free(device_buffer)?;
/// }
/// # Ok(())
/// # }
/// # foo().unwrap();
/// ```
pub unsafe fn cuda_malloc_pitched<T: DeviceCopy>(
    width: usize,
    height: usize,
) -> CudaResult<(DevicePointer<T>, usize)> {
    // The driver takes the element size as a C `unsigned int`.
    let element_size: std::os::raw::c_uint = std::mem::size_of::<T>()
        .try_into()
        .map_err(|_| CudaError::InvalidMemoryAllocation)?;

    // Reject an overflowing row size directly rather than mapping it to a zero sentinel.
    let width_bytes = width
        .checked_mul(std::mem::size_of::<T>())
        .ok_or(CudaError::InvalidMemoryAllocation)?;
    if width_bytes == 0 || height == 0 {
        return Err(CudaError::InvalidMemoryAllocation);
    }

    // Both values are out-parameters filled in by the driver on success.
    let mut ptr = 0;
    let mut pitch = 0;
    cuda::cuMemAllocPitch_v2(&mut ptr, &mut pitch, width_bytes, height, element_size)
        .to_result()?;
    Ok((DevicePointer::from_raw(ptr), pitch))
}

/// Free memory allocated with [`cuda_malloc`](fn.cuda_malloc.html).
///
/// # Errors
Expand Down
170 changes: 170 additions & 0 deletions crates/cust/src/memory/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,176 @@ pub unsafe fn memcpy_dtoh(
Ok(())
}

/// Copies a 2D region from host memory to device memory.
///
/// Similar to `cudaMemcpy2D` with `HostToDevice` copy type; implemented with `cuMemcpy2D`.
///
/// - `dst` / `dpitch`: destination device pointer and the number of *bytes* between the start of
///   two consecutive destination rows.
/// - `src` / `spitch`: source host pointer and the number of *bytes* between the start of two
///   consecutive source rows.
/// - `width` is the number of *elements* (not bytes) in a row.
/// - `height` is the total number of rows (not bytes).
///
/// # Errors
///
/// Returns `InvalidMemoryAllocation` if `width * size_of::<T>()` overflows a `usize`.
/// If the driver rejects or fails the copy, returns the CUDA error value.
///
/// # Safety
///
/// The pointers and pitches are handed to the driver unchecked, so the caller must ensure that:
///
/// - `src` is valid for reads of `height` rows of `width * size_of::<T>()` bytes each, where
///   consecutive rows start `spitch` bytes apart;
/// - `dst` refers to a live device allocation that is large enough to hold `height` rows of
///   `width * size_of::<T>()` bytes each, where consecutive rows start `dpitch` bytes apart;
/// - `spitch` and `dpitch` are each at least `width * size_of::<T>()`.
///
/// # Examples
///
/// ```
/// # let _context = cust::quick_init().unwrap();
/// # fn foo() -> Result<(), cust::error::CudaError> {
/// use cust::memory::*;
/// unsafe {
///     // Allocate space for a 3x3 matrix of f32s
///     let (device_buffer, pitch) = cuda_malloc_pitched::<f32>(3, 3)?;
///
///     let src_array: [f32; 9] = [
///         1.0, 2.0, 3.0,
///         4.0, 5.0, 6.0,
///         7.0, 8.0, 9.0];
///
///     memcpy_2d_htod(
///         device_buffer,
///         pitch,
///         src_array.as_slice().as_ptr(),
///         3*std::mem::size_of::<f32>(),
///         3,
///         3
///     )?;
///
///     let mut dst_array = [0.0f32; 9];
///
///     memcpy_2d_dtoh(
///         dst_array.as_mut_slice().as_mut_ptr(),
///         3*std::mem::size_of::<f32>(),
///         device_buffer,
///         pitch,
///         3,
///         3
///     )?;
///
///     assert_eq!(dst_array, src_array);
///     cuda_free(device_buffer)?;
/// }
/// # Ok(())
/// # }
/// # foo().unwrap();
/// ```
pub unsafe fn memcpy_2d_htod<T: DeviceCopy>(
    dst: DevicePointer<T>,
    dpitch: usize,
    src: *const T,
    spitch: usize,
    width: usize,
    height: usize,
) -> CudaResult<()> {
    use cust_raw::CUmemorytype;

    // The driver measures row width in bytes, while callers pass it in elements.
    let width_in_bytes = width
        .checked_mul(std::mem::size_of::<T>())
        .ok_or(CudaError::InvalidMemoryAllocation)?;

    // Copy the whole region starting at offset (0, 0) on both sides. Only the fields matching
    // each side's memory type are meaningful; the others are filled with null/zero.
    let pcopy = cust_raw::CUDA_MEMCPY2D_st {
        srcXInBytes: 0,
        srcY: 0,
        srcMemoryType: CUmemorytype::CU_MEMORYTYPE_HOST,
        srcHost: src as *const c_void,
        srcDevice: 0,                   // Ignored
        srcArray: std::ptr::null_mut(), // Ignored
        srcPitch: spitch,
        dstXInBytes: 0,
        dstY: 0,
        dstMemoryType: CUmemorytype::CU_MEMORYTYPE_DEVICE,
        dstHost: std::ptr::null_mut(), // Ignored
        dstDevice: dst.as_raw(),
        dstArray: std::ptr::null_mut(), // Ignored
        dstPitch: dpitch,
        WidthInBytes: width_in_bytes,
        Height: height,
    };

    crate::sys::cuMemcpy2D_v2(&pcopy).to_result()?;
    Ok(())
}

/// Copies a 2D region from device memory to host memory.
///
/// Similar to `cudaMemcpy2D` with `DeviceToHost` copy type; implemented with `cuMemcpy2D`.
///
/// - `dst` / `dpitch`: destination host pointer and the number of *bytes* between the start of
///   two consecutive destination rows.
/// - `src` / `spitch`: source device pointer and the number of *bytes* between the start of two
///   consecutive source rows.
/// - `width` is the number of *elements* (not bytes) in a row.
/// - `height` is the total number of rows (not bytes).
///
/// # Errors
///
/// Returns `InvalidMemoryAllocation` if `width * size_of::<T>()` overflows a `usize`.
/// If the driver rejects or fails the copy, returns the CUDA error value.
///
/// # Safety
///
/// The pointers and pitches are handed to the driver unchecked, so the caller must ensure that:
///
/// - `dst` is valid for writes of `height` rows of `width * size_of::<T>()` bytes each, where
///   consecutive rows start `dpitch` bytes apart;
/// - `src` refers to a live device allocation that holds `height` rows of
///   `width * size_of::<T>()` bytes each, where consecutive rows start `spitch` bytes apart,
///   and that memory has been initialized;
/// - `spitch` and `dpitch` are each at least `width * size_of::<T>()`.
///
/// # Examples
///
/// ```
/// # let _context = cust::quick_init().unwrap();
/// # fn foo() -> Result<(), cust::error::CudaError> {
/// use cust::memory::*;
/// unsafe {
///     // Allocate space for a 3x3 matrix of f32s
///     let (device_buffer, pitch) = cuda_malloc_pitched::<f32>(3, 3)?;
///
///     let src_array: [f32; 9] = [
///         1.0, 2.0, 3.0,
///         4.0, 5.0, 6.0,
///         7.0, 8.0, 9.0];
///
///     memcpy_2d_htod(
///         device_buffer,
///         pitch,
///         src_array.as_slice().as_ptr(),
///         3*std::mem::size_of::<f32>(),
///         3,
///         3
///     )?;
///
///     let mut dst_array = [0.0f32; 9];
///
///     memcpy_2d_dtoh(
///         dst_array.as_mut_slice().as_mut_ptr(),
///         3*std::mem::size_of::<f32>(),
///         device_buffer,
///         pitch,
///         3,
///         3
///     )?;
///
///     assert_eq!(dst_array, src_array);
///     cuda_free(device_buffer)?;
/// }
/// # Ok(())
/// # }
/// # foo().unwrap();
/// ```
pub unsafe fn memcpy_2d_dtoh<T: DeviceCopy>(
    dst: *mut T,
    dpitch: usize,
    src: DevicePointer<T>,
    spitch: usize,
    width: usize,
    height: usize,
) -> CudaResult<()> {
    use cust_raw::CUmemorytype;

    // The driver measures row width in bytes, while callers pass it in elements.
    let width_in_bytes = width
        .checked_mul(std::mem::size_of::<T>())
        .ok_or(CudaError::InvalidMemoryAllocation)?;

    // Copy the whole region starting at offset (0, 0) on both sides. Only the fields matching
    // each side's memory type are meaningful; the others are filled with null/zero.
    let pcopy = cust_raw::CUDA_MEMCPY2D_st {
        srcXInBytes: 0,
        srcY: 0,
        srcMemoryType: CUmemorytype::CU_MEMORYTYPE_DEVICE,
        srcHost: std::ptr::null::<c_void>(), // Ignored
        srcDevice: src.as_raw(),
        srcArray: std::ptr::null_mut(), // Ignored
        srcPitch: spitch,
        dstXInBytes: 0,
        dstY: 0,
        dstMemoryType: CUmemorytype::CU_MEMORYTYPE_HOST,
        dstHost: dst as *mut c_void,
        dstDevice: 0,                   // Ignored
        dstArray: std::ptr::null_mut(), // Ignored
        dstPitch: dpitch,
        WidthInBytes: width_in_bytes,
        Height: height,
    };

    crate::sys::cuMemcpy2D_v2(&pcopy).to_result()?;
    Ok(())
}

/// Get the current free and total memory.
///
/// Returns in `.1` the total amount of memory available to the current context.
Expand Down

0 comments on commit 8331043

Please sign in to comment.