diff --git a/crates/cust/CHANGELOG.md b/crates/cust/CHANGELOG.md
index af5aa2b..d734c68 100644
--- a/crates/cust/CHANGELOG.md
+++ b/crates/cust/CHANGELOG.md
@@ -5,6 +5,7 @@ Notable changes to this project will be documented in this file.
 ## Unreleased
 
  - Add `memory::memcpy_dtoh` to allow copying from device to host.
+ - Add support in `memory` for pitched malloc and 2D memcpy between device and host.
 
 ## 0.3.2 - 2/16/22
 
diff --git a/crates/cust/src/memory/malloc.rs b/crates/cust/src/memory/malloc.rs
index fb86501..890a1ce 100644
--- a/crates/cust/src/memory/malloc.rs
+++ b/crates/cust/src/memory/malloc.rs
@@ -148,6 +148,60 @@ pub unsafe fn cuda_malloc_unified(count: usize) -> CudaResult()` bytes
+/// of memory.
+///
+/// Memory buffers allocated using `cuda_malloc_pitched` must be freed using [`cuda_free`](fn.cuda_free.html).
+///
+/// # Errors
+///
+/// If allocating memory fails, returns the CUDA error value.
+/// If the row size in bytes is zero (either because `width` is zero or because T is a zero-sized
+/// type) or overflows a usize, or if `height` is zero, returns InvalidMemoryAllocation.
+///
+/// # Safety
+///
+/// Since the allocated memory is not initialized, the caller must ensure that it is initialized
+/// before copying it to the host in any way. Additionally, the caller must ensure that the memory
+/// allocated is freed using cuda_free, or the memory will be leaked.
+///
+/// # Examples
+///
+/// ```
+/// # let _context = cust::quick_init().unwrap();
+/// # fn foo() -> Result<(), cust::error::CudaError> {
+/// use cust::memory::*;
+/// unsafe {
+///     // Allocate space for a 3x3 matrix of f32s
+///     let (device_buffer, pitch) = cuda_malloc_pitched::<f32>(3, 3)?;
+///     cuda_free(device_buffer)?;
+/// }
+/// # Ok(())
+/// # }
+/// # foo().unwrap();
+/// ```
+pub unsafe fn cuda_malloc_pitched<T: DeviceCopy>(width: usize, height: usize) -> CudaResult<(DevicePointer<T>, usize)> {
+    let element_size: std::os::raw::c_uint = std::mem::size_of::<T>()
+        .try_into()
+        .map_err(|_| CudaError::InvalidMemoryAllocation)?;
+
+    let width_bytes = width.checked_mul(std::mem::size_of::<T>()).unwrap_or(0);
+    if width_bytes == 0 || height == 0 {
+        return Err(CudaError::InvalidMemoryAllocation);
+    }
+
+    let mut ptr = 0;
+    let mut pitch = 0;
+    cuda::cuMemAllocPitch_v2(&mut ptr, &mut pitch, width_bytes, height, element_size).to_result()?;
+    Ok((DevicePointer::from_raw(ptr), pitch))
+}
+
 /// Free memory allocated with [`cuda_malloc`](fn.cuda_malloc.html).
 ///
 /// # Errors
diff --git a/crates/cust/src/memory/mod.rs b/crates/cust/src/memory/mod.rs
index c0639e1..d8caa73 100644
--- a/crates/cust/src/memory/mod.rs
+++ b/crates/cust/src/memory/mod.rs
@@ -225,6 +225,194 @@ pub unsafe fn memcpy_dtoh(
     Ok(())
 }
 
+/// Similar to `cudaMemcpy2D` with `HostToDevice` copy type.
+///
+/// `dpitch`/`spitch` are the byte offsets between the starts of consecutive rows.
+/// `width` is the number of *elements* (not bytes) in a row.
+/// `height` is the total number of rows (not bytes).
+///
+/// # Errors
+///
+/// Returns `CudaError::InvalidMemoryAllocation` if `width * size_of::<T>()` overflows a `usize`.
+/// If the copy itself fails, returns the CUDA error value.
+///
+/// # Safety
+///
+/// `src` must be valid for reads, and `dst` valid for writes, of `height` rows of
+/// `width` elements each, with consecutive rows `spitch`/`dpitch` bytes apart.
+///
+/// # Examples
+///
+/// ```
+/// # let _context = cust::quick_init().unwrap();
+/// # fn foo() -> Result<(), cust::error::CudaError> {
+/// use cust::memory::*;
+/// unsafe {
+///     // Allocate space for a 3x3 matrix of f32s
+///     let (device_buffer, pitch) = cuda_malloc_pitched::<f32>(3, 3)?;
+///
+///     let src_array: [f32; 9] = [
+///         1.0, 2.0, 3.0,
+///         4.0, 5.0, 6.0,
+///         7.0, 8.0, 9.0];
+///
+///     memcpy_2d_htod(
+///         device_buffer,
+///         pitch,
+///         src_array.as_slice().as_ptr(),
+///         3*std::mem::size_of::<f32>(),
+///         3,
+///         3
+///     )?;
+///
+///     let mut dst_array = [0.0f32; 9];
+///
+///     memcpy_2d_dtoh(
+///         dst_array.as_mut_slice().as_mut_ptr(),
+///         3*std::mem::size_of::<f32>(),
+///         device_buffer,
+///         pitch,
+///         3,
+///         3
+///     )?;
+///
+///     assert_eq!(dst_array, src_array);
+///     cuda_free(device_buffer)?;
+/// }
+/// # Ok(())
+/// # }
+/// # foo().unwrap();
+/// ```
+pub unsafe fn memcpy_2d_htod<T: DeviceCopy>(
+    dst: DevicePointer<T>,
+    dpitch: usize,
+    src: *const T,
+    spitch: usize,
+    width: usize,
+    height: usize,
+) -> CudaResult<()> {
+    use cust_raw::CUmemorytype;
+
+    let width_in_bytes = width.checked_mul(std::mem::size_of::<T>())
+        .ok_or(CudaError::InvalidMemoryAllocation)?;
+
+    let pcopy = cust_raw::CUDA_MEMCPY2D_st {
+        srcXInBytes: 0,
+        srcY: 0,
+        srcMemoryType: CUmemorytype::CU_MEMORYTYPE_HOST,
+        srcHost: src as *const c_void,
+        srcDevice: 0, // Ignored
+        srcArray: std::ptr::null_mut(), // Ignored
+        srcPitch: spitch,
+        dstXInBytes: 0,
+        dstY: 0,
+        dstMemoryType: CUmemorytype::CU_MEMORYTYPE_DEVICE,
+        dstHost: std::ptr::null_mut::<c_void>(), // Ignored
+        dstDevice: dst.as_raw(),
+        dstArray: std::ptr::null_mut(), // Ignored
+        dstPitch: dpitch,
+        WidthInBytes: width_in_bytes,
+        Height: height,
+    };
+
+    crate::sys::cuMemcpy2D_v2(&pcopy).to_result()?;
+    Ok(())
+}
+
+/// Similar to `cudaMemcpy2D` with `DeviceToHost` copy type.
+///
+/// `dpitch`/`spitch` are the byte offsets between the starts of consecutive rows.
+/// `width` is the number of *elements* (not bytes) in a row.
+/// `height` is the total number of rows (not bytes).
+///
+/// # Errors
+///
+/// Returns `CudaError::InvalidMemoryAllocation` if `width * size_of::<T>()` overflows a `usize`.
+/// If the copy itself fails, returns the CUDA error value.
+///
+/// # Safety
+///
+/// `src` must be valid for reads, and `dst` valid for writes, of `height` rows of
+/// `width` elements each, with consecutive rows `spitch`/`dpitch` bytes apart.
+///
+/// # Examples
+///
+/// ```
+/// # let _context = cust::quick_init().unwrap();
+/// # fn foo() -> Result<(), cust::error::CudaError> {
+/// use cust::memory::*;
+/// unsafe {
+///     // Allocate space for a 3x3 matrix of f32s
+///     let (device_buffer, pitch) = cuda_malloc_pitched::<f32>(3, 3)?;
+///
+///     let src_array: [f32; 9] = [
+///         1.0, 2.0, 3.0,
+///         4.0, 5.0, 6.0,
+///         7.0, 8.0, 9.0];
+///
+///     memcpy_2d_htod(
+///         device_buffer,
+///         pitch,
+///         src_array.as_slice().as_ptr(),
+///         3*std::mem::size_of::<f32>(),
+///         3,
+///         3
+///     )?;
+///
+///     let mut dst_array = [0.0f32; 9];
+///
+///     memcpy_2d_dtoh(
+///         dst_array.as_mut_slice().as_mut_ptr(),
+///         3*std::mem::size_of::<f32>(),
+///         device_buffer,
+///         pitch,
+///         3,
+///         3
+///     )?;
+///
+///     assert_eq!(dst_array, src_array);
+///     cuda_free(device_buffer)?;
+/// }
+/// # Ok(())
+/// # }
+/// # foo().unwrap();
+/// ```
+pub unsafe fn memcpy_2d_dtoh<T: DeviceCopy>(
+    dst: *mut T,
+    dpitch: usize,
+    src: DevicePointer<T>,
+    spitch: usize,
+    width: usize,
+    height: usize,
+) -> CudaResult<()> {
+    use cust_raw::CUmemorytype;
+
+    let width_in_bytes = width.checked_mul(std::mem::size_of::<T>())
+        .ok_or(CudaError::InvalidMemoryAllocation)?;
+
+    let pcopy = cust_raw::CUDA_MEMCPY2D_st {
+        srcXInBytes: 0,
+        srcY: 0,
+        srcMemoryType: CUmemorytype::CU_MEMORYTYPE_DEVICE,
+        srcHost: std::ptr::null_mut::<c_void>(), // Ignored
+        srcDevice: src.as_raw(),
+        srcArray: std::ptr::null_mut(), // Ignored
+        srcPitch: spitch,
+        dstXInBytes: 0,
+        dstY: 0,
+        dstMemoryType: CUmemorytype::CU_MEMORYTYPE_HOST,
+        dstHost: dst as *mut c_void,
+        dstDevice: 0, // Ignored
+        dstArray: std::ptr::null_mut(), // Ignored
+        dstPitch: dpitch,
+        WidthInBytes: width_in_bytes,
+        Height: height,
+    };
+
+    crate::sys::cuMemcpy2D_v2(&pcopy).to_result()?;
+    Ok(())
+}
+
 /// Get the current free and total memory.
 ///
 /// Returns in `.1` the total amount of memory available to the the current context.