Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add pitched malloc and 2d memcpy to cust #64

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions crates/cust/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ Notable changes to this project will be documented in this file.
## Unreleased

- Add `memory::memcpy_dtoh` to allow copying from device to host.
- Add support in `memory` for pitched malloc and 2D memcpy between device and host.

## 0.3.2 - 2/16/22

Expand Down
2 changes: 1 addition & 1 deletion crates/cust/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ authors = [
"Riccardo D'Ambrosio <[email protected]>",
"Brook Heisler <[email protected]>",
]
edition = "2018"
edition = "2021"
license = "MIT OR Apache-2.0"
description = "High level bindings to the CUDA Driver API"
repository = "https://github.com/Rust-GPU/Rust-CUDA"
Expand Down
54 changes: 54 additions & 0 deletions crates/cust/src/memory/malloc.rs
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,60 @@ pub unsafe fn cuda_malloc_unified<T: DeviceCopy>(count: usize) -> CudaResult<Uni
Ok(UnifiedPointer::wrap(ptr as *mut T))
}

/// Unsafe wrapper around the `cuMemAllocPitch` function, which allocates device memory for a
/// two-dimensional array of `width` x `height` elements of `T`.
///
/// Returns a [`DevicePointer`](struct.DevicePointer.html) pointing to the allocated memory,
/// together with a `usize` containing the row pitch in *bytes* (the distance between the start
/// of two consecutive rows). The memory is not cleared.
///
/// Note that `width` is in units of `T`; thus a `width` of 3 requests rows of
/// `3 * size_of::<T>()` bytes. `height` is the number of rows.
///
/// Memory buffers allocated using `cuda_malloc_pitched` must be freed using
/// [`cuda_free`](fn.cuda_free.html).
///
/// # Errors
///
/// If allocating memory fails, returns the CUDA error value.
/// Returns `InvalidMemoryAllocation` without calling into the driver if:
///
/// - the row size in bytes is zero (either because `width` is zero or because `T` is a
///   zero-sized type),
/// - `height` is zero,
/// - the row size in bytes would overflow a `usize`, or
/// - `size_of::<T>()` does not fit in a C `unsigned int`.
///
/// NOTE(review): the CUDA driver documentation states that the element size given to
/// `cuMemAllocPitch` may be 4, 8 or 16 bytes. This wrapper forwards `size_of::<T>()` unchecked,
/// so other element sizes are presumably rejected by the driver - confirm before relying on it.
///
/// # Safety
///
/// Since the allocated memory is not initialized, the caller must ensure that it is initialized
/// before copying it to the host in any way. Additionally, the caller must ensure that the memory
/// allocated is freed using `cuda_free`, or the memory will be leaked.
///
/// # Examples
///
/// ```
/// # let _context = cust::quick_init().unwrap();
/// # fn foo() -> Result<(), cust::error::CudaError> {
/// use cust::memory::*;
/// unsafe {
///     // Allocate space for a 3x3 matrix of f32s
///     let (device_buffer, pitch) = cuda_malloc_pitched::<f32>(3, 3)?;
///     cuda_free(device_buffer)?;
/// }
/// # Ok(())
/// # }
/// # foo().unwrap();
/// ```
pub unsafe fn cuda_malloc_pitched<T: DeviceCopy>(
    width: usize,
    height: usize,
) -> CudaResult<(DevicePointer<T>, usize)> {
    let elem_bytes = std::mem::size_of::<T>();

    // The driver takes the element size as a C `unsigned int`.
    let element_size: std::os::raw::c_uint = elem_bytes
        .try_into()
        .map_err(|_| CudaError::InvalidMemoryAllocation)?;

    // Reject overflowing and empty (zero-width, zero-sized `T`, zero-height) requests up front.
    let width_bytes = match width.checked_mul(elem_bytes) {
        Some(bytes) if bytes != 0 && height != 0 => bytes,
        _ => return Err(CudaError::InvalidMemoryAllocation),
    };

    let mut ptr = 0;
    let mut pitch = 0;
    cuda::cuMemAllocPitch_v2(&mut ptr, &mut pitch, width_bytes, height, element_size)
        .to_result()?;
    Ok((DevicePointer::from_raw(ptr), pitch))
}

/// Free memory allocated with [`cuda_malloc`](fn.cuda_malloc.html).
///
/// # Errors
Expand Down
170 changes: 170 additions & 0 deletions crates/cust/src/memory/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,176 @@ pub unsafe fn memcpy_dtoh(
Ok(())
}

/// Similar to `cudaMemcpy2D` with `HostToDevice` copy type.
///
/// Copies a `width` x `height` region of `T` elements from host memory at `src` to device
/// memory at `dst` using `cuMemcpy2D`.
///
/// `dpitch`/`spitch` is bytes between the start of two rows.
/// `width` is the number of *elements* (not bytes) in a row.
/// `height` is the total number of rows (not bytes).
///
/// # Errors
///
/// If `width * size_of::<T>()` would overflow a `usize`, returns `InvalidMemoryAllocation`
/// without calling into the driver. Otherwise, if the copy fails, returns the CUDA error value.
///
/// # Safety
///
/// The copy is carried out by the CUDA driver on raw addresses, so the caller must ensure that:
///
/// - `src` points to host memory that is valid for reads of `height` rows of
///   `width * size_of::<T>()` bytes each, where consecutive rows start `spitch` bytes apart.
/// - `dst` refers to a live device allocation that can hold `height` rows of the same width,
///   where consecutive rows start `dpitch` bytes apart.
///
/// # Examples
///
/// ```
/// # let _context = cust::quick_init().unwrap();
/// # fn foo() -> Result<(), cust::error::CudaError> {
/// use cust::memory::*;
/// unsafe {
///     // Allocate space for a 3x3 matrix of f32s
///     let (device_buffer, pitch) = cuda_malloc_pitched::<f32>(3, 3)?;
///
///     let src_array: [f32; 9] = [
///         1.0, 2.0, 3.0,
///         4.0, 5.0, 6.0,
///         7.0, 8.0, 9.0];
///
///     memcpy_2d_htod(
///         device_buffer,
///         pitch,
///         src_array.as_slice().as_ptr(),
///         3*std::mem::size_of::<f32>(),
///         3,
///         3
///     )?;
///
///     let mut dst_array = [0.0f32; 9];
///
///     memcpy_2d_dtoh(
///         dst_array.as_mut_slice().as_mut_ptr(),
///         3*std::mem::size_of::<f32>(),
///         device_buffer,
///         pitch,
///         3,
///         3
///     )?;
///
///     assert_eq!(dst_array, src_array);
///     cuda_free(device_buffer)?;
/// }
/// # Ok(())
/// # }
/// # foo().unwrap();
/// ```
pub unsafe fn memcpy_2d_htod<T: DeviceCopy>(
    dst: DevicePointer<T>,
    dpitch: usize,
    src: *const T,
    spitch: usize,
    width: usize,
    height: usize,
) -> CudaResult<()> {
    use cust_raw::CUmemorytype;

    // The driver expects the row width in bytes; refuse widths that cannot be represented.
    let width_in_bytes = width
        .checked_mul(std::mem::size_of::<T>())
        .ok_or(CudaError::InvalidMemoryAllocation)?;

    // Fields marked "Ignored" belong to memory types not used by this copy direction.
    let pcopy = cust_raw::CUDA_MEMCPY2D_st {
        // Source: host memory, starting at the first byte of the first row.
        srcXInBytes: 0,
        srcY: 0,
        srcMemoryType: CUmemorytype::CU_MEMORYTYPE_HOST,
        srcHost: src as *const c_void,
        srcDevice: 0,                                           // Ignored
        srcArray: std::ptr::null_mut::<cust_raw::CUarray_st>(), // Ignored
        srcPitch: spitch,
        // Destination: device memory, starting at the first byte of the first row.
        dstXInBytes: 0,
        dstY: 0,
        dstMemoryType: CUmemorytype::CU_MEMORYTYPE_DEVICE,
        dstHost: std::ptr::null_mut::<c_void>(), // Ignored
        dstDevice: dst.as_raw(),
        dstArray: std::ptr::null_mut::<cust_raw::CUarray_st>(), // Ignored
        dstPitch: dpitch,
        // Extent of the copied region.
        WidthInBytes: width_in_bytes,
        Height: height,
    };

    crate::sys::cuMemcpy2D_v2(&pcopy).to_result()?;
    Ok(())
}

/// Similar to `cudaMemcpy2D` with `DeviceToHost` copy type.
///
/// Copies a `width` x `height` region of `T` elements from device memory at `src` to host
/// memory at `dst` using `cuMemcpy2D`.
///
/// `dpitch`/`spitch` is bytes between the start of two rows.
/// `width` is the number of *elements* (not bytes) in a row.
/// `height` is the total number of rows (not bytes).
///
/// # Errors
///
/// If `width * size_of::<T>()` would overflow a `usize`, returns `InvalidMemoryAllocation`
/// without calling into the driver. Otherwise, if the copy fails, returns the CUDA error value.
///
/// # Safety
///
/// The copy is carried out by the CUDA driver on raw addresses, so the caller must ensure that:
///
/// - `dst` points to host memory that is valid for writes of `height` rows of
///   `width * size_of::<T>()` bytes each, where consecutive rows start `dpitch` bytes apart.
/// - `src` refers to a live, initialized device allocation containing `height` rows of the
///   same width, where consecutive rows start `spitch` bytes apart.
///
/// # Examples
///
/// ```
/// # let _context = cust::quick_init().unwrap();
/// # fn foo() -> Result<(), cust::error::CudaError> {
/// use cust::memory::*;
/// unsafe {
///     // Allocate space for a 3x3 matrix of f32s
///     let (device_buffer, pitch) = cuda_malloc_pitched::<f32>(3, 3)?;
///
///     let src_array: [f32; 9] = [
///         1.0, 2.0, 3.0,
///         4.0, 5.0, 6.0,
///         7.0, 8.0, 9.0];
///
///     memcpy_2d_htod(
///         device_buffer,
///         pitch,
///         src_array.as_slice().as_ptr(),
///         3*std::mem::size_of::<f32>(),
///         3,
///         3
///     )?;
///
///     let mut dst_array = [0.0f32; 9];
///
///     memcpy_2d_dtoh(
///         dst_array.as_mut_slice().as_mut_ptr(),
///         3*std::mem::size_of::<f32>(),
///         device_buffer,
///         pitch,
///         3,
///         3
///     )?;
///
///     assert_eq!(dst_array, src_array);
///     cuda_free(device_buffer)?;
/// }
/// # Ok(())
/// # }
/// # foo().unwrap();
/// ```
pub unsafe fn memcpy_2d_dtoh<T: DeviceCopy>(
    dst: *mut T,
    dpitch: usize,
    src: DevicePointer<T>,
    spitch: usize,
    width: usize,
    height: usize,
) -> CudaResult<()> {
    use cust_raw::CUmemorytype;

    // The driver expects the row width in bytes; refuse widths that cannot be represented.
    let width_in_bytes = width
        .checked_mul(std::mem::size_of::<T>())
        .ok_or(CudaError::InvalidMemoryAllocation)?;

    // Fields marked "Ignored" belong to memory types not used by this copy direction.
    let pcopy = cust_raw::CUDA_MEMCPY2D_st {
        // Source: device memory, starting at the first byte of the first row.
        srcXInBytes: 0,
        srcY: 0,
        srcMemoryType: CUmemorytype::CU_MEMORYTYPE_DEVICE,
        srcHost: std::ptr::null_mut::<c_void>(), // Ignored
        srcDevice: src.as_raw(),
        srcArray: std::ptr::null_mut::<cust_raw::CUarray_st>(), // Ignored
        srcPitch: spitch,
        // Destination: host memory, starting at the first byte of the first row.
        dstXInBytes: 0,
        dstY: 0,
        dstMemoryType: CUmemorytype::CU_MEMORYTYPE_HOST,
        dstHost: dst as *mut c_void,
        dstDevice: 0,                                           // Ignored
        dstArray: std::ptr::null_mut::<cust_raw::CUarray_st>(), // Ignored
        dstPitch: dpitch,
        // Extent of the copied region.
        WidthInBytes: width_in_bytes,
        Height: height,
    };

    crate::sys::cuMemcpy2D_v2(&pcopy).to_result()?;
    Ok(())
}

/// Get the current free and total memory.
///
/// Returns in `.1` the total amount of memory available to the current context.
Expand Down