Skip to content

Commit

Permalink
ENH: Remove CUDA code for compute capability 1
Browse files Browse the repository at this point in the history
RTK does not support compute capability 1 since
1e75fbc.
  • Loading branch information
Simon Rit authored and SimonRit committed Apr 21, 2024
1 parent 902db67 commit 4dfa489
Show file tree
Hide file tree
Showing 4 changed files with 47 additions and 347 deletions.
22 changes: 7 additions & 15 deletions src/rtkCudaBackProjectionImageFilter.cu
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ __constant__ int3 c_volSize;
//_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_( S T A R T )_
//_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_

template <unsigned int vectorLength, bool isCylindrical>
template <unsigned int VVectorLength, bool VIsCylindrical>
__global__ void
kernel_backProject(float * dev_vol_in, float * dev_vol_out, float radius, cudaTextureObject_t * dev_tex_proj)
{
Expand All @@ -68,13 +68,13 @@ kernel_backProject(float * dev_vol_in, float * dev_vol_out, float radius, cudaTe
itk::SizeValueType vol_idx = i + (j + k * c_volSize.y) * (c_volSize.x);

float3 ip, pp;
float voxel_data[vectorLength];
for (unsigned int c = 0; c < vectorLength; c++)
float voxel_data[VVectorLength];
for (unsigned int c = 0; c < VVectorLength; c++)
voxel_data[c] = 0.0f;

for (unsigned int proj = 0; proj < c_projSize.z; proj++)
{
if (isCylindrical)
if (VIsCylindrical)
{
// matrix multiply
pp = matrix_multiply(make_float3(i, j, k), &(c_volIndexToProjPP[12 * proj]));
Expand Down Expand Up @@ -105,13 +105,13 @@ kernel_backProject(float * dev_vol_in, float * dev_vol_out, float radius, cudaTe
}

// Get texture point, clip left to GPU, and accumulate in voxel_data
for (unsigned int c = 0; c < vectorLength; c++)
for (unsigned int c = 0; c < VVectorLength; c++)
voxel_data[c] += tex2DLayered<float>(dev_tex_proj[c], ip.x, ip.y, proj);
}

// Place it into the volume
for (unsigned int c = 0; c < vectorLength; c++)
dev_vol_out[vol_idx * vectorLength + c] = dev_vol_in[vol_idx * vectorLength + c] + voxel_data[c];
for (unsigned int c = 0; c < VVectorLength; c++)
dev_vol_out[vol_idx * VVectorLength + c] = dev_vol_in[vol_idx * VVectorLength + c] + voxel_data[c];
}


Expand All @@ -134,14 +134,6 @@ CUDA_back_project(int projSize[3],
double radiusCylindricalDetector,
unsigned int vectorLength)
{
// 2D layered texture requires CudaComputeCapability >= 2.0
int device;
cudaGetDevice(&device);
if (GetCudaComputeCapability(device).first <= 1)
{
itkGenericExceptionMacro(<< "RTK no longer supports GPUs with CudaComputeCapability < 2.0");
}

// Copy the size of inputs into constant memory
cudaMemcpyToSymbol(c_projSize, projSize, sizeof(int3));
cudaMemcpyToSymbol(c_volSize, volSize, sizeof(int3));
Expand Down
102 changes: 14 additions & 88 deletions src/rtkCudaFDKBackProjectionImageFilter.cu
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,7 @@
#include <cuda.h>

// T E X T U R E S ////////////////////////////////////////////////////////
texture<float, cudaTextureType2DLayered> tex_proj;
texture<float, 3, cudaReadModeElementType> tex_proj_3D;
texture<float, cudaTextureType2DLayered> tex_proj;

// Constant memory
__constant__ float c_matrices[SLAB_SIZE * 12]; // Can process stacks of at most SLAB_SIZE projections
Expand All @@ -53,48 +52,6 @@ __constant__ int3 c_vol_size;
//_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_( S T A R T )_
//_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_

__global__ void
kernel_fdk(float * dev_vol_in, float * dev_vol_out, unsigned int Blocks_Y)
{
// CUDA 2.0 does not allow for a 3D grid, which severely
// limits the manipulation of large 3D arrays of data. The
// following code is a hack to bypass this implementation
// limitation.
unsigned int blockIdx_z = blockIdx.y / Blocks_Y;
unsigned int blockIdx_y = blockIdx.y - __umul24(blockIdx_z, Blocks_Y);
itk::SizeValueType i = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
itk::SizeValueType j = __umul24(blockIdx_y, blockDim.y) + threadIdx.y;
itk::SizeValueType k = __umul24(blockIdx_z, blockDim.z) + threadIdx.z;

if (i >= c_vol_size.x || j >= c_vol_size.y || k >= c_vol_size.z)
{
return;
}

// Index row major into the volume
itk::SizeValueType vol_idx = i + (j + k * c_vol_size.y) * (c_vol_size.x);

float3 ip;
float voxel_data = 0;

for (unsigned int proj = 0; proj < c_projSize.z; proj++)
{
// matrix multiply
ip = matrix_multiply(make_float3(i, j, k), &(c_matrices[12 * proj]));

// Change coordinate systems
ip.z = 1 / ip.z;
ip.x = ip.x * ip.z;
ip.y = ip.y * ip.z;

// Get texture point, clip left to GPU
voxel_data += tex3D(tex_proj_3D, ip.x, ip.y, proj + 0.5) * ip.z * ip.z;
}

// Place it into the volume
dev_vol_out[vol_idx] = dev_vol_in[vol_idx] + voxel_data;
}

__global__ void
kernel_fdk_3Dgrid(float * dev_vol_in, float * dev_vol_out)
{
Expand Down Expand Up @@ -146,9 +103,6 @@ CUDA_reconstruct_conebeam(int proj_size[3],
float * dev_vol_out,
float * dev_proj)
{
int device;
cudaGetDevice(&device);

// Copy the size of inputs into constant memory
cudaMemcpyToSymbol(c_projSize, proj_size, sizeof(int3));
cudaMemcpyToSymbol(c_vol_size, vol_size, sizeof(int3));
Expand All @@ -163,25 +117,15 @@ CUDA_reconstruct_conebeam(int proj_size[3],
tex_proj.filterMode = cudaFilterModeLinear;
tex_proj.normalized = false; // don't access with normalized texture coords

tex_proj_3D.addressMode[0] = cudaAddressModeBorder;
tex_proj_3D.addressMode[1] = cudaAddressModeBorder;
tex_proj_3D.addressMode[2] = cudaAddressModeBorder;
tex_proj_3D.filterMode = cudaFilterModeLinear;
tex_proj_3D.normalized = false; // don't access with normalized texture coords

// Copy projection data to array, bind the array to the texture
cudaExtent projExtent = make_cudaExtent(proj_size[0], proj_size[1], proj_size[2]);
cudaArray * array_proj;
static cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
CUDA_CHECK_ERROR;

// Allocate array for input projections, in order to bind them to
// either a 2D layered texture (requires GetCudaComputeCapability >= 2.0) or
// a 3D texture
if (GetCudaComputeCapability(device).first <= 1)
cudaMalloc3DArray((cudaArray **)&array_proj, &channelDesc, projExtent);
else
cudaMalloc3DArray((cudaArray **)&array_proj, &channelDesc, projExtent, cudaArrayLayered);
// a 2D layered texture
cudaMalloc3DArray((cudaArray **)&array_proj, &channelDesc, projExtent, cudaArrayLayered);
CUDA_CHECK_ERROR;

// Copy data to 3D array
Expand All @@ -205,39 +149,21 @@ CUDA_reconstruct_conebeam(int proj_size[3],

// Run kernels. Note: Projection data is passed via texture memory,
// transform matrix is passed via constant memory
if (GetCudaComputeCapability(device).first <= 1)
{
// Compute block and grid sizes
dim3 dimGrid = dim3(blocksInX, blocksInY * blocksInZ);
dim3 dimBlock = dim3(tBlock_x, tBlock_y, tBlock_z);

// Bind the array of projections to a 3D texture
cudaBindTextureToArray(tex_proj_3D, (cudaArray *)array_proj, channelDesc);
CUDA_CHECK_ERROR;

kernel_fdk<<<dimGrid, dimBlock>>>(dev_vol_in, dev_vol_out, blocksInY);

// Unbind the image and projection matrix textures
cudaUnbindTexture(tex_proj_3D);
CUDA_CHECK_ERROR;
}
else
{
// Compute block and grid sizes
dim3 dimGrid = dim3(blocksInX, blocksInY, blocksInZ);
dim3 dimBlock = dim3(tBlock_x, tBlock_y, tBlock_z);
CUDA_CHECK_ERROR;
// Compute block and grid sizes
dim3 dimGrid = dim3(blocksInX, blocksInY, blocksInZ);
dim3 dimBlock = dim3(tBlock_x, tBlock_y, tBlock_z);
CUDA_CHECK_ERROR;

// Bind the array of projections to a 2D layered texture
cudaBindTextureToArray(tex_proj, (cudaArray *)array_proj, channelDesc);
CUDA_CHECK_ERROR;
// Bind the array of projections to a 2D layered texture
cudaBindTextureToArray(tex_proj, (cudaArray *)array_proj, channelDesc);
CUDA_CHECK_ERROR;

kernel_fdk_3Dgrid<<<dimGrid, dimBlock>>>(dev_vol_in, dev_vol_out);
kernel_fdk_3Dgrid<<<dimGrid, dimBlock>>>(dev_vol_in, dev_vol_out);

// Unbind the image and projection matrix textures
cudaUnbindTexture(tex_proj);
CUDA_CHECK_ERROR;
}
// Unbind the image and projection matrix textures
cudaUnbindTexture(tex_proj);
CUDA_CHECK_ERROR;

// Cleanup
cudaFreeArray((cudaArray *)array_proj);
Expand Down
Loading

0 comments on commit 4dfa489

Please sign in to comment.