Skip to content

Commit

Permalink
ENH: Use 64-bit cublas for CUDA 12
Browse files Browse the repository at this point in the history
  • Loading branch information
Simon Rit authored and SimonRit committed May 12, 2024
1 parent 0a1a8ca commit 74d25e0
Show file tree
Hide file tree
Showing 8 changed files with 60 additions and 37 deletions.
12 changes: 6 additions & 6 deletions include/rtkCudaConjugateGradientImageFilter.hcu
Original file line number Diff line number Diff line change
Expand Up @@ -22,20 +22,20 @@
#include "RTKExport.h"

void RTK_EXPORT
CUDA_copy(long int numberOfElements, float * in, float * out);
CUDA_copy(size_t numberOfElements, float * in, float * out);

void RTK_EXPORT
CUDA_copy(long int numberOfElements, double * in, double * out);
CUDA_copy(size_t numberOfElements, double * in, double * out);

void RTK_EXPORT
CUDA_subtract(long int numberOfElements, float * out, float * toBeSubtracted);
CUDA_subtract(size_t numberOfElements, float * out, float * toBeSubtracted);

void RTK_EXPORT
CUDA_subtract(long int numberOfElements, double * out, double * toBeSubtracted);
CUDA_subtract(size_t numberOfElements, double * out, double * toBeSubtracted);

void RTK_EXPORT
CUDA_conjugate_gradient(long int numberOfElements, float * Xk, float * Rk, float * Pk, float * APk);
CUDA_conjugate_gradient(size_t numberOfElements, float * Xk, float * Rk, float * Pk, float * APk);

void RTK_EXPORT
CUDA_conjugate_gradient(long int numberOfElements, double * Xk, double * Rk, double * Pk, double * APk);
CUDA_conjugate_gradient(size_t numberOfElements, double * Xk, double * Rk, double * Pk, double * APk);
#endif
4 changes: 2 additions & 2 deletions include/rtkCudaConjugateGradientImageFilter.hxx
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,8 @@ void
CudaConjugateGradientImageFilter<TImage>::GPUGenerateData()
{
using DataType = typename itk::PixelTraits<typename TImage::PixelType>::ValueType;
long int numberOfElements = this->GetOutput()->GetLargestPossibleRegion().GetNumberOfPixels() *
itk::PixelTraits<typename TImage::PixelType>::Dimension;
size_t numberOfElements = this->GetOutput()->GetLargestPossibleRegion().GetNumberOfPixels() *
itk::PixelTraits<typename TImage::PixelType>::Dimension;

// Create and allocate images
typename TImage::Pointer P_k = TImage::New();
Expand Down
1 change: 0 additions & 1 deletion include/rtkCudaUtilities.hcu
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@
#include <itkMacro.h>
#undef ITK_STATIC
#include <cuda.h>
#include <cublas_v2.h>

#define CUDA_CHECK_ERROR \
{ \
Expand Down
48 changes: 26 additions & 22 deletions src/rtkCudaConjugateGradientImageFilter.cu
Original file line number Diff line number Diff line change
Expand Up @@ -28,29 +28,33 @@
// TEXTURES AND CONSTANTS //

void
CUDA_copy(long int numberOfElements, float * in, float * out)
CUDA_copy(size_t numberOfElements, float * in, float * out)
{
// Copy input volume to output
size_t memorySizeOutput = numberOfElements * sizeof(float);
cudaMemcpy(out, in, memorySizeOutput, cudaMemcpyDeviceToDevice);
}

void
CUDA_copy(long int numberOfElements, double * in, double * out)
CUDA_copy(size_t numberOfElements, double * in, double * out)
{
// Copy input volume to output
size_t memorySizeOutput = numberOfElements * sizeof(double);
cudaMemcpy(out, in, memorySizeOutput, cudaMemcpyDeviceToDevice);
}

void
CUDA_subtract(long int numberOfElements, float * out, float * toBeSubtracted)
CUDA_subtract(size_t numberOfElements, float * out, float * toBeSubtracted)
{
cublasHandle_t handle;
cublasCreate(&handle);

const float alpha = -1.0;
cublasSaxpy(handle, numberOfElements, &alpha, toBeSubtracted, 1, out, 1);
#if CUDA_VERSION < 12000
cublasSaxpy(handle, (int)numberOfElements, &alpha, toBeSubtracted, 1, out, 1);
#else
cublasSaxpy_64(handle, numberOfElements, &alpha, toBeSubtracted, 1, out, 1);
#endif

// Destroy Cublas context
cublasDestroy(handle);
Expand All @@ -59,13 +63,13 @@ CUDA_subtract(long int numberOfElements, float * out, float * toBeSubtracted)
}

void
CUDA_subtract(long int numberOfElements, double * out, double * toBeSubtracted)
CUDA_subtract(size_t numberOfElements, double * out, double * toBeSubtracted)
{
cublasHandle_t handle;
cublasCreate(&handle);

const double alpha = -1.0;
cublasDaxpy(handle, numberOfElements, &alpha, toBeSubtracted, 1, out, 1);
cublasDaxpy(handle, (int)numberOfElements, &alpha, toBeSubtracted, 1, out, 1);

// Destroy Cublas context
cublasDestroy(handle);
Expand All @@ -74,7 +78,7 @@ CUDA_subtract(long int numberOfElements, double * out, double * toBeSubtracted)
}

void
CUDA_conjugate_gradient(long int numberOfElements, float * Xk, float * Rk, float * Pk, float * APk)
CUDA_conjugate_gradient(size_t numberOfElements, float * Xk, float * Rk, float * Pk, float * APk)
{
cublasHandle_t handle;
cublasCreate(&handle);
Expand All @@ -83,33 +87,33 @@ CUDA_conjugate_gradient(long int numberOfElements, float * Xk, float * Rk, float

// Compute Rk_square = sum(Rk(:).^2) by cublas
float Rk_square = 0;
cublasSdot(handle, numberOfElements, Rk, 1, Rk, 1, &Rk_square);
cublasSdot(handle, (int)numberOfElements, Rk, 1, Rk, 1, &Rk_square);

// Compute alpha_k = Rk_square / sum(Pk(:) .* APk(:))
float Pk_APk = 0;
cublasSdot(handle, numberOfElements, Pk, 1, APk, 1, &Pk_APk);
cublasSdot(handle, (int)numberOfElements, Pk, 1, APk, 1, &Pk_APk);

const float alpha_k = Rk_square / (Pk_APk + eps);
const float minus_alpha_k = -alpha_k;

// Compute Xk+1 = Xk + alpha_k * Pk
cublasSaxpy(handle, numberOfElements, &alpha_k, Pk, 1, Xk, 1);
cublasSaxpy(handle, (int)numberOfElements, &alpha_k, Pk, 1, Xk, 1);

// Compute Rk+1 = Rk - alpha_k * APk
cublasSaxpy(handle, numberOfElements, &minus_alpha_k, APk, 1, Rk, 1);
cublasSaxpy(handle, (int)numberOfElements, &minus_alpha_k, APk, 1, Rk, 1);

// Compute beta_k = sum(Rk+1(:).^2) / Rk_square
float Rkplusone_square = 0;
cublasSdot(handle, numberOfElements, Rk, 1, Rk, 1, &Rkplusone_square);
cublasSdot(handle, (int)numberOfElements, Rk, 1, Rk, 1, &Rkplusone_square);

float beta_k = Rkplusone_square / (Rk_square + eps);
float one = 1.0;

// Compute Pk+1 = Rk+1 + beta_k * Pk
// This requires two cublas functions,
// since axpy would store the result in the wrong array
cublasSscal(handle, numberOfElements, &beta_k, Pk, 1);
cublasSaxpy(handle, numberOfElements, &one, Rk, 1, Pk, 1);
cublasSscal(handle, (int)numberOfElements, &beta_k, Pk, 1);
cublasSaxpy(handle, (int)numberOfElements, &one, Rk, 1, Pk, 1);

// Destroy Cublas context
cublasDestroy(handle);
Expand All @@ -118,7 +122,7 @@ CUDA_conjugate_gradient(long int numberOfElements, float * Xk, float * Rk, float
}

void
CUDA_conjugate_gradient(long int numberOfElements, double * Xk, double * Rk, double * Pk, double * APk)
CUDA_conjugate_gradient(size_t numberOfElements, double * Xk, double * Rk, double * Pk, double * APk)
{
cublasHandle_t handle;
cublasCreate(&handle);
Expand All @@ -127,33 +131,33 @@ CUDA_conjugate_gradient(long int numberOfElements, double * Xk, double * Rk, dou

// Compute Rk_square = sum(Rk(:).^2) by cublas
double Rk_square = 0;
cublasDdot(handle, numberOfElements, Rk, 1, Rk, 1, &Rk_square);
cublasDdot(handle, (int)numberOfElements, Rk, 1, Rk, 1, &Rk_square);

// Compute alpha_k = Rk_square / sum(Pk(:) .* APk(:))
double Pk_APk = 0;
cublasDdot(handle, numberOfElements, Pk, 1, APk, 1, &Pk_APk);
cublasDdot(handle, (int)numberOfElements, Pk, 1, APk, 1, &Pk_APk);

const double alpha_k = Rk_square / (Pk_APk + eps);
const double minus_alpha_k = -alpha_k;

// Compute Xk+1 = Xk + alpha_k * Pk
cublasDaxpy(handle, numberOfElements, &alpha_k, Pk, 1, Xk, 1);
cublasDaxpy(handle, (int)numberOfElements, &alpha_k, Pk, 1, Xk, 1);

// Compute Rk+1 = Rk - alpha_k * APk
cublasDaxpy(handle, numberOfElements, &minus_alpha_k, APk, 1, Rk, 1);
cublasDaxpy(handle, (int)numberOfElements, &minus_alpha_k, APk, 1, Rk, 1);

// Compute beta_k = sum(Rk+1(:).^2) / Rk_square
double Rkplusone_square = 0;
cublasDdot(handle, numberOfElements, Rk, 1, Rk, 1, &Rkplusone_square);
cublasDdot(handle, (int)numberOfElements, Rk, 1, Rk, 1, &Rkplusone_square);

double beta_k = Rkplusone_square / (Rk_square + eps);
double one = 1.0;

// Compute Pk+1 = Rk+1 + beta_k * Pk
// This requires two cublas functions,
// since axpy would store the result in the wrong array
cublasDscal(handle, numberOfElements, &beta_k, Pk, 1);
cublasDaxpy(handle, numberOfElements, &one, Rk, 1, Pk, 1);
cublasDscal(handle, (int)numberOfElements, &beta_k, Pk, 1);
cublasDaxpy(handle, (int)numberOfElements, &one, Rk, 1, Pk, 1);

// Destroy Cublas context
cublasDestroy(handle);
Expand Down
12 changes: 10 additions & 2 deletions src/rtkCudaCyclicDeformationImageFilter.cu
Original file line number Diff line number Diff line change
Expand Up @@ -57,11 +57,19 @@ CUDA_linear_interpolate_along_fourth_dimension(unsigned int inputSize[4],

// Add it weightInf times the frameInf to the output
float * pinf = input + frameInf * numel;
cublasSaxpy(handle, numel, &wInf, pinf, 1, output, 1);
#if CUDA_VERSION < 12000
cublasSaxpy(handle, (int)numel, &wInf, pinf, 1, output, 1);
#else
cublasSaxpy_64(handle, numel, &wInf, pinf, 1, output, 1);
#endif

// Add it weightSup times the frameSup to the output
float * psup = input + frameSup * numel;
cublasSaxpy(handle, numel, &wSup, psup, 1, output, 1);
#if CUDA_VERSION < 12000
cublasSaxpy(handle, (int)numel, &wSup, psup, 1, output, 1);
#else
cublasSaxpy_64(handle, numel, &wSup, psup, 1, output, 1);
#endif

// Destroy Cublas context
cublasDestroy(handle);
Expand Down
8 changes: 6 additions & 2 deletions src/rtkCudaInterpolateImageFilter.cu
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ CUDA_interpolation(const int4 & inputSize, float * input, float * output, int pr
cublasCreate(&handle);

// CUDA device pointers
int nVoxelsOutput = inputSize.x * inputSize.y * inputSize.z;
size_t nVoxelsOutput = inputSize.x * inputSize.y * inputSize.z;
size_t memorySizeOutput = nVoxelsOutput * sizeof(float);

// Reset output volume
Expand All @@ -49,7 +49,11 @@ CUDA_interpolation(const int4 & inputSize, float * input, float * output, int pr
float * p = input + phase * nVoxelsOutput;

// Add "weight" times the "phase"-th volume in the input to the output
cublasSaxpy(handle, nVoxelsOutput, &weight, p, 1, output, 1);
#if CUDA_VERSION < 12000
cublasSaxpy(handle, (int)nVoxelsOutput, &weight, p, 1, output, 1);
#else
cublasSaxpy_64(handle, nVoxelsOutput, &weight, p, 1, output, 1);
#endif
}
}

Expand Down
8 changes: 6 additions & 2 deletions src/rtkCudaSplatImageFilter.cu
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ CUDA_splat(const int4 & outputSize, float * input, float * output, int projectio
cublasHandle_t handle;
cublasCreate(&handle);

int numel = outputSize.x * outputSize.y * outputSize.z;
size_t numel = outputSize.x * outputSize.y * outputSize.z;

for (int phase = 0; phase < outputSize.w; phase++)
{
Expand All @@ -44,7 +44,11 @@ CUDA_splat(const int4 & outputSize, float * input, float * output, int projectio
float * p = output + phase * numel;

// Add "weight" times the input to the "phase"-th volume in the output
cublasSaxpy(handle, numel, &weight, input, 1, p, 1);
#if CUDA_VERSION < 12000
cublasSaxpy(handle, (int)numel, &weight, input, 1, p, 1);
#else
cublasSaxpy_64(handle, numel, &weight, input, 1, p, 1);
#endif
}
}

Expand Down
4 changes: 4 additions & 0 deletions src/rtkCudaUtilities.cu
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,11 @@ prepareVectorTextureObject(int size[3],

// Fill it with the current component
const float * pComponent = dev_ptr + component;
#if CUDA_VERSION < 12000
cublasSaxpy(handle, (int)numel, &one, pComponent, nComponents, singleComponent, 1);
#else
cublasSaxpy_64(handle, numel, &one, pComponent, nComponents, singleComponent, 1);
#endif

// Allocate the cudaArray. Projections use layered arrays, volumes use default 3D arrays
if (isProjections)
Expand Down

0 comments on commit 74d25e0

Please sign in to comment.