diff --git a/include/rtkCudaConjugateGradientImageFilter.hcu b/include/rtkCudaConjugateGradientImageFilter.hcu index c53b5d8c1..1b9f127d9 100644 --- a/include/rtkCudaConjugateGradientImageFilter.hcu +++ b/include/rtkCudaConjugateGradientImageFilter.hcu @@ -22,20 +22,20 @@ #include "RTKExport.h" void RTK_EXPORT - CUDA_copy(long int numberOfElements, float * in, float * out); + CUDA_copy(size_t numberOfElements, float * in, float * out); void RTK_EXPORT - CUDA_copy(long int numberOfElements, double * in, double * out); + CUDA_copy(size_t numberOfElements, double * in, double * out); void RTK_EXPORT - CUDA_subtract(long int numberOfElements, float * out, float * toBeSubtracted); + CUDA_subtract(size_t numberOfElements, float * out, float * toBeSubtracted); void RTK_EXPORT - CUDA_subtract(long int numberOfElements, double * out, double * toBeSubtracted); + CUDA_subtract(size_t numberOfElements, double * out, double * toBeSubtracted); void RTK_EXPORT - CUDA_conjugate_gradient(long int numberOfElements, float * Xk, float * Rk, float * Pk, float * APk); + CUDA_conjugate_gradient(size_t numberOfElements, float * Xk, float * Rk, float * Pk, float * APk); void RTK_EXPORT - CUDA_conjugate_gradient(long int numberOfElements, double * Xk, double * Rk, double * Pk, double * APk); + CUDA_conjugate_gradient(size_t numberOfElements, double * Xk, double * Rk, double * Pk, double * APk); #endif diff --git a/include/rtkCudaConjugateGradientImageFilter.hxx b/include/rtkCudaConjugateGradientImageFilter.hxx index e5cd2ddbb..4fdf9df22 100644 --- a/include/rtkCudaConjugateGradientImageFilter.hxx +++ b/include/rtkCudaConjugateGradientImageFilter.hxx @@ -42,8 +42,8 @@ void CudaConjugateGradientImageFilter::GPUGenerateData() { using DataType = typename itk::PixelTraits::ValueType; - long int numberOfElements = this->GetOutput()->GetLargestPossibleRegion().GetNumberOfPixels() * - itk::PixelTraits::Dimension; + size_t numberOfElements = this->GetOutput()->GetLargestPossibleRegion().GetNumberOfPixels() * + itk::PixelTraits::Dimension; // Create and allocate images typename TImage::Pointer P_k = TImage::New(); diff --git a/include/rtkCudaUtilities.hcu b/include/rtkCudaUtilities.hcu index d4ab3b8bb..2a49cca81 100644 --- a/include/rtkCudaUtilities.hcu +++ b/include/rtkCudaUtilities.hcu @@ -26,7 +26,6 @@ #include #undef ITK_STATIC #include -#include #define CUDA_CHECK_ERROR \ { \ diff --git a/src/rtkCudaConjugateGradientImageFilter.cu b/src/rtkCudaConjugateGradientImageFilter.cu index f04420dea..42b53f68f 100644 --- a/src/rtkCudaConjugateGradientImageFilter.cu +++ b/src/rtkCudaConjugateGradientImageFilter.cu @@ -28,7 +28,7 @@ // TEXTURES AND CONSTANTS // void -CUDA_copy(long int numberOfElements, float * in, float * out) +CUDA_copy(size_t numberOfElements, float * in, float * out) { // Copy input volume to output size_t memorySizeOutput = numberOfElements * sizeof(float); @@ -36,7 +36,7 @@ CUDA_copy(long int numberOfElements, float * in, float * out) } void -CUDA_copy(long int numberOfElements, double * in, double * out) +CUDA_copy(size_t numberOfElements, double * in, double * out) { // Copy input volume to output size_t memorySizeOutput = numberOfElements * sizeof(double); @@ -44,13 +44,17 @@ CUDA_copy(long int numberOfElements, double * in, double * out) } void -CUDA_subtract(long int numberOfElements, float * out, float * toBeSubtracted) +CUDA_subtract(size_t numberOfElements, float * out, float * toBeSubtracted) { cublasHandle_t handle; cublasCreate(&handle); const float alpha = -1.0; - cublasSaxpy(handle, numberOfElements, &alpha, toBeSubtracted, 1, out, 1); +#if CUDA_VERSION < 12000 + cublasSaxpy(handle, (int)numberOfElements, &alpha, toBeSubtracted, 1, out, 1); +#else + cublasSaxpy_64(handle, numberOfElements, &alpha, toBeSubtracted, 1, out, 1); +#endif // Destroy Cublas context cublasDestroy(handle); @@ -59,13 +63,13 @@ CUDA_subtract(long int numberOfElements, float * out, float * toBeSubtracted) } void -CUDA_subtract(long int numberOfElements, double * out, double * toBeSubtracted) +CUDA_subtract(size_t numberOfElements, double * out, double * toBeSubtracted) { cublasHandle_t handle; cublasCreate(&handle); const double alpha = -1.0; - cublasDaxpy(handle, numberOfElements, &alpha, toBeSubtracted, 1, out, 1); + cublasDaxpy(handle, (int)numberOfElements, &alpha, toBeSubtracted, 1, out, 1); // Destroy Cublas context cublasDestroy(handle); @@ -74,7 +78,7 @@ CUDA_subtract(long int numberOfElements, double * out, double * toBeSubtracted) } void -CUDA_conjugate_gradient(long int numberOfElements, float * Xk, float * Rk, float * Pk, float * APk) +CUDA_conjugate_gradient(size_t numberOfElements, float * Xk, float * Rk, float * Pk, float * APk) { cublasHandle_t handle; cublasCreate(&handle); @@ -83,24 +87,24 @@ CUDA_conjugate_gradient(long int numberOfElements, float * Xk, float * Rk, float // Compute Rk_square = sum(Rk(:).^2) by cublas float Rk_square = 0; - cublasSdot(handle, numberOfElements, Rk, 1, Rk, 1, &Rk_square); + cublasSdot(handle, (int)numberOfElements, Rk, 1, Rk, 1, &Rk_square); // Compute alpha_k = Rk_square / sum(Pk(:) .* APk(:)) float Pk_APk = 0; - cublasSdot(handle, numberOfElements, Pk, 1, APk, 1, &Pk_APk); + cublasSdot(handle, (int)numberOfElements, Pk, 1, APk, 1, &Pk_APk); const float alpha_k = Rk_square / (Pk_APk + eps); const float minus_alpha_k = -alpha_k; // Compute Xk+1 = Xk + alpha_k * Pk - cublasSaxpy(handle, numberOfElements, &alpha_k, Pk, 1, Xk, 1); + cublasSaxpy(handle, (int)numberOfElements, &alpha_k, Pk, 1, Xk, 1); // Compute Rk+1 = Rk - alpha_k * APk - cublasSaxpy(handle, numberOfElements, &minus_alpha_k, APk, 1, Rk, 1); + cublasSaxpy(handle, (int)numberOfElements, &minus_alpha_k, APk, 1, Rk, 1); // Compute beta_k = sum(Rk+1(:).^2) / Rk_square float Rkplusone_square = 0; - cublasSdot(handle, numberOfElements, Rk, 1, Rk, 1, &Rkplusone_square); + cublasSdot(handle, (int)numberOfElements, Rk, 1, Rk, 1, &Rkplusone_square); float beta_k = Rkplusone_square / (Rk_square + eps); float one = 1.0; @@ -108,8 +112,8 @@ CUDA_conjugate_gradient(long int numberOfElements, float * Xk, float * Rk, float // Compute Pk+1 = Rk+1 + beta_k * Pk // This requires two cublas functions, // since axpy would store the result in the wrong array - cublasSscal(handle, numberOfElements, &beta_k, Pk, 1); - cublasSaxpy(handle, numberOfElements, &one, Rk, 1, Pk, 1); + cublasSscal(handle, (int)numberOfElements, &beta_k, Pk, 1); + cublasSaxpy(handle, (int)numberOfElements, &one, Rk, 1, Pk, 1); // Destroy Cublas context cublasDestroy(handle); @@ -118,7 +122,7 @@ CUDA_conjugate_gradient(long int numberOfElements, float * Xk, float * Rk, float } void -CUDA_conjugate_gradient(long int numberOfElements, double * Xk, double * Rk, double * Pk, double * APk) +CUDA_conjugate_gradient(size_t numberOfElements, double * Xk, double * Rk, double * Pk, double * APk) { cublasHandle_t handle; cublasCreate(&handle); @@ -127,24 +131,24 @@ CUDA_conjugate_gradient(long int numberOfElements, double * Xk, double * Rk, dou // Compute Rk_square = sum(Rk(:).^2) by cublas double Rk_square = 0; - cublasDdot(handle, numberOfElements, Rk, 1, Rk, 1, &Rk_square); + cublasDdot(handle, (int)numberOfElements, Rk, 1, Rk, 1, &Rk_square); // Compute alpha_k = Rk_square / sum(Pk(:) .* APk(:)) double Pk_APk = 0; - cublasDdot(handle, numberOfElements, Pk, 1, APk, 1, &Pk_APk); + cublasDdot(handle, (int)numberOfElements, Pk, 1, APk, 1, &Pk_APk); const double alpha_k = Rk_square / (Pk_APk + eps); const double minus_alpha_k = -alpha_k; // Compute Xk+1 = Xk + alpha_k * Pk - cublasDaxpy(handle, numberOfElements, &alpha_k, Pk, 1, Xk, 1); + cublasDaxpy(handle, (int)numberOfElements, &alpha_k, Pk, 1, Xk, 1); // Compute Rk+1 = Rk - alpha_k * APk - cublasDaxpy(handle, numberOfElements, &minus_alpha_k, APk, 1, Rk, 1); + cublasDaxpy(handle, (int)numberOfElements, &minus_alpha_k, APk, 1, Rk, 1); // Compute beta_k = sum(Rk+1(:).^2) / Rk_square double Rkplusone_square = 0; - cublasDdot(handle, numberOfElements, Rk, 1, Rk, 1, &Rkplusone_square); + cublasDdot(handle, (int)numberOfElements, Rk, 1, Rk, 1, &Rkplusone_square); double beta_k = Rkplusone_square / (Rk_square + eps); double one = 1.0; @@ -152,8 +156,8 @@ CUDA_conjugate_gradient(long int numberOfElements, double * Xk, double * Rk, dou // Compute Pk+1 = Rk+1 + beta_k * Pk // This requires two cublas functions, // since axpy would store the result in the wrong array - cublasDscal(handle, numberOfElements, &beta_k, Pk, 1); - cublasDaxpy(handle, numberOfElements, &one, Rk, 1, Pk, 1); + cublasDscal(handle, (int)numberOfElements, &beta_k, Pk, 1); + cublasDaxpy(handle, (int)numberOfElements, &one, Rk, 1, Pk, 1); // Destroy Cublas context cublasDestroy(handle); diff --git a/src/rtkCudaCyclicDeformationImageFilter.cu b/src/rtkCudaCyclicDeformationImageFilter.cu index 79f9da0d1..53bcce3e7 100644 --- a/src/rtkCudaCyclicDeformationImageFilter.cu +++ b/src/rtkCudaCyclicDeformationImageFilter.cu @@ -57,11 +57,19 @@ CUDA_linear_interpolate_along_fourth_dimension(unsigned int inputSize[4], // Add it weightInf times the frameInf to the output float * pinf = input + frameInf * numel; - cublasSaxpy(handle, numel, &wInf, pinf, 1, output, 1); +#if CUDA_VERSION < 12000 + cublasSaxpy(handle, (int)numel, &wInf, pinf, 1, output, 1); +#else + cublasSaxpy_64(handle, numel, &wInf, pinf, 1, output, 1); +#endif // Add it weightSup times the frameSup to the output float * psup = input + frameSup * numel; - cublasSaxpy(handle, numel, &wSup, psup, 1, output, 1); +#if CUDA_VERSION < 12000 + cublasSaxpy(handle, (int)numel, &wSup, psup, 1, output, 1); +#else + cublasSaxpy_64(handle, numel, &wSup, psup, 1, output, 1); +#endif // Destroy Cublas context cublasDestroy(handle); diff --git a/src/rtkCudaInterpolateImageFilter.cu b/src/rtkCudaInterpolateImageFilter.cu index ee7bcca32..9760f0119 100644 --- a/src/rtkCudaInterpolateImageFilter.cu +++ b/src/rtkCudaInterpolateImageFilter.cu @@ -34,7 +34,7 @@ CUDA_interpolation(const int4 & inputSize, float * input, float * output, int pr cublasCreate(&handle); // CUDA device pointers - int nVoxelsOutput = inputSize.x * inputSize.y * inputSize.z; + size_t nVoxelsOutput = inputSize.x * inputSize.y * inputSize.z; size_t memorySizeOutput = nVoxelsOutput * sizeof(float); // Reset output volume @@ -49,7 +49,11 @@ CUDA_interpolation(const int4 & inputSize, float * input, float * output, int pr float * p = input + phase * nVoxelsOutput; // Add "weight" times the "phase"-th volume in the input to the output - cublasSaxpy(handle, nVoxelsOutput, &weight, p, 1, output, 1); +#if CUDA_VERSION < 12000 + cublasSaxpy(handle, (int)nVoxelsOutput, &weight, p, 1, output, 1); +#else + cublasSaxpy_64(handle, nVoxelsOutput, &weight, p, 1, output, 1); +#endif } } diff --git a/src/rtkCudaSplatImageFilter.cu b/src/rtkCudaSplatImageFilter.cu index ffb127869..726aa3e43 100644 --- a/src/rtkCudaSplatImageFilter.cu +++ b/src/rtkCudaSplatImageFilter.cu @@ -33,7 +33,7 @@ CUDA_splat(const int4 & outputSize, float * input, float * output, int projectio cublasHandle_t handle; cublasCreate(&handle); - int numel = outputSize.x * outputSize.y * outputSize.z; + size_t numel = outputSize.x * outputSize.y * outputSize.z; for (int phase = 0; phase < outputSize.w; phase++) { @@ -44,7 +44,11 @@ CUDA_splat(const int4 & outputSize, float * input, float * output, int projectio float * p = output + phase * numel; // Add "weight" times the input to the "phase"-th volume in the output - cublasSaxpy(handle, numel, &weight, input, 1, p, 1); +#if CUDA_VERSION < 12000 + cublasSaxpy(handle, (int)numel, &weight, input, 1, p, 1); +#else + cublasSaxpy_64(handle, numel, &weight, input, 1, p, 1); +#endif } } diff --git a/src/rtkCudaUtilities.cu b/src/rtkCudaUtilities.cu index 95f306141..bab5d4bff 100644 --- a/src/rtkCudaUtilities.cu +++ b/src/rtkCudaUtilities.cu @@ -182,7 +182,11 @@ prepareVectorTextureObject(int size[3], // Fill it with the current component const float * pComponent = dev_ptr + component; +#if CUDA_VERSION < 12000 cublasSaxpy(handle, (int)numel, &one, pComponent, nComponents, singleComponent, 1); +#else + cublasSaxpy_64(handle, numel, &one, pComponent, nComponents, singleComponent, 1); +#endif // Allocate the cudaArray. Projections use layered arrays, volumes use default 3D arrays if (isProjections)