diff --git a/include/rtkCudaConjugateGradientImageFilter.hcu b/include/rtkCudaConjugateGradientImageFilter.hcu
index c53b5d8c1..1b9f127d9 100644
--- a/include/rtkCudaConjugateGradientImageFilter.hcu
+++ b/include/rtkCudaConjugateGradientImageFilter.hcu
@@ -22,20 +22,20 @@
 #include "RTKExport.h"
 
 void RTK_EXPORT
-     CUDA_copy(long int numberOfElements, float * in, float * out);
+     CUDA_copy(size_t numberOfElements, float * in, float * out);
 
 void RTK_EXPORT
-     CUDA_copy(long int numberOfElements, double * in, double * out);
+     CUDA_copy(size_t numberOfElements, double * in, double * out);
 
 void RTK_EXPORT
-     CUDA_subtract(long int numberOfElements, float * out, float * toBeSubtracted);
+     CUDA_subtract(size_t numberOfElements, float * out, float * toBeSubtracted);
 
 void RTK_EXPORT
-     CUDA_subtract(long int numberOfElements, double * out, double * toBeSubtracted);
+     CUDA_subtract(size_t numberOfElements, double * out, double * toBeSubtracted);
 
 void RTK_EXPORT
-     CUDA_conjugate_gradient(long int numberOfElements, float * Xk, float * Rk, float * Pk, float * APk);
+     CUDA_conjugate_gradient(size_t numberOfElements, float * Xk, float * Rk, float * Pk, float * APk);
 
 void RTK_EXPORT
-     CUDA_conjugate_gradient(long int numberOfElements, double * Xk, double * Rk, double * Pk, double * APk);
+     CUDA_conjugate_gradient(size_t numberOfElements, double * Xk, double * Rk, double * Pk, double * APk);
 #endif
diff --git a/include/rtkCudaConjugateGradientImageFilter.hxx b/include/rtkCudaConjugateGradientImageFilter.hxx
index e5cd2ddbb..4fdf9df22 100644
--- a/include/rtkCudaConjugateGradientImageFilter.hxx
+++ b/include/rtkCudaConjugateGradientImageFilter.hxx
@@ -42,8 +42,8 @@ void
 CudaConjugateGradientImageFilter<TImage>::GPUGenerateData()
 {
   using DataType = typename itk::PixelTraits<typename TImage::PixelType>::ValueType;
-  long int numberOfElements = this->GetOutput()->GetLargestPossibleRegion().GetNumberOfPixels() *
-                              itk::PixelTraits<typename TImage::PixelType>::Dimension;
+  size_t numberOfElements = this->GetOutput()->GetLargestPossibleRegion().GetNumberOfPixels() *
+                            itk::PixelTraits<typename TImage::PixelType>::Dimension;
 
   // Create and allocate images
   typename TImage::Pointer P_k = TImage::New();
diff --git a/include/rtkCudaUtilities.hcu b/include/rtkCudaUtilities.hcu
index d4ab3b8bb..2a49cca81 100644
--- a/include/rtkCudaUtilities.hcu
+++ b/include/rtkCudaUtilities.hcu
@@ -26,7 +26,6 @@
 #include <itkMacro.h>
 #undef ITK_STATIC
 #include <cuda.h>
-#include <cublas_v2.h>
 
 #define CUDA_CHECK_ERROR                                                                                               \
   {                                                                                                                    \
diff --git a/src/rtkCudaConjugateGradientImageFilter.cu b/src/rtkCudaConjugateGradientImageFilter.cu
index f04420dea..42b53f68f 100644
--- a/src/rtkCudaConjugateGradientImageFilter.cu
+++ b/src/rtkCudaConjugateGradientImageFilter.cu
@@ -28,7 +28,7 @@
 // TEXTURES AND CONSTANTS //
 
 void
-CUDA_copy(long int numberOfElements, float * in, float * out)
+CUDA_copy(size_t numberOfElements, float * in, float * out)
 {
   // Copy input volume to output
   size_t memorySizeOutput = numberOfElements * sizeof(float);
@@ -36,7 +36,7 @@ CUDA_copy(long int numberOfElements, float * in, float * out)
 }
 
 void
-CUDA_copy(long int numberOfElements, double * in, double * out)
+CUDA_copy(size_t numberOfElements, double * in, double * out)
 {
   // Copy input volume to output
   size_t memorySizeOutput = numberOfElements * sizeof(double);
@@ -44,13 +44,17 @@ CUDA_copy(long int numberOfElements, double * in, double * out)
 }
 
 void
-CUDA_subtract(long int numberOfElements, float * out, float * toBeSubtracted)
+CUDA_subtract(size_t numberOfElements, float * out, float * toBeSubtracted)
 {
   cublasHandle_t handle;
   cublasCreate(&handle);
 
   const float alpha = -1.0;
-  cublasSaxpy(handle, numberOfElements, &alpha, toBeSubtracted, 1, out, 1);
+#if CUDA_VERSION < 12000
+  cublasSaxpy(handle, (int)numberOfElements, &alpha, toBeSubtracted, 1, out, 1);
+#else
+  cublasSaxpy_64(handle, numberOfElements, &alpha, toBeSubtracted, 1, out, 1);
+#endif
 
   // Destroy Cublas context
   cublasDestroy(handle);
@@ -59,13 +63,13 @@ CUDA_subtract(long int numberOfElements, float * out, float * toBeSubtracted)
 }
 
 void
-CUDA_subtract(long int numberOfElements, double * out, double * toBeSubtracted)
+CUDA_subtract(size_t numberOfElements, double * out, double * toBeSubtracted)
 {
   cublasHandle_t handle;
   cublasCreate(&handle);
 
   const double alpha = -1.0;
-  cublasDaxpy(handle, numberOfElements, &alpha, toBeSubtracted, 1, out, 1);
+  cublasDaxpy(handle, (int)numberOfElements, &alpha, toBeSubtracted, 1, out, 1);
 
   // Destroy Cublas context
   cublasDestroy(handle);
@@ -74,7 +78,7 @@ CUDA_subtract(long int numberOfElements, double * out, double * toBeSubtracted)
 }
 
 void
-CUDA_conjugate_gradient(long int numberOfElements, float * Xk, float * Rk, float * Pk, float * APk)
+CUDA_conjugate_gradient(size_t numberOfElements, float * Xk, float * Rk, float * Pk, float * APk)
 {
   cublasHandle_t handle;
   cublasCreate(&handle);
@@ -83,24 +87,24 @@ CUDA_conjugate_gradient(long int numberOfElements, float * Xk, float * Rk, float
 
   // Compute Rk_square = sum(Rk(:).^2) by cublas
   float Rk_square = 0;
-  cublasSdot(handle, numberOfElements, Rk, 1, Rk, 1, &Rk_square);
+  cublasSdot(handle, (int)numberOfElements, Rk, 1, Rk, 1, &Rk_square);
 
   // Compute alpha_k = Rk_square / sum(Pk(:) .* APk(:))
   float Pk_APk = 0;
-  cublasSdot(handle, numberOfElements, Pk, 1, APk, 1, &Pk_APk);
+  cublasSdot(handle, (int)numberOfElements, Pk, 1, APk, 1, &Pk_APk);
 
   const float alpha_k = Rk_square / (Pk_APk + eps);
   const float minus_alpha_k = -alpha_k;
 
   // Compute Xk+1 = Xk + alpha_k * Pk
-  cublasSaxpy(handle, numberOfElements, &alpha_k, Pk, 1, Xk, 1);
+  cublasSaxpy(handle, (int)numberOfElements, &alpha_k, Pk, 1, Xk, 1);
 
   // Compute Rk+1 = Rk - alpha_k * APk
-  cublasSaxpy(handle, numberOfElements, &minus_alpha_k, APk, 1, Rk, 1);
+  cublasSaxpy(handle, (int)numberOfElements, &minus_alpha_k, APk, 1, Rk, 1);
 
   // Compute beta_k = sum(Rk+1(:).^2) / Rk_square
   float Rkplusone_square = 0;
-  cublasSdot(handle, numberOfElements, Rk, 1, Rk, 1, &Rkplusone_square);
+  cublasSdot(handle, (int)numberOfElements, Rk, 1, Rk, 1, &Rkplusone_square);
 
   float beta_k = Rkplusone_square / (Rk_square + eps);
   float one = 1.0;
@@ -108,8 +112,8 @@ CUDA_conjugate_gradient(long int numberOfElements, float * Xk, float * Rk, float
   // Compute Pk+1 = Rk+1 + beta_k * Pk
   // This requires two cublas functions,
   // since axpy would store the result in the wrong array
-  cublasSscal(handle, numberOfElements, &beta_k, Pk, 1);
-  cublasSaxpy(handle, numberOfElements, &one, Rk, 1, Pk, 1);
+  cublasSscal(handle, (int)numberOfElements, &beta_k, Pk, 1);
+  cublasSaxpy(handle, (int)numberOfElements, &one, Rk, 1, Pk, 1);
 
   // Destroy Cublas context
   cublasDestroy(handle);
@@ -118,7 +122,7 @@ CUDA_conjugate_gradient(long int numberOfElements, float * Xk, float * Rk, float
 }
 
 void
-CUDA_conjugate_gradient(long int numberOfElements, double * Xk, double * Rk, double * Pk, double * APk)
+CUDA_conjugate_gradient(size_t numberOfElements, double * Xk, double * Rk, double * Pk, double * APk)
 {
   cublasHandle_t handle;
   cublasCreate(&handle);
@@ -127,24 +131,24 @@ CUDA_conjugate_gradient(long int numberOfElements, double * Xk, double * Rk, dou
 
   // Compute Rk_square = sum(Rk(:).^2) by cublas
   double Rk_square = 0;
-  cublasDdot(handle, numberOfElements, Rk, 1, Rk, 1, &Rk_square);
+  cublasDdot(handle, (int)numberOfElements, Rk, 1, Rk, 1, &Rk_square);
 
   // Compute alpha_k = Rk_square / sum(Pk(:) .* APk(:))
   double Pk_APk = 0;
-  cublasDdot(handle, numberOfElements, Pk, 1, APk, 1, &Pk_APk);
+  cublasDdot(handle, (int)numberOfElements, Pk, 1, APk, 1, &Pk_APk);
 
   const double alpha_k = Rk_square / (Pk_APk + eps);
   const double minus_alpha_k = -alpha_k;
 
   // Compute Xk+1 = Xk + alpha_k * Pk
-  cublasDaxpy(handle, numberOfElements, &alpha_k, Pk, 1, Xk, 1);
+  cublasDaxpy(handle, (int)numberOfElements, &alpha_k, Pk, 1, Xk, 1);
 
   // Compute Rk+1 = Rk - alpha_k * APk
-  cublasDaxpy(handle, numberOfElements, &minus_alpha_k, APk, 1, Rk, 1);
+  cublasDaxpy(handle, (int)numberOfElements, &minus_alpha_k, APk, 1, Rk, 1);
 
   // Compute beta_k = sum(Rk+1(:).^2) / Rk_square
   double Rkplusone_square = 0;
-  cublasDdot(handle, numberOfElements, Rk, 1, Rk, 1, &Rkplusone_square);
+  cublasDdot(handle, (int)numberOfElements, Rk, 1, Rk, 1, &Rkplusone_square);
 
   double beta_k = Rkplusone_square / (Rk_square + eps);
   double one = 1.0;
@@ -152,8 +156,8 @@ CUDA_conjugate_gradient(long int numberOfElements, double * Xk, double * Rk, dou
   // Compute Pk+1 = Rk+1 + beta_k * Pk
   // This requires two cublas functions,
   // since axpy would store the result in the wrong array
-  cublasDscal(handle, numberOfElements, &beta_k, Pk, 1);
-  cublasDaxpy(handle, numberOfElements, &one, Rk, 1, Pk, 1);
+  cublasDscal(handle, (int)numberOfElements, &beta_k, Pk, 1);
+  cublasDaxpy(handle, (int)numberOfElements, &one, Rk, 1, Pk, 1);
 
   // Destroy Cublas context
   cublasDestroy(handle);
diff --git a/src/rtkCudaCyclicDeformationImageFilter.cu b/src/rtkCudaCyclicDeformationImageFilter.cu
index 79f9da0d1..53bcce3e7 100644
--- a/src/rtkCudaCyclicDeformationImageFilter.cu
+++ b/src/rtkCudaCyclicDeformationImageFilter.cu
@@ -57,11 +57,19 @@ CUDA_linear_interpolate_along_fourth_dimension(unsigned int inputSize[4],
 
   // Add it weightInf times the frameInf to the output
   float * pinf = input + frameInf * numel;
-  cublasSaxpy(handle, numel, &wInf, pinf, 1, output, 1);
+#if CUDA_VERSION < 12000
+  cublasSaxpy(handle, (int)numel, &wInf, pinf, 1, output, 1);
+#else
+  cublasSaxpy_64(handle, numel, &wInf, pinf, 1, output, 1);
+#endif
 
   // Add it weightSup times the frameSup to the output
   float * psup = input + frameSup * numel;
-  cublasSaxpy(handle, numel, &wSup, psup, 1, output, 1);
+#if CUDA_VERSION < 12000
+  cublasSaxpy(handle, (int)numel, &wSup, psup, 1, output, 1);
+#else
+  cublasSaxpy_64(handle, numel, &wSup, psup, 1, output, 1);
+#endif
 
   // Destroy Cublas context
   cublasDestroy(handle);
diff --git a/src/rtkCudaInterpolateImageFilter.cu b/src/rtkCudaInterpolateImageFilter.cu
index ee7bcca32..9760f0119 100644
--- a/src/rtkCudaInterpolateImageFilter.cu
+++ b/src/rtkCudaInterpolateImageFilter.cu
@@ -34,7 +34,7 @@ CUDA_interpolation(const int4 & inputSize, float * input, float * output, int pr
   cublasCreate(&handle);
 
   // CUDA device pointers
-  int    nVoxelsOutput = inputSize.x * inputSize.y * inputSize.z;
+  size_t nVoxelsOutput = inputSize.x * inputSize.y * inputSize.z;
   size_t memorySizeOutput = nVoxelsOutput * sizeof(float);
 
   // Reset output volume
@@ -49,7 +49,11 @@ CUDA_interpolation(const int4 & inputSize, float * input, float * output, int pr
       float * p = input + phase * nVoxelsOutput;
 
       // Add "weight" times the "phase"-th volume in the input to the output
-      cublasSaxpy(handle, nVoxelsOutput, &weight, p, 1, output, 1);
+#if CUDA_VERSION < 12000
+      cublasSaxpy(handle, (int)nVoxelsOutput, &weight, p, 1, output, 1);
+#else
+      cublasSaxpy_64(handle, nVoxelsOutput, &weight, p, 1, output, 1);
+#endif
     }
   }
 
diff --git a/src/rtkCudaSplatImageFilter.cu b/src/rtkCudaSplatImageFilter.cu
index ffb127869..726aa3e43 100644
--- a/src/rtkCudaSplatImageFilter.cu
+++ b/src/rtkCudaSplatImageFilter.cu
@@ -33,7 +33,7 @@ CUDA_splat(const int4 & outputSize, float * input, float * output, int projectio
   cublasHandle_t handle;
   cublasCreate(&handle);
 
-  int numel = outputSize.x * outputSize.y * outputSize.z;
+  size_t numel = outputSize.x * outputSize.y * outputSize.z;
 
   for (int phase = 0; phase < outputSize.w; phase++)
   {
@@ -44,7 +44,11 @@ CUDA_splat(const int4 & outputSize, float * input, float * output, int projectio
       float * p = output + phase * numel;
 
       // Add "weight" times the input to the "phase"-th volume in the output
-      cublasSaxpy(handle, numel, &weight, input, 1, p, 1);
+#if CUDA_VERSION < 12000
+      cublasSaxpy(handle, (int)numel, &weight, input, 1, p, 1);
+#else
+      cublasSaxpy_64(handle, numel, &weight, input, 1, p, 1);
+#endif
     }
   }
 
diff --git a/src/rtkCudaUtilities.cu b/src/rtkCudaUtilities.cu
index 95f306141..bab5d4bff 100644
--- a/src/rtkCudaUtilities.cu
+++ b/src/rtkCudaUtilities.cu
@@ -182,7 +182,11 @@ prepareVectorTextureObject(int                                size[3],
 
     // Fill it with the current component
     const float * pComponent = dev_ptr + component;
+#if CUDA_VERSION < 12000
     cublasSaxpy(handle, (int)numel, &one, pComponent, nComponents, singleComponent, 1);
+#else
+    cublasSaxpy_64(handle, numel, &one, pComponent, nComponents, singleComponent, 1);
+#endif
 
     // Allocate the cudaArray. Projections use layered arrays, volumes use default 3D arrays
     if (isProjections)