diff --git a/include/rtkCudaUtilities.hcu b/include/rtkCudaUtilities.hcu index 6cf2b6090..612a8fda6 100644 --- a/include/rtkCudaUtilities.hcu +++ b/include/rtkCudaUtilities.hcu @@ -126,13 +126,15 @@ inline __host__ __device__ float { return u.x * v.x + u.y * v.y + u.z * v.z; } + __host__ void -prepareTextureObject(int size[3], - float * dev_ptr, - cudaArray **& componentArrays, - unsigned int nComponents, - cudaTextureObject_t * tex, - bool isProjections); +prepareVectorTextureObject(int size[3], + const float * dev_ptr, + std::vector & componentArrays, + const unsigned int nComponents, + std::vector & tex, + bool isProjections); + inline __device__ void matrix_matrix_multiply(float * A, float * B, float * C, unsigned int rowsA, unsigned int colsB, unsigned int colsArowsB) diff --git a/src/rtkCudaBackProjectionImageFilter.cu b/src/rtkCudaBackProjectionImageFilter.cu index 577a533c4..5ffd4faac 100644 --- a/src/rtkCudaBackProjectionImageFilter.cu +++ b/src/rtkCudaBackProjectionImageFilter.cu @@ -161,18 +161,15 @@ CUDA_back_project(int projSize[3], dim3 dimBlock = dim3(tBlock_x, tBlock_y, tBlock_z); CUDA_CHECK_ERROR; - cudaArray ** projComponentArrays = new cudaArray *[vectorLength]; - - // Create an array of textures - cudaTextureObject_t * tex_proj = new cudaTextureObject_t[vectorLength]; - // Prepare texture objects (needs an array of cudaTextureObjects on the host as "tex_proj" argument) - prepareTextureObject(projSize, dev_proj, projComponentArrays, vectorLength, tex_proj, true); + std::vector projComponentArrays; + std::vector tex_proj; + prepareVectorTextureObject(projSize, dev_proj, projComponentArrays, vectorLength, tex_proj, true); // Copy them to a device pointer, since it will have to be de-referenced in the kernels cudaTextureObject_t * dev_tex_proj; cudaMalloc(&dev_tex_proj, vectorLength * sizeof(cudaTextureObject_t)); - cudaMemcpy(dev_tex_proj, tex_proj, vectorLength * sizeof(cudaTextureObject_t), cudaMemcpyHostToDevice); + cudaMemcpy(dev_tex_proj, tex_proj.data(), vectorLength * sizeof(cudaTextureObject_t), cudaMemcpyHostToDevice); // Run the kernel. Since "vectorLength" is passed as a function argument, not as a template argument, // the compiler can't assume it's constant, and a dirty trick has to be used. @@ -237,11 +234,11 @@ CUDA_back_project(int projSize[3], // Cleanup for (unsigned int c = 0; c < vectorLength; c++) { - cudaFreeArray((cudaArray *)projComponentArrays[c]); + cudaFreeArray(projComponentArrays[c]); + CUDA_CHECK_ERROR; cudaDestroyTextureObject(tex_proj[c]); + CUDA_CHECK_ERROR; } cudaFree(dev_tex_proj); - delete[] tex_proj; - delete[] projComponentArrays; CUDA_CHECK_ERROR; } diff --git a/src/rtkCudaForwardProjectionImageFilter.cu b/src/rtkCudaForwardProjectionImageFilter.cu index 9b14365c7..3940d0f0d 100644 --- a/src/rtkCudaForwardProjectionImageFilter.cu +++ b/src/rtkCudaForwardProjectionImageFilter.cu @@ -60,7 +60,7 @@ __constant__ float c_sourcePos[SLAB_SIZE * 3]; // Can process stacks of at most //_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_ // KERNEL kernel_forwardProject -template +template __global__ void kernel_forwardProject(float * dev_proj_in, float * dev_proj_out, cudaTextureObject_t * dev_tex_vol) { @@ -103,8 +103,8 @@ kernel_forwardProject(float * dev_proj_in, float * dev_proj_out, cudaTextureObje // Detect intersection with box if (!intersectBox(ray, &tnear, &tfar, c_boxMin, c_boxMax) || tfar < 0.f) { - for (unsigned int c = 0; c < vectorLength; c++) - dev_proj_out[projOffset * vectorLength + c] = dev_proj_in[projOffset * vectorLength + c]; + for (unsigned int c = 0; c < VVectorLength; c++) + dev_proj_out[projOffset * VVectorLength + c] = dev_proj_in[projOffset * VVectorLength + c]; } else { @@ -123,9 +123,9 @@ kernel_forwardProject(float * dev_proj_in, float * dev_proj_out, cudaTextureObje pos = ray.o + tnear * ray.d; float t; - float sample[vectorLength]; - float sum[vectorLength]; - for (unsigned int c = 0; c < vectorLength; c++) + float sample[VVectorLength]; + float sum[VVectorLength]; + for (unsigned int c = 0; c < VVectorLength; c++) { sample[c] = 0.0f; sum[c] = 0.0f; @@ -134,11 +134,11 @@ kernel_forwardProject(float * dev_proj_in, float * dev_proj_out, cudaTextureObje for (t = tnear; t <= tfar; t += vStep) { // Read from 3D texture from volume(s) - for (unsigned int c = 0; c < vectorLength; c++) + for (unsigned int c = 0; c < VVectorLength; c++) sample[c] = tex3D(dev_tex_vol[c], pos.x, pos.y, pos.z); // Accumulate - for (unsigned int c = 0; c < vectorLength; c++) + for (unsigned int c = 0; c < VVectorLength; c++) sum[c] += sample[c]; // Step forward @@ -146,9 +146,9 @@ kernel_forwardProject(float * dev_proj_in, float * dev_proj_out, cudaTextureObje } // Update the output projection pixels - for (unsigned int c = 0; c < vectorLength; c++) - dev_proj_out[projOffset * vectorLength + c] = - dev_proj_in[projOffset * vectorLength + c] + (sum[c] + (tfar - t + halfVStep) / vStep * sample[c]) * c_tStep; + for (unsigned int c = 0; c < VVectorLength; c++) + dev_proj_out[projOffset * VVectorLength + c] = + dev_proj_in[projOffset * VVectorLength + c] + (sum[c] + (tfar - t + halfVStep) / vStep * sample[c]) * c_tStep; } } } @@ -198,17 +198,15 @@ CUDA_forward_project(int projSize[3], cudaMemcpyToSymbol( c_translatedVolumeTransformMatrices, &(translatedVolumeTransformMatrices[0]), 12 * sizeof(float) * projSize[2]); - // Create an array of textures - cudaTextureObject_t * tex_vol = new cudaTextureObject_t[vectorLength]; - cudaArray ** volComponentArrays = new cudaArray *[vectorLength]; - - // Prepare texture objects (needs an array of cudaTextureObjects on the host as "tex_vol" argument) - prepareTextureObject(volSize, dev_vol, volComponentArrays, vectorLength, tex_vol, false); + // Prepare texture objects + std::vector volComponentArrays; + std::vector tex_vol; + prepareVectorTextureObject(volSize, dev_vol, volComponentArrays, vectorLength, tex_vol, false); // Copy them to a device pointer, since it will have to be de-referenced in the kernels cudaTextureObject_t * dev_tex_vol; cudaMalloc(&dev_tex_vol, vectorLength * sizeof(cudaTextureObject_t)); - cudaMemcpy(dev_tex_vol, tex_vol, vectorLength * sizeof(cudaTextureObject_t), cudaMemcpyHostToDevice); + cudaMemcpy(dev_tex_vol, tex_vol.data(), vectorLength * sizeof(cudaTextureObject_t), cudaMemcpyHostToDevice); // Run the kernel. Since "vectorLength" is passed as a function argument, not as a template argument, // the compiler can't assume it's constant, and a dirty trick has to be used. @@ -239,11 +237,10 @@ CUDA_forward_project(int projSize[3], for (unsigned int c = 0; c < vectorLength; c++) { cudaFreeArray((cudaArray *)volComponentArrays[c]); + CUDA_CHECK_ERROR; cudaDestroyTextureObject(tex_vol[c]); + CUDA_CHECK_ERROR; } cudaFree(dev_tex_vol); - delete[] volComponentArrays; CUDA_CHECK_ERROR; - - delete[] tex_vol; } diff --git a/src/rtkCudaUtilities.cu b/src/rtkCudaUtilities.cu index 4f11e45db..4f3938d05 100644 --- a/src/rtkCudaUtilities.cu +++ b/src/rtkCudaUtilities.cu @@ -77,13 +77,16 @@ GetFreeGPUGlobalMemory(int device) } __host__ void -prepareTextureObject(int size[3], - float * dev_ptr, - cudaArray **& componentArrays, - unsigned int nComponents, - cudaTextureObject_t * tex, - bool isProjections) +prepareVectorTextureObject(int size[3], + const float * dev_ptr, + std::vector & componentArrays, + const unsigned int nComponents, + std::vector & tex, + bool isProjections) { + componentArrays.resize(nComponents); + tex.resize(nComponents); + // Create CUBLAS context cublasHandle_t handle; cublasCreate(&handle); @@ -129,15 +132,15 @@ prepareTextureObject(int size[3], // Allocate the cudaArray. Projections use layered arrays, volumes use default 3D arrays if (isProjections) - cudaMalloc3DArray((cudaArray **)&componentArrays[component], &channelDesc, volExtent, cudaArrayLayered); + cudaMalloc3DArray(&componentArrays[component], &channelDesc, volExtent, cudaArrayLayered); else - cudaMalloc3DArray((cudaArray **)&componentArrays[component], &channelDesc, volExtent); + cudaMalloc3DArray(&componentArrays[component], &channelDesc, volExtent); CUDA_CHECK_ERROR; // Fill it with the current singleComponent cudaMemcpy3DParms CopyParams = cudaMemcpy3DParms(); CopyParams.srcPtr = make_cudaPitchedPtr(singleComponent, size[0] * sizeof(float), size[0], size[1]); - CopyParams.dstArray = (cudaArray *)componentArrays[component]; + CopyParams.dstArray = componentArrays[component]; CopyParams.extent = volExtent; CopyParams.kind = cudaMemcpyDeviceToDevice; cudaMemcpy3D(&CopyParams);