From f953b5f9e540e978d3072fb2b06f2f72da63f06f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Onur=20=C3=9Clgen?= Date: Fri, 17 Nov 2023 09:53:15 +0000 Subject: [PATCH] Convert reference and floating images to float arrays from cudaArrays #92 - Eliminate unnecessary Cuda::* functions - Refactor Cuda::CreateTextureObject() --- niftyreg_build_version.txt | 2 +- reg-apps/reg_benchmark.cpp | 20 +- reg-lib/cuda/CudaCommon.cu | 325 ++++--------------- reg-lib/cuda/CudaCommon.hpp | 62 ++-- reg-lib/cuda/CudaContent.cpp | 8 +- reg-lib/cuda/CudaContent.h | 16 +- reg-lib/cuda/CudaKernelConvolution.cu | 14 +- reg-lib/cuda/CudaNormaliseGradient.cu | 6 +- reg-lib/cuda/_reg_localTransformation_gpu.cu | 60 ++-- reg-lib/cuda/_reg_measure_gpu.h | 20 +- reg-lib/cuda/_reg_nmi_gpu.cu | 94 +++--- reg-lib/cuda/_reg_nmi_gpu.h | 8 +- reg-lib/cuda/_reg_optimiser_gpu.cu | 29 +- reg-lib/cuda/_reg_resampling_gpu.cu | 19 +- reg-lib/cuda/_reg_resampling_gpu.h | 4 +- reg-lib/cuda/_reg_resampling_kernels.cu | 43 ++- reg-lib/cuda/_reg_ssd_gpu.cu | 62 ++-- reg-lib/cuda/_reg_ssd_gpu.h | 4 +- reg-lib/cuda/_reg_ssd_kernels.cu | 14 +- reg-lib/cuda/_reg_tools_gpu.cu | 15 +- reg-lib/cuda/blockMatchingKernel.cu | 9 +- 21 files changed, 281 insertions(+), 553 deletions(-) diff --git a/niftyreg_build_version.txt b/niftyreg_build_version.txt index 9c6f0c3e..47531021 100644 --- a/niftyreg_build_version.txt +++ b/niftyreg_build_version.txt @@ -1 +1 @@ -364 +365 diff --git a/reg-apps/reg_benchmark.cpp b/reg-apps/reg_benchmark.cpp index 52661f88..6a8ebfbe 100644 --- a/reg-apps/reg_benchmark.cpp +++ b/reg-apps/reg_benchmark.cpp @@ -181,18 +181,18 @@ int main(int argc, char **argv) #ifdef USE_CUDA float *targetImageArray_d; - cudaArray *sourceImageArray_d; + float *sourceImageArray_d; int *targetMask_d; float4 *deformationFieldImageArray_d; if(runGPU) { - Cuda::Allocate(&targetImageArray_d, targetImage->nvox); - Cuda::TransferNiftiToDevice(targetImageArray_d, targetImage); + Cuda::Allocate(&targetImageArray_d, targetImage->nvox); + Cuda::TransferNiftiToDevice(targetImageArray_d, targetImage); Cuda::Allocate(&sourceImageArray_d, sourceImage->nvox); - Cuda::TransferNiftiToDevice(sourceImageArray_d,sourceImage); - CUDA_SAFE_CALL(cudaMalloc((void **)&targetMask_d, targetImage->nvox*sizeof(int))); + Cuda::TransferNiftiToDevice(sourceImageArray_d,sourceImage); + CUDA_SAFE_CALL(cudaMalloc((void**)&targetMask_d, targetImage->nvox*sizeof(int))); CUDA_SAFE_CALL(cudaMemcpy(targetMask_d, maskImage, targetImage->nvox*sizeof(int), cudaMemcpyHostToDevice)); - CUDA_SAFE_CALL(cudaMalloc((void **)&deformationFieldImageArray_d, targetImage->nvox*sizeof(float4))); + CUDA_SAFE_CALL(cudaMalloc((void**)&deformationFieldImageArray_d, targetImage->nvox*sizeof(float4))); } #endif @@ -277,8 +277,8 @@ int main(int argc, char **argv) float4 *controlPointImageArray_d; if(runGPU) { - Cuda::Allocate(&controlPointImageArray_d, controlPointImage->dim); - Cuda::TransferNiftiToDevice(controlPointImageArray_d,controlPointImage); + Cuda::Allocate(&controlPointImageArray_d, controlPointImage->dim); + Cuda::TransferNiftiToDevice(controlPointImageArray_d, controlPointImage); } #endif { @@ -330,8 +330,8 @@ int main(int argc, char **argv) float4 *velocityFieldImageArray_d; if(runGPU) { - Cuda::Allocate(&velocityFieldImageArray_d, velocityFieldImage->dim); - Cuda::TransferNiftiToDevice(velocityFieldImageArray_d,velocityFieldImage); + Cuda::Allocate(&velocityFieldImageArray_d, velocityFieldImage->dim); + Cuda::TransferNiftiToDevice(velocityFieldImageArray_d, velocityFieldImage); } #endif { diff --git a/reg-lib/cuda/CudaCommon.cu b/reg-lib/cuda/CudaCommon.cu index 27804dcb..1f56f95e 100644 --- a/reg-lib/cuda/CudaCommon.cu +++ b/reg-lib/cuda/CudaCommon.cu @@ -14,37 +14,16 @@ /* *************************************************************** */ namespace NiftyReg::Cuda { /* *************************************************************** */ -template -void Allocate(cudaArray **arrayCuda, const int *dim) { - const cudaExtent volumeSize = make_cudaExtent(std::abs(dim[1]), std::abs(dim[2]), std::abs(dim[3])); - const cudaChannelFormatDesc texDesc = cudaCreateChannelDesc(); - NR_CUDA_SAFE_CALL(cudaMalloc3DArray(arrayCuda, &texDesc, volumeSize)); -} -template void Allocate(cudaArray**, const int*); -template void Allocate(cudaArray**, const int*); -template void Allocate(cudaArray**, const int*); // for deformation field -/* *************************************************************** */ -template -void Allocate(cudaArray **array1Cuda, cudaArray **array2Cuda, const int *dim) { - const cudaExtent volumeSize = make_cudaExtent(std::abs(dim[1]), std::abs(dim[2]), std::abs(dim[3])); - const cudaChannelFormatDesc texDesc = cudaCreateChannelDesc(); - NR_CUDA_SAFE_CALL(cudaMalloc3DArray(array1Cuda, &texDesc, volumeSize)); - NR_CUDA_SAFE_CALL(cudaMalloc3DArray(array2Cuda, &texDesc, volumeSize)); -} -template void Allocate(cudaArray**, cudaArray**, const int*); -template void Allocate(cudaArray**, cudaArray**, const int*); -template void Allocate(cudaArray**, cudaArray**, const int*); // for deformation field -/* *************************************************************** */ -template -void Allocate(DataType **arrayCuda, const size_t& nVoxels) { +template +void Allocate(DataType **arrayCuda, const size_t nVoxels) { NR_CUDA_SAFE_CALL(cudaMalloc(arrayCuda, nVoxels * sizeof(DataType))); } -template void Allocate(int**, const size_t&); -template void Allocate(float**, const size_t&); -template void Allocate(double**, const size_t&); -template void Allocate(float4**, const size_t&); // for deformation field +template void Allocate(int**, const size_t); +template void Allocate(float**, const size_t); +template void Allocate(double**, const size_t); +template void Allocate(float4**, const size_t); /* *************************************************************** */ -template +template void Allocate(DataType **arrayCuda, const int *dim) { const size_t memSize = (size_t)std::abs(dim[1]) * (size_t)std::abs(dim[2]) * (size_t)std::abs(dim[3]) * sizeof(DataType); NR_CUDA_SAFE_CALL(cudaMalloc(arrayCuda, memSize)); @@ -52,9 +31,9 @@ void Allocate(DataType **arrayCuda, const int *dim) { template void Allocate(int**, const int*); template void Allocate(float**, const int*); template void Allocate(double**, const int*); -template void Allocate(float4**, const int*); // for deformation field +template void Allocate(float4**, const int*); /* *************************************************************** */ -template +template void Allocate(DataType **array1Cuda, DataType **array2Cuda, const int *dim) { const size_t memSize = (size_t)std::abs(dim[1]) * (size_t)std::abs(dim[2]) * (size_t)std::abs(dim[3]) * sizeof(DataType); NR_CUDA_SAFE_CALL(cudaMalloc(array1Cuda, memSize)); @@ -62,167 +41,16 @@ void Allocate(DataType **array1Cuda, DataType **array2Cuda, const int *dim) { } template void Allocate(float**, float**, const int*); template void Allocate(double**, double**, const int*); -template void Allocate(float4**, float4**, const int*); // for deformation field -/* *************************************************************** */ -template -void TransferNiftiToDevice(cudaArray *arrayCuda, const nifti_image *img) { - if (sizeof(DataType) != sizeof(NiftiType)) - NR_FATAL_ERROR("The host and device arrays are of different types"); - cudaMemcpy3DParms copyParams{}; - copyParams.extent = make_cudaExtent(std::abs(img->dim[1]), std::abs(img->dim[2]), std::abs(img->dim[3])); - copyParams.srcPtr = make_cudaPitchedPtr(img->data, - copyParams.extent.width * sizeof(DataType), - copyParams.extent.width, - copyParams.extent.height); - copyParams.dstArray = arrayCuda; - copyParams.kind = cudaMemcpyHostToDevice; - NR_CUDA_SAFE_CALL(cudaMemcpy3D(©Params)); -} +template void Allocate(float4**, float4**, const int*); /* *************************************************************** */ -template -void TransferNiftiToDevice(cudaArray *arrayCuda, const nifti_image *img) { - if (sizeof(DataType) == sizeof(float4)) { - if (img->datatype != NIFTI_TYPE_FLOAT32) - NR_FATAL_ERROR("The specified image is not a single precision image"); - const float *niftiImgValues = static_cast(img->data); - const size_t voxelNumber = NiftiImage::calcVoxelNumber(img, 3); - const auto timePointCount = img->dim[4] * img->dim[5]; - unique_ptr array(new float4[voxelNumber]()); - for (size_t i = 0; i < voxelNumber; i++) - array[i].x = *niftiImgValues++; - if (timePointCount >= 2) { - for (size_t i = 0; i < voxelNumber; i++) - array[i].y = *niftiImgValues++; - } - if (timePointCount >= 3) { - for (size_t i = 0; i < voxelNumber; i++) - array[i].z = *niftiImgValues++; - } - if (timePointCount >= 4) { - for (size_t i = 0; i < voxelNumber; i++) - array[i].w = *niftiImgValues++; - } - cudaMemcpy3DParms copyParams{}; - copyParams.extent = make_cudaExtent(std::abs(img->dim[1]), std::abs(img->dim[2]), std::abs(img->dim[3])); - copyParams.srcPtr = make_cudaPitchedPtr(array.get(), - copyParams.extent.width * sizeof(DataType), - copyParams.extent.width, - copyParams.extent.height); - copyParams.dstArray = arrayCuda; - copyParams.kind = cudaMemcpyHostToDevice; - NR_CUDA_SAFE_CALL(cudaMemcpy3D(©Params)); - } else { // All these else could be removed but the nvcc compiler would warn for unreachable statement - switch (img->datatype) { - case NIFTI_TYPE_FLOAT32: - TransferNiftiToDevice(arrayCuda, img); - break; - default: - NR_FATAL_ERROR("The image data type is not supported"); - } - } -} -template void TransferNiftiToDevice(cudaArray*, const nifti_image*); -template void TransferNiftiToDevice(cudaArray*, const nifti_image*); -template void TransferNiftiToDevice(cudaArray*, const nifti_image*); -template void TransferNiftiToDevice(cudaArray*, const nifti_image*); // for deformation field -/* *************************************************************** */ -template -void TransferNiftiToDevice(cudaArray *array1Cuda, cudaArray *array2Cuda, const nifti_image *img) { - if (sizeof(DataType) != sizeof(NiftiType)) - NR_FATAL_ERROR("The host and device arrays are of different types"); - NiftiType *array1 = static_cast(img->data); - NiftiType *array2 = &array1[NiftiImage::calcVoxelNumber(img, 3)]; - cudaMemcpy3DParms copyParams{}; - copyParams.extent = make_cudaExtent(std::abs(img->dim[1]), std::abs(img->dim[2]), std::abs(img->dim[3])); - copyParams.kind = cudaMemcpyHostToDevice; - // First timepoint - copyParams.srcPtr = make_cudaPitchedPtr(array1, - copyParams.extent.width * sizeof(DataType), - copyParams.extent.width, - copyParams.extent.height); - copyParams.dstArray = array1Cuda; - NR_CUDA_SAFE_CALL(cudaMemcpy3D(©Params)); - // Second timepoint - copyParams.srcPtr = make_cudaPitchedPtr(array2, - copyParams.extent.width * sizeof(DataType), - copyParams.extent.width, - copyParams.extent.height); - copyParams.dstArray = array2Cuda; - NR_CUDA_SAFE_CALL(cudaMemcpy3D(©Params)); -} -/* *************************************************************** */ -template -void TransferNiftiToDevice(cudaArray *array1Cuda, cudaArray *array2Cuda, const nifti_image *img) { - if (sizeof(DataType) == sizeof(float4)) { - if (img->datatype != NIFTI_TYPE_FLOAT32) - NR_FATAL_ERROR("The specified image is not a single precision image"); - const float *niftiImgValues = static_cast(img->data); - const size_t voxelNumber = NiftiImage::calcVoxelNumber(img, 3); - const auto timePointCount = img->dim[4] * img->dim[5]; - unique_ptr array1(new float4[voxelNumber]()); - unique_ptr array2(new float4[voxelNumber]()); - for (size_t i = 0; i < voxelNumber; i++) - array1[i].x = *niftiImgValues++; - for (size_t i = 0; i < voxelNumber; i++) - array2[i].x = *niftiImgValues++; - if (timePointCount >= 2) { - for (size_t i = 0; i < voxelNumber; i++) - array1[i].y = *niftiImgValues++; - for (size_t i = 0; i < voxelNumber; i++) - array2[i].y = *niftiImgValues++; - } - if (timePointCount >= 3) { - for (size_t i = 0; i < voxelNumber; i++) - array1[i].z = *niftiImgValues++; - for (size_t i = 0; i < voxelNumber; i++) - array2[i].z = *niftiImgValues++; - } - if (timePointCount >= 4) { - for (size_t i = 0; i < voxelNumber; i++) - array1[i].w = *niftiImgValues++; - for (size_t i = 0; i < voxelNumber; i++) - array2[i].w = *niftiImgValues++; - } - - cudaMemcpy3DParms copyParams{}; - copyParams.extent = make_cudaExtent(std::abs(img->dim[1]), std::abs(img->dim[2]), std::abs(img->dim[3])); - copyParams.kind = cudaMemcpyHostToDevice; - // First timepoint - copyParams.srcPtr = make_cudaPitchedPtr(array1.get(), - copyParams.extent.width * sizeof(DataType), - copyParams.extent.width, - copyParams.extent.height); - copyParams.dstArray = array1Cuda; - NR_CUDA_SAFE_CALL(cudaMemcpy3D(©Params)); - // Second timepoint - copyParams.srcPtr = make_cudaPitchedPtr(array2.get(), - copyParams.extent.width * sizeof(DataType), - copyParams.extent.width, - copyParams.extent.height); - copyParams.dstArray = array2Cuda; - NR_CUDA_SAFE_CALL(cudaMemcpy3D(©Params)); - } else { // All these else could be removed but the nvcc compiler would warn for unreachable statement - switch (img->datatype) { - case NIFTI_TYPE_FLOAT32: - TransferNiftiToDevice(array1Cuda, array2Cuda, img); - break; - default: - NR_FATAL_ERROR("The image data type is not supported"); - } - } -} -template void TransferNiftiToDevice(cudaArray*, cudaArray*, const nifti_image*); -template void TransferNiftiToDevice(cudaArray*, cudaArray*, const nifti_image*); -template void TransferNiftiToDevice(cudaArray*, cudaArray*, const nifti_image*); // for deformation field -/* *************************************************************** */ -template +template void TransferNiftiToDevice(DataType *arrayCuda, const nifti_image *img) { if (sizeof(DataType) != sizeof(NiftiType)) NR_FATAL_ERROR("The host and device arrays are of different types"); NR_CUDA_SAFE_CALL(cudaMemcpy(arrayCuda, img->data, img->nvox * sizeof(NiftiType), cudaMemcpyHostToDevice)); } /* *************************************************************** */ -template +template void TransferNiftiToDevice(DataType *arrayCuda, const nifti_image *img) { if (sizeof(DataType) == sizeof(float4)) { if (img->datatype != NIFTI_TYPE_FLOAT32) @@ -246,7 +74,7 @@ void TransferNiftiToDevice(DataType *arrayCuda, const nifti_image *img) { array[i].w = *niftiImgValues++; } NR_CUDA_SAFE_CALL(cudaMemcpy(arrayCuda, array.get(), voxelNumber * sizeof(float4), cudaMemcpyHostToDevice)); - } else { // All these else could be removed but the nvcc compiler would warn for unreachable statement + } else { switch (img->datatype) { case NIFTI_TYPE_FLOAT32: TransferNiftiToDevice(arrayCuda, img); @@ -261,7 +89,7 @@ template void TransferNiftiToDevice(float*, const nifti_image*); template void TransferNiftiToDevice(double*, const nifti_image*); template void TransferNiftiToDevice(float4*, const nifti_image*); /* *************************************************************** */ -template +template void TransferNiftiToDevice(DataType *array1Cuda, DataType *array2Cuda, const nifti_image *img) { if (sizeof(DataType) != sizeof(NiftiType)) NR_FATAL_ERROR("The host and device arrays are of different types"); @@ -273,7 +101,7 @@ void TransferNiftiToDevice(DataType *array1Cuda, DataType *array2Cuda, const nif NR_CUDA_SAFE_CALL(cudaMemcpy(array2Cuda, array2, memSize, cudaMemcpyHostToDevice)); } /* *************************************************************** */ -template +template void TransferNiftiToDevice(DataType *array1Cuda, DataType *array2Cuda, const nifti_image *img) { if (sizeof(DataType) == sizeof(float4)) { if (img->datatype != NIFTI_TYPE_FLOAT32) @@ -307,7 +135,7 @@ void TransferNiftiToDevice(DataType *array1Cuda, DataType *array2Cuda, const nif } NR_CUDA_SAFE_CALL(cudaMemcpy(array1Cuda, array1.get(), voxelNumber * sizeof(float4), cudaMemcpyHostToDevice)); NR_CUDA_SAFE_CALL(cudaMemcpy(array2Cuda, array2.get(), voxelNumber * sizeof(float4), cudaMemcpyHostToDevice)); - } else { // All these else could be removed but the nvcc compiler would warn for unreachable statement + } else { switch (img->datatype) { case NIFTI_TYPE_FLOAT32: TransferNiftiToDevice(array1Cuda, array2Cuda, img); @@ -319,38 +147,24 @@ void TransferNiftiToDevice(DataType *array1Cuda, DataType *array2Cuda, const nif } template void TransferNiftiToDevice(float*, float*, const nifti_image*); template void TransferNiftiToDevice(double*, double*, const nifti_image*); -template void TransferNiftiToDevice(float4*, float4*, const nifti_image*); // for deformation field +template void TransferNiftiToDevice(float4*, float4*, const nifti_image*); /* *************************************************************** */ -template -void TransferNiftiToDevice(DataType *arrayCuda, const DataType *img, const size_t& nvox) { +template +void TransferNiftiToDevice(DataType *arrayCuda, const DataType *img, const size_t nvox) { NR_CUDA_SAFE_CALL(cudaMemcpy(arrayCuda, img, nvox * sizeof(DataType), cudaMemcpyHostToDevice)); } -template void TransferNiftiToDevice(int*, const int*, const size_t&); -template void TransferNiftiToDevice(float*, const float*, const size_t&); -template void TransferNiftiToDevice(double*, const double*, const size_t&); +template void TransferNiftiToDevice(int*, const int*, const size_t); +template void TransferNiftiToDevice(float*, const float*, const size_t); +template void TransferNiftiToDevice(double*, const double*, const size_t); /* *************************************************************** */ -void TransferFromDeviceToNifti(nifti_image *img, const cudaArray *arrayCuda) { - if (img->datatype != NIFTI_TYPE_FLOAT32) - NR_FATAL_ERROR("The image data type is not supported"); - cudaMemcpy3DParms copyParams{}; - copyParams.extent = make_cudaExtent(std::abs(img->dim[1]), std::abs(img->dim[2]), std::abs(img->dim[3])); - copyParams.srcArray = const_cast(arrayCuda); - copyParams.dstPtr = make_cudaPitchedPtr(img->data, - copyParams.extent.width * sizeof(float), - copyParams.extent.width, - copyParams.extent.height); - copyParams.kind = cudaMemcpyDeviceToHost; - NR_CUDA_SAFE_CALL(cudaMemcpy3D(©Params)); -} -/* *************************************************************** */ -template +template void TransferFromDeviceToNifti(nifti_image *img, const DataType *arrayCuda) { if (sizeof(DataType) != sizeof(NiftiType)) NR_FATAL_ERROR("The host and device arrays are of different types"); NR_CUDA_SAFE_CALL(cudaMemcpy(img->data, arrayCuda, img->nvox * sizeof(DataType), cudaMemcpyDeviceToHost)); } /* *************************************************************** */ -template +template void TransferFromDeviceToNifti(nifti_image *img, const DataType *arrayCuda) { if (sizeof(DataType) == sizeof(float4)) { // A nifti 5D volume is expected @@ -387,9 +201,9 @@ void TransferFromDeviceToNifti(nifti_image *img, const DataType *arrayCuda) { } template void TransferFromDeviceToNifti(nifti_image*, const float*); template void TransferFromDeviceToNifti(nifti_image*, const double*); -template void TransferFromDeviceToNifti(nifti_image*, const float4*); // for deformation field +template void TransferFromDeviceToNifti(nifti_image*, const float4*); /* *************************************************************** */ -template +template void TransferFromDeviceToNifti(nifti_image *img, const DataType *array1Cuda, const DataType *array2Cuda) { if (sizeof(DataType) != sizeof(NiftiType)) NR_FATAL_ERROR("The host and device arrays are of different types"); @@ -400,7 +214,7 @@ void TransferFromDeviceToNifti(nifti_image *img, const DataType *array1Cuda, con NR_CUDA_SAFE_CALL(cudaMemcpy(array2, array2Cuda, voxelNumber * sizeof(DataType), cudaMemcpyDeviceToHost)); } /* *************************************************************** */ -template +template void TransferFromDeviceToNifti(nifti_image *img, const DataType *array1Cuda, const DataType *array2Cuda) { if (sizeof(DataType) == sizeof(float4)) { // A nifti 5D volume is expected @@ -447,29 +261,24 @@ void TransferFromDeviceToNifti(nifti_image *img, const DataType *array1Cuda, con } template void TransferFromDeviceToNifti(nifti_image*, const float*, const float*); template void TransferFromDeviceToNifti(nifti_image*, const double*, const double*); -template void TransferFromDeviceToNifti(nifti_image*, const float4*, const float4*); // for deformation field +template void TransferFromDeviceToNifti(nifti_image*, const float4*, const float4*); /* *************************************************************** */ -template -void TransferFromDeviceToHost(DataType *array, const DataType *arrayCuda, const size_t& nElements) { +template +void TransferFromDeviceToHost(DataType *array, const DataType *arrayCuda, const size_t nElements) { NR_CUDA_SAFE_CALL(cudaMemcpy(array, arrayCuda, nElements * sizeof(DataType), cudaMemcpyDeviceToHost)); } -template void TransferFromDeviceToHost(float*, const float*, const size_t&); -template void TransferFromDeviceToHost(double*, const double*, const size_t&); +template void TransferFromDeviceToHost(float*, const float*, const size_t); +template void TransferFromDeviceToHost(double*, const double*, const size_t); /* *************************************************************** */ -template -void TransferFromHostToDevice(DataType *arrayCuda, const DataType *array, const size_t& nElements) { +template +void TransferFromHostToDevice(DataType *arrayCuda, const DataType *array, const size_t nElements) { NR_CUDA_SAFE_CALL(cudaMemcpy(arrayCuda, array, nElements * sizeof(DataType), cudaMemcpyHostToDevice)); } -template void TransferFromHostToDevice(int*, const int*, const size_t&); -template void TransferFromHostToDevice(float*, const float*, const size_t&); -template void TransferFromHostToDevice(double*, const double*, const size_t&); +template void TransferFromHostToDevice(int*, const int*, const size_t); +template void TransferFromHostToDevice(float*, const float*, const size_t); +template void TransferFromHostToDevice(double*, const double*, const size_t); /* *************************************************************** */ -void Free(cudaArray *arrayCuda) { - if (arrayCuda != nullptr) - NR_CUDA_SAFE_CALL(cudaFreeArray(arrayCuda)); -} -/* *************************************************************** */ -template +template void Free(DataType *arrayCuda) { if (arrayCuda != nullptr) NR_CUDA_SAFE_CALL(cudaFree(arrayCuda)); @@ -479,56 +288,52 @@ template void Free(float*); template void Free(double*); template void Free(float4*); /* *************************************************************** */ -void DestroyTextureObject(cudaTextureObject_t *texObj) { +template<> +void Free(cudaTextureObject_t *texObj) { NR_CUDA_SAFE_CALL(cudaDestroyTextureObject(*texObj)); delete texObj; } /* *************************************************************** */ -UniqueTextureObjectPtr CreateTextureObject(const void *devPtr, - const cudaResourceType& resType, - const size_t& size, - const cudaChannelFormatKind& channelFormat, - const unsigned& channelCount, - const cudaTextureFilterMode& filterMode, - const bool& normalizedCoordinates) { +template +UniqueTextureObjectPtr CreateTextureObject(const DataType *devPtr, + const size_t count, + const cudaChannelFormatKind channelFormat, + const unsigned channelCount) { // Specify texture cudaResourceDesc resDesc{}; - resDesc.resType = resType; - switch (resType) { - case cudaResourceTypeLinear: - resDesc.res.linear.devPtr = const_cast(devPtr); - resDesc.res.linear.desc.f = channelFormat; - resDesc.res.linear.desc.x = 32; - if (channelCount > 1) - resDesc.res.linear.desc.y = 32; - if (channelCount > 2) - resDesc.res.linear.desc.z = 32; - if (channelCount > 3) - resDesc.res.linear.desc.w = 32; - resDesc.res.linear.sizeInBytes = size; - break; - case cudaResourceTypeArray: - resDesc.res.array.array = static_cast(const_cast(devPtr)); - break; - default: - NR_FATAL_ERROR("Unsupported resource type"); - } + resDesc.resType = cudaResourceTypeLinear; + resDesc.res.linear.devPtr = const_cast(devPtr); + resDesc.res.linear.desc.f = channelFormat; + resDesc.res.linear.desc.x = 32; + if (channelCount > 1) + resDesc.res.linear.desc.y = 32; + if (channelCount > 2) + resDesc.res.linear.desc.z = 32; + if (channelCount > 3) + resDesc.res.linear.desc.w = 32; + resDesc.res.linear.sizeInBytes = count * sizeof(DataType); // Specify texture object parameters cudaTextureDesc texDesc{}; texDesc.addressMode[0] = cudaAddressModeWrap; texDesc.addressMode[1] = cudaAddressModeWrap; texDesc.addressMode[2] = cudaAddressModeWrap; - texDesc.filterMode = filterMode; + texDesc.filterMode = cudaFilterModePoint; texDesc.readMode = cudaReadModeElementType; - texDesc.normalizedCoords = normalizedCoordinates; + texDesc.normalizedCoords = false; // Create texture object - UniqueTextureObjectPtr texObj(new cudaTextureObject_t(), DestroyTextureObject); + UniqueTextureObjectPtr texObj(new cudaTextureObject_t()); NR_CUDA_SAFE_CALL(cudaCreateTextureObject(texObj.get(), &resDesc, &texDesc, nullptr)); return texObj; } +template UniqueTextureObjectPtr CreateTextureObject(const bool*, const size_t, const cudaChannelFormatKind, const unsigned); +template UniqueTextureObjectPtr CreateTextureObject(const int*, const size_t, const cudaChannelFormatKind, const unsigned); +template UniqueTextureObjectPtr CreateTextureObject(const float*, const size_t, const cudaChannelFormatKind, const unsigned); +template UniqueTextureObjectPtr CreateTextureObject(const float2*, const size_t, const cudaChannelFormatKind, const unsigned); +template UniqueTextureObjectPtr CreateTextureObject(const float4*, const size_t, const cudaChannelFormatKind, const unsigned); +template UniqueTextureObjectPtr CreateTextureObject(const mat33*, const size_t, const cudaChannelFormatKind, const unsigned); /* *************************************************************** */ } // namespace NiftyReg::Cuda /* *************************************************************** */ diff --git a/reg-lib/cuda/CudaCommon.hpp b/reg-lib/cuda/CudaCommon.hpp index 9b32dd4d..b5872e56 100644 --- a/reg-lib/cuda/CudaCommon.hpp +++ b/reg-lib/cuda/CudaCommon.hpp @@ -69,53 +69,37 @@ inline void CheckKernel(const std::string& file, const int line, const std::stri #define NR_CUDA_SAFE_CALL(call) { call; NiftyReg::Cuda::Internal::SafeCall(__FILE__, __LINE__, NR_FUNCTION); } #define NR_CUDA_CHECK_KERNEL(grid, block) NiftyReg::Cuda::Internal::CheckKernel(__FILE__, __LINE__, NR_FUNCTION, grid, block) /* *************************************************************** */ -template -void Allocate(cudaArray**, const int*); +template +void Allocate(DataType**, const size_t); /* *************************************************************** */ -template -void Allocate(cudaArray**, cudaArray**, const int*); -/* *************************************************************** */ -template -void Allocate(DataType**, const size_t&); -/* *************************************************************** */ -template +template void Allocate(DataType**, const int*); /* *************************************************************** */ -template +template void Allocate(DataType**, DataType**, const int*); /* *************************************************************** */ -template -void TransferNiftiToDevice(cudaArray*, const nifti_image*); -/* *************************************************************** */ -template -void TransferNiftiToDevice(cudaArray*, cudaArray*, const nifti_image*); -/* *************************************************************** */ -template +template void TransferNiftiToDevice(DataType*, const nifti_image*); /* *************************************************************** */ -template +template void TransferNiftiToDevice(DataType*, DataType*, const nifti_image*); /* *************************************************************** */ -template -void TransferNiftiToDevice(DataType*, const DataType*, const size_t&); +template +void TransferNiftiToDevice(DataType*, const DataType*, const size_t); /* *************************************************************** */ -void TransferFromDeviceToNifti(nifti_image*, const cudaArray*); -/* *************************************************************** */ -template +template void TransferFromDeviceToNifti(nifti_image*, const DataType*); /* *************************************************************** */ -template +template void TransferFromDeviceToNifti(nifti_image*, const DataType*, const DataType*); /* *************************************************************** */ -template -void TransferFromDeviceToHost(DataType*, const DataType*, const size_t&); -/* *************************************************************** */ -template -void TransferFromHostToDevice(DataType*, const DataType*, const size_t&); +template +void TransferFromDeviceToHost(DataType*, const DataType*, const size_t); /* *************************************************************** */ -void Free(cudaArray*); +template +void TransferFromHostToDevice(DataType*, const DataType*, const size_t); /* *************************************************************** */ -template +template void Free(DataType*); /* *************************************************************** */ namespace Internal { @@ -123,18 +107,16 @@ template struct UniquePtrDeleter { void operator()(T *ptr) const { Free(ptr); } }; } /* *************************************************************** */ -template +template using UniquePtr = unique_ptr>; /* *************************************************************** */ -using UniqueTextureObjectPtr = unique_ptr; +using UniqueTextureObjectPtr = UniquePtr; /* *************************************************************** */ -UniqueTextureObjectPtr CreateTextureObject(const void *devPtr, - const cudaResourceType& resType, - const size_t& size = 0, - const cudaChannelFormatKind& channelFormat = cudaChannelFormatKindNone, - const unsigned& channelCount = 1, - const cudaTextureFilterMode& filterMode = cudaFilterModePoint, - const bool& normalizedCoordinates = false); +template +UniqueTextureObjectPtr CreateTextureObject(const DataType *devPtr, + const size_t count, + const cudaChannelFormatKind channelFormat, + const unsigned channelCount); /* *************************************************************** */ } // namespace NiftyReg::Cuda /* *************************************************************** */ diff --git a/reg-lib/cuda/CudaContent.cpp b/reg-lib/cuda/CudaContent.cpp index 37df05ab..f26f8c69 100644 --- a/reg-lib/cuda/CudaContent.cpp +++ b/reg-lib/cuda/CudaContent.cpp @@ -25,17 +25,17 @@ CudaContent::~CudaContent() { void CudaContent::AllocateReference() { if (reference->nbyper != NIFTI_TYPE_FLOAT32) reg_tools_changeDatatype(reference); - Cuda::Allocate(&referenceCuda, reference->dim); + Cuda::Allocate(&referenceCuda, reference->nvox); referenceCudaManaged.reset(referenceCuda); - Cuda::TransferNiftiToDevice(referenceCuda, reference); + Cuda::TransferNiftiToDevice(referenceCuda, reference); } /* *************************************************************** */ void CudaContent::AllocateFloating() { if (floating->nbyper != NIFTI_TYPE_FLOAT32) reg_tools_changeDatatype(floating); - Cuda::Allocate(&floatingCuda, floating->dim); + Cuda::Allocate(&floatingCuda, floating->nvox); floatingCudaManaged.reset(floatingCuda); - Cuda::TransferNiftiToDevice(floatingCuda, floating); + Cuda::TransferNiftiToDevice(floatingCuda, floating); } /* *************************************************************** */ void CudaContent::AllocateDeformationField() { diff --git a/reg-lib/cuda/CudaContent.h b/reg-lib/cuda/CudaContent.h index f308ec1b..bf3230c4 100644 --- a/reg-lib/cuda/CudaContent.h +++ b/reg-lib/cuda/CudaContent.h @@ -18,8 +18,8 @@ class CudaContent: public virtual Content { // Getters virtual nifti_image* GetDeformationField() override; virtual nifti_image* GetWarped() override; - virtual cudaArray* GetReferenceCuda() { return referenceCuda; } - virtual cudaArray* GetFloatingCuda() { return floatingCuda; } + virtual float* GetReferenceCuda() { return referenceCuda; } + virtual float* GetFloatingCuda() { return floatingCuda; } virtual float4* GetDeformationFieldCuda() { return deformationFieldCuda; } virtual int* GetReferenceMaskCuda() { return referenceMaskCuda; } virtual float* GetTransformationMatrixCuda() { return transformationMatrixCuda; } @@ -30,10 +30,10 @@ class CudaContent: public virtual Content { virtual void UpdateWarped() override; protected: - cudaArray *referenceCuda = nullptr; - Cuda::UniquePtr referenceCudaManaged; - cudaArray *floatingCuda = nullptr; - Cuda::UniquePtr floatingCudaManaged; + float *referenceCuda = nullptr; + Cuda::UniquePtr referenceCudaManaged; + float *floatingCuda = nullptr; + Cuda::UniquePtr floatingCudaManaged; float4 *deformationFieldCuda = nullptr; int *referenceMaskCuda = nullptr; float *transformationMatrixCuda = nullptr; @@ -49,8 +49,8 @@ class CudaContent: public virtual Content { template DataType CastImageData(float intensity, int datatype); template void FillImageData(nifti_image *image, float *memoryObject, int datatype); void DownloadImage(nifti_image *image, float *memoryObject, int datatype); - void SetReferenceCuda(cudaArray *referenceCudaIn) { referenceCudaManaged = nullptr; referenceCuda = referenceCudaIn; } - void SetFloatingCuda(cudaArray *floatingCudaIn) { floatingCudaManaged = nullptr; floatingCuda = floatingCudaIn; } + void SetReferenceCuda(float *referenceCudaIn) { referenceCudaManaged = nullptr; referenceCuda = referenceCudaIn; } + void SetFloatingCuda(float *floatingCudaIn) { floatingCudaManaged = nullptr; floatingCuda = floatingCudaIn; } // Friend classes friend class CudaF3d2ContentCreator; diff --git a/reg-lib/cuda/CudaKernelConvolution.cu b/reg-lib/cuda/CudaKernelConvolution.cu index ff2037ff..67a081ed 100644 --- a/reg-lib/cuda/CudaKernelConvolution.cu +++ b/reg-lib/cuda/CudaKernelConvolution.cu @@ -36,12 +36,9 @@ void NiftyReg::Cuda::KernelConvolution(const nifti_image *image, float *bufferDensityCudaPtr = bufferDensityCuda.data().get(); // Create texture objects - auto imageTexturePtr = Cuda::CreateTextureObject(imageCuda, cudaResourceTypeLinear, - voxelNumber * sizeof(float4), cudaChannelFormatKindFloat, 1); - auto densityTexturePtr = Cuda::CreateTextureObject(densityCudaPtr, cudaResourceTypeLinear, - voxelNumber * sizeof(float), cudaChannelFormatKindFloat, 1); - auto nanImageTexturePtr = Cuda::CreateTextureObject(nanImageCudaPtr, cudaResourceTypeLinear, - voxelNumber * sizeof(bool), cudaChannelFormatKindUnsigned, 1); + auto imageTexturePtr = Cuda::CreateTextureObject(imageCuda, voxelNumber, cudaChannelFormatKindFloat, 1); + auto densityTexturePtr = Cuda::CreateTextureObject(densityCudaPtr, voxelNumber, cudaChannelFormatKindFloat, 1); + auto nanImageTexturePtr = Cuda::CreateTextureObject(nanImageCudaPtr, voxelNumber, cudaChannelFormatKindUnsigned, 1); auto imageTexture = *imageTexturePtr; auto densityTexture = *densityTexturePtr; auto nanImageTexture = *nanImageTexturePtr; @@ -138,12 +135,11 @@ void NiftyReg::Cuda::KernelConvolution(const nifti_image *image, const int imageDim = reinterpret_cast(&imageDims)[n]; // Create the kernel texture thrust::device_vector kernelCuda; - Cuda::UniqueTextureObjectPtr kernelTexturePtr(nullptr, nullptr); + Cuda::UniqueTextureObjectPtr kernelTexturePtr; cudaTextureObject_t kernelTexture = 0; if (kernelSum > 0) { kernelCuda = kernel; - kernelTexturePtr = std::move(Cuda::CreateTextureObject(kernelCuda.data().get(), cudaResourceTypeLinear, - kernel.size() * sizeof(float), cudaChannelFormatKindFloat, 1)); + kernelTexturePtr = Cuda::CreateTextureObject(kernelCuda.data().get(), kernel.size(), cudaChannelFormatKindFloat, 1); kernelTexture = *kernelTexturePtr; } diff --git a/reg-lib/cuda/CudaNormaliseGradient.cu b/reg-lib/cuda/CudaNormaliseGradient.cu index 85a250a5..8d948c2e 100644 --- a/reg-lib/cuda/CudaNormaliseGradient.cu +++ b/reg-lib/cuda/CudaNormaliseGradient.cu @@ -4,8 +4,7 @@ /* *************************************************************** */ template float GetMaximalLength(const float4 *imageCuda, const size_t nVoxels) { - auto imageTexturePtr = Cuda::CreateTextureObject(imageCuda, cudaResourceTypeLinear, - nVoxels * sizeof(float4), cudaChannelFormatKindFloat, 4); + auto imageTexturePtr = Cuda::CreateTextureObject(imageCuda, nVoxels, cudaChannelFormatKindFloat, 4); auto imageTexture = *imageTexturePtr; thrust::counting_iterator index(0); return thrust::transform_reduce(thrust::device, index, index + nVoxels, [=]__device__(const unsigned index) { @@ -47,8 +46,7 @@ float NiftyReg::Cuda::GetMaximalLength(const float4 *imageCuda, /* *************************************************************** */ template void NormaliseGradient(float4 *imageCuda, const size_t nVoxels, const double maxGradLengthInv) { - auto imageTexturePtr = Cuda::CreateTextureObject(imageCuda, cudaResourceTypeLinear, - nVoxels * sizeof(float4), cudaChannelFormatKindFloat, 4); + auto imageTexturePtr = Cuda::CreateTextureObject(imageCuda, nVoxels, cudaChannelFormatKindFloat, 4); auto imageTexture = *imageTexturePtr; thrust::for_each_n(thrust::device, thrust::make_counting_iterator(0), nVoxels, [=]__device__(const unsigned index) { const float4 val = tex1Dfetch(imageTexture, index); diff --git a/reg-lib/cuda/_reg_localTransformation_gpu.cu b/reg-lib/cuda/_reg_localTransformation_gpu.cu index 569136b1..ac5be2b0 100755 --- a/reg-lib/cuda/_reg_localTransformation_gpu.cu +++ b/reg-lib/cuda/_reg_localTransformation_gpu.cu @@ -31,10 +31,8 @@ void reg_spline_getDeformationField_gpu(const nifti_image *controlPointImage, controlPointImage->dy / referenceImage->dy, controlPointImage->dz / referenceImage->dz); - auto controlPointTexture = Cuda::CreateTextureObject(controlPointImageCuda, cudaResourceTypeLinear, - controlPointNumber * sizeof(float4), cudaChannelFormatKindFloat, 4); - auto maskTexture = Cuda::CreateTextureObject(maskCuda, cudaResourceTypeLinear, - activeVoxelNumber * sizeof(int), cudaChannelFormatKindSigned, 1); + auto controlPointTexture = Cuda::CreateTextureObject(controlPointImageCuda, controlPointNumber, cudaChannelFormatKindFloat, 4); + auto maskTexture = Cuda::CreateTextureObject(maskCuda, activeVoxelNumber, cudaChannelFormatKindSigned, 1); // Get the reference matrix if composition is required thrust::device_vector realToVoxel; @@ -151,8 +149,7 @@ template double reg_spline_approxBendingEnergy_gpu(const nifti_image *controlPointImage, const float4 *controlPointImageCuda) { const size_t controlPointNumber = NiftiImage::calcVoxelNumber(controlPointImage, 3); const int3 controlPointImageDim = make_int3(controlPointImage->nx, controlPointImage->ny, controlPointImage->nz); - auto controlPointTexturePtr = Cuda::CreateTextureObject(controlPointImageCuda, cudaResourceTypeLinear, - controlPointNumber * sizeof(float4), cudaChannelFormatKindFloat, 4); + auto controlPointTexturePtr = Cuda::CreateTextureObject(controlPointImageCuda, controlPointNumber, cudaChannelFormatKindFloat, 4); auto controlPointTexture = *controlPointTexturePtr; // Get the constant basis values @@ -188,8 +185,7 @@ void reg_spline_approxBendingEnergyGradient_gpu(nifti_image *controlPointImage, auto blockSize = CudaContext::GetBlockSize(); const size_t controlPointNumber = NiftiImage::calcVoxelNumber(controlPointImage, 3); const int3 controlPointImageDim = make_int3(controlPointImage->nx, controlPointImage->ny, controlPointImage->nz); - auto controlPointTexturePtr = Cuda::CreateTextureObject(controlPointImageCuda, cudaResourceTypeLinear, - controlPointNumber * sizeof(float4), cudaChannelFormatKindFloat, 4); + auto controlPointTexturePtr = Cuda::CreateTextureObject(controlPointImageCuda, controlPointNumber, cudaChannelFormatKindFloat, 4); auto controlPointTexture = *controlPointTexturePtr; // Get the constant basis values @@ -223,9 +219,8 @@ void reg_spline_approxBendingEnergyGradient_gpu(nifti_image *controlPointImage, } }); - auto secondDerivativesTexturePtr = Cuda::CreateTextureObject(secondDerivativesCuda, cudaResourceTypeLinear, - secondDerivativesCudaVec.size() * sizeof(typename SecondDerivative::TextureType), - cudaChannelFormatKindFloat, sizeof(typename SecondDerivative::TextureType) / sizeof(float)); + auto secondDerivativesTexturePtr = Cuda::CreateTextureObject(secondDerivativesCuda, secondDerivativesCudaVec.size(), cudaChannelFormatKindFloat, + sizeof(typename SecondDerivative::TextureType) / sizeof(float)); auto secondDerivativesTexture = *secondDerivativesTexturePtr; // Compute the gradient @@ -293,8 +288,7 @@ void reg_spline_ComputeApproxJacobianValues(const nifti_image *controlPointImage auto blockSize = CudaContext::GetBlockSize(); const size_t controlPointNumber = NiftiImage::calcVoxelNumber(controlPointImage, 3); const int3 controlPointImageDim = make_int3(controlPointImage->nx, controlPointImage->ny, controlPointImage->nz); - auto controlPointTexture = Cuda::CreateTextureObject(controlPointImageCuda, cudaResourceTypeLinear, - controlPointNumber * sizeof(float4), cudaChannelFormatKindFloat, 4); + auto controlPointTexture = Cuda::CreateTextureObject(controlPointImageCuda, controlPointNumber, cudaChannelFormatKindFloat, 4); // Need to reorient the Jacobian matrix using the header information - real to voxel conversion const mat33 reorientation = reg_mat44_to_mat33(controlPointImage->sform_code > 0 ? &controlPointImage->sto_xyz : &controlPointImage->qto_xyz); @@ -330,8 +324,7 @@ void reg_spline_ComputeJacobianValues(const nifti_image *controlPointImage, const int3 referenceImageDim = make_int3(referenceImage->nx, referenceImage->ny, referenceImage->nz); const int3 controlPointImageDim = make_int3(controlPointImage->nx, controlPointImage->ny, controlPointImage->nz); const float3 controlPointSpacing = make_float3(controlPointImage->dx, controlPointImage->dy, controlPointImage->dz); - auto controlPointTexture = Cuda::CreateTextureObject(controlPointImageCuda, cudaResourceTypeLinear, - controlPointNumber * sizeof(float4), cudaChannelFormatKindFloat, 4); + auto controlPointTexture = Cuda::CreateTextureObject(controlPointImageCuda, controlPointNumber, cudaChannelFormatKindFloat, 4); // Need to reorient the Jacobian matrix using the header information - real to voxel conversion const mat33 reorientation = reg_mat44_to_mat33(controlPointImage->sform_code > 0 ? &controlPointImage->sto_xyz : &controlPointImage->qto_xyz); @@ -434,10 +427,8 @@ void reg_spline_getJacobianPenaltyTermGradient_gpu(const nifti_image *referenceI const float3 weight = make_float3(referenceImage->dx * jacobianWeight / ((float)jacNumber * controlPointImage->dx), referenceImage->dy * jacobianWeight / ((float)jacNumber * controlPointImage->dy), referenceImage->dz * jacobianWeight / ((float)jacNumber * controlPointImage->dz)); - auto jacobianDeterminantTexture = Cuda::CreateTextureObject(jacobianDetCuda, cudaResourceTypeLinear, jacNumber * sizeof(float), - cudaChannelFormatKindFloat, 1); - auto jacobianMatricesTexture = Cuda::CreateTextureObject(jacobianMatricesCuda, cudaResourceTypeLinear, - (controlPointImage->nz > 1 ? 9 : 4) * jacNumber * sizeof(float), + auto jacobianDeterminantTexture = Cuda::CreateTextureObject(jacobianDetCuda, jacNumber, cudaChannelFormatKindFloat, 1); + auto jacobianMatricesTexture = Cuda::CreateTextureObject(jacobianMatricesCuda, (controlPointImage->nz > 1 ? 9 : 4) * jacNumber, cudaChannelFormatKindFloat, 1); if (approx) { if (controlPointImage->nz > 1) { @@ -498,22 +489,20 @@ double reg_spline_correctFolding_gpu(const nifti_image *referenceImage, // The Jacobian matrices and determinants are computed float *jacobianMatricesCuda, *jacobianDetCuda; - size_t jacobianDetSize, jacobianMatricesSize; - size_t jacNumber; double jacSum; + size_t jacobianDetSize, jacNumber; + double jacSum; if (approx) { jacNumber = NiftiImage::calcVoxelNumber(controlPointImage, 3); jacSum = (controlPointImage->nx - 2) * (controlPointImage->ny - 2) * (controlPointImage->nz - 2); jacobianDetSize = jacNumber * sizeof(float); - jacobianMatricesSize = 9 * jacobianDetSize; - NR_CUDA_SAFE_CALL(cudaMalloc(&jacobianMatricesCuda, jacobianMatricesSize)); + NR_CUDA_SAFE_CALL(cudaMalloc(&jacobianMatricesCuda, 9 * jacobianDetSize)); NR_CUDA_SAFE_CALL(cudaMalloc(&jacobianDetCuda, jacobianDetSize)); reg_spline_ComputeApproxJacobianValues(controlPointImage, controlPointImageCuda, jacobianMatricesCuda, jacobianDetCuda); } else { jacNumber = NiftiImage::calcVoxelNumber(referenceImage, 3); jacSum = static_cast(jacNumber); jacobianDetSize = jacNumber * sizeof(float); - jacobianMatricesSize = 9 * jacobianDetSize; - NR_CUDA_SAFE_CALL(cudaMalloc(&jacobianMatricesCuda, jacobianMatricesSize)); + NR_CUDA_SAFE_CALL(cudaMalloc(&jacobianMatricesCuda, 9 * jacobianDetSize)); NR_CUDA_SAFE_CALL(cudaMalloc(&jacobianDetCuda, jacobianDetSize)); reg_spline_ComputeJacobianValues(controlPointImage, referenceImage, controlPointImageCuda, jacobianMatricesCuda, jacobianDetCuda); } @@ -548,10 +537,8 @@ double reg_spline_correctFolding_gpu(const nifti_image *referenceImage, const size_t controlPointNumber = NiftiImage::calcVoxelNumber(controlPointImage, 3); const int3 controlPointImageDim = make_int3(controlPointImage->nx, controlPointImage->ny, controlPointImage->nz); const float3 controlPointSpacing = make_float3(controlPointImage->dx, controlPointImage->dy, controlPointImage->dz); - auto jacobianDeterminantTexture = Cuda::CreateTextureObject(jacobianDetCuda, cudaResourceTypeLinear, jacobianDetSize, - cudaChannelFormatKindFloat, 1); - auto jacobianMatricesTexture = Cuda::CreateTextureObject(jacobianMatricesCuda, cudaResourceTypeLinear, jacobianMatricesSize, - cudaChannelFormatKindFloat, 1); + auto jacobianDeterminantTexture = Cuda::CreateTextureObject(jacobianDetCuda, jacNumber, cudaChannelFormatKindFloat, 1); + auto jacobianMatricesTexture = Cuda::CreateTextureObject(jacobianMatricesCuda, 9 * jacNumber, cudaChannelFormatKindFloat, 1); if (approx) { const unsigned blocks = blockSize->reg_spline_approxCorrectFolding3D; const unsigned grids = (unsigned)Ceil(sqrtf((float)controlPointNumber / (float)blocks)); @@ -676,8 +663,7 @@ void reg_defField_compose_gpu(const nifti_image *deformationField, const int3 referenceImageDim{ deformationField->nx, deformationField->ny, deformationField->nz }; const mat44& affineMatrixB = deformationField->sform_code > 0 ? deformationField->sto_ijk : deformationField->qto_ijk; const mat44& affineMatrixC = deformationField->sform_code > 0 ? deformationField->sto_xyz : deformationField->qto_xyz; - auto deformationFieldTexture = Cuda::CreateTextureObject(deformationFieldCuda, cudaResourceTypeLinear, - voxelNumber * sizeof(float4), cudaChannelFormatKindFloat, 4); + auto deformationFieldTexture = Cuda::CreateTextureObject(deformationFieldCuda, voxelNumber, cudaChannelFormatKindFloat, 4); if (deformationField->nz > 1) { const unsigned blocks = blockSize->reg_defField_compose3D; @@ -835,8 +821,7 @@ void reg_defField_getJacobianMatrix_gpu(const nifti_image *deformationField, const int3 referenceImageDim = make_int3(deformationField->nx, deformationField->ny, deformationField->nz); const size_t voxelNumber = NiftiImage::calcVoxelNumber(deformationField, 3); const mat33 reorientation = reg_mat44_to_mat33(deformationField->sform_code > 0 ? &deformationField->sto_xyz : &deformationField->qto_xyz); - auto deformationFieldTexture = Cuda::CreateTextureObject(deformationFieldCuda, cudaResourceTypeLinear, - voxelNumber * sizeof(float4), cudaChannelFormatKindFloat, 4); + auto deformationFieldTexture = Cuda::CreateTextureObject(deformationFieldCuda, voxelNumber, cudaChannelFormatKindFloat, 4); const unsigned blocks = CudaContext::GetBlockSize()->reg_defField_getJacobianMatrix; const unsigned grids = (unsigned)Ceil(sqrtf((float)voxelNumber / (float)blocks)); @@ -864,8 +849,7 @@ double reg_spline_approxLinearEnergy_gpu(const nifti_image *controlPointGrid, set_first_order_basis_values(basis.x, basis.y); // Create the control point texture - auto controlPointTexturePtr = Cuda::CreateTextureObject(controlPointGridCuda, cudaResourceTypeLinear, - voxelNumber * sizeof(float4), cudaChannelFormatKindFloat, 4); + auto controlPointTexturePtr = Cuda::CreateTextureObject(controlPointGridCuda, voxelNumber, cudaChannelFormatKindFloat, 4); auto controlPointTexture = *controlPointTexturePtr; constexpr int matSize = is3d ? 3 : 2; @@ -912,10 +896,8 @@ void reg_spline_approxLinearEnergyGradient_gpu(const nifti_image *controlPointGr thrust::device_vector dispMatricesCuda(voxelNumber); // Create the textures - auto controlPointTexture = Cuda::CreateTextureObject(controlPointGridCuda, cudaResourceTypeLinear, - voxelNumber * sizeof(float4), cudaChannelFormatKindFloat, 4); - auto dispMatricesTexture = Cuda::CreateTextureObject(dispMatricesCuda.data().get(), cudaResourceTypeLinear, - voxelNumber * sizeof(mat33), cudaChannelFormatKindFloat, 1); + auto controlPointTexture = Cuda::CreateTextureObject(controlPointGridCuda, voxelNumber, cudaChannelFormatKindFloat, 4); + auto dispMatricesTexture = Cuda::CreateTextureObject(dispMatricesCuda.data().get(), voxelNumber, cudaChannelFormatKindFloat, 1); // Create the displacement matrices reg_spline_createDisplacementMatrices_kernel<<>>(dispMatricesCuda.data().get(), *controlPointTexture, diff --git a/reg-lib/cuda/_reg_measure_gpu.h b/reg-lib/cuda/_reg_measure_gpu.h index e2c4e836..8d753747 100755 --- a/reg-lib/cuda/_reg_measure_gpu.h +++ b/reg-lib/cuda/_reg_measure_gpu.h @@ -22,9 +22,9 @@ class reg_measure_gpu { virtual ~reg_measure_gpu() {} virtual void InitialiseMeasure(nifti_image *refImg, - cudaArray *refImgCuda, + float *refImgCuda, nifti_image *floImg, - cudaArray *floImgCuda, + float *floImgCuda, int *refMask, int *refMaskCuda, size_t activeVoxNum, @@ -75,8 +75,8 @@ class reg_measure_gpu { } protected: - cudaArray *referenceImageCuda; - cudaArray *floatingImageCuda; + float *referenceImageCuda; + float *floatingImageCuda; int *referenceMaskCuda; size_t activeVoxelNumber; float *warpedImageCuda; @@ -100,9 +100,9 @@ class reg_lncc_gpu: public reg_lncc, public reg_measure_gpu { virtual ~reg_lncc_gpu() {} virtual void InitialiseMeasure(nifti_image *refImg, - cudaArray *refImgCuda, + float *refImgCuda, nifti_image *floImg, - cudaArray *floImgCuda, + float *floImgCuda, int *refMask, int *refMaskCuda, size_t activeVoxNum, @@ -142,9 +142,9 @@ class reg_kld_gpu: public reg_kld, public reg_measure_gpu { virtual ~reg_kld_gpu() {} virtual void InitialiseMeasure(nifti_image *refImg, - cudaArray *refImgCuda, + float *refImgCuda, nifti_image *floImg, - cudaArray *floImgCuda, + float *floImgCuda, int *refMask, int *refMaskCuda, size_t activeVoxNum, @@ -184,9 +184,9 @@ class reg_dti_gpu: public reg_dti, public reg_measure_gpu { virtual ~reg_dti_gpu() {} virtual void InitialiseMeasure(nifti_image *refImg, - cudaArray *refImgCuda, + float *refImgCuda, nifti_image *floImg, - cudaArray *floImgCuda, + float *floImgCuda, int *refMask, int *refMaskCuda, size_t activeVoxNum, diff --git a/reg-lib/cuda/_reg_nmi_gpu.cu b/reg-lib/cuda/_reg_nmi_gpu.cu index 45a6616d..1758eda5 100755 --- a/reg-lib/cuda/_reg_nmi_gpu.cu +++ b/reg-lib/cuda/_reg_nmi_gpu.cu @@ -22,8 +22,8 @@ reg_nmi_gpu::~reg_nmi_gpu() { NR_FUNC_CALLED(); } /* *************************************************************** */ -void reg_nmi_gpu::InitialiseMeasure(nifti_image *refImg, cudaArray *refImgCuda, - nifti_image *floImg, cudaArray *floImgCuda, +void reg_nmi_gpu::InitialiseMeasure(nifti_image *refImg, float *refImgCuda, + nifti_image *floImg, float *floImgCuda, int *refMask, int *refMaskCuda, size_t activeVoxNum, nifti_image *warpedImg, float *warpedImgCuda, @@ -44,8 +44,8 @@ void reg_nmi_gpu::InitialiseMeasure(nifti_image *refImg, cudaArray *refImgCuda, if (this->referenceTimePoints > 1 || this->floatingImage->nt > 1) NR_FATAL_ERROR("Multiple time points are not yet supported"); // The reference and floating images have to be updated on the device - Cuda::TransferNiftiToDevice(this->referenceImageCuda, this->referenceImage); - Cuda::TransferNiftiToDevice(this->floatingImageCuda, this->floatingImage); + Cuda::TransferNiftiToDevice(this->referenceImageCuda, this->referenceImage); + Cuda::TransferNiftiToDevice(this->floatingImageCuda, this->floatingImage); // Create the joint histograms this->jointHistogramLogCudaVecs.resize(this->referenceTimePoints); this->jointHistogramProCudaVecs.resize(this->referenceTimePoints); @@ -67,7 +67,7 @@ void reg_nmi_gpu::InitialiseMeasure(nifti_image *refImg, cudaArray *refImgCuda, } /* *************************************************************** */ void reg_getNmiValue_gpu(const nifti_image *referenceImage, - const cudaArray *referenceImageCuda, + const float *referenceImageCuda, const float *warpedImageCuda, const double *timePointWeights, const int referenceTimePoints, @@ -82,10 +82,7 @@ void reg_getNmiValue_gpu(const nifti_image *referenceImage, const bool approximation) { const size_t voxelNumber = NiftiImage::calcVoxelNumber(referenceImage, 3); const int3 referenceImageDims = make_int3(referenceImage->nx, referenceImage->ny, referenceImage->nz); - auto referenceImageTexturePtr = Cuda::CreateTextureObject(referenceImageCuda, cudaResourceTypeArray); - auto maskTexturePtr = Cuda::CreateTextureObject(maskCuda, cudaResourceTypeLinear, activeVoxelNumber * sizeof(int), - cudaChannelFormatKindSigned, 1); - auto referenceImageTexture = *referenceImageTexturePtr; + auto maskTexturePtr = Cuda::CreateTextureObject(maskCuda, activeVoxelNumber, cudaChannelFormatKindSigned, 1); auto maskTexture = *maskTexturePtr; // Iterate over all active time points @@ -100,21 +97,21 @@ void reg_getNmiValue_gpu(const nifti_image *referenceImage, thrust::fill(thrust::device, jointHistogramProCudaVecs[t].begin(), jointHistogramProCudaVecs[t].end(), 0.0); double *jointHistogramLogCuda = jointHistogramLogCudaVecs[t].data().get(); double *jointHistogramProCuda = jointHistogramProCudaVecs[t].data().get(); - // Define warped image texture - auto warpedImageTexturePtr = Cuda::CreateTextureObject(warpedImageCuda + t * voxelNumber, cudaResourceTypeLinear, - voxelNumber * sizeof(float), cudaChannelFormatKindFloat, 1); + // Define the current textures + auto referenceImageTexturePtr = Cuda::CreateTextureObject(referenceImageCuda + t * voxelNumber, voxelNumber, cudaChannelFormatKindFloat, 1); + auto warpedImageTexturePtr = Cuda::CreateTextureObject(warpedImageCuda + t * voxelNumber, voxelNumber, cudaChannelFormatKindFloat, 1); + auto referenceImageTexture = *referenceImageTexturePtr; auto warpedImageTexture = *warpedImageTexturePtr; // Fill the joint histograms if (approximation == false) { // No approximation is used for the Parzen windowing thrust::for_each_n(thrust::device, thrust::make_counting_iterator(0), activeVoxelNumber, [=]__device__(const unsigned index) { - const int& voxel = tex1Dfetch(maskTexture, index); - const float& warValue = tex1Dfetch(warpedImageTexture, voxel); - if (warValue != warValue) return; - auto&& [x, y, z] = reg_indexToDims_cuda(voxel, referenceImageDims); - const float& refValue = tex3D(referenceImageTexture, x, y, z); + const int voxel = tex1Dfetch(maskTexture, index); + const float refValue = tex1Dfetch(referenceImageTexture, voxel); if (refValue != refValue) return; - for (int r = int(refValue - 1); r < int(refValue + 3); r++) { + const float warValue = tex1Dfetch(warpedImageTexture, voxel); + if (warValue != warValue) return; + for (int r = int(refValue) - 1; r < int(refValue) + 3; r++) { if (0 <= r && r < curRefBinNumber) { const double refBasis = GetBasisSplineValue(refValue - r); for (int w = int(warValue) - 1; w < int(warValue) + 3; w++) { @@ -130,12 +127,11 @@ void reg_getNmiValue_gpu(const nifti_image *referenceImage, // An approximation is used for the Parzen windowing. First intensities are binarised then // the histogram is convolved with a spine kernel function. thrust::for_each_n(thrust::device, thrust::make_counting_iterator(0), activeVoxelNumber, [=]__device__(const unsigned index) { - const int& voxel = tex1Dfetch(maskTexture, index); - const float& warValue = tex1Dfetch(warpedImageTexture, voxel); - if (warValue != warValue) return; - auto&& [x, y, z] = reg_indexToDims_cuda(voxel, referenceImageDims); - const float& refValue = tex3D(referenceImageTexture, x, y, z); + const int voxel = tex1Dfetch(maskTexture, index); + const float refValue = tex1Dfetch(referenceImageTexture, voxel); if (refValue != refValue) return; + const float warValue = tex1Dfetch(warpedImageTexture, voxel); + if (warValue != warValue) return; if (0 <= refValue && refValue < curRefBinNumber && 0 <= warValue && warValue < curFloBinNumber) atomicAdd(&jointHistogramProCuda[int(refValue) + int(warValue) * curRefBinNumber], 1.0); }); @@ -225,7 +221,7 @@ void reg_getNmiValue_gpu(const nifti_image *referenceImage, } /* *************************************************************** */ static double GetSimilarityMeasureValue(const nifti_image *referenceImage, - const cudaArray *referenceImageCuda, + const float *referenceImageCuda, const nifti_image *warpedImage, const float *warpedImageCuda, const double *timePointWeights, @@ -304,7 +300,7 @@ template<> struct Derivative { using Type = double2; }; /// Called when we only have one target and one source image template void reg_getVoxelBasedNmiGradient_gpu(const nifti_image *referenceImage, - const cudaArray *referenceImageCuda, + const float *referenceImageCuda, const float *warpedImageCuda, const float4 *warpedGradientCuda, const double *jointHistogramLogCuda, @@ -324,14 +320,10 @@ void reg_getVoxelBasedNmiGradient_gpu(const nifti_image *referenceImage, const int referenceOffset = refBinNumber * floBinNumber; const int floatingOffset = referenceOffset + refBinNumber; - auto referenceImageTexturePtr = Cuda::CreateTextureObject(referenceImageCuda, cudaResourceTypeArray, 0, - cudaChannelFormatKindNone, 1, cudaFilterModePoint, true); - auto warpedImageTexturePtr = Cuda::CreateTextureObject(warpedImageCuda + currentTimePoint * voxelNumber, cudaResourceTypeLinear, - voxelNumber * sizeof(float), cudaChannelFormatKindFloat, 1); - auto warpedGradientTexturePtr = Cuda::CreateTextureObject(warpedGradientCuda, cudaResourceTypeLinear, voxelNumber * sizeof(float4), - cudaChannelFormatKindFloat, 4); - auto maskTexturePtr = Cuda::CreateTextureObject(maskCuda, cudaResourceTypeLinear, activeVoxelNumber * sizeof(int), - cudaChannelFormatKindSigned, 1); + auto referenceImageTexturePtr = Cuda::CreateTextureObject(referenceImageCuda + currentTimePoint * voxelNumber, voxelNumber, cudaChannelFormatKindFloat, 1); + auto warpedImageTexturePtr = Cuda::CreateTextureObject(warpedImageCuda + currentTimePoint * voxelNumber, voxelNumber, cudaChannelFormatKindFloat, 1); + auto warpedGradientTexturePtr = Cuda::CreateTextureObject(warpedGradientCuda, voxelNumber, cudaChannelFormatKindFloat, 4); + auto maskTexturePtr = Cuda::CreateTextureObject(maskCuda, activeVoxelNumber, cudaChannelFormatKindSigned, 1); auto referenceImageTexture = *referenceImageTexturePtr; auto warpedImageTexture = *warpedImageTexturePtr; auto warpedGradientTexture = *warpedGradientTexturePtr; @@ -339,45 +331,40 @@ void reg_getVoxelBasedNmiGradient_gpu(const nifti_image *referenceImage, thrust::for_each_n(thrust::device, thrust::make_counting_iterator(0), activeVoxelNumber, [=]__device__(const unsigned index) { const int targetIndex = tex1Dfetch(maskTexture, index); - const float warpedImageValue = tex1Dfetch(warpedImageTexture, targetIndex); - if (warpedImageValue != warpedImageValue) return; - const auto&& [x, y, z] = reg_indexToDims_cuda(targetIndex, imageSize); - const float referenceImageValue = tex3D(referenceImageTexture, - (float(x) + 0.5f) / float(imageSize.x), - (float(y) + 0.5f) / float(imageSize.y), - is3d ? (float(z) + 0.5f) / float(imageSize.z) : 0.5f); - if (referenceImageValue != referenceImageValue) return; - const float4& warpedGradValue = tex1Dfetch(warpedGradientTexture, index); - float4 gradValue = voxelBasedGradientCuda[targetIndex]; + const float refValue = tex1Dfetch(referenceImageTexture, targetIndex); + if (refValue != refValue) return; + const float warValue = tex1Dfetch(warpedImageTexture, targetIndex); + if (warValue != warValue) return; + const float4 warGradValue = tex1Dfetch(warpedGradientTexture, index); // No computation is performed if any of the point is part of the background // The two is added because the image is resample between 2 and bin+2 // if 64 bins are used the histogram will have 68 bins et the image will be between 2 and 65 typename Derivative::Type jointDeriv{}, refDeriv{}, warDeriv{}; - for (int r = (int)referenceImageValue - 1; r < (int)referenceImageValue + 3; ++r) { + for (int r = int(refValue) - 1; r < int(refValue) + 3; r++) { if (-1 < r && r < refBinNumber) { - for (int w = (int)warpedImageValue - 1; w < (int)warpedImageValue + 3; ++w) { + for (int w = int(warValue) - 1; w < int(warValue) + 3; w++) { if (-1 < w && w < floBinNumber) { - const double commonValue = (GetBasisSplineValue(referenceImageValue - r) * - GetBasisSplineDerivativeValue(warpedImageValue - w)); + const double commonValue = (GetBasisSplineValue(refValue - r) * + GetBasisSplineDerivativeValue(warValue - w)); const double jointLog = jointHistogramLogCuda[r + w * refBinNumber]; const double refLog = jointHistogramLogCuda[r + referenceOffset]; const double warLog = jointHistogramLogCuda[w + floatingOffset]; - if (warpedGradValue.x == warpedGradValue.x) { - const double commonMultGrad = commonValue * warpedGradValue.x; + if (warGradValue.x == warGradValue.x) { + const double commonMultGrad = commonValue * warGradValue.x; jointDeriv.x += commonMultGrad * jointLog; refDeriv.x += commonMultGrad * refLog; warDeriv.x += commonMultGrad * warLog; } - if (warpedGradValue.y == warpedGradValue.y) { - const double commonMultGrad = commonValue * warpedGradValue.y; + if (warGradValue.y == warGradValue.y) { + const double commonMultGrad = commonValue * warGradValue.y; jointDeriv.y += commonMultGrad * jointLog; refDeriv.y += commonMultGrad * refLog; warDeriv.y += commonMultGrad * warLog; } if constexpr (is3d) { - if (warpedGradValue.z == warpedGradValue.z) { - const double commonMultGrad = commonValue * warpedGradValue.z; + if (warGradValue.z == warGradValue.z) { + const double commonMultGrad = commonValue * warGradValue.z; jointDeriv.z += commonMultGrad * jointLog; refDeriv.z += commonMultGrad * refLog; warDeriv.z += commonMultGrad * warLog; @@ -389,6 +376,7 @@ void reg_getVoxelBasedNmiGradient_gpu(const nifti_image *referenceImage, } // (Marc) I removed the normalisation by the voxel number as each gradient has to be normalised in the same way + float4 gradValue = voxelBasedGradientCuda[targetIndex]; gradValue.x += static_cast(timePointWeight * (refDeriv.x + warDeriv.x - nmi * jointDeriv.x) / normalisedJE); gradValue.y += static_cast(timePointWeight * (refDeriv.y + warDeriv.y - nmi * jointDeriv.y) / normalisedJE); if constexpr (is3d) diff --git a/reg-lib/cuda/_reg_nmi_gpu.h b/reg-lib/cuda/_reg_nmi_gpu.h index c3f33d4c..3af164a9 100755 --- a/reg-lib/cuda/_reg_nmi_gpu.h +++ b/reg-lib/cuda/_reg_nmi_gpu.h @@ -26,9 +26,9 @@ class reg_nmi_gpu: public reg_nmi, public reg_measure_gpu { /// @brief Initialise the reg_nmi_gpu object virtual void InitialiseMeasure(nifti_image *refImg, - cudaArray *refImgCuda, + float *refImgCuda, nifti_image *floImg, - cudaArray *floImgCuda, + float *floImgCuda, int *refMask, int *refMaskCuda, size_t activeVoxNum, @@ -68,9 +68,9 @@ class reg_nmi_gpu: public reg_nmi, public reg_measure_gpu { class reg_multichannel_nmi_gpu: public reg_multichannel_nmi, public reg_measure_gpu { public: void InitialiseMeasure(nifti_image *refImg, - cudaArray *refImgCuda, + float *refImgCuda, nifti_image *floImg, - cudaArray *floImgCuda, + float *floImgCuda, int *refMask, int *refMaskCuda, size_t activeVoxNum, diff --git a/reg-lib/cuda/_reg_optimiser_gpu.cu b/reg-lib/cuda/_reg_optimiser_gpu.cu index 474ff131..28b187b6 100755 --- a/reg-lib/cuda/_reg_optimiser_gpu.cu +++ b/reg-lib/cuda/_reg_optimiser_gpu.cu @@ -172,8 +172,7 @@ void reg_initialiseConjugateGradient_gpu(float4 *gradientImageCuda, float4 *conjugateGCuda, float4 *conjugateHCuda, const size_t nVoxels) { - auto gradientImageTexture = Cuda::CreateTextureObject(gradientImageCuda, cudaResourceTypeLinear, - nVoxels * sizeof(float4), cudaChannelFormatKindFloat, 4); + auto gradientImageTexture = Cuda::CreateTextureObject(gradientImageCuda, nVoxels, cudaChannelFormatKindFloat, 4); const unsigned blocks = CudaContext::GetBlockSize()->reg_initialiseConjugateGradient; const unsigned grids = (unsigned)Ceil(sqrtf((float)nVoxels / (float)blocks)); @@ -200,20 +199,14 @@ void reg_getConjugateGradient_gpu(float4 *gradientImageCuda, float4 *conjugateGBwCuda, float4 *conjugateHBwCuda, const size_t nVoxelsBw) { - auto gradientImageTexture = Cuda::CreateTextureObject(gradientImageCuda, cudaResourceTypeLinear, - nVoxels * sizeof(float4), cudaChannelFormatKindFloat, 4); - auto conjugateGTexture = Cuda::CreateTextureObject(conjugateGCuda, cudaResourceTypeLinear, - nVoxels * sizeof(float4), cudaChannelFormatKindFloat, 4); - auto conjugateHTexture = Cuda::CreateTextureObject(conjugateHCuda, cudaResourceTypeLinear, - nVoxels * sizeof(float4), cudaChannelFormatKindFloat, 4); - Cuda::UniqueTextureObjectPtr gradientImageBwTexture(nullptr, nullptr), conjugateGBwTexture(nullptr, nullptr), conjugateHBwTexture(nullptr, nullptr); + auto gradientImageTexture = Cuda::CreateTextureObject(gradientImageCuda, nVoxels, cudaChannelFormatKindFloat, 4); + auto conjugateGTexture = Cuda::CreateTextureObject(conjugateGCuda, nVoxels, cudaChannelFormatKindFloat, 4); + auto conjugateHTexture = Cuda::CreateTextureObject(conjugateHCuda, nVoxels, cudaChannelFormatKindFloat, 4); + Cuda::UniqueTextureObjectPtr gradientImageBwTexture, conjugateGBwTexture, conjugateHBwTexture; if (isSymmetric) { - gradientImageBwTexture = std::move(Cuda::CreateTextureObject(gradientImageBwCuda, cudaResourceTypeLinear, - nVoxelsBw * sizeof(float4), cudaChannelFormatKindFloat, 4)); - conjugateGBwTexture = std::move(Cuda::CreateTextureObject(conjugateGBwCuda, cudaResourceTypeLinear, - nVoxelsBw * sizeof(float4), cudaChannelFormatKindFloat, 4)); - conjugateHBwTexture = std::move(Cuda::CreateTextureObject(conjugateHBwCuda, cudaResourceTypeLinear, - nVoxelsBw * sizeof(float4), cudaChannelFormatKindFloat, 4)); + gradientImageBwTexture = Cuda::CreateTextureObject(gradientImageBwCuda, nVoxelsBw, cudaChannelFormatKindFloat, 4); + conjugateGBwTexture = Cuda::CreateTextureObject(conjugateGBwCuda, nVoxelsBw, cudaChannelFormatKindFloat, 4); + conjugateHBwTexture = Cuda::CreateTextureObject(conjugateHBwCuda, nVoxelsBw, cudaChannelFormatKindFloat, 4); } // gam = sum((grad+g)*grad)/sum(HxG); @@ -267,10 +260,8 @@ void reg_updateControlPointPosition_gpu(const size_t nVoxels, const bool optimiseX, const bool optimiseY, const bool optimiseZ) { - auto bestControlPointTexture = Cuda::CreateTextureObject(bestControlPointCuda, cudaResourceTypeLinear, - nVoxels * sizeof(float4), cudaChannelFormatKindFloat, 4); - auto gradientImageTexture = Cuda::CreateTextureObject(gradientImageCuda, cudaResourceTypeLinear, - nVoxels * sizeof(float4), cudaChannelFormatKindFloat, 4); + auto bestControlPointTexture = Cuda::CreateTextureObject(bestControlPointCuda, nVoxels, cudaChannelFormatKindFloat, 4); + auto gradientImageTexture = Cuda::CreateTextureObject(gradientImageCuda, nVoxels, cudaChannelFormatKindFloat, 4); const unsigned blocks = (unsigned)CudaContext::GetBlockSize()->reg_updateControlPointPosition; const unsigned grids = (unsigned)Ceil(sqrtf((float)nVoxels / (float)blocks)); diff --git a/reg-lib/cuda/_reg_resampling_gpu.cu b/reg-lib/cuda/_reg_resampling_gpu.cu index 6eb684ff..fe3eb39b 100755 --- a/reg-lib/cuda/_reg_resampling_gpu.cu +++ b/reg-lib/cuda/_reg_resampling_gpu.cu @@ -16,7 +16,7 @@ /* *************************************************************** */ void reg_resampleImage_gpu(const nifti_image *floatingImage, float *warpedImageCuda, - const cudaArray *floatingImageCuda, + const float *floatingImageCuda, const float4 *deformationFieldCuda, const int *maskCuda, const size_t activeVoxelNumber, @@ -26,16 +26,15 @@ void reg_resampleImage_gpu(const nifti_image *floatingImage, NR_FATAL_ERROR("Only linear interpolation is supported on the GPU"); auto blockSize = CudaContext::GetBlockSize(); + const size_t voxelNumber = NiftiImage::calcVoxelNumber(floatingImage, 3); const int3 floatingDim = make_int3(floatingImage->nx, floatingImage->ny, floatingImage->nz); // Create the texture object for the floating image - auto floatingTexture = Cuda::CreateTextureObject(floatingImageCuda, cudaResourceTypeArray); + auto floatingTexture = Cuda::CreateTextureObject(floatingImageCuda, voxelNumber, cudaChannelFormatKindFloat, 1); // Create the texture object for the deformation field - auto deformationFieldTexture = Cuda::CreateTextureObject(deformationFieldCuda, cudaResourceTypeLinear, - activeVoxelNumber * sizeof(float4), cudaChannelFormatKindFloat, 4); + auto deformationFieldTexture = Cuda::CreateTextureObject(deformationFieldCuda, activeVoxelNumber, cudaChannelFormatKindFloat, 4); // Create the texture object for the mask - auto maskTexture = Cuda::CreateTextureObject(maskCuda, cudaResourceTypeLinear, activeVoxelNumber * sizeof(int), - cudaChannelFormatKindSigned, 1); + auto maskTexture = Cuda::CreateTextureObject(maskCuda, activeVoxelNumber, cudaChannelFormatKindSigned, 1); // Bind the real to voxel matrix to the texture const mat44 floatingMatrix = floatingImage->sform_code > 0 ? floatingImage->sto_ijk : floatingImage->qto_ijk; @@ -60,7 +59,7 @@ void reg_resampleImage_gpu(const nifti_image *floatingImage, } /* *************************************************************** */ void reg_getImageGradient_gpu(const nifti_image *floatingImage, - const cudaArray *floatingImageCuda, + const float *floatingImageCuda, const float4 *deformationFieldCuda, float4 *warpedGradientCuda, const size_t activeVoxelNumber, @@ -70,14 +69,14 @@ void reg_getImageGradient_gpu(const nifti_image *floatingImage, NR_FATAL_ERROR("Only linear interpolation is supported on the GPU"); auto blockSize = CudaContext::GetBlockSize(); + const size_t voxelNumber = NiftiImage::calcVoxelNumber(floatingImage, 3); const int3 floatingDim = make_int3(floatingImage->nx, floatingImage->ny, floatingImage->nz); if (paddingValue != paddingValue) paddingValue = 0; // Create the texture object for the floating image - auto floatingTexture = Cuda::CreateTextureObject(floatingImageCuda, cudaResourceTypeArray); + auto floatingTexture = Cuda::CreateTextureObject(floatingImageCuda, voxelNumber, cudaChannelFormatKindFloat, 1); // Create the texture object for the deformation field - auto deformationFieldTexture = Cuda::CreateTextureObject(deformationFieldCuda, cudaResourceTypeLinear, - activeVoxelNumber * sizeof(float4), cudaChannelFormatKindFloat, 4); + auto deformationFieldTexture = Cuda::CreateTextureObject(deformationFieldCuda, activeVoxelNumber, cudaChannelFormatKindFloat, 4); // Bind the real to voxel matrix to the texture const mat44 floatingMatrix = floatingImage->sform_code > 0 ? floatingImage->sto_ijk : floatingImage->qto_ijk; diff --git a/reg-lib/cuda/_reg_resampling_gpu.h b/reg-lib/cuda/_reg_resampling_gpu.h index 6afd287a..5fc18144 100755 --- a/reg-lib/cuda/_reg_resampling_gpu.h +++ b/reg-lib/cuda/_reg_resampling_gpu.h @@ -17,7 +17,7 @@ /* *************************************************************** */ void reg_resampleImage_gpu(const nifti_image *floatingImage, float *warpedImageCuda, - const cudaArray *floatingImageCuda, + const float *floatingImageCuda, const float4 *deformationFieldCuda, const int *maskCuda, const size_t activeVoxelNumber, @@ -25,7 +25,7 @@ void reg_resampleImage_gpu(const nifti_image *floatingImage, const float paddingValue); /* *************************************************************** */ void reg_getImageGradient_gpu(const nifti_image *floatingImage, - const cudaArray *floatingImageCuda, + const float *floatingImageCuda, const float4 *deformationFieldCuda, float4 *warpedGradientCuda, const size_t activeVoxelNumber, diff --git a/reg-lib/cuda/_reg_resampling_kernels.cu b/reg-lib/cuda/_reg_resampling_kernels.cu index 0782a984..c2711fdf 100755 --- a/reg-lib/cuda/_reg_resampling_kernels.cu +++ b/reg-lib/cuda/_reg_resampling_kernels.cu @@ -50,13 +50,15 @@ __global__ void reg_resampleImage2D_kernel(float *resultArray, InterpLinearKernel(relative.y, yBasis); double intensity = 0; - for (char b = 0; b < 2; b++) { + int indexY = previous.y * floatingDim.x + previous.x; + for (char b = 0; b < 2; b++, indexY += floatingDim.x) { const int y = previous.y + b; + int index = indexY; double xTempNewValue = 0; - for (char a = 0; a < 2; a++) { + for (char a = 0; a < 2; a++, index++) { const int x = previous.x + a; if (-1 < x && x < floatingDim.x && -1 < y && y < floatingDim.y) { - xTempNewValue += tex3D(floatingTexture, x, y, 0) * xBasis[a]; + xTempNewValue += tex1Dfetch(floatingTexture, index) * xBasis[a]; } else { // Padding value xTempNewValue += paddingValue * xBasis[a]; @@ -78,13 +80,12 @@ __global__ void reg_resampleImage3D_kernel(float *resultArray, const float paddingValue) { const unsigned tid = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x; if (tid >= activeVoxelNumber) return; - const int tid2 = tex1Dfetch(maskTexture, tid); - // Get the real world deformation in the floating space - float4 realDeformation = tex1Dfetch(deformationFieldTexture, tid); + const int tid2 = tex1Dfetch(maskTexture, tid); + const float4 realDeformation = tex1Dfetch(deformationFieldTexture, tid); // Get the voxel-based deformation in the floating space - float3 voxelDeformation; + double3 voxelDeformation; voxelDeformation.x = (double(floatingMatrix.m[0][0]) * double(realDeformation.x) + double(floatingMatrix.m[0][1]) * double(realDeformation.y) + double(floatingMatrix.m[0][2]) * double(realDeformation.z) + @@ -109,14 +110,16 @@ __global__ void reg_resampleImage3D_kernel(float *resultArray, double intensity = 0; for (char c = 0; c < 2; c++) { const int z = previous.z + c; + int indexYZ = (z * floatingDim.y + previous.y) * floatingDim.x; double yTempNewValue = 0; - for (char b = 0; b < 2; b++) { + for (char b = 0; b < 2; b++, indexYZ += floatingDim.x) { const int y = previous.y + b; + int index = indexYZ + previous.x; double xTempNewValue = 0; - for (char a = 0; a < 2; a++) { + for (char a = 0; a < 2; a++, index++) { const int x = previous.x + a; if (-1 < x && x < floatingDim.x && -1 < y && y < floatingDim.y && -1 < z && z < floatingDim.z) { - xTempNewValue += tex3D(floatingTexture, x, y, z) * xBasis[a]; + xTempNewValue += tex1Dfetch(floatingTexture, index) * xBasis[a]; } else { // Padding value xTempNewValue += paddingValue * xBasis[a]; @@ -160,15 +163,17 @@ __global__ void reg_getImageGradient2D_kernel(float4 *gradientArray, constexpr float deriv[] = { -1.0f, 1.0f }; float4 gradientValue{}; - for (char b = 0; b < 2; b++) { - float2 tempValueX{}; + int indexY = previous.y * floatingDim.x + previous.x; + for (char b = 0; b < 2; b++, indexY += floatingDim.x) { const int y = previous.y + b; - for (char a = 0; a < 2; a++) { + int index = indexY; + float2 tempValueX{}; + for (char a = 0; a < 2; a++, index++) { const int x = previous.x + a; float intensity = paddingValue; if (-1 < x && x < floatingDim.x && -1 < y && y < floatingDim.y) - intensity = tex3D(floatingTexture, x, y, 0); + intensity = tex1Dfetch(floatingTexture, index); tempValueX.x += intensity * deriv[a]; tempValueX.y += intensity * xBasis[a]; @@ -219,16 +224,18 @@ __global__ void reg_getImageGradient3D_kernel(float4 *gradientArray, float4 gradientValue{}; for (char c = 0; c < 2; c++) { const int z = previous.z + c; + int indexYZ = (z * floatingDim.y + previous.y) * floatingDim.x; float3 tempValueY{}; - for (char b = 0; b < 2; b++) { - float2 tempValueX{}; + for (char b = 0; b < 2; b++, indexYZ += floatingDim.x) { const int y = previous.y + b; - for (char a = 0; a < 2; a++) { + int index = indexYZ + previous.x; + float2 tempValueX{}; + for (char a = 0; a < 2; a++, index++) { const int x = previous.x + a; float intensity = paddingValue; if (-1 < x && x < floatingDim.x && -1 < y && y < floatingDim.y && -1 < z && z < floatingDim.z) - intensity = tex3D(floatingTexture, x, y, z); + intensity = tex1Dfetch(floatingTexture, index); tempValueX.x += intensity * deriv[a]; tempValueX.y += intensity * xBasis[a]; diff --git a/reg-lib/cuda/_reg_ssd_gpu.cu b/reg-lib/cuda/_reg_ssd_gpu.cu index bf414396..7b7d94d4 100755 --- a/reg-lib/cuda/_reg_ssd_gpu.cu +++ b/reg-lib/cuda/_reg_ssd_gpu.cu @@ -22,8 +22,8 @@ reg_ssd_gpu::~reg_ssd_gpu() { NR_FUNC_CALLED(); } /* *************************************************************** */ -void reg_ssd_gpu::InitialiseMeasure(nifti_image *refImg, cudaArray *refImgCuda, - nifti_image *floImg, cudaArray *floImgCuda, +void reg_ssd_gpu::InitialiseMeasure(nifti_image *refImg, float *refImgCuda, + nifti_image *floImg, float *floImgCuda, int *refMask, int *refMaskCuda, size_t activeVoxNum, nifti_image *warpedImg, float *warpedImgCuda, @@ -46,32 +46,29 @@ void reg_ssd_gpu::InitialiseMeasure(nifti_image *refImg, cudaArray *refImgCuda, // Check if the reference and floating images need to be updated for (int i = 0; i < this->referenceTimePoints; ++i) if (this->timePointWeights[i] > 0 && normaliseTimePoint[i]) { - Cuda::TransferNiftiToDevice(this->referenceImageCuda, this->referenceImage); - Cuda::TransferNiftiToDevice(this->floatingImageCuda, this->floatingImage); + Cuda::TransferNiftiToDevice(this->referenceImageCuda, this->referenceImage); + Cuda::TransferNiftiToDevice(this->floatingImageCuda, this->floatingImage); break; } NR_FUNC_CALLED(); } /* *************************************************************** */ double reg_getSsdValue_gpu(const nifti_image *referenceImage, - const cudaArray *referenceImageCuda, + const float *referenceImageCuda, const float *warpedCuda, const float *localWeightSimCuda, const int *maskCuda, - const size_t& activeVoxelNumber) { + const size_t activeVoxelNumber) { // Copy the constant memory variables const int3 referenceImageDim = make_int3(referenceImage->nx, referenceImage->ny, referenceImage->nz); const size_t voxelNumber = NiftiImage::calcVoxelNumber(referenceImage, 3); - auto referenceTexture = Cuda::CreateTextureObject(referenceImageCuda, cudaResourceTypeArray); - auto warpedTexture = Cuda::CreateTextureObject(warpedCuda, cudaResourceTypeLinear, voxelNumber * sizeof(float), - cudaChannelFormatKindFloat, 1); - auto maskTexture = Cuda::CreateTextureObject(maskCuda, cudaResourceTypeLinear, activeVoxelNumber * sizeof(int), - cudaChannelFormatKindSigned, 1); - Cuda::UniqueTextureObjectPtr localWeightSimTexture(nullptr, nullptr); + auto referenceTexture = Cuda::CreateTextureObject(referenceImageCuda, voxelNumber, cudaChannelFormatKindFloat, 1); + auto warpedTexture = Cuda::CreateTextureObject(warpedCuda, voxelNumber, cudaChannelFormatKindFloat, 1); + auto maskTexture = Cuda::CreateTextureObject(maskCuda, activeVoxelNumber, cudaChannelFormatKindSigned, 1); + Cuda::UniqueTextureObjectPtr localWeightSimTexture; if (localWeightSimCuda) - localWeightSimTexture = std::move(Cuda::CreateTextureObject(localWeightSimCuda, cudaResourceTypeLinear, - voxelNumber * sizeof(float), cudaChannelFormatKindFloat, 1)); + localWeightSimTexture = Cuda::CreateTextureObject(localWeightSimCuda, voxelNumber, cudaChannelFormatKindFloat, 1); // Create an array on the device to store the absolute difference values thrust::device_vector ssdSum(1), ssdCount(1); @@ -111,7 +108,7 @@ double reg_ssd_gpu::GetSimilarityMeasureValueBw() { } /* *************************************************************** */ void reg_getVoxelBasedSsdGradient_gpu(const nifti_image *referenceImage, - const cudaArray *referenceImageCuda, + const float *referenceImageCuda, const float *warpedCuda, const float4 *spatialGradCuda, const float *localWeightSimCuda, @@ -123,29 +120,22 @@ void reg_getVoxelBasedSsdGradient_gpu(const nifti_image *referenceImage, const int3 referenceImageDim = make_int3(referenceImage->nx, referenceImage->ny, referenceImage->nz); const size_t voxelNumber = NiftiImage::calcVoxelNumber(referenceImage, 3); - auto referenceTexture = Cuda::CreateTextureObject(referenceImageCuda, cudaResourceTypeArray); - auto warpedTexture = Cuda::CreateTextureObject(warpedCuda, cudaResourceTypeLinear, voxelNumber * sizeof(float), - cudaChannelFormatKindFloat, 1); - auto maskTexture = Cuda::CreateTextureObject(maskCuda, cudaResourceTypeLinear, activeVoxelNumber * sizeof(int), - cudaChannelFormatKindSigned, 1); - auto spatialGradTexture = Cuda::CreateTextureObject(spatialGradCuda, cudaResourceTypeLinear, voxelNumber * sizeof(float4), - cudaChannelFormatKindFloat, 4); - Cuda::UniqueTextureObjectPtr localWeightSimTexture(nullptr, nullptr); + auto referenceTexturePtr = Cuda::CreateTextureObject(referenceImageCuda, voxelNumber, cudaChannelFormatKindFloat, 1); + auto warpedTexturePtr = Cuda::CreateTextureObject(warpedCuda, voxelNumber, cudaChannelFormatKindFloat, 1); + auto maskTexturePtr = Cuda::CreateTextureObject(maskCuda, activeVoxelNumber, cudaChannelFormatKindSigned, 1); + auto spatialGradTexturePtr = Cuda::CreateTextureObject(spatialGradCuda, voxelNumber, cudaChannelFormatKindFloat, 4); + Cuda::UniqueTextureObjectPtr localWeightSimTexturePtr; if (localWeightSimCuda) - localWeightSimTexture = std::move(Cuda::CreateTextureObject(localWeightSimCuda, cudaResourceTypeLinear, - voxelNumber * sizeof(float), cudaChannelFormatKindFloat, 1)); + localWeightSimTexturePtr = Cuda::CreateTextureObject(localWeightSimCuda, voxelNumber, cudaChannelFormatKindFloat, 1); // Find number of valid voxels and correct weight - const cudaTextureObject_t referenceTextureObject = *referenceTexture; - const cudaTextureObject_t warpedTextureObject = *warpedTexture; - const size_t validVoxelNumber = thrust::count_if(thrust::device, maskCuda, maskCuda + activeVoxelNumber, [=]__device__(const int& index) { - const float warValue = tex1Dfetch(warpedTextureObject, index); - if (warValue != warValue) return false; - - const auto&& [x, y, z] = reg_indexToDims_cuda(index, referenceImageDim); - const float refValue = tex3D(referenceTextureObject, x, y, z); + const auto referenceTexture = *referenceTexturePtr; + const auto warpedTexture = *warpedTexturePtr; + const size_t validVoxelNumber = thrust::count_if(thrust::device, maskCuda, maskCuda + activeVoxelNumber, [=]__device__(const int index) { + const float refValue = tex1Dfetch(referenceTexture, index); if (refValue != refValue) return false; - + const float warValue = tex1Dfetch(warpedTexture, index); + if (warValue != warValue) return false; return true; }); const float adjustedWeight = timepointWeight / static_cast(validVoxelNumber); @@ -154,8 +144,8 @@ void reg_getVoxelBasedSsdGradient_gpu(const nifti_image *referenceImage, const unsigned grids = (unsigned)Ceil(sqrtf((float)activeVoxelNumber / (float)blocks)); const dim3 gridDims(grids, grids, 1); const dim3 blockDims(blocks, 1, 1); - Cuda::GetSsdGradientKernel<<>>(ssdGradientCuda, *referenceTexture, *warpedTexture, *maskTexture, - *spatialGradTexture, localWeightSimCuda ? *localWeightSimTexture : 0, + Cuda::GetSsdGradientKernel<<>>(ssdGradientCuda, *referenceTexturePtr, *warpedTexturePtr, *maskTexturePtr, + *spatialGradTexturePtr, localWeightSimCuda ? *localWeightSimTexturePtr : 0, referenceImageDim, adjustedWeight, (unsigned)activeVoxelNumber); NR_CUDA_CHECK_KERNEL(gridDims, blockDims); } diff --git a/reg-lib/cuda/_reg_ssd_gpu.h b/reg-lib/cuda/_reg_ssd_gpu.h index 03f184a4..23bd6fd5 100755 --- a/reg-lib/cuda/_reg_ssd_gpu.h +++ b/reg-lib/cuda/_reg_ssd_gpu.h @@ -27,9 +27,9 @@ class reg_ssd_gpu: public reg_ssd, public reg_measure_gpu { /// @brief Initialise the reg_ssd object virtual void InitialiseMeasure(nifti_image *refImg, - cudaArray *refImgCuda, + float *refImgCuda, nifti_image *floImg, - cudaArray *floImgCuda, + float *floImgCuda, int *refMask, int *refMaskCuda, size_t activeVoxNum, diff --git a/reg-lib/cuda/_reg_ssd_kernels.cu b/reg-lib/cuda/_reg_ssd_kernels.cu index 3b0255e7..99a61530 100755 --- a/reg-lib/cuda/_reg_ssd_kernels.cu +++ b/reg-lib/cuda/_reg_ssd_kernels.cu @@ -31,13 +31,12 @@ __global__ void GetSsdValueKernel(float *ssdSum, if (tid < activeVoxelNumber) { const int index = tex1Dfetch(maskTexture, tid); + const float refValue = tex1Dfetch(referenceTexture, index); + if (refValue != refValue) return; + const float warValue = tex1Dfetch(warpedTexture, index); if (warValue != warValue) return; - const auto&& [x, y, z] = reg_indexToDims_cuda(index, referenceImageDim); - const float refValue = tex3D(referenceTexture, x, y, z); - if (refValue != refValue) return; - const float val = localWeightSimTexture ? tex1Dfetch(localWeightSimTexture, index) : 1.f; const float diff = refValue - warValue; atomicAdd(ssdSum, diff * diff * val); @@ -58,6 +57,9 @@ __global__ void GetSsdGradientKernel(float4 *ssdGradient, if (tid < activeVoxelNumber) { const int index = tex1Dfetch(maskTexture, tid); + const float refValue = tex1Dfetch(referenceTexture, index); + if (refValue != refValue) return; + const float warValue = tex1Dfetch(warpedTexture, index); if (warValue != warValue) return; @@ -67,10 +69,6 @@ __global__ void GetSsdGradientKernel(float4 *ssdGradient, spaGradientValue.z != spaGradientValue.z) return; - const auto&& [x, y, z] = reg_indexToDims_cuda(index, referenceImageDim); - const float refValue = tex3D(referenceTexture, x, y, z); - if (refValue != refValue) return; - const float val = localWeightSimTexture ? tex1Dfetch(localWeightSimTexture, index) : 1.f; const float common = -2.f * (refValue - warValue) * adjustedWeight * val; diff --git a/reg-lib/cuda/_reg_tools_gpu.cu b/reg-lib/cuda/_reg_tools_gpu.cu index 2a4bb2bb..f1b9c401 100755 --- a/reg-lib/cuda/_reg_tools_gpu.cu +++ b/reg-lib/cuda/_reg_tools_gpu.cu @@ -26,8 +26,7 @@ void reg_voxelCentricToNodeCentric_gpu(const nifti_image *nodeImage, const size_t voxelNumber = NiftiImage::calcVoxelNumber(voxelImage, 3); const int3 nodeImageDims = make_int3(nodeImage->nx, nodeImage->ny, nodeImage->nz); const int3 voxelImageDims = make_int3(voxelImage->nx, voxelImage->ny, voxelImage->nz); - auto voxelImageTexture = Cuda::CreateTextureObject(voxelImageCuda, cudaResourceTypeLinear, - voxelNumber * sizeof(float4), cudaChannelFormatKindFloat, 4); + auto voxelImageTexture = Cuda::CreateTextureObject(voxelImageCuda, voxelNumber, cudaChannelFormatKindFloat, 4); // The transformation between the image and the grid mat44 transformation; @@ -133,10 +132,8 @@ void reg_gaussianSmoothing_gpu(const nifti_image *image, float4 *smoothedImage; NR_CUDA_SAFE_CALL(cudaMalloc(&smoothedImage, voxelNumber * sizeof(float4))); - auto imageTexture = Cuda::CreateTextureObject(imageCuda, cudaResourceTypeLinear, - voxelNumber * sizeof(float4), cudaChannelFormatKindFloat, 4); - auto kernelTexture = Cuda::CreateTextureObject(kernelCuda, cudaResourceTypeLinear, - kernelSize * sizeof(float), cudaChannelFormatKindFloat, 1); + auto imageTexture = Cuda::CreateTextureObject(imageCuda, voxelNumber, cudaChannelFormatKindFloat, 4); + auto kernelTexture = Cuda::CreateTextureObject(kernelCuda, kernelSize, cudaChannelFormatKindFloat, 1); unsigned blocks, grids; dim3 blockDims, gridDims; @@ -208,10 +205,8 @@ void reg_smoothImageForCubicSpline_gpu(const nifti_image *image, NR_CUDA_SAFE_CALL(cudaMemcpy(kernelCuda, kernel, kernelSize * sizeof(float), cudaMemcpyHostToDevice)); NR_CUDA_SAFE_CALL(cudaFreeHost(kernel)); - auto imageTexture = Cuda::CreateTextureObject(imageCuda, cudaResourceTypeLinear, - voxelNumber * sizeof(float4), cudaChannelFormatKindFloat, 4); - auto kernelTexture = Cuda::CreateTextureObject(kernelCuda, cudaResourceTypeLinear, - kernelSize * sizeof(float), cudaChannelFormatKindFloat, 1); + auto imageTexture = Cuda::CreateTextureObject(imageCuda, voxelNumber, cudaChannelFormatKindFloat, 4); + auto kernelTexture = Cuda::CreateTextureObject(kernelCuda, kernelSize, cudaChannelFormatKindFloat, 1); float4 *smoothedImage; NR_CUDA_SAFE_CALL(cudaMalloc(&smoothedImage, voxelNumber * sizeof(float4))); diff --git a/reg-lib/cuda/blockMatchingKernel.cu b/reg-lib/cuda/blockMatchingKernel.cu index d638755d..035e29c3 100644 --- a/reg-lib/cuda/blockMatchingKernel.cu +++ b/reg-lib/cuda/blockMatchingKernel.cu @@ -345,12 +345,9 @@ void block_matching_method_gpu(const nifti_image *referenceImage, const uint3 blockSize = make_uint3(params->blockNumber[0], params->blockNumber[1], params->blockNumber[2]); const unsigned numBlocks = params->blockNumber[0] * params->blockNumber[1] * params->blockNumber[2]; - auto referenceTexture = Cuda::CreateTextureObject(referenceImageCuda, cudaResourceTypeLinear, referenceImage->nvox * sizeof(float), - cudaChannelFormatKindFloat, 1); - auto warpedTexture = Cuda::CreateTextureObject(warpedImageCuda, cudaResourceTypeLinear, referenceImage->nvox * sizeof(float), - cudaChannelFormatKindFloat, 1); - auto totalBlockTexture = Cuda::CreateTextureObject(totalBlockCuda, cudaResourceTypeLinear, numBlocks * sizeof(int), - cudaChannelFormatKindSigned, 1); + auto referenceTexture = Cuda::CreateTextureObject(referenceImageCuda, referenceImage->nvox, cudaChannelFormatKindFloat, 1); + auto warpedTexture = Cuda::CreateTextureObject(warpedImageCuda, referenceImage->nvox, cudaChannelFormatKindFloat, 1); + auto totalBlockTexture = Cuda::CreateTextureObject(totalBlockCuda, numBlocks, cudaChannelFormatKindSigned, 1); unsigned definedBlock = 0, *definedBlockCuda; NR_CUDA_SAFE_CALL(cudaMalloc(&definedBlockCuda, sizeof(unsigned)));