From 92ec3cef24d394735b28536bc84f90fbfc6220c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Onur=20=C3=9Clgen?= Date: Wed, 10 Jan 2024 12:55:06 +0000 Subject: [PATCH] Optimise Cuda::DefFieldCompose() #92 --- niftyreg_build_version.txt | 2 +- reg-lib/cuda/BlockSize.hpp | 6 - reg-lib/cuda/CudaCompute.cu | 3 +- reg-lib/cuda/CudaLocalTransformation.cu | 32 ++--- reg-lib/cuda/CudaLocalTransformation.hpp | 1 + .../cuda/CudaLocalTransformationKernels.cu | 110 ++++++++---------- 6 files changed, 61 insertions(+), 93 deletions(-) diff --git a/niftyreg_build_version.txt b/niftyreg_build_version.txt index fae51388..77851f13 100644 --- a/niftyreg_build_version.txt +++ b/niftyreg_build_version.txt @@ -1 +1 @@ -381 +382 diff --git a/reg-lib/cuda/BlockSize.hpp b/reg-lib/cuda/BlockSize.hpp index a0d2ea14..6338cf87 100644 --- a/reg-lib/cuda/BlockSize.hpp +++ b/reg-lib/cuda/BlockSize.hpp @@ -26,8 +26,6 @@ struct BlockSize { unsigned ComputeJacGradient3d; unsigned ApproxCorrectFolding3d; unsigned CorrectFolding3d; - unsigned DefFieldCompose2d; - unsigned DefFieldCompose3d; unsigned GetJacobianMatrix; unsigned ConvertNmiGradientFromVoxelToRealSpace; unsigned ApplyConvolutionWindowAlongX; @@ -49,8 +47,6 @@ struct BlockSize100: public BlockSize { ComputeJacGradient3d = 256; // 32 reg - 24 smem - 64 cmem ApproxCorrectFolding3d = 256; // 32 reg - 24 smem - 24 cmem CorrectFolding3d = 256; // 31 reg - 24 smem - 32 cmem - DefFieldCompose2d = 512; // 15 reg - 24 smem - 08 cmem - 16 lmem - DefFieldCompose3d = 384; // 21 reg - 24 smem - 08 cmem - 24 lmem GetJacobianMatrix = 512; // 16 reg - 24 smem - 04 cmem ConvertNmiGradientFromVoxelToRealSpace = 512; // 16 reg - 24 smem ApplyConvolutionWindowAlongX = 512; // 14 reg - 28 smem - 08 cmem @@ -74,8 +70,6 @@ struct BlockSize300: public BlockSize { ComputeJacGradient3d = 768; // 37 reg ApproxCorrectFolding3d = 768; // 34 reg CorrectFolding3d = 768; // 34 reg - DefFieldCompose2d = 1024; // 23 reg - DefFieldCompose3d = 1024; // 24 reg GetJacobianMatrix = 768; // 34 reg ConvertNmiGradientFromVoxelToRealSpace = 1024; // 23 reg ApplyConvolutionWindowAlongX = 1024; // 25 reg diff --git a/reg-lib/cuda/CudaCompute.cu b/reg-lib/cuda/CudaCompute.cu index 7b49be10..c81a0e97 100644 --- a/reg-lib/cuda/CudaCompute.cu +++ b/reg-lib/cuda/CudaCompute.cu @@ -324,6 +324,7 @@ void CudaCompute::DefFieldCompose(const nifti_image *defField) { const size_t voxelNumber = NiftiImage::calcVoxelNumber(defField, 3); thrust::device_vector defFieldCuda(voxelNumber); Cuda::TransferNiftiToDevice(defFieldCuda.data().get(), defField); - Cuda::DefFieldCompose(defField, defFieldCuda.data().get(), con.GetDeformationFieldCuda()); + auto defFieldCompose = defField->nz > 1 ? Cuda::DefFieldCompose : Cuda::DefFieldCompose; + defFieldCompose(defField, defFieldCuda.data().get(), con.GetDeformationFieldCuda()); } /* *************************************************************** */ diff --git a/reg-lib/cuda/CudaLocalTransformation.cu b/reg-lib/cuda/CudaLocalTransformation.cu index 3c1ff918..20d2c471 100644 --- a/reg-lib/cuda/CudaLocalTransformation.cu +++ b/reg-lib/cuda/CudaLocalTransformation.cu @@ -634,33 +634,20 @@ void GetFlowFieldFromVelocityGrid(nifti_image *velocityFieldGrid, velocityFieldGrid->num_ext = oldNumExt; } /* *************************************************************** */ +template void DefFieldCompose(const nifti_image *deformationField, const float4 *deformationFieldCuda, - float4 *deformationFieldCudaOut) { - auto blockSize = CudaContext::GetBlockSize(); + float4 *deformationFieldOutCuda) { const size_t voxelNumber = NiftiImage::calcVoxelNumber(deformationField, 3); - const int3 referenceImageDim{ deformationField->nx, deformationField->ny, deformationField->nz }; + const int3 referenceImageDims{ deformationField->nx, deformationField->ny, deformationField->nz }; const mat44& affineMatrixB = deformationField->sform_code > 0 ? deformationField->sto_ijk : deformationField->qto_ijk; const mat44& affineMatrixC = deformationField->sform_code > 0 ? deformationField->sto_xyz : deformationField->qto_xyz; - auto deformationFieldTexture = Cuda::CreateTextureObject(deformationFieldCuda, voxelNumber, cudaChannelFormatKindFloat, 4); + auto deformationFieldTexturePtr = Cuda::CreateTextureObject(deformationFieldCuda, voxelNumber, cudaChannelFormatKindFloat, 4); + auto deformationFieldTexture = *deformationFieldTexturePtr; - if (deformationField->nz > 1) { - const unsigned blocks = blockSize->DefFieldCompose3d; - const unsigned grids = (unsigned)Ceil(sqrtf((float)voxelNumber / (float)blocks)); - const dim3 gridDims(grids, grids, 1); - const dim3 blockDims(blocks, 1, 1); - DefFieldCompose3d<<>>(deformationFieldCudaOut, *deformationFieldTexture, referenceImageDim, - (unsigned)voxelNumber, affineMatrixB, affineMatrixC); - NR_CUDA_CHECK_KERNEL(gridDims, blockDims); - } else { - const unsigned blocks = blockSize->DefFieldCompose2d; - const unsigned grids = (unsigned)Ceil(sqrtf((float)voxelNumber / (float)blocks)); - const dim3 gridDims(grids, grids, 1); - const dim3 blockDims(blocks, 1, 1); - DefFieldCompose2d<<>>(deformationFieldCudaOut, *deformationFieldTexture, referenceImageDim, - (unsigned)voxelNumber, affineMatrixB, affineMatrixC); - NR_CUDA_CHECK_KERNEL(gridDims, blockDims); - } + thrust::for_each_n(thrust::device, thrust::make_counting_iterator(0), voxelNumber, [=]__device__(const int index) { + DefFieldComposeKernel(deformationFieldOutCuda, deformationFieldTexture, referenceImageDims, affineMatrixB, affineMatrixC, index); + }); } /* *************************************************************** */ void GetDeformationFieldFromFlowField(nifti_image *flowField, @@ -725,9 +712,10 @@ void GetDeformationFieldFromFlowField(nifti_image *flowField, thrust::copy(thrust::device, flowFieldCuda, flowFieldCuda + voxelNumber, deformationFieldCuda); // The deformation field is squared + auto defFieldCompose = deformationField->nz > 1 ? DefFieldCompose : DefFieldCompose; for (int i = 0; i < squaringNumber; ++i) { // The deformation field is applied to itself - DefFieldCompose(deformationField, deformationFieldCuda, flowFieldCuda); + defFieldCompose(deformationField, deformationFieldCuda, flowFieldCuda); // The computed scaled deformation field is copied over thrust::copy(thrust::device, flowFieldCuda, flowFieldCuda + voxelNumber, deformationFieldCuda); NR_DEBUG("Squaring (composition) step " << i + 1 << "/" << squaringNumber); diff --git a/reg-lib/cuda/CudaLocalTransformation.hpp b/reg-lib/cuda/CudaLocalTransformation.hpp index 90a13749..8e718822 100644 --- a/reg-lib/cuda/CudaLocalTransformation.hpp +++ b/reg-lib/cuda/CudaLocalTransformation.hpp @@ -56,6 +56,7 @@ double CorrectFolding(const nifti_image *referenceImage, float4 *controlPointImageCuda, const bool approx); /* *************************************************************** */ +template void DefFieldCompose(const nifti_image *deformationField, const float4 *deformationFieldCuda, float4 *deformationFieldOutCuda); diff --git a/reg-lib/cuda/CudaLocalTransformationKernels.cu b/reg-lib/cuda/CudaLocalTransformationKernels.cu index ef900936..af983f9b 100644 --- a/reg-lib/cuda/CudaLocalTransformationKernels.cu +++ b/reg-lib/cuda/CudaLocalTransformationKernels.cu @@ -1057,67 +1057,22 @@ __global__ void CorrectFolding3d(float4 *controlPointGrid, } } /* *************************************************************** */ -__global__ void DefFieldCompose2d(float4 *deformationField, - cudaTextureObject_t deformationFieldTexture, - const int3 referenceImageDim, - const unsigned voxelNumber, - const mat44 affineMatrixB, - const mat44 affineMatrixC) { - const unsigned tid = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x; - if (tid < voxelNumber) { - // Extract the original voxel position - float4 position = deformationField[tid]; - - // Conversion from real position to voxel coordinate - const float4 voxelPosition{ - position.x * affineMatrixB.m[0][0] + position.y * affineMatrixB.m[0][1] + affineMatrixB.m[0][3], - position.x * affineMatrixB.m[1][0] + position.y * affineMatrixB.m[1][1] + affineMatrixB.m[1][3], - 0.f, - 0.f - }; - - // Linear interpolation - const int2 ante = { Floor(voxelPosition.x), Floor(voxelPosition.y) }; - float relX[2], relY[2]; - relX[1] = voxelPosition.x - (float)ante.x; relX[0] = 1.f - relX[1]; - relY[1] = voxelPosition.y - (float)ante.y; relY[0] = 1.f - relY[1]; - - position = make_float4(0.f, 0.f, 0.f, 0.f); - for (short b = 0; b < 2; ++b) { - for (short a = 0; a < 2; ++a) { - float4 deformation; - if (-1 < ante.x + a && ante.x + a < referenceImageDim.x && - -1 < ante.y + b && ante.y + b < referenceImageDim.y) { - const int index = (ante.y + b) * referenceImageDim.x + ante.x + a; - deformation = tex1Dfetch(deformationFieldTexture, index); - } else { - deformation = GetSlidedValues(ante.x + a, ante.y + b, deformationFieldTexture, referenceImageDim, affineMatrixC); - } - const float basis = relX[a] * relY[b]; - position = position + basis * deformation; - } - } - deformationField[tid] = position; - } -} -/* *************************************************************** */ -__global__ void DefFieldCompose3d(float4 *deformationField, - cudaTextureObject_t deformationFieldTexture, - const int3 referenceImageDim, - const unsigned voxelNumber, - const mat44 affineMatrixB, - const mat44 affineMatrixC) { - const unsigned tid = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x; - if (tid < voxelNumber) { - // Extract the original voxel position - float4 position = deformationField[tid]; +template +__device__ void DefFieldComposeKernel(float4 *deformationField, + cudaTextureObject_t deformationFieldTexture, + const int3 referenceImageDims, + const mat44 affineMatrixB, + const mat44 affineMatrixC, + const int index) { + // Extract the original voxel position + float4 position = deformationField[index]; + if constexpr (is3d) { // Conversion from real position to voxel coordinate - const float4 voxelPosition{ + const float3 voxelPosition{ position.x * affineMatrixB.m[0][0] + position.y * affineMatrixB.m[0][1] + position.z * affineMatrixB.m[0][2] + affineMatrixB.m[0][3], position.x * affineMatrixB.m[1][0] + position.y * affineMatrixB.m[1][1] + position.z * affineMatrixB.m[1][2] + affineMatrixB.m[1][3], - position.x * affineMatrixB.m[2][0] + position.y * affineMatrixB.m[2][1] + position.z * affineMatrixB.m[2][2] + affineMatrixB.m[2][3], - 0.f + position.x * affineMatrixB.m[2][0] + position.y * affineMatrixB.m[2][1] + position.z * affineMatrixB.m[2][2] + affineMatrixB.m[2][3] }; // Linear interpolation @@ -1132,21 +1087,50 @@ __global__ void DefFieldCompose3d(float4 *deformationField, for (short b = 0; b < 2; ++b) { for (short a = 0; a < 2; ++a) { float4 deformation; - if (-1 < ante.x + a && ante.x + a < referenceImageDim.x && - -1 < ante.y + b && ante.y + b < referenceImageDim.y && - -1 < ante.z + c && ante.z + c < referenceImageDim.z) { - const int index = ((ante.z + c) * referenceImageDim.y + ante.y + b) * referenceImageDim.x + ante.x + a; + if (-1 < ante.x + a && ante.x + a < referenceImageDims.x && + -1 < ante.y + b && ante.y + b < referenceImageDims.y && + -1 < ante.z + c && ante.z + c < referenceImageDims.z) { + const int index = ((ante.z + c) * referenceImageDims.y + ante.y + b) * referenceImageDims.x + ante.x + a; deformation = tex1Dfetch(deformationFieldTexture, index); } else { - deformation = GetSlidedValues(ante.x + a, ante.y + b, ante.z + c, deformationFieldTexture, referenceImageDim, affineMatrixC); + deformation = GetSlidedValues(ante.x + a, ante.y + b, ante.z + c, deformationFieldTexture, referenceImageDims, affineMatrixC); } const float basis = relX[a] * relY[b] * relZ[c]; position = position + basis * deformation; } } } - deformationField[tid] = position; + } else { + // Conversion from real position to voxel coordinate + const float2 voxelPosition{ + position.x * affineMatrixB.m[0][0] + position.y * affineMatrixB.m[0][1] + affineMatrixB.m[0][3], + position.x * affineMatrixB.m[1][0] + position.y * affineMatrixB.m[1][1] + affineMatrixB.m[1][3] + }; + + // Linear interpolation + const int2 ante = { Floor(voxelPosition.x), Floor(voxelPosition.y) }; + float relX[2], relY[2]; + relX[1] = voxelPosition.x - (float)ante.x; relX[0] = 1.f - relX[1]; + relY[1] = voxelPosition.y - (float)ante.y; relY[0] = 1.f - relY[1]; + + position = make_float4(0.f, 0.f, 0.f, 0.f); + for (short b = 0; b < 2; ++b) { + for (short a = 0; a < 2; ++a) { + float4 deformation; + if (-1 < ante.x + a && ante.x + a < referenceImageDims.x && + -1 < ante.y + b && ante.y + b < referenceImageDims.y) { + const int index = (ante.y + b) * referenceImageDims.x + ante.x + a; + deformation = tex1Dfetch(deformationFieldTexture, index); + } else { + deformation = GetSlidedValues(ante.x + a, ante.y + b, deformationFieldTexture, referenceImageDims, affineMatrixC); + } + const float basis = relX[a] * relY[b]; + position = position + basis * deformation; + } + } } + + deformationField[index] = position; } /* *************************************************************** */ __global__ void GetJacobianMatrix3d(float *jacobianMatrices,