Optimise CudaResampling #92

KCL-BMEIS · Nov 28, 2023 · 708106f · 708106f
1 parent b9c9bec
commit 708106f
Show file tree

Hide file tree

Showing 6 changed files with 185 additions and 321 deletions.
diff --git a/niftyreg_build_version.txt b/niftyreg_build_version.txt
@@ -1 +1 @@
-372
+373
diff --git a/reg-lib/cuda/BlockSize.hpp b/reg-lib/cuda/BlockSize.hpp
@@ -38,10 +38,6 @@ struct BlockSize {
     unsigned reg_ApplyConvolutionWindowAlongY;
     unsigned reg_ApplyConvolutionWindowAlongZ;
     unsigned Arithmetic;
-    unsigned reg_resampleImage2D;
-    unsigned reg_resampleImage3D;
-    unsigned reg_getImageGradient2D;
-    unsigned reg_getImageGradient3D;
 };
 /* *************************************************************** */
 struct BlockSize100: public BlockSize {
@@ -70,10 +66,6 @@ struct BlockSize100: public BlockSize {
         reg_ApplyConvolutionWindowAlongY = 512; // 14 reg - 28 smem - 08 cmem
         reg_ApplyConvolutionWindowAlongZ = 512; // 15 reg - 28 smem - 08 cmem
         Arithmetic = 384; // 5 reg - 24 smem
-        reg_resampleImage2D = 320; // 10 reg - 24 smem - 12 cmem
-        reg_resampleImage3D = 512; // 16 reg - 24 smem - 12 cmem
-        reg_getImageGradient2D = 512; // 16 reg - 24 smem - 20 cmem - 24 lmem
-        reg_getImageGradient3D = 320; // 24 reg - 24 smem - 16 cmem - 32 lmem
         NR_FUNC_CALLED();
     }
 };
@@ -104,10 +96,6 @@ struct BlockSize300: public BlockSize {
         reg_ApplyConvolutionWindowAlongY = 1024; // 25 reg
         reg_ApplyConvolutionWindowAlongZ = 1024; // 25 reg
         Arithmetic = 1024; //
-        reg_resampleImage2D = 1024; // 23 reg
-        reg_resampleImage3D = 1024; // 24 reg
-        reg_getImageGradient2D = 1024; // 34 reg
-        reg_getImageGradient3D = 1024; // 34 reg
         NR_FUNC_CALLED();
     }
 };

diff --git a/reg-lib/cuda/CudaCompute.cu b/reg-lib/cuda/CudaCompute.cu
@@ -175,14 +175,16 @@ void CudaCompute::UpdateControlPointPosition(float *currentDof,
 /* *************************************************************** */
 void CudaCompute::GetImageGradient(int interpolation, float paddingValue, int activeTimePoint) {
     CudaDefContent& con = dynamic_cast<CudaDefContent&>(this->con);
-    Cuda::GetImageGradient(con.Content::GetFloating(),
-                           con.GetFloatingCuda(),
-                           con.GetDeformationFieldCuda(),
-                           con.GetWarpedGradientCuda(),
-                           con.GetActiveVoxelNumber(),
-                           interpolation,
-                           paddingValue,
-                           activeTimePoint);
+    const nifti_image *floating = con.Content::GetFloating();
+    auto getImageGradient = floating->nz > 1 ? Cuda::GetImageGradient<true> : Cuda::GetImageGradient<false>;
+    getImageGradient(floating,
+                     con.GetFloatingCuda(),
+                     con.GetDeformationFieldCuda(),
+                     con.GetWarpedGradientCuda(),
+                     con.GetActiveVoxelNumber(),
+                     interpolation,
+                     paddingValue,
+                     activeTimePoint);
 }
 /* *************************************************************** */
 double CudaCompute::GetMaximalLength(bool optimiseX, bool optimiseY, bool optimiseZ) {

diff --git a/reg-lib/cuda/CudaResampling.cu b/reg-lib/cuda/CudaResampling.cu
@@ -11,11 +11,54 @@
  */
 
 #include "CudaResampling.hpp"
-#include "CudaResamplingKernels.cu"
 
 /* *************************************************************** */
 namespace NiftyReg::Cuda {
 /* *************************************************************** */
+template<typename T>
+__inline__ __device__ void InterpLinearKernel(T relative, T (&basis)[2]) {
+    basis[1] = relative;
+    basis[0] = 1.f - relative;
+}
+/* *************************************************************** */
+template<typename T, bool is3d>
+__inline__ __device__ void TransformInterpolate(const mat44 matrix, const float4 realDeformation, int3& previous,
+                                                T (&xBasis)[2], T (&yBasis)[2], T (&zBasis)[2]) {
+    // Get the voxel-based deformation
+    T voxelDeformation[is3d ? 3 : 2];
+    if constexpr (is3d) {
+        voxelDeformation[0] = (static_cast<T>(matrix.m[0][0]) * static_cast<T>(realDeformation.x) +
+                               static_cast<T>(matrix.m[0][1]) * static_cast<T>(realDeformation.y) +
+                               static_cast<T>(matrix.m[0][2]) * static_cast<T>(realDeformation.z) +
+                               static_cast<T>(matrix.m[0][3]));
+        voxelDeformation[1] = (static_cast<T>(matrix.m[1][0]) * static_cast<T>(realDeformation.x) +
+                               static_cast<T>(matrix.m[1][1]) * static_cast<T>(realDeformation.y) +
+                               static_cast<T>(matrix.m[1][2]) * static_cast<T>(realDeformation.z) +
+                               static_cast<T>(matrix.m[1][3]));
+        voxelDeformation[2] = (static_cast<T>(matrix.m[2][0]) * static_cast<T>(realDeformation.x) +
+                               static_cast<T>(matrix.m[2][1]) * static_cast<T>(realDeformation.y) +
+                               static_cast<T>(matrix.m[2][2]) * static_cast<T>(realDeformation.z) +
+                               static_cast<T>(matrix.m[2][3]));
+    } else {
+        voxelDeformation[0] = (static_cast<T>(matrix.m[0][0]) * static_cast<T>(realDeformation.x) +
+                               static_cast<T>(matrix.m[0][1]) * static_cast<T>(realDeformation.y) +
+                               static_cast<T>(matrix.m[0][3]));
+        voxelDeformation[1] = (static_cast<T>(matrix.m[1][0]) * static_cast<T>(realDeformation.x) +
+                               static_cast<T>(matrix.m[1][1]) * static_cast<T>(realDeformation.y) +
+                               static_cast<T>(matrix.m[1][3]));
+    }
+
+    // Compute the linear interpolation
+    previous.x = Floor(voxelDeformation[0]);
+    previous.y = Floor(voxelDeformation[1]);
+    InterpLinearKernel(voxelDeformation[0] - static_cast<T>(previous.x), xBasis);
+    InterpLinearKernel(voxelDeformation[1] - static_cast<T>(previous.y), yBasis);
+    if constexpr (is3d) {
+        previous.z = Floor(voxelDeformation[2]);
+        InterpLinearKernel(voxelDeformation[2] - static_cast<T>(previous.z), zBasis);
+    }
+}
+/* *************************************************************** */
 template<bool is3d>
 void ResampleImage(const nifti_image *floatingImage,
                    const float *floatingImageCuda,
@@ -29,39 +72,82 @@ void ResampleImage(const nifti_image *floatingImage,
     if (interpolation != 1)
         NR_FATAL_ERROR("Only linear interpolation is supported on the GPU");
 
-    auto blockSize = CudaContext::GetBlockSize();
     const size_t voxelNumber = NiftiImage::calcVoxelNumber(floatingImage, 3);
     const int3 floatingDim = make_int3(floatingImage->nx, floatingImage->ny, floatingImage->nz);
-    auto deformationFieldTexture = Cuda::CreateTextureObject(deformationFieldCuda, activeVoxelNumber, cudaChannelFormatKindFloat, 4);
-    auto maskTexture = Cuda::CreateTextureObject(maskCuda, activeVoxelNumber, cudaChannelFormatKindSigned, 1);
+    auto deformationFieldTexturePtr = Cuda::CreateTextureObject(deformationFieldCuda, activeVoxelNumber, cudaChannelFormatKindFloat, 4);
+    auto maskTexturePtr = Cuda::CreateTextureObject(maskCuda, activeVoxelNumber, cudaChannelFormatKindSigned, 1);
+    auto deformationFieldTexture = *deformationFieldTexturePtr;
+    auto maskTexture = *maskTexturePtr;
     // Bind the real to voxel matrix to the texture
     const mat44& floatingMatrix = floatingImage->sform_code > 0 ? floatingImage->sto_ijk : floatingImage->qto_ijk;
 
     for (int t = 0; t < warpedImage->nt * warpedImage->nu; t++) {
         NR_DEBUG((is3d ? "3" : "2") << "D resampling of volume number " << t);
-        auto floatingTexture = Cuda::CreateTextureObject(floatingImageCuda + t * voxelNumber, voxelNumber, cudaChannelFormatKindFloat, 1);
-        if constexpr (is3d) {
-            const unsigned blocks = blockSize->reg_resampleImage3D;
-            const unsigned grids = (unsigned)Ceil(sqrtf((float)activeVoxelNumber / (float)blocks));
-            const dim3 gridDims(grids, grids, 1);
-            const dim3 blockDims(blocks, 1, 1);
-            ResampleImage3D<<<gridDims, blockDims>>>(warpedImageCuda + t * voxelNumber, *floatingTexture, *deformationFieldTexture, *maskTexture,
-                                                     floatingMatrix, floatingDim, (unsigned)activeVoxelNumber, paddingValue);
-            NR_CUDA_CHECK_KERNEL(gridDims, blockDims);
-        } else {
-            const unsigned blocks = blockSize->reg_resampleImage2D;
-            const unsigned grids = (unsigned)Ceil(sqrtf((float)activeVoxelNumber / (float)blocks));
-            const dim3 gridDims(grids, grids, 1);
-            const dim3 blockDims(blocks, 1, 1);
-            ResampleImage2D<<<gridDims, blockDims>>>(warpedImageCuda + t * voxelNumber, *floatingTexture, *deformationFieldTexture, *maskTexture,
-                                                     floatingMatrix, floatingDim, (unsigned)activeVoxelNumber, paddingValue);
-            NR_CUDA_CHECK_KERNEL(gridDims, blockDims);
-        }
+        auto curWarpedCuda = warpedImageCuda + t * voxelNumber;
+        auto floatingTexturePtr = Cuda::CreateTextureObject(floatingImageCuda + t * voxelNumber, voxelNumber, cudaChannelFormatKindFloat, 1);
+        auto floatingTexture = *floatingTexturePtr;
+        thrust::for_each_n(thrust::device, thrust::make_counting_iterator(0), activeVoxelNumber, [
+            curWarpedCuda, floatingTexture, deformationFieldTexture, maskTexture, floatingMatrix, floatingDim, paddingValue
+        ]__device__(const int index) {
+            // Get the real world deformation in the floating space
+            const int voxel = tex1Dfetch<int>(maskTexture, index);
+            const float4 realDeformation = tex1Dfetch<float4>(deformationFieldTexture, index);
+
+            // Get the voxel-based deformation in the floating space and compute the linear interpolation
+            int3 previous;
+            double xBasis[2], yBasis[2], zBasis[2];
+            TransformInterpolate<double, is3d>(floatingMatrix, realDeformation, previous, xBasis, yBasis, zBasis);
+
+            double intensity = 0;
+            if constexpr (is3d) {
+                for (char c = 0; c < 2; c++) {
+                    const int z = previous.z + c;
+                    int indexYZ = (z * floatingDim.y + previous.y) * floatingDim.x;
+                    double tempY = 0;
+                    for (char b = 0; b < 2; b++, indexYZ += floatingDim.x) {
+                        const int y = previous.y + b;
+                        int index = indexYZ + previous.x;
+                        double tempX = 0;
+                        for (char a = 0; a < 2; a++, index++) {
+                            const int x = previous.x + a;
+                            if (-1 < x && x < floatingDim.x && -1 < y && y < floatingDim.y && -1 < z && z < floatingDim.z) {
+                                tempX += tex1Dfetch<float>(floatingTexture, index) * xBasis[a];
+                            } else {
+                                // Padding value
+                                tempX += paddingValue * xBasis[a];
+                            }
+                        }
+                        tempY += tempX * yBasis[b];
+                    }
+                    intensity += tempY * zBasis[c];
+                }
+            } else {
+                int indexY = previous.y * floatingDim.x + previous.x;
+                for (char b = 0; b < 2; b++, indexY += floatingDim.x) {
+                    const int y = previous.y + b;
+                    int index = indexY;
+                    double tempX = 0;
+                    for (char a = 0; a < 2; a++, index++) {
+                        const int x = previous.x + a;
+                        if (-1 < x && x < floatingDim.x && -1 < y && y < floatingDim.y) {
+                            tempX += tex1Dfetch<float>(floatingTexture, index) * xBasis[a];
+                        } else {
+                            // Padding value
+                            tempX += paddingValue * xBasis[a];
+                        }
+                    }
+                    intensity += tempX * yBasis[b];
+                }
+            }
+
+            curWarpedCuda[voxel] = intensity;
+        });
     }
 }
 template void ResampleImage<false>(const nifti_image*, const float*, const nifti_image*, float*, const float4*, const int*, const size_t, const int, const float);
 template void ResampleImage<true>(const nifti_image*, const float*, const nifti_image*, float*, const float4*, const int*, const size_t, const int, const float);
 /* *************************************************************** */
+template<bool is3d>
 void GetImageGradient(const nifti_image *floatingImage,
                       const float *floatingImageCuda,
                       const float4 *deformationFieldCuda,
@@ -73,33 +159,78 @@ void GetImageGradient(const nifti_image *floatingImage,
     if (interpolation != 1)
         NR_FATAL_ERROR("Only linear interpolation is supported on the GPU");
 
-    auto blockSize = CudaContext::GetBlockSize();
     const size_t voxelNumber = NiftiImage::calcVoxelNumber(floatingImage, 3);
     const int3 floatingDim = make_int3(floatingImage->nx, floatingImage->ny, floatingImage->nz);
     if (paddingValue != paddingValue) paddingValue = 0;
-    auto floatingTexture = Cuda::CreateTextureObject(floatingImageCuda + activeTimePoint * voxelNumber, voxelNumber, cudaChannelFormatKindFloat, 1);
-    auto deformationFieldTexture = Cuda::CreateTextureObject(deformationFieldCuda, activeVoxelNumber, cudaChannelFormatKindFloat, 4);
+    auto floatingTexturePtr = Cuda::CreateTextureObject(floatingImageCuda + activeTimePoint * voxelNumber, voxelNumber, cudaChannelFormatKindFloat, 1);
+    auto deformationFieldTexturePtr = Cuda::CreateTextureObject(deformationFieldCuda, activeVoxelNumber, cudaChannelFormatKindFloat, 4);
+    auto floatingTexture = *floatingTexturePtr;
+    auto deformationFieldTexture = *deformationFieldTexturePtr;
     // Bind the real to voxel matrix to the texture
     const mat44& floatingMatrix = floatingImage->sform_code > 0 ? floatingImage->sto_ijk : floatingImage->qto_ijk;
 
-    if (floatingImage->nz > 1) {
-        const unsigned blocks = blockSize->reg_getImageGradient3D;
-        const unsigned grids = (unsigned)Ceil(sqrtf((float)activeVoxelNumber / (float)blocks));
-        const dim3 gridDims(grids, grids, 1);
-        const dim3 blockDims(blocks, 1, 1);
-        GetImageGradient3D<<<gridDims, blockDims>>>(warpedGradientCuda, *floatingTexture, *deformationFieldTexture,
-                                                    floatingMatrix, floatingDim, (unsigned)activeVoxelNumber, paddingValue);
-        NR_CUDA_CHECK_KERNEL(gridDims, blockDims);
-    } else {
-        const unsigned blocks = blockSize->reg_getImageGradient2D;
-        const unsigned grids = (unsigned)Ceil(sqrtf((float)activeVoxelNumber / (float)blocks));
-        const dim3 gridDims(grids, grids, 1);
-        const dim3 blockDims(blocks, 1, 1);
-        GetImageGradient2D<<<gridDims, blockDims>>>(warpedGradientCuda, *floatingTexture, *deformationFieldTexture,
-                                                    floatingMatrix, floatingDim, (unsigned)activeVoxelNumber, paddingValue);
-        NR_CUDA_CHECK_KERNEL(gridDims, blockDims);
-    }
+    thrust::for_each_n(thrust::device, thrust::make_counting_iterator(0), activeVoxelNumber, [
+        warpedGradientCuda, floatingTexture, deformationFieldTexture, floatingMatrix, floatingDim, paddingValue
+    ]__device__(const int index) {
+            // Get the real world deformation in the floating space
+            float4 realDeformation = tex1Dfetch<float4>(deformationFieldTexture, index);
+
+            // Get the voxel-based deformation in the floating space and compute the linear interpolation
+            int3 previous;
+            float xBasis[2], yBasis[2], zBasis[2];
+            TransformInterpolate<float, is3d>(floatingMatrix, realDeformation, previous, xBasis, yBasis, zBasis);
+            constexpr float deriv[] = { -1.0f, 1.0f };
+
+            float4 gradientValue{};
+            if constexpr (is3d) {
+                for (char c = 0; c < 2; c++) {
+                    const int z = previous.z + c;
+                    int indexYZ = (z * floatingDim.y + previous.y) * floatingDim.x;
+                    float3 tempY{};
+                    for (char b = 0; b < 2; b++, indexYZ += floatingDim.x) {
+                        const int y = previous.y + b;
+                        int index = indexYZ + previous.x;
+                        float2 tempX{};
+                        for (char a = 0; a < 2; a++, index++) {
+                            const int x = previous.x + a;
+                            const float intensity = -1 < x && x < floatingDim.x && -1 < y && y < floatingDim.y && -1 < z && z < floatingDim.z ?
+                                tex1Dfetch<float>(floatingTexture, index) : paddingValue;
+
+                            tempX.x += intensity * deriv[a];
+                            tempX.y += intensity * xBasis[a];
+                        }
+                        tempY.x += tempX.x * yBasis[b];
+                        tempY.y += tempX.y * deriv[b];
+                        tempY.z += tempX.y * yBasis[b];
+                    }
+                    gradientValue.x += tempY.x * zBasis[c];
+                    gradientValue.y += tempY.y * zBasis[c];
+                    gradientValue.z += tempY.z * deriv[c];
+                }
+            } else {
+                int indexY = previous.y * floatingDim.x + previous.x;
+                for (char b = 0; b < 2; b++, indexY += floatingDim.x) {
+                    const int y = previous.y + b;
+                    int index = indexY;
+                    float2 tempX{};
+                    for (char a = 0; a < 2; a++, index++) {
+                        const int x = previous.x + a;
+                        const float intensity = -1 < x && x < floatingDim.x && -1 < y && y < floatingDim.y ?
+                            tex1Dfetch<float>(floatingTexture, index) : paddingValue;
+
+                        tempX.x += intensity * deriv[a];
+                        tempX.y += intensity * xBasis[a];
+                    }
+                    gradientValue.x += tempX.x * yBasis[b];
+                    gradientValue.y += tempX.y * deriv[b];
+                }
+            }
+
+            warpedGradientCuda[index] = gradientValue;
+    });
 }
+template void GetImageGradient<false>(const nifti_image*, const float*, const float4*, float4*, const size_t, const int, float, const int);
+template void GetImageGradient<true>(const nifti_image*, const float*, const float4*, float4*, const size_t, const int, float, const int);
 /* *************************************************************** */
 } // namespace NiftyReg::Cuda
 /* *************************************************************** */
diff --git a/reg-lib/cuda/CudaResampling.hpp b/reg-lib/cuda/CudaResampling.hpp
@@ -28,6 +28,7 @@ void ResampleImage(const nifti_image *floatingImage,
                    const int interpolation,
                    const float paddingValue);
 /* *************************************************************** */
+template<bool is3d>
 void GetImageGradient(const nifti_image *floatingImage,
                       const float *floatingImageCuda,
                       const float4 *deformationFieldCuda,