From 708106f0592549203e05083f03a205b767bdce7a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Onur=20=C3=9Clgen?= Date: Tue, 28 Nov 2023 09:29:18 +0000 Subject: [PATCH] Optimise CudaResampling #92 --- niftyreg_build_version.txt | 2 +- reg-lib/cuda/BlockSize.hpp | 12 -- reg-lib/cuda/CudaCompute.cu | 18 +- reg-lib/cuda/CudaResampling.cu | 215 ++++++++++++++++----- reg-lib/cuda/CudaResampling.hpp | 1 + reg-lib/cuda/CudaResamplingKernels.cu | 258 -------------------------- 6 files changed, 185 insertions(+), 321 deletions(-) delete mode 100644 reg-lib/cuda/CudaResamplingKernels.cu diff --git a/niftyreg_build_version.txt b/niftyreg_build_version.txt index ba300673..a5c3fde3 100644 --- a/niftyreg_build_version.txt +++ b/niftyreg_build_version.txt @@ -1 +1 @@ -372 +373 diff --git a/reg-lib/cuda/BlockSize.hpp b/reg-lib/cuda/BlockSize.hpp index fe411adb..50a0cfbc 100644 --- a/reg-lib/cuda/BlockSize.hpp +++ b/reg-lib/cuda/BlockSize.hpp @@ -38,10 +38,6 @@ struct BlockSize { unsigned reg_ApplyConvolutionWindowAlongY; unsigned reg_ApplyConvolutionWindowAlongZ; unsigned Arithmetic; - unsigned reg_resampleImage2D; - unsigned reg_resampleImage3D; - unsigned reg_getImageGradient2D; - unsigned reg_getImageGradient3D; }; /* *************************************************************** */ struct BlockSize100: public BlockSize { @@ -70,10 +66,6 @@ struct BlockSize100: public BlockSize { reg_ApplyConvolutionWindowAlongY = 512; // 14 reg - 28 smem - 08 cmem reg_ApplyConvolutionWindowAlongZ = 512; // 15 reg - 28 smem - 08 cmem Arithmetic = 384; // 5 reg - 24 smem - reg_resampleImage2D = 320; // 10 reg - 24 smem - 12 cmem - reg_resampleImage3D = 512; // 16 reg - 24 smem - 12 cmem - reg_getImageGradient2D = 512; // 16 reg - 24 smem - 20 cmem - 24 lmem - reg_getImageGradient3D = 320; // 24 reg - 24 smem - 16 cmem - 32 lmem NR_FUNC_CALLED(); } }; @@ -104,10 +96,6 @@ struct BlockSize300: public BlockSize { reg_ApplyConvolutionWindowAlongY = 1024; // 25 reg reg_ApplyConvolutionWindowAlongZ = 1024; // 25 reg Arithmetic = 1024; // - reg_resampleImage2D = 1024; // 23 reg - reg_resampleImage3D = 1024; // 24 reg - reg_getImageGradient2D = 1024; // 34 reg - reg_getImageGradient3D = 1024; // 34 reg NR_FUNC_CALLED(); } }; diff --git a/reg-lib/cuda/CudaCompute.cu b/reg-lib/cuda/CudaCompute.cu index 9dfae7b0..629ed5e0 100644 --- a/reg-lib/cuda/CudaCompute.cu +++ b/reg-lib/cuda/CudaCompute.cu @@ -175,14 +175,16 @@ void CudaCompute::UpdateControlPointPosition(float *currentDof, /* *************************************************************** */ void CudaCompute::GetImageGradient(int interpolation, float paddingValue, int activeTimePoint) { CudaDefContent& con = dynamic_cast(this->con); - Cuda::GetImageGradient(con.Content::GetFloating(), - con.GetFloatingCuda(), - con.GetDeformationFieldCuda(), - con.GetWarpedGradientCuda(), - con.GetActiveVoxelNumber(), - interpolation, - paddingValue, - activeTimePoint); + const nifti_image *floating = con.Content::GetFloating(); + auto getImageGradient = floating->nz > 1 ? Cuda::GetImageGradient : Cuda::GetImageGradient; + getImageGradient(floating, + con.GetFloatingCuda(), + con.GetDeformationFieldCuda(), + con.GetWarpedGradientCuda(), + con.GetActiveVoxelNumber(), + interpolation, + paddingValue, + activeTimePoint); } /* *************************************************************** */ double CudaCompute::GetMaximalLength(bool optimiseX, bool optimiseY, bool optimiseZ) { diff --git a/reg-lib/cuda/CudaResampling.cu b/reg-lib/cuda/CudaResampling.cu index f72f6bee..ee2deab5 100644 --- a/reg-lib/cuda/CudaResampling.cu +++ b/reg-lib/cuda/CudaResampling.cu @@ -11,11 +11,54 @@ */ #include "CudaResampling.hpp" -#include "CudaResamplingKernels.cu" /* *************************************************************** */ namespace NiftyReg::Cuda { /* *************************************************************** */ +template +__inline__ __device__ void InterpLinearKernel(T relative, T (&basis)[2]) { + basis[1] = relative; + basis[0] = 1.f - relative; +} +/* *************************************************************** */ +template +__inline__ __device__ void TransformInterpolate(const mat44 matrix, const float4 realDeformation, int3& previous, + T (&xBasis)[2], T (&yBasis)[2], T (&zBasis)[2]) { + // Get the voxel-based deformation + T voxelDeformation[is3d ? 3 : 2]; + if constexpr (is3d) { + voxelDeformation[0] = (static_cast(matrix.m[0][0]) * static_cast(realDeformation.x) + + static_cast(matrix.m[0][1]) * static_cast(realDeformation.y) + + static_cast(matrix.m[0][2]) * static_cast(realDeformation.z) + + static_cast(matrix.m[0][3])); + voxelDeformation[1] = (static_cast(matrix.m[1][0]) * static_cast(realDeformation.x) + + static_cast(matrix.m[1][1]) * static_cast(realDeformation.y) + + static_cast(matrix.m[1][2]) * static_cast(realDeformation.z) + + static_cast(matrix.m[1][3])); + voxelDeformation[2] = (static_cast(matrix.m[2][0]) * static_cast(realDeformation.x) + + static_cast(matrix.m[2][1]) * static_cast(realDeformation.y) + + static_cast(matrix.m[2][2]) * static_cast(realDeformation.z) + + static_cast(matrix.m[2][3])); + } else { + voxelDeformation[0] = (static_cast(matrix.m[0][0]) * static_cast(realDeformation.x) + + static_cast(matrix.m[0][1]) * static_cast(realDeformation.y) + + static_cast(matrix.m[0][3])); + voxelDeformation[1] = (static_cast(matrix.m[1][0]) * static_cast(realDeformation.x) + + static_cast(matrix.m[1][1]) * static_cast(realDeformation.y) + + static_cast(matrix.m[1][3])); + } + + // Compute the linear interpolation + previous.x = Floor(voxelDeformation[0]); + previous.y = Floor(voxelDeformation[1]); + InterpLinearKernel(voxelDeformation[0] - static_cast(previous.x), xBasis); + InterpLinearKernel(voxelDeformation[1] - static_cast(previous.y), yBasis); + if constexpr (is3d) { + previous.z = Floor(voxelDeformation[2]); + InterpLinearKernel(voxelDeformation[2] - static_cast(previous.z), zBasis); + } +} +/* *************************************************************** */ template void ResampleImage(const nifti_image *floatingImage, const float *floatingImageCuda, @@ -29,39 +72,82 @@ void ResampleImage(const nifti_image *floatingImage, if (interpolation != 1) NR_FATAL_ERROR("Only linear interpolation is supported on the GPU"); - auto blockSize = CudaContext::GetBlockSize(); const size_t voxelNumber = NiftiImage::calcVoxelNumber(floatingImage, 3); const int3 floatingDim = make_int3(floatingImage->nx, floatingImage->ny, floatingImage->nz); - auto deformationFieldTexture = Cuda::CreateTextureObject(deformationFieldCuda, activeVoxelNumber, cudaChannelFormatKindFloat, 4); - auto maskTexture = Cuda::CreateTextureObject(maskCuda, activeVoxelNumber, cudaChannelFormatKindSigned, 1); + auto deformationFieldTexturePtr = Cuda::CreateTextureObject(deformationFieldCuda, activeVoxelNumber, cudaChannelFormatKindFloat, 4); + auto maskTexturePtr = Cuda::CreateTextureObject(maskCuda, activeVoxelNumber, cudaChannelFormatKindSigned, 1); + auto deformationFieldTexture = *deformationFieldTexturePtr; + auto maskTexture = *maskTexturePtr; // Bind the real to voxel matrix to the texture const mat44& floatingMatrix = floatingImage->sform_code > 0 ? floatingImage->sto_ijk : floatingImage->qto_ijk; for (int t = 0; t < warpedImage->nt * warpedImage->nu; t++) { NR_DEBUG((is3d ? "3" : "2") << "D resampling of volume number " << t); - auto floatingTexture = Cuda::CreateTextureObject(floatingImageCuda + t * voxelNumber, voxelNumber, cudaChannelFormatKindFloat, 1); - if constexpr (is3d) { - const unsigned blocks = blockSize->reg_resampleImage3D; - const unsigned grids = (unsigned)Ceil(sqrtf((float)activeVoxelNumber / (float)blocks)); - const dim3 gridDims(grids, grids, 1); - const dim3 blockDims(blocks, 1, 1); - ResampleImage3D<<>>(warpedImageCuda + t * voxelNumber, *floatingTexture, *deformationFieldTexture, *maskTexture, - floatingMatrix, floatingDim, (unsigned)activeVoxelNumber, paddingValue); - NR_CUDA_CHECK_KERNEL(gridDims, blockDims); - } else { - const unsigned blocks = blockSize->reg_resampleImage2D; - const unsigned grids = (unsigned)Ceil(sqrtf((float)activeVoxelNumber / (float)blocks)); - const dim3 gridDims(grids, grids, 1); - const dim3 blockDims(blocks, 1, 1); - ResampleImage2D<<>>(warpedImageCuda + t * voxelNumber, *floatingTexture, *deformationFieldTexture, *maskTexture, - floatingMatrix, floatingDim, (unsigned)activeVoxelNumber, paddingValue); - NR_CUDA_CHECK_KERNEL(gridDims, blockDims); - } + auto curWarpedCuda = warpedImageCuda + t * voxelNumber; + auto floatingTexturePtr = Cuda::CreateTextureObject(floatingImageCuda + t * voxelNumber, voxelNumber, cudaChannelFormatKindFloat, 1); + auto floatingTexture = *floatingTexturePtr; + thrust::for_each_n(thrust::device, thrust::make_counting_iterator(0), activeVoxelNumber, [ + curWarpedCuda, floatingTexture, deformationFieldTexture, maskTexture, floatingMatrix, floatingDim, paddingValue + ]__device__(const int index) { + // Get the real world deformation in the floating space + const int voxel = tex1Dfetch(maskTexture, index); + const float4 realDeformation = tex1Dfetch(deformationFieldTexture, index); + + // Get the voxel-based deformation in the floating space and compute the linear interpolation + int3 previous; + double xBasis[2], yBasis[2], zBasis[2]; + TransformInterpolate(floatingMatrix, realDeformation, previous, xBasis, yBasis, zBasis); + + double intensity = 0; + if constexpr (is3d) { + for (char c = 0; c < 2; c++) { + const int z = previous.z + c; + int indexYZ = (z * floatingDim.y + previous.y) * floatingDim.x; + double tempY = 0; + for (char b = 0; b < 2; b++, indexYZ += floatingDim.x) { + const int y = previous.y + b; + int index = indexYZ + previous.x; + double tempX = 0; + for (char a = 0; a < 2; a++, index++) { + const int x = previous.x + a; + if (-1 < x && x < floatingDim.x && -1 < y && y < floatingDim.y && -1 < z && z < floatingDim.z) { + tempX += tex1Dfetch(floatingTexture, index) * xBasis[a]; + } else { + // Padding value + tempX += paddingValue * xBasis[a]; + } + } + tempY += tempX * yBasis[b]; + } + intensity += tempY * zBasis[c]; + } + } else { + int indexY = previous.y * floatingDim.x + previous.x; + for (char b = 0; b < 2; b++, indexY += floatingDim.x) { + const int y = previous.y + b; + int index = indexY; + double tempX = 0; + for (char a = 0; a < 2; a++, index++) { + const int x = previous.x + a; + if (-1 < x && x < floatingDim.x && -1 < y && y < floatingDim.y) { + tempX += tex1Dfetch(floatingTexture, index) * xBasis[a]; + } else { + // Padding value + tempX += paddingValue * xBasis[a]; + } + } + intensity += tempX * yBasis[b]; + } + } + + curWarpedCuda[voxel] = intensity; + }); } } template void ResampleImage(const nifti_image*, const float*, const nifti_image*, float*, const float4*, const int*, const size_t, const int, const float); template void ResampleImage(const nifti_image*, const float*, const nifti_image*, float*, const float4*, const int*, const size_t, const int, const float); /* *************************************************************** */ +template void GetImageGradient(const nifti_image *floatingImage, const float *floatingImageCuda, const float4 *deformationFieldCuda, @@ -73,33 +159,78 @@ void GetImageGradient(const nifti_image *floatingImage, if (interpolation != 1) NR_FATAL_ERROR("Only linear interpolation is supported on the GPU"); - auto blockSize = CudaContext::GetBlockSize(); const size_t voxelNumber = NiftiImage::calcVoxelNumber(floatingImage, 3); const int3 floatingDim = make_int3(floatingImage->nx, floatingImage->ny, floatingImage->nz); if (paddingValue != paddingValue) paddingValue = 0; - auto floatingTexture = Cuda::CreateTextureObject(floatingImageCuda + activeTimePoint * voxelNumber, voxelNumber, cudaChannelFormatKindFloat, 1); - auto deformationFieldTexture = Cuda::CreateTextureObject(deformationFieldCuda, activeVoxelNumber, cudaChannelFormatKindFloat, 4); + auto floatingTexturePtr = Cuda::CreateTextureObject(floatingImageCuda + activeTimePoint * voxelNumber, voxelNumber, cudaChannelFormatKindFloat, 1); + auto deformationFieldTexturePtr = Cuda::CreateTextureObject(deformationFieldCuda, activeVoxelNumber, cudaChannelFormatKindFloat, 4); + auto floatingTexture = *floatingTexturePtr; + auto deformationFieldTexture = *deformationFieldTexturePtr; // Bind the real to voxel matrix to the texture const mat44& floatingMatrix = floatingImage->sform_code > 0 ? floatingImage->sto_ijk : floatingImage->qto_ijk; - if (floatingImage->nz > 1) { - const unsigned blocks = blockSize->reg_getImageGradient3D; - const unsigned grids = (unsigned)Ceil(sqrtf((float)activeVoxelNumber / (float)blocks)); - const dim3 gridDims(grids, grids, 1); - const dim3 blockDims(blocks, 1, 1); - GetImageGradient3D<<>>(warpedGradientCuda, *floatingTexture, *deformationFieldTexture, - floatingMatrix, floatingDim, (unsigned)activeVoxelNumber, paddingValue); - NR_CUDA_CHECK_KERNEL(gridDims, blockDims); - } else { - const unsigned blocks = blockSize->reg_getImageGradient2D; - const unsigned grids = (unsigned)Ceil(sqrtf((float)activeVoxelNumber / (float)blocks)); - const dim3 gridDims(grids, grids, 1); - const dim3 blockDims(blocks, 1, 1); - GetImageGradient2D<<>>(warpedGradientCuda, *floatingTexture, *deformationFieldTexture, - floatingMatrix, floatingDim, (unsigned)activeVoxelNumber, paddingValue); - NR_CUDA_CHECK_KERNEL(gridDims, blockDims); - } + thrust::for_each_n(thrust::device, thrust::make_counting_iterator(0), activeVoxelNumber, [ + warpedGradientCuda, floatingTexture, deformationFieldTexture, floatingMatrix, floatingDim, paddingValue + ]__device__(const int index) { + // Get the real world deformation in the floating space + float4 realDeformation = tex1Dfetch(deformationFieldTexture, index); + + // Get the voxel-based deformation in the floating space and compute the linear interpolation + int3 previous; + float xBasis[2], yBasis[2], zBasis[2]; + TransformInterpolate(floatingMatrix, realDeformation, previous, xBasis, yBasis, zBasis); + constexpr float deriv[] = { -1.0f, 1.0f }; + + float4 gradientValue{}; + if constexpr (is3d) { + for (char c = 0; c < 2; c++) { + const int z = previous.z + c; + int indexYZ = (z * floatingDim.y + previous.y) * floatingDim.x; + float3 tempY{}; + for (char b = 0; b < 2; b++, indexYZ += floatingDim.x) { + const int y = previous.y + b; + int index = indexYZ + previous.x; + float2 tempX{}; + for (char a = 0; a < 2; a++, index++) { + const int x = previous.x + a; + const float intensity = -1 < x && x < floatingDim.x && -1 < y && y < floatingDim.y && -1 < z && z < floatingDim.z ? + tex1Dfetch(floatingTexture, index) : paddingValue; + + tempX.x += intensity * deriv[a]; + tempX.y += intensity * xBasis[a]; + } + tempY.x += tempX.x * yBasis[b]; + tempY.y += tempX.y * deriv[b]; + tempY.z += tempX.y * yBasis[b]; + } + gradientValue.x += tempY.x * zBasis[c]; + gradientValue.y += tempY.y * zBasis[c]; + gradientValue.z += tempY.z * deriv[c]; + } + } else { + int indexY = previous.y * floatingDim.x + previous.x; + for (char b = 0; b < 2; b++, indexY += floatingDim.x) { + const int y = previous.y + b; + int index = indexY; + float2 tempX{}; + for (char a = 0; a < 2; a++, index++) { + const int x = previous.x + a; + const float intensity = -1 < x && x < floatingDim.x && -1 < y && y < floatingDim.y ? + tex1Dfetch(floatingTexture, index) : paddingValue; + + tempX.x += intensity * deriv[a]; + tempX.y += intensity * xBasis[a]; + } + gradientValue.x += tempX.x * yBasis[b]; + gradientValue.y += tempX.y * deriv[b]; + } + } + + warpedGradientCuda[index] = gradientValue; + }); } +template void GetImageGradient(const nifti_image*, const float*, const float4*, float4*, const size_t, const int, float, const int); +template void GetImageGradient(const nifti_image*, const float*, const float4*, float4*, const size_t, const int, float, const int); /* *************************************************************** */ } // namespace NiftyReg::Cuda /* *************************************************************** */ diff --git a/reg-lib/cuda/CudaResampling.hpp b/reg-lib/cuda/CudaResampling.hpp index 6d54dad6..1366ccc7 100644 --- a/reg-lib/cuda/CudaResampling.hpp +++ b/reg-lib/cuda/CudaResampling.hpp @@ -28,6 +28,7 @@ void ResampleImage(const nifti_image *floatingImage, const int interpolation, const float paddingValue); /* *************************************************************** */ +template void GetImageGradient(const nifti_image *floatingImage, const float *floatingImageCuda, const float4 *deformationFieldCuda, diff --git a/reg-lib/cuda/CudaResamplingKernels.cu b/reg-lib/cuda/CudaResamplingKernels.cu deleted file mode 100644 index 868d03f5..00000000 --- a/reg-lib/cuda/CudaResamplingKernels.cu +++ /dev/null @@ -1,258 +0,0 @@ -/* - * CudaResamplingKernels.cu - * - * - * Created by Marc Modat on 24/03/2009. - * Copyright (c) 2009-2018, University College London - * Copyright (c) 2018, NiftyReg Developers. - * All rights reserved. - * See the LICENSE.txt file in the nifty_reg root folder - * - */ - -/* *************************************************************** */ -namespace NiftyReg::Cuda { -/* *************************************************************** */ -template -__inline__ __device__ constexpr void InterpLinearKernel(T relative, T (&basis)[2]) { - if (relative < 0) - relative = 0; // reg_rounding error - basis[1] = relative; - basis[0] = 1.f - relative; -} -/* *************************************************************** */ -__global__ void ResampleImage2D(float *resultArray, - cudaTextureObject_t floatingTexture, - cudaTextureObject_t deformationFieldTexture, - cudaTextureObject_t maskTexture, - const mat44 floatingMatrix, - const int3 floatingDim, - const unsigned activeVoxelNumber, - const float paddingValue) { - const unsigned tid = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x; - if (tid >= activeVoxelNumber) return; - // Get the real world deformation in the floating space - const int tid2 = tex1Dfetch(maskTexture, tid); - const float4 realDeformation = tex1Dfetch(deformationFieldTexture, tid); - - // Get the voxel-based deformation in the floating space - double2 voxelDeformation; - voxelDeformation.x = (double(floatingMatrix.m[0][0]) * double(realDeformation.x) + - double(floatingMatrix.m[0][1]) * double(realDeformation.y) + - double(floatingMatrix.m[0][3])); - voxelDeformation.y = (double(floatingMatrix.m[1][0]) * double(realDeformation.x) + - double(floatingMatrix.m[1][1]) * double(realDeformation.y) + - double(floatingMatrix.m[1][3])); - - // Compute the linear interpolation - const int2 previous = { Floor(voxelDeformation.x), Floor(voxelDeformation.y) }; - const double2 relative = { voxelDeformation.x - previous.x, voxelDeformation.y - previous.y }; - double xBasis[2], yBasis[2]; - InterpLinearKernel(relative.x, xBasis); - InterpLinearKernel(relative.y, yBasis); - - double intensity = 0; - int indexY = previous.y * floatingDim.x + previous.x; - for (char b = 0; b < 2; b++, indexY += floatingDim.x) { - const int y = previous.y + b; - int index = indexY; - double xTempNewValue = 0; - for (char a = 0; a < 2; a++, index++) { - const int x = previous.x + a; - if (-1 < x && x < floatingDim.x && -1 < y && y < floatingDim.y) { - xTempNewValue += tex1Dfetch(floatingTexture, index) * xBasis[a]; - } else { - // Padding value - xTempNewValue += paddingValue * xBasis[a]; - } - } - intensity += xTempNewValue * yBasis[b]; - } - - resultArray[tid2] = intensity; -} -/* *************************************************************** */ -__global__ void ResampleImage3D(float *resultArray, - cudaTextureObject_t floatingTexture, - cudaTextureObject_t deformationFieldTexture, - cudaTextureObject_t maskTexture, - const mat44 floatingMatrix, - const int3 floatingDim, - const unsigned activeVoxelNumber, - const float paddingValue) { - const unsigned tid = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x; - if (tid >= activeVoxelNumber) return; - // Get the real world deformation in the floating space - const int tid2 = tex1Dfetch(maskTexture, tid); - const float4 realDeformation = tex1Dfetch(deformationFieldTexture, tid); - - // Get the voxel-based deformation in the floating space - double3 voxelDeformation; - voxelDeformation.x = (double(floatingMatrix.m[0][0]) * double(realDeformation.x) + - double(floatingMatrix.m[0][1]) * double(realDeformation.y) + - double(floatingMatrix.m[0][2]) * double(realDeformation.z) + - double(floatingMatrix.m[0][3])); - voxelDeformation.y = (double(floatingMatrix.m[1][0]) * double(realDeformation.x) + - double(floatingMatrix.m[1][1]) * double(realDeformation.y) + - double(floatingMatrix.m[1][2]) * double(realDeformation.z) + - double(floatingMatrix.m[1][3])); - voxelDeformation.z = (double(floatingMatrix.m[2][0]) * double(realDeformation.x) + - double(floatingMatrix.m[2][1]) * double(realDeformation.y) + - double(floatingMatrix.m[2][2]) * double(realDeformation.z) + - double(floatingMatrix.m[2][3])); - - // Compute the linear interpolation - const int3 previous = { Floor(voxelDeformation.x), Floor(voxelDeformation.y), Floor(voxelDeformation.z) }; - const double3 relative = { voxelDeformation.x - previous.x, voxelDeformation.y - previous.y, voxelDeformation.z - previous.z }; - double xBasis[2], yBasis[2], zBasis[2]; - InterpLinearKernel(relative.x, xBasis); - InterpLinearKernel(relative.y, yBasis); - InterpLinearKernel(relative.z, zBasis); - - double intensity = 0; - for (char c = 0; c < 2; c++) { - const int z = previous.z + c; - int indexYZ = (z * floatingDim.y + previous.y) * floatingDim.x; - double yTempNewValue = 0; - for (char b = 0; b < 2; b++, indexYZ += floatingDim.x) { - const int y = previous.y + b; - int index = indexYZ + previous.x; - double xTempNewValue = 0; - for (char a = 0; a < 2; a++, index++) { - const int x = previous.x + a; - if (-1 < x && x < floatingDim.x && -1 < y && y < floatingDim.y && -1 < z && z < floatingDim.z) { - xTempNewValue += tex1Dfetch(floatingTexture, index) * xBasis[a]; - } else { - // Padding value - xTempNewValue += paddingValue * xBasis[a]; - } - } - yTempNewValue += xTempNewValue * yBasis[b]; - } - intensity += yTempNewValue * zBasis[c]; - } - - resultArray[tid2] = intensity; -} -/* *************************************************************** */ -__global__ void GetImageGradient2D(float4 *gradientArray, - cudaTextureObject_t floatingTexture, - cudaTextureObject_t deformationFieldTexture, - const mat44 floatingMatrix, - const int3 floatingDim, - const unsigned activeVoxelNumber, - const float paddingValue) { - const unsigned tid = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x; - if (tid >= activeVoxelNumber) return; - // Get the real world deformation in the floating space - float4 realDeformation = tex1Dfetch(deformationFieldTexture, tid); - - // Get the voxel-based deformation in the floating space - float2 voxelDeformation; - voxelDeformation.x = (floatingMatrix.m[0][0] * realDeformation.x + - floatingMatrix.m[0][1] * realDeformation.y + - floatingMatrix.m[0][3]); - voxelDeformation.y = (floatingMatrix.m[1][0] * realDeformation.x + - floatingMatrix.m[1][1] * realDeformation.y + - floatingMatrix.m[1][3]); - - // Compute the gradient - const int2 previous = { Floor(voxelDeformation.x), Floor(voxelDeformation.y) }; - float xBasis[2], yBasis[2]; - const float2 relative = { voxelDeformation.x - previous.x, voxelDeformation.y - previous.y }; - InterpLinearKernel(relative.x, xBasis); - InterpLinearKernel(relative.y, yBasis); - constexpr float deriv[] = { -1.0f, 1.0f }; - - float4 gradientValue{}; - int indexY = previous.y * floatingDim.x + previous.x; - for (char b = 0; b < 2; b++, indexY += floatingDim.x) { - const int y = previous.y + b; - int index = indexY; - float2 tempValueX{}; - for (char a = 0; a < 2; a++, index++) { - const int x = previous.x + a; - float intensity = paddingValue; - - if (-1 < x && x < floatingDim.x && -1 < y && y < floatingDim.y) - intensity = tex1Dfetch(floatingTexture, index); - - tempValueX.x += intensity * deriv[a]; - tempValueX.y += intensity * xBasis[a]; - } - gradientValue.x += tempValueX.x * yBasis[b]; - gradientValue.y += tempValueX.y * deriv[b]; - } - - gradientArray[tid] = gradientValue; -} -/* *************************************************************** */ -__global__ void GetImageGradient3D(float4 *gradientArray, - cudaTextureObject_t floatingTexture, - cudaTextureObject_t deformationFieldTexture, - const mat44 floatingMatrix, - const int3 floatingDim, - const unsigned activeVoxelNumber, - const float paddingValue) { - const unsigned tid = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x; - if (tid >= activeVoxelNumber) return; - // Get the real world deformation in the floating space - float4 realDeformation = tex1Dfetch(deformationFieldTexture, tid); - - // Get the voxel-based deformation in the floating space - float3 voxelDeformation; - voxelDeformation.x = (floatingMatrix.m[0][0] * realDeformation.x + - floatingMatrix.m[0][1] * realDeformation.y + - floatingMatrix.m[0][2] * realDeformation.z + - floatingMatrix.m[0][3]); - voxelDeformation.y = (floatingMatrix.m[1][0] * realDeformation.x + - floatingMatrix.m[1][1] * realDeformation.y + - floatingMatrix.m[1][2] * realDeformation.z + - floatingMatrix.m[1][3]); - voxelDeformation.z = (floatingMatrix.m[2][0] * realDeformation.x + - floatingMatrix.m[2][1] * realDeformation.y + - floatingMatrix.m[2][2] * realDeformation.z + - floatingMatrix.m[2][3]); - - // Compute the gradient - const int3 previous = { Floor(voxelDeformation.x), Floor(voxelDeformation.y), Floor(voxelDeformation.z) }; - float xBasis[2], yBasis[2], zBasis[2]; - const float3 relative = { voxelDeformation.x - previous.x, voxelDeformation.y - previous.y, voxelDeformation.z - previous.z }; - InterpLinearKernel(relative.x, xBasis); - InterpLinearKernel(relative.y, yBasis); - InterpLinearKernel(relative.z, zBasis); - constexpr float deriv[] = { -1.0f, 1.0f }; - - float4 gradientValue{}; - for (char c = 0; c < 2; c++) { - const int z = previous.z + c; - int indexYZ = (z * floatingDim.y + previous.y) * floatingDim.x; - float3 tempValueY{}; - for (char b = 0; b < 2; b++, indexYZ += floatingDim.x) { - const int y = previous.y + b; - int index = indexYZ + previous.x; - float2 tempValueX{}; - for (char a = 0; a < 2; a++, index++) { - const int x = previous.x + a; - float intensity = paddingValue; - - if (-1 < x && x < floatingDim.x && -1 < y && y < floatingDim.y && -1 < z && z < floatingDim.z) - intensity = tex1Dfetch(floatingTexture, index); - - tempValueX.x += intensity * deriv[a]; - tempValueX.y += intensity * xBasis[a]; - } - tempValueY.x += tempValueX.x * yBasis[b]; - tempValueY.y += tempValueX.y * deriv[b]; - tempValueY.z += tempValueX.y * yBasis[b]; - } - gradientValue.x += tempValueY.x * zBasis[c]; - gradientValue.y += tempValueY.y * zBasis[c]; - gradientValue.z += tempValueY.z * deriv[c]; - } - - gradientArray[tid] = gradientValue; -} -/* *************************************************************** */ -} // namespace NiftyReg::Cuda -/* *************************************************************** */