Skip to content

Commit

Permalink
Optimise CudaResampling #92
Browse files Browse the repository at this point in the history
  • Loading branch information
onurulgen committed Nov 28, 2023
1 parent b9c9bec commit 708106f
Show file tree
Hide file tree
Showing 6 changed files with 185 additions and 321 deletions.
2 changes: 1 addition & 1 deletion niftyreg_build_version.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
372
373
12 changes: 0 additions & 12 deletions reg-lib/cuda/BlockSize.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,10 +38,6 @@ struct BlockSize {
unsigned reg_ApplyConvolutionWindowAlongY;
unsigned reg_ApplyConvolutionWindowAlongZ;
unsigned Arithmetic;
unsigned reg_resampleImage2D;
unsigned reg_resampleImage3D;
unsigned reg_getImageGradient2D;
unsigned reg_getImageGradient3D;
};
/* *************************************************************** */
struct BlockSize100: public BlockSize {
Expand Down Expand Up @@ -70,10 +66,6 @@ struct BlockSize100: public BlockSize {
reg_ApplyConvolutionWindowAlongY = 512; // 14 reg - 28 smem - 08 cmem
reg_ApplyConvolutionWindowAlongZ = 512; // 15 reg - 28 smem - 08 cmem
Arithmetic = 384; // 5 reg - 24 smem
reg_resampleImage2D = 320; // 10 reg - 24 smem - 12 cmem
reg_resampleImage3D = 512; // 16 reg - 24 smem - 12 cmem
reg_getImageGradient2D = 512; // 16 reg - 24 smem - 20 cmem - 24 lmem
reg_getImageGradient3D = 320; // 24 reg - 24 smem - 16 cmem - 32 lmem
NR_FUNC_CALLED();
}
};
Expand Down Expand Up @@ -104,10 +96,6 @@ struct BlockSize300: public BlockSize {
reg_ApplyConvolutionWindowAlongY = 1024; // 25 reg
reg_ApplyConvolutionWindowAlongZ = 1024; // 25 reg
Arithmetic = 1024; //
reg_resampleImage2D = 1024; // 23 reg
reg_resampleImage3D = 1024; // 24 reg
reg_getImageGradient2D = 1024; // 34 reg
reg_getImageGradient3D = 1024; // 34 reg
NR_FUNC_CALLED();
}
};
Expand Down
18 changes: 10 additions & 8 deletions reg-lib/cuda/CudaCompute.cu
Original file line number Diff line number Diff line change
Expand Up @@ -175,14 +175,16 @@ void CudaCompute::UpdateControlPointPosition(float *currentDof,
/* *************************************************************** */
void CudaCompute::GetImageGradient(int interpolation, float paddingValue, int activeTimePoint) {
CudaDefContent& con = dynamic_cast<CudaDefContent&>(this->con);
Cuda::GetImageGradient(con.Content::GetFloating(),
con.GetFloatingCuda(),
con.GetDeformationFieldCuda(),
con.GetWarpedGradientCuda(),
con.GetActiveVoxelNumber(),
interpolation,
paddingValue,
activeTimePoint);
const nifti_image *floating = con.Content::GetFloating();
auto getImageGradient = floating->nz > 1 ? Cuda::GetImageGradient<true> : Cuda::GetImageGradient<false>;
getImageGradient(floating,
con.GetFloatingCuda(),
con.GetDeformationFieldCuda(),
con.GetWarpedGradientCuda(),
con.GetActiveVoxelNumber(),
interpolation,
paddingValue,
activeTimePoint);
}
/* *************************************************************** */
double CudaCompute::GetMaximalLength(bool optimiseX, bool optimiseY, bool optimiseZ) {
Expand Down
215 changes: 173 additions & 42 deletions reg-lib/cuda/CudaResampling.cu
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,54 @@
*/

#include "CudaResampling.hpp"
#include "CudaResamplingKernels.cu"

/* *************************************************************** */
namespace NiftyReg::Cuda {
/* *************************************************************** */
template<typename T>
__inline__ __device__ void InterpLinearKernel(T relative, T (&basis)[2]) {
basis[1] = relative;
basis[0] = 1.f - relative;
}
/* *************************************************************** */
template<typename T, bool is3d>
__inline__ __device__ void TransformInterpolate(const mat44 matrix, const float4 realDeformation, int3& previous,
T (&xBasis)[2], T (&yBasis)[2], T (&zBasis)[2]) {
// Get the voxel-based deformation
T voxelDeformation[is3d ? 3 : 2];
if constexpr (is3d) {
voxelDeformation[0] = (static_cast<T>(matrix.m[0][0]) * static_cast<T>(realDeformation.x) +
static_cast<T>(matrix.m[0][1]) * static_cast<T>(realDeformation.y) +
static_cast<T>(matrix.m[0][2]) * static_cast<T>(realDeformation.z) +
static_cast<T>(matrix.m[0][3]));
voxelDeformation[1] = (static_cast<T>(matrix.m[1][0]) * static_cast<T>(realDeformation.x) +
static_cast<T>(matrix.m[1][1]) * static_cast<T>(realDeformation.y) +
static_cast<T>(matrix.m[1][2]) * static_cast<T>(realDeformation.z) +
static_cast<T>(matrix.m[1][3]));
voxelDeformation[2] = (static_cast<T>(matrix.m[2][0]) * static_cast<T>(realDeformation.x) +
static_cast<T>(matrix.m[2][1]) * static_cast<T>(realDeformation.y) +
static_cast<T>(matrix.m[2][2]) * static_cast<T>(realDeformation.z) +
static_cast<T>(matrix.m[2][3]));
} else {
voxelDeformation[0] = (static_cast<T>(matrix.m[0][0]) * static_cast<T>(realDeformation.x) +
static_cast<T>(matrix.m[0][1]) * static_cast<T>(realDeformation.y) +
static_cast<T>(matrix.m[0][3]));
voxelDeformation[1] = (static_cast<T>(matrix.m[1][0]) * static_cast<T>(realDeformation.x) +
static_cast<T>(matrix.m[1][1]) * static_cast<T>(realDeformation.y) +
static_cast<T>(matrix.m[1][3]));
}

// Compute the linear interpolation
previous.x = Floor(voxelDeformation[0]);
previous.y = Floor(voxelDeformation[1]);
InterpLinearKernel(voxelDeformation[0] - static_cast<T>(previous.x), xBasis);
InterpLinearKernel(voxelDeformation[1] - static_cast<T>(previous.y), yBasis);
if constexpr (is3d) {
previous.z = Floor(voxelDeformation[2]);
InterpLinearKernel(voxelDeformation[2] - static_cast<T>(previous.z), zBasis);
}
}
/* *************************************************************** */
template<bool is3d>
void ResampleImage(const nifti_image *floatingImage,
const float *floatingImageCuda,
Expand All @@ -29,39 +72,82 @@ void ResampleImage(const nifti_image *floatingImage,
if (interpolation != 1)
NR_FATAL_ERROR("Only linear interpolation is supported on the GPU");

auto blockSize = CudaContext::GetBlockSize();
const size_t voxelNumber = NiftiImage::calcVoxelNumber(floatingImage, 3);
const int3 floatingDim = make_int3(floatingImage->nx, floatingImage->ny, floatingImage->nz);
auto deformationFieldTexture = Cuda::CreateTextureObject(deformationFieldCuda, activeVoxelNumber, cudaChannelFormatKindFloat, 4);
auto maskTexture = Cuda::CreateTextureObject(maskCuda, activeVoxelNumber, cudaChannelFormatKindSigned, 1);
auto deformationFieldTexturePtr = Cuda::CreateTextureObject(deformationFieldCuda, activeVoxelNumber, cudaChannelFormatKindFloat, 4);
auto maskTexturePtr = Cuda::CreateTextureObject(maskCuda, activeVoxelNumber, cudaChannelFormatKindSigned, 1);
auto deformationFieldTexture = *deformationFieldTexturePtr;
auto maskTexture = *maskTexturePtr;
// Bind the real to voxel matrix to the texture
const mat44& floatingMatrix = floatingImage->sform_code > 0 ? floatingImage->sto_ijk : floatingImage->qto_ijk;

for (int t = 0; t < warpedImage->nt * warpedImage->nu; t++) {
NR_DEBUG((is3d ? "3" : "2") << "D resampling of volume number " << t);
auto floatingTexture = Cuda::CreateTextureObject(floatingImageCuda + t * voxelNumber, voxelNumber, cudaChannelFormatKindFloat, 1);
if constexpr (is3d) {
const unsigned blocks = blockSize->reg_resampleImage3D;
const unsigned grids = (unsigned)Ceil(sqrtf((float)activeVoxelNumber / (float)blocks));
const dim3 gridDims(grids, grids, 1);
const dim3 blockDims(blocks, 1, 1);
ResampleImage3D<<<gridDims, blockDims>>>(warpedImageCuda + t * voxelNumber, *floatingTexture, *deformationFieldTexture, *maskTexture,
floatingMatrix, floatingDim, (unsigned)activeVoxelNumber, paddingValue);
NR_CUDA_CHECK_KERNEL(gridDims, blockDims);
} else {
const unsigned blocks = blockSize->reg_resampleImage2D;
const unsigned grids = (unsigned)Ceil(sqrtf((float)activeVoxelNumber / (float)blocks));
const dim3 gridDims(grids, grids, 1);
const dim3 blockDims(blocks, 1, 1);
ResampleImage2D<<<gridDims, blockDims>>>(warpedImageCuda + t * voxelNumber, *floatingTexture, *deformationFieldTexture, *maskTexture,
floatingMatrix, floatingDim, (unsigned)activeVoxelNumber, paddingValue);
NR_CUDA_CHECK_KERNEL(gridDims, blockDims);
}
auto curWarpedCuda = warpedImageCuda + t * voxelNumber;
auto floatingTexturePtr = Cuda::CreateTextureObject(floatingImageCuda + t * voxelNumber, voxelNumber, cudaChannelFormatKindFloat, 1);
auto floatingTexture = *floatingTexturePtr;
thrust::for_each_n(thrust::device, thrust::make_counting_iterator(0), activeVoxelNumber, [
curWarpedCuda, floatingTexture, deformationFieldTexture, maskTexture, floatingMatrix, floatingDim, paddingValue
]__device__(const int index) {
// Get the real world deformation in the floating space
const int voxel = tex1Dfetch<int>(maskTexture, index);
const float4 realDeformation = tex1Dfetch<float4>(deformationFieldTexture, index);

// Get the voxel-based deformation in the floating space and compute the linear interpolation
int3 previous;
double xBasis[2], yBasis[2], zBasis[2];
TransformInterpolate<double, is3d>(floatingMatrix, realDeformation, previous, xBasis, yBasis, zBasis);

double intensity = 0;
if constexpr (is3d) {
for (char c = 0; c < 2; c++) {
const int z = previous.z + c;
int indexYZ = (z * floatingDim.y + previous.y) * floatingDim.x;
double tempY = 0;
for (char b = 0; b < 2; b++, indexYZ += floatingDim.x) {
const int y = previous.y + b;
int index = indexYZ + previous.x;
double tempX = 0;
for (char a = 0; a < 2; a++, index++) {
const int x = previous.x + a;
if (-1 < x && x < floatingDim.x && -1 < y && y < floatingDim.y && -1 < z && z < floatingDim.z) {
tempX += tex1Dfetch<float>(floatingTexture, index) * xBasis[a];
} else {
// Padding value
tempX += paddingValue * xBasis[a];
}
}
tempY += tempX * yBasis[b];
}
intensity += tempY * zBasis[c];
}
} else {
int indexY = previous.y * floatingDim.x + previous.x;
for (char b = 0; b < 2; b++, indexY += floatingDim.x) {
const int y = previous.y + b;
int index = indexY;
double tempX = 0;
for (char a = 0; a < 2; a++, index++) {
const int x = previous.x + a;
if (-1 < x && x < floatingDim.x && -1 < y && y < floatingDim.y) {
tempX += tex1Dfetch<float>(floatingTexture, index) * xBasis[a];
} else {
// Padding value
tempX += paddingValue * xBasis[a];
}
}
intensity += tempX * yBasis[b];
}
}

curWarpedCuda[voxel] = intensity;
});
}
}
template void ResampleImage<false>(const nifti_image*, const float*, const nifti_image*, float*, const float4*, const int*, const size_t, const int, const float);
template void ResampleImage<true>(const nifti_image*, const float*, const nifti_image*, float*, const float4*, const int*, const size_t, const int, const float);
/* *************************************************************** */
template<bool is3d>
void GetImageGradient(const nifti_image *floatingImage,
const float *floatingImageCuda,
const float4 *deformationFieldCuda,
Expand All @@ -73,33 +159,78 @@ void GetImageGradient(const nifti_image *floatingImage,
if (interpolation != 1)
NR_FATAL_ERROR("Only linear interpolation is supported on the GPU");

auto blockSize = CudaContext::GetBlockSize();
const size_t voxelNumber = NiftiImage::calcVoxelNumber(floatingImage, 3);
const int3 floatingDim = make_int3(floatingImage->nx, floatingImage->ny, floatingImage->nz);
if (paddingValue != paddingValue) paddingValue = 0;
auto floatingTexture = Cuda::CreateTextureObject(floatingImageCuda + activeTimePoint * voxelNumber, voxelNumber, cudaChannelFormatKindFloat, 1);
auto deformationFieldTexture = Cuda::CreateTextureObject(deformationFieldCuda, activeVoxelNumber, cudaChannelFormatKindFloat, 4);
auto floatingTexturePtr = Cuda::CreateTextureObject(floatingImageCuda + activeTimePoint * voxelNumber, voxelNumber, cudaChannelFormatKindFloat, 1);
auto deformationFieldTexturePtr = Cuda::CreateTextureObject(deformationFieldCuda, activeVoxelNumber, cudaChannelFormatKindFloat, 4);
auto floatingTexture = *floatingTexturePtr;
auto deformationFieldTexture = *deformationFieldTexturePtr;
// Bind the real to voxel matrix to the texture
const mat44& floatingMatrix = floatingImage->sform_code > 0 ? floatingImage->sto_ijk : floatingImage->qto_ijk;

if (floatingImage->nz > 1) {
const unsigned blocks = blockSize->reg_getImageGradient3D;
const unsigned grids = (unsigned)Ceil(sqrtf((float)activeVoxelNumber / (float)blocks));
const dim3 gridDims(grids, grids, 1);
const dim3 blockDims(blocks, 1, 1);
GetImageGradient3D<<<gridDims, blockDims>>>(warpedGradientCuda, *floatingTexture, *deformationFieldTexture,
floatingMatrix, floatingDim, (unsigned)activeVoxelNumber, paddingValue);
NR_CUDA_CHECK_KERNEL(gridDims, blockDims);
} else {
const unsigned blocks = blockSize->reg_getImageGradient2D;
const unsigned grids = (unsigned)Ceil(sqrtf((float)activeVoxelNumber / (float)blocks));
const dim3 gridDims(grids, grids, 1);
const dim3 blockDims(blocks, 1, 1);
GetImageGradient2D<<<gridDims, blockDims>>>(warpedGradientCuda, *floatingTexture, *deformationFieldTexture,
floatingMatrix, floatingDim, (unsigned)activeVoxelNumber, paddingValue);
NR_CUDA_CHECK_KERNEL(gridDims, blockDims);
}
thrust::for_each_n(thrust::device, thrust::make_counting_iterator(0), activeVoxelNumber, [
warpedGradientCuda, floatingTexture, deformationFieldTexture, floatingMatrix, floatingDim, paddingValue
]__device__(const int index) {
// Get the real world deformation in the floating space
float4 realDeformation = tex1Dfetch<float4>(deformationFieldTexture, index);

// Get the voxel-based deformation in the floating space and compute the linear interpolation
int3 previous;
float xBasis[2], yBasis[2], zBasis[2];
TransformInterpolate<float, is3d>(floatingMatrix, realDeformation, previous, xBasis, yBasis, zBasis);
constexpr float deriv[] = { -1.0f, 1.0f };

float4 gradientValue{};
if constexpr (is3d) {
for (char c = 0; c < 2; c++) {
const int z = previous.z + c;
int indexYZ = (z * floatingDim.y + previous.y) * floatingDim.x;
float3 tempY{};
for (char b = 0; b < 2; b++, indexYZ += floatingDim.x) {
const int y = previous.y + b;
int index = indexYZ + previous.x;
float2 tempX{};
for (char a = 0; a < 2; a++, index++) {
const int x = previous.x + a;
const float intensity = -1 < x && x < floatingDim.x && -1 < y && y < floatingDim.y && -1 < z && z < floatingDim.z ?
tex1Dfetch<float>(floatingTexture, index) : paddingValue;

tempX.x += intensity * deriv[a];
tempX.y += intensity * xBasis[a];
}
tempY.x += tempX.x * yBasis[b];
tempY.y += tempX.y * deriv[b];
tempY.z += tempX.y * yBasis[b];
}
gradientValue.x += tempY.x * zBasis[c];
gradientValue.y += tempY.y * zBasis[c];
gradientValue.z += tempY.z * deriv[c];
}
} else {
int indexY = previous.y * floatingDim.x + previous.x;
for (char b = 0; b < 2; b++, indexY += floatingDim.x) {
const int y = previous.y + b;
int index = indexY;
float2 tempX{};
for (char a = 0; a < 2; a++, index++) {
const int x = previous.x + a;
const float intensity = -1 < x && x < floatingDim.x && -1 < y && y < floatingDim.y ?
tex1Dfetch<float>(floatingTexture, index) : paddingValue;

tempX.x += intensity * deriv[a];
tempX.y += intensity * xBasis[a];
}
gradientValue.x += tempX.x * yBasis[b];
gradientValue.y += tempX.y * deriv[b];
}
}

warpedGradientCuda[index] = gradientValue;
});
}
template void GetImageGradient<false>(const nifti_image*, const float*, const float4*, float4*, const size_t, const int, float, const int);
template void GetImageGradient<true>(const nifti_image*, const float*, const float4*, float4*, const size_t, const int, float, const int);
/* *************************************************************** */
} // namespace NiftyReg::Cuda
/* *************************************************************** */
1 change: 1 addition & 0 deletions reg-lib/cuda/CudaResampling.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ void ResampleImage(const nifti_image *floatingImage,
const int interpolation,
const float paddingValue);
/* *************************************************************** */
template<bool is3d>
void GetImageGradient(const nifti_image *floatingImage,
const float *floatingImageCuda,
const float4 *deformationFieldCuda,
Expand Down
Loading

0 comments on commit 708106f

Please sign in to comment.