diff --git a/niftyreg_build_version.txt b/niftyreg_build_version.txt index c9693eb7..fe2cd8b0 100644 --- a/niftyreg_build_version.txt +++ b/niftyreg_build_version.txt @@ -1 +1 @@ -342 +343 diff --git a/reg-lib/cpu/_reg_localTrans.cpp b/reg-lib/cpu/_reg_localTrans.cpp index 685ab580..2dac9946 100755 --- a/reg-lib/cpu/_reg_localTrans.cpp +++ b/reg-lib/cpu/_reg_localTrans.cpp @@ -563,7 +563,7 @@ void reg_cubic_spline_getDeformationField2D(nifti_image *splineControlPoint, } val; __m128 tempCurrent, tempX, tempY; #ifdef _WIN32 - __declspec(align(16)) DataType temp[4]; + __declspec(align(16)) DataType xBasis[4]; __declspec(align(16)) DataType yBasis[4]; union { __m128 m[16]; @@ -578,7 +578,7 @@ void reg_cubic_spline_getDeformationField2D(nifti_image *splineControlPoint, __declspec(align(16)) DataType f[16]; } xyBasis; #else // _WIN32 - DataType temp[4] __attribute__((aligned(16))); + DataType xBasis[4] __attribute__((aligned(16))); DataType yBasis[4] __attribute__((aligned(16))); union { __m128 m[16]; @@ -594,7 +594,7 @@ void reg_cubic_spline_getDeformationField2D(nifti_image *splineControlPoint, } xyBasis; #endif // _WIN32 #else // _USE_SSE - DataType temp[4]; + DataType xBasis[4]; DataType yBasis[4]; DataType xyBasis[16]; DataType xControlPointCoordinates[16]; @@ -626,7 +626,6 @@ void reg_cubic_spline_getDeformationField2D(nifti_image *splineControlPoint, index = y * deformationField->nx; oldXpre = oldYpre = 99999999; for (x = 0; x < deformationField->nx; x++) { - // The previous position at the current pixel position is read xReal = static_cast(fieldPtrX[index]); yReal = static_cast(fieldPtrY[index]); @@ -643,8 +642,8 @@ void reg_cubic_spline_getDeformationField2D(nifti_image *splineControlPoint, xPre = Floor(xVoxel); basis = xVoxel - static_cast(xPre--); if (basis < 0) basis = 0; //rounding error - if (bspline) get_BSplineBasisValues(basis, temp); - else get_SplineBasisValues(basis, temp); + if (bspline) get_BSplineBasisValues(basis, xBasis); + else get_SplineBasisValues(basis, xBasis); yPre = Floor(yVoxel); basis = yVoxel - static_cast(yPre--); @@ -688,7 +687,7 @@ void reg_cubic_spline_getDeformationField2D(nifti_image *splineControlPoint, coord = 0; for (b = 0; b < 4; b++) { for (a = 0; a < 4; a++) { - xyBasis.f[coord++] = temp[a] * yBasis[b]; + xyBasis.f[coord++] = xBasis[a] * yBasis[b]; } } @@ -707,7 +706,7 @@ void reg_cubic_spline_getDeformationField2D(nifti_image *splineControlPoint, #else for (b = 0; b < 4; b++) { for (a = 0; a < 4; a++) { - DataType tempValue = temp[a] * yBasis[b]; + DataType tempValue = xBasis[a] * yBasis[b]; xReal += xControlPointCoordinates[b * 4 + a] * tempValue; yReal += yControlPointCoordinates[b * 4 + a] * tempValue; } @@ -728,14 +727,14 @@ void reg_cubic_spline_getDeformationField2D(nifti_image *splineControlPoint, shared(deformationField, gridVoxelSpacing, splineControlPoint, controlPointPtrX, \ controlPointPtrY, mask, fieldPtrX, fieldPtrY, bspline) \ private(x, a, xPre, yPre, oldXpre, oldYpre, index, xReal, yReal, basis, \ - val, temp, yBasis, tempCurrent, xyBasis, tempX, tempY, \ + val, xBasis, yBasis, tempCurrent, xyBasis, tempX, tempY, \ xControlPointCoordinates, yControlPointCoordinates) #else // _USE_SSE #pragma omp parallel for default(none) \ shared(deformationField, gridVoxelSpacing, splineControlPoint, controlPointPtrX, \ controlPointPtrY, mask, fieldPtrX, fieldPtrY, bspline) \ private(x, a, xPre, yPre, oldXpre, oldYpre, index, xReal, yReal, basis, coord, \ - temp, yBasis, xyBasis, xControlPointCoordinates, yControlPointCoordinates) + xBasis, yBasis, xyBasis, xControlPointCoordinates, yControlPointCoordinates) #endif // _USE_SEE #endif // _OPENMP for (y = 0; y < deformationField->ny; y++) { @@ -744,21 +743,21 @@ void reg_cubic_spline_getDeformationField2D(nifti_image *splineControlPoint, yPre = static_cast(static_cast(y) / gridVoxelSpacing[1]); basis = static_cast(y) / gridVoxelSpacing[1] - static_cast(yPre); - if (basis < 0) basis = 0; //rounding error + if (basis < 0) basis = 0; // rounding error if (bspline) get_BSplineBasisValues(basis, yBasis); else get_SplineBasisValues(basis, yBasis); for (x = 0; x < deformationField->nx; x++) { xPre = static_cast(static_cast(x) / gridVoxelSpacing[0]); basis = static_cast(x) / gridVoxelSpacing[0] - static_cast(xPre); - if (basis < 0) basis = 0; //rounding error - if (bspline) get_BSplineBasisValues(basis, temp); - else get_SplineBasisValues(basis, temp); + if (basis < 0) basis = 0; // rounding error + if (bspline) get_BSplineBasisValues(basis, xBasis); + else get_SplineBasisValues(basis, xBasis); #if _USE_SSE - val.f[0] = static_cast(temp[0]); - val.f[1] = static_cast(temp[1]); - val.f[2] = static_cast(temp[2]); - val.f[3] = static_cast(temp[3]); + val.f[0] = static_cast(xBasis[0]); + val.f[1] = static_cast(xBasis[1]); + val.f[2] = static_cast(xBasis[2]); + val.f[3] = static_cast(xBasis[3]); tempCurrent = val.m; for (a = 0; a < 4; a++) { val.m = _mm_set_ps1(static_cast(yBasis[a])); @@ -767,10 +766,10 @@ void reg_cubic_spline_getDeformationField2D(nifti_image *splineControlPoint, #else coord = 0; for (a = 0; a < 4; a++) { - xyBasis[coord++] = temp[0] * yBasis[a]; - xyBasis[coord++] = temp[1] * yBasis[a]; - xyBasis[coord++] = temp[2] * yBasis[a]; - xyBasis[coord++] = temp[3] * yBasis[a]; + xyBasis[coord++] = xBasis[0] * yBasis[a]; + xyBasis[coord++] = xBasis[1] * yBasis[a]; + xyBasis[coord++] = xBasis[2] * yBasis[a]; + xyBasis[coord++] = xBasis[3] * yBasis[a]; } #endif if (oldXpre != xPre || oldYpre != yPre) { @@ -837,7 +836,7 @@ void reg_cubic_spline_getDeformationField3D(nifti_image *splineControlPoint, int *mask, bool composition, bool bspline, - bool force_no_lut = false) { + bool forceNoLut = false) { #if _USE_SSE union { __m128 m; @@ -1111,7 +1110,7 @@ void reg_cubic_spline_getDeformationField3D(nifti_image *splineControlPoint, #endif // _USE_SSE // Assess if lookup table can be used - if (gridVoxelSpacing[0] == 5. && gridVoxelSpacing[0] == 5. && gridVoxelSpacing[0] == 5. && force_no_lut == false) { + if (gridVoxelSpacing[0] == 5. && gridVoxelSpacing[0] == 5. && gridVoxelSpacing[0] == 5. && forceNoLut == false) { // Assign a single array that will contain all coefficients DataType *coefficients = (DataType*)malloc(125 * 64 * sizeof(DataType)); // Compute and store all required coefficients @@ -1462,7 +1461,7 @@ void reg_spline_getDeformationField(nifti_image *splineControlPoint, int *mask, bool composition, bool bspline, - bool force_no_lut) { + bool forceNoLut) { if (splineControlPoint->datatype != deformationField->datatype) NR_FATAL_ERROR("The spline control point image and the deformation field image are expected to be of the same type"); @@ -1471,11 +1470,11 @@ void reg_spline_getDeformationField(nifti_image *splineControlPoint, NR_FATAL_ERROR("SSE computation has only been implemented for single precision"); #endif - bool MrPropre = false; - if (mask == nullptr) { + unique_ptr currentMask; + if (!mask) { // Active voxel are all superior to -1, 0 thus will do ! - MrPropre = true; - mask = (int*)calloc(NiftiImage::calcVoxelNumber(deformationField, 3), sizeof(int)); + currentMask.reset(new int[NiftiImage::calcVoxelNumber(deformationField, 3)]()); + mask = currentMask.get(); } // Check if an affine initialisation is required @@ -1519,10 +1518,10 @@ void reg_spline_getDeformationField(nifti_image *splineControlPoint, } else { switch (deformationField->datatype) { case NIFTI_TYPE_FLOAT32: - reg_cubic_spline_getDeformationField3D(splineControlPoint, deformationField, mask, composition, bspline, force_no_lut); + reg_cubic_spline_getDeformationField3D(splineControlPoint, deformationField, mask, composition, bspline, forceNoLut); break; case NIFTI_TYPE_FLOAT64: - reg_cubic_spline_getDeformationField3D(splineControlPoint, deformationField, mask, composition, bspline, force_no_lut); + reg_cubic_spline_getDeformationField3D(splineControlPoint, deformationField, mask, composition, bspline, forceNoLut); break; default: NR_FATAL_ERROR("Only single or double precision is implemented for deformation field"); @@ -1534,12 +1533,10 @@ void reg_spline_getDeformationField(nifti_image *splineControlPoint, if (splineControlPoint->ext_list[1].edata != nullptr) { reg_affine_getDeformationField(reinterpret_cast(splineControlPoint->ext_list[1].edata), deformationField, - true, //composition + true, // composition mask); } } - if (MrPropre) - free(mask); } /* *************************************************************** */ template @@ -3497,7 +3494,7 @@ void reg_spline_getFlowFieldFromVelocityGrid(nifti_image *velocityFieldGrid, flowField->intent_p1 = DISP_VEL_FIELD; reg_getDeformationFromDisplacement(flowField); - // fake the number of extension here to avoid the second half of the affine + // Fake the number of extension here to avoid the second half of the affine int oldNumExt = velocityFieldGrid->num_ext; if (oldNumExt > 1) velocityFieldGrid->num_ext = 1; @@ -3508,7 +3505,7 @@ void reg_spline_getFlowFieldFromVelocityGrid(nifti_image *velocityFieldGrid, reg_spline_getDeformationField(velocityFieldGrid, flowField, nullptr, // mask - true, //composition + true, // composition true); // bspline velocityFieldGrid->num_ext = oldNumExt; diff --git a/reg-lib/cpu/_reg_localTrans.h b/reg-lib/cpu/_reg_localTrans.h index ad1f0daf..ad6f930d 100755 --- a/reg-lib/cpu/_reg_localTrans.h +++ b/reg-lib/cpu/_reg_localTrans.h @@ -73,7 +73,7 @@ void reg_spline_getDeformationField(nifti_image *controlPointGridImage, int *mask = nullptr, bool composition = false, bool bspline = true, - bool force_no_lut = false); + bool forceNoLut = false); /* *************************************************************** */ /** @brief Upsample an image from voxel space to node space using * millimetre correspondences. diff --git a/reg-lib/cpu/_reg_splineBasis.cpp b/reg-lib/cpu/_reg_splineBasis.cpp index 6565cb83..244bf4c0 100755 --- a/reg-lib/cpu/_reg_splineBasis.cpp +++ b/reg-lib/cpu/_reg_splineBasis.cpp @@ -460,36 +460,34 @@ template void set_second_order_bspline_basis_values(double*, double*, do template void get_SlidedValues(DataType& defX, DataType& defY, - const int X, - const int Y, + const int x, + const int y, const DataType *defPtrX, const DataType *defPtrY, - const mat44 *df_voxel2Real, + const mat44 *dfVoxel2Real, const int *dim, const bool displacement) { - int newX = X; - int newY = Y; - if (X < 0) { + int newX = x; + if (x < 0) newX = 0; - } else if (X >= dim[1]) { + else if (x >= dim[1]) newX = dim[1] - 1; - } - if (Y < 0) { + + int newY = y; + if (y < 0) newY = 0; - } else if (Y >= dim[2]) { + else if (y >= dim[2]) newY = dim[2] - 1; - } + DataType shiftValueX = 0; DataType shiftValueY = 0; if (!displacement) { - int shiftIndexX = X - newX; - int shiftIndexY = Y - newY; - shiftValueX = shiftIndexX * df_voxel2Real->m[0][0] + - shiftIndexY * df_voxel2Real->m[0][1]; - shiftValueY = shiftIndexX * df_voxel2Real->m[1][0] + - shiftIndexY * df_voxel2Real->m[1][1]; + const int shiftIndexX = x - newX; + const int shiftIndexY = y - newY; + shiftValueX = shiftIndexX * dfVoxel2Real->m[0][0] + shiftIndexY * dfVoxel2Real->m[0][1]; + shiftValueY = shiftIndexX * dfVoxel2Real->m[1][0] + shiftIndexY * dfVoxel2Real->m[1][1]; } - size_t index = newY * dim[1] + newX; + const int index = newY * dim[1] + newX; defX = defPtrX[index] + shiftValueX; defY = defPtrY[index] + shiftValueY; } @@ -500,54 +498,54 @@ template void get_SlidedValues(DataType& defX, DataType& defY, DataType& defZ, - const int X, - const int Y, - const int Z, + const int x, + const int y, + const int z, const DataType *defPtrX, const DataType *defPtrY, const DataType *defPtrZ, - const mat44 *df_voxel2Real, + const mat44 *dfVoxel2Real, const int *dim, const bool displacement) { - int newX = X; - int newY = Y; - int newZ = Z; - if (X < 0) { + int newX = x; + if (x < 0) newX = 0; - } else if (X >= dim[1]) { + else if (x >= dim[1]) newX = dim[1] - 1; - } - if (Y < 0) { + + int newY = y; + if (y < 0) newY = 0; - } else if (Y >= dim[2]) { + else if (y >= dim[2]) newY = dim[2] - 1; - } - if (Z < 0) { + + int newZ = z; + if (z < 0) newZ = 0; - } else if (Z >= dim[3]) { + else if (z >= dim[3]) newZ = dim[3] - 1; - } + DataType shiftValueX = 0; DataType shiftValueY = 0; DataType shiftValueZ = 0; if (!displacement) { - int shiftIndexX = X - newX; - int shiftIndexY = Y - newY; - int shiftIndexZ = Z - newZ; + const int shiftIndexX = x - newX; + const int shiftIndexY = y - newY; + const int shiftIndexZ = z - newZ; shiftValueX = - shiftIndexX * df_voxel2Real->m[0][0] + - shiftIndexY * df_voxel2Real->m[0][1] + - shiftIndexZ * df_voxel2Real->m[0][2]; + shiftIndexX * dfVoxel2Real->m[0][0] + + shiftIndexY * dfVoxel2Real->m[0][1] + + shiftIndexZ * dfVoxel2Real->m[0][2]; shiftValueY = - shiftIndexX * df_voxel2Real->m[1][0] + - shiftIndexY * df_voxel2Real->m[1][1] + - shiftIndexZ * df_voxel2Real->m[1][2]; + shiftIndexX * dfVoxel2Real->m[1][0] + + shiftIndexY * dfVoxel2Real->m[1][1] + + shiftIndexZ * dfVoxel2Real->m[1][2]; shiftValueZ = - shiftIndexX * df_voxel2Real->m[2][0] + - shiftIndexY * df_voxel2Real->m[2][1] + - shiftIndexZ * df_voxel2Real->m[2][2]; + shiftIndexX * dfVoxel2Real->m[2][0] + + shiftIndexY * dfVoxel2Real->m[2][1] + + shiftIndexZ * dfVoxel2Real->m[2][2]; } - size_t index = (newZ * dim[2] + newY) * dim[1] + newX; + const int index = (newZ * dim[2] + newY) * dim[1] + newX; defX = defPtrX[index] + shiftValueX; defY = defPtrY[index] + shiftValueY; defZ = defPtrZ[index] + shiftValueZ; diff --git a/reg-lib/cpu/_reg_splineBasis.h b/reg-lib/cpu/_reg_splineBasis.h index 77cd6dd8..9c645a26 100755 --- a/reg-lib/cpu/_reg_splineBasis.h +++ b/reg-lib/cpu/_reg_splineBasis.h @@ -84,24 +84,24 @@ void get_SplineBasisValues(DataType basis, template void get_SlidedValues(DataType &defX, DataType &defY, - const int X, - const int Y, + const int x, + const int y, const DataType *defPtrX, const DataType *defPtrY, - const mat44 *df_voxel2Real, + const mat44 *dfVoxel2Real, const int *dim, const bool displacement); template void get_SlidedValues(DataType &defX, DataType &defY, DataType &defZ, - const int X, - const int Y, - const int Z, + const int x, + const int y, + const int z, const DataType *defPtrX, const DataType *defPtrY, const DataType *defPtrZ, - const mat44 *df_voxel2Real, + const mat44 *dfVoxel2Real, const int *dim, const bool displacement); diff --git a/reg-lib/cuda/CudaCompute.cu b/reg-lib/cuda/CudaCompute.cu index 202eaa76..6a7d53a2 100644 --- a/reg-lib/cuda/CudaCompute.cu +++ b/reg-lib/cuda/CudaCompute.cu @@ -88,7 +88,6 @@ void CudaCompute::LandmarkDistanceGradient(size_t landmarkNumber, float *landmar } /* *************************************************************** */ void CudaCompute::GetDeformationField(bool composition, bool bspline) { - // TODO Fix reg_spline_getDeformationField_gpu to accept composition CudaF3dContent& con = dynamic_cast(this->con); reg_spline_getDeformationField_gpu(con.F3dContent::GetControlPointGrid(), con.F3dContent::GetReference(), @@ -96,6 +95,7 @@ void CudaCompute::GetDeformationField(bool composition, bool bspline) { con.GetDeformationFieldCuda(), con.GetReferenceMaskCuda(), con.GetActiveVoxelNumber(), + composition, bspline); } /* *************************************************************** */ diff --git a/reg-lib/cuda/_reg_common_cuda_kernels.cu b/reg-lib/cuda/_reg_common_cuda_kernels.cu index af5d1b9c..87e1f975 100644 --- a/reg-lib/cuda/_reg_common_cuda_kernels.cu +++ b/reg-lib/cuda/_reg_common_cuda_kernels.cu @@ -140,14 +140,19 @@ __device__ __inline__ void reg_div_cuda(const int num, const int denom, int& quo rem = num % denom; } /* *************************************************************** */ +template __device__ __inline__ int3 reg_indexToDims_cuda(const int index, const int3& dims) { int quot = 0, rem; - if (dims.z > 1) + if constexpr (is3d) reg_div_cuda(index, dims.x * dims.y, quot, rem); else rem = index; const int z = quot; reg_div_cuda(rem, dims.x, quot, rem); - const int y = quot, x = rem; + const int& y = quot, &x = rem; return { x, y, z }; } /* *************************************************************** */ +__device__ __inline__ int3 reg_indexToDims_cuda(const int index, const int3& dims) { + return dims.z > 1 ? reg_indexToDims_cuda(index, dims) : reg_indexToDims_cuda(index, dims); +} +/* *************************************************************** */ diff --git a/reg-lib/cuda/_reg_localTransformation_gpu.cu b/reg-lib/cuda/_reg_localTransformation_gpu.cu index 9ce6ec2c..f221a67d 100755 --- a/reg-lib/cuda/_reg_localTransformation_gpu.cu +++ b/reg-lib/cuda/_reg_localTransformation_gpu.cu @@ -22,6 +22,7 @@ void reg_spline_getDeformationField_gpu(const nifti_image *controlPointImage, float4 *deformationFieldCuda, const int *maskCuda, const size_t activeVoxelNumber, + const bool composition, const bool bspline) { const size_t controlPointNumber = NiftiImage::calcVoxelNumber(controlPointImage, 3); const int3 referenceImageDim = make_int3(referenceImage->nx, referenceImage->ny, referenceImage->nz); @@ -35,6 +36,13 @@ void reg_spline_getDeformationField_gpu(const nifti_image *controlPointImage, auto maskTexture = Cuda::CreateTextureObject(maskCuda, cudaResourceTypeLinear, activeVoxelNumber * sizeof(int), cudaChannelFormatKindSigned, 1); + // Get the reference matrix if composition is required + thrust::device_vector referenceMatrix; + if (composition) { + const mat44 *refMatPtr = controlPointImage->sform_code > 0 ? &controlPointImage->sto_ijk : &controlPointImage->qto_ijk; + referenceMatrix = thrust::device_vector(refMatPtr, refMatPtr + 1); + } + if (referenceImage->nz > 1) { const unsigned blocks = CudaContext::GetBlockSize()->reg_spline_getDeformationField3D; const unsigned grids = (unsigned)Ceil(sqrtf((float)activeVoxelNumber / (float)blocks)); @@ -44,10 +52,12 @@ void reg_spline_getDeformationField_gpu(const nifti_image *controlPointImage, reg_spline_getDeformationField3D<<>>(deformationFieldCuda, *controlPointTexture, *maskTexture, + referenceMatrix.data().get(), referenceImageDim, controlPointImageDim, controlPointVoxelSpacing, (unsigned)activeVoxelNumber, + composition, bspline); NR_CUDA_CHECK_KERNEL(gridDims, blockDims); } else { @@ -59,10 +69,12 @@ void reg_spline_getDeformationField_gpu(const nifti_image *controlPointImage, reg_spline_getDeformationField2D<<>>(deformationFieldCuda, *controlPointTexture, *maskTexture, + referenceMatrix.data().get(), referenceImageDim, controlPointImageDim, controlPointVoxelSpacing, (unsigned)activeVoxelNumber, + composition, bspline); NR_CUDA_CHECK_KERNEL(gridDims, blockDims); } @@ -527,13 +539,13 @@ void reg_spline_getFlowFieldFromVelocityGrid_gpu(nifti_image *velocityFieldGrid, // Copy over the number of required squaring steps // The initial flow field is generated using cubic B-Spline interpolation/approximation - // TODO Composition is needed reg_spline_getDeformationField_gpu(velocityFieldGrid, flowField, velocityFieldGridCuda, flowFieldCuda, maskCuda, activeVoxelNumber, + true, // composition true); // bspline velocityFieldGrid->num_ext = oldNumExt; @@ -675,6 +687,7 @@ void reg_spline_getDefFieldFromVelocityGrid_gpu(nifti_image *velocityFieldGrid, deformationFieldCuda, maskCuda.data().get(), voxelNumber, + false, // composition true); // bspline } else if (velocityFieldGrid->intent_p1 == SPLINE_VEL_GRID) { // Create an image to store the flow field diff --git a/reg-lib/cuda/_reg_localTransformation_gpu.h b/reg-lib/cuda/_reg_localTransformation_gpu.h index 0c0e80a7..d3432ca1 100755 --- a/reg-lib/cuda/_reg_localTransformation_gpu.h +++ b/reg-lib/cuda/_reg_localTransformation_gpu.h @@ -21,6 +21,7 @@ void reg_spline_getDeformationField_gpu(const nifti_image *controlPointImage, float4 *deformationFieldCuda, const int *maskCuda, const size_t activeVoxelNumber, + const bool composition, const bool bspline); /* *************************************************************** */ float reg_spline_approxBendingEnergy_gpu(const nifti_image *controlPointImage, diff --git a/reg-lib/cuda/_reg_localTransformation_kernels.cu b/reg-lib/cuda/_reg_localTransformation_kernels.cu index a95f4bba..05644a08 100755 --- a/reg-lib/cuda/_reg_localTransformation_kernels.cu +++ b/reg-lib/cuda/_reg_localTransformation_kernels.cu @@ -256,23 +256,22 @@ __device__ float4 GetSlidedValues(int x, int y, const int3& referenceImageDim, const mat44& affineMatrix) { int newX = x; - int newY = y; - if (x < 0) { + if (x < 0) newX = 0; - } else if (x >= referenceImageDim.x) { + else if (x >= referenceImageDim.x) newX = referenceImageDim.x - 1; - } - if (y < 0) { + + int newY = y; + if (y < 0) newY = 0; - } else if (y >= referenceImageDim.y) { + else if (y >= referenceImageDim.y) newY = referenceImageDim.y - 1; - } x -= newX; y -= newY; - const float4 slidedValues = make_float4(x * affineMatrix.m[0][0] + y * affineMatrix.m[0][1], - x * affineMatrix.m[1][0] + y * affineMatrix.m[1][1], - 0.f, 0.f); + const float4& slidedValues = make_float4(x * affineMatrix.m[0][0] + y * affineMatrix.m[0][1], + x * affineMatrix.m[1][0] + y * affineMatrix.m[1][1], + 0.f, 0.f); return slidedValues + tex1Dfetch(deformationFieldTexture, newY * referenceImageDim.x + newX); } /* *************************************************************** */ @@ -281,177 +280,215 @@ __device__ float4 GetSlidedValues(int x, int y, int z, const int3& referenceImageDim, const mat44& affineMatrix) { int newX = x; - int newY = y; - int newZ = z; - if (x < 0) { + if (x < 0) newX = 0; - } else if (x >= referenceImageDim.x) { + else if (x >= referenceImageDim.x) newX = referenceImageDim.x - 1; - } - if (y < 0) { + + int newY = y; + if (y < 0) newY = 0; - } else if (y >= referenceImageDim.y) { + else if (y >= referenceImageDim.y) newY = referenceImageDim.y - 1; - } - if (z < 0) { + + int newZ = z; + if (z < 0) newZ = 0; - } else if (z >= referenceImageDim.z) { + else if (z >= referenceImageDim.z) newZ = referenceImageDim.z - 1; - } x -= newX; y -= newY; z -= newZ; - const float4 slidedValues = make_float4(x * affineMatrix.m[0][0] + y * affineMatrix.m[0][1] + z * affineMatrix.m[0][2], - x * affineMatrix.m[1][0] + y * affineMatrix.m[1][1] + z * affineMatrix.m[1][2], - x * affineMatrix.m[2][0] + y * affineMatrix.m[2][1] + z * affineMatrix.m[2][2], - 0.f); + const float4& slidedValues = make_float4(x * affineMatrix.m[0][0] + y * affineMatrix.m[0][1] + z * affineMatrix.m[0][2], + x * affineMatrix.m[1][0] + y * affineMatrix.m[1][1] + z * affineMatrix.m[1][2], + x * affineMatrix.m[2][0] + y * affineMatrix.m[2][1] + z * affineMatrix.m[2][2], + 0.f); return slidedValues + tex1Dfetch(deformationFieldTexture, (newZ * referenceImageDim.y + newY) * referenceImageDim.x + newX); } /* *************************************************************** */ __global__ void reg_spline_getDeformationField3D(float4 *deformationField, cudaTextureObject_t controlPointTexture, cudaTextureObject_t maskTexture, + const mat44 *referenceMatrix, const int3 referenceImageDim, const int3 controlPointImageDim, const float3 controlPointVoxelSpacing, const unsigned activeVoxelNumber, + const bool composition, const bool bspline) { const unsigned tid = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x; - if (tid < activeVoxelNumber) { - const int tid2 = tex1Dfetch(maskTexture, tid); - int quot, rem; - reg_div_cuda(tid2, referenceImageDim.x * referenceImageDim.y, quot, rem); - const int z = quot; - reg_div_cuda(rem, referenceImageDim.x, quot, rem); - const int y = quot, x = rem; - + if (tid >= activeVoxelNumber) return; + const int tid2 = tex1Dfetch(maskTexture, tid); + const auto&& [x, y, z] = reg_indexToDims_cuda(tid2, referenceImageDim); + int3 nodePre; + float3 basis; + + if (composition) { // Composition of deformation fields + // The previous position at the current pixel position is read + const float4 node = deformationField[tid]; + + // From real to pixel position in the CPP + const float xVoxel = (referenceMatrix->m[0][0] * node.x + + referenceMatrix->m[0][1] * node.y + + referenceMatrix->m[0][2] * node.z + + referenceMatrix->m[0][3]); + const float yVoxel = (referenceMatrix->m[1][0] * node.x + + referenceMatrix->m[1][1] * node.y + + referenceMatrix->m[1][2] * node.z + + referenceMatrix->m[1][3]); + const float zVoxel = (referenceMatrix->m[2][0] * node.x + + referenceMatrix->m[2][1] * node.y + + referenceMatrix->m[2][2] * node.z + + referenceMatrix->m[2][3]); + + if (xVoxel < 0 || xVoxel >= referenceImageDim.x || + yVoxel < 0 || yVoxel >= referenceImageDim.y || + zVoxel < 0 || zVoxel >= referenceImageDim.z) return; + + nodePre = { Floor(xVoxel), Floor(yVoxel), Floor(zVoxel) }; + basis = { xVoxel - float(nodePre.x--), yVoxel - float(nodePre.y--), zVoxel - float(nodePre.z--) }; + } else { // starting deformation field is blank - !composition // The "nearest previous" node is determined [0,0,0] - const int3 nodeAnte = { - int((float)x / controlPointVoxelSpacing.x), - int((float)y / controlPointVoxelSpacing.y), - int((float)z / controlPointVoxelSpacing.z) - }; - - // Z basis values - extern __shared__ float yBasis[]; // Shared memory - const unsigned sharedMemIndex = 4 * threadIdx.x; - // Compute the shared memory offset which corresponds to four times the number of threads per block - float *zBasis = &yBasis[4 * blockDim.x * blockDim.y * blockDim.z]; - float relative = (float)z / controlPointVoxelSpacing.z - (float)nodeAnte.z; - if (relative < 0) relative = 0; // rounding error - if (bspline) GetBasisBSplineValues(relative, &zBasis[sharedMemIndex]); - else GetBasisSplineValues(relative, &zBasis[sharedMemIndex]); - - // Y basis values - relative = (float)y / controlPointVoxelSpacing.y - (float)nodeAnte.y; - if (relative < 0) relative = 0; // rounding error - if (bspline) GetBasisBSplineValues(relative, &yBasis[sharedMemIndex]); - else GetBasisSplineValues(relative, &yBasis[sharedMemIndex]); - - // X basis values - float xBasis[4]; - relative = (float)x / controlPointVoxelSpacing.x - (float)nodeAnte.x; - if (relative < 0) relative = 0; // rounding error - if (bspline) GetBasisBSplineValues(relative, xBasis); - else GetBasisSplineValues(relative, xBasis); - - float4 displacement{}; - for (int c = 0; c < 4; c++) { - float3 tempDisplacement{}; - int indexYZ = ((nodeAnte.z + c) * controlPointImageDim.y + nodeAnte.y) * controlPointImageDim.x; - for (int b = 0; b < 4; b++) { - int indexXYZ = indexYZ + nodeAnte.x; - const float4 nodeCoefficientA = tex1Dfetch(controlPointTexture, indexXYZ++); - const float4 nodeCoefficientB = tex1Dfetch(controlPointTexture, indexXYZ++); - const float4 nodeCoefficientC = tex1Dfetch(controlPointTexture, indexXYZ++); - const float4 nodeCoefficientD = tex1Dfetch(controlPointTexture, indexXYZ); - - const float& basis = yBasis[sharedMemIndex + b]; - tempDisplacement.x += basis * (nodeCoefficientA.x * xBasis[0] + - nodeCoefficientB.x * xBasis[1] + - nodeCoefficientC.x * xBasis[2] + - nodeCoefficientD.x * xBasis[3]); - - tempDisplacement.y += basis * (nodeCoefficientA.y * xBasis[0] + - nodeCoefficientB.y * xBasis[1] + - nodeCoefficientC.y * xBasis[2] + - nodeCoefficientD.y * xBasis[3]); - - tempDisplacement.z += basis * (nodeCoefficientA.z * xBasis[0] + - nodeCoefficientB.z * xBasis[1] + - nodeCoefficientC.z * xBasis[2] + - nodeCoefficientD.z * xBasis[3]); - - indexYZ += controlPointImageDim.x; - } + const float xVoxel = float(x) / controlPointVoxelSpacing.x; + const float yVoxel = float(y) / controlPointVoxelSpacing.y; + const float zVoxel = float(z) / controlPointVoxelSpacing.z; + nodePre = { int(xVoxel), int(yVoxel), int(zVoxel) }; + basis = { xVoxel - float(nodePre.x), yVoxel - float(nodePre.y), zVoxel - float(nodePre.z) }; + } + // Z basis values + extern __shared__ float yBasis[]; // Shared memory + const unsigned sharedMemIndex = 4 * threadIdx.x; + // Compute the shared memory offset which corresponds to four times the number of threads per block + float *zBasis = &yBasis[4 * blockDim.x * blockDim.y * blockDim.z]; + if (basis.z < 0) basis.z = 0; // rounding error + if (bspline) GetBasisBSplineValues(basis.z, &zBasis[sharedMemIndex]); + else GetBasisSplineValues(basis.z, &zBasis[sharedMemIndex]); + + // Y basis values + if (basis.y < 0) basis.y = 0; // rounding error + if (bspline) GetBasisBSplineValues(basis.y, &yBasis[sharedMemIndex]); + else GetBasisSplineValues(basis.y, &yBasis[sharedMemIndex]); + + // X basis values + float xBasis[4]; + if (basis.x < 0) basis.x = 0; // rounding error + if (bspline) GetBasisBSplineValues(basis.x, xBasis); + else GetBasisSplineValues(basis.x, xBasis); + + float4 displacement{}; + for (int c = 0; c < 4; c++) { + float3 tempDisplacement{}; + int indexYZ = ((nodePre.z + c) * controlPointImageDim.y + nodePre.y) * controlPointImageDim.x; + for (int b = 0; b < 4; b++) { + int indexXYZ = indexYZ + nodePre.x; + const float4& nodeCoefficientA = tex1Dfetch(controlPointTexture, indexXYZ++); + const float4& nodeCoefficientB = tex1Dfetch(controlPointTexture, indexXYZ++); + const float4& nodeCoefficientC = tex1Dfetch(controlPointTexture, indexXYZ++); + const float4& nodeCoefficientD = tex1Dfetch(controlPointTexture, indexXYZ); - const float& basis = zBasis[sharedMemIndex + c]; - displacement.x += basis * tempDisplacement.x; - displacement.y += basis * tempDisplacement.y; - displacement.z += basis * tempDisplacement.z; + const float& basis = yBasis[sharedMemIndex + b]; + tempDisplacement.x += basis * (nodeCoefficientA.x * xBasis[0] + + nodeCoefficientB.x * xBasis[1] + + nodeCoefficientC.x * xBasis[2] + + nodeCoefficientD.x * xBasis[3]); + + tempDisplacement.y += basis * (nodeCoefficientA.y * xBasis[0] + + nodeCoefficientB.y * xBasis[1] + + nodeCoefficientC.y * xBasis[2] + + nodeCoefficientD.y * xBasis[3]); + + tempDisplacement.z += basis * (nodeCoefficientA.z * xBasis[0] + + nodeCoefficientB.z * xBasis[1] + + nodeCoefficientC.z * xBasis[2] + + nodeCoefficientD.z * xBasis[3]); + + indexYZ += controlPointImageDim.x; } - deformationField[tid] = displacement; + const float& basis = zBasis[sharedMemIndex + c]; + displacement.x += basis * tempDisplacement.x; + displacement.y += basis * tempDisplacement.y; + displacement.z += basis * tempDisplacement.z; } + deformationField[tid] = displacement; } /* *************************************************************** */ __global__ void reg_spline_getDeformationField2D(float4 *deformationField, cudaTextureObject_t controlPointTexture, cudaTextureObject_t maskTexture, + const mat44 *referenceMatrix, const int3 referenceImageDim, const int3 controlPointImageDim, const float3 controlPointVoxelSpacing, const unsigned activeVoxelNumber, + const bool composition, const bool bspline) { const unsigned tid = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x; - if (tid < activeVoxelNumber) { - const int tid2 = tex1Dfetch(maskTexture, tid); - int quot, rem; - reg_div_cuda(tid2, referenceImageDim.x, quot, rem); - const int y = quot, x = rem; - + if (tid >= activeVoxelNumber) return; + const int tid2 = tex1Dfetch(maskTexture, tid); + const auto&& [x, y, z] = reg_indexToDims_cuda(tid2, referenceImageDim); + int2 nodePre; + float2 basis; + + if (composition) { // Composition of deformation fields + // The previous position at the current pixel position is read + const float4 node = deformationField[tid]; + + // From real to pixel position in the CPP + const float xVoxel = (referenceMatrix->m[0][0] * node.x + + referenceMatrix->m[0][1] * node.y + + referenceMatrix->m[0][3]); + const float yVoxel = (referenceMatrix->m[1][0] * node.x + + referenceMatrix->m[1][1] * node.y + + referenceMatrix->m[1][3]); + + if (xVoxel < 0 || xVoxel >= referenceImageDim.x || + yVoxel < 0 || yVoxel >= referenceImageDim.y) return; + + nodePre = { Floor(xVoxel), Floor(yVoxel) }; + basis = { xVoxel - float(nodePre.x--), yVoxel - float(nodePre.y--) }; + } else { // starting deformation field is blank - !composition // The "nearest previous" node is determined [0,0,0] - const int2 nodeAnte = { int((float)x / controlPointVoxelSpacing.x), int((float)y / controlPointVoxelSpacing.y) }; - - // Y basis values - extern __shared__ float yBasis[]; // Shared memory - const unsigned sharedMemIndex = 4 * threadIdx.x; - float relative = (float)y / controlPointVoxelSpacing.y - (float)nodeAnte.y; - if (relative < 0) relative = 0; // rounding error - if (bspline) GetBasisBSplineValues(relative, &yBasis[sharedMemIndex]); - else GetBasisSplineValues(relative, &yBasis[sharedMemIndex]); - - // X basis values - float xBasis[4]; - relative = (float)x / controlPointVoxelSpacing.x - (float)nodeAnte.x; - if (relative < 0) relative = 0; // rounding error - if (bspline) GetBasisBSplineValues(relative, xBasis); - else GetBasisSplineValues(relative, xBasis); - - float4 displacement{}; - for (int b = 0; b < 4; b++) { - int index = (nodeAnte.y + b) * controlPointImageDim.x + nodeAnte.x; - - const float4 nodeCoefficientA = tex1Dfetch(controlPointTexture, index++); - const float4 nodeCoefficientB = tex1Dfetch(controlPointTexture, index++); - const float4 nodeCoefficientC = tex1Dfetch(controlPointTexture, index++); - const float4 nodeCoefficientD = tex1Dfetch(controlPointTexture, index); - - const float& basis = yBasis[sharedMemIndex + b]; - displacement.x += basis * (nodeCoefficientA.x * xBasis[0] + - nodeCoefficientB.x * xBasis[1] + - nodeCoefficientC.x * xBasis[2] + - nodeCoefficientD.x * xBasis[3]); - - displacement.y += basis * (nodeCoefficientA.y * xBasis[0] + - nodeCoefficientB.y * xBasis[1] + - nodeCoefficientC.y * xBasis[2] + - nodeCoefficientD.y * xBasis[3]); - } - - deformationField[tid] = displacement; + const float xVoxel = float(x) / controlPointVoxelSpacing.x; + const float yVoxel = float(y) / controlPointVoxelSpacing.y; + nodePre = { int(xVoxel), int(yVoxel) }; + basis = { xVoxel - float(nodePre.x), yVoxel - float(nodePre.y) }; + } + // Y basis values + extern __shared__ float yBasis[]; // Shared memory + const unsigned sharedMemIndex = 4 * threadIdx.x; + if (basis.y < 0) basis.y = 0; // rounding error + if (bspline) GetBasisBSplineValues(basis.y, &yBasis[sharedMemIndex]); + else GetBasisSplineValues(basis.y, &yBasis[sharedMemIndex]); + + // X basis values + float xBasis[4]; + if (basis.x < 0) basis.x = 0; // rounding error + if (bspline) GetBasisBSplineValues(basis.x, xBasis); + else GetBasisSplineValues(basis.x, xBasis); + + float4 displacement{}; + for (int b = 0; b < 4; b++) { + int index = (nodePre.y + b) * controlPointImageDim.x + nodePre.x; + + const float4& nodeCoefficientA = tex1Dfetch(controlPointTexture, index++); + const float4& nodeCoefficientB = tex1Dfetch(controlPointTexture, index++); + const float4& nodeCoefficientC = tex1Dfetch(controlPointTexture, index++); + const float4& nodeCoefficientD = tex1Dfetch(controlPointTexture, index); + + const float& basis = yBasis[sharedMemIndex + b]; + displacement.x += basis * (nodeCoefficientA.x * xBasis[0] + + nodeCoefficientB.x * xBasis[1] + + nodeCoefficientC.x * xBasis[2] + + nodeCoefficientD.x * xBasis[3]); + + displacement.y += basis * (nodeCoefficientA.y * xBasis[0] + + nodeCoefficientB.y * xBasis[1] + + nodeCoefficientC.y * xBasis[2] + + nodeCoefficientD.y * xBasis[3]); } + deformationField[tid] = displacement; } /* *************************************************************** */ __global__ void reg_spline_getApproxSecondDerivatives2D(float4 *secondDerivativeValues, @@ -865,19 +902,19 @@ __global__ void reg_spline_getJacobianValues2D_kernel(float *jacobianMatrices, const int y = quot, x = rem; // the "nearest previous" node is determined [0,0,0] - const int2 nodeAnte = { Floor((float)x / controlPointSpacing.x), Floor((float)y / controlPointSpacing.y) }; + const int2 nodePre = { Floor((float)x / controlPointSpacing.x), Floor((float)y / controlPointSpacing.y) }; float xBasis[4], yBasis[4], xFirst[4], yFirst[4], relative; - relative = fabsf((float)x / controlPointSpacing.x - (float)nodeAnte.x); + relative = fabsf((float)x / controlPointSpacing.x - (float)nodePre.x); GetFirstBSplineValues(relative, xBasis, xFirst); - relative = fabsf((float)y / controlPointSpacing.y - (float)nodeAnte.y); + relative = fabsf((float)y / controlPointSpacing.y - (float)nodePre.y); GetFirstBSplineValues(relative, yBasis, yFirst); float2 tx{}, ty{}; for (int b = 0; b < 4; ++b) { - int indexXY = (nodeAnte.y + b) * controlPointImageDim.x + nodeAnte.x; + int indexXY = (nodePre.y + b) * controlPointImageDim.x + nodePre.x; float4 nodeCoefficient = tex1Dfetch(controlPointTexture, indexXY++); float2 basis = make_float2(xFirst[0] * yBasis[b], xBasis[0] * yFirst[b]); @@ -936,7 +973,7 @@ __global__ void reg_spline_getJacobianValues3D_kernel(float *jacobianMatrices, const int y = quot, x = rem; // the "nearest previous" node is determined [0,0,0] - const int3 nodeAnte = { + const int3 nodePre = { Floor((float)x / controlPointSpacing.x), Floor((float)y / controlPointSpacing.y), Floor((float)z / controlPointSpacing.z) @@ -948,19 +985,19 @@ __global__ void reg_spline_getJacobianValues3D_kernel(float *jacobianMatrices, float xBasis[4], yBasis[4], zBasis[4], xFirst[4], relative; const unsigned sharedMemIndex = 4 * threadIdx.x; - relative = fabsf((float)x / controlPointSpacing.x - (float)nodeAnte.x); + relative = fabsf((float)x / controlPointSpacing.x - (float)nodePre.x); GetFirstBSplineValues(relative, xBasis, xFirst); - relative = fabsf((float)y / controlPointSpacing.y - (float)nodeAnte.y); + relative = fabsf((float)y / controlPointSpacing.y - (float)nodePre.y); GetFirstBSplineValues(relative, yBasis, &yFirst[sharedMemIndex]); - relative = fabsf((float)z / controlPointSpacing.z - (float)nodeAnte.z); + relative = fabsf((float)z / controlPointSpacing.z - (float)nodePre.z); GetFirstBSplineValues(relative, zBasis, &zFirst[sharedMemIndex]); float3 tx{}, ty{}, tz{}; for (int c = 0; c < 4; ++c) { for (int b = 0; b < 4; ++b) { - int indexXYZ = ((nodeAnte.z + c) * controlPointImageDim.y + nodeAnte.y + b) * controlPointImageDim.x + nodeAnte.x; + int indexXYZ = ((nodePre.z + c) * controlPointImageDim.y + nodePre.y + b) * controlPointImageDim.x + nodePre.x; float3 basisXY{ yBasis[b] * zBasis[c], yFirst[sharedMemIndex + b] * zBasis[c], yBasis[b] * zFirst[sharedMemIndex + c] }; float4 nodeCoefficient = tex1Dfetch(controlPointTexture, indexXYZ++); @@ -1644,7 +1681,7 @@ __device__ static mat33 CreateDisplacementMatrix(const unsigned index, const int3& cppDims, const Basis& basis, const mat33& reorientation) { - const auto&& [x, y, z] = reg_indexToDims_cuda((int)index, cppDims); + const auto&& [x, y, z] = reg_indexToDims_cuda((int)index, cppDims); if (x < 1 || x >= cppDims.x - 1 || y < 1 || y >= cppDims.y - 1 || (is3d && (z < 1 || z >= cppDims.z - 1))) return {}; @@ -1721,7 +1758,7 @@ __global__ void reg_spline_approxLinearEnergyGradient_kernel(float4 *transGradie const unsigned voxelNumber) { const unsigned index = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x; if (index >= voxelNumber) return; - const auto&& [x, y, z] = reg_indexToDims_cuda((int)index, cppDims); + const auto&& [x, y, z] = reg_indexToDims_cuda((int)index, cppDims); auto gradVal = transGradient[index]; if constexpr (is3d) { diff --git a/reg-test/reg_test_getDeformationField.cpp b/reg-test/reg_test_getDeformationField.cpp index 444e6025..c49a1a24 100644 --- a/reg-test/reg_test_getDeformationField.cpp +++ b/reg-test/reg_test_getDeformationField.cpp @@ -141,7 +141,7 @@ class GetDeformationFieldTest { reg_getDeformationFromDisplacement(controlPointGrid3d); testDataComp.emplace_back(TestDataComp( "2D Composition ID", - reference3d, + reference2d, controlPointGrid2d, defField2d, expDefField2d @@ -167,7 +167,7 @@ class GetDeformationFieldTest { defField3dPtr[i] /= 1.1f; testDataComp.emplace_back(TestDataComp( "2D Composition Scaling", - reference3d, + reference2d, controlPointGrid2d, defField2d, expDefField2d @@ -181,7 +181,7 @@ class GetDeformationFieldTest { )); for (auto&& data : testDataComp) { - for (auto&& platformType : { PlatformType::Cpu }) { // Test only on CPU + for (auto&& platformType : PlatformTypes) { unique_ptr platform{ new Platform(platformType) }; unique_ptr contentCreator{ dynamic_cast(platform->CreateContentCreator(ContentType::F3d)) }; // Make a copy of the test data