Skip to content

Commit

Permalink
Convert reference and floating images to float arrays from cudaArrays #…
Browse files Browse the repository at this point in the history
…92

 - Eliminate unnecessary Cuda::* functions
 - Refactor Cuda::CreateTextureObject()
  • Loading branch information
onurulgen committed Nov 17, 2023
1 parent 5577571 commit f953b5f
Show file tree
Hide file tree
Showing 21 changed files with 281 additions and 553 deletions.
2 changes: 1 addition & 1 deletion niftyreg_build_version.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
364
365
20 changes: 10 additions & 10 deletions reg-apps/reg_benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -181,18 +181,18 @@ int main(int argc, char **argv)

#ifdef USE_CUDA
float *targetImageArray_d;
cudaArray *sourceImageArray_d;
float *sourceImageArray_d;
int *targetMask_d;
float4 *deformationFieldImageArray_d;
if(runGPU)
{
Cuda::Allocate<float>(&targetImageArray_d, targetImage->nvox);
Cuda::TransferNiftiToDevice<float>(targetImageArray_d, targetImage);
Cuda::Allocate(&targetImageArray_d, targetImage->nvox);
Cuda::TransferNiftiToDevice(targetImageArray_d, targetImage);
Cuda::Allocate<float>(&sourceImageArray_d, sourceImage->nvox);
Cuda::TransferNiftiToDevice<float>(sourceImageArray_d,sourceImage);
CUDA_SAFE_CALL(cudaMalloc((void **)&targetMask_d, targetImage->nvox*sizeof(int)));
Cuda::TransferNiftiToDevice(sourceImageArray_d,sourceImage);
CUDA_SAFE_CALL(cudaMalloc((void**)&targetMask_d, targetImage->nvox*sizeof(int)));
CUDA_SAFE_CALL(cudaMemcpy(targetMask_d, maskImage, targetImage->nvox*sizeof(int), cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMalloc((void **)&deformationFieldImageArray_d, targetImage->nvox*sizeof(float4)));
CUDA_SAFE_CALL(cudaMalloc((void**)&deformationFieldImageArray_d, targetImage->nvox*sizeof(float4)));
}
#endif

Expand Down Expand Up @@ -277,8 +277,8 @@ int main(int argc, char **argv)
float4 *controlPointImageArray_d;
if(runGPU)
{
Cuda::Allocate<float4>(&controlPointImageArray_d, controlPointImage->dim);
Cuda::TransferNiftiToDevice<float4>(controlPointImageArray_d,controlPointImage);
Cuda::Allocate(&controlPointImageArray_d, controlPointImage->dim);
Cuda::TransferNiftiToDevice(controlPointImageArray_d, controlPointImage);
}
#endif
{
Expand Down Expand Up @@ -330,8 +330,8 @@ int main(int argc, char **argv)
float4 *velocityFieldImageArray_d;
if(runGPU)
{
Cuda::Allocate<float4>(&velocityFieldImageArray_d, velocityFieldImage->dim);
Cuda::TransferNiftiToDevice<float4>(velocityFieldImageArray_d,velocityFieldImage);
Cuda::Allocate(&velocityFieldImageArray_d, velocityFieldImage->dim);
Cuda::TransferNiftiToDevice(velocityFieldImageArray_d, velocityFieldImage);
}
#endif
{
Expand Down
325 changes: 65 additions & 260 deletions reg-lib/cuda/CudaCommon.cu

Large diffs are not rendered by default.

62 changes: 22 additions & 40 deletions reg-lib/cuda/CudaCommon.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -69,72 +69,54 @@ inline void CheckKernel(const std::string& file, const int line, const std::stri
#define NR_CUDA_SAFE_CALL(call) { call; NiftyReg::Cuda::Internal::SafeCall(__FILE__, __LINE__, NR_FUNCTION); }
#define NR_CUDA_CHECK_KERNEL(grid, block) NiftyReg::Cuda::Internal::CheckKernel(__FILE__, __LINE__, NR_FUNCTION, grid, block)
/* *************************************************************** */
template <class DataType>
void Allocate(cudaArray**, const int*);
template<class DataType>
void Allocate(DataType**, const size_t);
/* *************************************************************** */
template <class DataType>
void Allocate(cudaArray**, cudaArray**, const int*);
/* *************************************************************** */
template <class DataType>
void Allocate(DataType**, const size_t&);
/* *************************************************************** */
template <class DataType>
template<class DataType>
void Allocate(DataType**, const int*);
/* *************************************************************** */
template <class DataType>
template<class DataType>
void Allocate(DataType**, DataType**, const int*);
/* *************************************************************** */
template <class DataType>
void TransferNiftiToDevice(cudaArray*, const nifti_image*);
/* *************************************************************** */
template <class DataType>
void TransferNiftiToDevice(cudaArray*, cudaArray*, const nifti_image*);
/* *************************************************************** */
template <class DataType>
template<class DataType>
void TransferNiftiToDevice(DataType*, const nifti_image*);
/* *************************************************************** */
template <class DataType>
template<class DataType>
void TransferNiftiToDevice(DataType*, DataType*, const nifti_image*);
/* *************************************************************** */
template <class DataType>
void TransferNiftiToDevice(DataType*, const DataType*, const size_t&);
template<class DataType>
void TransferNiftiToDevice(DataType*, const DataType*, const size_t);
/* *************************************************************** */
void TransferFromDeviceToNifti(nifti_image*, const cudaArray*);
/* *************************************************************** */
template <class DataType>
template<class DataType>
void TransferFromDeviceToNifti(nifti_image*, const DataType*);
/* *************************************************************** */
template <class DataType>
template<class DataType>
void TransferFromDeviceToNifti(nifti_image*, const DataType*, const DataType*);
/* *************************************************************** */
template <class DataType>
void TransferFromDeviceToHost(DataType*, const DataType*, const size_t&);
/* *************************************************************** */
template <class DataType>
void TransferFromHostToDevice(DataType*, const DataType*, const size_t&);
template<class DataType>
void TransferFromDeviceToHost(DataType*, const DataType*, const size_t);
/* *************************************************************** */
void Free(cudaArray*);
template<class DataType>
void TransferFromHostToDevice(DataType*, const DataType*, const size_t);
/* *************************************************************** */
template <class DataType>
template<class DataType>
void Free(DataType*);
/* *************************************************************** */
namespace Internal {
template <class T>
struct UniquePtrDeleter { void operator()(T *ptr) const { Free(ptr); } };
}
/* *************************************************************** */
template <class T>
template<class T>
using UniquePtr = unique_ptr<T, Internal::UniquePtrDeleter<T>>;
/* *************************************************************** */
using UniqueTextureObjectPtr = unique_ptr<cudaTextureObject_t, void(*)(cudaTextureObject_t*)>;
using UniqueTextureObjectPtr = UniquePtr<cudaTextureObject_t>;
/* *************************************************************** */
UniqueTextureObjectPtr CreateTextureObject(const void *devPtr,
const cudaResourceType& resType,
const size_t& size = 0,
const cudaChannelFormatKind& channelFormat = cudaChannelFormatKindNone,
const unsigned& channelCount = 1,
const cudaTextureFilterMode& filterMode = cudaFilterModePoint,
const bool& normalizedCoordinates = false);
template<class DataType>
UniqueTextureObjectPtr CreateTextureObject(const DataType *devPtr,
const size_t count,
const cudaChannelFormatKind channelFormat,
const unsigned channelCount);
/* *************************************************************** */
} // namespace NiftyReg::Cuda
/* *************************************************************** */
8 changes: 4 additions & 4 deletions reg-lib/cuda/CudaContent.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,17 +25,17 @@ CudaContent::~CudaContent() {
void CudaContent::AllocateReference() {
if (reference->nbyper != NIFTI_TYPE_FLOAT32)
reg_tools_changeDatatype<float>(reference);
Cuda::Allocate<float>(&referenceCuda, reference->dim);
Cuda::Allocate(&referenceCuda, reference->nvox);
referenceCudaManaged.reset(referenceCuda);
Cuda::TransferNiftiToDevice<float>(referenceCuda, reference);
Cuda::TransferNiftiToDevice(referenceCuda, reference);
}
/* *************************************************************** */
void CudaContent::AllocateFloating() {
if (floating->nbyper != NIFTI_TYPE_FLOAT32)
reg_tools_changeDatatype<float>(floating);
Cuda::Allocate<float>(&floatingCuda, floating->dim);
Cuda::Allocate(&floatingCuda, floating->nvox);
floatingCudaManaged.reset(floatingCuda);
Cuda::TransferNiftiToDevice<float>(floatingCuda, floating);
Cuda::TransferNiftiToDevice(floatingCuda, floating);
}
/* *************************************************************** */
void CudaContent::AllocateDeformationField() {
Expand Down
16 changes: 8 additions & 8 deletions reg-lib/cuda/CudaContent.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@ class CudaContent: public virtual Content {
// Getters
virtual nifti_image* GetDeformationField() override;
virtual nifti_image* GetWarped() override;
virtual cudaArray* GetReferenceCuda() { return referenceCuda; }
virtual cudaArray* GetFloatingCuda() { return floatingCuda; }
virtual float* GetReferenceCuda() { return referenceCuda; }
virtual float* GetFloatingCuda() { return floatingCuda; }
virtual float4* GetDeformationFieldCuda() { return deformationFieldCuda; }
virtual int* GetReferenceMaskCuda() { return referenceMaskCuda; }
virtual float* GetTransformationMatrixCuda() { return transformationMatrixCuda; }
Expand All @@ -30,10 +30,10 @@ class CudaContent: public virtual Content {
virtual void UpdateWarped() override;

protected:
cudaArray *referenceCuda = nullptr;
Cuda::UniquePtr<cudaArray> referenceCudaManaged;
cudaArray *floatingCuda = nullptr;
Cuda::UniquePtr<cudaArray> floatingCudaManaged;
float *referenceCuda = nullptr;
Cuda::UniquePtr<float> referenceCudaManaged;
float *floatingCuda = nullptr;
Cuda::UniquePtr<float> floatingCudaManaged;
float4 *deformationFieldCuda = nullptr;
int *referenceMaskCuda = nullptr;
float *transformationMatrixCuda = nullptr;
Expand All @@ -49,8 +49,8 @@ class CudaContent: public virtual Content {
template<class DataType> DataType CastImageData(float intensity, int datatype);
template<class DataType> void FillImageData(nifti_image *image, float *memoryObject, int datatype);
void DownloadImage(nifti_image *image, float *memoryObject, int datatype);
void SetReferenceCuda(cudaArray *referenceCudaIn) { referenceCudaManaged = nullptr; referenceCuda = referenceCudaIn; }
void SetFloatingCuda(cudaArray *floatingCudaIn) { floatingCudaManaged = nullptr; floatingCuda = floatingCudaIn; }
void SetReferenceCuda(float *referenceCudaIn) { referenceCudaManaged = nullptr; referenceCuda = referenceCudaIn; }
void SetFloatingCuda(float *floatingCudaIn) { floatingCudaManaged = nullptr; floatingCuda = floatingCudaIn; }

// Friend classes
friend class CudaF3d2ContentCreator;
Expand Down
14 changes: 5 additions & 9 deletions reg-lib/cuda/CudaKernelConvolution.cu
Original file line number Diff line number Diff line change
Expand Up @@ -36,12 +36,9 @@ void NiftyReg::Cuda::KernelConvolution(const nifti_image *image,
float *bufferDensityCudaPtr = bufferDensityCuda.data().get();

// Create texture objects
auto imageTexturePtr = Cuda::CreateTextureObject(imageCuda, cudaResourceTypeLinear,
voxelNumber * sizeof(float4), cudaChannelFormatKindFloat, 1);
auto densityTexturePtr = Cuda::CreateTextureObject(densityCudaPtr, cudaResourceTypeLinear,
voxelNumber * sizeof(float), cudaChannelFormatKindFloat, 1);
auto nanImageTexturePtr = Cuda::CreateTextureObject(nanImageCudaPtr, cudaResourceTypeLinear,
voxelNumber * sizeof(bool), cudaChannelFormatKindUnsigned, 1);
auto imageTexturePtr = Cuda::CreateTextureObject(imageCuda, voxelNumber, cudaChannelFormatKindFloat, 1);
auto densityTexturePtr = Cuda::CreateTextureObject(densityCudaPtr, voxelNumber, cudaChannelFormatKindFloat, 1);
auto nanImageTexturePtr = Cuda::CreateTextureObject(nanImageCudaPtr, voxelNumber, cudaChannelFormatKindUnsigned, 1);
auto imageTexture = *imageTexturePtr;
auto densityTexture = *densityTexturePtr;
auto nanImageTexture = *nanImageTexturePtr;
Expand Down Expand Up @@ -138,12 +135,11 @@ void NiftyReg::Cuda::KernelConvolution(const nifti_image *image,
const int imageDim = reinterpret_cast<const int*>(&imageDims)[n];
// Create the kernel texture
thrust::device_vector<float> kernelCuda;
Cuda::UniqueTextureObjectPtr kernelTexturePtr(nullptr, nullptr);
Cuda::UniqueTextureObjectPtr kernelTexturePtr;
cudaTextureObject_t kernelTexture = 0;
if (kernelSum > 0) {
kernelCuda = kernel;
kernelTexturePtr = std::move(Cuda::CreateTextureObject(kernelCuda.data().get(), cudaResourceTypeLinear,
kernel.size() * sizeof(float), cudaChannelFormatKindFloat, 1));
kernelTexturePtr = Cuda::CreateTextureObject(kernelCuda.data().get(), kernel.size(), cudaChannelFormatKindFloat, 1);
kernelTexture = *kernelTexturePtr;
}

Expand Down
6 changes: 2 additions & 4 deletions reg-lib/cuda/CudaNormaliseGradient.cu
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,7 @@
/* *************************************************************** */
template<bool optimiseX, bool optimiseY, bool optimiseZ>
float GetMaximalLength(const float4 *imageCuda, const size_t nVoxels) {
auto imageTexturePtr = Cuda::CreateTextureObject(imageCuda, cudaResourceTypeLinear,
nVoxels * sizeof(float4), cudaChannelFormatKindFloat, 4);
auto imageTexturePtr = Cuda::CreateTextureObject(imageCuda, nVoxels, cudaChannelFormatKindFloat, 4);
auto imageTexture = *imageTexturePtr;
thrust::counting_iterator<unsigned> index(0);
return thrust::transform_reduce(thrust::device, index, index + nVoxels, [=]__device__(const unsigned index) {
Expand Down Expand Up @@ -47,8 +46,7 @@ float NiftyReg::Cuda::GetMaximalLength(const float4 *imageCuda,
/* *************************************************************** */
template<bool optimiseX, bool optimiseY, bool optimiseZ>
void NormaliseGradient(float4 *imageCuda, const size_t nVoxels, const double maxGradLengthInv) {
auto imageTexturePtr = Cuda::CreateTextureObject(imageCuda, cudaResourceTypeLinear,
nVoxels * sizeof(float4), cudaChannelFormatKindFloat, 4);
auto imageTexturePtr = Cuda::CreateTextureObject(imageCuda, nVoxels, cudaChannelFormatKindFloat, 4);
auto imageTexture = *imageTexturePtr;
thrust::for_each_n(thrust::device, thrust::make_counting_iterator<unsigned>(0), nVoxels, [=]__device__(const unsigned index) {
const float4 val = tex1Dfetch<float4>(imageTexture, index);
Expand Down
Loading

0 comments on commit f953b5f

Please sign in to comment.