Avoid GPU allocation in CUDA type-2 NUFFTs #45

Merged 2 commits on Nov 17, 2024
6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -5,6 +5,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## Unreleased
 
+### Changed
+
+- Avoid large GPU allocation in type-2 transforms when using the CUDA backend.
+  The allocation was due to CUDA.jl creating a copy of the input in complex-to-real FFTs
+  (see [CUDA.jl#2249](https://github.com/JuliaGPU/CUDA.jl/issues/2249)).
+
 ## [v0.6.2](https://github.com/jipolanco/NonuniformFFTs.jl/releases/tag/v0.6.1) - 2024-11-04
 
 ### Changed
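For context, the allocation described in the changelog entry above can be reproduced with CUDA.jl alone. The following is a minimal sketch, not part of the PR; the array sizes, the hand-built `plan_brfft` plan and the `CUDA.@time` check are illustrative assumptions:

```julia
using CUDA, CUDA.CUFFT, LinearAlgebra

N = 256
u = CUDA.zeros(Float64, N, N, N)             # real-space output
û = CUDA.zeros(ComplexF64, N ÷ 2 + 1, N, N)  # Fourier coefficients (r2c layout)
plan_bw = plan_brfft(û, N)                   # backward complex-to-real (c2r) plan

# On CUDA.jl versions affected by issue #2249, the c2r `mul!` path first copies û,
# so this reports an extra GPU allocation roughly the size of û:
CUDA.@time mul!(u, plan_bw, û)
```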
14 changes: 14 additions & 0 deletions ext/NonuniformFFTsCUDAExt.jl
@@ -3,6 +3,7 @@ module NonuniformFFTsCUDAExt
 using NonuniformFFTs
 using NonuniformFFTs.Kernels: Kernels
 using CUDA
+using CUDA.CUFFT: CUFFT
 using CUDA: @device_override
 
 # This is currently not wrapped in CUDA.jl, probably because besseli0 is not defined by
@@ -46,4 +47,17 @@ end
 
 NonuniformFFTs.groupsize_interp_gpu_shmem(::CUDABackend) = 64
 
+# Override usual `mul!` to avoid GPU allocations.
+# See https://github.com/JuliaGPU/CUDA.jl/issues/2249
+# This is adapted from https://github.com/JuliaGPU/CUDA.jl/blob/a1db081cbc3d20fa3cb28a9f419b485db03a250f/lib/cufft/fft.jl#L308-L317
+# but without the copy.
+function NonuniformFFTs._fft_c2r!(
+        y::DenseCuArray{T}, p, x::DenseCuArray{Complex{T}},
+    ) where {T}
+    # Perform plan (this may modify not only y, but also the input x)
+    CUFFT.assert_applicable(p, x, y)
+    CUFFT.unsafe_execute_trailing!(p, x, y)
+    y
+end
+
 end
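Assuming the arrays and plan from the earlier sketch, and a NonuniformFFTs build that includes this PR with the CUDA extension loaded, the override can be exercised directly. This is a hedged check rather than intended usage; `_fft_c2r!` is internal API and is normally called from `_type2_fft!`:

```julia
using NonuniformFFTs

# u and û are DenseCuArrays, so this dispatches to the extension method defined above,
# which executes the backward plan without copying û. Note that, as with any c2r
# transform, the input û may be overwritten.
CUDA.@time NonuniformFFTs._fft_c2r!(u, plan_bw, û)
```

Compared to the plain `mul!` call, this should report no large GPU allocation.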
10 changes: 8 additions & 2 deletions src/NonuniformFFTs.jl
@@ -271,12 +271,18 @@ end
 function _type2_fft!(data::RealNUFFTData)
     (; us, ûs, plan_bw,) = data
     for (u, û) ∈ zip(us, ûs)
-        # TODO: can we avoid big GPU allocation on CUDA.jl? (https://github.com/JuliaGPU/CUDA.jl/issues/2249)
-        mul!(u, plan_bw, û) # perform inverse r2c FFT
+        _fft_c2r!(u, plan_bw, û) # perform inverse r2c FFT
     end
     us
 end
 
+# Perform inverse r2c FFT.
+# This function is overridden by the CUDA extension to avoid GPU allocations.
+# See https://github.com/JuliaGPU/CUDA.jl/issues/2249
+function _fft_c2r!(u::AbstractArray{T}, plan_bw, û::AbstractArray{Complex{T}}) where {T}
+    mul!(u, plan_bw, û)
+end
+
 function _type2_fft!(data::ComplexNUFFTData)
     (; us, plan_bw,) = data
     for u ∈ us
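On non-CUDA backends the new `_fft_c2r!` fallback is just the original `mul!` call. A minimal CPU sketch, assuming an FFTW backward plan built by hand (the plan construction and sizes are illustrative, not how NonuniformFFTs sets up its plans internally):

```julia
using FFTW, LinearAlgebra, NonuniformFFTs

N = 64
u = zeros(Float64, N)
û = zeros(ComplexF64, N ÷ 2 + 1)
û[1] = 1                     # keep only the constant (DC) mode
plan_bw = plan_brfft(û, N)   # backward c2r plan (FFTW)

# The generic fallback simply forwards to mul!(u, plan_bw, û):
NonuniformFFTs._fft_c2r!(u, plan_bw, û)
@assert all(≈(1.0), u)       # unnormalised backward transform of the constant mode
```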