Merge pull request #29 from jipolanco/gpu

Initial GPU implementation
jipolanco · Sep 19, 2024 · 0afefcf · 0afefcf
2 parents db05648 + 7c379d1
commit 0afefcf
Show file tree

Hide file tree

Showing 24 changed files with 1,058 additions and 404 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -55,5 +55,6 @@ jobs:
       - uses: codecov/codecov-action@v4
         with:
           files: lcov.info
+          token: ${{ secrets.CODECOV_TOKEN }} # required
 
 # vim: shiftwidth=2
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -0,0 +1,23 @@
+# Changelog
+
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
+and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+
+## Unreleased
+
+### Added
+
+- Add preliminary GPU support.
+
+## [v0.4.1] - 2024-09-14
+
+### Fixed
+
+- AbstractNFFTs interface: fix 1D transforms.
+
+## [v0.4.0] - 2024-09-13
+
+### Added
+
+- Implement [AbstractNFFTs](https://juliamath.github.io/NFFT.jl/stable/abstract/)
+  interface for easier comparison with other NUFFT packages.
diff --git a/Project.toml b/Project.toml
@@ -6,9 +6,12 @@ version = "0.4.1"
 [deps]
 AbstractFFTs = "621f4979-c628-5d54-868e-fcf4e3e8185c"
 AbstractNFFTs = "7f219486-4aa7-41d6-80a7-e08ef20ceed7"
+Atomix = "a9b6321e-bd34-4604-b9c9-b65b8de01458"
 Bessels = "0e736298-9ec6-45e8-9647-e4fc86a2fe38"
 Bumper = "8ce10254-0962-460f-a3d8-1f77fea1446e"
 FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341"
+GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527"
+KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
 Static = "aedffcd0-7271-4cad-89d0-dc628f76c6d3"
@@ -19,9 +22,12 @@ TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
 [compat]
 AbstractFFTs = "1.5.0"
 AbstractNFFTs = "0.8.2"
+Atomix = "0.1.0"
 Bessels = "0.2"
 Bumper = "0.6, 0.7"
 FFTW = "1.7"
+GPUArraysCore = "0.1.6"
+KernelAbstractions = "0.9.25"
 LinearAlgebra = "1.9"
 PrecompileTools = "1.2"
 Static = "0.8, 1"

diff --git a/README.md b/README.md
@@ -7,10 +7,13 @@
 
 Yet another package for computing multidimensional [non-uniform fast Fourier transforms (NUFFTs)](https://en.wikipedia.org/wiki/NUFFT) in Julia.
 
-Like other [existing packages](#differences-with-other-packages), computations
+Like other [existing packages](#differences-with-other-packages), CPU computations
 are parallelised using threads.
 By default, all available Julia threads are used.
 
+Preliminary support for GPUs is also available.
+In principle all kinds of GPU are supported.
+
 ## Basic usage
 
 ### Type-1 (or *adjoint*) NUFFT in one dimension
@@ -27,7 +30,7 @@ xp = rand(T, Np) .* T(2π)  # non-uniform points in [0, 2π]
 vp = randn(T, Np)          # random values at points
 
 # Create plan for data of type T
-plan_nufft = PlanNUFFT(T, N; m = HalfSupport(8))  # larger support increases accuracy
+plan_nufft = PlanNUFFT(T, N; m = HalfSupport(4))  # larger support increases accuracy
 
 # Set non-uniform points
 set_points!(plan_nufft, xp)
@@ -51,7 +54,7 @@ xp = rand(T, Np) .* T(2π)          # non-uniform points in [0, 2π]
 ûs = randn(Complex{T}, N ÷ 2 + 1)  # random values at points (we need to store roughly half the Fourier modes for complex-to-real transform)
 
 # Create plan for data of type T
-plan_nufft = PlanNUFFT(T, N; m = HalfSupport(8))
+plan_nufft = PlanNUFFT(T, N; m = HalfSupport(4))
 
 # Set non-uniform points
 set_points!(plan_nufft, xp)
@@ -80,7 +83,7 @@ xp = [T(2π) * rand(SVector{d, T}) for _ ∈ 1:Np]  # non-uniform points in [0,
 vp = randn(T, Np)                                # random values at points
 
 # Create plan for data of type T
-plan_nufft = PlanNUFFT(T, Ns; m = HalfSupport(8))
+plan_nufft = PlanNUFFT(T, Ns; m = HalfSupport(4))
 
 # Set non-uniform points
 set_points!(plan_nufft, xp)
@@ -127,6 +130,59 @@ exec_type2!(vp, plan_nufft, ûs)
 
 </details>
 
+<details>
+<summary><b>Transforms on the GPU</b></summary>
+
+Below is a GPU version of the multidimensional transform example above.
+The only differences are:
+
+- we import CUDA.jl and Adapt.jl (optional)
+- we pass `backend = CUDABackend()` to `PlanNUFFT` (`CUDABackend` is a [KernelAbstractions backend](https://juliagpu.github.io/KernelAbstractions.jl/stable/#Supported-backends) and is exported by CUDA.jl).
+  The default is `backend = CPU()`.
+- we copy input arrays to the GPU before calling any NUFFT-related functions (`set_points!`, `exec_type1!`, `exec_type2!`)
+
+The example is for an Nvidia GPU (using [CUDA.jl](https://github.com/JuliaGPU/CUDA.jl)), but should also work with e.g. [AMDGPU.jl](https://github.com/JuliaGPU/AMDGPU.jl)
+by simply choosing `backend = ROCBackend()` (this hasn't been tested yet).
+
+```julia
+using NonuniformFFTs
+using StaticArrays: SVector  # for convenience
+using CUDA
+using Adapt: adapt  # optional (see below)
+
+backend = CUDABackend()  # other options are CPU() or ROCBackend() (untested)
+
+Ns = (256, 256)  # number of Fourier modes in each direction
+Np = 1000        # number of non-uniform points
+
+# Generate some non-uniform random data
+T = Float64                                          # non-uniform data is real (can also be complex)
+d = length(Ns)                                       # number of dimensions (d = 2 here)
+xp_cpu = [T(2π) * rand(SVector{d, T}) for _ ∈ 1:Np]  # non-uniform points in [0, 2π]ᵈ
+vp_cpu = randn(T, Np)                                # random values at points
+
+# Copy data to the GPU (using Adapt is optional but it makes code more generic).
+# Note that all data needs to be on the GPU before setting points or executing transforms.
+# We could have also generated the data directly on the GPU.
+xp = adapt(backend, xp_cpu)  # returns a CuArray if backend = CUDABackend
+vp = adapt(backend, vp_cpu)
+
+# Create plan for data of type T
+plan_nufft = PlanNUFFT(T, Ns; m = HalfSupport(4), backend)
+
+# Set non-uniform points
+set_points!(plan_nufft, xp)
+
+# Perform type-1 NUFFT on preallocated output
+ûs = similar(vp, Complex{T}, size(plan_nufft))  # initialises a GPU array for the output
+exec_type1!(ûs, plan_nufft, vp)
+
+# Perform type-2 NUFFT on preallocated output
+exec_type2!(vp, plan_nufft, ûs)
+```
+
+</details>
+
 <details>
 <summary><b>Using the AbstractNFFTs.jl interface</b></summary>
 

diff --git a/docs/src/index.md b/docs/src/index.md
@@ -2,10 +2,13 @@
 
 Yet another package for computing multidimensional [non-uniform fast Fourier transforms (NUFFTs)](https://en.wikipedia.org/wiki/NUFFT) in Julia.
 
-Like other [existing packages](#similar-packages), computations are
+Like other [existing packages](#similar-packages), CPU computations are
 parallelised using threads.
 By default, all available Julia threads are used.
 
+Preliminary support for GPUs is also available.
+In principle all kinds of GPU are supported.
+
 ## Installation
 
 NonuniformFFTs.jl can be simply installed from the Julia REPL with:
@@ -90,7 +93,7 @@ xp = rand(T, Np) .* 2π  # non-uniform points in [0, 2π]
 vp = randn(T, Np)       # random values at points
 
 # Create plan for data of type T
-plan_nufft = PlanNUFFT(T, N; m = HalfSupport(8))  # larger support increases accuracy
+plan_nufft = PlanNUFFT(T, N; m = HalfSupport(4))  # larger support increases accuracy
 
 # Set non-uniform points
 set_points!(plan_nufft, xp)
@@ -115,7 +118,7 @@ xp = rand(T, Np) .* 2π             # non-uniform points in [0, 2π]
 ûs = randn(Complex{T}, N ÷ 2 + 1)  # random values at points (we need to store roughly half the Fourier modes for complex-to-real transform)
 
 # Create plan for data of type T
-plan_nufft = PlanNUFFT(T, N; m = HalfSupport(8))
+plan_nufft = PlanNUFFT(T, N; m = HalfSupport(4))
 
 # Set non-uniform points
 set_points!(plan_nufft, xp)
@@ -141,7 +144,7 @@ xp = [2π * rand(SVector{d, T}) for _ ∈ 1:Np]  # non-uniform points in [0, 2π
 vp = randn(T, Np)                             # random values at points
 
 # Create plan for data of type T
-plan_nufft = PlanNUFFT(T, Ns; m = HalfSupport(8))
+plan_nufft = PlanNUFFT(T, Ns; m = HalfSupport(4))
 
 # Set non-uniform points
 set_points!(plan_nufft, xp)
@@ -185,6 +188,58 @@ vp_interp = map(similar, vp)
 exec_type2!(vp, plan_nufft, ûs)
 ```
 
+## GPU usage
+
+GPUs are preliminary supported.
+
+Below is a GPU version of the multidimensional transform example above.
+The only differences are:
+
+- we import CUDA.jl and Adapt.jl (optional)
+- we pass `backend = CUDABackend()` to `PlanNUFFT` (`CUDABackend` is a [KernelAbstractions backend](https://juliagpu.github.io/KernelAbstractions.jl/stable/#Supported-backends) and is exported by CUDA.jl).
+  The default is `backend = CPU()`.
+- we copy input arrays to the GPU before calling any NUFFT-related functions (`set_points!`, `exec_type1!`, `exec_type2!`)
+
+The example is for an Nvidia GPU (using [CUDA.jl](https://github.com/JuliaGPU/CUDA.jl)), but should also work with e.g. [AMDGPU.jl](https://github.com/JuliaGPU/AMDGPU.jl)
+by simply choosing `backend = ROCBackend()` (this hasn't been tested yet).
+
+```julia
+using NonuniformFFTs
+using StaticArrays: SVector  # for convenience
+using CUDA
+using Adapt: adapt  # optional (see below)
+
+backend = CUDABackend()  # other options are CPU() or ROCBackend() (untested)
+
+Ns = (256, 256)  # number of Fourier modes in each direction
+Np = 1000        # number of non-uniform points
+
+# Generate some non-uniform random data
+T = Float64                                          # non-uniform data is real (can also be complex)
+d = length(Ns)                                       # number of dimensions (d = 2 here)
+xp_cpu = [T(2π) * rand(SVector{d, T}) for _ ∈ 1:Np]  # non-uniform points in [0, 2π]ᵈ
+vp_cpu = randn(T, Np)                                # random values at points
+
+# Copy data to the GPU (using Adapt is optional but it makes code more generic).
+# Note that all data needs to be on the GPU before setting points or executing transforms.
+# We could have also generated the data directly on the GPU.
+xp = adapt(backend, xp_cpu)  # returns a CuArray if backend = CUDABackend
+vp = adapt(backend, vp_cpu)
+
+# Create plan for data of type T
+plan_nufft = PlanNUFFT(T, Ns; m = HalfSupport(4), backend)
+
+# Set non-uniform points
+set_points!(plan_nufft, xp)
+
+# Perform type-1 NUFFT on preallocated output
+ûs = similar(vp, Complex{T}, size(plan_nufft))  # initialises a GPU array for the output
+exec_type1!(ûs, plan_nufft, vp)
+
+# Perform type-2 NUFFT on preallocated output
+exec_type2!(vp, plan_nufft, ûs)
+```
+
 ## [AbstractNFFTs.jl interface](@id AbstractNFFTs-interface)
 
 This package also implements the [AbstractNFFTs.jl](https://juliamath.github.io/NFFT.jl/stable/abstract/)

diff --git a/src/Kernels/Kernels.jl b/src/Kernels/Kernels.jl
@@ -1,5 +1,7 @@
 module Kernels
 
+using KernelAbstractions: KernelAbstractions as KA
+
 export HalfSupport
 
 struct HalfSupport{M} end
@@ -62,9 +64,7 @@ function init_fourier_coefficients!(g::AbstractKernelData, ks::AbstractVector)
     Nk == length(gk) && return gk  # assume coefficients were already computed
     resize!(gk, Nk)
     @assert eachindex(gk) == eachindex(ks)
-    @inbounds for (i, k) ∈ pairs(ks)
-        gk[i] = evaluate_fourier(g, k)
-    end
+    evaluate_fourier!(g, gk, ks)
     gk
 end
 
@@ -79,16 +79,23 @@ end
     i
 end
 
-@inline function evaluate_kernel(g::AbstractKernelData, x₀)
-    dx = gridstep(g)
-    i = point_to_cell(x₀, dx)
-    evaluate_kernel(g, x₀, i)
+# Note: evaluate_kernel_func generates a function which is callable from GPU kernels.
+# (Directly passing an AbstractKernelData to a GPU kernel fails, at least with CUDA).
+@inline evaluate_kernel(g::AbstractKernelData, x₀) = evaluate_kernel_func(g)(x₀)
+
+@inline function kernel_indices(i, ::AbstractKernelData{K, M}, args...) where {K, M}
+    kernel_indices(i, HalfSupport(M), args...)
+end
+
+# Returns a function which is callable from GPU kernels.
+function kernel_indices_func(::AbstractKernelData{K, M}) where {K, M}
+    @inline (i, args...) -> kernel_indices(i, HalfSupport(M), args...)
 end
 
 # Takes into account periodic wrapping.
 # This is equivalent to calling mod1(j, N) for each j, but much much faster.
 # We assume the central index `i` is in 1:N and that M < N / 2.
-function kernel_indices(i, ::AbstractKernelData{K, M}, N::Integer) where {K, M}
+function kernel_indices(i, ::HalfSupport{M}, N::Integer) where {M}
     L = 2M
     j = i - M
     j = ifelse(j < 0, j + N, j)
@@ -102,7 +109,7 @@ end
 
 # This variant can be used when periodic wrapping is not needed.
 # (Used when doing block partitioning for parallelisation using threads.)
-function kernel_indices(i, ::AbstractKernelData{K, M}) where {K, M}
+function kernel_indices(i, ::HalfSupport{M}) where {M}
     (i - M + 1):(i + M)
 end
 

diff --git a/src/Kernels/bspline.jl b/src/Kernels/bspline.jl
@@ -44,16 +44,19 @@ be equal to the grid step ``Δx``.
 This means that the resulting variance of the B-spline kernels is fixed to
 ``σ^2 = (n / 12) Δt^2 = (M / 6) Δx^2``.
 """
-struct BSplineKernelData{M, T <: AbstractFloat} <: AbstractKernelData{BSplineKernel, M, T}
+struct BSplineKernelData{
+        M, T <: AbstractFloat,
+        FourierCoefs <: AbstractVector{T},
+    } <: AbstractKernelData{BSplineKernel, M, T}
     σ  :: T
     Δt :: T          # knot separation
-    gk :: Vector{T}  # values in uniform Fourier grid
-    function BSplineKernelData{M}(Δx::Real) where {M}
+    gk :: FourierCoefs  # values in uniform Fourier grid
+    function BSplineKernelData{M}(backend::KA.Backend, Δx::Real) where {M}
         Δt = Δx
         σ = sqrt(M / 6) * Δt
         T = eltype(Δt)
-        gk = Vector{T}(undef, 0)
-        new{M, T}(T(σ), Δt, gk)
+        gk = KA.allocate(backend, T, 0)
+        new{M, T, typeof(gk)}(T(σ), Δt, gk)
     end
 end
 
@@ -66,8 +69,8 @@ function Base.show(io::IO, ::BSplineKernelData{M}) where {M}
 end
 
 # Here we ignore the oversampling factor, this kernel is not very adjustable...
-optimal_kernel(::BSplineKernel, h::HalfSupport, Δx, σ) =
-    BSplineKernelData(h, Δx)
+optimal_kernel(::BSplineKernel, h::HalfSupport, Δx, σ; backend) =
+    BSplineKernelData(h, backend, Δx)
 
 """
     order(::BSplineKernelData{M})
@@ -78,23 +81,30 @@ Note: the polynomial degree is `n - 1`.
 """
 order(::BSplineKernelData{M}) where {M} = 2M
 
-function evaluate_kernel(g::BSplineKernelData{M}, x, i::Integer) where {M}
+function evaluate_kernel_func(g::BSplineKernelData{M}) where {M}
     # The integral of a single B-spline, using its standard definition, is Δt.
     # This can be shown using the partition of unity property.
     (; Δt,) = g
-    x′ = i - (x / Δt)  # normalised coordinate, 0 < x′ ≤ 1 (this assumes Δx = Δt)
-    # @assert 0 ≤ x′ ≤ 1
-    k = 2M  # B-spline order
-    values = bsplines_evaluate_all(x′, Val(k), typeof(Δt))
-    (; i, values,)
+    function (x)
+        i = point_to_cell(x, Δt)  # assume Δx = Δt
+        x′ = i - (x / Δt)  # normalised coordinate, 0 < x′ ≤ 1 (this assumes Δx = Δt)
+        # @assert 0 ≤ x′ ≤ 1
+        k = 2M  # B-spline order
+        values = bsplines_evaluate_all(x′, Val(k), typeof(Δt))
+        (; i, values,)
+    end
 end
 
-function evaluate_fourier(g::BSplineKernelData{M}, k) where {M}
+# This should work on CPU and GPU.
+function evaluate_fourier!(g::BSplineKernelData{M}, gk::AbstractVector, ks::AbstractVector) where {M}
     (; Δt,) = g
-    kh = k * Δt / 2
-    s = sin(kh) / kh
-    n = 2M
-    ifelse(iszero(k), one(s), s^n) * Δt
+    @assert eachindex(gk) == eachindex(ks)
+    map!(gk, ks) do k
+        kh = k * Δt / 2
+        s = sin(kh) / kh
+        n = 2M
+        ifelse(iszero(k), one(s), s^n) * Δt
+    end
 end
 
 # Adapted from BSplineKit.jl.