Use direct evaluation of kernel functions on GPU (#39)

* Start work on shared-memory implementation * WIP * Interpolation now works (in 3D, ntransforms = 1) * Minor changes * Optimisations * Further optimisations * Generalise interpolation to all dimensions * Use get_inds_vals_gpu in spreading * Spreading with shmem works but is slow Atomic add on shared memory is very slow. Is it Atomix's fault? * SM kernels now work with KA CPU backends * Test SM kernels on CPU * Fix kernel compilation on CUDA * Reorganise some code * [WIP] avoid atomics in shared-memory arrays * Avoid atomic operations on shared memory * Minor improvement * Make window_vals a matrix * Implement hybrid parallelisation in SM spreading Much faster! * More optimisations * Try to fix tests (on CPU) * Shared memory array can be complex * Update interpolation based on spreading changes * Simplify setting workgroupsize * Minor changes * Fix CPU tests * Remove unused functions * Add tests for multiple transforms * Simplify atomic adds with complex data Doesn't change performance. * point_to_cell now also returns x/Δx * Remove direct evaluation functions (for now) * Update CHANGELOG [skip ci] * Add documentation * Update docs and comments * Define direct evaluation of KB kernels * Fix direct evaluation * Minor optimisations * Use direct evaluation in GPU kernels It is the same or faster than polynomial approximation (and more accurate!). Especially for shared-memory interpolation, it seems to improve performance by a lot. * Avoid division by zero in BKB kernel * Simplify window evaluation Doesn't really affect performance, on GPU at least. * Add comment on besseli0 * Add empty CUDA extension * KB kernel: call CUDA version of besseli0 * Update comment * Define direct evaluation for GaussianKernel * Define direct evaluation for BSplineKernel * Allow different default kernel per backend * Update comments * Add tests * Update CHANGELOG * Remove old comment * Update tests
jipolanco · Oct 25, 2024 · dd4d842 · dd4d842
1 parent ae8f012
commit dd4d842
Show file tree

Hide file tree

Showing 18 changed files with 172 additions and 23 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -13,12 +13,20 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Changed
 
-- [BREAKING] Change default precision of transforms.
+- **BREAKING**: Change default precision of transforms.
   By default, transforms on `Float64` or `ComplexF64` now have a relative precision of the order of $10^{-7}$.
   This corresponds to setting `m = HalfSupport(4)` and oversampling factor `σ = 2.0`.
   Previously, the default was `m = HalfSupport(8)` and `σ = 2.0`, corresponding
   to a relative precision of the order of $10^{-14}$.
 
+- On GPUs, we now use direct evaluation of kernel functions (e.g.
+  Kaiser-Bessel) instead of polynomial approximations, as this seems to be
+  faster and uses far fewer GPU registers.
+
+- On CUDA, the default kernel is now `KaiserBesselKernel` instead of `BackwardsKaiserBesselKernel`.
+  The former seems to be a bit faster when using Bessel functions defined in CUDA.
+  Accuracy doesn't change much since both kernels have similar precisions.
+
 ## [v0.5.6](https://github.com/jipolanco/NonuniformFFTs.jl/releases/tag/v0.5.6) - 2024-10-17
 
 ### Changed

diff --git a/Project.toml b/Project.toml
@@ -20,13 +20,20 @@ StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
 StructArrays = "09ab397b-f2b6-538f-b94a-2f83cf4a842a"
 TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
 
+[weakdeps]
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+
+[extensions]
+NonuniformFFTsCUDAExt = ["CUDA"]
+
 [compat]
 AbstractFFTs = "1.5.0"
 AbstractNFFTs = "0.8.2"
 Adapt = "4.0.4"
 Atomix = "0.1.0"
 Bessels = "0.2"
 Bumper = "0.6, 0.7"
+CUDA = "5.5"
 FFTW = "1.7"
 GPUArraysCore = "0.1.6, 0.2"
 KernelAbstractions = "0.9.25"

diff --git a/ext/NonuniformFFTsCUDAExt.jl b/ext/NonuniformFFTsCUDAExt.jl
@@ -0,0 +1,21 @@
+module NonuniformFFTsCUDAExt
+
+using NonuniformFFTs
+using NonuniformFFTs.Kernels: Kernels
+using CUDA
+using CUDA: @device_override
+
+# This is currently not wrapped in CUDA.jl, probably because besseli0 is not defined by
+# SpecialFunctions.jl either (the more general besseli is defined though).
+# See:
+#  - https://docs.nvidia.com/cuda/cuda-math-api/index.html for available functions
+#  - https://github.com/JuliaGPU/CUDA.jl/blob/master/ext/SpecialFunctionsExt.jl for functions wrapped in CUDA.jl
+@device_override Kernels._besseli0(x::Float64) = ccall("extern __nv_cyl_bessel_i0", llvmcall, Cdouble, (Cdouble,), x)
+@device_override Kernels._besseli0(x::Float32) = ccall("extern __nv_cyl_bessel_i0f", llvmcall, Cfloat, (Cfloat,), x)
+
+# Set KaiserBesselKernel as default backend on CUDA.
+# It's slightly faster than BackwardsKaiserBesselKernel when using Bessel functions from CUDA (wrapped above).
+# The difference is not huge though.
+NonuniformFFTs.default_kernel(::CUDABackend) = KaiserBesselKernel()
+
+end
diff --git a/src/Kernels/Kernels.jl b/src/Kernels/Kernels.jl
@@ -85,6 +85,13 @@ end
 # Note: evaluate_kernel_func generates a function which is callable from GPU kernels.
 @inline evaluate_kernel(g::AbstractKernelData, x₀) = evaluate_kernel_func(g)(x₀)
 
+@inline function evaluate_kernel_direct(g::AbstractKernelData, x)
+    Δx = gridstep(g)
+    i, r = point_to_cell(x, Δx)
+    values = _evaluate_kernel_direct(g, i, r)
+    (; i, values,)
+end
+
 @inline function kernel_indices(i, ::AbstractKernelData{K, M}, args...) where {K, M}
     kernel_indices(i, HalfSupport(M), args...)
 end

diff --git a/src/Kernels/bspline.jl b/src/Kernels/bspline.jl
@@ -107,6 +107,14 @@ function evaluate_kernel_func(g::BSplineKernelData{M}) where {M}
     end
 end
 
+function _evaluate_kernel_direct(
+        ::BSplineKernelData{M}, i::Integer, r::T,
+    ) where {M, T}
+    x′ = i - r  # normalised coordinate, 0 < x′ ≤ 1
+    k = 2M  # B-spline order
+    bsplines_evaluate_all(x′, Val(k), T)
+end
+
 function evaluate_fourier_func(g::BSplineKernelData{M}) where {M}
     (; Δt,) = g
     function (k)

diff --git a/src/Kernels/gaussian.jl b/src/Kernels/gaussian.jl
@@ -131,6 +131,19 @@ function evaluate_kernel_func(g::GaussianKernelData{M}) where {M}
     end
 end
 
+# Note: direct evaluation seems to be faster than Gaussian gridding on GPU (tested on A100).
+@inline function _evaluate_kernel_direct(
+        g::GaussianKernelData{M}, i::Integer, r::T,
+    ) where {M, T}
+    (; τ, Δx,) = g
+    X = r - T(i - 1)  # = (x - x[i]) / Δx = x / Δx - (i - 1)
+    # @assert 0 ≤ X < 1
+    js = SVector(ntuple(identity, Val(2M)))
+    ys = @. T(M - js + X) * Δx
+    vals = @. exp(-ys^2 / τ)
+    Tuple(vals)
+end
+
 @inline function gaussian_gridding(a, b, cs::NTuple{M}) where {M}
     if @generated
         v₀ = Symbol(:v_, M)

diff --git a/src/Kernels/kaiser_bessel.jl b/src/Kernels/kaiser_bessel.jl
@@ -1,10 +1,10 @@
 export KaiserBesselKernel
 
-using Bessels: besseli0, besseli1
+using Bessels: Bessels
 
 # This is the equivalent variance of a KB window with width w = 1.
 # Should be compared to the variance of a Gaussian window.
-kb_equivalent_variance(β) = besseli0(β) / (β * besseli1(β))
+kb_equivalent_variance(β) = Bessels.besseli0(β) / (β * Bessels.besseli1(β))
 
 @doc raw"""
     KaiserBesselKernel <: AbstractKernel
@@ -118,7 +118,7 @@ struct KaiserBesselKernelData{
         gk = KA.allocate(backend, T, 0)
         Npoly = M + 4  # degree of polynomial is d = Npoly - 1
         cs = solve_piecewise_polynomial_coefficients(T, Val(M), Val(Npoly)) do x
-            besseli0(β * sqrt(1 - x^2))
+            Bessels.besseli0(β * sqrt(1 - x^2))
         end
         KaiserBesselKernelData{M}(Δx, σ, w, β, β², cs, gk)
     end
@@ -166,9 +166,33 @@ function evaluate_kernel_func(g::KaiserBesselKernelData{M, T}) where {M, T}
     (; Δx, cs,) = g
     function (x)
         i, r = point_to_cell(x, Δx)
-        X = (r - T(i - 1)) / M  # source position relative to xs[i]
-        # @assert 0 ≤ X < 1 / M
+        X = r - T(i - 1)  # = (x - x[i]) / Δx = x / Δx - (i - 1)
+        # @assert 0 ≤ X < 1
         values = evaluate_piecewise(X, cs)
         (; i, values,)
     end
 end
+
+# Not sure Bessels.besseli0 is optimised/overridden for GPUs. It works but I'm not sure it's
+# optimal, since it uses the implementation in Bessels.jl which is not made for
+# GPUs. An alternative would be to use functions from SpecialFunctions.jl, for which CUDA.jl
+# redirects to CUDA functions. However, CUDA.jl currently doesn't wrap the cyl_bessel_i0
+# function needed here (and also, SpecialFunctions doesn't provide a besseli0 function, but
+# only a besseli function for arbitrary order).
+# So, on CUDA, we use a package extension (ext/NonuniformFFTsCUDAExt.jl) to override this
+# function, and call the CUDA-defined version instead.
+_besseli0(x) = Bessels.besseli0(x)
+
+@inline function _evaluate_kernel_direct(
+        g::KaiserBesselKernelData{M, T}, i::Integer, r::T,
+    ) where {M, T}
+    (; β,) = g
+    X = r - T(i - 1)  # = (x - x[i]) / Δx = x / Δx - (i - 1)
+    # @assert 0 ≤ X < 1
+    js = SVector(ntuple(identity, Val(2M)))
+    ys = @. T(M - js + X) / M
+    zs = @. 1 - ys^2
+    s = @fastmath sqrt.(zs)  # the @fastmath avoids checking that z ≥ 0, returns NaN otherwise
+    vals = _besseli0.(β * s)
+    Tuple(vals)
+end
diff --git a/src/Kernels/kaiser_bessel_backwards.jl b/src/Kernels/kaiser_bessel_backwards.jl
@@ -137,9 +137,28 @@ function evaluate_kernel_func(g::BackwardsKaiserBesselKernelData{M, T}) where {M
     (; Δx, cs,) = g
     function (x)
         i, r = point_to_cell(x, Δx)  # r = x / Δx
-        X = (r - T(i - 1)) / M  # source position relative to xs[i]
-        # @assert 0 ≤ X < 1 / M
+        X = r - T(i - 1)  # = (x - x[i]) / Δx = x / Δx - (i - 1)
+        # @assert 0 ≤ X < 1
         values = evaluate_piecewise(X, cs)
         (; i, values,)
     end
 end
+
+@inline function _evaluate_kernel_direct(
+        g::BackwardsKaiserBesselKernelData{M, T}, i::Integer, r::T,
+    ) where {M, T}
+    (; β,) = g
+    X = r - T(i - 1)  # = (x - x[i]) / Δx = x / Δx - (i - 1)
+    # @assert 0 ≤ X < 1
+    js = SVector(ntuple(identity, Val(2M)))
+    ys = @. T(M - js + X) / M
+    zs = @. 1 - ys^2
+    s = @fastmath sqrt.(zs)  # the @fastmath avoids checking that z ≥ 0, returns NaN otherwise
+    vals = map(s) do s
+        @inline
+        # In rare cases `s` can be zero (if the point x is on the grid, and thus r = 0).
+        # The kernel is well defined at s = 0 since sinh(x)/x → 1 when x → 0 (same as sinc).
+        ifelse(iszero(s), one(s), sinh(β * s) / (β * s)) * (β / T(π))
+    end
+    Tuple(vals)
+end
diff --git a/src/Kernels/piecewise_polynomial.jl b/src/Kernels/piecewise_polynomial.jl
@@ -74,9 +74,8 @@ function solve_piecewise_polynomial_coefficients(
 end
 
 function evaluate_piecewise(δ, cs::NTuple)
-    L = length(first(cs))
-    # @assert 0 ≤ δ < 2/L == 1/M
-    x = L * δ - 1  # in [-1, 1]
+    # @assert 0 ≤ δ < 1
+    x = 2δ - 1  # in [-1, 1]
     evaluate_horner(x, cs)
 end
 

diff --git a/src/NonuniformFFTs.jl b/src/NonuniformFFTs.jl
@@ -45,7 +45,8 @@ export
     exec_type1!,
     exec_type2!
 
-default_kernel() = BackwardsKaiserBesselKernel()
+default_kernel(::KA.Backend) = BackwardsKaiserBesselKernel()
+default_kernel() = default_kernel(CPU())
 
 default_block_size(::Dims, ::CPU) = 4096  # in number of linear elements
 default_block_size(::Dims, ::GPU) = 1024  # except in 2D and 3D (see below)

diff --git a/src/abstractNFFTs.jl b/src/abstractNFFTs.jl
@@ -54,7 +54,7 @@ function AbstractNFFTs.nodes!(p::PlanNUFFT{Complex{T}}, xp::Matrix{T}) where {T
     invoke(AbstractNFFTs.nodes!, Tuple{PlanNUFFT, AbstractMatrix{T}}, p, xp)
 end
 
-Base.@constprop :aggressive function convert_window_function(w::Symbol)
+Base.@constprop :aggressive function convert_window_function(w::Symbol, backend)
     if w === :gauss
         GaussianKernel()
     elseif w === :spline
@@ -67,7 +67,7 @@ Base.@constprop :aggressive function convert_window_function(w::Symbol)
     elseif w === :kaiser_bessel
         BackwardsKaiserBesselKernel()
     else
-        default_kernel()
+        default_kernel(backend)
     end
 end
 
@@ -81,7 +81,7 @@ end
 Base.@constprop :aggressive function PlanNUFFT(
         xp::AbstractMatrix{Tr}, Ns::Dims;
         fftflags = FFTW.ESTIMATE, blocking = true, sortNodes = false,
-        window = default_kernel(),
+        window = default_kernel(KA.get_backend(xp)),
         fftshift = true,  # for compatibility with NFFT.jl
         synchronise = false,
         kwargs...,
@@ -93,7 +93,7 @@ Base.@constprop :aggressive function PlanNUFFT(
     backend = KA.get_backend(xp)  # e.g. use GPU backend if xp is a GPU array
     sort_points = sortNodes ? True() : False()  # this is type-unstable (unless constant propagation happens)
     block_size = blocking ? default_block_size(Ns, backend) : nothing  # also type-unstable
-    kernel = window isa AbstractKernel ? window : convert_window_function(window)
+    kernel = window isa AbstractKernel ? window : convert_window_function(window, backend)
     p = PlanNUFFT(
         Complex{Tr}, Ns, HalfSupport(m);
         backend, σ = Tr(σ), sort_points, fftshift, block_size,

diff --git a/src/gpu_common.jl b/src/gpu_common.jl
@@ -81,7 +81,7 @@ end
 
 @inline function get_inds_vals_gpu(g::AbstractKernelData, points::AbstractVector, N::Integer, j::Integer)
     x = @inbounds points[j]
-    gdata = Kernels.evaluate_kernel(g, x)
+    gdata = Kernels.evaluate_kernel_direct(g, x)
     vals = gdata.values    # kernel values
     M = Kernels.half_support(g)
     i₀ = gdata.i - M  # active region is (i₀ + 1):(i₀ + 2M) (up to periodic wrapping)

diff --git a/src/interpolation/gpu.jl b/src/interpolation/gpu.jl
@@ -256,7 +256,7 @@ end
             indvals = ntuple(Val(D)) do d
                 @inline
                 x = @inbounds points[d][j]
-                gdata = Kernels.evaluate_kernel(gs[d], x)
+                gdata = Kernels.evaluate_kernel_direct(gs[d], x)
                 local i₀ = gdata.i - ishifts_sm[d]
                 local vals = gdata.values    # kernel values
                 # @assert i₀ ≥ 0

diff --git a/src/plan.jl b/src/plan.jl
@@ -413,11 +413,12 @@ end
 function PlanNUFFT(
         ::Type{T}, Ns::Dims, h::HalfSupport;
         ntransforms = Val(1),
-        kernel::AbstractKernel = default_kernel(),
+        backend = CPU(),
+        kernel::AbstractKernel = default_kernel(backend),
         σ::Real = real(T)(2), kws...,
     ) where {T <: Number}
     R = real(T)
-    _PlanNUFFT(T, kernel, h, R(σ), Ns, to_static(ntransforms); kws...)
+    _PlanNUFFT(T, kernel, h, R(σ), Ns, to_static(ntransforms); backend, kws...)
 end
 
 @inline to_static(ntrans::Val) = ntrans

diff --git a/src/spreading/gpu.jl b/src/spreading/gpu.jl
@@ -318,7 +318,7 @@ end
                 p, d = Tuple(inds[n])
                 g = gs[d]
                 x = points_sm[d, p]
-                gdata = Kernels.evaluate_kernel(g, x)
+                gdata = Kernels.evaluate_kernel_direct(g, x)
                 ishift = ishifts_sm[d]
                 inds_start[d, p] = gdata.i - ishift
                 local vals = gdata.values

diff --git a/test/approx_window_functions.jl b/test/approx_window_functions.jl
@@ -0,0 +1,35 @@
+# Test direct ("exact") and approximate evaluation of window functions.
+# This is particularly useful for testing approximations based on piecewise polynomials.
+
+using NonuniformFFTs
+using NonuniformFFTs.Kernels
+using StaticArrays
+using Test
+
+function test_kernel(kernel; σ = 1.5, m = HalfSupport(4), N = 256)
+    backend = CPU()
+    Δx = 2π / N
+    g = Kernels.optimal_kernel(kernel, m, Δx, σ; backend)
+    xs = range(0.8, 2.2; length = 1000) .* Δx
+
+    for x ∈ xs
+        a = Kernels.evaluate_kernel(g, x)
+        b = Kernels.evaluate_kernel_direct(g, x)
+        @test a.i == b.i  # same bin
+        @test SVector(a.values) ≈ SVector(b.values) rtol=1e-7  # TODO: rtol should depend on (M, σ, kernel)
+    end
+
+    nothing
+end
+
+@testset "Kernel approximations" begin
+    kernels = (
+        BSplineKernel(),
+        GaussianKernel(),
+        KaiserBesselKernel(),
+        BackwardsKaiserBesselKernel(),
+    )
+    @testset "$(nameof(typeof(kernel)))" for kernel ∈ kernels
+        test_kernel(kernel)
+    end
+end
diff --git a/test/pseudo_gpu.jl b/test/pseudo_gpu.jl
@@ -103,8 +103,13 @@ function compare_with_cpu(::Type{T}, dims; Np = prod(dims), ntransforms::Val{Nc}
     r_gpu = run_plan(p_gpu, xp_init, vp_init)
 
     for c ∈ 1:Nc
-        @test r_cpu.us[c] ≈ r_gpu.us[c]  # output of type-1 transform
-        @test r_cpu.wp[c] ≈ r_gpu.wp[c]  # output of type-2 transform
+        # The differences of the order of 1e-7 (= roughly the expected accuracy given the
+        # chosen parameters) are explained by the fact that the CPU uses a polynomial
+        # approximation of the KB kernel, while the GPU evaluates it "exactly" from its
+        # definition (based on Bessel functions for KB).
+        rtol = Tr === Float64 ? 1e-7 : Tr === Float32 ? 1f-5 : nothing
+        @test r_cpu.us[c] ≈ r_gpu.us[c] rtol=rtol  # output of type-1 transform
+        @test r_cpu.wp[c] ≈ r_gpu.wp[c] rtol=rtol  # output of type-2 transform
     end
 
     nothing

diff --git a/test/runtests.jl b/test/runtests.jl
@@ -20,6 +20,7 @@ end
 
 @testset "NonuniformFFTs.jl" begin
     @includetest "errors.jl"
+    @includetest "approx_window_functions.jl"
     @includetest "accuracy.jl"
     @includetest "multidimensional.jl"
     @includetest "pseudo_gpu.jl"