From 11e566e62c4cb8ee3869ba5eb8b123000aab2f72 Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Thu, 19 Sep 2024 16:32:06 +0200 Subject: [PATCH 01/10] Adapt Hilbert sorting from BijectiveHilbert.jl --- src/NonuniformFFTs.jl | 1 + src/sorting_hilbert.jl | 135 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 136 insertions(+) create mode 100644 src/sorting_hilbert.jl diff --git a/src/NonuniformFFTs.jl b/src/NonuniformFFTs.jl index 1ce94dc..2d430bb 100644 --- a/src/NonuniformFFTs.jl +++ b/src/NonuniformFFTs.jl @@ -64,6 +64,7 @@ end default_workgroupsize(::GPU, ndrange::Dims{1}) = (min(64, ndrange[1]),) include("sorting.jl") +include("sorting_hilbert.jl") include("blocking.jl") include("plan.jl") include("set_points.jl") diff --git a/src/sorting_hilbert.jl b/src/sorting_hilbert.jl new file mode 100644 index 0000000..e519c80 --- /dev/null +++ b/src/sorting_hilbert.jl @@ -0,0 +1,135 @@ +# Code adapted from the BijectiveHilbert.jl package by Andrew Dolgert (MIT license). +# +# Hilbert sorting is a method for sorting a set of points in n-dimensional space so that +# points which are physically close are also close in memory. See for instance: +# +# - https://en.wikipedia.org/wiki/Hilbert_curve +# - https://charlesreid1.com/wiki/Hilbert_Sort +# - https://doc.cgal.org/latest/Spatial_sorting/index.html +# - https://computingkitchen.com/BijectiveHilbert.jl/stable/hilbert/ +# +# We define a GlobalGrayStatic{T,B} type adapted from the BijectiveHilbert.GlobalGray{T} +# type. The differences are that GlobalGrayStatic contains the number of bits `B` as a +# static parameter, and that it simply does not include the number +# of dimensions `n`. Instead, `n` is inferred from the input to `encode_hilbert_zero`, which +# should be a (statically-sized) `Tuple` instead of a `Vector` as in BijectiveHilbert.jl. +# +# Moreover, functions for computing the Hilbert index have been adapted for working with +# tuples and have no allocations. The performance is greatly improved compared to the +# original GlobalGray type. Our code can also be called from GPU kernels, since it only uses +# tuples and MVectors. +# +# We only implement encoding as we currently don't need decoding. +# +# See also https://computingkitchen.com/BijectiveHilbert.jl/stable/globalgray/#Global-Gray +# for some details on the algorithm. + +""" + GlobalGrayStatic(T <: Unsigned, B::Int) + GlobalGrayStatic(B::Int, N::Int) + +Hilbert sorting algorithm adapted from the `GlobalGray` algorithm in BijectiveHilbert.jl. + +Here `T <: Unsigned` is the type of the Hilbert index and `B` is the required number of +bits per dimension. For `B` bits, the algorithm creates a grid of size ``2^B`` in each +direction. + +The second constructor chooses the smallest unsigned type `T` based on the required number +of bits `B` and the space dimension `N`. If they are constants, they will likely be +constant-propagated so that `T` is inferred by the compiler. + +The total number of bits required is `nbits = B * N`. For instance, if `N = 3` and `B = 4` +(corresponding to a grid of dimensions ``(2^4)^3 = 16^3``), then `nbits = 12` and the +smallest possible type `T` is `UInt16`, whose size is `8 * sizeof(UInt16) = 16` bits (as its +name suggests!). 
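+
+A minimal usage sketch following the example above (the grid size and index type are the
+same as in that example; the call is illustrative, not verified output):
+
+    alg = GlobalGrayStatic(4, 3)          # B = 4 bits per dimension, N = 3 dimensions
+    eltype(alg) === UInt16                # smallest unsigned type holding 4 * 3 = 12 bits
+    encode_hilbert_zero(alg, (0, 2, 15))  # Hilbert index of the zero-based cell (0, 2, 15)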
+""" +struct GlobalGrayStatic{T, B} end + +Base.eltype(::GlobalGrayStatic{T}) where {T} = T + +GlobalGrayStatic(::Type{T}, B::Int) where {T} = GlobalGrayStatic{T, B}() + +Base.@constprop :aggressive function GlobalGrayStatic(B::Int, N::Int) + nbits = B * N # minimal number of bits needed + T = large_enough_unsigned_static(Val(nbits)) + GlobalGrayStatic{T, B}() +end + +@inline function large_enough_unsigned_static(::Val{nbits}) where {nbits} + if @generated + unsigned_types = (UInt8, UInt16, UInt32, UInt64, UInt128) + for T ∈ unsigned_types + if 8 * sizeof(T) ≥ nbits + return :($T) + end + end + :(nothing) + else + unsigned_types = (UInt8, UInt16, UInt32, UInt64, UInt128) + for T ∈ unsigned_types + if 8 * sizeof(T) ≥ nbits + return T + end + end + nothing + end +end + +""" + encode_hilbert_zero(algorithm::GlobalGrayStatic{T}, X::NTuple{N, <:Integer}) -> T + +Return Hilbert index associated to location `X` in `N`-dimensional space. +""" +function encode_hilbert_zero(::GlobalGrayStatic{T, B}, X::NTuple) where {T, B} + X = axes_to_transpose(X, Val(B)) + interleave_transpose(T, X, Val(B))::T +end + +function axes_to_transpose(X_in::NTuple{N,T}, ::Val{b}) where {N, b, T <: Integer} + X = MVector(X_in) + M = one(T) << (b - 1) + # Inverse undo + Q = M + @inbounds while Q > one(T) + P = Q - one(T) + for io ∈ 1:N + if !iszero(X[io] & Q) + X[1] ⊻= P + else + t = (X[1] ⊻ X[io]) & P + X[1] ⊻= t + X[io] ⊻= t + end + end + Q >>= one(Q) + end + # Gray encode + for jo ∈ 2:N + @inbounds X[jo] ⊻= X[jo - 1] + end + t2 = zero(T) + Q = M + @inbounds while Q > one(T) + if !iszero(X[N] & Q) + t2 ⊻= (Q - one(T)) + end + Q >>= one(T) + end + for ko ∈ eachindex(X) + @inbounds X[ko] ⊻= t2 + end + Tuple(X) +end + +function interleave_transpose(::Type{T}, x::NTuple, ::Val{b}) where {T, b} + N = length(x) + S = eltype(x) + h = zero(T) + @inbounds for i ∈ 0:(b - 1) + for d ∈ eachindex(x) + ith_bit = Bool((x[d] & (one(S) << i)) >> i) # 0 or 1 + h |= T(ith_bit << (i * N + N - d)) + end + end + h +end From e7e05e798fea8cf388ad344b9bda88f07156de07 Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Fri, 20 Sep 2024 09:32:18 +0200 Subject: [PATCH 02/10] Implement BlockDataGPU --- src/NonuniformFFTs.jl | 12 +- src/blocking.jl | 221 ++++++++++++++++++++++++++-- src/interpolation/cpu_blocked.jl | 5 +- src/interpolation/cpu_nonblocked.jl | 1 + src/interpolation/gpu.jl | 1 + src/plan.jl | 21 ++- src/set_points.jl | 2 +- src/sorting_hilbert.jl | 13 +- src/spreading/cpu_blocked.jl | 4 +- src/spreading/cpu_nonblocked.jl | 1 + src/spreading/gpu.jl | 1 + 11 files changed, 243 insertions(+), 39 deletions(-) diff --git a/src/NonuniformFFTs.jl b/src/NonuniformFFTs.jl index 2d430bb..6e3b7e8 100644 --- a/src/NonuniformFFTs.jl +++ b/src/NonuniformFFTs.jl @@ -146,11 +146,7 @@ function exec_type1!(ûs_k::NTuple{C, AbstractArray{<:Complex}}, p::PlanNUFFT, end @timeit timer "(1) Spreading" begin - if with_blocking(blocks) - spread_from_points_blocked!(backend, kernels, blocks, us, points, vp) # CPU-only - else - spread_from_points!(backend, kernels, us, points, vp) # single-threaded case? Also GPU case. - end + spread_from_points!(backend, blocks, kernels, us, points, vp) KA.synchronize(backend) end @@ -249,11 +245,7 @@ function exec_type2!(vp::NTuple{C, AbstractVector}, p::PlanNUFFT, ûs_k::NTuple end @timeit timer "(3) Interpolation" begin - if with_blocking(blocks) - interpolate_blocked!(kernels, blocks, vp, us, points) # CPU-onlu - else - interpolate!(backend, kernels, vp, us, points) # single-threaded case? Also GPU case. 
- end + interpolate!(backend, blocks, kernels, vp, us, points) KA.synchronize(backend) end end diff --git a/src/blocking.jl b/src/blocking.jl index 961d6ad..ef65a0a 100644 --- a/src/blocking.jl +++ b/src/blocking.jl @@ -15,16 +15,14 @@ end # Here the element type of `xp` can be either an NTuple{N, <:Real}, an SVector{N, <:Real}, # or anything else which has length `N`. -function set_points!(::NullBlockData, points::StructVector, xp, timer; transform::F = identity) where {F <: Function} - # TODO: avoid implicit copy in resize!? (can be costly, especially on GPU) - length(points) == length(xp) || resize!(points, length(xp)) +function set_points!(backend, ::NullBlockData, points::StructVector, xp, timer; transform::F = identity) where {F <: Function} + length(points) == length(xp) || resize_no_copy!(points, length(xp)) Base.require_one_based_indexing(points) @timeit timer "(1) Copy + fold" begin # NOTE: we explicitly iterate through StructVector components because CUDA.jl # currently fails when implicitly writing to a StructArray (compilation fails, # tested on Julia 1.11-rc3 and CUDA.jl v5.4.3). points_comp = StructArrays.components(points) - backend = KA.get_backend(points_comp[1]) kernel! = copy_points_unblocked_kernel!(backend) kernel!(transform, points_comp, xp; ndrange = size(xp)) KA.synchronize(backend) # mostly to get accurate timings @@ -51,6 +49,205 @@ type_length(::Type{<:NTuple{N}}) where {N} = N # ================================================================================ # +# Maximum number of bits allowed for Hilbert sort. +# Hilbert sorting divides the domain in a grid of size N^d, where `d` is the dimension and +# N = 2^b. Here `b` is the number of bits used in the algorithm. +# This means that `b = 16` corresponds to a grid of dimensions 65536 *in each direction*, +# which is far larger than anything we will ever need in practice. +# (Also note that the Hilbert "grid" is generally coarser than the actual NUFFT grid where +# values are spread and interpolated.) +const MAX_BITS_HILBERT_SORT = 16 + +# GPU implementation +struct BlockDataGPU{ + PermVector <: AbstractVector{<:Integer}, + SortPoints <: StaticBool, + } <: AbstractBlockData + pointperm :: PermVector + nbits_hilbert :: Int # number of bits required in Hilbert sorting + sort_points :: SortPoints + + BlockDataGPU(pointperm::P, nbits, sort::S) where {P, S} = + new{P, S}(pointperm, nbits, sort) +end + +with_blocking(::BlockDataGPU) = true + +# In the case of Hilbert sorting, what we call "block" corresponds to the size of the +# minimal box in the Hilbert curve algorithm. +function BlockDataGPU(backend::KA.Backend, block_dims::Dims{D}, Ñs::Dims{D}, sort_points) where {D} + # We use a Hilbert sorting algorithm which requires the same number of blocks in each + # direction. In the best case, blocks will have the size required by `block_dims`. If + # that's not possible, we will prefer to use a smaller block size (so more blocks). + # This makes sense if the input block size was tuned as proportional to some memory + # limit (e.g. shared memory size on GPUs). + nblocks_per_dir_wanted = map(Ñs, block_dims) do N, B + cld(N, B) # ≥ 1 + end + nblocks = max(nblocks_per_dir_wanted...) 
# this makes the actual block size ≤ the wanted block size + nbits = exponent(nblocks - 1) + 1 + @assert UInt(2)^(nbits - 1) < nblocks ≤ UInt(2)^nbits # verify that we got the right value of nbits + nbits = min(nbits, MAX_BITS_HILBERT_SORT) # just in case; in practice nbits < MAX_BITS_HILBERT_SORT all the time + pointperm = KA.allocate(backend, Int, 0) # stores permutation indices + BlockDataGPU(pointperm, nbits, sort_points) +end + +function set_points!( + backend::GPU, bd::BlockDataGPU, points::StructVector, xp, timer; + transform::F = identity, + ) where {F <: Function} + + # Total number of bits required to hold a Hilbert index. + # The index type T must be larger than this size. + # nbits_index = N * nbits_hilbert + + # Recursively iterate over possible values of nbits_hilbert to avoid type instability. + # We start by nbits = 1, and increase until nbits == bd.nbits_hilbert. + @assert bd.nbits_hilbert ≤ MAX_BITS_HILBERT_SORT + _set_points_hilbert!(Val(1), backend, bd, points, xp, timer, transform) + + # if nbits_index ≤ 8 * sizeof(UInt8) + # _set_points!(UInt8, backend, bd, points, xp, timer, transform) + # elseif nbits_index ≤ 8 * sizeof(UInt16) + # _set_points!(UInt16, backend, bd, points, xp, timer, transform) + # elseif nbits_index ≤ 8 * sizeof(UInt32) + # _set_points!(UInt32, backend, bd, points, xp, timer, transform) + # else + # _set_points!(UInt64, backend, bd, points, xp, timer, transform) + # end + + nothing +end + +@inline function _set_points_hilbert!(::Val{nbits}, backend, bd, points, xp, args...) where {nbits} + if nbits > 16 # this is to avoid the compiler from exploding if it thinks the recursion is infinite + nothing + elseif nbits == bd.nbits_hilbert + N = type_length(eltype(xp)) # = number of dimensions + alg = GlobalGrayStatic(nbits, N) # should be inferred, since nbits is statically known + _set_points_hilbert!(alg, backend, bd, points, xp, args...) # call method defined further below + else + _set_points_hilbert!(Val(nbits + 1), backend, bd, points, xp, args...) # keep iterating + end +end + +function _set_points_hilbert!( + sortalg::HilbertSortingAlgorithm, backend, bd::BlockDataGPU, + points::StructVector, xp, timer, transform::F, + ) where {F} + (; pointperm, sort_points,) = bd + + B = nbits(sortalg) # static value + nblocks = 1 << B # same number of blocks in all directions (= 2^B) + + Np = length(xp) + resize_no_copy!(pointperm, Np) + resize_no_copy!(points, Np) + @assert eachindex(points) == eachindex(xp) + + # Allocate temporary array for holding Hilbert indices (manually deallocated later) + T = eltype(sortalg) + inds = KA.zeros(backend, T, Np) + + # We avoid passing a StructVector to the kernel, so we pass `points` as a tuple of + # vectors. The kernel might fail if `xp` is also a StructVector, which is not imposed + # nor disallowed. + points_comp = StructArrays.components(points) + + ndrange = size(points) + groupsize = default_workgroupsize(backend, ndrange) + kernel! = hilbert_sort_kernel!(backend, groupsize, ndrange) + @timeit timer "(1) Hilbert encoding" begin + kernel!(inds, points_comp, xp, sortalg, nblocks, sort_points, transform) + KA.synchronize(backend) + end + + # Compute permutation needed to sort Hilbert indices. + @timeit timer "(2) Sortperm" begin + sortperm!(pointperm, inds) + KA.synchronize(backend) + end + KA.unsafe_free!(inds) + + # `pointperm` now contains the permutation needed to sort points + if sort_points === True() + @timeit timer "(3) Permute points" let + local kernel! 
= permute_kernel!(backend, groupsize, ndrange) + kernel!(points_comp, xp, pointperm, transform) + KA.synchronize(backend) + resize!(pointperm, 0) # free some memory (we won't need it anymore) + end + end + + nothing +end + +# This kernel may do multiple things at once: +# - Compute Hilbert index associated to a point +# - Copy point from `xp` to `points`, after transformations and folding, if sort_points === False(). +@kernel function hilbert_sort_kernel!( + inds::AbstractVector{<:Unsigned}, + points::NTuple, + @Const(xp), + @Const(sortalg::HilbertSortingAlgorithm), + @Const(nblocks::Integer), # TODO: can we pass this as a compile-time constant? (Val) + @Const(sort_points), + @Const(transform::F), + ) where {F} + I = @index(Global, Linear) + @inbounds x⃗ = xp[I] + y⃗ = to_unit_cell(transform(Tuple(x⃗))) :: NTuple + T = eltype(y⃗) + @assert T <: AbstractFloat + + L = T(2) * π + block_size = L / nblocks + + is = map(y⃗) do y + i = Kernels.point_to_cell(y, block_size) + i - one(i) # Hilbert sorting requires zero-based indices + end + + @inbounds inds[I] = encode_hilbert_zero(sortalg, is) # compute Hilbert index for sorting + + # If points need to be sorted, then we fill `points` some time later (in permute_kernel!). + if sort_points === False() + for n ∈ eachindex(x⃗) + @inbounds points[n][I] = y⃗[n] + end + end + + nothing +end + +@kernel function permute_kernel!( + points::NTuple, + @Const(xp::AbstractVector), + @Const(perm::AbstractVector{<:Integer}), + @Const(transform::F), + ) where {F} + i = @index(Global, Linear) + j = @inbounds perm[i] + x⃗ = @inbounds xp[j] + y⃗ = to_unit_cell(transform(Tuple(x⃗))) :: NTuple # note: we perform the transform + folding twice per point... + for n ∈ eachindex(x⃗) + @inbounds points[n][i] = y⃗[n] + end + nothing +end + +# Resize vector trying to avoid copy when N is larger than the original length. +# In other words, we completely discard the original contents of x, which is not the +# original behaviour of resize!. This can save us some device-to-device copies. +function resize_no_copy!(x, N) + resize!(x, 0) + resize!(x, N) + x +end + +# ================================================================================ # + +# CPU only struct BlockData{ T, N, Nc, Tr, # = real(T) @@ -82,7 +279,7 @@ function BlockData( @assert N - M > 0 min(B, N ÷ 2, N - M) end - dims = block_dims .+ 2M # include padding for values outside of block + dims = block_dims .+ 2M # include padding for values outside of block (TODO: include padding in original block_dims?) Tr = real(T) block_sizes = map(Ñs, block_dims) do N, B @inline @@ -108,11 +305,9 @@ function BlockData( ) end -# Blocking is considered to be disabled if there are no allocated buffers. -with_blocking(bd::BlockData) = !isempty(bd.buffers) - -function set_points!(bd::BlockData, points::StructVector, xp, timer; transform::F = identity) where {F <: Function} - with_blocking(bd) || return set_points!(NullBlockData(), points, xp, timer) +function set_points!(::CPU, bd::BlockData, points::StructVector, xp, timer; transform::F = identity) where {F <: Function} + # This technically never happens, but we might use it as a way to disable blocking. 
+ isempty(bd.buffers) && return set_points!(NullBlockData(), points, xp, timer) (; indices, cumulative_npoints_per_block, blockidx, pointperm, block_sizes,) = bd N = type_length(eltype(xp)) # = number of dimensions @@ -120,9 +315,9 @@ function set_points!(bd::BlockData, points::StructVector, xp, timer; transform:: fill!(cumulative_npoints_per_block, 0) to_linear_index = LinearIndices(axes(indices)) # maps Cartesian to linear index of a block Np = length(xp) - resize!(blockidx, Np) - resize!(pointperm, Np) - resize!(points, Np) + resize_no_copy!(blockidx, Np) + resize_no_copy!(pointperm, Np) + resize_no_copy!(points, Np) @timeit timer "(1) Assign blocks" @inbounds for (i, x⃗) ∈ pairs(xp) # Get index of block where point x⃗ is located. diff --git a/src/interpolation/cpu_blocked.jl b/src/interpolation/cpu_blocked.jl index e8eed94..d225c25 100644 --- a/src/interpolation/cpu_blocked.jl +++ b/src/interpolation/cpu_blocked.jl @@ -89,9 +89,10 @@ function interpolate_from_arrays_blocked( end end -function interpolate_blocked!( - gs, +function interpolate!( + backend::CPU, bd::BlockData, + gs, vp_all::NTuple{C, AbstractVector}, us::NTuple{C, AbstractArray}, x⃗s::AbstractArray, diff --git a/src/interpolation/cpu_nonblocked.jl b/src/interpolation/cpu_nonblocked.jl index 1eaf54c..4ae52ca 100644 --- a/src/interpolation/cpu_nonblocked.jl +++ b/src/interpolation/cpu_nonblocked.jl @@ -1,5 +1,6 @@ function interpolate!( backend::CPU, + ::NullBlockData, gs, vp_all::NTuple{C, AbstractVector}, us::NTuple{C, AbstractArray}, diff --git a/src/interpolation/gpu.jl b/src/interpolation/gpu.jl index e9c5893..6ec829d 100644 --- a/src/interpolation/gpu.jl +++ b/src/interpolation/gpu.jl @@ -41,6 +41,7 @@ end function interpolate!( backend::GPU, + bd::Union{BlockDataGPU, NullBlockData}, gs, vp_all::NTuple{C, AbstractVector}, us::NTuple{C, AbstractArray}, diff --git a/src/plan.jl b/src/plan.jl index fdbb137..238c3c6 100644 --- a/src/plan.jl +++ b/src/plan.jl @@ -88,12 +88,12 @@ The created plan contains all data needed to perform NUFFTs for non-uniform data ## Performance parameters -- `block_size = 4096`: the linear block size (in number of elements) when using block partitioning. +- `block_size = 4096`: the linear block size (in number of elements) when using block partitioning + or when sorting is enabled. This can be tuned for maximal performance. Using block partitioning is required for running with multiple threads. Blocking can be completely disabled by passing `block_size = nothing` (but this is generally slower, even when running on a single thread). - This parameter is ignored in GPU implementations. - `sort_points = False()`: whether to internally permute the order of the non-uniform points. This can be enabled by passing `sort_points = True()`. @@ -247,7 +247,10 @@ Return the number of datasets which are simultaneously transformed by a plan. """ ntransforms(::PlanNUFFT{T, N, Nc}) where {T, N, Nc} = Nc -default_block_size() = 4096 # in number of linear elements +default_block_size(::CPU) = 4096 # in number of linear elements + +# TODO: adapt this based on size of shared memory and on element type T (and padding 2M)? 
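+# Illustration of how the GPU block size translates into a Hilbert grid resolution in
+# BlockDataGPU (hypothetical numbers; assumes get_block_dims returns roughly cubic blocks):
+#
+#     Ñs = (256, 256, 256)                      # oversampled grid
+#     block_dims = (16, 16, 16)                 # ≈ block_size = 4096 linear elements
+#     nblocks = maximum(cld.(Ñs, block_dims))   # = 16 blocks in each direction
+#     nbits = exponent(nblocks - 1) + 1         # = 4 → Hilbert grid of 2⁴ = 16 cells per direction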
+default_block_size(::GPU) = 4096 function get_block_dims(Ñs::Dims, bsize::Int) d = length(Ñs) @@ -270,9 +273,9 @@ function _PlanNUFFT( timer = TimerOutput(), fftw_flags = FFTW.MEASURE, fftshift = false, - block_size::Union{Integer, Nothing} = default_block_size(), sort_points::StaticBool = False(), backend::KA.Backend = CPU(), + block_size::Union{Integer, Nothing} = default_block_size(backend), ) where {T <: Number, D} ks = init_wavenumbers(T, Ns) # Determine dimensions of oversampled grid. @@ -301,13 +304,17 @@ function _PlanNUFFT( foreach(init_fourier_coefficients!, kernel_data, ks) end points = StructVector(ntuple(_ -> KA.allocate(backend, Tr, 0), Val(D))) # empty vector of points - if block_size === nothing || backend isa GPU + if block_size === nothing blocks = NullBlockData() # disable blocking (→ can't use multithreading when spreading) backend isa CPU && FFTW.set_num_threads(1) # also disable FFTW threading (avoids allocations) else block_dims = get_block_dims(Ñs, block_size) - blocks = BlockData(T, block_dims, Ñs, h, num_transforms, sort_points) - backend isa CPU && FFTW.set_num_threads(Threads.nthreads()) + if backend isa GPU + blocks = BlockDataGPU(backend, block_dims, Ñs, sort_points) + else + blocks = BlockData(T, block_dims, Ñs, h, num_transforms, sort_points) + FFTW.set_num_threads(Threads.nthreads()) + end end plan_kwargs = backend isa CPU ? (flags = fftw_flags,) : (;) nufft_data = init_plan_data(T, backend, Ñs, ks, num_transforms; plan_kwargs) diff --git a/src/set_points.jl b/src/set_points.jl index 2e3fc65..d34d64a 100644 --- a/src/set_points.jl +++ b/src/set_points.jl @@ -32,7 +32,7 @@ function set_points!(p::PlanNUFFT, xp::AbstractVector; kwargs...) (; points, timer,) = p N = ndims(p) type_length(eltype(xp)) == N || throw(DimensionMismatch(lazy"expected $N-dimensional points")) - @timeit timer "Set points" set_points!(p.blocks, points, xp, timer; kwargs...) + @timeit timer "Set points" set_points!(p.backend, p.blocks, points, xp, timer; kwargs...) p end diff --git a/src/sorting_hilbert.jl b/src/sorting_hilbert.jl index e519c80..4902204 100644 --- a/src/sorting_hilbert.jl +++ b/src/sorting_hilbert.jl @@ -24,8 +24,13 @@ # See also https://computingkitchen.com/BijectiveHilbert.jl/stable/globalgray/#Global-Gray # for some details on the algorithm. +abstract type HilbertSortingAlgorithm{T, B} end + +Base.eltype(::HilbertSortingAlgorithm{T}) where {T} = T +nbits(::HilbertSortingAlgorithm{T, B}) where {T, B} = B + """ - GlobalGrayStatic(T <: Unsigned, B::Int) + GlobalGrayStatic(T <: Unsigned, B::Int) <: HilbertSortingAlgorithm{T, B} GlobalGrayStatic(B::Int, N::Int) Hilbert sorting algorithm adapted from the `GlobalGray` algorithm in BijectiveHilbert.jl. @@ -43,9 +48,7 @@ The total number of bits required is `nbits = B * N`. For instance, if `N = 3` a smallest possible type `T` is `UInt16`, whose size is `8 * sizeof(UInt16) = 16` bits (as its name suggests!). """ -struct GlobalGrayStatic{T, B} end - -Base.eltype(::GlobalGrayStatic{T}) where {T} = T +struct GlobalGrayStatic{T, B} <: HilbertSortingAlgorithm{T, B} end GlobalGrayStatic(::Type{T}, B::Int) where {T} = GlobalGrayStatic{T, B}() @@ -79,6 +82,8 @@ end encode_hilbert_zero(algorithm::GlobalGrayStatic{T}, X::NTuple{N, <:Integer}) -> T Return Hilbert index associated to location `X` in `N`-dimensional space. + +Values in `X` usually correspond to indices on a grid. This function takes indices which start at 0 (!!). 
""" function encode_hilbert_zero(::GlobalGrayStatic{T, B}, X::NTuple) where {T, B} X = axes_to_transpose(X, Val(B)) diff --git a/src/spreading/cpu_blocked.jl b/src/spreading/cpu_blocked.jl index 695c5d2..8f90c7e 100644 --- a/src/spreading/cpu_blocked.jl +++ b/src/spreading/cpu_blocked.jl @@ -80,10 +80,10 @@ function fill_with_zeros_serial!(us_all::NTuple{C, A}) where {C, A <: DenseArray us_all end -function spread_from_points_blocked!( +function spread_from_points!( ::CPU, - gs, bd::BlockData, + gs, us_all::NTuple{C, AbstractArray}, xp::AbstractVector, vp_all::NTuple{C, AbstractVector}, diff --git a/src/spreading/cpu_nonblocked.jl b/src/spreading/cpu_nonblocked.jl index 2cacc67..bc81a07 100644 --- a/src/spreading/cpu_nonblocked.jl +++ b/src/spreading/cpu_nonblocked.jl @@ -39,6 +39,7 @@ end function spread_from_points!( ::CPU, + ::NullBlockData, # no blocking gs, us_all::NTuple{C, AbstractArray}, x⃗s::AbstractVector, diff --git a/src/spreading/gpu.jl b/src/spreading/gpu.jl index cdc6ee7..bf97903 100644 --- a/src/spreading/gpu.jl +++ b/src/spreading/gpu.jl @@ -109,6 +109,7 @@ end # We assume all arrays are already on the GPU. function spread_from_points!( backend::GPU, + bd::Union{BlockDataGPU, NullBlockData}, gs, us_all::NTuple{C, AbstractGPUArray}, x⃗s::StructVector, From e9d06b48069a513af0200b9767685471b490bfc8 Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Fri, 20 Sep 2024 10:19:25 +0200 Subject: [PATCH 03/10] Use permutations in spreading --- src/spreading/gpu.jl | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/src/spreading/gpu.jl b/src/spreading/gpu.jl index bf97903..1c5b2f5 100644 --- a/src/spreading/gpu.jl +++ b/src/spreading/gpu.jl @@ -3,12 +3,27 @@ us::NTuple{C}, @Const(points::NTuple{D}), @Const(vp::NTuple{C}), + @Const(pointperm), + @Const(sort_points::StaticBool), evaluate::NTuple{D, <:Function}, # can't be marked Const for some reason to_indices::NTuple{D, <:Function}, ) where {C, D} i = @index(Global, Linear) - x⃗ = map(xs -> @inbounds(xs[i]), points) - v⃗ = map(v -> @inbounds(v[i]), vp) + + j = if pointperm === nothing + i + else + @inbounds pointperm[i] + end + + i_x = if sort_points === True() + i # points have already been sorted, so we access them contiguously (should be faster, once the permutation has been done) + else + j # don't access points contiguously in memory, but spread onto a localised region in space (and GPU memory?) + end + + x⃗ = map(xs -> @inbounds(xs[i_x]), points) + v⃗ = map(v -> @inbounds(v[j]), vp) # Determine grid dimensions. Z = eltype(v⃗) @@ -135,11 +150,19 @@ function spread_from_points!( map(u -> reinterpret(T, u), us_all) # note: we don't use reshape, so the first dimension has 2x elements end - # TODO: use dynamically sized kernel? (to avoid recompilation, since number of points may change from one call to another) + pointperm = get_pointperm(bd) # nothing in case of NullBlockData + sort_points = get_sort_points(bd)::StaticBool # False in the case of NullBlockData + + if pointperm !== nothing + @assert eachindex(pointperm) == eachindex(x⃗s) + end + + # TODO: use dynamically sized kernel? (to avoid recompilation, since number of points + # may change from one call to another) ndrange = size(x⃗s) # iterate through points workgroupsize = default_workgroupsize(backend, ndrange) kernel! 
= spread_from_point_naive_kernel!(backend, workgroupsize, ndrange) - kernel!(us_real, xs_comp, vp_all, evaluate, to_indices) + kernel!(us_real, xs_comp, vp_all, pointperm, sort_points, evaluate, to_indices) us_all end From e063e6edbec8346b0bbcb1e59f8a1f9ed3b1fa9d Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Fri, 20 Sep 2024 10:25:27 +0200 Subject: [PATCH 04/10] Update interpolation code --- src/blocking.jl | 25 +++++++++---------------- src/interpolation/gpu.jl | 28 +++++++++++++++++++++++++--- 2 files changed, 34 insertions(+), 19 deletions(-) diff --git a/src/blocking.jl b/src/blocking.jl index ef65a0a..6f90e86 100644 --- a/src/blocking.jl +++ b/src/blocking.jl @@ -4,6 +4,10 @@ abstract type AbstractBlockData end struct NullBlockData <: AbstractBlockData end with_blocking(::NullBlockData) = false +# For now these are only used in the GPU implementation +get_pointperm(::NullBlockData) = nothing +get_sort_points(::NullBlockData) = False() + @kernel function copy_points_unblocked_kernel!(@Const(transform::F), points::NTuple, @Const(xp)) where {F} I = @index(Global, Linear) @inbounds x⃗ = xp[I] @@ -92,35 +96,24 @@ function BlockDataGPU(backend::KA.Backend, block_dims::Dims{D}, Ñs::Dims{D}, so BlockDataGPU(pointperm, nbits, sort_points) end +get_pointperm(bd::BlockDataGPU) = bd.pointperm +get_sort_points(bd::BlockDataGPU) = bd.sort_points + function set_points!( backend::GPU, bd::BlockDataGPU, points::StructVector, xp, timer; transform::F = identity, ) where {F <: Function} - # Total number of bits required to hold a Hilbert index. - # The index type T must be larger than this size. - # nbits_index = N * nbits_hilbert - # Recursively iterate over possible values of nbits_hilbert to avoid type instability. # We start by nbits = 1, and increase until nbits == bd.nbits_hilbert. @assert bd.nbits_hilbert ≤ MAX_BITS_HILBERT_SORT _set_points_hilbert!(Val(1), backend, bd, points, xp, timer, transform) - # if nbits_index ≤ 8 * sizeof(UInt8) - # _set_points!(UInt8, backend, bd, points, xp, timer, transform) - # elseif nbits_index ≤ 8 * sizeof(UInt16) - # _set_points!(UInt16, backend, bd, points, xp, timer, transform) - # elseif nbits_index ≤ 8 * sizeof(UInt32) - # _set_points!(UInt32, backend, bd, points, xp, timer, transform) - # else - # _set_points!(UInt64, backend, bd, points, xp, timer, transform) - # end - nothing end @inline function _set_points_hilbert!(::Val{nbits}, backend, bd, points, xp, args...) where {nbits} - if nbits > 16 # this is to avoid the compiler from exploding if it thinks the recursion is infinite + if nbits > MAX_BITS_HILBERT_SORT # this is to avoid the compiler from exploding if it thinks the recursion is infinite nothing elseif nbits == bd.nbits_hilbert N = type_length(eltype(xp)) # = number of dimensions @@ -167,6 +160,7 @@ function _set_points_hilbert!( sortperm!(pointperm, inds) KA.synchronize(backend) end + KA.unsafe_free!(inds) # `pointperm` now contains the permutation needed to sort points @@ -175,7 +169,6 @@ function _set_points_hilbert!( local kernel! 
= permute_kernel!(backend, groupsize, ndrange) kernel!(points_comp, xp, pointperm, transform) KA.synchronize(backend) - resize!(pointperm, 0) # free some memory (we won't need it anymore) end end diff --git a/src/interpolation/gpu.jl b/src/interpolation/gpu.jl index 6ec829d..1955ea6 100644 --- a/src/interpolation/gpu.jl +++ b/src/interpolation/gpu.jl @@ -5,12 +5,27 @@ using StaticArrays: MVector vp::NTuple{C}, @Const(points::NTuple{D}), @Const(us::NTuple{C}), + @Const(pointperm), + @Const(sort_points::StaticBool), @Const(Δxs::NTuple{D}), # grid step in each direction (oversampled grid) evaluate::NTuple{D, <:Function}, # can't be marked Const for some reason to_indices::NTuple{D, <:Function}, ) where {C, D} i = @index(Global, Linear) - x⃗ = map(xs -> @inbounds(xs[i]), points) + + j = if pointperm === nothing + i + else + @inbounds pointperm[i] + end + + i_x = if sort_points === True() + i # points have already been sorted, so we access them contiguously (should be faster, once the permutation has been done) + else + j # don't access points contiguously in memory, but interpolate in a localised region in space (and GPU memory?) + end + + x⃗ = map(xs -> @inbounds(xs[i_x]), points) # Determine grid dimensions. # Unlike in spreading, here `us` can be made of arrays of complex numbers, because we @@ -33,7 +48,7 @@ using StaticArrays: MVector v⃗ = interpolate_from_arrays_gpu(us, inds, vals) for (dst, v) ∈ zip(vp, v⃗) - @inbounds dst[i] = v + @inbounds dst[j] = v end nothing @@ -56,11 +71,18 @@ function interpolate!( xs_comp = StructArrays.components(x⃗s) Δxs = map(Kernels.gridstep, gs) + pointperm = get_pointperm(bd) # nothing in case of NullBlockData + sort_points = get_sort_points(bd)::StaticBool # False in the case of NullBlockData + + if pointperm !== nothing + @assert eachindex(pointperm) == eachindex(x⃗s) + end + # TODO: use dynamically sized kernel? (to avoid recompilation, since number of points may change from one call to another) ndrange = size(x⃗s) # iterate through points workgroupsize = default_workgroupsize(backend, ndrange) kernel! 
= interpolate_to_point_naive_kernel!(backend, workgroupsize, ndrange) - kernel!(vp_all, xs_comp, us, Δxs, evaluate, to_indices) + kernel!(vp_all, xs_comp, us, pointperm, sort_points, Δxs, evaluate, to_indices) vp_all end From 034574276461a5e7c1622db7bf3302ceeb66d374 Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Fri, 20 Sep 2024 10:25:43 +0200 Subject: [PATCH 05/10] Test more parameter combinations --- test/pseudo_gpu.jl | 30 ++++++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/test/pseudo_gpu.jl b/test/pseudo_gpu.jl index 0175f43..3d3f791 100644 --- a/test/pseudo_gpu.jl +++ b/test/pseudo_gpu.jl @@ -53,6 +53,23 @@ function run_plan(p::PlanNUFFT, xp_init::AbstractArray, vp_init::AbstractVector) set_points!(p, xp) + save_points_sorted = false # this can be useful for verifying Hilbert sorting graphically + + if backend isa PseudoGPU && save_points_sorted + inds = NonuniformFFTs.get_pointperm(p.blocks) + if inds !== nothing + open("points_sorted.dat", "w") do io + for i ∈ inds + x⃗ = xp[i] + for x ∈ x⃗ + print(io, "\t", x) + end + print(io, "\n") + end + end + end + end + T = eltype(p) # this is actually defined in AbstractNFFTs; it represents the type in Fourier space (always complex) @test T <: Complex dims = size(p) @@ -65,7 +82,7 @@ function run_plan(p::PlanNUFFT, xp_init::AbstractArray, vp_init::AbstractVector) (; backend, us, wp,) end -function compare_with_cpu(::Type{T}, dims; Np = prod(dims)) where {T <: Number} +function compare_with_cpu(::Type{T}, dims; Np = prod(dims), kws...) where {T <: Number} # Generate some non-uniform random data on the CPU rng = Xoshiro(42) N = length(dims) # number of dimensions @@ -73,7 +90,7 @@ function compare_with_cpu(::Type{T}, dims; Np = prod(dims)) where {T <: Number} xp_init = [rand(rng, SVector{N, Tr}) * Tr(2π) for _ ∈ 1:Np] # non-uniform points in [0, 2π]ᵈ vp_init = randn(rng, T, Np) # random values at points - params = (; m = HalfSupport(4), kernel = KaiserBesselKernel(), σ = 1.5,) + params = (; m = HalfSupport(4), kernel = KaiserBesselKernel(), σ = 1.5, kws...) p_cpu = PlanNUFFT(T, dims; params..., backend = CPU()) p_gpu = PlanNUFFT(T, dims; params..., backend = PseudoGPU()) @@ -87,9 +104,14 @@ function compare_with_cpu(::Type{T}, dims; Np = prod(dims)) where {T <: Number} end @testset "GPU implementation (using CPU)" begin - types = (Float32, ComplexF32) dims = (35, 64, 40) - @testset "T = $T" for T ∈ types + @testset "T = $T" for T ∈ (Float32, ComplexF32) compare_with_cpu(T, dims) end + @testset "sort_points = $sort_points" for sort_points ∈ (False(), True()) + compare_with_cpu(Float64, dims; sort_points) + end + @testset "No blocking" begin # Hilbert sorting disabled + compare_with_cpu(ComplexF64, dims; block_size = nothing) + end end From 42026596b5a74d727a2de2af66e2564425cd7666 Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Fri, 20 Sep 2024 10:30:03 +0200 Subject: [PATCH 06/10] Update docs --- src/plan.jl | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/plan.jl b/src/plan.jl index 238c3c6..a275640 100644 --- a/src/plan.jl +++ b/src/plan.jl @@ -92,11 +92,14 @@ The created plan contains all data needed to perform NUFFTs for non-uniform data or when sorting is enabled. This can be tuned for maximal performance. Using block partitioning is required for running with multiple threads. - Blocking can be completely disabled by passing `block_size = nothing` (but this is - generally slower, even when running on a single thread). 
+ On the GPU, this will perform spatial sorting using a Hilbert curve algorithm, whose + minimal scale is proportional to the value of `block_size`. + Blocking / spatial sorting can be completely disabled by passing `block_size = nothing` (but this is + generally slower). - `sort_points = False()`: whether to internally permute the order of the non-uniform points. This can be enabled by passing `sort_points = True()`. + Ignored when `block_size = nothing` (which disables spatial sorting). In this case, more time will be spent in [`set_points!`](@ref) and less time on the actual transforms. This can improve performance if executing multiple transforms on the same non-uniform points. Note that, even when enabled, this does not modify the `points` argument passed to `set_points!`. From ab79b29dd80a37539bf3b2db23839688329d5c6f Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Fri, 20 Sep 2024 11:29:54 +0200 Subject: [PATCH 07/10] GPU: also sort non-uniform values... ...if sort_points = True(). This slightly improves spreading performance (not much). --- src/blocking.jl | 1 + src/interpolation/gpu.jl | 36 ++++++++++++++++++++++++++---------- src/plan.jl | 8 +++++--- src/spreading/gpu.jl | 35 ++++++++++++++++++++++++++--------- 4 files changed, 58 insertions(+), 22 deletions(-) diff --git a/src/blocking.jl b/src/blocking.jl index 6f90e86..9618b96 100644 --- a/src/blocking.jl +++ b/src/blocking.jl @@ -21,6 +21,7 @@ end # or anything else which has length `N`. function set_points!(backend, ::NullBlockData, points::StructVector, xp, timer; transform::F = identity) where {F <: Function} length(points) == length(xp) || resize_no_copy!(points, length(xp)) + KA.synchronize(backend) Base.require_one_based_indexing(points) @timeit timer "(1) Copy + fold" begin # NOTE: we explicitly iterate through StructVector components because CUDA.jl diff --git a/src/interpolation/gpu.jl b/src/interpolation/gpu.jl index 1955ea6..791e265 100644 --- a/src/interpolation/gpu.jl +++ b/src/interpolation/gpu.jl @@ -6,7 +6,6 @@ using StaticArrays: MVector @Const(points::NTuple{D}), @Const(us::NTuple{C}), @Const(pointperm), - @Const(sort_points::StaticBool), @Const(Δxs::NTuple{D}), # grid step in each direction (oversampled grid) evaluate::NTuple{D, <:Function}, # can't be marked Const for some reason to_indices::NTuple{D, <:Function}, @@ -19,13 +18,7 @@ using StaticArrays: MVector @inbounds pointperm[i] end - i_x = if sort_points === True() - i # points have already been sorted, so we access them contiguously (should be faster, once the permutation has been done) - else - j # don't access points contiguously in memory, but interpolate in a localised region in space (and GPU memory?) - end - - x⃗ = map(xs -> @inbounds(xs[i_x]), points) + x⃗ = map(xs -> @inbounds(xs[j]), points) # Determine grid dimensions. 
# Unlike in spreading, here `us` can be made of arrays of complex numbers, because we @@ -44,7 +37,6 @@ using StaticArrays: MVector geval.values .* Δx end - # v⃗ = @inline interpolate_from_arrays(us, inds, vals) v⃗ = interpolate_from_arrays_gpu(us, inds, vals) for (dst, v) ∈ zip(vp, v⃗) @@ -78,11 +70,25 @@ function interpolate!( @assert eachindex(pointperm) == eachindex(x⃗s) end + if sort_points === True() + vp_sorted = map(similar, vp_all) # allocate temporary arrays for sorted non-uniform data + pointperm_ = nothing # we don't need permutations in interpolation kernel (all accesses to non-uniform data will be contiguous) + else + vp_sorted = vp_all + pointperm_ = pointperm + end + # TODO: use dynamically sized kernel? (to avoid recompilation, since number of points may change from one call to another) ndrange = size(x⃗s) # iterate through points workgroupsize = default_workgroupsize(backend, ndrange) kernel! = interpolate_to_point_naive_kernel!(backend, workgroupsize, ndrange) - kernel!(vp_all, xs_comp, us, pointperm, sort_points, Δxs, evaluate, to_indices) + kernel!(vp_sorted, xs_comp, us, pointperm_, Δxs, evaluate, to_indices) + + if sort_points === True() + kernel_perm! = interp_permute_kernel!(backend, workgroupsize, ndrange) + kernel_perm!(vp_all, vp_sorted, pointperm) + foreach(KA.unsafe_free!, vp_sorted) # manually deallocate temporary arrays + end vp_all end @@ -139,3 +145,13 @@ end Tuple(vs) end end + +# This applies the *inverse* permutation by switching i ↔ j indices (opposite of spread_permute_kernel!). +@kernel function interp_permute_kernel!(vp::NTuple{N}, @Const(vp_in::NTuple{N}), @Const(perm::AbstractVector)) where {N} + i = @index(Global, Linear) + j = @inbounds perm[i] + for n ∈ 1:N + @inbounds vp[n][j] = vp_in[n][i] + end + nothing +end diff --git a/src/plan.jl b/src/plan.jl index a275640..7224d3e 100644 --- a/src/plan.jl +++ b/src/plan.jl @@ -88,10 +88,12 @@ The created plan contains all data needed to perform NUFFTs for non-uniform data ## Performance parameters -- `block_size = 4096`: the linear block size (in number of elements) when using block partitioning +- `block_size::Int`: the linear block size (in number of elements) when using block partitioning or when sorting is enabled. This can be tuned for maximal performance. - Using block partitioning is required for running with multiple threads. + The current defaults are 4096 (CPU) and 1024 (GPU), but these may change in the future or + even depend on the actual computing device. + On the CPU, using block partitioning is required for running with multiple threads. On the GPU, this will perform spatial sorting using a Hilbert curve algorithm, whose minimal scale is proportional to the value of `block_size`. Blocking / spatial sorting can be completely disabled by passing `block_size = nothing` (but this is @@ -253,7 +255,7 @@ ntransforms(::PlanNUFFT{T, N, Nc}) where {T, N, Nc} = Nc default_block_size(::CPU) = 4096 # in number of linear elements # TODO: adapt this based on size of shared memory and on element type T (and padding 2M)? 
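# Sketch of the permutation convention used by the new spread_permute_kernel! /
# interp_permute_kernel! pair (hypothetical data; a plain-Julia equivalent of what the
# kernels do for each component):
#
#     perm = [3, 1, 2]                     # sorted point i corresponds to original point perm[i]
#     v = [10.0, 20.0, 30.0]               # non-uniform values in original order
#     v_sorted = v[perm]                   # spread_permute_kernel!: vp[i] = vp_in[perm[i]]
#     w = similar(v); w[perm] = v_sorted   # interp_permute_kernel!: vp[perm[i]] = vp_in[i]
#     w == v                               # true: the two kernels apply inverse permutations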
-default_block_size(::GPU) = 4096 +default_block_size(::GPU) = 1024 # a bit faster than 4096 on A100 (with 256³ oversampled grid) function get_block_dims(Ñs::Dims, bsize::Int) d = length(Ñs) diff --git a/src/spreading/gpu.jl b/src/spreading/gpu.jl index 1c5b2f5..75dca82 100644 --- a/src/spreading/gpu.jl +++ b/src/spreading/gpu.jl @@ -4,7 +4,6 @@ @Const(points::NTuple{D}), @Const(vp::NTuple{C}), @Const(pointperm), - @Const(sort_points::StaticBool), evaluate::NTuple{D, <:Function}, # can't be marked Const for some reason to_indices::NTuple{D, <:Function}, ) where {C, D} @@ -16,13 +15,7 @@ @inbounds pointperm[i] end - i_x = if sort_points === True() - i # points have already been sorted, so we access them contiguously (should be faster, once the permutation has been done) - else - j # don't access points contiguously in memory, but spread onto a localised region in space (and GPU memory?) - end - - x⃗ = map(xs -> @inbounds(xs[i_x]), points) + x⃗ = map(xs -> @inbounds(xs[j]), points) v⃗ = map(v -> @inbounds(v[j]), vp) # Determine grid dimensions. @@ -161,8 +154,32 @@ function spread_from_points!( # may change from one call to another) ndrange = size(x⃗s) # iterate through points workgroupsize = default_workgroupsize(backend, ndrange) + + if sort_points === True() + vp_sorted = map(similar, vp_all) # allocate temporary arrays for sorted non-uniform data + kernel_perm! = spread_permute_kernel!(backend, workgroupsize, ndrange) + kernel_perm!(vp_sorted, vp_all, pointperm) + pointperm_ = nothing # we don't need any further permutations (all accesses to non-uniform data will be contiguous) + else + vp_sorted = vp_all + pointperm_ = pointperm + end + kernel! = spread_from_point_naive_kernel!(backend, workgroupsize, ndrange) - kernel!(us_real, xs_comp, vp_all, pointperm, sort_points, evaluate, to_indices) + kernel!(us_real, xs_comp, vp_sorted, pointperm_, evaluate, to_indices) + + if sort_points === True() + foreach(KA.unsafe_free!, vp_sorted) # manually deallocate temporary arrays + end us_all end + +@kernel function spread_permute_kernel!(vp::NTuple{N}, @Const(vp_in::NTuple{N}), @Const(perm::AbstractVector)) where {N} + i = @index(Global, Linear) + j = @inbounds perm[i] + for n ∈ 1:N + @inbounds vp[n][i] = vp_in[n][j] + end + nothing +end From 4f68830f76ea9886b85e1ba7c36dc1b3e907a4ff Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Fri, 20 Sep 2024 11:56:57 +0200 Subject: [PATCH 08/10] Use dynamically sized kernels Doesn't seem to affect performance. --- src/NonuniformFFTs.jl | 9 ++------- src/blocking.jl | 10 +++++----- src/interpolation/gpu.jl | 11 ++++++----- src/spreading/gpu.jl | 12 ++++++------ 4 files changed, 19 insertions(+), 23 deletions(-) diff --git a/src/NonuniformFFTs.jl b/src/NonuniformFFTs.jl index 6e3b7e8..b29a681 100644 --- a/src/NonuniformFFTs.jl +++ b/src/NonuniformFFTs.jl @@ -55,13 +55,8 @@ function default_workgroupsize(backend, ndrange::Dims) KA.default_cpu_workgroupsize(ndrange) end -# Reducing the number of threads to 64 for 1D GPU kernels seems to improve performance -# (tested on Nvidia A100). 1D kernels are those iterating over non-uniform points, i.e. -# spreading and interpolation, which usually dominate performance. -# TODO: this seems to be only true when data is randomly located and sorting is not -# performed. With sorting, it looks like larger workgroups are faster, so we should revert -# this when sorting on GPU is implemented. 
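# For reference, this commit replaces statically sized KernelAbstractions launches by
# dynamically sized ones; an illustrative sketch (generic kernel name, not from this package):
#
#     kernel! = some_kernel!(backend, workgroupsize, ndrange)   # static: sizes fixed at construction
#     kernel!(args...)
#     kernel! = some_kernel!(backend)                           # dynamic: sizes passed at call time
#     kernel!(args...; workgroupsize, ndrange)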
-default_workgroupsize(::GPU, ndrange::Dims{1}) = (min(64, ndrange[1]),) +# Case of 1D kernels on the GPU (typically, kernels which iterate over non-uniform points). +default_workgroupsize(::GPU, ndrange::Dims{1}) = (min(512, ndrange[1]),) include("sorting.jl") include("sorting_hilbert.jl") diff --git a/src/blocking.jl b/src/blocking.jl index 9618b96..ab54354 100644 --- a/src/blocking.jl +++ b/src/blocking.jl @@ -149,10 +149,10 @@ function _set_points_hilbert!( points_comp = StructArrays.components(points) ndrange = size(points) - groupsize = default_workgroupsize(backend, ndrange) - kernel! = hilbert_sort_kernel!(backend, groupsize, ndrange) + workgroupsize = default_workgroupsize(backend, ndrange) + kernel! = hilbert_sort_kernel!(backend) @timeit timer "(1) Hilbert encoding" begin - kernel!(inds, points_comp, xp, sortalg, nblocks, sort_points, transform) + kernel!(inds, points_comp, xp, sortalg, nblocks, sort_points, transform; workgroupsize, ndrange) KA.synchronize(backend) end @@ -167,8 +167,8 @@ function _set_points_hilbert!( # `pointperm` now contains the permutation needed to sort points if sort_points === True() @timeit timer "(3) Permute points" let - local kernel! = permute_kernel!(backend, groupsize, ndrange) - kernel!(points_comp, xp, pointperm, transform) + local kernel! = permute_kernel!(backend) + kernel!(points_comp, xp, pointperm, transform; ndrange, workgroupsize) KA.synchronize(backend) end end diff --git a/src/interpolation/gpu.jl b/src/interpolation/gpu.jl index 791e265..b140773 100644 --- a/src/interpolation/gpu.jl +++ b/src/interpolation/gpu.jl @@ -78,15 +78,16 @@ function interpolate!( pointperm_ = pointperm end - # TODO: use dynamically sized kernel? (to avoid recompilation, since number of points may change from one call to another) + # We use dynamically sized kernels to avoid recompilation, since number of points may + # change from one call to another. ndrange = size(x⃗s) # iterate through points workgroupsize = default_workgroupsize(backend, ndrange) - kernel! = interpolate_to_point_naive_kernel!(backend, workgroupsize, ndrange) - kernel!(vp_sorted, xs_comp, us, pointperm_, Δxs, evaluate, to_indices) + kernel! = interpolate_to_point_naive_kernel!(backend) + kernel!(vp_sorted, xs_comp, us, pointperm_, Δxs, evaluate, to_indices; workgroupsize, ndrange) if sort_points === True() - kernel_perm! = interp_permute_kernel!(backend, workgroupsize, ndrange) - kernel_perm!(vp_all, vp_sorted, pointperm) + kernel_perm! = interp_permute_kernel!(backend) + kernel_perm!(vp_all, vp_sorted, pointperm; workgroupsize, ndrange) foreach(KA.unsafe_free!, vp_sorted) # manually deallocate temporary arrays end diff --git a/src/spreading/gpu.jl b/src/spreading/gpu.jl index 75dca82..74014ad 100644 --- a/src/spreading/gpu.jl +++ b/src/spreading/gpu.jl @@ -150,23 +150,23 @@ function spread_from_points!( @assert eachindex(pointperm) == eachindex(x⃗s) end - # TODO: use dynamically sized kernel? (to avoid recompilation, since number of points - # may change from one call to another) + # We use dynamically sized kernels to avoid recompilation, since number of points may + # change from one call to another. ndrange = size(x⃗s) # iterate through points workgroupsize = default_workgroupsize(backend, ndrange) if sort_points === True() vp_sorted = map(similar, vp_all) # allocate temporary arrays for sorted non-uniform data - kernel_perm! = spread_permute_kernel!(backend, workgroupsize, ndrange) - kernel_perm!(vp_sorted, vp_all, pointperm) + kernel_perm! 
= spread_permute_kernel!(backend) + kernel_perm!(vp_sorted, vp_all, pointperm; workgroupsize, ndrange) pointperm_ = nothing # we don't need any further permutations (all accesses to non-uniform data will be contiguous) else vp_sorted = vp_all pointperm_ = pointperm end - kernel! = spread_from_point_naive_kernel!(backend, workgroupsize, ndrange) - kernel!(us_real, xs_comp, vp_sorted, pointperm_, evaluate, to_indices) + kernel! = spread_from_point_naive_kernel!(backend) + kernel!(us_real, xs_comp, vp_sorted, pointperm_, evaluate, to_indices; workgroupsize, ndrange) if sort_points === True() foreach(KA.unsafe_free!, vp_sorted) # manually deallocate temporary arrays From 5e9ab75973627966e3a4132362d32cbf77d135bf Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Fri, 20 Sep 2024 12:13:09 +0200 Subject: [PATCH 09/10] Try to fix tests --- src/abstractNFFTs.jl | 2 +- test/accuracy.jl | 4 ++-- test/multidimensional.jl | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/abstractNFFTs.jl b/src/abstractNFFTs.jl index 4a07d1a..3c20cdd 100644 --- a/src/abstractNFFTs.jl +++ b/src/abstractNFFTs.jl @@ -91,7 +91,7 @@ Base.@constprop :aggressive function PlanNUFFT( m, σ, reltol = AbstractNFFTs.accuracyParams(; kwargs...) backend = KA.get_backend(xp) # e.g. use GPU backend if xp is a GPU array sort_points = sortNodes ? True() : False() # this is type-unstable (unless constant propagation happens) - block_size = blocking ? default_block_size() : nothing # also type-unstable + block_size = blocking ? default_block_size(backend) : nothing # also type-unstable kernel = window isa AbstractKernel ? window : convert_window_function(window) p = PlanNUFFT(Complex{Tr}, Ns, HalfSupport(m); backend, σ = Tr(σ), sort_points, fftshift, block_size, kernel, fftw_flags = fftflags) AbstractNFFTs.nodes!(p, xp) diff --git a/test/accuracy.jl b/test/accuracy.jl index 2aec8cc..cc83fcb 100644 --- a/test/accuracy.jl +++ b/test/accuracy.jl @@ -95,7 +95,7 @@ function test_nufft_type1_1d( Np = 2 * N, m = HalfSupport(8), σ = 1.25, - block_size = NonuniformFFTs.default_block_size(), + block_size = NonuniformFFTs.default_block_size(CPU()), ) where {T <: Number} if T <: Real Tr = T @@ -154,7 +154,7 @@ function test_nufft_type2_1d( Np = 2 * N, m = HalfSupport(8), σ = 1.25, - block_size = NonuniformFFTs.default_block_size(), + block_size = NonuniformFFTs.default_block_size(CPU()), ) where {T <: Number} if T <: Real Tr = T diff --git a/test/multidimensional.jl b/test/multidimensional.jl index e4c1b83..4ead8f1 100644 --- a/test/multidimensional.jl +++ b/test/multidimensional.jl @@ -32,7 +32,7 @@ function test_nufft_type1( Np = 2 * first(Ns), m = HalfSupport(8), σ = 1.25, - block_size = NonuniformFFTs.default_block_size(), + block_size = NonuniformFFTs.default_block_size(CPU()), sort_points = False(), ) where {T <: Number} Tr = real(T) @@ -79,7 +79,7 @@ function test_nufft_type2( Np = 2 * first(Ns), m = HalfSupport(8), σ = 1.25, - block_size = NonuniformFFTs.default_block_size(), + block_size = NonuniformFFTs.default_block_size(CPU()), sort_points = False(), ) where {T <: Number} Tr = real(T) From 2fc6487bb30291cdc77a07d9db4c3bcbe3a7f1b5 Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Fri, 20 Sep 2024 15:15:16 +0200 Subject: [PATCH 10/10] Fix JET tests? 
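Taken together, the options exercised by the tests above can be combined as in the
following sketch (parameter values follow compare_with_cpu; the data setup mimics the
tests, and the comments describe expected behaviour rather than verified output):

    using NonuniformFFTs, StaticArrays
    xp = [rand(SVector{3, Float64}) * 2π for _ ∈ 1:1000]  # non-uniform points in [0, 2π)³
    vp = randn(ComplexF64, length(xp))                     # values at those points
    p = PlanNUFFT(ComplexF64, (35, 64, 40); m = HalfSupport(4), kernel = KaiserBesselKernel(),
                  σ = 1.5, sort_points = True(), block_size = 1024)
    set_points!(p, xp)  # with a GPU backend, this is where the Hilbert sort happens
    # ... then exec_type1!(ûs, p, vp) / exec_type2!(vp, p, ûs) proceed as before,
    # using the stored point permutation when spreading and interpolating.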
--- src/blocking.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/blocking.jl b/src/blocking.jl index ab54354..e7b355e 100644 --- a/src/blocking.jl +++ b/src/blocking.jl @@ -273,7 +273,7 @@ function BlockData( @assert N - M > 0 min(B, N ÷ 2, N - M) end - dims = block_dims .+ 2M # include padding for values outside of block (TODO: include padding in original block_dims?) + dims = block_dims .+ 2M # include padding for values outside of block (TODO: include padding in original block_dims? requires minimal block_size in each direction) Tr = real(T) block_sizes = map(Ñs, block_dims) do N, B @inline @@ -299,9 +299,9 @@ function BlockData( ) end -function set_points!(::CPU, bd::BlockData, points::StructVector, xp, timer; transform::F = identity) where {F <: Function} +function set_points!(backend::CPU, bd::BlockData, points::StructVector, xp, timer; transform::F = identity) where {F <: Function} # This technically never happens, but we might use it as a way to disable blocking. - isempty(bd.buffers) && return set_points!(NullBlockData(), points, xp, timer) + isempty(bd.buffers) && return set_points!(backend, NullBlockData(), points, xp, timer; transform) (; indices, cumulative_npoints_per_block, blockidx, pointperm, block_sizes,) = bd N = type_length(eltype(xp)) # = number of dimensions