From 5c45c7dd4e6e425971a71eb4da94e372d8b9c8d9 Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Thu, 17 Oct 2024 13:03:43 +0200 Subject: [PATCH 01/33] Start work on shared-memory implementation --- src/NonuniformFFTs.jl | 2 +- src/blocking.jl | 45 +++++++++++++++++++++++--- src/interpolation/gpu.jl | 68 +++++++++++++++++++++++++++++++++------- src/plan.jl | 3 +- 4 files changed, 101 insertions(+), 17 deletions(-) diff --git a/src/NonuniformFFTs.jl b/src/NonuniformFFTs.jl index a432663..8a9d205 100644 --- a/src/NonuniformFFTs.jl +++ b/src/NonuniformFFTs.jl @@ -5,7 +5,7 @@ using AbstractNFFTs: AbstractNFFTs, AbstractNFFTPlan using GPUArraysCore: AbstractGPUArray, AbstractGPUVector using StructArrays: StructArrays, StructVector using AbstractFFTs: AbstractFFTs -using KernelAbstractions: KernelAbstractions as KA, CPU, GPU, @kernel, @index, @Const +using KernelAbstractions: KernelAbstractions as KA, CPU, GPU, @kernel, @index, @Const, @groupsize, @localmem using Atomix: Atomix using FFTW: FFTW using LinearAlgebra: mul! diff --git a/src/blocking.jl b/src/blocking.jl index ad143af..4cffc19 100644 --- a/src/blocking.jl +++ b/src/blocking.jl @@ -7,6 +7,7 @@ with_blocking(::NullBlockData) = false # For now these are only used in the GPU implementation get_pointperm(::NullBlockData) = nothing get_sort_points(::NullBlockData) = False() +gpu_method(::NullBlockData) = :global_memory @kernel function copy_points_unblocked_kernel!(@Const(transform::F), points::NTuple, @Const(xp)) where {F} I = @index(Global, Linear) @@ -61,43 +62,79 @@ type_length(::Type{<:NTuple{N}}) where {N} = N # ================================================================================ # +# Determine block size if using the shared-memory implementation. +# We try to make sure that the total block size (including 2M ghost cells in each direction) +# is not larger than the available shared memory. In CUDA the limit is usually 48 KiB. +# Note that the result is a compile-time constant (on Julia 1.11.1 at least). +@inline function block_dims_gpu_shmem(::Type{Z}, ::Dims{D}, ::HalfSupport{M}) where {Z <: Number, D, M} + max_shmem_size = 48 << 10 # 48 KiB -- TODO: make this depend on the actual GPU? + max_block_length = max_shmem_size ÷ sizeof(Z) # maximum number of elements in a block + m = floor(Int, max_block_length^(1/D)) # block size in each direction (including ghost cells / padding) + n = m - 2M # exclude ghost cells + if n ≤ 0 + throw(ArgumentError( + lazy""" + GPU shared memory size is too small for the chosen problem: + - element type: $Z + - half-support: $M + - number of dimensions: $D + If possible, reduce some of these parameters, or else switch to gpu_method = :global_memory.""" + )) + # TODO: warning if n is too small? (likely slower than global_memory method) + # What is small? n < M? + end + ntuple(_ -> n, Val(D)) # = (n, n, ...) 
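+    # Example: for Z = ComplexF64, D = 3 and M = 4 this gives
+    # max_block_length = 49152 ÷ 16 = 3072, m = floor(3072^(1/3)) = 14 and n = 14 - 8 = 6,
+    # i.e. blocks of 6³ grid cells (14³ values once the 2M ghost cells are included).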
+end + # GPU implementation struct BlockDataGPU{ N, I <: Integer, T <: AbstractFloat, IndexVector <: AbstractVector{I}, SortPoints <: StaticBool, } <: AbstractBlockData + method :: Symbol # method used for spreading and interpolation; options: (1) :global_memory (2) :shared_memory nblocks_per_dir :: NTuple{N, I} # number of blocks in each direction + block_dims :: NTuple{N, I} # maximum dimensions of a block (excluding ghost cells) block_sizes :: NTuple{N, T} # size of each block (in units of length) cumulative_npoints_per_block :: IndexVector # cumulative sum of number of points in each block (length = num_blocks + 1, first value is 0) blockidx :: IndexVector # linear index of block associated to each point (length = Np) pointperm :: IndexVector sort_points :: SortPoints function BlockDataGPU( + method::Symbol, nblocks_per_dir::NTuple{N, I}, + block_dims::NTuple{N, I}, block_sizes::NTuple{N, T}, npoints_per_block::V, sort::S, ) where {N, I, T, V, S} + method ∈ (:global_memory, :shared_memory) || throw(ArgumentError("expected gpu_method ∈ (:global_memory, :shared_memory)")) blockidx = similar(npoints_per_block, 0) pointperm = similar(npoints_per_block, 0) new{N, I, T, V, S}( - nblocks_per_dir, block_sizes, npoints_per_block, blockidx, pointperm, sort, + method, nblocks_per_dir, block_dims, block_sizes, npoints_per_block, blockidx, pointperm, sort, ) end end +gpu_method(bd::BlockDataGPU) = bd.method with_blocking(::BlockDataGPU) = true function BlockDataGPU( ::Type{Z}, - backend::KA.Backend, block_dims::Dims{D}, Ñs::Dims{D}, sort_points, - ) where {Z <: Number, D} + backend::KA.Backend, block_dims::Dims{D}, Ñs::Dims{D}, h::HalfSupport{M}, + sort_points::StaticBool; + method::Symbol, + ) where {Z <: Number, D, M} T = real(Z) # in case Z is complex + if method === :shared_memory + # Override input block size. We try to maximise the use of shared memory. + block_dims = block_dims_gpu_shmem(Z, Ñs, h) + end nblocks_per_dir = map(cld, Ñs, block_dims) # basically equal to ceil(Ñ / block_dim) --> effective block size is ≤ block_dims L = T(2) * π block_sizes = map(nblocks -> L / nblocks, nblocks_per_dir) cumulative_npoints_per_block = KA.allocate(backend, Int, prod(nblocks_per_dir) + 1) - BlockDataGPU(nblocks_per_dir, block_sizes, cumulative_npoints_per_block, sort_points) + BlockDataGPU(method, nblocks_per_dir, block_dims, block_sizes, cumulative_npoints_per_block, sort_points) end get_pointperm(bd::BlockDataGPU) = bd.pointperm diff --git a/src/interpolation/gpu.jl b/src/interpolation/gpu.jl index 5c96c63..76a5da2 100644 --- a/src/interpolation/gpu.jl +++ b/src/interpolation/gpu.jl @@ -45,14 +45,38 @@ using StaticArrays: MVector nothing end +@kernel function interpolate_to_points_shmem_kernel!( + vp::NTuple{C}, + @Const(gs::NTuple{D}), + @Const(points::NTuple{D}), + @Const(us::NTuple{C}), + @Const(pointperm), + @Const(Δxs::NTuple{D}), # grid step in each direction (oversampled grid) + ::Val{block_dims}, + ) where {C, D, block_dims} + is_kernel = @index(Global, NTuple)::NTuple{D} + threads = @groupsize()::NTuple{D} + is = is_kernel ./ threads # this is the actual block index + + # TODO: support C > 1 (do one component at a time, to use less memory?) 
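+    # Idea: each workgroup handles one block of the oversampled grid. The block, padded
+    # with 2M ghost cells per direction so that the kernel support of points near its
+    # boundary is covered, is first copied to shared memory; interpolation for the points
+    # in this block can then read from local instead of global memory.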
+ T = eltype(us[1]) + M = Kernels.half_support(gs[1]) # assume they're all equal + block_dims_padded = block_dims .+ 2M + u_local = @localmem(T, block_dims_padded) + + # Copy data to local memory + + nothing +end + function interpolate!( backend::GPU, bd::Union{BlockDataGPU, NullBlockData}, - gs, + gs::NTuple{D}, vp_all::NTuple{C, AbstractVector}, us::NTuple{C, AbstractArray}, x⃗s::AbstractVector, - ) where {C} + ) where {C, D} # Note: the dimensions of arrays have already been checked via check_nufft_nonuniform_data. Base.require_one_based_indexing(x⃗s) # this is to make sure that all indices match foreach(Base.require_one_based_indexing, vp_all) @@ -75,17 +99,39 @@ function interpolate!( pointperm_ = pointperm end - # We use dynamically sized kernels to avoid recompilation, since number of points may - # change from one call to another. - ndrange = size(x⃗s) # iterate through points - workgroupsize = default_workgroupsize(backend, ndrange) - kernel! = interpolate_to_point_naive_kernel!(backend, workgroupsize) - kernel!(vp_sorted, gs, xs_comp, us, pointperm_, Δxs; ndrange) + method = gpu_method(bd) + + if method === :global_memory + # We use dynamically sized kernels to avoid recompilation, since number of points may + # change from one call to another. + let ndrange = size(x⃗s) # iterate through points + groupsize = default_workgroupsize(backend, ndrange) + kernel! = interpolate_to_point_naive_kernel!(backend, groupsize) + kernel!(vp_sorted, gs, xs_comp, us, pointperm_, Δxs; ndrange) + end + elseif method === :shared_memory + @assert bd isa BlockDataGPU + Z = eltype(us[1]) + M = Kernels.half_support(gs[1]) + @assert all(g -> Kernels.half_support(g) === M, gs) # check that they're all equal + block_dims_val = block_dims_gpu_shmem(Z, size(us[1]), HalfSupport(M)) # this is usually a compile-time constant... + block_dims = Val(block_dims_val) # ...which means this doesn't require a dynamic dispatch + @assert block_dims_val === bd.block_dims + let ngroups = bd.nblocks_per_dir # this is the required number of workgroups (number of blocks in CUDA) + groupsize = 64 # TODO: this should roughly be the number of non-uniform points per block (or less) + groupsize_dims = ntuple(d -> d == 1 ? groupsize : 1, D) # "augment" first dimension + ndrange = groupsize_dims .* ngroups + kernel! = interpolate_to_points_shmem_kernel!(backend, groupsize_dims, ndrange) + kernel!(vp_sorted, gs, xs_comp, us, pointperm_, Δxs, block_dims) + end + end if sort_points === True() - kernel_perm! = interp_permute_kernel!(backend, workgroupsize) - kernel_perm!(vp_all, vp_sorted, pointperm; ndrange) - foreach(KA.unsafe_free!, vp_sorted) # manually deallocate temporary arrays + let groupsize = default_workgroupsize(backend, ndrange) + kernel_perm! = interp_permute_kernel!(backend, groupsize) + kernel_perm!(vp_all, vp_sorted, pointperm; ndrange) + foreach(KA.unsafe_free!, vp_sorted) # manually deallocate temporary arrays + end end vp_all diff --git a/src/plan.jl b/src/plan.jl index aff8c1c..61fce96 100644 --- a/src/plan.jl +++ b/src/plan.jl @@ -308,6 +308,7 @@ function _PlanNUFFT( backend::KA.Backend = CPU(), block_size::Union{Integer, Dims{D}, Nothing} = default_block_size(Ns, backend), synchronise::Bool = false, + gpu_method::Symbol = :global_memory, ) where {T <: Number, D} ks = init_wavenumbers(T, Ns) # Determine dimensions of oversampled grid. 
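Note on usage: the new gpu_method keyword above selects between the two GPU implementations.
A minimal sketch of how a user would opt into the shared-memory kernels, assuming the
exported PlanNUFFT constructor forwards its keyword arguments to _PlanNUFFT and that a CUDA
device is available (none of this is part of the diff):

    using NonuniformFFTs, CUDA

    # Hypothetical example: 3D plan using the shared-memory spreading/interpolation kernels.
    plan = PlanNUFFT(ComplexF64, (256, 256, 256); backend = CUDABackend(), gpu_method = :shared_memory)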
@@ -342,7 +343,7 @@ function _PlanNUFFT( else block_dims = get_block_dims(Ñs, block_size) if backend isa GPU - blocks = BlockDataGPU(T, backend, block_dims, Ñs, sort_points) + blocks = BlockDataGPU(T, backend, block_dims, Ñs, h, sort_points; method = gpu_method) else blocks = BlockData(T, block_dims, Ñs, h, num_transforms, sort_points) FFTW.set_num_threads(Threads.nthreads()) From 6be3d8ea87733aa72653693484e0c4fe6c0b28d7 Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Thu, 17 Oct 2024 14:44:10 +0200 Subject: [PATCH 02/33] WIP --- src/NonuniformFFTs.jl | 2 +- src/interpolation/gpu.jl | 77 +++++++++++++++++++++++++++++++--------- 2 files changed, 61 insertions(+), 18 deletions(-) diff --git a/src/NonuniformFFTs.jl b/src/NonuniformFFTs.jl index 8a9d205..fd3a0fb 100644 --- a/src/NonuniformFFTs.jl +++ b/src/NonuniformFFTs.jl @@ -5,7 +5,7 @@ using AbstractNFFTs: AbstractNFFTs, AbstractNFFTPlan using GPUArraysCore: AbstractGPUArray, AbstractGPUVector using StructArrays: StructArrays, StructVector using AbstractFFTs: AbstractFFTs -using KernelAbstractions: KernelAbstractions as KA, CPU, GPU, @kernel, @index, @Const, @groupsize, @localmem +using KernelAbstractions: KernelAbstractions as KA, CPU, GPU, @kernel, @index, @Const, @groupsize, @localmem, @synchronize using Atomix: Atomix using FFTW: FFTW using LinearAlgebra: mul! diff --git a/src/interpolation/gpu.jl b/src/interpolation/gpu.jl index 76a5da2..16abfec 100644 --- a/src/interpolation/gpu.jl +++ b/src/interpolation/gpu.jl @@ -1,5 +1,23 @@ using StaticArrays: MVector +# TODO: use this also in spreading, and move to separate file +@inline function get_inds_vals_gpu(gs::NTuple{D}, points::NTuple{D}, Ns::NTuple{D}, j::Integer) where {D} + ntuple(Val(D)) do n + @inline + get_inds_vals_gpu(gs[n], points[n], Ns[n], j) + end +end + +@inline function get_inds_vals_gpu(g::AbstractKernelData, points::AbstractVector, N::Integer, j::Integer) + x = @inbounds points[j] + gdata = Kernels.evaluate_kernel(g, x) + vals = gdata.values # kernel values + M = Kernels.half_support(g) + i₀ = gdata.i - M # active region is (i₀ + 1):(i₀ + 2M) (up to periodic wrapping) + i₀ = ifelse(i₀ < 0, i₀ + N, i₀) # make sure i₀ ≥ 0 + i₀ => vals +end + # Interpolate onto a single point @kernel function interpolate_to_point_naive_kernel!( vp::NTuple{C}, @@ -22,19 +40,7 @@ using StaticArrays: MVector # don't perform atomic operations. This is why the code is simpler here. Ns = size(first(us)) # grid dimensions - indvals = ntuple(Val(D)) do n - @inline - @inbounds begin - g = gs[n] - x = points[n][j] - gdata = Kernels.evaluate_kernel(g, x) - vals = gdata.values # kernel values - M = Kernels.half_support(gs[n]) - i₀ = gdata.i - M # active region is (i₀ + 1):(i₀ + 2M) (up to periodic wrapping) - i₀ = ifelse(i₀ < 0, i₀ + Ns[n], i₀) # make sure i₀ ≥ 0 - i₀ => vals - end - end + indvals = get_inds_vals_gpu(gs, points, Ns, j) v⃗ = interpolate_from_arrays_gpu(us, indvals, Ns, Δxs) @@ -55,20 +61,57 @@ end ::Val{block_dims}, ) where {C, D, block_dims} is_kernel = @index(Global, NTuple)::NTuple{D} - threads = @groupsize()::NTuple{D} - is = is_kernel ./ threads # this is the actual block index + groupsize = @groupsize()::NTuple{D} # generally equal to (nthreads, 1, 1, ...) + threadidx = @index(Local, Linear) + block_index = is_kernel ./ groupsize + nthreads = prod(groupsize) # TODO: support C > 1 (do one component at a time, to use less memory?) 
T = eltype(us[1]) M = Kernels.half_support(gs[1]) # assume they're all equal block_dims_padded = block_dims .+ 2M - u_local = @localmem(T, block_dims_padded) + u_local = @localmem(T, block_dims_padded) # allocate shared memory + + # Copy grid data from global to shared memory + gridvalues_to_local_memory!(u_local, us[1], Val(M), block_index, block_dims, threadidx, nthreads) - # Copy data to local memory + @synchronize # make sure all threads have the same shared data nothing end +@inline function gridvalues_to_local_memory!( + u_local::AbstractArray{T, D}, + u_global::AbstractArray{T, D}, + ::Val{M}, + block_index::NTuple{D}, block_dims::NTuple{D}, + threadidx::Integer, + nthreads::Integer, + ) where {T, D, M} + Ns = size(u_global) + # inds = ntuple(Val(D)) do n + # @inline + # a = (block_index[n] - 1) * block_dims[n] + 1 # start of current block in global grid (excluding ghost cells) + # b = block_index[n] * block_dims[n] # end of current block + # (a - M):(b + M) # range including ghost cells + # end + # TODO: split work across threads + # Assume D = 3 for now + if threadidx == 1 + # for is in iter + for i_3 in axes(u_local, 3), i_2 in axes(u_local, 2), i_1 in axes(u_local, 1) + # For some reason, type assertions are needed for things to work on AMDGPU + j_1 = mod1(block_index[1] - 1 * block_dims[1] + i_1 - M, Ns[1])::Int + j_2 = mod1(block_index[2] - 1 * block_dims[2] + i_2 - M, Ns[2])::Int + j_3 = mod1(block_index[3] - 1 * block_dims[3] + i_3 - M, Ns[3])::Int + is = (i_1, i_2, i_3)::Dims{D} + js = (j_1, j_2, j_3)::Dims{D} + u_local[is...] = u_global[js...] + end + end + nothing +end + function interpolate!( backend::GPU, bd::Union{BlockDataGPU, NullBlockData}, From 678431763e5cab7c0335082226f8b35b116a2b26 Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Fri, 18 Oct 2024 09:46:39 +0200 Subject: [PATCH 03/33] Interpolation now works (in 3D, ntransforms = 1) --- src/NonuniformFFTs.jl | 3 +- src/blocking.jl | 51 ++++++++++++-------- src/interpolation/gpu.jl | 101 +++++++++++++++++++++++++++++---------- 3 files changed, 108 insertions(+), 47 deletions(-) diff --git a/src/NonuniformFFTs.jl b/src/NonuniformFFTs.jl index fd3a0fb..eeaf20f 100644 --- a/src/NonuniformFFTs.jl +++ b/src/NonuniformFFTs.jl @@ -5,7 +5,8 @@ using AbstractNFFTs: AbstractNFFTs, AbstractNFFTPlan using GPUArraysCore: AbstractGPUArray, AbstractGPUVector using StructArrays: StructArrays, StructVector using AbstractFFTs: AbstractFFTs -using KernelAbstractions: KernelAbstractions as KA, CPU, GPU, @kernel, @index, @Const, @groupsize, @localmem, @synchronize +using KernelAbstractions: KernelAbstractions as KA, CPU, GPU, @kernel, @index, @Const, + @groupsize, @localmem, @synchronize, @uniform using Atomix: Atomix using FFTW: FFTW using LinearAlgebra: mul! diff --git a/src/blocking.jl b/src/blocking.jl index 4cffc19..2b6a50b 100644 --- a/src/blocking.jl +++ b/src/blocking.jl @@ -70,7 +70,7 @@ type_length(::Type{<:NTuple{N}}) where {N} = N max_shmem_size = 48 << 10 # 48 KiB -- TODO: make this depend on the actual GPU? 
max_block_length = max_shmem_size ÷ sizeof(Z) # maximum number of elements in a block m = floor(Int, max_block_length^(1/D)) # block size in each direction (including ghost cells / padding) - n = m - 2M # exclude ghost cells + n = m - (2M - 1) # exclude ghost cells if n ≤ 0 throw(ArgumentError( lazy""" @@ -92,26 +92,26 @@ struct BlockDataGPU{ IndexVector <: AbstractVector{I}, SortPoints <: StaticBool, } <: AbstractBlockData - method :: Symbol # method used for spreading and interpolation; options: (1) :global_memory (2) :shared_memory + method :: Symbol # method used for spreading and interpolation; options: (1) :global_memory (2) :shared_memory + Δxs :: NTuple{N, T} # grid step; the size of a block in units of length is block_dims .* Δxs. nblocks_per_dir :: NTuple{N, I} # number of blocks in each direction block_dims :: NTuple{N, I} # maximum dimensions of a block (excluding ghost cells) - block_sizes :: NTuple{N, T} # size of each block (in units of length) cumulative_npoints_per_block :: IndexVector # cumulative sum of number of points in each block (length = num_blocks + 1, first value is 0) blockidx :: IndexVector # linear index of block associated to each point (length = Np) pointperm :: IndexVector sort_points :: SortPoints function BlockDataGPU( method::Symbol, + Δxs::NTuple{N, T}, nblocks_per_dir::NTuple{N, I}, block_dims::NTuple{N, I}, - block_sizes::NTuple{N, T}, npoints_per_block::V, sort::S, ) where {N, I, T, V, S} method ∈ (:global_memory, :shared_memory) || throw(ArgumentError("expected gpu_method ∈ (:global_memory, :shared_memory)")) blockidx = similar(npoints_per_block, 0) pointperm = similar(npoints_per_block, 0) new{N, I, T, V, S}( - method, nblocks_per_dir, block_dims, block_sizes, npoints_per_block, blockidx, pointperm, sort, + method, Δxs, nblocks_per_dir, block_dims, npoints_per_block, blockidx, pointperm, sort, ) end end @@ -130,11 +130,11 @@ function BlockDataGPU( # Override input block size. We try to maximise the use of shared memory. block_dims = block_dims_gpu_shmem(Z, Ñs, h) end - nblocks_per_dir = map(cld, Ñs, block_dims) # basically equal to ceil(Ñ / block_dim) --> effective block size is ≤ block_dims - L = T(2) * π - block_sizes = map(nblocks -> L / nblocks, nblocks_per_dir) + nblocks_per_dir = map(cld, Ñs, block_dims) # basically equal to ceil(Ñ / block_dim) + L = T(2) * π # domain period + Δxs = map(N -> L / N, Ñs) # grid step (oversampled grid) cumulative_npoints_per_block = KA.allocate(backend, Int, prod(nblocks_per_dir) + 1) - BlockDataGPU(method, nblocks_per_dir, block_dims, block_sizes, cumulative_npoints_per_block, sort_points) + BlockDataGPU(method, Δxs, nblocks_per_dir, block_dims, cumulative_npoints_per_block, sort_points) end get_pointperm(bd::BlockDataGPU) = bd.pointperm @@ -145,7 +145,7 @@ function set_points_impl!( transform::F = identity, synchronise, ) where {F <: Function} (; - cumulative_npoints_per_block, nblocks_per_dir, block_sizes, + Δxs, cumulative_npoints_per_block, nblocks_per_dir, block_dims, blockidx, pointperm, sort_points, ) = bd @@ -173,8 +173,8 @@ function set_points_impl!( @timeit timer "(1) Assign blocks" let local kernel! 
= assign_blocks_kernel!(backend, workgroupsize) kernel!( - blockidx, cumulative_npoints_per_block, points_comp, xp, - block_sizes, nblocks_per_dir, sort_points, transform; + blockidx, cumulative_npoints_per_block, points_comp, xp, Δxs, + block_dims, nblocks_per_dir, sort_points, transform; ndrange, ) maybe_synchronise(backend, synchronise) @@ -192,8 +192,8 @@ function set_points_impl!( @timeit timer "(3) Sort" let local kernel! = sortperm_kernel!(backend, workgroupsize) kernel!( - pointperm, cumulative_npoints_per_block, blockidx, xp, - block_sizes, nblocks_per_dir, transform; + pointperm, cumulative_npoints_per_block, blockidx, xp, Δxs, + block_dims, nblocks_per_dir, transform; ndrange, ) maybe_synchronise(backend, synchronise) @@ -214,9 +214,18 @@ end # Get index of block where x⃗ is located (assumed to be in [0, 2π)). @inline function block_index( - x⃗::NTuple{N,T}, block_sizes::NTuple{N,T}, nblocks_per_dir::NTuple{N}, + x⃗::NTuple{N,T}, Δxs::NTuple{N,T}, block_dims::NTuple{N,Integer}, nblocks_per_dir::NTuple{N}, ) where {N, T <: AbstractFloat} - is = map(Kernels.point_to_cell, x⃗, block_sizes) + is = ntuple(Val(N)) do n + @inline + # Note: we could directly use the block size Δx_block = Δx * block_dims, but this + # may lead to inconsistency with interpolation and spreading kernels (leading to + # errors) due to numerical accuracy issues. So we first obtain the index on the + # Δx grid, then we translate that to a block index by dividing by the block + # dimensions (= how many Δx's in a single block). + i = Kernels.point_to_cell(x⃗[n], Δxs[n]) # index of grid cell where point is located + cld(i, block_dims[n]) # index of block where point is located + end @inbounds LinearIndices(nblocks_per_dir)[is...] end @@ -225,7 +234,8 @@ end cumulative_npoints_per_block::AbstractVector{<:Integer}, points::NTuple, @Const(xp), - @Const(block_sizes::NTuple), + @Const(Δxs), + @Const(block_dims::NTuple), @Const(nblocks_per_dir::NTuple), @Const(sort_points), @Const(transform::F), @@ -233,7 +243,7 @@ end I = @index(Global, Linear) @inbounds x⃗ = xp[I] y⃗ = to_unit_cell_gpu(transform(Tuple(x⃗))) :: NTuple - n = block_index(y⃗, block_sizes, nblocks_per_dir) + n = block_index(y⃗, Δxs, block_dims, nblocks_per_dir) # Note: here index_within_block is the value *after* incrementing (≥ 1). S = eltype(cumulative_npoints_per_block) @@ -255,14 +265,15 @@ end @Const(cumulative_npoints_per_block), @Const(blockidx), @Const(xp), - @Const(block_sizes), + @Const(Δxs), + @Const(block_dims), @Const(nblocks_per_dir), @Const(transform::F), ) where {F} I = @index(Global, Linear) @inbounds x⃗ = xp[I] y⃗ = to_unit_cell_gpu(transform(Tuple(x⃗))) :: NTuple - n = block_index(y⃗, block_sizes, nblocks_per_dir) + n = block_index(y⃗, Δxs, block_dims, nblocks_per_dir) @inbounds J = cumulative_npoints_per_block[n] + blockidx[I] @inbounds pointperm[J] = I nothing diff --git a/src/interpolation/gpu.jl b/src/interpolation/gpu.jl index 16abfec..9d7da83 100644 --- a/src/interpolation/gpu.jl +++ b/src/interpolation/gpu.jl @@ -57,57 +57,106 @@ end @Const(points::NTuple{D}), @Const(us::NTuple{C}), @Const(pointperm), + @Const(cumulative_npoints_per_block::AbstractVector), @Const(Δxs::NTuple{D}), # grid step in each direction (oversampled grid) ::Val{block_dims}, ) where {C, D, block_dims} - is_kernel = @index(Global, NTuple)::NTuple{D} - groupsize = @groupsize()::NTuple{D} # generally equal to (nthreads, 1, 1, ...) 
- threadidx = @index(Local, Linear) - block_index = is_kernel ./ groupsize + groupsize = @groupsize()::Dims{D} # generally equal to (nthreads, 1, 1, ...) nthreads = prod(groupsize) + threadidx = @index(Local, Linear) # in 1:nthreads + block_index = @index(Group, NTuple) # workgroup index (= block index) + block_n = @index(Group, Linear) # linear index of block # TODO: support C > 1 (do one component at a time, to use less memory?) T = eltype(us[1]) M = Kernels.half_support(gs[1]) # assume they're all equal - block_dims_padded = block_dims .+ 2M + block_dims_padded = @. block_dims + 2M - 1 u_local = @localmem(T, block_dims_padded) # allocate shared memory # Copy grid data from global to shared memory gridvalues_to_local_memory!(u_local, us[1], Val(M), block_index, block_dims, threadidx, nthreads) + # This block will take care of non-uniform points (a + 1):b + @inbounds a = cumulative_npoints_per_block[block_n] + @inbounds b = cumulative_npoints_per_block[block_n + 1] + Ns = size(us[1]) # grid dimensions + ΔV = prod(Δxs) + + # Shift from indices in global array to indices in local array + idx_shift = @. (block_index - 1) * block_dims + 1 + @synchronize # make sure all threads have the same shared data + for i in (a + threadidx):nthreads:b + # Interpolate at point j + j = if pointperm === nothing + i + else + @inbounds pointperm[i] + end + + indvals = ntuple(Val(D)) do n + @inline + x = @inbounds points[n][j] + gdata = Kernels.evaluate_kernel(gs[n], x) + local vals = gdata.values # kernel values + local M = Kernels.half_support(gs[n]) + local i₀ = gdata.i - idx_shift[n] + @assert i₀ ≥ 0 + @assert i₀ + 2M ≤ block_dims_padded[n] + i₀ => vals + end + + inds_start = map(first, indvals) + vals = map(last, indvals) # evaluated kernel values in each direction + inds = map(eachindex, vals) # = (1:L, 1:L, ...) where L = 2M is the kernel width + + gprod_4 = ΔV + v = zero(T) + + @inbounds for i_3 in inds[3] + gprod_3 = gprod_4 * vals[3][i_3] + j_3 = inds_start[3] + i_3 + for i_2 in inds[2] + gprod_2 = gprod_3 * vals[2][i_2] + j_2 = inds_start[2] + i_2 + for i_1 in inds[1] + gprod_1 = gprod_2 * vals[1][i_1] + j_1 = inds_start[1] + i_1 + js = (j_1, j_2, j_3) + v += gprod_1 * u_local[js...] + end + end + end + + @inbounds vp[1][j] = v + end + nothing end +# TODO +# - generalise to all D +# - optimise @inline function gridvalues_to_local_memory!( u_local::AbstractArray{T, D}, u_global::AbstractArray{T, D}, ::Val{M}, - block_index::NTuple{D}, block_dims::NTuple{D}, + block_index::Dims{D}, block_dims::Dims{D}, threadidx::Integer, nthreads::Integer, ) where {T, D, M} Ns = size(u_global) - # inds = ntuple(Val(D)) do n - # @inline - # a = (block_index[n] - 1) * block_dims[n] + 1 # start of current block in global grid (excluding ghost cells) - # b = block_index[n] * block_dims[n] # end of current block - # (a - M):(b + M) # range including ghost cells - # end - # TODO: split work across threads - # Assume D = 3 for now - if threadidx == 1 - # for is in iter - for i_3 in axes(u_local, 3), i_2 in axes(u_local, 2), i_1 in axes(u_local, 1) - # For some reason, type assertions are needed for things to work on AMDGPU - j_1 = mod1(block_index[1] - 1 * block_dims[1] + i_1 - M, Ns[1])::Int - j_2 = mod1(block_index[2] - 1 * block_dims[2] + i_2 - M, Ns[2])::Int - j_3 = mod1(block_index[3] - 1 * block_dims[3] + i_3 - M, Ns[3])::Int - is = (i_1, i_2, i_3)::Dims{D} - js = (j_1, j_2, j_3)::Dims{D} - u_local[is...] = u_global[js...] 
- end + @assert D == 3 + inds_3 = axes(u_local, 3)[threadidx:nthreads:end] # parallelise over outermost dimension (some threads might not work here) + @inbounds for i_3 in inds_3, i_2 in axes(u_local, 2), i_1 in axes(u_local, 1) + # For some reason, type assertions are needed for things to work on AMDGPU + j_1 = mod1((block_index[1] - 1) * block_dims[1] - (M - 1) + i_1, Ns[1]) + j_2 = mod1((block_index[2] - 1) * block_dims[2] - (M - 1) + i_2, Ns[2]) + j_3 = mod1((block_index[3] - 1) * block_dims[3] - (M - 1) + i_3, Ns[3]) + is = (i_1, i_2, i_3) + js = (j_1, j_2, j_3) + u_local[is...] = u_global[js...] # TODO: use linear index instead of is? end nothing end @@ -165,7 +214,7 @@ function interpolate!( groupsize_dims = ntuple(d -> d == 1 ? groupsize : 1, D) # "augment" first dimension ndrange = groupsize_dims .* ngroups kernel! = interpolate_to_points_shmem_kernel!(backend, groupsize_dims, ndrange) - kernel!(vp_sorted, gs, xs_comp, us, pointperm_, Δxs, block_dims) + kernel!(vp_sorted, gs, xs_comp, us, pointperm_, bd.cumulative_npoints_per_block, Δxs, block_dims) end end From cec2a65db7b7b8bad74f78b4bccd86dc835b631a Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Fri, 18 Oct 2024 09:55:42 +0200 Subject: [PATCH 04/33] Minor changes --- src/interpolation/gpu.jl | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/src/interpolation/gpu.jl b/src/interpolation/gpu.jl index 9d7da83..2bf4fd0 100644 --- a/src/interpolation/gpu.jl +++ b/src/interpolation/gpu.jl @@ -79,7 +79,6 @@ end # This block will take care of non-uniform points (a + 1):b @inbounds a = cumulative_npoints_per_block[block_n] @inbounds b = cumulative_npoints_per_block[block_n + 1] - Ns = size(us[1]) # grid dimensions ΔV = prod(Δxs) # Shift from indices in global array to indices in local array @@ -137,7 +136,8 @@ end # TODO # - generalise to all D -# - optimise +# - parallelise over multiple directions? +# - choose an optimal memory access pattern @inline function gridvalues_to_local_memory!( u_local::AbstractArray{T, D}, u_global::AbstractArray{T, D}, @@ -148,15 +148,21 @@ end ) where {T, D, M} Ns = size(u_global) @assert D == 3 + inds_1 = axes(u_local, 1) + inds_2 = axes(u_local, 2) inds_3 = axes(u_local, 3)[threadidx:nthreads:end] # parallelise over outermost dimension (some threads might not work here) - @inbounds for i_3 in inds_3, i_2 in axes(u_local, 2), i_1 in axes(u_local, 1) - # For some reason, type assertions are needed for things to work on AMDGPU - j_1 = mod1((block_index[1] - 1) * block_dims[1] - (M - 1) + i_1, Ns[1]) - j_2 = mod1((block_index[2] - 1) * block_dims[2] - (M - 1) + i_2, Ns[2]) - j_3 = mod1((block_index[3] - 1) * block_dims[3] - (M - 1) + i_3, Ns[3]) - is = (i_1, i_2, i_3) - js = (j_1, j_2, j_3) - u_local[is...] = u_global[js...] # TODO: use linear index instead of is? + offsets = @. (block_index - 1) * block_dims - (M - 1) + @inbounds for i_3 in inds_3 + j_3 = mod1(offsets[3] + i_3, Ns[3]) + for i_2 in inds_2 + j_2 = mod1(offsets[2] + i_2, Ns[2]) + for i_1 in inds_1 + j_1 = mod1(offsets[1] + i_1, Ns[1]) + is = (i_1, i_2, i_3) + js = (j_1, j_2, j_3) + u_local[is...] = u_global[js...] # TODO: use linear index instead of is? 
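+                # The mod1 wrapping of j_1, j_2, j_3 makes ghost-cell reads periodic:
+                # blocks touching the domain boundary read from the opposite side of the
+                # global grid.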
+ end + end end nothing end @@ -210,7 +216,7 @@ function interpolate!( block_dims = Val(block_dims_val) # ...which means this doesn't require a dynamic dispatch @assert block_dims_val === bd.block_dims let ngroups = bd.nblocks_per_dir # this is the required number of workgroups (number of blocks in CUDA) - groupsize = 64 # TODO: this should roughly be the number of non-uniform points per block (or less) + groupsize = 64 # TODO: how to choose this? roughly equal to the number of non-uniform points per block (or less)? groupsize_dims = ntuple(d -> d == 1 ? groupsize : 1, D) # "augment" first dimension ndrange = groupsize_dims .* ngroups kernel! = interpolate_to_points_shmem_kernel!(backend, groupsize_dims, ndrange) From 0ac88869d816732139861483039c7145354ec327 Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Fri, 18 Oct 2024 11:24:34 +0200 Subject: [PATCH 05/33] Optimisations --- src/interpolation/gpu.jl | 144 ++++++++++++++++++++++++--------------- 1 file changed, 88 insertions(+), 56 deletions(-) diff --git a/src/interpolation/gpu.jl b/src/interpolation/gpu.jl index 2bf4fd0..546dca8 100644 --- a/src/interpolation/gpu.jl +++ b/src/interpolation/gpu.jl @@ -61,8 +61,9 @@ end @Const(Δxs::NTuple{D}), # grid step in each direction (oversampled grid) ::Val{block_dims}, ) where {C, D, block_dims} - groupsize = @groupsize()::Dims{D} # generally equal to (nthreads, 1, 1, ...) + groupsize = @groupsize()::Dims{D} nthreads = prod(groupsize) + threadidxs = @index(Local, NTuple) # in (1:nthreads_x, 1:nthreads_y, ...) threadidx = @index(Local, Linear) # in 1:nthreads block_index = @index(Group, NTuple) # workgroup index (= block index) block_n = @index(Group, Linear) # linear index of block @@ -73,9 +74,6 @@ end block_dims_padded = @. block_dims + 2M - 1 u_local = @localmem(T, block_dims_padded) # allocate shared memory - # Copy grid data from global to shared memory - gridvalues_to_local_memory!(u_local, us[1], Val(M), block_index, block_dims, threadidx, nthreads) - # This block will take care of non-uniform points (a + 1):b @inbounds a = cumulative_npoints_per_block[block_n] @inbounds b = cumulative_npoints_per_block[block_n + 1] @@ -84,51 +82,57 @@ end # Shift from indices in global array to indices in local array idx_shift = @. (block_index - 1) * block_dims + 1 - @synchronize # make sure all threads have the same shared data + # Interpolate components one by one (to avoid using too much memory) + for c ∈ 1:C + # Copy grid data from global to shared memory + gridvalues_to_local_memory!(u_local, us[c], Val(M), block_index, block_dims, threadidxs, groupsize) - for i in (a + threadidx):nthreads:b - # Interpolate at point j - j = if pointperm === nothing - i - else - @inbounds pointperm[i] - end + @synchronize # make sure all threads have the same shared data - indvals = ntuple(Val(D)) do n - @inline - x = @inbounds points[n][j] - gdata = Kernels.evaluate_kernel(gs[n], x) - local vals = gdata.values # kernel values - local M = Kernels.half_support(gs[n]) - local i₀ = gdata.i - idx_shift[n] - @assert i₀ ≥ 0 - @assert i₀ + 2M ≤ block_dims_padded[n] - i₀ => vals - end + for i in (a + threadidx):nthreads:b + # Interpolate at point j + j = if pointperm === nothing + i + else + @inbounds pointperm[i] + end - inds_start = map(first, indvals) - vals = map(last, indvals) # evaluated kernel values in each direction - inds = map(eachindex, vals) # = (1:L, 1:L, ...) 
where L = 2M is the kernel width - - gprod_4 = ΔV - v = zero(T) - - @inbounds for i_3 in inds[3] - gprod_3 = gprod_4 * vals[3][i_3] - j_3 = inds_start[3] + i_3 - for i_2 in inds[2] - gprod_2 = gprod_3 * vals[2][i_2] - j_2 = inds_start[2] + i_2 - for i_1 in inds[1] - gprod_1 = gprod_2 * vals[1][i_1] - j_1 = inds_start[1] + i_1 - js = (j_1, j_2, j_3) - v += gprod_1 * u_local[js...] + indvals = ntuple(Val(D)) do n + @inline + x = @inbounds points[n][j] + gdata = Kernels.evaluate_kernel(gs[n], x) + local vals = gdata.values # kernel values + local M = Kernels.half_support(gs[n]) + local i₀ = gdata.i - idx_shift[n] + @assert i₀ ≥ 0 + @assert i₀ + 2M ≤ block_dims_padded[n] + i₀ => vals + end + + inds_start = map(first, indvals) + vals = map(last, indvals) # evaluated kernel values in each direction + inds = map(eachindex, vals) # = (1:L, 1:L, ...) where L = 2M is the kernel width + + gprod_4 = ΔV + v = zero(T) + + @inbounds for i_3 in inds[3] + gprod_3 = gprod_4 * vals[3][i_3] + j_3 = inds_start[3] + i_3 + for i_2 in inds[2] + gprod_2 = gprod_3 * vals[2][i_2] + j_2 = inds_start[2] + i_2 + for i_1 in inds[1] + gprod_1 = gprod_2 * vals[1][i_1] + j_1 = inds_start[1] + i_1 + js = (j_1, j_2, j_3) + v += gprod_1 * u_local[js...] + end end end - end - @inbounds vp[1][j] = v + @inbounds vp[c][j] = v + end end nothing @@ -137,27 +141,33 @@ end # TODO # - generalise to all D # - parallelise over multiple directions? -# - choose an optimal memory access pattern +# - is it possible to optimise the memory access patterns? @inline function gridvalues_to_local_memory!( u_local::AbstractArray{T, D}, u_global::AbstractArray{T, D}, ::Val{M}, block_index::Dims{D}, block_dims::Dims{D}, - threadidx::Integer, - nthreads::Integer, + threadidxs::Dims{D}, + groupsize::Dims{D}, ) where {T, D, M} Ns = size(u_global) @assert D == 3 - inds_1 = axes(u_local, 1) - inds_2 = axes(u_local, 2) - inds_3 = axes(u_local, 3)[threadidx:nthreads:end] # parallelise over outermost dimension (some threads might not work here) + inds_1 = axes(u_local, 1)[threadidxs[1]:groupsize[1]:end] + inds_2 = axes(u_local, 2)[threadidxs[2]:groupsize[2]:end] + inds_3 = axes(u_local, 3)[threadidxs[3]:groupsize[3]:end] offsets = @. (block_index - 1) * block_dims - (M - 1) @inbounds for i_3 in inds_3 - j_3 = mod1(offsets[3] + i_3, Ns[3]) + j_3 = offsets[3] + i_3 + j_3 = ifelse(j_3 ≤ 0, j_3 + Ns[3], j_3) + j_3 = ifelse(j_3 > Ns[3], j_3 - Ns[3], j_3) for i_2 in inds_2 - j_2 = mod1(offsets[2] + i_2, Ns[2]) + j_2 = offsets[2] + i_2 + j_2 = ifelse(j_2 ≤ 0, j_2 + Ns[2], j_2) + j_2 = ifelse(j_2 > Ns[2], j_2 - Ns[2], j_2) for i_1 in inds_1 - j_1 = mod1(offsets[1] + i_1, Ns[1]) + j_1 = offsets[1] + i_1 + j_1 = ifelse(j_1 ≤ 0, j_1 + Ns[1], j_1) + j_1 = ifelse(j_1 > Ns[1], j_1 - Ns[1], j_1) is = (i_1, i_2, i_3) js = (j_1, j_2, j_3) u_local[is...] = u_global[js...] # TODO: use linear index instead of is? @@ -216,10 +226,9 @@ function interpolate!( block_dims = Val(block_dims_val) # ...which means this doesn't require a dynamic dispatch @assert block_dims_val === bd.block_dims let ngroups = bd.nblocks_per_dir # this is the required number of workgroups (number of blocks in CUDA) - groupsize = 64 # TODO: how to choose this? roughly equal to the number of non-uniform points per block (or less)? - groupsize_dims = ntuple(d -> d == 1 ? groupsize : 1, D) # "augment" first dimension - ndrange = groupsize_dims .* ngroups - kernel! 
= interpolate_to_points_shmem_kernel!(backend, groupsize_dims, ndrange) + groupsize = groupsize_shmem(ngroups, length(x⃗s)) + ndrange = groupsize .* ngroups + kernel! = interpolate_to_points_shmem_kernel!(backend, groupsize, ndrange) kernel!(vp_sorted, gs, xs_comp, us, pointperm_, bd.cumulative_npoints_per_block, Δxs, block_dims) end end @@ -235,6 +244,29 @@ function interpolate!( vp_all end +# Determine groupsize (number of threads per direction) in :shared_memory method. +# Here Np is the number of non-uniform points. +function groupsize_shmem(ngroups::NTuple{D}, Np) where {D} + points_per_group = Np / prod(ngroups) # average number of points per block + groupsize = 64 # minimum group size should be equal to the warp size (usually 32 on CUDA and 64 on AMDGPU) + # Increase groupsize if number of points is much larger (at least 4×) than the groupsize. + # (Not sure this is a good idea if the point distribution is very inhomogeneous...) + # TODO: maybe leave it at 64? + while groupsize < 1024 && 4 * groupsize ≤ points_per_group + groupsize *= 2 + end + gsizes = ntuple(_ -> 1, Val(D)) + p = 1 # product of sizes + i = 1 + while p < groupsize + gsizes = Base.setindex(gsizes, gsizes[i] * 2, i) + p *= 2 + i = mod1(i + 1, D) + end + @assert p == groupsize == prod(gsizes) + gsizes +end + @inline function interpolate_from_arrays_gpu( us::NTuple{C, AbstractArray{T, D}}, indvals::NTuple{D, <:Pair}, From ecef0383f0dc7118262725fd636bca3ca7ef5b23 Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Fri, 18 Oct 2024 12:21:47 +0200 Subject: [PATCH 06/33] Further optimisations --- src/interpolation/gpu.jl | 49 +++++++++++++++++++++++++--------------- 1 file changed, 31 insertions(+), 18 deletions(-) diff --git a/src/interpolation/gpu.jl b/src/interpolation/gpu.jl index 546dca8..e05529f 100644 --- a/src/interpolation/gpu.jl +++ b/src/interpolation/gpu.jl @@ -140,8 +140,7 @@ end # TODO # - generalise to all D -# - parallelise over multiple directions? -# - is it possible to optimise the memory access patterns? +# - how to optimise the memory access patterns? @inline function gridvalues_to_local_memory!( u_local::AbstractArray{T, D}, u_global::AbstractArray{T, D}, @@ -155,18 +154,19 @@ end inds_1 = axes(u_local, 1)[threadidxs[1]:groupsize[1]:end] inds_2 = axes(u_local, 2)[threadidxs[2]:groupsize[2]:end] inds_3 = axes(u_local, 3)[threadidxs[3]:groupsize[3]:end] - offsets = @. (block_index - 1) * block_dims - (M - 1) + offsets = ntuple(Val(D)) do n + @inline + off = (block_index[n] - 1) * block_dims[n] - (M - 1) + ifelse(off < 0, off + Ns[n], off) # make sure the offset is non-negative (to avoid some wrapping below) + end @inbounds for i_3 in inds_3 j_3 = offsets[3] + i_3 - j_3 = ifelse(j_3 ≤ 0, j_3 + Ns[3], j_3) j_3 = ifelse(j_3 > Ns[3], j_3 - Ns[3], j_3) for i_2 in inds_2 j_2 = offsets[2] + i_2 - j_2 = ifelse(j_2 ≤ 0, j_2 + Ns[2], j_2) j_2 = ifelse(j_2 > Ns[2], j_2 - Ns[2], j_2) for i_1 in inds_1 j_1 = offsets[1] + i_1 - j_1 = ifelse(j_1 ≤ 0, j_1 + Ns[1], j_1) j_1 = ifelse(j_1 > Ns[1], j_1 - Ns[1], j_1) is = (i_1, i_2, i_3) js = (j_1, j_2, j_3) @@ -226,7 +226,9 @@ function interpolate!( block_dims = Val(block_dims_val) # ...which means this doesn't require a dynamic dispatch @assert block_dims_val === bd.block_dims let ngroups = bd.nblocks_per_dir # this is the required number of workgroups (number of blocks in CUDA) - groupsize = groupsize_shmem(ngroups, length(x⃗s)) + block_dims_padded = @. 
block_dims_val + 2M - 1 # dimensions of shared memory array + groupsize = groupsize_shmem(ngroups, block_dims_padded, length(x⃗s)) + # @show ngroups block_dims block_dims_padded ndrange = groupsize .* ngroups kernel! = interpolate_to_points_shmem_kernel!(backend, groupsize, ndrange) kernel!(vp_sorted, gs, xs_comp, us, pointperm_, bd.cumulative_npoints_per_block, Δxs, block_dims) @@ -246,22 +248,33 @@ end # Determine groupsize (number of threads per direction) in :shared_memory method. # Here Np is the number of non-uniform points. -function groupsize_shmem(ngroups::NTuple{D}, Np) where {D} - points_per_group = Np / prod(ngroups) # average number of points per block +# The distribution of threads across directions mainly (only?) affects the copies between +# shared and global memories, so it can have an important influence of performance due to +# memory accesses. +function groupsize_shmem(ngroups::NTuple{D}, shmem_size::NTuple{D}, Np) where {D} + # (1) Determine the total number of threads. groupsize = 64 # minimum group size should be equal to the warp size (usually 32 on CUDA and 64 on AMDGPU) # Increase groupsize if number of points is much larger (at least 4×) than the groupsize. - # (Not sure this is a good idea if the point distribution is very inhomogeneous...) - # TODO: maybe leave it at 64? - while groupsize < 1024 && 4 * groupsize ≤ points_per_group - groupsize *= 2 - end + # Not sure this is a good idea if the point distribution is very inhomogeneous... + # TODO see if this improves performance in highly dense cases + # points_per_group = Np / prod(ngroups) # average number of points per block + # while groupsize < 1024 && 4 * groupsize ≤ points_per_group + # groupsize *= 2 + # end + # (2) Determine number of threads in each direction. + # This mainly affects the performance of global -> shared memory copies. + # It seems like it's better to parallelise the outer dimensions first. gsizes = ntuple(_ -> 1, Val(D)) p = 1 # product of sizes - i = 1 + i = D # parallelise outer dimensions first while p < groupsize - gsizes = Base.setindex(gsizes, gsizes[i] * 2, i) - p *= 2 - i = mod1(i + 1, D) + if gsizes[i] < shmem_size[i] || i == 1 + gsizes = Base.setindex(gsizes, gsizes[i] * 2, i) + p *= 2 + else + @assert i > 1 + i -= 1 + end end @assert p == groupsize == prod(gsizes) gsizes From e4aa4ca9f9a3a7564eeb44e164cc7a01bdb55151 Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Fri, 18 Oct 2024 13:24:20 +0200 Subject: [PATCH 07/33] Generalise interpolation to all dimensions --- src/interpolation/gpu.jl | 141 +++++++++++++++++++++++++-------------- 1 file changed, 91 insertions(+), 50 deletions(-) diff --git a/src/interpolation/gpu.jl b/src/interpolation/gpu.jl index e05529f..7cca3b9 100644 --- a/src/interpolation/gpu.jl +++ b/src/interpolation/gpu.jl @@ -68,7 +68,6 @@ end block_index = @index(Group, NTuple) # workgroup index (= block index) block_n = @index(Group, Linear) # linear index of block - # TODO: support C > 1 (do one component at a time, to use less memory?) T = eltype(us[1]) M = Kernels.half_support(gs[1]) # assume they're all equal block_dims_padded = @. block_dims + 2M - 1 @@ -77,7 +76,7 @@ end # This block will take care of non-uniform points (a + 1):b @inbounds a = cumulative_npoints_per_block[block_n] @inbounds b = cumulative_npoints_per_block[block_n + 1] - ΔV = prod(Δxs) + prefactor = prod(Δxs) # interpolations need to be multiplied by the volume of a grid cell # Shift from indices in global array to indices in local array idx_shift = @. 
(block_index - 1) * block_dims + 1 @@ -97,6 +96,7 @@ end @inbounds pointperm[i] end + # TODO: can we do this just once for all components? (if C > 1) indvals = ntuple(Val(D)) do n @inline x = @inbounds points[n][j] @@ -109,28 +109,7 @@ end i₀ => vals end - inds_start = map(first, indvals) - vals = map(last, indvals) # evaluated kernel values in each direction - inds = map(eachindex, vals) # = (1:L, 1:L, ...) where L = 2M is the kernel width - - gprod_4 = ΔV - v = zero(T) - - @inbounds for i_3 in inds[3] - gprod_3 = gprod_4 * vals[3][i_3] - j_3 = inds_start[3] + i_3 - for i_2 in inds[2] - gprod_2 = gprod_3 * vals[2][i_2] - j_2 = inds_start[2] + i_2 - for i_1 in inds[1] - gprod_1 = gprod_2 * vals[1][i_1] - j_1 = inds_start[1] + i_1 - js = (j_1, j_2, j_3) - v += gprod_1 * u_local[js...] - end - end - end - + v = interpolate_from_arrays_shmem(u_local, indvals, prefactor) @inbounds vp[c][j] = v end end @@ -138,9 +117,8 @@ end nothing end -# TODO -# - generalise to all D -# - how to optimise the memory access patterns? +# Copy values from global to shared memory. +# - can we optimise memory access patterns? @inline function gridvalues_to_local_memory!( u_local::AbstractArray{T, D}, u_global::AbstractArray{T, D}, @@ -149,32 +127,95 @@ end threadidxs::Dims{D}, groupsize::Dims{D}, ) where {T, D, M} - Ns = size(u_global) - @assert D == 3 - inds_1 = axes(u_local, 1)[threadidxs[1]:groupsize[1]:end] - inds_2 = axes(u_local, 2)[threadidxs[2]:groupsize[2]:end] - inds_3 = axes(u_local, 3)[threadidxs[3]:groupsize[3]:end] - offsets = ntuple(Val(D)) do n - @inline - off = (block_index[n] - 1) * block_dims[n] - (M - 1) - ifelse(off < 0, off + Ns[n], off) # make sure the offset is non-negative (to avoid some wrapping below) - end - @inbounds for i_3 in inds_3 - j_3 = offsets[3] + i_3 - j_3 = ifelse(j_3 > Ns[3], j_3 - Ns[3], j_3) - for i_2 in inds_2 - j_2 = offsets[2] + i_2 - j_2 = ifelse(j_2 > Ns[2], j_2 - Ns[2], j_2) - for i_1 in inds_1 - j_1 = offsets[1] + i_1 - j_1 = ifelse(j_1 > Ns[1], j_1 - Ns[1], j_1) - is = (i_1, i_2, i_3) - js = (j_1, j_2, j_3) - u_local[is...] = u_global[js...] # TODO: use linear index instead of is? + if @generated + quote + Ns = size(u_global) + inds = @ntuple($D, n -> axes(u_local, n)[threadidxs[n]:groupsize[n]:end]) + offsets = @ntuple( + $D, + n -> let + off = (block_index[n] - 1) * block_dims[n] - ($M - 1) + ifelse(off < 0, off + Ns[n], off) # make sure the offset is non-negative (to avoid some wrapping below) + end + ) + @nloops( + $D, i, + d -> inds[d], + d -> begin + j_d = offsets[d] + i_d + j_d = ifelse(j_d > Ns[d], j_d - Ns[d], j_d) + end, + begin + is = @ntuple($D, i) + js = @ntuple($D, j) + @inbounds u_local[is...] = u_global[js...] + end, + ) + nothing + end + else + Ns = size(u_global) + inds = ntuple(Val(D)) do n + axes(u_local, n)[threadidxs[n]:groupsize[n]:end] # this determines the parallelisation pattern + end + offsets = ntuple(Val(D)) do n + @inline + off = (block_index[n] - 1) * block_dims[n] - (M - 1) + ifelse(off < 0, off + Ns[n], off) # make sure the offset is non-negative (to avoid some wrapping below) + end + @inbounds for is ∈ Iterators.product(inds...) + js = ntuple(Val(D)) do n + @inline + j = offsets[n] + is[n] + ifelse(j > Ns[n], j - Ns[n], j) end + u_local[is...] = u_global[js...] end + nothing + end +end + +# Interpolate a single "component" (one transform at a time). +# Here vp is a vector instead of a tuple of vectors. 
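+# It returns the value interpolated at a single non-uniform point, reading only from the
+# shared-memory block u_local; indvals contains, for each direction, the starting index
+# within the local (padded) block and the 2M evaluated kernel values.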
+@inline function interpolate_from_arrays_shmem( + u_local::AbstractArray{T, D}, + indvals::NTuple{D}, + prefactor, + ) where {T, D} + if @generated + gprod_init = Symbol(:gprod_, D + 1) # the name of this variable is important! + quote + $gprod_init = prefactor + v = zero(T) + inds_start = map(first, indvals) + vals = map(last, indvals) # evaluated kernel values in each direction + inds = map(eachindex, vals) # = (1:L, 1:L, ...) where L = 2M is the kernel width + @nloops( + $D, i, + d -> inds[d], + d -> begin + @inbounds gprod_d = gprod_{d + 1} * vals[d][i_d] + @inbounds j_d = inds_start[d] + i_d + end, + begin + js = @ntuple($D, j) + @inbounds v += gprod_1 * u_local[js...] + end, + ) + v + end + else + v = zero(T) + inds_start = map(first, indvals) + vals = map(last, indvals) # evaluated kernel values in each direction + inds = map(eachindex, vals) # = (1:L, 1:L, ...) where L = 2M is the kernel width + @inbounds for I ∈ CartesianIndices(inds) + gprod = prefactor * prod(ntuple(d -> @inbounds(vals[d][I[d]]), Val(D))) + js = inds_start .+ Tuple(I) + v += gprod * u_local[js...] + end + v end - nothing end function interpolate!( From 7d1e4f6d0440dbeed12051397c4d624bc42e9843 Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Fri, 18 Oct 2024 13:26:35 +0200 Subject: [PATCH 08/33] Use get_inds_vals_gpu in spreading --- src/interpolation/gpu.jl | 3 ++- src/spreading/gpu.jl | 14 +------------- 2 files changed, 3 insertions(+), 14 deletions(-) diff --git a/src/interpolation/gpu.jl b/src/interpolation/gpu.jl index 7cca3b9..2589377 100644 --- a/src/interpolation/gpu.jl +++ b/src/interpolation/gpu.jl @@ -1,6 +1,6 @@ using StaticArrays: MVector -# TODO: use this also in spreading, and move to separate file +# Note: this is also used in spreading @inline function get_inds_vals_gpu(gs::NTuple{D}, points::NTuple{D}, Ns::NTuple{D}, j::Integer) where {D} ntuple(Val(D)) do n @inline @@ -8,6 +8,7 @@ using StaticArrays: MVector end end +# Note: this is also used in spreading @inline function get_inds_vals_gpu(g::AbstractKernelData, points::AbstractVector, N::Integer, j::Integer) x = @inbounds points[j] gdata = Kernels.evaluate_kernel(g, x) diff --git a/src/spreading/gpu.jl b/src/spreading/gpu.jl index 838aa79..3df8aad 100644 --- a/src/spreading/gpu.jl +++ b/src/spreading/gpu.jl @@ -23,19 +23,7 @@ @assert eltype(first(us)) <: Real # output is a real array (but may actually describe complex data) Ns = spread_actual_dims(Z, Ns_real) # divides the Ns_real[1] by 2 if Z <: Complex - indvals = ntuple(Val(D)) do n - @inline - @inbounds begin - g = gs[n] - x = points[n][j] - gdata = Kernels.evaluate_kernel(g, x) - vals = gdata.values - M = Kernels.half_support(gs[n]) - i₀ = gdata.i - M # active region is (i₀ + 1):(i₀ + 2M) (up to periodic wrapping) - i₀ = ifelse(i₀ < 0, i₀ + Ns[n], i₀) # make sure i₀ ≥ 0 - i₀ => vals - end - end + indvals = get_inds_vals_gpu(gs, points, Ns, j) v⃗ = map(v -> @inbounds(v[j]), vp) spread_onto_arrays_gpu!(us, indvals, v⃗, Ns) From 2a34d0b74a3e3be35560b5afc2cb6479dbf4b61a Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Fri, 18 Oct 2024 15:18:23 +0200 Subject: [PATCH 09/33] Spreading with shmem works but is slow Atomic add on shared memory is very slow. Is it Atomix's fault? 
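A possible way to answer that question is a micro-benchmark that runs the same
shared-memory accumulation with and without Atomix.@atomic and times the two variants
separately (e.g. with CUDA.@elapsed). The sketch below is purely illustrative and not part
of this patch; the kernel name, sizes and the choice of CUDA.jl are all assumptions:

    using CUDA, KernelAbstractions, Atomix

    @kernel function accum_shmem_kernel!(out, ::Val{use_atomics}) where {use_atomics}
        T = eltype(out)
        u = @localmem(T, 64)               # small shared-memory buffer per workgroup
        it = @index(Local, Linear)
        ig = @index(Group, Linear)
        u[it] = zero(T)
        @synchronize
        for k in 1:100
            j = mod1(it + k, 64)
            if use_atomics
                Atomix.@atomic u[j] += one(T)   # atomic add on shared memory
            else
                u[j] += one(T)                  # plain (racy) add, for timing comparison only
            end
        end
        @synchronize
        if it == 1
            @inbounds out[ig] = u[1]
        end
    end

    backend = CUDABackend()
    out = KernelAbstractions.zeros(backend, Float32, 1024)
    kernel! = accum_shmem_kernel!(backend, 64)      # 64 threads per workgroup
    kernel!(out, Val(true); ndrange = 64 * 1024)    # with atomics
    kernel!(out, Val(false); ndrange = 64 * 1024)   # without atomics
    KernelAbstractions.synchronize(backend)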
--- src/interpolation/gpu.jl | 40 +++---- src/spreading/gpu.jl | 229 ++++++++++++++++++++++++++++++++++++--- 2 files changed, 236 insertions(+), 33 deletions(-) diff --git a/src/interpolation/gpu.jl b/src/interpolation/gpu.jl index 2589377..41c86cd 100644 --- a/src/interpolation/gpu.jl +++ b/src/interpolation/gpu.jl @@ -26,7 +26,7 @@ end @Const(points::NTuple{D}), @Const(us::NTuple{C}), @Const(pointperm), - @Const(Δxs::NTuple{D}), # grid step in each direction (oversampled grid) + @Const(prefactor::Real), # = volume of a grid cell = prod(Δxs) ) where {C, D} i = @index(Global, Linear) @@ -43,7 +43,7 @@ end indvals = get_inds_vals_gpu(gs, points, Ns, j) - v⃗ = interpolate_from_arrays_gpu(us, indvals, Ns, Δxs) + v⃗ = interpolate_from_arrays_gpu(us, indvals, Ns, prefactor) for n ∈ eachindex(vp, v⃗) @inbounds vp[n][j] = v⃗[n] @@ -59,7 +59,7 @@ end @Const(us::NTuple{C}), @Const(pointperm), @Const(cumulative_npoints_per_block::AbstractVector), - @Const(Δxs::NTuple{D}), # grid step in each direction (oversampled grid) + @Const(prefactor::Real), # = volume of a grid cell = prod(Δxs) ::Val{block_dims}, ) where {C, D, block_dims} groupsize = @groupsize()::Dims{D} @@ -77,7 +77,6 @@ end # This block will take care of non-uniform points (a + 1):b @inbounds a = cumulative_npoints_per_block[block_n] @inbounds b = cumulative_npoints_per_block[block_n + 1] - prefactor = prod(Δxs) # interpolations need to be multiplied by the volume of a grid cell # Shift from indices in global array to indices in local array idx_shift = @. (block_index - 1) * block_dims + 1 @@ -105,8 +104,8 @@ end local vals = gdata.values # kernel values local M = Kernels.half_support(gs[n]) local i₀ = gdata.i - idx_shift[n] - @assert i₀ ≥ 0 - @assert i₀ + 2M ≤ block_dims_padded[n] + # @assert i₀ ≥ 0 + # @assert i₀ + 2M ≤ block_dims_padded[n] i₀ => vals end @@ -123,10 +122,7 @@ end @inline function gridvalues_to_local_memory!( u_local::AbstractArray{T, D}, u_global::AbstractArray{T, D}, - ::Val{M}, - block_index::Dims{D}, block_dims::Dims{D}, - threadidxs::Dims{D}, - groupsize::Dims{D}, + ::Val{M}, block_index::Dims{D}, block_dims::Dims{D}, threadidxs::Dims{D}, groupsize::Dims{D}, ) where {T, D, M} if @generated quote @@ -233,6 +229,7 @@ function interpolate!( xs_comp = StructArrays.components(x⃗s) Δxs = map(Kernels.gridstep, gs) + prefactor = prod(Δxs) # interpolations need to be multiplied by the volume of a grid cell pointperm = get_pointperm(bd) # nothing in case of NullBlockData sort_points = get_sort_points(bd)::StaticBool # False in the case of NullBlockData @@ -249,15 +246,17 @@ function interpolate!( pointperm_ = pointperm end + # We use dynamically sized kernels to avoid recompilation, since number of points may + # change from one call to another. + ndrange_points = size(x⃗s) # iterate through points + groupsize_points = default_workgroupsize(backend, ndrange_points) + method = gpu_method(bd) if method === :global_memory - # We use dynamically sized kernels to avoid recompilation, since number of points may - # change from one call to another. - let ndrange = size(x⃗s) # iterate through points - groupsize = default_workgroupsize(backend, ndrange) + let ndrange = ndrange_points, groupsize = groupsize_points kernel! 
= interpolate_to_point_naive_kernel!(backend, groupsize) - kernel!(vp_sorted, gs, xs_comp, us, pointperm_, Δxs; ndrange) + kernel!(vp_sorted, gs, xs_comp, us, pointperm_, prefactor; ndrange) end elseif method === :shared_memory @assert bd isa BlockDataGPU @@ -273,12 +272,12 @@ function interpolate!( # @show ngroups block_dims block_dims_padded ndrange = groupsize .* ngroups kernel! = interpolate_to_points_shmem_kernel!(backend, groupsize, ndrange) - kernel!(vp_sorted, gs, xs_comp, us, pointperm_, bd.cumulative_npoints_per_block, Δxs, block_dims) + kernel!(vp_sorted, gs, xs_comp, us, pointperm_, bd.cumulative_npoints_per_block, prefactor, block_dims) end end if sort_points === True() - let groupsize = default_workgroupsize(backend, ndrange) + let ndrange = ndrange_points, groupsize = groupsize_points kernel_perm! = interp_permute_kernel!(backend, groupsize) kernel_perm!(vp_all, vp_sorted, pointperm; ndrange) foreach(KA.unsafe_free!, vp_sorted) # manually deallocate temporary arrays @@ -293,6 +292,7 @@ end # The distribution of threads across directions mainly (only?) affects the copies between # shared and global memories, so it can have an important influence of performance due to # memory accesses. +# NOTE: this is also used in spreading function groupsize_shmem(ngroups::NTuple{D}, shmem_size::NTuple{D}, Np) where {D} # (1) Determine the total number of threads. groupsize = 64 # minimum group size should be equal to the warp size (usually 32 on CUDA and 64 on AMDGPU) @@ -326,7 +326,7 @@ end us::NTuple{C, AbstractArray{T, D}}, indvals::NTuple{D, <:Pair}, Ns::Dims{D}, - Δxs::NTuple{D, Tr}, + prefactor::Tr, ) where {T, Tr, C, D} if @generated gprod_init = Symbol(:gprod_, D + 1) # the name of this variable is important! @@ -336,7 +336,7 @@ end vals = map(last, indvals) # evaluated kernel values in each direction inds = map(eachindex, vals) # = (1:L, 1:L, ...) where L = 2M is the kernel width vs = zero(MVector{$C, $T}) # interpolated value (output) - $gprod_init = prod(Δxs) # product of kernel values (initially Δx[1] * Δx[2] * ...) + $gprod_init = prefactor # product of kernel values (initially Δx[1] * Δx[2] * ...) @nloops( $D, i, d -> inds[d], # for i_d ∈ 1:L @@ -368,7 +368,7 @@ end vals_first, vals_tail = first(vals), Base.tail(vals) istart_first, istart_tail = first(inds_start), Base.tail(inds_start) N, Ns_tail = first(Ns), Base.tail(Ns) - gprod_base = prod(Δxs) # this factor is needed in interpolation only + gprod_base = prefactor # this factor is needed in interpolation only @inbounds for I_tail ∈ CartesianIndices(inds_tail) is_tail = Tuple(I_tail) gs_tail = map(inbounds_getindex, vals_tail, is_tail) diff --git a/src/spreading/gpu.jl b/src/spreading/gpu.jl index 3df8aad..ed5affb 100644 --- a/src/spreading/gpu.jl +++ b/src/spreading/gpu.jl @@ -34,6 +34,9 @@ end @inline spread_actual_dims(::Type{<:Real}, Ns) = Ns @inline spread_actual_dims(::Type{<:Complex}, Ns) = Base.setindex(Ns, Ns[1] >> 1, 1) # actual number of complex elements in first dimension +@inline spread_real_dims(::Type{<:Real}, Ns) = Ns +@inline spread_real_dims(::Type{<:Complex}, Ns) = Base.setindex(Ns, Ns[1] << 1, 1) + @inline function spread_onto_arrays_gpu!( us::NTuple{C, AbstractArray{T, D}}, indvals::NTuple{D, <:Pair}, @@ -58,7 +61,7 @@ end end, begin # gprod_1 contains the product vals[1][i_1] * vals[2][i_2] * ... 
- js = @ntuple $D j + js = @ntuple($D, j) @inbounds for n ∈ 1:$C w = vs[n] * gprod_1 _atomic_add!(us[n], w, js) @@ -125,7 +128,7 @@ function spread_from_points!( backend::GPU, bd::Union{BlockDataGPU, NullBlockData}, gs, - us_all::NTuple{C, AbstractGPUArray}, + us::NTuple{C, AbstractGPUArray}, x⃗s::StructVector, vp_all::NTuple{C, AbstractGPUVector}, ) where {C} @@ -135,16 +138,16 @@ function spread_from_points!( xs_comp = StructArrays.components(x⃗s) - # Reinterpret `us_all` as real arrays, in case they are complex. + # Reinterpret `us` as real arrays, in case they are complex. # This is to avoid current issues with atomic operations on complex data # (https://github.com/JuliaGPU/KernelAbstractions.jl/issues/497). - Z = eltype(us_all[1]) + Z = eltype(us[1]) T = real(Z) us_real = if Z <: Real - us_all + us else # complex case @assert Z <: Complex - map(u -> reinterpret(T, u), us_all) # note: we don't use reshape, so the first dimension has 2x elements + map(u -> reinterpret(T, u), us) # note: we don't use reshape, so the first dimension has 2x elements end pointperm = get_pointperm(bd) # nothing in case of NullBlockData @@ -156,27 +159,50 @@ function spread_from_points!( # We use dynamically sized kernels to avoid recompilation, since number of points may # change from one call to another. - ndrange = size(x⃗s) # iterate through points - workgroupsize = default_workgroupsize(backend, ndrange) + ndrange_points = size(x⃗s) # iterate through points + groupsize_points = default_workgroupsize(backend, ndrange_points) if sort_points === True() vp_sorted = map(similar, vp_all) # allocate temporary arrays for sorted non-uniform data - kernel_perm! = spread_permute_kernel!(backend, workgroupsize) - kernel_perm!(vp_sorted, vp_all, pointperm; ndrange) + let ndrange = ndrange_points, groupsize = groupsize_points + kernel_perm! = spread_permute_kernel!(backend, groupsize) + kernel_perm!(vp_sorted, vp_all, pointperm; ndrange) + end pointperm_ = nothing # we don't need any further permutations (all accesses to non-uniform data will be contiguous) else vp_sorted = vp_all pointperm_ = pointperm end - kernel! = spread_from_point_naive_kernel!(backend, workgroupsize) - kernel!(us_real, gs, xs_comp, vp_sorted, pointperm_; ndrange) + method = gpu_method(bd) + + if method === :global_memory + let ndrange = ndrange_points, groupsize = groupsize_points + kernel! = spread_from_point_naive_kernel!(backend, groupsize) + kernel!(us_real, gs, xs_comp, vp_sorted, pointperm_; ndrange) + end + elseif method === :shared_memory + @assert bd isa BlockDataGPU + Z = eltype(us[1]) + M = Kernels.half_support(gs[1]) + @assert all(g -> Kernels.half_support(g) === M, gs) # check that they're all equal + block_dims_val = block_dims_gpu_shmem(Z, size(us[1]), HalfSupport(M)) # this is usually a compile-time constant... + block_dims = Val(block_dims_val) # ...which means this doesn't require a dynamic dispatch + @assert block_dims_val === bd.block_dims + let ngroups = bd.nblocks_per_dir # this is the required number of workgroups (number of blocks in CUDA) + block_dims_padded = @. block_dims_val + 2M - 1 # dimensions of shared memory array + groupsize = groupsize_shmem(ngroups, block_dims_padded, length(x⃗s)) + ndrange = groupsize .* ngroups + kernel! 
= spread_from_points_shmem_kernel!(backend, groupsize, ndrange) + kernel!(us_real, gs, xs_comp, vp_sorted, pointperm_, bd.cumulative_npoints_per_block, block_dims) + end + end if sort_points === True() foreach(KA.unsafe_free!, vp_sorted) # manually deallocate temporary arrays end - us_all + us end @kernel function spread_permute_kernel!(vp::NTuple{N}, @Const(vp_in::NTuple{N}), @Const(perm::AbstractVector)) where {N} @@ -187,3 +213,180 @@ end end nothing end + +## ==================================================================================================== ## +## Shared-memory implementation + +@kernel function spread_from_points_shmem_kernel!( + us::NTuple{C}, + @Const(gs::NTuple{D}), + @Const(points::NTuple{D}), + @Const(vp::NTuple{C}), + @Const(pointperm), + @Const(cumulative_npoints_per_block::AbstractVector), + ::Val{block_dims}, + ) where {C, D, block_dims} + groupsize = @groupsize()::Dims{D} + nthreads = prod(groupsize) + threadidxs = @index(Local, NTuple) # in (1:nthreads_x, 1:nthreads_y, ...) + threadidx = @index(Local, Linear) # in 1:nthreads + block_index = @index(Group, NTuple) # workgroup index (= block index) + block_n = @index(Group, Linear) # linear index of block + + # Determine grid dimensions. + # Note that, to avoid problems with @atomic, we currently work with real-data arrays + # even when the output is complex. So, if `Z <: Complex`, the first dimension has twice + # the actual dataset dimensions. + Z = eltype(vp[1]) + T = eltype(us[1]) # may be Z or real(Z) + @assert T === real(Z) # output is a real array (but may actually describe complex data) + Ns_real = size(first(us)) # dimensions of raw input data + Ns = spread_actual_dims(Z, Ns_real) # divides the Ns_real[1] by 2 if Z <: Complex + + M = Kernels.half_support(gs[1]) # assume they're all equal + block_dims_padded = @. block_dims + 2M - 1 + Ns_local = spread_real_dims(Z, block_dims_padded) # multiplies first dimension by 2 if Z <: Complex (note that T <: Real) + u_local = @localmem(T, Ns_local) # allocate shared memory + + # This block will take care of non-uniform points (a + 1):b + @inbounds a = cumulative_npoints_per_block[block_n] + @inbounds b = cumulative_npoints_per_block[block_n + 1] + + # Shift from indices in global array to indices in local array + idx_shift = @. (block_index - 1) * block_dims + 1 + + # Interpolate components one by one (to avoid using too much memory) + for c ∈ 1:C + # Reset shared memory to zero + for i ∈ threadidx:nthreads:length(u_local) + @inbounds u_local[i] = 0 + end + + @synchronize # make sure shared memory is fully set to zero + + for i in (a + threadidx):nthreads:b + # Spread from point j + j = if pointperm === nothing + i + else + @inbounds pointperm[i] + end + + # TODO: can we do this just once for all components? (if C > 1) + indvals = ntuple(Val(D)) do n + @inline + x = @inbounds points[n][j] + gdata = Kernels.evaluate_kernel(gs[n], x) + local vals = gdata.values # kernel values + local M = Kernels.half_support(gs[n]) + local i₀ = gdata.i - idx_shift[n] + # @assert i₀ ≥ 0 + # @assert i₀ + 2M ≤ block_dims_padded[n] + i₀ => vals + end + + @inbounds v = vp[c][j] + spread_onto_array_shmem!(u_local, indvals, v) + end + + @synchronize # make sure we have finished writing to shared memory + + add_from_local_to_global_memory!(Z, us[c], u_local, Ns, Val(M), block_index, block_dims, threadidxs, groupsize) + + if c < C + @synchronize # wait before jumping to next component + end + end + + nothing +end + +# Spread a single "component" (one transform at a time). 
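Note that the non-`@generated` fallback of `spread_onto_array_shmem!`, defined just below, is left unimplemented at this point of the series (the `aaa # TODO` branch). A possible dimension-agnostic sketch, mirroring the `CartesianIndices` fallback used for interpolation elsewhere in the series and reusing the function's own arguments, could look as follows (illustrative only, not part of the patch):

    inds_start = map(first, indvals)  # start of active region in the local array
    vals = map(last, indvals)         # evaluated kernel values in each direction
    @inbounds for I ∈ CartesianIndices(map(eachindex, vals))
        gprod = prod(ntuple(d -> @inbounds(vals[d][I[d]]), Val(D)))  # product of 1D kernel values
        js = inds_start .+ Tuple(I)
        w = v * gprod
        _atomic_add!(u_local, w, js)  # same atomic helper as in the @generated branch
    end
    v
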
+@inline function spread_onto_array_shmem!( + u_local::AbstractArray{T, D}, + indvals::NTuple{D}, + v::Z, + ) where {T, D, Z} + if @generated + gprod_init = Symbol(:gprod_, D + 1) # the name of this variable is important! + Tr = real(T) + quote + inds_start = map(first, indvals) # start of active region in output array + vals = map(last, indvals) # evaluated kernel values in each direction + inds = map(eachindex, vals) # = (1:L, 1:L, ...) where L = 2M is the kernel width + $gprod_init = one($Tr) # product of kernel values (initially 1) + @nloops( + $D, i, + d -> inds[d], # for i_d ∈ 1:L + d -> begin + @inbounds gprod_d = gprod_{d + 1} * vals[d][i_d] + @inbounds j_d = inds_start[d] + i_d + end, + begin + js = @ntuple($D, j) + w = v * gprod_1 + _atomic_add!(u_local, w, js) # this is slow!! (atomics in shared memory) -- is it Atomix's fault? + end, + ) + v + end + else + aaa # TODO: implement + end +end + +# Add values from shared to global memory. +@inline function add_from_local_to_global_memory!( + ::Type{Z}, # "logical" type of data (e.g. ComplexF64) + u_global::AbstractArray{T, D}, # actual data is always real + u_local::AbstractArray{T, D}, + Ns::Dims{D}, + ::Val{M}, block_index::Dims{D}, block_dims::Dims{D}, threadidxs::Dims{D}, groupsize::Dims{D}, + ) where {Z, T, D, M} + if @generated + @assert T <: Real # for atomic operations + skip = sizeof(Z) ÷ sizeof(T) # 1 (Z real) or 2 (Z complex) + quote + # @assert skip ∈ (1, 2) + skips = @ntuple($D, n -> n == 1 ? $skip : 1) + inds = @ntuple($D, n -> axes(u_local, n)[1:(end ÷ skips[n])][threadidxs[n]:groupsize[n]:end]) + offsets = @ntuple( + $D, + n -> let + off = (block_index[n] - 1) * block_dims[n] - ($M - 1) + ifelse(off < 0, off + Ns[n], off) # make sure the offset is non-negative (to avoid some wrapping below) + end + ) + @nloops( + $D, i, + d -> inds[d], + d -> begin + j_d = offsets[d] + i_d + j_d = ifelse(j_d > Ns[d], j_d - Ns[d], j_d) + end, + begin + is = @ntuple($D, i) + js = @ntuple($D, j) + @inbounds if $skip === 1 # real data (Z <: Real) + w = u_local[is...] + Atomix.@atomic u_global[js...] += w + elseif $skip === 2 # complex data (Z <: Complex) + is_tail = @ntuple($(D - 1), d -> i_{d + 1}) + js_tail = @ntuple($(D - 1), d -> j_{d + 1}) + ifirst = 2 * i_1 - 1 + jfirst = 2 * j_1 - 1 + w = u_local[ifirst, is_tail...] # real part + Atomix.@atomic u_global[jfirst, js_tail...] += w + w = u_local[ifirst + 1, is_tail...] # imaginary part + Atomix.@atomic u_global[jfirst + 1, js_tail...] 
+= w + end + end, + ) + nothing + end + else + aaa # TODO: implement + end +end + +## ==================================================================================================== ## From 87f050cd8e31bbf4cc2f973eac9f6e18e1f15cfc Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Fri, 18 Oct 2024 22:00:51 +0200 Subject: [PATCH 10/33] SM kernels now work with KA CPU backends --- src/interpolation/gpu.jl | 45 +++++++++++++------------ src/spreading/gpu.jl | 71 +++++++++++++++++++++------------------- 2 files changed, 62 insertions(+), 54 deletions(-) diff --git a/src/interpolation/gpu.jl b/src/interpolation/gpu.jl index 41c86cd..cdedd67 100644 --- a/src/interpolation/gpu.jl +++ b/src/interpolation/gpu.jl @@ -53,33 +53,28 @@ end end @kernel function interpolate_to_points_shmem_kernel!( - vp::NTuple{C}, - @Const(gs::NTuple{D}), + vp::NTuple{C, AbstractVector{Z}}, + @Const(gs::NTuple{D, AbstractKernelData{<:Any, M}}), @Const(points::NTuple{D}), - @Const(us::NTuple{C}), + @Const(us::NTuple{C, AbstractArray{Z}}), @Const(pointperm), @Const(cumulative_npoints_per_block::AbstractVector), @Const(prefactor::Real), # = volume of a grid cell = prod(Δxs) ::Val{block_dims}, - ) where {C, D, block_dims} - groupsize = @groupsize()::Dims{D} - nthreads = prod(groupsize) + ::Val{shmem_size}, # this is a bit redundant, but seems to be required for CPU backends (used in tests) + ) where {C, D, Z <: Number, M, block_dims, shmem_size} + + @uniform begin + groupsize = @groupsize()::Dims{D} + nthreads = prod(groupsize) + end + threadidxs = @index(Local, NTuple) # in (1:nthreads_x, 1:nthreads_y, ...) threadidx = @index(Local, Linear) # in 1:nthreads block_index = @index(Group, NTuple) # workgroup index (= block index) block_n = @index(Group, Linear) # linear index of block - T = eltype(us[1]) - M = Kernels.half_support(gs[1]) # assume they're all equal - block_dims_padded = @. block_dims + 2M - 1 - u_local = @localmem(T, block_dims_padded) # allocate shared memory - - # This block will take care of non-uniform points (a + 1):b - @inbounds a = cumulative_npoints_per_block[block_n] - @inbounds b = cumulative_npoints_per_block[block_n + 1] - - # Shift from indices in global array to indices in local array - idx_shift = @. (block_index - 1) * block_dims + 1 + u_local = @localmem(Z, shmem_size) # allocate shared memory # Interpolate components one by one (to avoid using too much memory) for c ∈ 1:C @@ -88,6 +83,10 @@ end @synchronize # make sure all threads have the same shared data + # This block will take care of non-uniform points (a + 1):b + @inbounds a = cumulative_npoints_per_block[block_n] + @inbounds b = cumulative_npoints_per_block[block_n + 1] + for i in (a + threadidx):nthreads:b # Interpolate at point j j = if pointperm === nothing @@ -102,8 +101,8 @@ end x = @inbounds points[n][j] gdata = Kernels.evaluate_kernel(gs[n], x) local vals = gdata.values # kernel values - local M = Kernels.half_support(gs[n]) - local i₀ = gdata.i - idx_shift[n] + local ishift = (block_index[n] - 1) * block_dims[n] + 1 + local i₀ = gdata.i - ishift # @assert i₀ ≥ 0 # @assert i₀ + 2M ≤ block_dims_padded[n] i₀ => vals @@ -268,11 +267,15 @@ function interpolate!( @assert block_dims_val === bd.block_dims let ngroups = bd.nblocks_per_dir # this is the required number of workgroups (number of blocks in CUDA) block_dims_padded = @. 
block_dims_val + 2M - 1 # dimensions of shared memory array + shmem_size = block_dims_padded groupsize = groupsize_shmem(ngroups, block_dims_padded, length(x⃗s)) - # @show ngroups block_dims block_dims_padded ndrange = groupsize .* ngroups kernel! = interpolate_to_points_shmem_kernel!(backend, groupsize, ndrange) - kernel!(vp_sorted, gs, xs_comp, us, pointperm_, bd.cumulative_npoints_per_block, prefactor, block_dims) + kernel!( + vp_sorted, gs, xs_comp, us, pointperm_, bd.cumulative_npoints_per_block, + prefactor, block_dims, + Val(shmem_size), + ) end end diff --git a/src/spreading/gpu.jl b/src/spreading/gpu.jl index ed5affb..88c798e 100644 --- a/src/spreading/gpu.jl +++ b/src/spreading/gpu.jl @@ -191,10 +191,15 @@ function spread_from_points!( @assert block_dims_val === bd.block_dims let ngroups = bd.nblocks_per_dir # this is the required number of workgroups (number of blocks in CUDA) block_dims_padded = @. block_dims_val + 2M - 1 # dimensions of shared memory array + shmem_size = spread_real_dims(Z, block_dims_padded) groupsize = groupsize_shmem(ngroups, block_dims_padded, length(x⃗s)) ndrange = groupsize .* ngroups kernel! = spread_from_points_shmem_kernel!(backend, groupsize, ndrange) - kernel!(us_real, gs, xs_comp, vp_sorted, pointperm_, bd.cumulative_npoints_per_block, block_dims) + kernel!( + us_real, gs, xs_comp, vp_sorted, pointperm_, + bd.cumulative_npoints_per_block, block_dims, + Val(shmem_size), + ) end end @@ -218,42 +223,35 @@ end ## Shared-memory implementation @kernel function spread_from_points_shmem_kernel!( - us::NTuple{C}, - @Const(gs::NTuple{D}), + us::NTuple{C, AbstractArray{T}}, + @Const(gs::NTuple{D, AbstractKernelData{<:Any, M}}), @Const(points::NTuple{D}), - @Const(vp::NTuple{C}), + @Const(vp::NTuple{C, AbstractVector{Z}}), @Const(pointperm), @Const(cumulative_npoints_per_block::AbstractVector), ::Val{block_dims}, - ) where {C, D, block_dims} - groupsize = @groupsize()::Dims{D} - nthreads = prod(groupsize) + ::Val{shmem_size}, # this is a bit redundant, but seems to be required for CPU backends (used in tests) + ) where {C, D, T <: AbstractFloat, Z <: Number, M, block_dims, shmem_size} + + @uniform begin + groupsize = @groupsize()::Dims{D} + nthreads = prod(groupsize) + + # Determine grid dimensions. + # Note that, to avoid problems with @atomic, we currently work with real-data arrays + # even when the output is complex. So, if `Z <: Complex`, the first dimension has twice + # the actual dataset dimensions. + @assert T === real(Z) # output is a real array (but may actually describe complex data) + Ns_real = size(first(us)) # dimensions of raw input data + Ns = spread_actual_dims(Z, Ns_real) # divides the Ns_real[1] by 2 if Z <: Complex + end + + block_n = @index(Group, Linear) # linear index of block + block_index = @index(Group, NTuple) # workgroup index (= block index) threadidxs = @index(Local, NTuple) # in (1:nthreads_x, 1:nthreads_y, ...) threadidx = @index(Local, Linear) # in 1:nthreads - block_index = @index(Group, NTuple) # workgroup index (= block index) - block_n = @index(Group, Linear) # linear index of block - # Determine grid dimensions. - # Note that, to avoid problems with @atomic, we currently work with real-data arrays - # even when the output is complex. So, if `Z <: Complex`, the first dimension has twice - # the actual dataset dimensions. 
- Z = eltype(vp[1]) - T = eltype(us[1]) # may be Z or real(Z) - @assert T === real(Z) # output is a real array (but may actually describe complex data) - Ns_real = size(first(us)) # dimensions of raw input data - Ns = spread_actual_dims(Z, Ns_real) # divides the Ns_real[1] by 2 if Z <: Complex - - M = Kernels.half_support(gs[1]) # assume they're all equal - block_dims_padded = @. block_dims + 2M - 1 - Ns_local = spread_real_dims(Z, block_dims_padded) # multiplies first dimension by 2 if Z <: Complex (note that T <: Real) - u_local = @localmem(T, Ns_local) # allocate shared memory - - # This block will take care of non-uniform points (a + 1):b - @inbounds a = cumulative_npoints_per_block[block_n] - @inbounds b = cumulative_npoints_per_block[block_n + 1] - - # Shift from indices in global array to indices in local array - idx_shift = @. (block_index - 1) * block_dims + 1 + u_local = @localmem(T, shmem_size) # allocate shared memory # Interpolate components one by one (to avoid using too much memory) for c ∈ 1:C @@ -264,6 +262,10 @@ end @synchronize # make sure shared memory is fully set to zero + # This block will take care of non-uniform points (a + 1):b + @inbounds a = cumulative_npoints_per_block[block_n] + @inbounds b = cumulative_npoints_per_block[block_n + 1] + for i in (a + threadidx):nthreads:b # Spread from point j j = if pointperm === nothing @@ -278,8 +280,8 @@ end x = @inbounds points[n][j] gdata = Kernels.evaluate_kernel(gs[n], x) local vals = gdata.values # kernel values - local M = Kernels.half_support(gs[n]) - local i₀ = gdata.i - idx_shift[n] + local ishift = (block_index[n] - 1) * block_dims[n] + 1 + local i₀ = gdata.i - ishift # @assert i₀ ≥ 0 # @assert i₀ + 2M ≤ block_dims_padded[n] i₀ => vals @@ -291,7 +293,10 @@ end @synchronize # make sure we have finished writing to shared memory - add_from_local_to_global_memory!(Z, us[c], u_local, Ns, Val(M), block_index, block_dims, threadidxs, groupsize) + add_from_local_to_global_memory!( + Z, us[c], u_local, Ns, Val(M), block_index, + block_dims, threadidxs, groupsize, + ) if c < C @synchronize # wait before jumping to next component From 46acbda99b64023f69303c05d9fc72268190a108 Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Fri, 18 Oct 2024 22:07:20 +0200 Subject: [PATCH 11/33] Test SM kernels on CPU --- test/pseudo_gpu.jl | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/test/pseudo_gpu.jl b/test/pseudo_gpu.jl index e68162b..6244856 100644 --- a/test/pseudo_gpu.jl +++ b/test/pseudo_gpu.jl @@ -6,6 +6,9 @@ # JLArray (since it's supposed to mimic GPU arrays). Even with allowscalar(true), kernels # seem to fail for other reasons. 
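As a side note on the pseudo-GPU approach described above: JLArrays provides a CPU-backed array type that registers as an `AbstractGPUArray`, so the GPU code paths of this package can be exercised without a physical device. A minimal sketch, assuming the JLArrays package and its exported `JLArray` type:

    using JLArrays: JLArray
    using GPUArraysCore: AbstractGPUArray

    x = JLArray(rand(Float32, 8))   # CPU-backed, but dispatches like a GPU array
    @assert x isa AbstractGPUArray  # hence the AbstractGPUArray code paths apply
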
+# TODO: +# - use new JLBackend in the latest GPUArrays + using NonuniformFFTs using StaticArrays: SVector # for convenience using KernelAbstractions: KernelAbstractions as KA @@ -116,4 +119,8 @@ end @testset "No blocking" begin # spatial sorting disabled compare_with_cpu(ComplexF64, dims; block_size = nothing) end + @testset "gpu_method = :shared_memory" begin + @testset "Float32" compare_with_cpu(Float32, dims; gpu_method = :shared_memory) + @testset "ComplexF32" compare_with_cpu(ComplexF32, dims; gpu_method = :shared_memory) + end end From 3fd04b955688a1044769b51ea4c63ad1bd146450 Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Sat, 19 Oct 2024 07:27:46 +0200 Subject: [PATCH 12/33] Fix kernel compilation on CUDA --- src/interpolation/gpu.jl | 5 +++-- src/spreading/gpu.jl | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/interpolation/gpu.jl b/src/interpolation/gpu.jl index cdedd67..9ebbe42 100644 --- a/src/interpolation/gpu.jl +++ b/src/interpolation/gpu.jl @@ -54,7 +54,7 @@ end @kernel function interpolate_to_points_shmem_kernel!( vp::NTuple{C, AbstractVector{Z}}, - @Const(gs::NTuple{D, AbstractKernelData{<:Any, M}}), + @Const(gs::NTuple{D}), @Const(points::NTuple{D}), @Const(us::NTuple{C, AbstractArray{Z}}), @Const(pointperm), @@ -62,7 +62,7 @@ end @Const(prefactor::Real), # = volume of a grid cell = prod(Δxs) ::Val{block_dims}, ::Val{shmem_size}, # this is a bit redundant, but seems to be required for CPU backends (used in tests) - ) where {C, D, Z <: Number, M, block_dims, shmem_size} + ) where {C, D, Z <: Number, block_dims, shmem_size} @uniform begin groupsize = @groupsize()::Dims{D} @@ -79,6 +79,7 @@ end # Interpolate components one by one (to avoid using too much memory) for c ∈ 1:C # Copy grid data from global to shared memory + M = Kernels.half_support(gs[1]) gridvalues_to_local_memory!(u_local, us[c], Val(M), block_index, block_dims, threadidxs, groupsize) @synchronize # make sure all threads have the same shared data diff --git a/src/spreading/gpu.jl b/src/spreading/gpu.jl index 88c798e..be8d111 100644 --- a/src/spreading/gpu.jl +++ b/src/spreading/gpu.jl @@ -224,14 +224,14 @@ end @kernel function spread_from_points_shmem_kernel!( us::NTuple{C, AbstractArray{T}}, - @Const(gs::NTuple{D, AbstractKernelData{<:Any, M}}), + @Const(gs::NTuple{D}), @Const(points::NTuple{D}), @Const(vp::NTuple{C, AbstractVector{Z}}), @Const(pointperm), @Const(cumulative_npoints_per_block::AbstractVector), ::Val{block_dims}, ::Val{shmem_size}, # this is a bit redundant, but seems to be required for CPU backends (used in tests) - ) where {C, D, T <: AbstractFloat, Z <: Number, M, block_dims, shmem_size} + ) where {C, D, T, Z, block_dims, shmem_size} @uniform begin groupsize = @groupsize()::Dims{D} @@ -293,6 +293,7 @@ end @synchronize # make sure we have finished writing to shared memory + M = Kernels.half_support(gs[1]) add_from_local_to_global_memory!( Z, us[c], u_local, Ns, Val(M), block_index, block_dims, threadidxs, groupsize, From 6b0fad69a16731f49a47b475da78f88199fa3f82 Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Sun, 20 Oct 2024 21:28:25 +0200 Subject: [PATCH 13/33] Reorganise some code --- src/interpolation/gpu.jl | 329 ++++++++++++++++++++------------------- src/spreading/gpu.jl | 2 +- 2 files changed, 167 insertions(+), 164 deletions(-) diff --git a/src/interpolation/gpu.jl b/src/interpolation/gpu.jl index 9ebbe42..cbb48f8 100644 --- a/src/interpolation/gpu.jl +++ b/src/interpolation/gpu.jl @@ -52,169 +52,6 @@ end nothing end -@kernel function 
interpolate_to_points_shmem_kernel!( - vp::NTuple{C, AbstractVector{Z}}, - @Const(gs::NTuple{D}), - @Const(points::NTuple{D}), - @Const(us::NTuple{C, AbstractArray{Z}}), - @Const(pointperm), - @Const(cumulative_npoints_per_block::AbstractVector), - @Const(prefactor::Real), # = volume of a grid cell = prod(Δxs) - ::Val{block_dims}, - ::Val{shmem_size}, # this is a bit redundant, but seems to be required for CPU backends (used in tests) - ) where {C, D, Z <: Number, block_dims, shmem_size} - - @uniform begin - groupsize = @groupsize()::Dims{D} - nthreads = prod(groupsize) - end - - threadidxs = @index(Local, NTuple) # in (1:nthreads_x, 1:nthreads_y, ...) - threadidx = @index(Local, Linear) # in 1:nthreads - block_index = @index(Group, NTuple) # workgroup index (= block index) - block_n = @index(Group, Linear) # linear index of block - - u_local = @localmem(Z, shmem_size) # allocate shared memory - - # Interpolate components one by one (to avoid using too much memory) - for c ∈ 1:C - # Copy grid data from global to shared memory - M = Kernels.half_support(gs[1]) - gridvalues_to_local_memory!(u_local, us[c], Val(M), block_index, block_dims, threadidxs, groupsize) - - @synchronize # make sure all threads have the same shared data - - # This block will take care of non-uniform points (a + 1):b - @inbounds a = cumulative_npoints_per_block[block_n] - @inbounds b = cumulative_npoints_per_block[block_n + 1] - - for i in (a + threadidx):nthreads:b - # Interpolate at point j - j = if pointperm === nothing - i - else - @inbounds pointperm[i] - end - - # TODO: can we do this just once for all components? (if C > 1) - indvals = ntuple(Val(D)) do n - @inline - x = @inbounds points[n][j] - gdata = Kernels.evaluate_kernel(gs[n], x) - local vals = gdata.values # kernel values - local ishift = (block_index[n] - 1) * block_dims[n] + 1 - local i₀ = gdata.i - ishift - # @assert i₀ ≥ 0 - # @assert i₀ + 2M ≤ block_dims_padded[n] - i₀ => vals - end - - v = interpolate_from_arrays_shmem(u_local, indvals, prefactor) - @inbounds vp[c][j] = v - end - end - - nothing -end - -# Copy values from global to shared memory. -# - can we optimise memory access patterns? -@inline function gridvalues_to_local_memory!( - u_local::AbstractArray{T, D}, - u_global::AbstractArray{T, D}, - ::Val{M}, block_index::Dims{D}, block_dims::Dims{D}, threadidxs::Dims{D}, groupsize::Dims{D}, - ) where {T, D, M} - if @generated - quote - Ns = size(u_global) - inds = @ntuple($D, n -> axes(u_local, n)[threadidxs[n]:groupsize[n]:end]) - offsets = @ntuple( - $D, - n -> let - off = (block_index[n] - 1) * block_dims[n] - ($M - 1) - ifelse(off < 0, off + Ns[n], off) # make sure the offset is non-negative (to avoid some wrapping below) - end - ) - @nloops( - $D, i, - d -> inds[d], - d -> begin - j_d = offsets[d] + i_d - j_d = ifelse(j_d > Ns[d], j_d - Ns[d], j_d) - end, - begin - is = @ntuple($D, i) - js = @ntuple($D, j) - @inbounds u_local[is...] = u_global[js...] - end, - ) - nothing - end - else - Ns = size(u_global) - inds = ntuple(Val(D)) do n - axes(u_local, n)[threadidxs[n]:groupsize[n]:end] # this determines the parallelisation pattern - end - offsets = ntuple(Val(D)) do n - @inline - off = (block_index[n] - 1) * block_dims[n] - (M - 1) - ifelse(off < 0, off + Ns[n], off) # make sure the offset is non-negative (to avoid some wrapping below) - end - @inbounds for is ∈ Iterators.product(inds...) - js = ntuple(Val(D)) do n - @inline - j = offsets[n] + is[n] - ifelse(j > Ns[n], j - Ns[n], j) - end - u_local[is...] = u_global[js...] 
- end - nothing - end -end - -# Interpolate a single "component" (one transform at a time). -# Here vp is a vector instead of a tuple of vectors. -@inline function interpolate_from_arrays_shmem( - u_local::AbstractArray{T, D}, - indvals::NTuple{D}, - prefactor, - ) where {T, D} - if @generated - gprod_init = Symbol(:gprod_, D + 1) # the name of this variable is important! - quote - $gprod_init = prefactor - v = zero(T) - inds_start = map(first, indvals) - vals = map(last, indvals) # evaluated kernel values in each direction - inds = map(eachindex, vals) # = (1:L, 1:L, ...) where L = 2M is the kernel width - @nloops( - $D, i, - d -> inds[d], - d -> begin - @inbounds gprod_d = gprod_{d + 1} * vals[d][i_d] - @inbounds j_d = inds_start[d] + i_d - end, - begin - js = @ntuple($D, j) - @inbounds v += gprod_1 * u_local[js...] - end, - ) - v - end - else - v = zero(T) - inds_start = map(first, indvals) - vals = map(last, indvals) # evaluated kernel values in each direction - inds = map(eachindex, vals) # = (1:L, 1:L, ...) where L = 2M is the kernel width - @inbounds for I ∈ CartesianIndices(inds) - gprod = prefactor * prod(ntuple(d -> @inbounds(vals[d][I[d]]), Val(D))) - js = inds_start .+ Tuple(I) - v += gprod * u_local[js...] - end - v - end -end - function interpolate!( backend::GPU, bd::Union{BlockDataGPU, NullBlockData}, @@ -406,3 +243,169 @@ end end nothing end + +## ========================================================================================== ## +## Shared-memory implementation + +@kernel function interpolate_to_points_shmem_kernel!( + vp::NTuple{C, AbstractVector{Z}}, + @Const(gs::NTuple{D}), + @Const(points::NTuple{D}), + @Const(us::NTuple{C, AbstractArray{Z}}), + @Const(pointperm), + @Const(cumulative_npoints_per_block::AbstractVector), + @Const(prefactor::Real), # = volume of a grid cell = prod(Δxs) + ::Val{block_dims}, + ::Val{shmem_size}, # this is a bit redundant, but seems to be required for CPU backends (used in tests) + ) where {C, D, Z <: Number, block_dims, shmem_size} + + @uniform begin + groupsize = @groupsize()::Dims{D} + nthreads = prod(groupsize) + end + + threadidxs = @index(Local, NTuple) # in (1:nthreads_x, 1:nthreads_y, ...) + threadidx = @index(Local, Linear) # in 1:nthreads + block_index = @index(Group, NTuple) # workgroup index (= block index) + block_n = @index(Group, Linear) # linear index of block + + u_local = @localmem(Z, shmem_size) # allocate shared memory + + # Interpolate components one by one (to avoid using too much memory) + for c ∈ 1:C + # Copy grid data from global to shared memory + M = Kernels.half_support(gs[1]) + gridvalues_to_local_memory!(u_local, us[c], Val(M), block_index, block_dims, threadidxs, groupsize) + + @synchronize # make sure all threads have the same shared data + + # This block will take care of non-uniform points (a + 1):b + @inbounds a = cumulative_npoints_per_block[block_n] + @inbounds b = cumulative_npoints_per_block[block_n + 1] + + for i in (a + threadidx):nthreads:b + # Interpolate at point j + j = if pointperm === nothing + i + else + @inbounds pointperm[i] + end + + # TODO: can we do this just once for all components? 
(if C > 1) + indvals = ntuple(Val(D)) do n + @inline + x = @inbounds points[n][j] + gdata = Kernels.evaluate_kernel(gs[n], x) + local vals = gdata.values # kernel values + local ishift = (block_index[n] - 1) * block_dims[n] + 1 + local i₀ = gdata.i - ishift + # @assert i₀ ≥ 0 + # @assert i₀ + 2M ≤ block_dims_padded[n] + i₀ => vals + end + + v = interpolate_from_arrays_shmem(u_local, indvals, prefactor) + @inbounds vp[c][j] = v + end + end + + nothing +end + +# Copy values from global to shared memory. +# - can we optimise memory access patterns? +@inline function gridvalues_to_local_memory!( + u_local::AbstractArray{T, D}, + u_global::AbstractArray{T, D}, + ::Val{M}, block_index::Dims{D}, block_dims::Dims{D}, threadidxs::Dims{D}, groupsize::Dims{D}, + ) where {T, D, M} + if @generated + quote + Ns = size(u_global) + inds = @ntuple($D, n -> axes(u_local, n)[threadidxs[n]:groupsize[n]:end]) + offsets = @ntuple( + $D, + n -> let + off = (block_index[n] - 1) * block_dims[n] - ($M - 1) + ifelse(off < 0, off + Ns[n], off) # make sure the offset is non-negative (to avoid some wrapping below) + end + ) + @nloops( + $D, i, + d -> inds[d], + d -> begin + j_d = offsets[d] + i_d + j_d = ifelse(j_d > Ns[d], j_d - Ns[d], j_d) + end, + begin + is = @ntuple($D, i) + js = @ntuple($D, j) + @inbounds u_local[is...] = u_global[js...] + end, + ) + nothing + end + else + Ns = size(u_global) + inds = ntuple(Val(D)) do n + axes(u_local, n)[threadidxs[n]:groupsize[n]:end] # this determines the parallelisation pattern + end + offsets = ntuple(Val(D)) do n + @inline + off = (block_index[n] - 1) * block_dims[n] - (M - 1) + ifelse(off < 0, off + Ns[n], off) # make sure the offset is non-negative (to avoid some wrapping below) + end + @inbounds for is ∈ Iterators.product(inds...) + js = ntuple(Val(D)) do n + @inline + j = offsets[n] + is[n] + ifelse(j > Ns[n], j - Ns[n], j) + end + u_local[is...] = u_global[js...] + end + nothing + end +end + +# Interpolate a single "component" (one transform at a time). +# Here vp is a vector instead of a tuple of vectors. +@inline function interpolate_from_arrays_shmem( + u_local::AbstractArray{T, D}, + indvals::NTuple{D}, + prefactor, + ) where {T, D} + if @generated + gprod_init = Symbol(:gprod_, D + 1) # the name of this variable is important! + quote + $gprod_init = prefactor + v = zero(T) + inds_start = map(first, indvals) + vals = map(last, indvals) # evaluated kernel values in each direction + inds = map(eachindex, vals) # = (1:L, 1:L, ...) where L = 2M is the kernel width + @nloops( + $D, i, + d -> inds[d], + d -> begin + @inbounds gprod_d = gprod_{d + 1} * vals[d][i_d] + @inbounds j_d = inds_start[d] + i_d + end, + begin + js = @ntuple($D, j) + @inbounds v += gprod_1 * u_local[js...] + end, + ) + v + end + else + v = zero(T) + inds_start = map(first, indvals) + vals = map(last, indvals) # evaluated kernel values in each direction + inds = map(eachindex, vals) # = (1:L, 1:L, ...) where L = 2M is the kernel width + @inbounds for I ∈ CartesianIndices(inds) + gprod = prefactor * prod(ntuple(d -> @inbounds(vals[d][I[d]]), Val(D))) + js = inds_start .+ Tuple(I) + v += gprod * u_local[js...] 
+ end + v + end +end diff --git a/src/spreading/gpu.jl b/src/spreading/gpu.jl index be8d111..910c47f 100644 --- a/src/spreading/gpu.jl +++ b/src/spreading/gpu.jl @@ -219,7 +219,7 @@ end nothing end -## ==================================================================================================== ## +## ========================================================================================== ## ## Shared-memory implementation @kernel function spread_from_points_shmem_kernel!( From 867b73e3c7ae30ce61e88fd135a1092ea67e0af5 Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Sun, 20 Oct 2024 23:09:12 +0200 Subject: [PATCH 14/33] [WIP] avoid atomics in shared-memory arrays --- src/Kernels/kaiser_bessel_backwards.jl | 35 +++++++ src/spreading/gpu.jl | 139 ++++++++++++++++++++++--- 2 files changed, 157 insertions(+), 17 deletions(-) diff --git a/src/Kernels/kaiser_bessel_backwards.jl b/src/Kernels/kaiser_bessel_backwards.jl index 45129cd..e894e1a 100644 --- a/src/Kernels/kaiser_bessel_backwards.jl +++ b/src/Kernels/kaiser_bessel_backwards.jl @@ -143,3 +143,38 @@ function evaluate_kernel_func(g::BackwardsKaiserBesselKernelData{M, T}) where {M (; i, values,) end end + +# TODO: remove? +function evaluate_kernel_direct( + g::BackwardsKaiserBesselKernelData{M, T}, i::Integer, x::T, + ) where {M, T} + (; Δx, β,) = g + # i = point_to_cell(x, Δx) + X = x / Δx - T(i - 1) # source position relative to xs[i] + # @assert 0 ≤ X < 1 + Xc = 2 * X - 1 # in [-1, 1) + L = 2M + ntuple(Val(L)) do j + h = 1 - 2 * (j - one(T)/2) / L # midpoint of interval + δ = 1 / L # half-width of interval + y = h + Xc * δ + @fastmath s = sqrt(1 - y^2) + @fastmath sinh(β * s)::T / (s * T(π)) + end +end + +function evaluate_kernel_direct( + g::BackwardsKaiserBesselKernelData{M, T}, i::Integer, n::Integer, x::T, + ) where {M, T} + (; Δx, β,) = g + # @assert 1 ≤ n ≤ 2M + X = x / Δx - T(i - 1) # source position relative to xs[i] + # @assert 0 ≤ X < 1 + Xc = 2 * X - 1 # in [-1, 1) + L = 2M + h = 1 - 2 * (n - one(T)/2) / L # midpoint of interval + δ = 1 / L # half-width of interval + y = h + Xc * δ + @fastmath s = sqrt(1 - y^2) + @fastmath sinh(β * s)::T / (s * T(π)) +end diff --git a/src/spreading/gpu.jl b/src/spreading/gpu.jl index 910c47f..16813db 100644 --- a/src/spreading/gpu.jl +++ b/src/spreading/gpu.jl @@ -122,6 +122,22 @@ end nothing end +# Same as _atomic_add!, but without the atomics. +@inline function _add_maybecomplex!(u::AbstractArray{T}, v::T, inds::Tuple) where {T <: Real} + @inbounds u[inds...] += v + nothing +end + +@inline function _add_maybecomplex!(u::AbstractArray{T}, v::Complex{T}, inds::Tuple) where {T <: Real} + @inbounds begin + i₁ = 2 * (inds[1] - 1) # convert from logical index (equivalent complex array) to memory index (real array) + itail = Base.tail(inds) + u[i₁ + 1, itail...] += real(v) + u[i₁ + 2, itail...] += imag(v) + end + nothing +end + # GPU implementation. # We assume all arrays are already on the GPU. function spread_from_points!( @@ -197,7 +213,7 @@ function spread_from_points!( kernel! 
= spread_from_points_shmem_kernel!(backend, groupsize, ndrange) kernel!( us_real, gs, xs_comp, vp_sorted, pointperm_, - bd.cumulative_npoints_per_block, block_dims, + bd.cumulative_npoints_per_block, HalfSupport(M), block_dims, Val(shmem_size), ) end @@ -229,9 +245,10 @@ end @Const(vp::NTuple{C, AbstractVector{Z}}), @Const(pointperm), @Const(cumulative_npoints_per_block::AbstractVector), + ::HalfSupport{M}, ::Val{block_dims}, ::Val{shmem_size}, # this is a bit redundant, but seems to be required for CPU backends (used in tests) - ) where {C, D, T, Z, block_dims, shmem_size} + ) where {C, D, T, Z, M, block_dims, shmem_size} @uniform begin groupsize = @groupsize()::Dims{D} @@ -251,7 +268,13 @@ end threadidxs = @index(Local, NTuple) # in (1:nthreads_x, 1:nthreads_y, ...) threadidx = @index(Local, Linear) # in 1:nthreads - u_local = @localmem(T, shmem_size) # allocate shared memory + # Allocate static shared memory + u_local = @localmem(T, shmem_size) + window_vals = ntuple(Val(D)) do _ + @assert T <: Real + @localmem(T, 2M) + end + inds_start = @localmem(Int, D) # Interpolate components one by one (to avoid using too much memory) for c ∈ 1:C @@ -266,7 +289,39 @@ end @inbounds a = cumulative_npoints_per_block[block_n] @inbounds b = cumulative_npoints_per_block[block_n + 1] - for i in (a + threadidx):nthreads:b + # for i in (a + threadidx):nthreads:b + # # Spread from point j + # j = if pointperm === nothing + # i + # else + # @inbounds pointperm[i] + # end + # + # # TODO: can we do this just once for all components? (if C > 1) + # indvals = ntuple(Val(D)) do n + # @inline + # local x = @inbounds points[n][j] + # local g = gs[n] + # local Δx = Kernels.gridstep(g) + # icell = @inline Kernels.point_to_cell(x, Δx) + # vals = @inline Kernels.evaluate_kernel_direct(g, icell, x) + # # gdata = Kernels.evaluate_kernel(g, x) + # # local vals = gdata.values # kernel values + # local ishift = (block_index[n] - 1) * block_dims[n] + 1 + # local i₀ = icell - ishift + # # local i₀ = gdata.i - ishift + # # @assert i₀ ≥ 0 + # # @assert i₀ + 2M ≤ block_dims_padded[n] + # i₀ => vals + # end + # + # @inbounds v = vp[c][j] + # spread_onto_array_shmem!(u_local, indvals, v) + # end + + # All threads iterate over the same non-uniform points. + # This is to avoid atomic operations on shared memory, which seems to be quite slow. + for i in (a + 1):b # Spread from point j j = if pointperm === nothing i @@ -274,26 +329,39 @@ end @inbounds pointperm[i] end - # TODO: can we do this just once for all components? (if C > 1) - indvals = ntuple(Val(D)) do n - @inline - x = @inbounds points[n][j] - gdata = Kernels.evaluate_kernel(gs[n], x) - local vals = gdata.values # kernel values - local ishift = (block_index[n] - 1) * block_dims[n] + 1 - local i₀ = gdata.i - ishift - # @assert i₀ ≥ 0 - # @assert i₀ + 2M ≤ block_dims_padded[n] - i₀ => vals + # Parallel evaluation of window functions over all dimensions + let + inds = CartesianIndices((1:D, 1:2M)) + subinds = (1:length(inds))[threadidx:nthreads:end] + @inbounds for n ∈ subinds + d, m = Tuple(inds[n]) # dimension, support point + x = points[d][j] + local g = gs[d] + local Δx = Kernels.gridstep(g) + local icell = @inline Kernels.point_to_cell(x, Δx) + local ishift = (block_index[d] - 1) * block_dims[d] + 1 + local i₀ = icell - ishift + inds_start[d] = i₀ # multiple threads may write this, but that's ok + window_vals[d][m] = @inline Kernels.evaluate_kernel_direct(g, icell, m, x) + end + end + + # TODO: + # - can we do this just once for all components? 
(if C > 1) + indvals = ntuple(Val(D)) do d + inds_start[d] => window_vals[d] end @inbounds v = vp[c][j] - spread_onto_array_shmem!(u_local, indvals, v) + + @synchronize + + # Spread in parallel + spread_onto_array_shmem_threads!(u_local, indvals, v, threadidxs, groupsize) end @synchronize # make sure we have finished writing to shared memory - M = Kernels.half_support(gs[1]) add_from_local_to_global_memory!( Z, us[c], u_local, Ns, Val(M), block_index, block_dims, threadidxs, groupsize, @@ -341,6 +409,43 @@ end end end +# Spread a single "component" (one transform at a time). +# This is parallelised across threads in a workgroup. +@inline function spread_onto_array_shmem_threads!( + u_local::AbstractArray{T, D}, + indvals::NTuple{D}, + v::Z, + threadidxs::NTuple{D}, groupsize::NTuple{D}, + ) where {T, D, Z} + if @generated + gprod_init = Symbol(:gprod_, D + 1) # the name of this variable is important! + Tr = real(T) + quote + inds_start = map(first, indvals) # start of active region in output array + vals = map(last, indvals) # evaluated kernel values in each direction + # inds = map(eachindex, vals) # = (1:L, 1:L, ...) where L = 2M is the kernel width + inds = @ntuple($D, n -> eachindex(vals[n])[threadidxs[n]:groupsize[n]:end]) + $gprod_init = one($Tr) # product of kernel values (initially 1) + @nloops( + $D, i, + d -> inds[d], # for i_d ∈ 1:L + d -> begin + @inbounds gprod_d = gprod_{d + 1} * vals[d][i_d] + @inbounds j_d = inds_start[d] + i_d + end, + begin + js = @ntuple($D, j) + w = v * gprod_1 + _add_maybecomplex!(u_local, w, js) + end, + ) + v + end + else + aaa # TODO: implement + end +end + # Add values from shared to global memory. @inline function add_from_local_to_global_memory!( ::Type{Z}, # "logical" type of data (e.g. ComplexF64) From 801eb5eb9f4f6b961ab0b4cdeb66dd475441ec84 Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Mon, 21 Oct 2024 10:25:58 +0200 Subject: [PATCH 15/33] Avoid atomic operations on shared memory --- src/spreading/gpu.jl | 119 ++++++++++++++++++++++++++----------------- 1 file changed, 72 insertions(+), 47 deletions(-) diff --git a/src/spreading/gpu.jl b/src/spreading/gpu.jl index 16813db..a5e89d5 100644 --- a/src/spreading/gpu.jl +++ b/src/spreading/gpu.jl @@ -321,7 +321,7 @@ end # All threads iterate over the same non-uniform points. # This is to avoid atomic operations on shared memory, which seems to be quite slow. - for i in (a + 1):b + @inbounds for i in (a + 1):b # Spread from point j j = if pointperm === nothing i @@ -336,36 +336,35 @@ end @inbounds for n ∈ subinds d, m = Tuple(inds[n]) # dimension, support point x = points[d][j] - local g = gs[d] - local Δx = Kernels.gridstep(g) - local icell = @inline Kernels.point_to_cell(x, Δx) - local ishift = (block_index[d] - 1) * block_dims[d] + 1 - local i₀ = icell - ishift + g = gs[d] + Δx = Kernels.gridstep(g) + icell = @inline Kernels.point_to_cell(x, Δx) + ishift = (block_index[d] - 1) * block_dims[d] + 1 + i₀ = icell - ishift inds_start[d] = i₀ # multiple threads may write this, but that's ok window_vals[d][m] = @inline Kernels.evaluate_kernel_direct(g, icell, m, x) end end - # TODO: - # - can we do this just once for all components? 
(if C > 1) - indvals = ntuple(Val(D)) do d - inds_start[d] => window_vals[d] - end - @inbounds v = vp[c][j] @synchronize + inds_start_t = ntuple(d -> @inbounds(inds_start[d]), Val(D)) # as a tuple + # Spread in parallel - spread_onto_array_shmem_threads!(u_local, indvals, v, threadidxs, groupsize) + spread_onto_array_shmem_threads!(u_local, inds_start_t, window_vals, v; threadidxs, groupsize, threadidx, nthreads) end @synchronize # make sure we have finished writing to shared memory - add_from_local_to_global_memory!( - Z, us[c], u_local, Ns, Val(M), block_index, - block_dims, threadidxs, groupsize, - ) + if a < b # skip this step if there were actually no points in this block (if a == b) + add_from_local_to_global_memory!( + Z, us[c], u_local, Ns, Val(M); + block_index, block_dims, threadidxs, groupsize, + threadidx, nthreads, + ) + end if c < C @synchronize # wait before jumping to next component @@ -413,37 +412,25 @@ end # This is parallelised across threads in a workgroup. @inline function spread_onto_array_shmem_threads!( u_local::AbstractArray{T, D}, - indvals::NTuple{D}, - v::Z, + inds_start::NTuple{D, Integer}, + vals::NTuple{D, AbstractVector}, # these are static-size shared-memory arrays + v::Z; threadidxs::NTuple{D}, groupsize::NTuple{D}, + threadidx, nthreads, ) where {T, D, Z} - if @generated - gprod_init = Symbol(:gprod_, D + 1) # the name of this variable is important! - Tr = real(T) - quote - inds_start = map(first, indvals) # start of active region in output array - vals = map(last, indvals) # evaluated kernel values in each direction - # inds = map(eachindex, vals) # = (1:L, 1:L, ...) where L = 2M is the kernel width - inds = @ntuple($D, n -> eachindex(vals[n])[threadidxs[n]:groupsize[n]:end]) - $gprod_init = one($Tr) # product of kernel values (initially 1) - @nloops( - $D, i, - d -> inds[d], # for i_d ∈ 1:L - d -> begin - @inbounds gprod_d = gprod_{d + 1} * vals[d][i_d] - @inbounds j_d = inds_start[d] + i_d - end, - begin - js = @ntuple($D, j) - w = v * gprod_1 - _add_maybecomplex!(u_local, w, js) - end, - ) - v + inds = CartesianIndices(map(eachindex, vals)) + Tr = real(T) + @inbounds for n ∈ threadidx:nthreads:length(inds) + I = inds[n] + js = Tuple(I) .+ inds_start + gprod = one(Tr) + for d ∈ 1:D + gprod *= vals[d][I[d]] end - else - aaa # TODO: implement + w = v * gprod + _add_maybecomplex!(u_local, w, js) end + nothing end # Add values from shared to global memory. @@ -452,13 +439,14 @@ end u_global::AbstractArray{T, D}, # actual data is always real u_local::AbstractArray{T, D}, Ns::Dims{D}, - ::Val{M}, block_index::Dims{D}, block_dims::Dims{D}, threadidxs::Dims{D}, groupsize::Dims{D}, + ::Val{M}; + block_index::Dims{D}, block_dims::Dims{D}, threadidxs::Dims{D}, groupsize::Dims{D}, + threadidx, nthreads, ) where {Z, T, D, M} if @generated @assert T <: Real # for atomic operations skip = sizeof(Z) ÷ sizeof(T) # 1 (Z real) or 2 (Z complex) quote - # @assert skip ∈ (1, 2) skips = @ntuple($D, n -> n == 1 ? $skip : 1) inds = @ntuple($D, n -> axes(u_local, n)[1:(end ÷ skips[n])][threadidxs[n]:groupsize[n]:end]) offsets = @ntuple( @@ -496,7 +484,44 @@ end nothing end else - aaa # TODO: implement + # Unlike the @generated case, here we parallelise over linear indices instead of + # separate Cartesian directions. Performance is very similar to the @generated case. 
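The index bookkeeping in this fallback (and in the `@generated` branch above) relies on the interleaved layout produced by `reinterpret`; a tiny standalone check of that convention, with placeholder names:

    v = ComplexF32[1 + 2im, 3 + 4im]
    u = reinterpret(Float32, v)       # [1.0, 2.0, 3.0, 4.0]: twice as long
    j = 2                             # logical (complex) index
    @assert (u[2j - 1], u[2j]) == (real(v[j]), imag(v[j]))
    skip = sizeof(ComplexF32) ÷ sizeof(Float32)  # = 2, used to halve the first axis
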
+ @assert T <: Real # for atomic operations + skip = sizeof(Z) ÷ sizeof(T) # 1 (Z real) or 2 (Z complex) + inds = CartesianIndices( + ntuple(Val(D)) do d + # The logical indices are 1:end÷skip in the first dimension. + d == 1 ? axes(u_local, 1)[1:end÷skip] : axes(u_local, d) + end + ) + offsets = ntuple(Val(D)) do n + @inline + local off = (block_index[n] - 1) * block_dims[n] - (M - 1) + ifelse(off < 0, off + Ns[n], off) # make sure the offset is non-negative (to avoid some wrapping below) + end + @inbounds for n ∈ threadidx:nthreads:length(inds) + I = inds[n] + is = Tuple(I) + js = ntuple(Val(D)) do d + @inline + j = is[d] + offsets[d] + ifelse(j > Ns[d], j - Ns[d], j) + end + @inbounds if Z <: Real + w = u_local[is...] + Atomix.@atomic u_global[js...] += w + elseif Z <: Complex + is_tail = Base.tail(is) + js_tail = Base.tail(js) + ifirst = 2 * is[1] - 1 + jfirst = 2 * js[1] - 1 + w = u_local[ifirst, is_tail...] # real part + Atomix.@atomic u_global[jfirst, js_tail...] += w + w = u_local[ifirst + 1, is_tail...] # imaginary part + Atomix.@atomic u_global[jfirst + 1, js_tail...] += w + end + end + nothing end end From 24f595629124b8670a585d7544f93e10b7ab6364 Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Mon, 21 Oct 2024 10:29:21 +0200 Subject: [PATCH 16/33] Minor improvement --- src/spreading/gpu.jl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/spreading/gpu.jl b/src/spreading/gpu.jl index a5e89d5..1a884f7 100644 --- a/src/spreading/gpu.jl +++ b/src/spreading/gpu.jl @@ -332,8 +332,7 @@ end # Parallel evaluation of window functions over all dimensions let inds = CartesianIndices((1:D, 1:2M)) - subinds = (1:length(inds))[threadidx:nthreads:end] - @inbounds for n ∈ subinds + @inbounds for n ∈ threadidx:nthreads:length(inds) d, m = Tuple(inds[n]) # dimension, support point x = points[d][j] g = gs[d] From 67e4baab54c314b0f2f2a4d48ea4bc9a0cc1c47c Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Mon, 21 Oct 2024 10:44:42 +0200 Subject: [PATCH 17/33] Make window_vals a matrix --- src/blocking.jl | 3 ++- src/spreading/gpu.jl | 14 ++++++-------- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/src/blocking.jl b/src/blocking.jl index 2b6a50b..3716218 100644 --- a/src/blocking.jl +++ b/src/blocking.jl @@ -67,7 +67,8 @@ type_length(::Type{<:NTuple{N}}) where {N} = N # is not larger than the available shared memory. In CUDA the limit is usually 48 KiB. # Note that the result is a compile-time constant (on Julia 1.11.1 at least). @inline function block_dims_gpu_shmem(::Type{Z}, ::Dims{D}, ::HalfSupport{M}) where {Z <: Number, D, M} - max_shmem_size = 48 << 10 # 48 KiB -- TODO: make this depend on the actual GPU? + free_shmem = 1 << 10 # how much memory to leave free for other allocations + max_shmem_size = (48 << 10) - free_shmem # 48 KiB -- TODO: make this depend on the actual GPU? 
max_block_length = max_shmem_size ÷ sizeof(Z) # maximum number of elements in a block m = floor(Int, max_block_length^(1/D)) # block size in each direction (including ghost cells / padding) n = m - (2M - 1) # exclude ghost cells diff --git a/src/spreading/gpu.jl b/src/spreading/gpu.jl index 1a884f7..37ad3e0 100644 --- a/src/spreading/gpu.jl +++ b/src/spreading/gpu.jl @@ -270,10 +270,8 @@ end # Allocate static shared memory u_local = @localmem(T, shmem_size) - window_vals = ntuple(Val(D)) do _ - @assert T <: Real - @localmem(T, 2M) - end + @assert T <: Real + window_vals = @localmem(T, (2M, D)) inds_start = @localmem(Int, D) # Interpolate components one by one (to avoid using too much memory) @@ -341,7 +339,7 @@ end ishift = (block_index[d] - 1) * block_dims[d] + 1 i₀ = icell - ishift inds_start[d] = i₀ # multiple threads may write this, but that's ok - window_vals[d][m] = @inline Kernels.evaluate_kernel_direct(g, icell, m, x) + window_vals[m, d] = @inline Kernels.evaluate_kernel_direct(g, icell, m, x) end end @@ -412,19 +410,19 @@ end @inline function spread_onto_array_shmem_threads!( u_local::AbstractArray{T, D}, inds_start::NTuple{D, Integer}, - vals::NTuple{D, AbstractVector}, # these are static-size shared-memory arrays + window_vals::AbstractArray{T, 2}, # static-size shared-memory array (2M, D) v::Z; threadidxs::NTuple{D}, groupsize::NTuple{D}, threadidx, nthreads, ) where {T, D, Z} - inds = CartesianIndices(map(eachindex, vals)) + inds = CartesianIndices(ntuple(_ -> axes(window_vals, 1), Val(D))) Tr = real(T) @inbounds for n ∈ threadidx:nthreads:length(inds) I = inds[n] js = Tuple(I) .+ inds_start gprod = one(Tr) for d ∈ 1:D - gprod *= vals[d][I[d]] + gprod *= window_vals[I[d], d] end w = v * gprod _add_maybecomplex!(u_local, w, js) From 765b73eb4f8fe93b88d7712272d0b9c6da45f1ad Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Mon, 21 Oct 2024 11:43:50 +0200 Subject: [PATCH 18/33] Implement hybrid parallelisation in SM spreading Much faster! --- src/blocking.jl | 30 ++++++-- src/interpolation/gpu.jl | 4 +- src/plan.jl | 3 +- src/spreading/gpu.jl | 143 ++++++++++++--------------------------- 4 files changed, 73 insertions(+), 107 deletions(-) diff --git a/src/blocking.jl b/src/blocking.jl index 3716218..9cbe6ec 100644 --- a/src/blocking.jl +++ b/src/blocking.jl @@ -66,8 +66,18 @@ type_length(::Type{<:NTuple{N}}) where {N} = N # We try to make sure that the total block size (including 2M ghost cells in each direction) # is not larger than the available shared memory. In CUDA the limit is usually 48 KiB. # Note that the result is a compile-time constant (on Julia 1.11.1 at least). -@inline function block_dims_gpu_shmem(::Type{Z}, ::Dims{D}, ::HalfSupport{M}) where {Z <: Number, D, M} - free_shmem = 1 << 10 # how much memory to leave free for other allocations +@inline function block_dims_gpu_shmem(::Type{Z}, ::Dims{D}, ::HalfSupport{M}, ::Val{Np}) where {Z <: Number, D, M, Np} + T = real(Z) + # These are extra shared-memory needs in spreading kernel (see spread_from_points_shmem_kernel!). + # Here Np is the batch size (batch_size parameter). + shmem_needs = sizeof(T) * ( + 2M * D * Np + # window_vals + D * Np + # points_sm + D * Np # inds_start + ) + ( + sizeof(Z) * Np # vp_sm + ) + free_shmem = shmem_needs # how much memory to leave free for other allocations max_shmem_size = (48 << 10) - free_shmem # 48 KiB -- TODO: make this depend on the actual GPU? 
max_block_length = max_shmem_size ÷ sizeof(Z) # maximum number of elements in a block m = floor(Int, max_block_length^(1/D)) # block size in each direction (including ghost cells / padding) @@ -92,6 +102,7 @@ struct BlockDataGPU{ N, I <: Integer, T <: AbstractFloat, IndexVector <: AbstractVector{I}, SortPoints <: StaticBool, + Np, } <: AbstractBlockData method :: Symbol # method used for spreading and interpolation; options: (1) :global_memory (2) :shared_memory Δxs :: NTuple{N, T} # grid step; the size of a block in units of length is block_dims .* Δxs. @@ -101,18 +112,22 @@ struct BlockDataGPU{ blockidx :: IndexVector # linear index of block associated to each point (length = Np) pointperm :: IndexVector sort_points :: SortPoints + batch_size :: Val{Np} # how many non-uniform points per batch in SM spreading? function BlockDataGPU( method::Symbol, Δxs::NTuple{N, T}, nblocks_per_dir::NTuple{N, I}, block_dims::NTuple{N, I}, - npoints_per_block::V, sort::S, - ) where {N, I, T, V, S} + npoints_per_block::V, sort::S; + batch_size::Val{Np}, + ) where {N, I, T, V, S, Np} + Np::Integer method ∈ (:global_memory, :shared_memory) || throw(ArgumentError("expected gpu_method ∈ (:global_memory, :shared_memory)")) blockidx = similar(npoints_per_block, 0) pointperm = similar(npoints_per_block, 0) - new{N, I, T, V, S}( + new{N, I, T, V, S, Np}( method, Δxs, nblocks_per_dir, block_dims, npoints_per_block, blockidx, pointperm, sort, + batch_size, ) end end @@ -125,17 +140,18 @@ function BlockDataGPU( backend::KA.Backend, block_dims::Dims{D}, Ñs::Dims{D}, h::HalfSupport{M}, sort_points::StaticBool; method::Symbol, + batch_size::Val = Val(16), # batch size (in number of non-uniform points) used in spreading if gpu_method == :shared_memory ) where {Z <: Number, D, M} T = real(Z) # in case Z is complex if method === :shared_memory # Override input block size. We try to maximise the use of shared memory. - block_dims = block_dims_gpu_shmem(Z, Ñs, h) + block_dims = block_dims_gpu_shmem(Z, Ñs, h, batch_size) end nblocks_per_dir = map(cld, Ñs, block_dims) # basically equal to ceil(Ñ / block_dim) L = T(2) * π # domain period Δxs = map(N -> L / N, Ñs) # grid step (oversampled grid) cumulative_npoints_per_block = KA.allocate(backend, Int, prod(nblocks_per_dir) + 1) - BlockDataGPU(method, Δxs, nblocks_per_dir, block_dims, cumulative_npoints_per_block, sort_points) + BlockDataGPU(method, Δxs, nblocks_per_dir, block_dims, cumulative_npoints_per_block, sort_points; batch_size) end get_pointperm(bd::BlockDataGPU) = bd.pointperm diff --git a/src/interpolation/gpu.jl b/src/interpolation/gpu.jl index cbb48f8..b55ed25 100644 --- a/src/interpolation/gpu.jl +++ b/src/interpolation/gpu.jl @@ -100,7 +100,7 @@ function interpolate!( Z = eltype(us[1]) M = Kernels.half_support(gs[1]) @assert all(g -> Kernels.half_support(g) === M, gs) # check that they're all equal - block_dims_val = block_dims_gpu_shmem(Z, size(us[1]), HalfSupport(M)) # this is usually a compile-time constant... + block_dims_val = block_dims_gpu_shmem(Z, size(us[1]), HalfSupport(M), bd.batch_size) # this is usually a compile-time constant... 
block_dims = Val(block_dims_val) # ...which means this doesn't require a dynamic dispatch @assert block_dims_val === bd.block_dims let ngroups = bd.nblocks_per_dir # this is the required number of workgroups (number of blocks in CUDA) @@ -108,6 +108,8 @@ function interpolate!( shmem_size = block_dims_padded groupsize = groupsize_shmem(ngroups, block_dims_padded, length(x⃗s)) ndrange = groupsize .* ngroups + # TODO: + # - try out batches in interpolation? kernel! = interpolate_to_points_shmem_kernel!(backend, groupsize, ndrange) kernel!( vp_sorted, gs, xs_comp, us, pointperm_, bd.cumulative_npoints_per_block, diff --git a/src/plan.jl b/src/plan.jl index 61fce96..09b3b3a 100644 --- a/src/plan.jl +++ b/src/plan.jl @@ -309,6 +309,7 @@ function _PlanNUFFT( block_size::Union{Integer, Dims{D}, Nothing} = default_block_size(Ns, backend), synchronise::Bool = false, gpu_method::Symbol = :global_memory, + batch_size::Val = Val(16), # currently only used in shared-memory GPU spreading ) where {T <: Number, D} ks = init_wavenumbers(T, Ns) # Determine dimensions of oversampled grid. @@ -343,7 +344,7 @@ function _PlanNUFFT( else block_dims = get_block_dims(Ñs, block_size) if backend isa GPU - blocks = BlockDataGPU(T, backend, block_dims, Ñs, h, sort_points; method = gpu_method) + blocks = BlockDataGPU(T, backend, block_dims, Ñs, h, sort_points; method = gpu_method, batch_size,) else blocks = BlockData(T, block_dims, Ñs, h, num_transforms, sort_points) FFTW.set_num_threads(Threads.nthreads()) diff --git a/src/spreading/gpu.jl b/src/spreading/gpu.jl index 37ad3e0..6c76314 100644 --- a/src/spreading/gpu.jl +++ b/src/spreading/gpu.jl @@ -202,7 +202,7 @@ function spread_from_points!( Z = eltype(us[1]) M = Kernels.half_support(gs[1]) @assert all(g -> Kernels.half_support(g) === M, gs) # check that they're all equal - block_dims_val = block_dims_gpu_shmem(Z, size(us[1]), HalfSupport(M)) # this is usually a compile-time constant... + block_dims_val = block_dims_gpu_shmem(Z, size(us[1]), HalfSupport(M), bd.batch_size) # this is usually a compile-time constant... block_dims = Val(block_dims_val) # ...which means this doesn't require a dynamic dispatch @assert block_dims_val === bd.block_dims let ngroups = bd.nblocks_per_dir # this is the required number of workgroups (number of blocks in CUDA) @@ -213,8 +213,8 @@ function spread_from_points!( kernel! = spread_from_points_shmem_kernel!(backend, groupsize, ndrange) kernel!( us_real, gs, xs_comp, vp_sorted, pointperm_, - bd.cumulative_npoints_per_block, HalfSupport(M), block_dims, - Val(shmem_size), + bd.cumulative_npoints_per_block, HalfSupport(M), + block_dims, Val(shmem_size), bd.batch_size, ) end end @@ -238,6 +238,10 @@ end ## ========================================================================================== ## ## Shared-memory implementation +# Process non-uniform points in batches of Np points (or less). +# The idea is to completely avoid slow atomic writes to shared memory arrays (u_local), +# while parallelising some operations across up to Np points. +# TODO: Should Np be equal to the number of threads? or how to choose it optimally so that the block size stays not too small? 
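To get a concrete feel for how the batch size Np enters the shared-memory budget computed by `block_dims_gpu_shmem` above, here is a worked example with assumed parameters (Z = ComplexF32, hence T = Float32, in three dimensions with half-support M = 4 and the default Np = 16):

    Z, T = ComplexF32, Float32
    D, M, Np = 3, 4, 16
    shmem_needs = sizeof(T) * (2M * D * Np + D * Np + D * Np) + sizeof(Z) * Np  # = 2048 bytes for the batch buffers
    max_shmem_size = (48 << 10) - shmem_needs       # = 47104 bytes left for u_local
    max_block_length = max_shmem_size ÷ sizeof(Z)   # = 5888 elements
    m = floor(Int, max_block_length^(1 / D))        # = 18 (padded block size per direction)
    n = m - (2M - 1)                                # = 11, i.e. block_dims = (11, 11, 11)
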
@kernel function spread_from_points_shmem_kernel!( us::NTuple{C, AbstractArray{T}}, @Const(gs::NTuple{D}), @@ -248,7 +252,8 @@ end ::HalfSupport{M}, ::Val{block_dims}, ::Val{shmem_size}, # this is a bit redundant, but seems to be required for CPU backends (used in tests) - ) where {C, D, T, Z, M, block_dims, shmem_size} + ::Val{Np}, # batch_size + ) where {C, D, T, Z, M, Np, block_dims, shmem_size} @uniform begin groupsize = @groupsize()::Dims{D} @@ -271,8 +276,10 @@ end # Allocate static shared memory u_local = @localmem(T, shmem_size) @assert T <: Real - window_vals = @localmem(T, (2M, D)) - inds_start = @localmem(Int, D) + window_vals = @localmem(T, (2M, D, Np)) + points_sm = @localmem(T, (D, Np)) # points copied to shared memory + inds_start = @localmem(Int, (D, Np)) + vp_sm = @localmem(Z, Np) # input values copied to shared memory # Interpolate components one by one (to avoid using too much memory) for c ∈ 1:C @@ -287,70 +294,45 @@ end @inbounds a = cumulative_npoints_per_block[block_n] @inbounds b = cumulative_npoints_per_block[block_n + 1] - # for i in (a + threadidx):nthreads:b - # # Spread from point j - # j = if pointperm === nothing - # i - # else - # @inbounds pointperm[i] - # end - # - # # TODO: can we do this just once for all components? (if C > 1) - # indvals = ntuple(Val(D)) do n - # @inline - # local x = @inbounds points[n][j] - # local g = gs[n] - # local Δx = Kernels.gridstep(g) - # icell = @inline Kernels.point_to_cell(x, Δx) - # vals = @inline Kernels.evaluate_kernel_direct(g, icell, x) - # # gdata = Kernels.evaluate_kernel(g, x) - # # local vals = gdata.values # kernel values - # local ishift = (block_index[n] - 1) * block_dims[n] + 1 - # local i₀ = icell - ishift - # # local i₀ = gdata.i - ishift - # # @assert i₀ ≥ 0 - # # @assert i₀ + 2M ≤ block_dims_padded[n] - # i₀ => vals - # end - # - # @inbounds v = vp[c][j] - # spread_onto_array_shmem!(u_local, indvals, v) - # end - - # All threads iterate over the same non-uniform points. - # This is to avoid atomic operations on shared memory, which seems to be quite slow. - @inbounds for i in (a + 1):b - # Spread from point j - j = if pointperm === nothing - i - else - @inbounds pointperm[i] - end - - # Parallel evaluation of window functions over all dimensions - let - inds = CartesianIndices((1:D, 1:2M)) - @inbounds for n ∈ threadidx:nthreads:length(inds) - d, m = Tuple(inds[n]) # dimension, support point + # The first batch deals with points (a + 1):min(a + Np, b) + @inbounds for batch_begin in a:Np:(b - 1) + batch_size = min(Np, b - batch_begin) + # Iterate over points in the batch (ideally 1 thread per point). + # Each thread writes to shared memory. + # TODO: can we have a two-level parallelism? e.g. 
parallelise over dimensions + @inbounds for p in threadidx:nthreads:batch_size + # Spread from point j + i = batch_begin + p # index of non-uniform point + j = if pointperm === nothing + i + else + @inbounds pointperm[i] + end + for d ∈ 1:D x = points[d][j] + points_sm[d, p] = x g = gs[d] - Δx = Kernels.gridstep(g) - icell = @inline Kernels.point_to_cell(x, Δx) + gdata = Kernels.evaluate_kernel(g, x) ishift = (block_index[d] - 1) * block_dims[d] + 1 - i₀ = icell - ishift - inds_start[d] = i₀ # multiple threads may write this, but that's ok - window_vals[m, d] = @inline Kernels.evaluate_kernel_direct(g, icell, m, x) + inds_start[d, p] = gdata.i - ishift + local vals = gdata.values + for m ∈ eachindex(vals) + window_vals[m, d, p] = vals[m] + end end + vp_sm[p] = vp[c][j] end - @inbounds v = vp[c][j] - - @synchronize + @synchronize # make sure all threads have the same shared data - inds_start_t = ntuple(d -> @inbounds(inds_start[d]), Val(D)) # as a tuple - - # Spread in parallel - spread_onto_array_shmem_threads!(u_local, inds_start_t, window_vals, v; threadidxs, groupsize, threadidx, nthreads) + # Now all threads spread together onto shared memory + @inbounds for p in 1:batch_size + local istart = ntuple(d -> @inbounds(inds_start[d, p]), Val(D)) + local vals = @view window_vals[:, :, p] # TODO: does this have a cost? + local v = vp_sm[p] + spread_onto_array_shmem_threads!(u_local, istart, vals, v; threadidx, nthreads) + @synchronize # make sure threads don't write concurrently to the same place (since we don't use atomics) + end end @synchronize # make sure we have finished writing to shared memory @@ -371,40 +353,6 @@ end nothing end -# Spread a single "component" (one transform at a time). -@inline function spread_onto_array_shmem!( - u_local::AbstractArray{T, D}, - indvals::NTuple{D}, - v::Z, - ) where {T, D, Z} - if @generated - gprod_init = Symbol(:gprod_, D + 1) # the name of this variable is important! - Tr = real(T) - quote - inds_start = map(first, indvals) # start of active region in output array - vals = map(last, indvals) # evaluated kernel values in each direction - inds = map(eachindex, vals) # = (1:L, 1:L, ...) where L = 2M is the kernel width - $gprod_init = one($Tr) # product of kernel values (initially 1) - @nloops( - $D, i, - d -> inds[d], # for i_d ∈ 1:L - d -> begin - @inbounds gprod_d = gprod_{d + 1} * vals[d][i_d] - @inbounds j_d = inds_start[d] + i_d - end, - begin - js = @ntuple($D, j) - w = v * gprod_1 - _atomic_add!(u_local, w, js) # this is slow!! (atomics in shared memory) -- is it Atomix's fault? - end, - ) - v - end - else - aaa # TODO: implement - end -end - # Spread a single "component" (one transform at a time). # This is parallelised across threads in a workgroup. 
@inline function spread_onto_array_shmem_threads!( @@ -412,7 +360,6 @@ end inds_start::NTuple{D, Integer}, window_vals::AbstractArray{T, 2}, # static-size shared-memory array (2M, D) v::Z; - threadidxs::NTuple{D}, groupsize::NTuple{D}, threadidx, nthreads, ) where {T, D, Z} inds = CartesianIndices(ntuple(_ -> axes(window_vals, 1), Val(D))) From 5bf7ae4d1c9bd7866066e28e4884d3a34407211a Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Mon, 21 Oct 2024 12:23:24 +0200 Subject: [PATCH 19/33] More optimisations --- src/spreading/gpu.jl | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/src/spreading/gpu.jl b/src/spreading/gpu.jl index 6c76314..97f1112 100644 --- a/src/spreading/gpu.jl +++ b/src/spreading/gpu.jl @@ -299,7 +299,6 @@ end batch_size = min(Np, b - batch_begin) # Iterate over points in the batch (ideally 1 thread per point). # Each thread writes to shared memory. - # TODO: can we have a two-level parallelism? e.g. parallelise over dimensions @inbounds for p in threadidx:nthreads:batch_size # Spread from point j i = batch_begin + p # index of non-uniform point @@ -309,20 +308,28 @@ end @inbounds pointperm[i] end for d ∈ 1:D - x = points[d][j] - points_sm[d, p] = x - g = gs[d] - gdata = Kernels.evaluate_kernel(g, x) - ishift = (block_index[d] - 1) * block_dims[d] + 1 - inds_start[d, p] = gdata.i - ishift - local vals = gdata.values - for m ∈ eachindex(vals) - window_vals[m, d, p] = vals[m] - end + points_sm[d, p] = points[d][j] end vp_sm[p] = vp[c][j] end + @synchronize + + # Now evaluate windows associated to each point. + local inds = CartesianIndices((1:batch_size, 1:D)) # parallelise over dimensions + points + @inbounds for n in threadidx:nthreads:length(inds) + p, d = Tuple(inds[n]) + g = gs[d] + x = points_sm[d, p] + gdata = Kernels.evaluate_kernel(g, x) + ishift = (block_index[d] - 1) * block_dims[d] + 1 + inds_start[d, p] = gdata.i - ishift + local vals = gdata.values + for m ∈ eachindex(vals) + window_vals[m, d, p] = vals[m] + end + end + @synchronize # make sure all threads have the same shared data # Now all threads spread together onto shared memory From f9e79311ec07534a24dcd268140859482d1277dd Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Mon, 21 Oct 2024 22:08:42 +0200 Subject: [PATCH 20/33] Try to fix tests (on CPU) --- src/spreading/gpu.jl | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/src/spreading/gpu.jl b/src/spreading/gpu.jl index 97f1112..8b1e6f6 100644 --- a/src/spreading/gpu.jl +++ b/src/spreading/gpu.jl @@ -281,6 +281,17 @@ end inds_start = @localmem(Int, (D, Np)) vp_sm = @localmem(Z, Np) # input values copied to shared memory + # Buffer for indices and lengths. + # This is needed in CPU version (used in tests), to avoid variables from being + # "forgotten" after a @synchronize barrier. 
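+    # (Slot usage below: buf_sm[1] = a, buf_sm[2] = b, buf_sm[3] = current batch size.)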
+ buf_sm = @localmem(Int, 3) + + # This block will take care of non-uniform points (a + 1):b + if threadidx == 1 + @inbounds buf_sm[1] = cumulative_npoints_per_block[block_n] + @inbounds buf_sm[2] = cumulative_npoints_per_block[block_n + 1] + end + # Interpolate components one by one (to avoid using too much memory) for c ∈ 1:C # Reset shared memory to zero @@ -290,13 +301,10 @@ end @synchronize # make sure shared memory is fully set to zero - # This block will take care of non-uniform points (a + 1):b - @inbounds a = cumulative_npoints_per_block[block_n] - @inbounds b = cumulative_npoints_per_block[block_n + 1] - # The first batch deals with points (a + 1):min(a + Np, b) - @inbounds for batch_begin in a:Np:(b - 1) - batch_size = min(Np, b - batch_begin) + @inbounds for batch_begin in buf_sm[1]:Np:(buf_sm[2] - 1) + batch_size = min(Np, buf_sm[2] - batch_begin) # current batch size + buf_sm[3] = batch_size # Iterate over points in the batch (ideally 1 thread per point). # Each thread writes to shared memory. @inbounds for p in threadidx:nthreads:batch_size @@ -316,7 +324,7 @@ end @synchronize # Now evaluate windows associated to each point. - local inds = CartesianIndices((1:batch_size, 1:D)) # parallelise over dimensions + points + local inds = CartesianIndices((1:buf_sm[3], 1:D)) # parallelise over dimensions + points @inbounds for n in threadidx:nthreads:length(inds) p, d = Tuple(inds[n]) g = gs[d] @@ -333,7 +341,7 @@ end @synchronize # make sure all threads have the same shared data # Now all threads spread together onto shared memory - @inbounds for p in 1:batch_size + @inbounds for p in 1:buf_sm[3] local istart = ntuple(d -> @inbounds(inds_start[d, p]), Val(D)) local vals = @view window_vals[:, :, p] # TODO: does this have a cost? local v = vp_sm[p] @@ -344,7 +352,7 @@ end @synchronize # make sure we have finished writing to shared memory - if a < b # skip this step if there were actually no points in this block (if a == b) + @inbounds if buf_sm[1] < buf_sm[2] # skip this step if there were actually no points in this block (if a == b) add_from_local_to_global_memory!( Z, us[c], u_local, Ns, Val(M); block_index, block_dims, threadidxs, groupsize, From e1c2daff36414f5613b6e6f89d77b5157c02b0b7 Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Tue, 22 Oct 2024 09:19:19 +0200 Subject: [PATCH 21/33] Shared memory array can be complex --- src/blocking.jl | 9 ++- src/spreading/gpu.jl | 141 ++++++++++++++----------------------------- 2 files changed, 52 insertions(+), 98 deletions(-) diff --git a/src/blocking.jl b/src/blocking.jl index 9cbe6ec..e1db4c4 100644 --- a/src/blocking.jl +++ b/src/blocking.jl @@ -74,8 +74,13 @@ type_length(::Type{<:NTuple{N}}) where {N} = N 2M * D * Np + # window_vals D * Np + # points_sm D * Np # inds_start - ) + ( - sizeof(Z) * Np # vp_sm + ) + + sizeof(Z) * ( + Np # vp_sm + ) + + sizeof(Int) * ( + 3 + # buf_sm + D # ishifts_sm ) free_shmem = shmem_needs # how much memory to leave free for other allocations max_shmem_size = (48 << 10) - free_shmem # 48 KiB -- TODO: make this depend on the actual GPU? diff --git a/src/spreading/gpu.jl b/src/spreading/gpu.jl index 8b1e6f6..99f9c96 100644 --- a/src/spreading/gpu.jl +++ b/src/spreading/gpu.jl @@ -206,8 +206,8 @@ function spread_from_points!( block_dims = Val(block_dims_val) # ...which means this doesn't require a dynamic dispatch @assert block_dims_val === bd.block_dims let ngroups = bd.nblocks_per_dir # this is the required number of workgroups (number of blocks in CUDA) - block_dims_padded = @. 
block_dims_val + 2M - 1 # dimensions of shared memory array - shmem_size = spread_real_dims(Z, block_dims_padded) + block_dims_padded = @. block_dims_val + 2M - 1 + shmem_size = block_dims_padded # dimensions of big shared memory array groupsize = groupsize_shmem(ngroups, block_dims_padded, length(x⃗s)) ndrange = groupsize .* ngroups kernel! = spread_from_points_shmem_kernel!(backend, groupsize, ndrange) @@ -274,7 +274,8 @@ end threadidx = @index(Local, Linear) # in 1:nthreads # Allocate static shared memory - u_local = @localmem(T, shmem_size) + u_local = @localmem(Z, shmem_size) + # @assert shmem_size == block_dims .+ (2M - 1) @assert T <: Real window_vals = @localmem(T, (2M, D, Np)) points_sm = @localmem(T, (D, Np)) # points copied to shared memory @@ -285,11 +286,15 @@ end # This is needed in CPU version (used in tests), to avoid variables from being # "forgotten" after a @synchronize barrier. buf_sm = @localmem(Int, 3) + ishifts_sm = @localmem(Int, D) # shift between local and global array in each direction # This block will take care of non-uniform points (a + 1):b if threadidx == 1 - @inbounds buf_sm[1] = cumulative_npoints_per_block[block_n] - @inbounds buf_sm[2] = cumulative_npoints_per_block[block_n + 1] + @inbounds buf_sm[1] = cumulative_npoints_per_block[block_n] # = a + @inbounds buf_sm[2] = cumulative_npoints_per_block[block_n + 1] # = b + @inbounds for d ∈ 1:D + ishifts_sm[d] = (block_index[d] - 1) * block_dims[d] + 1 + end end # Interpolate components one by one (to avoid using too much memory) @@ -330,7 +335,7 @@ end g = gs[d] x = points_sm[d, p] gdata = Kernels.evaluate_kernel(g, x) - ishift = (block_index[d] - 1) * block_dims[d] + 1 + ishift = ishifts_sm[d] inds_start[d, p] = gdata.i - ishift local vals = gdata.values for m ∈ eachindex(vals) @@ -343,9 +348,8 @@ end # Now all threads spread together onto shared memory @inbounds for p in 1:buf_sm[3] local istart = ntuple(d -> @inbounds(inds_start[d, p]), Val(D)) - local vals = @view window_vals[:, :, p] # TODO: does this have a cost? local v = vp_sm[p] - spread_onto_array_shmem_threads!(u_local, istart, vals, v; threadidx, nthreads) + spread_onto_array_shmem_threads!(u_local, istart, window_vals, v, p; threadidx, nthreads) @synchronize # make sure threads don't write concurrently to the same place (since we don't use atomics) end end @@ -354,8 +358,7 @@ end @inbounds if buf_sm[1] < buf_sm[2] # skip this step if there were actually no points in this block (if a == b) add_from_local_to_global_memory!( - Z, us[c], u_local, Ns, Val(M); - block_index, block_dims, threadidxs, groupsize, + us[c], u_local, Ns, ishifts_sm, Val(M); threadidx, nthreads, ) end @@ -371,117 +374,63 @@ end # Spread a single "component" (one transform at a time). # This is parallelised across threads in a workgroup. @inline function spread_onto_array_shmem_threads!( - u_local::AbstractArray{T, D}, + u_local::AbstractArray{Z, D}, inds_start::NTuple{D, Integer}, - window_vals::AbstractArray{T, 2}, # static-size shared-memory array (2M, D) - v::Z; + window_vals::AbstractArray{T, 3}, # static-size shared-memory array (2M, D, Np) + v::Z, p::Integer; threadidx, nthreads, ) where {T, D, Z} - inds = CartesianIndices(ntuple(_ -> axes(window_vals, 1), Val(D))) + inds = CartesianIndices(ntuple(_ -> axes(window_vals, 1), Val(D))) # = (1:2M, 1:2M, ...) 
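+    # The strided loop below assigns disjoint subsets of these (2M)^D offsets to the
+    # nthreads threads, so, for a given point, no two threads write to the same u_local
+    # entry and no atomic operations are needed.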
Tr = real(T) @inbounds for n ∈ threadidx:nthreads:length(inds) I = inds[n] js = Tuple(I) .+ inds_start gprod = one(Tr) for d ∈ 1:D - gprod *= window_vals[I[d], d] + gprod *= window_vals[I[d], d, p] end w = v * gprod - _add_maybecomplex!(u_local, w, js) + u_local[js...] += w end nothing end # Add values from shared to global memory. @inline function add_from_local_to_global_memory!( - ::Type{Z}, # "logical" type of data (e.g. ComplexF64) - u_global::AbstractArray{T, D}, # actual data is always real - u_local::AbstractArray{T, D}, + u_global::AbstractArray{T, D}, # actual data in global array is always real + u_local::AbstractArray{Z, D}, # shared-memory array can be complex Ns::Dims{D}, + ishifts, ::Val{M}; - block_index::Dims{D}, block_dims::Dims{D}, threadidxs::Dims{D}, groupsize::Dims{D}, threadidx, nthreads, ) where {Z, T, D, M} - if @generated - @assert T <: Real # for atomic operations - skip = sizeof(Z) ÷ sizeof(T) # 1 (Z real) or 2 (Z complex) - quote - skips = @ntuple($D, n -> n == 1 ? $skip : 1) - inds = @ntuple($D, n -> axes(u_local, n)[1:(end ÷ skips[n])][threadidxs[n]:groupsize[n]:end]) - offsets = @ntuple( - $D, - n -> let - off = (block_index[n] - 1) * block_dims[n] - ($M - 1) - ifelse(off < 0, off + Ns[n], off) # make sure the offset is non-negative (to avoid some wrapping below) - end - ) - @nloops( - $D, i, - d -> inds[d], - d -> begin - j_d = offsets[d] + i_d - j_d = ifelse(j_d > Ns[d], j_d - Ns[d], j_d) - end, - begin - is = @ntuple($D, i) - js = @ntuple($D, j) - @inbounds if $skip === 1 # real data (Z <: Real) - w = u_local[is...] - Atomix.@atomic u_global[js...] += w - elseif $skip === 2 # complex data (Z <: Complex) - is_tail = @ntuple($(D - 1), d -> i_{d + 1}) - js_tail = @ntuple($(D - 1), d -> j_{d + 1}) - ifirst = 2 * i_1 - 1 - jfirst = 2 * j_1 - 1 - w = u_local[ifirst, is_tail...] # real part - Atomix.@atomic u_global[jfirst, js_tail...] += w - w = u_local[ifirst + 1, is_tail...] # imaginary part - Atomix.@atomic u_global[jfirst + 1, js_tail...] += w - end - end, - ) - nothing - end - else - # Unlike the @generated case, here we parallelise over linear indices instead of - # separate Cartesian directions. Performance is very similar to the @generated case. - @assert T <: Real # for atomic operations - skip = sizeof(Z) ÷ sizeof(T) # 1 (Z real) or 2 (Z complex) - inds = CartesianIndices( - ntuple(Val(D)) do d - # The logical indices are 1:end÷skip in the first dimension. - d == 1 ? axes(u_local, 1)[1:end÷skip] : axes(u_local, d) - end - ) - offsets = ntuple(Val(D)) do n + @assert T <: Real # for atomic operations + inds = CartesianIndices(axes(u_local)) + offsets = ntuple(Val(D)) do d + @inline + local off = ishifts[d] - M + ifelse(off < 0, off + Ns[d], off) # make sure the offset is non-negative (to avoid some wrapping below) + end + @inbounds for n ∈ threadidx:nthreads:length(inds) + I = inds[n] + is = Tuple(I) + js = ntuple(Val(D)) do d @inline - local off = (block_index[n] - 1) * block_dims[n] - (M - 1) - ifelse(off < 0, off + Ns[n], off) # make sure the offset is non-negative (to avoid some wrapping below) + j = is[d] + offsets[d] + ifelse(j > Ns[d], j - Ns[d], j) end - @inbounds for n ∈ threadidx:nthreads:length(inds) - I = inds[n] - is = Tuple(I) - js = ntuple(Val(D)) do d - @inline - j = is[d] + offsets[d] - ifelse(j > Ns[d], j - Ns[d], j) - end - @inbounds if Z <: Real - w = u_local[is...] - Atomix.@atomic u_global[js...] 
+= w - elseif Z <: Complex - is_tail = Base.tail(is) - js_tail = Base.tail(js) - ifirst = 2 * is[1] - 1 - jfirst = 2 * js[1] - 1 - w = u_local[ifirst, is_tail...] # real part - Atomix.@atomic u_global[jfirst, js_tail...] += w - w = u_local[ifirst + 1, is_tail...] # imaginary part - Atomix.@atomic u_global[jfirst + 1, js_tail...] += w - end + @inbounds if Z <: Real + w = u_local[is...] + Atomix.@atomic u_global[js...] += w + elseif Z <: Complex + js_tail = Base.tail(js) + jfirst = 2 * js[1] - 1 + w = u_local[is...] + Atomix.@atomic u_global[jfirst, js_tail...] += real(w) + Atomix.@atomic u_global[jfirst + 1, js_tail...] += imag(w) end - nothing end + nothing end ## ==================================================================================================== ## From 4898e6ef9f1aedef97cdc584c57d67afafda3e03 Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Wed, 23 Oct 2024 14:02:04 +0200 Subject: [PATCH 22/33] Update interpolation based on spreading changes --- src/interpolation/gpu.jl | 87 +++++++++++++++++----------------------- src/spreading/gpu.jl | 1 - 2 files changed, 36 insertions(+), 52 deletions(-) diff --git a/src/interpolation/gpu.jl b/src/interpolation/gpu.jl index b55ed25..50c576b 100644 --- a/src/interpolation/gpu.jl +++ b/src/interpolation/gpu.jl @@ -266,18 +266,30 @@ end nthreads = prod(groupsize) end - threadidxs = @index(Local, NTuple) # in (1:nthreads_x, 1:nthreads_y, ...) threadidx = @index(Local, Linear) # in 1:nthreads block_index = @index(Group, NTuple) # workgroup index (= block index) block_n = @index(Group, Linear) # linear index of block u_local = @localmem(Z, shmem_size) # allocate shared memory + ishifts_sm = @localmem(Int, D) # shift between local and global array in each direction + + if threadidx == 1 + @inbounds for d ∈ 1:D + ishifts_sm[d] = (block_index[d] - 1) * block_dims[d] + 1 + end + end + + @synchronize + # Interpolate components one by one (to avoid using too much memory) for c ∈ 1:C # Copy grid data from global to shared memory M = Kernels.half_support(gs[1]) - gridvalues_to_local_memory!(u_local, us[c], Val(M), block_index, block_dims, threadidxs, groupsize) + gridvalues_to_local_memory!( + u_local, us[c], ishifts_sm, Val(M); + threadidx, nthreads, + ) @synchronize # make sure all threads have the same shared data @@ -294,13 +306,13 @@ end end # TODO: can we do this just once for all components? 
(if C > 1) - indvals = ntuple(Val(D)) do n + indvals = ntuple(Val(D)) do d @inline - x = @inbounds points[n][j] - gdata = Kernels.evaluate_kernel(gs[n], x) - local vals = gdata.values # kernel values - local ishift = (block_index[n] - 1) * block_dims[n] + 1 + x = @inbounds points[d][j] + gdata = Kernels.evaluate_kernel(gs[d], x) + ishift = ishifts_sm[d] local i₀ = gdata.i - ishift + local vals = gdata.values # kernel values # @assert i₀ ≥ 0 # @assert i₀ + 2M ≤ block_dims_padded[n] i₀ => vals @@ -319,54 +331,27 @@ end @inline function gridvalues_to_local_memory!( u_local::AbstractArray{T, D}, u_global::AbstractArray{T, D}, - ::Val{M}, block_index::Dims{D}, block_dims::Dims{D}, threadidxs::Dims{D}, groupsize::Dims{D}, + ishifts, + ::Val{M}; + threadidx, nthreads, ) where {T, D, M} - if @generated - quote - Ns = size(u_global) - inds = @ntuple($D, n -> axes(u_local, n)[threadidxs[n]:groupsize[n]:end]) - offsets = @ntuple( - $D, - n -> let - off = (block_index[n] - 1) * block_dims[n] - ($M - 1) - ifelse(off < 0, off + Ns[n], off) # make sure the offset is non-negative (to avoid some wrapping below) - end - ) - @nloops( - $D, i, - d -> inds[d], - d -> begin - j_d = offsets[d] + i_d - j_d = ifelse(j_d > Ns[d], j_d - Ns[d], j_d) - end, - begin - is = @ntuple($D, i) - js = @ntuple($D, j) - @inbounds u_local[is...] = u_global[js...] - end, - ) - nothing - end - else - Ns = size(u_global) - inds = ntuple(Val(D)) do n - axes(u_local, n)[threadidxs[n]:groupsize[n]:end] # this determines the parallelisation pattern - end - offsets = ntuple(Val(D)) do n + Ns = size(u_global) + inds = CartesianIndices(axes(u_local)) + offsets = ntuple(Val(D)) do d + @inline + off = ishifts[d] - M + ifelse(off < 0, off + Ns[d], off) # make sure the offset is non-negative (to avoid some wrapping below) + end + @inbounds for n ∈ threadidx:nthreads:length(inds) + is = Tuple(inds[n]) + js = ntuple(Val(D)) do d @inline - off = (block_index[n] - 1) * block_dims[n] - (M - 1) - ifelse(off < 0, off + Ns[n], off) # make sure the offset is non-negative (to avoid some wrapping below) + j = is[d] + offsets[d] + ifelse(j > Ns[d], j - Ns[d], j) end - @inbounds for is ∈ Iterators.product(inds...) - js = ntuple(Val(D)) do n - @inline - j = offsets[n] + is[n] - ifelse(j > Ns[n], j - Ns[n], j) - end - u_local[is...] = u_global[js...] - end - nothing + u_local[n] = u_global[js...] end + nothing end # Interpolate a single "component" (one transform at a time). diff --git a/src/spreading/gpu.jl b/src/spreading/gpu.jl index 99f9c96..91b9eb4 100644 --- a/src/spreading/gpu.jl +++ b/src/spreading/gpu.jl @@ -270,7 +270,6 @@ end block_n = @index(Group, Linear) # linear index of block block_index = @index(Group, NTuple) # workgroup index (= block index) - threadidxs = @index(Local, NTuple) # in (1:nthreads_x, 1:nthreads_y, ...) 
threadidx = @index(Local, Linear) # in 1:nthreads # Allocate static shared memory From df0690f4589c2dd52ae0d58283f3ad44a3ef59da Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Wed, 23 Oct 2024 14:46:01 +0200 Subject: [PATCH 23/33] Simplify setting workgroupsize --- src/interpolation/gpu.jl | 54 +++++++++++----------------------------- 1 file changed, 15 insertions(+), 39 deletions(-) diff --git a/src/interpolation/gpu.jl b/src/interpolation/gpu.jl index 50c576b..a070f1c 100644 --- a/src/interpolation/gpu.jl +++ b/src/interpolation/gpu.jl @@ -108,13 +108,11 @@ function interpolate!( shmem_size = block_dims_padded groupsize = groupsize_shmem(ngroups, block_dims_padded, length(x⃗s)) ndrange = groupsize .* ngroups - # TODO: - # - try out batches in interpolation? kernel! = interpolate_to_points_shmem_kernel!(backend, groupsize, ndrange) kernel!( vp_sorted, gs, xs_comp, us, pointperm_, bd.cumulative_npoints_per_block, - prefactor, block_dims, - Val(shmem_size), + prefactor, + block_dims, Val(shmem_size), ) end end @@ -130,41 +128,6 @@ function interpolate!( vp_all end -# Determine groupsize (number of threads per direction) in :shared_memory method. -# Here Np is the number of non-uniform points. -# The distribution of threads across directions mainly (only?) affects the copies between -# shared and global memories, so it can have an important influence of performance due to -# memory accesses. -# NOTE: this is also used in spreading -function groupsize_shmem(ngroups::NTuple{D}, shmem_size::NTuple{D}, Np) where {D} - # (1) Determine the total number of threads. - groupsize = 64 # minimum group size should be equal to the warp size (usually 32 on CUDA and 64 on AMDGPU) - # Increase groupsize if number of points is much larger (at least 4×) than the groupsize. - # Not sure this is a good idea if the point distribution is very inhomogeneous... - # TODO see if this improves performance in highly dense cases - # points_per_group = Np / prod(ngroups) # average number of points per block - # while groupsize < 1024 && 4 * groupsize ≤ points_per_group - # groupsize *= 2 - # end - # (2) Determine number of threads in each direction. - # This mainly affects the performance of global -> shared memory copies. - # It seems like it's better to parallelise the outer dimensions first. - gsizes = ntuple(_ -> 1, Val(D)) - p = 1 # product of sizes - i = D # parallelise outer dimensions first - while p < groupsize - if gsizes[i] < shmem_size[i] || i == 1 - gsizes = Base.setindex(gsizes, gsizes[i] * 2, i) - p *= 2 - else - @assert i > 1 - i -= 1 - end - end - @assert p == groupsize == prod(gsizes) - gsizes -end - @inline function interpolate_from_arrays_gpu( us::NTuple{C, AbstractArray{T, D}}, indvals::NTuple{D, <:Pair}, @@ -249,6 +212,19 @@ end ## ========================================================================================== ## ## Shared-memory implementation +# Note: this is also used in spreading. +function groupsize_shmem(ngroups::NTuple{D}, shmem_size::NTuple{D}, Np) where {D} + # (1) Determine the total number of threads. + # Not sure if it's worth it to define as a function of the inputs (currently unused). + # From tests, the value of 64 seems to be optimal in various situations. + groupsize = 64 # minimum group size should be equal to the warp size (usually 32 on CUDA and 64 on AMDGPU) + # (2) Determine number of threads in each direction. + # We don't really care about the Cartesian distribution of threads, since we always + # parallelise over linear indices. 
+ gsizes = ntuple(_ -> 1, Val(D)) + Base.setindex(gsizes, groupsize, 1) # = (groupsize, 1, 1, ...) +end + @kernel function interpolate_to_points_shmem_kernel!( vp::NTuple{C, AbstractVector{Z}}, @Const(gs::NTuple{D}), From 2b2872983a57fc3bd0d29c1a90cf367ddb06848e Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Wed, 23 Oct 2024 14:46:55 +0200 Subject: [PATCH 24/33] Minor changes --- src/interpolation/gpu.jl | 48 +++++++++++++++++----------------------- src/spreading/gpu.jl | 7 +++--- 2 files changed, 23 insertions(+), 32 deletions(-) diff --git a/src/interpolation/gpu.jl b/src/interpolation/gpu.jl index a070f1c..8bfb9d7 100644 --- a/src/interpolation/gpu.jl +++ b/src/interpolation/gpu.jl @@ -242,28 +242,22 @@ end nthreads = prod(groupsize) end - threadidx = @index(Local, Linear) # in 1:nthreads - block_index = @index(Group, NTuple) # workgroup index (= block index) block_n = @index(Group, Linear) # linear index of block + block_index = @index(Group, NTuple) # workgroup index (= block index) + threadidx = @index(Local, Linear) # in 1:nthreads u_local = @localmem(Z, shmem_size) # allocate shared memory - ishifts_sm = @localmem(Int, D) # shift between local and global array in each direction - - if threadidx == 1 - @inbounds for d ∈ 1:D - ishifts_sm[d] = (block_index[d] - 1) * block_dims[d] + 1 - end + ishifts = ntuple(Val(D)) do d + (block_index[d] - 1) * block_dims[d] + 1 end - @synchronize - # Interpolate components one by one (to avoid using too much memory) - for c ∈ 1:C + @inbounds for c ∈ 1:C # Copy grid data from global to shared memory M = Kernels.half_support(gs[1]) gridvalues_to_local_memory!( - u_local, us[c], ishifts_sm, Val(M); + u_local, us[c], ishifts, Val(M); threadidx, nthreads, ) @@ -286,15 +280,16 @@ end @inline x = @inbounds points[d][j] gdata = Kernels.evaluate_kernel(gs[d], x) - ishift = ishifts_sm[d] - local i₀ = gdata.i - ishift + local i₀ = gdata.i - ishifts[d] local vals = gdata.values # kernel values # @assert i₀ ≥ 0 # @assert i₀ + 2M ≤ block_dims_padded[n] i₀ => vals end - v = interpolate_from_arrays_shmem(u_local, indvals, prefactor) + inds_start = map(first, indvals) + window_vals = map(last, indvals) + v = interpolate_from_arrays_shmem(u_local, inds_start, window_vals, prefactor) @inbounds vp[c][j] = v end end @@ -333,23 +328,22 @@ end # Interpolate a single "component" (one transform at a time). # Here vp is a vector instead of a tuple of vectors. @inline function interpolate_from_arrays_shmem( - u_local::AbstractArray{T, D}, - indvals::NTuple{D}, + u_local::AbstractArray{Z, D}, + inds_start::NTuple{D}, + window_vals::NTuple{D}, prefactor, - ) where {T, D} + ) where {Z, D} if @generated gprod_init = Symbol(:gprod_, D + 1) # the name of this variable is important! quote $gprod_init = prefactor - v = zero(T) - inds_start = map(first, indvals) - vals = map(last, indvals) # evaluated kernel values in each direction - inds = map(eachindex, vals) # = (1:L, 1:L, ...) where L = 2M is the kernel width + v = zero(Z) + inds = map(eachindex, window_vals) # = (1:2M, 1:2M, ...) @nloops( $D, i, d -> inds[d], d -> begin - @inbounds gprod_d = gprod_{d + 1} * vals[d][i_d] + @inbounds gprod_d = gprod_{d + 1} * window_vals[d][i_d] @inbounds j_d = inds_start[d] + i_d end, begin @@ -360,12 +354,10 @@ end v end else - v = zero(T) - inds_start = map(first, indvals) - vals = map(last, indvals) # evaluated kernel values in each direction - inds = map(eachindex, vals) # = (1:L, 1:L, ...) 
where L = 2M is the kernel width + v = zero(Z) + inds = map(eachindex, window_vals) # = (1:2M, 1:2M, ...) @inbounds for I ∈ CartesianIndices(inds) - gprod = prefactor * prod(ntuple(d -> @inbounds(vals[d][I[d]]), Val(D))) + gprod = prefactor * prod(ntuple(d -> @inbounds(window_vals[d][I[d]]), Val(D))) js = inds_start .+ Tuple(I) v += gprod * u_local[js...] end diff --git a/src/spreading/gpu.jl b/src/spreading/gpu.jl index 91b9eb4..f4137ce 100644 --- a/src/spreading/gpu.jl +++ b/src/spreading/gpu.jl @@ -212,9 +212,8 @@ function spread_from_points!( ndrange = groupsize .* ngroups kernel! = spread_from_points_shmem_kernel!(backend, groupsize, ndrange) kernel!( - us_real, gs, xs_comp, vp_sorted, pointperm_, - bd.cumulative_npoints_per_block, HalfSupport(M), - block_dims, Val(shmem_size), bd.batch_size, + us_real, gs, xs_comp, vp_sorted, pointperm_, bd.cumulative_npoints_per_block, + HalfSupport(M), block_dims, Val(shmem_size), bd.batch_size, ) end end @@ -287,8 +286,8 @@ end buf_sm = @localmem(Int, 3) ishifts_sm = @localmem(Int, D) # shift between local and global array in each direction - # This block will take care of non-uniform points (a + 1):b if threadidx == 1 + # This block (workgroup) will take care of non-uniform points (a + 1):b @inbounds buf_sm[1] = cumulative_npoints_per_block[block_n] # = a @inbounds buf_sm[2] = cumulative_npoints_per_block[block_n + 1] # = b @inbounds for d ∈ 1:D From d0f439f097cc8988c677e7d4ebfdcbc12c4d4745 Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Wed, 23 Oct 2024 15:20:04 +0200 Subject: [PATCH 25/33] Fix CPU tests --- src/interpolation/gpu.jl | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/src/interpolation/gpu.jl b/src/interpolation/gpu.jl index 8bfb9d7..fd0e449 100644 --- a/src/interpolation/gpu.jl +++ b/src/interpolation/gpu.jl @@ -248,16 +248,25 @@ end u_local = @localmem(Z, shmem_size) # allocate shared memory - ishifts = ntuple(Val(D)) do d - (block_index[d] - 1) * block_dims[d] + 1 + # This needs to be in shared memory for CPU tests (otherwise it doesn't survive across + # @synchronize barriers). 
+ ishifts_sm = @localmem(Int, D) # shift between local and global array in each direction + + if threadidx == 1 + # This block (workgroup) will take care of non-uniform points (a + 1):b + @inbounds for d ∈ 1:D + ishifts_sm[d] = (block_index[d] - 1) * block_dims[d] + 1 + end end + @synchronize # synchronise ishifts_sm + # Interpolate components one by one (to avoid using too much memory) @inbounds for c ∈ 1:C # Copy grid data from global to shared memory M = Kernels.half_support(gs[1]) gridvalues_to_local_memory!( - u_local, us[c], ishifts, Val(M); + u_local, us[c], ishifts_sm, Val(M); threadidx, nthreads, ) @@ -280,7 +289,7 @@ end @inline x = @inbounds points[d][j] gdata = Kernels.evaluate_kernel(gs[d], x) - local i₀ = gdata.i - ishifts[d] + local i₀ = gdata.i - ishifts_sm[d] local vals = gdata.values # kernel values # @assert i₀ ≥ 0 # @assert i₀ + 2M ≤ block_dims_padded[n] From 2b0b4b6ff8b4a6dca30404e86a6dc3eab31ad1d5 Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Wed, 23 Oct 2024 15:23:03 +0200 Subject: [PATCH 26/33] Remove unused functions --- src/spreading/gpu.jl | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/src/spreading/gpu.jl b/src/spreading/gpu.jl index f4137ce..7b90fc1 100644 --- a/src/spreading/gpu.jl +++ b/src/spreading/gpu.jl @@ -34,9 +34,6 @@ end @inline spread_actual_dims(::Type{<:Real}, Ns) = Ns @inline spread_actual_dims(::Type{<:Complex}, Ns) = Base.setindex(Ns, Ns[1] >> 1, 1) # actual number of complex elements in first dimension -@inline spread_real_dims(::Type{<:Real}, Ns) = Ns -@inline spread_real_dims(::Type{<:Complex}, Ns) = Base.setindex(Ns, Ns[1] << 1, 1) - @inline function spread_onto_arrays_gpu!( us::NTuple{C, AbstractArray{T, D}}, indvals::NTuple{D, <:Pair}, @@ -122,22 +119,6 @@ end nothing end -# Same as _atomic_add!, but without the atomics. -@inline function _add_maybecomplex!(u::AbstractArray{T}, v::T, inds::Tuple) where {T <: Real} - @inbounds u[inds...] += v - nothing -end - -@inline function _add_maybecomplex!(u::AbstractArray{T}, v::Complex{T}, inds::Tuple) where {T <: Real} - @inbounds begin - i₁ = 2 * (inds[1] - 1) # convert from logical index (equivalent complex array) to memory index (real array) - itail = Base.tail(inds) - u[i₁ + 1, itail...] += real(v) - u[i₁ + 2, itail...] += imag(v) - end - nothing -end - # GPU implementation. # We assume all arrays are already on the GPU. function spread_from_points!( From 6bbeb8ee0458332ae6e9fa8cacdc0f7a6c58b405 Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Wed, 23 Oct 2024 15:35:53 +0200 Subject: [PATCH 27/33] Add tests for multiple transforms --- src/spreading/gpu.jl | 6 +++--- test/pseudo_gpu.jl | 23 +++++++++++++++-------- 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/src/spreading/gpu.jl b/src/spreading/gpu.jl index 7b90fc1..4490ca1 100644 --- a/src/spreading/gpu.jl +++ b/src/spreading/gpu.jl @@ -342,9 +342,9 @@ end ) end - if c < C - @synchronize # wait before jumping to next component - end + # Avoid resetting u_local too early in the next iteration (c -> c + 1). + # This is mostly useful when c < C (but putting an `if` fails...). 
+ @synchronize end nothing diff --git a/test/pseudo_gpu.jl b/test/pseudo_gpu.jl index 6244856..13df0a6 100644 --- a/test/pseudo_gpu.jl +++ b/test/pseudo_gpu.jl @@ -50,7 +50,7 @@ end # ================================================================================ # -function run_plan(p::PlanNUFFT, xp_init::AbstractArray, vp_init::AbstractVector) +function run_plan(p::PlanNUFFT, xp_init::AbstractArray, vp_init::NTuple{Nc, AbstractVector}) where {Nc} (; backend,) = p xp = adapt(backend, xp_init) @@ -78,32 +78,34 @@ function run_plan(p::PlanNUFFT, xp_init::AbstractArray, vp_init::AbstractVector) T = eltype(p) # this is actually defined in AbstractNFFTs; it represents the type in Fourier space (always complex) @test T <: Complex dims = size(p) - us = KA.allocate(backend, T, dims) + us = map(_ -> KA.allocate(backend, T, dims), vp) exec_type1!(us, p, vp) - wp = similar(vp) + wp = map(similar, vp) exec_type2!(wp, p, us) (; backend, us, wp,) end -function compare_with_cpu(::Type{T}, dims; Np = prod(dims), kws...) where {T <: Number} +function compare_with_cpu(::Type{T}, dims; Np = prod(dims), ntransforms::Val{Nc} = Val(1), kws...) where {T <: Number, Nc} # Generate some non-uniform random data on the CPU rng = Xoshiro(42) N = length(dims) # number of dimensions Tr = real(T) xp_init = [rand(rng, SVector{N, Tr}) * Tr(2π) for _ ∈ 1:Np] # non-uniform points in [0, 2π]ᵈ - vp_init = randn(rng, T, Np) # random values at points + vp_init = ntuple(_ -> randn(rng, T, Np), ntransforms) - params = (; m = HalfSupport(4), kernel = KaiserBesselKernel(), σ = 1.5, kws...) + params = (; m = HalfSupport(4), kernel = KaiserBesselKernel(), σ = 1.5, ntransforms, kws...) p_cpu = PlanNUFFT(T, dims; params..., backend = CPU()) p_gpu = PlanNUFFT(T, dims; params..., backend = PseudoGPU()) r_cpu = run_plan(p_cpu, xp_init, vp_init) r_gpu = run_plan(p_gpu, xp_init, vp_init) - @test r_cpu.us ≈ r_gpu.us # output of type-1 transform - @test r_cpu.wp ≈ r_gpu.wp # output of type-2 transform + for c ∈ 1:Nc + @test r_cpu.us[c] ≈ r_gpu.us[c] # output of type-1 transform + @test r_cpu.wp[c] ≈ r_gpu.wp[c] # output of type-2 transform + end nothing end @@ -123,4 +125,9 @@ end @testset "Float32" compare_with_cpu(Float32, dims; gpu_method = :shared_memory) @testset "ComplexF32" compare_with_cpu(ComplexF32, dims; gpu_method = :shared_memory) end + @testset "Multiple transforms" begin + ntransforms = Val(2) + @testset "Global memory" compare_with_cpu(Float32, dims; ntransforms, gpu_method = :global_memory) + @testset "Shared memory" compare_with_cpu(Float32, dims; ntransforms, gpu_method = :shared_memory) + end end From 7bc11bc06a687a152f114520f215dc715345ce03 Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Wed, 23 Oct 2024 15:49:09 +0200 Subject: [PATCH 28/33] Simplify atomic adds with complex data Doesn't change performance. 
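A standalone sketch of the underlying idea (illustrative only, not part of the diff below;
the toy array size and values are made up): a complex array can be viewed as a real array
with an extra leading dimension of size 2, so the real and imaginary parts are accumulated
with plain real-valued adds.

    u  = zeros(ComplexF32, 4, 4)
    ur = reinterpret(reshape, Float32, u)  # size (2, 4, 4): ur[1, i, j] = real part, ur[2, i, j] = imaginary part
    v  = 1.0f0 + 2.0f0im
    ur[1, 2, 3] += real(v)   # in the GPU kernel these two adds become Atomix.@atomic adds
    ur[2, 2, 3] += imag(v)
    @assert u[2, 3] == v     # the complex view sees the accumulated value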
--- src/spreading/gpu.jl | 41 ++++++++++++++--------------------------- 1 file changed, 14 insertions(+), 27 deletions(-) diff --git a/src/spreading/gpu.jl b/src/spreading/gpu.jl index 4490ca1..dbc58d1 100644 --- a/src/spreading/gpu.jl +++ b/src/spreading/gpu.jl @@ -21,7 +21,7 @@ Z = eltype(vp[1]) Ns_real = size(first(us)) # dimensions of raw input data @assert eltype(first(us)) <: Real # output is a real array (but may actually describe complex data) - Ns = spread_actual_dims(Z, Ns_real) # divides the Ns_real[1] by 2 if Z <: Complex + Ns = spread_actual_dims(Z, Ns_real) # drop the first dimension (of size 2) if Z <: Complex indvals = get_inds_vals_gpu(gs, points, Ns, j) @@ -32,10 +32,10 @@ end @inline spread_actual_dims(::Type{<:Real}, Ns) = Ns -@inline spread_actual_dims(::Type{<:Complex}, Ns) = Base.setindex(Ns, Ns[1] >> 1, 1) # actual number of complex elements in first dimension +@inline spread_actual_dims(::Type{<:Complex}, Ns) = Base.tail(Ns) # actual number of complex elements: drop first dimension @inline function spread_onto_arrays_gpu!( - us::NTuple{C, AbstractArray{T, D}}, + us::NTuple{C, AbstractArray{T}}, indvals::NTuple{D, <:Pair}, vs::NTuple{C}, Ns::Dims{D}, @@ -111,14 +111,15 @@ end # so the output must be a real array `u`. @inline function _atomic_add!(u::AbstractArray{T}, v::Complex{T}, inds::Tuple) where {T <: Real} @inbounds begin - i₁ = 2 * (inds[1] - 1) # convert from logical index (equivalent complex array) to memory index (real array) - itail = Base.tail(inds) - Atomix.@atomic u[i₁ + 1, itail...] += real(v) - Atomix.@atomic u[i₁ + 2, itail...] += imag(v) + Atomix.@atomic u[1, inds...] += real(v) + Atomix.@atomic u[2, inds...] += imag(v) end nothing end +to_real_array(u::AbstractArray{T}) where {T <: Real} = u +to_real_array(u::AbstractArray{T}) where {T <: Complex} = reinterpret(reshape, real(T), u) # adds an extra first dimension with size 2 + # GPU implementation. # We assume all arrays are already on the GPU. function spread_from_points!( @@ -138,14 +139,7 @@ function spread_from_points!( # Reinterpret `us` as real arrays, in case they are complex. # This is to avoid current issues with atomic operations on complex data # (https://github.com/JuliaGPU/KernelAbstractions.jl/issues/497). - Z = eltype(us[1]) - T = real(Z) - us_real = if Z <: Real - us - else # complex case - @assert Z <: Complex - map(u -> reinterpret(T, u), us) # note: we don't use reshape, so the first dimension has 2x elements - end + us_real = map(to_real_array, us) # doesn't change anything if data is already real (Z === T) pointperm = get_pointperm(bd) # nothing in case of NullBlockData sort_points = get_sort_points(bd)::StaticBool # False in the case of NullBlockData @@ -245,7 +239,7 @@ end # the actual dataset dimensions. @assert T === real(Z) # output is a real array (but may actually describe complex data) Ns_real = size(first(us)) # dimensions of raw input data - Ns = spread_actual_dims(Z, Ns_real) # divides the Ns_real[1] by 2 if Z <: Complex + Ns = spread_actual_dims(Z, Ns_real) # drop the first dimension (of size 2) if Z <: Complex end block_n = @index(Group, Linear) # linear index of block @@ -376,7 +370,7 @@ end # Add values from shared to global memory. 
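# Indexing example (hypothetical values, for illustration only): with Ns[d] = 64, M = 4 and
# ishifts[d] = 1, the offset computed below is 1 - 4 = -3, which wraps to 61; local indices
# is[d] = 1, 2, 3, 4 then map to global indices j = 62, 63, 64, 1 (periodic wrapping).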
@inline function add_from_local_to_global_memory!( - u_global::AbstractArray{T, D}, # actual data in global array is always real + u_global::AbstractArray{T}, # actual data in global array is always real (and may have an extra first dimension) u_local::AbstractArray{Z, D}, # shared-memory array can be complex Ns::Dims{D}, ishifts, @@ -384,6 +378,7 @@ end threadidx, nthreads, ) where {Z, T, D, M} @assert T <: Real # for atomic operations + # @assert ndims(u_global) === D + (Z <: Complex) # extra dimension if data is complex inds = CartesianIndices(axes(u_local)) offsets = ntuple(Val(D)) do d @inline @@ -398,16 +393,8 @@ end j = is[d] + offsets[d] ifelse(j > Ns[d], j - Ns[d], j) end - @inbounds if Z <: Real - w = u_local[is...] - Atomix.@atomic u_global[js...] += w - elseif Z <: Complex - js_tail = Base.tail(js) - jfirst = 2 * js[1] - 1 - w = u_local[is...] - Atomix.@atomic u_global[jfirst, js_tail...] += real(w) - Atomix.@atomic u_global[jfirst + 1, js_tail...] += imag(w) - end + w = u_local[is...] + _atomic_add!(u_global, w, js) end nothing end From 05edddf3bc87c22729c06f0e68e585b34631f430 Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Wed, 23 Oct 2024 16:35:39 +0200 Subject: [PATCH 29/33] =?UTF-8?q?point=5Fto=5Fcell=20now=20also=20returns?= =?UTF-8?q?=20x/=CE=94x?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/Kernels/Kernels.jl | 3 ++- src/Kernels/bspline.jl | 4 ++-- src/Kernels/gaussian.jl | 2 +- src/Kernels/kaiser_bessel.jl | 6 +++--- src/Kernels/kaiser_bessel_backwards.jl | 6 +++--- src/blocking.jl | 6 +++--- test/near_2pi.jl | 8 ++++---- 7 files changed, 18 insertions(+), 17 deletions(-) diff --git a/src/Kernels/Kernels.jl b/src/Kernels/Kernels.jl index 1e6155b..412f97b 100644 --- a/src/Kernels/Kernels.jl +++ b/src/Kernels/Kernels.jl @@ -71,6 +71,7 @@ function init_fourier_coefficients!(g::AbstractKernelData, ks::AbstractVector) end # Assign a cell index to a location `x`. This assumes 0 ≤ x < 2π. +# This also returns x / Δx to avoid recomputing it later. @inline function point_to_cell(x, Δx) r = x / Δx i = unsafe_trunc(Int, r) # assumes r ≥ 0 @@ -78,7 +79,7 @@ end # is very close (but slightly smaller) to i * Δx. i += (i * Δx ≤ x) # this is almost always true (so we increment by 1) # @assert (i - 1) * Δx ≤ x < i * Δx - i + i, r end # Note: evaluate_kernel_func generates a function which is callable from GPU kernels. diff --git a/src/Kernels/bspline.jl b/src/Kernels/bspline.jl index b934056..74378e1 100644 --- a/src/Kernels/bspline.jl +++ b/src/Kernels/bspline.jl @@ -98,8 +98,8 @@ function evaluate_kernel_func(g::BSplineKernelData{M}) where {M} # This can be shown using the partition of unity property. 
(; Δt,) = g function (x) - i = point_to_cell(x, Δt) # assume Δx = Δt - x′ = i - (x / Δt) # normalised coordinate, 0 < x′ ≤ 1 (this assumes Δx = Δt) + i, r = point_to_cell(x, Δt) # assume Δx = Δt + x′ = i - r # normalised coordinate, 0 < x′ ≤ 1 (this assumes Δx = Δt) # @assert 0 ≤ x′ ≤ 1 k = 2M # B-spline order values = bsplines_evaluate_all(x′, Val(k), typeof(Δt)) diff --git a/src/Kernels/gaussian.jl b/src/Kernels/gaussian.jl index a585265..9fcc326 100644 --- a/src/Kernels/gaussian.jl +++ b/src/Kernels/gaussian.jl @@ -121,7 +121,7 @@ end function evaluate_kernel_func(g::GaussianKernelData{M}) where {M} (; τ, Δx, cs,) = g @inline @fastmath function (x) - i = point_to_cell(x, Δx) + i, r = point_to_cell(x, Δx) X = x - (i - 1) * Δx # source position relative to xs[i] # @assert 0 ≤ X < i * Δx a = exp(-X^2 / τ) diff --git a/src/Kernels/kaiser_bessel.jl b/src/Kernels/kaiser_bessel.jl index b08da9f..997f169 100644 --- a/src/Kernels/kaiser_bessel.jl +++ b/src/Kernels/kaiser_bessel.jl @@ -163,10 +163,10 @@ function evaluate_fourier_func(g::KaiserBesselKernelData) end function evaluate_kernel_func(g::KaiserBesselKernelData{M, T}) where {M, T} - (; w, Δx, cs,) = g + (; Δx, cs,) = g function (x) - i = point_to_cell(x, Δx) - X = x / w - T(i - 1) / M # source position relative to xs[i] + i, r = point_to_cell(x, Δx) + X = (r - T(i - 1)) / M # source position relative to xs[i] # @assert 0 ≤ X < 1 / M values = evaluate_piecewise(X, cs) (; i, values,) diff --git a/src/Kernels/kaiser_bessel_backwards.jl b/src/Kernels/kaiser_bessel_backwards.jl index e894e1a..e6d2fd3 100644 --- a/src/Kernels/kaiser_bessel_backwards.jl +++ b/src/Kernels/kaiser_bessel_backwards.jl @@ -134,10 +134,10 @@ function evaluate_fourier_func(g::BackwardsKaiserBesselKernelData) end function evaluate_kernel_func(g::BackwardsKaiserBesselKernelData{M, T}) where {M, T} - (; w, Δx, cs,) = g + (; Δx, cs,) = g function (x) - i = point_to_cell(x, Δx) - X = x / w - T(i - 1) / M # source position relative to xs[i] + i, r = point_to_cell(x, Δx) # r = x / Δx + X = (r - T(i - 1)) / M # source position relative to xs[i] # @assert 0 ≤ X < 1 / M values = evaluate_piecewise(X, cs) (; i, values,) diff --git a/src/blocking.jl b/src/blocking.jl index e1db4c4..cb8e070 100644 --- a/src/blocking.jl +++ b/src/blocking.jl @@ -245,7 +245,7 @@ end # errors) due to numerical accuracy issues. So we first obtain the index on the # Δx grid, then we translate that to a block index by dividing by the block # dimensions (= how many Δx's in a single block). - i = Kernels.point_to_cell(x⃗[n], Δxs[n]) # index of grid cell where point is located + i, r = Kernels.point_to_cell(x⃗[n], Δxs[n]) # index of grid cell where point is located cld(i, block_dims[n]) # index of block where point is located end @inbounds LinearIndices(nblocks_per_dir)[is...] @@ -410,7 +410,7 @@ function set_points_impl!( @timeit timer "(1) Assign blocks" @inbounds for (i, x⃗) ∈ pairs(xp) # Get index of block where point x⃗ is located. y⃗ = to_unit_cell(transform(NTuple{N}(x⃗))) # converts `x⃗` to Tuple if it's an SVector - is = map(Kernels.point_to_cell, y⃗, block_sizes) + is = map(first ∘ Kernels.point_to_cell, y⃗, block_sizes) # we use first((i, r)) -> i if bd.sort_points === False() points[i] = y⃗ # copy folded point (doesn't need to be sorted) end @@ -440,7 +440,7 @@ function set_points_impl!( # We recompute the block index associated to this point. 
x⃗ = xp[i] y⃗ = to_unit_cell(transform(NTuple{N}(x⃗))) # converts `x⃗` to Tuple if it's an SVector - is = map(Kernels.point_to_cell, y⃗, block_sizes) + is = map(first ∘ Kernels.point_to_cell, y⃗, block_sizes) # we use first((i, r)) -> i n = to_linear_index[is...] # linear index of block j = cumulative_npoints_per_block[n] + blockidx[i] pointperm[j] = i diff --git a/test/near_2pi.jl b/test/near_2pi.jl index 4605ded..0b706c4 100644 --- a/test/near_2pi.jl +++ b/test/near_2pi.jl @@ -43,13 +43,13 @@ end @testset "Kernels.point_to_cell" begin Δx = 2π / 3 let x = prevfloat(L / 3) - @test Kernels.point_to_cell(x, Δx) == 1 + @test Kernels.point_to_cell(x, Δx)[1] == 1 end let x = prevfloat(2 * L / 3) - @test Kernels.point_to_cell(x, Δx) == 2 + @test Kernels.point_to_cell(x, Δx)[1] == 2 end let x = prevfloat(L) - @test Kernels.point_to_cell(x, Δx) == 3 + @test Kernels.point_to_cell(x, Δx)[1] == 3 end end end @@ -65,7 +65,7 @@ end @testset "Kernels.point_to_cell" begin Δx = T(2π) / 24 - i = Kernels.point_to_cell(x, Δx) + i, r = Kernels.point_to_cell(x, Δx) @test (i - 1) * Δx ≤ x < i * Δx end From 1f6c02a0e5cbe14fdecb45be5e00267bae772141 Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Wed, 23 Oct 2024 16:45:15 +0200 Subject: [PATCH 30/33] Remove direct evaluation functions (for now) --- src/Kernels/kaiser_bessel_backwards.jl | 35 -------------------------- 1 file changed, 35 deletions(-) diff --git a/src/Kernels/kaiser_bessel_backwards.jl b/src/Kernels/kaiser_bessel_backwards.jl index e6d2fd3..e741bbd 100644 --- a/src/Kernels/kaiser_bessel_backwards.jl +++ b/src/Kernels/kaiser_bessel_backwards.jl @@ -143,38 +143,3 @@ function evaluate_kernel_func(g::BackwardsKaiserBesselKernelData{M, T}) where {M (; i, values,) end end - -# TODO: remove? -function evaluate_kernel_direct( - g::BackwardsKaiserBesselKernelData{M, T}, i::Integer, x::T, - ) where {M, T} - (; Δx, β,) = g - # i = point_to_cell(x, Δx) - X = x / Δx - T(i - 1) # source position relative to xs[i] - # @assert 0 ≤ X < 1 - Xc = 2 * X - 1 # in [-1, 1) - L = 2M - ntuple(Val(L)) do j - h = 1 - 2 * (j - one(T)/2) / L # midpoint of interval - δ = 1 / L # half-width of interval - y = h + Xc * δ - @fastmath s = sqrt(1 - y^2) - @fastmath sinh(β * s)::T / (s * T(π)) - end -end - -function evaluate_kernel_direct( - g::BackwardsKaiserBesselKernelData{M, T}, i::Integer, n::Integer, x::T, - ) where {M, T} - (; Δx, β,) = g - # @assert 1 ≤ n ≤ 2M - X = x / Δx - T(i - 1) # source position relative to xs[i] - # @assert 0 ≤ X < 1 - Xc = 2 * X - 1 # in [-1, 1) - L = 2M - h = 1 - 2 * (n - one(T)/2) / L # midpoint of interval - δ = 1 / L # half-width of interval - y = h + Xc * δ - @fastmath s = sqrt(1 - y^2) - @fastmath sinh(β * s)::T / (s * T(π)) -end From e8ea63d76158af331eea6afd1f0de9b975990a4a Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Wed, 23 Oct 2024 17:00:03 +0200 Subject: [PATCH 31/33] Update CHANGELOG [skip ci] --- CHANGELOG.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1c8485d..b2c04c1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## Unreleased +### Added + +- Add shared-memory implementation of GPU transforms. + They are disabled by default, and can be enabled by passing `gpu_method = :shared_memory` + (default is `:global_memory`). 
+ ## [v0.5.6](https://github.com/jipolanco/NonuniformFFTs.jl/releases/tag/v0.5.6) - 2024-10-17 ### Changed From 92356d31bd8761d8696a38861335f73fb203a0a1 Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Wed, 23 Oct 2024 17:09:19 +0200 Subject: [PATCH 32/33] Add documentation --- src/plan.jl | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/src/plan.jl b/src/plan.jl index 09b3b3a..01c1674 100644 --- a/src/plan.jl +++ b/src/plan.jl @@ -98,6 +98,7 @@ The created plan contains all data needed to perform NUFFTs for non-uniform data The current default is 4096 on the CPU and around 1024 on the GPU (but depends on the number of dimensions). These may change in the future or even depend on the actual computing device. On the CPU, using block partitioning is required for running with multiple threads. + On the GPU, this option is ignored if `gpu_method = :shared_memory`. Blocking / spatial sorting can be completely disabled by passing `block_size = nothing` (but this is generally slower). @@ -108,6 +109,29 @@ The created plan contains all data needed to perform NUFFTs for non-uniform data This can improve performance if executing multiple transforms on the same non-uniform points. Note that, even when enabled, this does not modify the `points` argument passed to `set_points!`. +- `gpu_method`: allows to select between two different implementations of + GPU transforms. Possible options are: + + * `:global_memory`: directly read and write onto arrays in global memory in spreading + (type-1) and interpolation (type-2) operations; + + * `:shared_memory`: copy data between global memory and faster shared memory (memory + shared between threads/work items within a single GPU workgroup) and perform most + operations in shared-memory, which is faster and can avoid some atomic operations in + type-1 transforms. + We try to use as much shared memory as is typically available on current GPUs (which is + not much, typically 48 KiB). + But still, this method can be much faster than the `:global_memory`, and may become the + default in the future. + Note that this method completely ignores the `block_size` parameter, as the block size is adjusted + to maximise shared memory usage. + See also the `gpu_batch_size` parameter below. + + The default is currently `:global_memory` but this may change in the future. + +- `gpu_batch_size = Val(16)`: batch size used in type-1 transforms when `gpu_method = :shared_memory`. + This can have a big impact on performance. + ## Other parameters - `fftw_flags = FFTW.MEASURE`: parameters passed to the FFTW planner (only used when `backend = CPU()`). @@ -309,7 +333,7 @@ function _PlanNUFFT( block_size::Union{Integer, Dims{D}, Nothing} = default_block_size(Ns, backend), synchronise::Bool = false, gpu_method::Symbol = :global_memory, - batch_size::Val = Val(16), # currently only used in shared-memory GPU spreading + gpu_batch_size::Val = Val(16), # currently only used in shared-memory GPU spreading ) where {T <: Number, D} ks = init_wavenumbers(T, Ns) # Determine dimensions of oversampled grid. 
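For reference, a hypothetical usage of the options documented above (a sketch only: it
assumes CUDA.jl is loaded so that `CUDABackend()` is available; the remaining keywords are
the ones already used elsewhere in this package, e.g. in the tests):

    using NonuniformFFTs, CUDA

    p = PlanNUFFT(
        ComplexF64, (256, 256, 256);
        m = HalfSupport(4), σ = 1.5,
        backend = CUDABackend(),       # run transforms on the GPU
        gpu_method = :shared_memory,   # default is :global_memory
        gpu_batch_size = Val(16),      # only used by the shared-memory type-1 kernels
    )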
@@ -344,7 +368,7 @@ function _PlanNUFFT( else block_dims = get_block_dims(Ñs, block_size) if backend isa GPU - blocks = BlockDataGPU(T, backend, block_dims, Ñs, h, sort_points; method = gpu_method, batch_size,) + blocks = BlockDataGPU(T, backend, block_dims, Ñs, h, sort_points; method = gpu_method, batch_size = gpu_batch_size,) else blocks = BlockData(T, block_dims, Ñs, h, num_transforms, sort_points) FFTW.set_num_threads(Threads.nthreads()) From c9311e05da638fa9627a70c5bc749d99f0280aaf Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Wed, 23 Oct 2024 21:25:52 +0200 Subject: [PATCH 33/33] Update docs and comments --- CHANGELOG.md | 4 ++-- src/blocking.jl | 9 ++++----- src/interpolation/gpu.jl | 1 - src/plan.jl | 28 ++++++++++++++-------------- src/spreading/gpu.jl | 22 +++++++++++++++++----- 5 files changed, 37 insertions(+), 27 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b2c04c1..3a449c7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,8 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added -- Add shared-memory implementation of GPU transforms. - They are disabled by default, and can be enabled by passing `gpu_method = :shared_memory` +- Add alternative implementation of GPU transforms based on shared-memory arrays. + This is disabled by default, and can be enabled by passing `gpu_method = :shared_memory` (default is `:global_memory`). ## [v0.5.6](https://github.com/jipolanco/NonuniformFFTs.jl/releases/tag/v0.5.6) - 2024-10-17 diff --git a/src/blocking.jl b/src/blocking.jl index cb8e070..df8ee49 100644 --- a/src/blocking.jl +++ b/src/blocking.jl @@ -63,14 +63,14 @@ type_length(::Type{<:NTuple{N}}) where {N} = N # ================================================================================ # # Determine block size if using the shared-memory implementation. -# We try to make sure that the total block size (including 2M ghost cells in each direction) +# We try to make sure that the total block size (including 2M - 1 ghost cells in each direction) # is not larger than the available shared memory. In CUDA the limit is usually 48 KiB. # Note that the result is a compile-time constant (on Julia 1.11.1 at least). @inline function block_dims_gpu_shmem(::Type{Z}, ::Dims{D}, ::HalfSupport{M}, ::Val{Np}) where {Z <: Number, D, M, Np} T = real(Z) # These are extra shared-memory needs in spreading kernel (see spread_from_points_shmem_kernel!). - # Here Np is the batch size (batch_size parameter). - shmem_needs = sizeof(T) * ( + # Here Np is the batch size (gpu_batch_size parameter). + base_shmem_required = sizeof(T) * ( 2M * D * Np + # window_vals D * Np + # points_sm D * Np # inds_start @@ -82,8 +82,7 @@ type_length(::Type{<:NTuple{N}}) where {N} = N 3 + # buf_sm D # ishifts_sm ) - free_shmem = shmem_needs # how much memory to leave free for other allocations - max_shmem_size = (48 << 10) - free_shmem # 48 KiB -- TODO: make this depend on the actual GPU? + max_shmem_size = (48 << 10) - base_shmem_required # 48 KiB -- TODO: make this depend on the actual GPU? max_block_length = max_shmem_size ÷ sizeof(Z) # maximum number of elements in a block m = floor(Int, max_block_length^(1/D)) # block size in each direction (including ghost cells / padding) n = m - (2M - 1) # exclude ghost cells diff --git a/src/interpolation/gpu.jl b/src/interpolation/gpu.jl index fd0e449..bf9722a 100644 --- a/src/interpolation/gpu.jl +++ b/src/interpolation/gpu.jl @@ -307,7 +307,6 @@ end end # Copy values from global to shared memory. 
-# - can we optimise memory access patterns? @inline function gridvalues_to_local_memory!( u_local::AbstractArray{T, D}, u_global::AbstractArray{T, D}, diff --git a/src/plan.jl b/src/plan.jl index 01c1674..3b71c78 100644 --- a/src/plan.jl +++ b/src/plan.jl @@ -109,28 +109,28 @@ The created plan contains all data needed to perform NUFFTs for non-uniform data This can improve performance if executing multiple transforms on the same non-uniform points. Note that, even when enabled, this does not modify the `points` argument passed to `set_points!`. -- `gpu_method`: allows to select between two different implementations of +- `gpu_method`: allows to select between different implementations of GPU transforms. Possible options are: * `:global_memory`: directly read and write onto arrays in global memory in spreading (type-1) and interpolation (type-2) operations; - * `:shared_memory`: copy data between global memory and faster shared memory (memory - shared between threads/work items within a single GPU workgroup) and perform most - operations in shared-memory, which is faster and can avoid some atomic operations in - type-1 transforms. - We try to use as much shared memory as is typically available on current GPUs (which is - not much, typically 48 KiB). - But still, this method can be much faster than the `:global_memory`, and may become the - default in the future. - Note that this method completely ignores the `block_size` parameter, as the block size is adjusted - to maximise shared memory usage. - See also the `gpu_batch_size` parameter below. + * `:shared_memory`: copy data between global memory and shared memory (local + to each GPU workgroup) and perform most operations in the latter, which is faster and + can help avoid some atomic operations in type-1 transforms. We try to use as much shared + memory as is typically available on current GPUs (which is not much, typically 48 KiB on + CUDA). But still, this method can be much faster than the `:global_memory` and may + become the default in the future. Note that this method completely ignores the + `block_size` parameter, as the actual block size is adjusted to maximise shared memory + usage. The performance of type-1 transforms can be further adjusted using the + `gpu_batch_size` parameter described below. - The default is currently `:global_memory` but this may change in the future. + The default is `:global_memory` but this may change in the future. - `gpu_batch_size = Val(16)`: batch size used in type-1 transforms when `gpu_method = :shared_memory`. - This can have a big impact on performance. + The idea is that, to avoid atomic operations on shared-memory arrays, we process + non-uniform points in batches of `Np` points (16 by default). + This can have a large impact on performance. ## Other parameters diff --git a/src/spreading/gpu.jl b/src/spreading/gpu.jl index dbc58d1..4545528 100644 --- a/src/spreading/gpu.jl +++ b/src/spreading/gpu.jl @@ -279,12 +279,22 @@ end @synchronize # make sure shared memory is fully set to zero + # We operate in batches of up to `Np` non-uniform points / sources. + # The aim is to completely avoid atomic operations on shared memory: instead of + # parallelising over non-uniform points (1 point = 1 thread), we make all threads + # work on the same set of points, so that, in the end, all threads have the required + # information to spread point values onto `u_local`. 
That spreading operation is + # done by parallelising over the local grid of dimensions M^D (so that every thread + # writes to a different part of the array), instead of parallelising over the + # non-uniform points which would require atomics. + # The first batch deals with points (a + 1):min(a + Np, b) @inbounds for batch_begin in buf_sm[1]:Np:(buf_sm[2] - 1) batch_size = min(Np, buf_sm[2] - batch_begin) # current batch size buf_sm[3] = batch_size - # Iterate over points in the batch (ideally 1 thread per point). - # Each thread writes to shared memory. + + # (1) Iterate over points in the batch. + # Each active thread writes to shared memory. @inbounds for p in threadidx:nthreads:batch_size # Spread from point j i = batch_begin + p # index of non-uniform point @@ -301,8 +311,8 @@ end @synchronize - # Now evaluate windows associated to each point. - local inds = CartesianIndices((1:buf_sm[3], 1:D)) # parallelise over dimensions + points + # (2) Evaluate window functions around each non-uniform point. + inds = CartesianIndices((1:buf_sm[3], 1:D)) # parallelise over dimensions + points @inbounds for n in threadidx:nthreads:length(inds) p, d = Tuple(inds[n]) g = gs[d] @@ -318,7 +328,8 @@ end @synchronize # make sure all threads have the same shared data - # Now all threads spread together onto shared memory + # (3) All threads spread together onto shared memory, avoiding all collisions + # and thus not requiring atomic operations. @inbounds for p in 1:buf_sm[3] local istart = ntuple(d -> @inbounds(inds_start[d, p]), Val(D)) local v = vp_sm[p] @@ -329,6 +340,7 @@ end @synchronize # make sure we have finished writing to shared memory + # Add values from shared memory onto global memory. @inbounds if buf_sm[1] < buf_sm[2] # skip this step if there were actually no points in this block (if a == b) add_from_local_to_global_memory!( us[c], u_local, Ns, ishifts_sm, Val(M);