diff --git a/CHANGELOG.md b/CHANGELOG.md index 1c8485d..3a449c7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## Unreleased +### Added + +- Add alternative implementation of GPU transforms based on shared-memory arrays. + This is disabled by default, and can be enabled by passing `gpu_method = :shared_memory` + (default is `:global_memory`). + ## [v0.5.6](https://github.com/jipolanco/NonuniformFFTs.jl/releases/tag/v0.5.6) - 2024-10-17 ### Changed diff --git a/src/Kernels/Kernels.jl b/src/Kernels/Kernels.jl index 1e6155b..412f97b 100644 --- a/src/Kernels/Kernels.jl +++ b/src/Kernels/Kernels.jl @@ -71,6 +71,7 @@ function init_fourier_coefficients!(g::AbstractKernelData, ks::AbstractVector) end # Assign a cell index to a location `x`. This assumes 0 ≤ x < 2π. +# This also returns x / Δx to avoid recomputing it later. @inline function point_to_cell(x, Δx) r = x / Δx i = unsafe_trunc(Int, r) # assumes r ≥ 0 @@ -78,7 +79,7 @@ end # is very close (but slightly smaller) to i * Δx. i += (i * Δx ≤ x) # this is almost always true (so we increment by 1) # @assert (i - 1) * Δx ≤ x < i * Δx - i + i, r end # Note: evaluate_kernel_func generates a function which is callable from GPU kernels. diff --git a/src/Kernels/bspline.jl b/src/Kernels/bspline.jl index b934056..74378e1 100644 --- a/src/Kernels/bspline.jl +++ b/src/Kernels/bspline.jl @@ -98,8 +98,8 @@ function evaluate_kernel_func(g::BSplineKernelData{M}) where {M} # This can be shown using the partition of unity property. (; Δt,) = g function (x) - i = point_to_cell(x, Δt) # assume Δx = Δt - x′ = i - (x / Δt) # normalised coordinate, 0 < x′ ≤ 1 (this assumes Δx = Δt) + i, r = point_to_cell(x, Δt) # assume Δx = Δt + x′ = i - r # normalised coordinate, 0 < x′ ≤ 1 (this assumes Δx = Δt) # @assert 0 ≤ x′ ≤ 1 k = 2M # B-spline order values = bsplines_evaluate_all(x′, Val(k), typeof(Δt)) diff --git a/src/Kernels/gaussian.jl b/src/Kernels/gaussian.jl index a585265..9fcc326 100644 --- a/src/Kernels/gaussian.jl +++ b/src/Kernels/gaussian.jl @@ -121,7 +121,7 @@ end function evaluate_kernel_func(g::GaussianKernelData{M}) where {M} (; τ, Δx, cs,) = g @inline @fastmath function (x) - i = point_to_cell(x, Δx) + i, r = point_to_cell(x, Δx) X = x - (i - 1) * Δx # source position relative to xs[i] # @assert 0 ≤ X < i * Δx a = exp(-X^2 / τ) diff --git a/src/Kernels/kaiser_bessel.jl b/src/Kernels/kaiser_bessel.jl index b08da9f..997f169 100644 --- a/src/Kernels/kaiser_bessel.jl +++ b/src/Kernels/kaiser_bessel.jl @@ -163,10 +163,10 @@ function evaluate_fourier_func(g::KaiserBesselKernelData) end function evaluate_kernel_func(g::KaiserBesselKernelData{M, T}) where {M, T} - (; w, Δx, cs,) = g + (; Δx, cs,) = g function (x) - i = point_to_cell(x, Δx) - X = x / w - T(i - 1) / M # source position relative to xs[i] + i, r = point_to_cell(x, Δx) + X = (r - T(i - 1)) / M # source position relative to xs[i] # @assert 0 ≤ X < 1 / M values = evaluate_piecewise(X, cs) (; i, values,) diff --git a/src/Kernels/kaiser_bessel_backwards.jl b/src/Kernels/kaiser_bessel_backwards.jl index 45129cd..e741bbd 100644 --- a/src/Kernels/kaiser_bessel_backwards.jl +++ b/src/Kernels/kaiser_bessel_backwards.jl @@ -134,10 +134,10 @@ function evaluate_fourier_func(g::BackwardsKaiserBesselKernelData) end function evaluate_kernel_func(g::BackwardsKaiserBesselKernelData{M, T}) where {M, T} - (; w, Δx, cs,) = g + (; Δx, cs,) = g function (x) - i = point_to_cell(x, Δx) - X = x / w - T(i - 1) / M # source position 
relative to xs[i] + i, r = point_to_cell(x, Δx) # r = x / Δx + X = (r - T(i - 1)) / M # source position relative to xs[i] # @assert 0 ≤ X < 1 / M values = evaluate_piecewise(X, cs) (; i, values,) diff --git a/src/NonuniformFFTs.jl b/src/NonuniformFFTs.jl index a432663..eeaf20f 100644 --- a/src/NonuniformFFTs.jl +++ b/src/NonuniformFFTs.jl @@ -5,7 +5,8 @@ using AbstractNFFTs: AbstractNFFTs, AbstractNFFTPlan using GPUArraysCore: AbstractGPUArray, AbstractGPUVector using StructArrays: StructArrays, StructVector using AbstractFFTs: AbstractFFTs -using KernelAbstractions: KernelAbstractions as KA, CPU, GPU, @kernel, @index, @Const +using KernelAbstractions: KernelAbstractions as KA, CPU, GPU, @kernel, @index, @Const, + @groupsize, @localmem, @synchronize, @uniform using Atomix: Atomix using FFTW: FFTW using LinearAlgebra: mul! diff --git a/src/blocking.jl b/src/blocking.jl index ad143af..df8ee49 100644 --- a/src/blocking.jl +++ b/src/blocking.jl @@ -7,6 +7,7 @@ with_blocking(::NullBlockData) = false # For now these are only used in the GPU implementation get_pointperm(::NullBlockData) = nothing get_sort_points(::NullBlockData) = False() +gpu_method(::NullBlockData) = :global_memory @kernel function copy_points_unblocked_kernel!(@Const(transform::F), points::NTuple, @Const(xp)) where {F} I = @index(Global, Linear) @@ -61,43 +62,100 @@ type_length(::Type{<:NTuple{N}}) where {N} = N # ================================================================================ # +# Determine block size if using the shared-memory implementation. +# We try to make sure that the total block size (including 2M - 1 ghost cells in each direction) +# is not larger than the available shared memory. In CUDA the limit is usually 48 KiB. +# Note that the result is a compile-time constant (on Julia 1.11.1 at least). +@inline function block_dims_gpu_shmem(::Type{Z}, ::Dims{D}, ::HalfSupport{M}, ::Val{Np}) where {Z <: Number, D, M, Np} + T = real(Z) + # These are extra shared-memory needs in spreading kernel (see spread_from_points_shmem_kernel!). + # Here Np is the batch size (gpu_batch_size parameter). + base_shmem_required = sizeof(T) * ( + 2M * D * Np + # window_vals + D * Np + # points_sm + D * Np # inds_start + ) + + sizeof(Z) * ( + Np # vp_sm + ) + + sizeof(Int) * ( + 3 + # buf_sm + D # ishifts_sm + ) + max_shmem_size = (48 << 10) - base_shmem_required # 48 KiB -- TODO: make this depend on the actual GPU? + max_block_length = max_shmem_size ÷ sizeof(Z) # maximum number of elements in a block + m = floor(Int, max_block_length^(1/D)) # block size in each direction (including ghost cells / padding) + n = m - (2M - 1) # exclude ghost cells + if n ≤ 0 + throw(ArgumentError( + lazy""" + GPU shared memory size is too small for the chosen problem: + - element type: $Z + - half-support: $M + - number of dimensions: $D + If possible, reduce some of these parameters, or else switch to gpu_method = :global_memory.""" + )) + # TODO: warning if n is too small? (likely slower than global_memory method) + # What is small? n < M? + end + ntuple(_ -> n, Val(D)) # = (n, n, ...) +end + # GPU implementation struct BlockDataGPU{ N, I <: Integer, T <: AbstractFloat, IndexVector <: AbstractVector{I}, SortPoints <: StaticBool, + Np, } <: AbstractBlockData + method :: Symbol # method used for spreading and interpolation; options: (1) :global_memory (2) :shared_memory + Δxs :: NTuple{N, T} # grid step; the size of a block in units of length is block_dims .* Δxs. 
nblocks_per_dir :: NTuple{N, I} # number of blocks in each direction - block_sizes :: NTuple{N, T} # size of each block (in units of length) + block_dims :: NTuple{N, I} # maximum dimensions of a block (excluding ghost cells) cumulative_npoints_per_block :: IndexVector # cumulative sum of number of points in each block (length = num_blocks + 1, first value is 0) blockidx :: IndexVector # linear index of block associated to each point (length = Np) pointperm :: IndexVector sort_points :: SortPoints + batch_size :: Val{Np} # how many non-uniform points per batch in SM spreading? function BlockDataGPU( + method::Symbol, + Δxs::NTuple{N, T}, nblocks_per_dir::NTuple{N, I}, - block_sizes::NTuple{N, T}, - npoints_per_block::V, sort::S, - ) where {N, I, T, V, S} + block_dims::NTuple{N, I}, + npoints_per_block::V, sort::S; + batch_size::Val{Np}, + ) where {N, I, T, V, S, Np} + Np::Integer + method ∈ (:global_memory, :shared_memory) || throw(ArgumentError("expected gpu_method ∈ (:global_memory, :shared_memory)")) blockidx = similar(npoints_per_block, 0) pointperm = similar(npoints_per_block, 0) - new{N, I, T, V, S}( - nblocks_per_dir, block_sizes, npoints_per_block, blockidx, pointperm, sort, + new{N, I, T, V, S, Np}( + method, Δxs, nblocks_per_dir, block_dims, npoints_per_block, blockidx, pointperm, sort, + batch_size, ) end end +gpu_method(bd::BlockDataGPU) = bd.method with_blocking(::BlockDataGPU) = true function BlockDataGPU( ::Type{Z}, - backend::KA.Backend, block_dims::Dims{D}, Ñs::Dims{D}, sort_points, - ) where {Z <: Number, D} + backend::KA.Backend, block_dims::Dims{D}, Ñs::Dims{D}, h::HalfSupport{M}, + sort_points::StaticBool; + method::Symbol, + batch_size::Val = Val(16), # batch size (in number of non-uniform points) used in spreading if gpu_method == :shared_memory + ) where {Z <: Number, D, M} T = real(Z) # in case Z is complex - nblocks_per_dir = map(cld, Ñs, block_dims) # basically equal to ceil(Ñ / block_dim) --> effective block size is ≤ block_dims - L = T(2) * π - block_sizes = map(nblocks -> L / nblocks, nblocks_per_dir) + if method === :shared_memory + # Override input block size. We try to maximise the use of shared memory. + block_dims = block_dims_gpu_shmem(Z, Ñs, h, batch_size) + end + nblocks_per_dir = map(cld, Ñs, block_dims) # basically equal to ceil(Ñ / block_dim) + L = T(2) * π # domain period + Δxs = map(N -> L / N, Ñs) # grid step (oversampled grid) cumulative_npoints_per_block = KA.allocate(backend, Int, prod(nblocks_per_dir) + 1) - BlockDataGPU(nblocks_per_dir, block_sizes, cumulative_npoints_per_block, sort_points) + BlockDataGPU(method, Δxs, nblocks_per_dir, block_dims, cumulative_npoints_per_block, sort_points; batch_size) end get_pointperm(bd::BlockDataGPU) = bd.pointperm @@ -108,7 +166,7 @@ function set_points_impl!( transform::F = identity, synchronise, ) where {F <: Function} (; - cumulative_npoints_per_block, nblocks_per_dir, block_sizes, + Δxs, cumulative_npoints_per_block, nblocks_per_dir, block_dims, blockidx, pointperm, sort_points, ) = bd @@ -136,8 +194,8 @@ function set_points_impl!( @timeit timer "(1) Assign blocks" let local kernel! 
= assign_blocks_kernel!(backend, workgroupsize) kernel!( - blockidx, cumulative_npoints_per_block, points_comp, xp, - block_sizes, nblocks_per_dir, sort_points, transform; + blockidx, cumulative_npoints_per_block, points_comp, xp, Δxs, + block_dims, nblocks_per_dir, sort_points, transform; ndrange, ) maybe_synchronise(backend, synchronise) @@ -155,8 +213,8 @@ function set_points_impl!( @timeit timer "(3) Sort" let local kernel! = sortperm_kernel!(backend, workgroupsize) kernel!( - pointperm, cumulative_npoints_per_block, blockidx, xp, - block_sizes, nblocks_per_dir, transform; + pointperm, cumulative_npoints_per_block, blockidx, xp, Δxs, + block_dims, nblocks_per_dir, transform; ndrange, ) maybe_synchronise(backend, synchronise) @@ -177,9 +235,18 @@ end # Get index of block where x⃗ is located (assumed to be in [0, 2π)). @inline function block_index( - x⃗::NTuple{N,T}, block_sizes::NTuple{N,T}, nblocks_per_dir::NTuple{N}, + x⃗::NTuple{N,T}, Δxs::NTuple{N,T}, block_dims::NTuple{N,Integer}, nblocks_per_dir::NTuple{N}, ) where {N, T <: AbstractFloat} - is = map(Kernels.point_to_cell, x⃗, block_sizes) + is = ntuple(Val(N)) do n + @inline + # Note: we could directly use the block size Δx_block = Δx * block_dims, but this + # may lead to inconsistency with interpolation and spreading kernels (leading to + # errors) due to numerical accuracy issues. So we first obtain the index on the + # Δx grid, then we translate that to a block index by dividing by the block + # dimensions (= how many Δx's in a single block). + i, r = Kernels.point_to_cell(x⃗[n], Δxs[n]) # index of grid cell where point is located + cld(i, block_dims[n]) # index of block where point is located + end @inbounds LinearIndices(nblocks_per_dir)[is...] end @@ -188,7 +255,8 @@ end cumulative_npoints_per_block::AbstractVector{<:Integer}, points::NTuple, @Const(xp), - @Const(block_sizes::NTuple), + @Const(Δxs), + @Const(block_dims::NTuple), @Const(nblocks_per_dir::NTuple), @Const(sort_points), @Const(transform::F), @@ -196,7 +264,7 @@ end I = @index(Global, Linear) @inbounds x⃗ = xp[I] y⃗ = to_unit_cell_gpu(transform(Tuple(x⃗))) :: NTuple - n = block_index(y⃗, block_sizes, nblocks_per_dir) + n = block_index(y⃗, Δxs, block_dims, nblocks_per_dir) # Note: here index_within_block is the value *after* incrementing (≥ 1). S = eltype(cumulative_npoints_per_block) @@ -218,14 +286,15 @@ end @Const(cumulative_npoints_per_block), @Const(blockidx), @Const(xp), - @Const(block_sizes), + @Const(Δxs), + @Const(block_dims), @Const(nblocks_per_dir), @Const(transform::F), ) where {F} I = @index(Global, Linear) @inbounds x⃗ = xp[I] y⃗ = to_unit_cell_gpu(transform(Tuple(x⃗))) :: NTuple - n = block_index(y⃗, block_sizes, nblocks_per_dir) + n = block_index(y⃗, Δxs, block_dims, nblocks_per_dir) @inbounds J = cumulative_npoints_per_block[n] + blockidx[I] @inbounds pointperm[J] = I nothing @@ -340,7 +409,7 @@ function set_points_impl!( @timeit timer "(1) Assign blocks" @inbounds for (i, x⃗) ∈ pairs(xp) # Get index of block where point x⃗ is located. y⃗ = to_unit_cell(transform(NTuple{N}(x⃗))) # converts `x⃗` to Tuple if it's an SVector - is = map(Kernels.point_to_cell, y⃗, block_sizes) + is = map(first ∘ Kernels.point_to_cell, y⃗, block_sizes) # we use first((i, r)) -> i if bd.sort_points === False() points[i] = y⃗ # copy folded point (doesn't need to be sorted) end @@ -370,7 +439,7 @@ function set_points_impl!( # We recompute the block index associated to this point. 
x⃗ = xp[i] y⃗ = to_unit_cell(transform(NTuple{N}(x⃗))) # converts `x⃗` to Tuple if it's an SVector - is = map(Kernels.point_to_cell, y⃗, block_sizes) + is = map(first ∘ Kernels.point_to_cell, y⃗, block_sizes) # we use first((i, r)) -> i n = to_linear_index[is...] # linear index of block j = cumulative_npoints_per_block[n] + blockidx[i] pointperm[j] = i diff --git a/src/interpolation/gpu.jl b/src/interpolation/gpu.jl index 5c96c63..bf9722a 100644 --- a/src/interpolation/gpu.jl +++ b/src/interpolation/gpu.jl @@ -1,5 +1,24 @@ using StaticArrays: MVector +# Note: this is also used in spreading +@inline function get_inds_vals_gpu(gs::NTuple{D}, points::NTuple{D}, Ns::NTuple{D}, j::Integer) where {D} + ntuple(Val(D)) do n + @inline + get_inds_vals_gpu(gs[n], points[n], Ns[n], j) + end +end + +# Note: this is also used in spreading +@inline function get_inds_vals_gpu(g::AbstractKernelData, points::AbstractVector, N::Integer, j::Integer) + x = @inbounds points[j] + gdata = Kernels.evaluate_kernel(g, x) + vals = gdata.values # kernel values + M = Kernels.half_support(g) + i₀ = gdata.i - M # active region is (i₀ + 1):(i₀ + 2M) (up to periodic wrapping) + i₀ = ifelse(i₀ < 0, i₀ + N, i₀) # make sure i₀ ≥ 0 + i₀ => vals +end + # Interpolate onto a single point @kernel function interpolate_to_point_naive_kernel!( vp::NTuple{C}, @@ -7,7 +26,7 @@ using StaticArrays: MVector @Const(points::NTuple{D}), @Const(us::NTuple{C}), @Const(pointperm), - @Const(Δxs::NTuple{D}), # grid step in each direction (oversampled grid) + @Const(prefactor::Real), # = volume of a grid cell = prod(Δxs) ) where {C, D} i = @index(Global, Linear) @@ -22,21 +41,9 @@ using StaticArrays: MVector # don't perform atomic operations. This is why the code is simpler here. Ns = size(first(us)) # grid dimensions - indvals = ntuple(Val(D)) do n - @inline - @inbounds begin - g = gs[n] - x = points[n][j] - gdata = Kernels.evaluate_kernel(g, x) - vals = gdata.values # kernel values - M = Kernels.half_support(gs[n]) - i₀ = gdata.i - M # active region is (i₀ + 1):(i₀ + 2M) (up to periodic wrapping) - i₀ = ifelse(i₀ < 0, i₀ + Ns[n], i₀) # make sure i₀ ≥ 0 - i₀ => vals - end - end + indvals = get_inds_vals_gpu(gs, points, Ns, j) - v⃗ = interpolate_from_arrays_gpu(us, indvals, Ns, Δxs) + v⃗ = interpolate_from_arrays_gpu(us, indvals, Ns, prefactor) for n ∈ eachindex(vp, v⃗) @inbounds vp[n][j] = v⃗[n] @@ -48,17 +55,18 @@ end function interpolate!( backend::GPU, bd::Union{BlockDataGPU, NullBlockData}, - gs, + gs::NTuple{D}, vp_all::NTuple{C, AbstractVector}, us::NTuple{C, AbstractArray}, x⃗s::AbstractVector, - ) where {C} + ) where {C, D} # Note: the dimensions of arrays have already been checked via check_nufft_nonuniform_data. Base.require_one_based_indexing(x⃗s) # this is to make sure that all indices match foreach(Base.require_one_based_indexing, vp_all) xs_comp = StructArrays.components(x⃗s) Δxs = map(Kernels.gridstep, gs) + prefactor = prod(Δxs) # interpolations need to be multiplied by the volume of a grid cell pointperm = get_pointperm(bd) # nothing in case of NullBlockData sort_points = get_sort_points(bd)::StaticBool # False in the case of NullBlockData @@ -77,15 +85,44 @@ function interpolate!( # We use dynamically sized kernels to avoid recompilation, since number of points may # change from one call to another. - ndrange = size(x⃗s) # iterate through points - workgroupsize = default_workgroupsize(backend, ndrange) - kernel! 
= interpolate_to_point_naive_kernel!(backend, workgroupsize) - kernel!(vp_sorted, gs, xs_comp, us, pointperm_, Δxs; ndrange) + ndrange_points = size(x⃗s) # iterate through points + groupsize_points = default_workgroupsize(backend, ndrange_points) + + method = gpu_method(bd) + + if method === :global_memory + let ndrange = ndrange_points, groupsize = groupsize_points + kernel! = interpolate_to_point_naive_kernel!(backend, groupsize) + kernel!(vp_sorted, gs, xs_comp, us, pointperm_, prefactor; ndrange) + end + elseif method === :shared_memory + @assert bd isa BlockDataGPU + Z = eltype(us[1]) + M = Kernels.half_support(gs[1]) + @assert all(g -> Kernels.half_support(g) === M, gs) # check that they're all equal + block_dims_val = block_dims_gpu_shmem(Z, size(us[1]), HalfSupport(M), bd.batch_size) # this is usually a compile-time constant... + block_dims = Val(block_dims_val) # ...which means this doesn't require a dynamic dispatch + @assert block_dims_val === bd.block_dims + let ngroups = bd.nblocks_per_dir # this is the required number of workgroups (number of blocks in CUDA) + block_dims_padded = @. block_dims_val + 2M - 1 # dimensions of shared memory array + shmem_size = block_dims_padded + groupsize = groupsize_shmem(ngroups, block_dims_padded, length(x⃗s)) + ndrange = groupsize .* ngroups + kernel! = interpolate_to_points_shmem_kernel!(backend, groupsize, ndrange) + kernel!( + vp_sorted, gs, xs_comp, us, pointperm_, bd.cumulative_npoints_per_block, + prefactor, + block_dims, Val(shmem_size), + ) + end + end if sort_points === True() - kernel_perm! = interp_permute_kernel!(backend, workgroupsize) - kernel_perm!(vp_all, vp_sorted, pointperm; ndrange) - foreach(KA.unsafe_free!, vp_sorted) # manually deallocate temporary arrays + let ndrange = ndrange_points, groupsize = groupsize_points + kernel_perm! = interp_permute_kernel!(backend, groupsize) + kernel_perm!(vp_all, vp_sorted, pointperm; ndrange) + foreach(KA.unsafe_free!, vp_sorted) # manually deallocate temporary arrays + end end vp_all @@ -95,7 +132,7 @@ end us::NTuple{C, AbstractArray{T, D}}, indvals::NTuple{D, <:Pair}, Ns::Dims{D}, - Δxs::NTuple{D, Tr}, + prefactor::Tr, ) where {T, Tr, C, D} if @generated gprod_init = Symbol(:gprod_, D + 1) # the name of this variable is important! @@ -105,7 +142,7 @@ end vals = map(last, indvals) # evaluated kernel values in each direction inds = map(eachindex, vals) # = (1:L, 1:L, ...) where L = 2M is the kernel width vs = zero(MVector{$C, $T}) # interpolated value (output) - $gprod_init = prod(Δxs) # product of kernel values (initially Δx[1] * Δx[2] * ...) + $gprod_init = prefactor # product of kernel values (initially Δx[1] * Δx[2] * ...) @nloops( $D, i, d -> inds[d], # for i_d ∈ 1:L @@ -137,7 +174,7 @@ end vals_first, vals_tail = first(vals), Base.tail(vals) istart_first, istart_tail = first(inds_start), Base.tail(inds_start) N, Ns_tail = first(Ns), Base.tail(Ns) - gprod_base = prod(Δxs) # this factor is needed in interpolation only + gprod_base = prefactor # this factor is needed in interpolation only @inbounds for I_tail ∈ CartesianIndices(inds_tail) is_tail = Tuple(I_tail) gs_tail = map(inbounds_getindex, vals_tail, is_tail) @@ -171,3 +208,167 @@ end end nothing end + +## ========================================================================================== ## +## Shared-memory implementation + +# Note: this is also used in spreading. +function groupsize_shmem(ngroups::NTuple{D}, shmem_size::NTuple{D}, Np) where {D} + # (1) Determine the total number of threads. 
+ # Not sure if it's worth it to define as a function of the inputs (currently unused). + # From tests, the value of 64 seems to be optimal in various situations. + groupsize = 64 # minimum group size should be equal to the warp size (usually 32 on CUDA and 64 on AMDGPU) + # (2) Determine number of threads in each direction. + # We don't really care about the Cartesian distribution of threads, since we always + # parallelise over linear indices. + gsizes = ntuple(_ -> 1, Val(D)) + Base.setindex(gsizes, groupsize, 1) # = (groupsize, 1, 1, ...) +end + +@kernel function interpolate_to_points_shmem_kernel!( + vp::NTuple{C, AbstractVector{Z}}, + @Const(gs::NTuple{D}), + @Const(points::NTuple{D}), + @Const(us::NTuple{C, AbstractArray{Z}}), + @Const(pointperm), + @Const(cumulative_npoints_per_block::AbstractVector), + @Const(prefactor::Real), # = volume of a grid cell = prod(Δxs) + ::Val{block_dims}, + ::Val{shmem_size}, # this is a bit redundant, but seems to be required for CPU backends (used in tests) + ) where {C, D, Z <: Number, block_dims, shmem_size} + + @uniform begin + groupsize = @groupsize()::Dims{D} + nthreads = prod(groupsize) + end + + block_n = @index(Group, Linear) # linear index of block + block_index = @index(Group, NTuple) # workgroup index (= block index) + threadidx = @index(Local, Linear) # in 1:nthreads + + u_local = @localmem(Z, shmem_size) # allocate shared memory + + # This needs to be in shared memory for CPU tests (otherwise it doesn't survive across + # @synchronize barriers). + ishifts_sm = @localmem(Int, D) # shift between local and global array in each direction + + if threadidx == 1 + # This block (workgroup) will take care of non-uniform points (a + 1):b + @inbounds for d ∈ 1:D + ishifts_sm[d] = (block_index[d] - 1) * block_dims[d] + 1 + end + end + + @synchronize # synchronise ishifts_sm + + # Interpolate components one by one (to avoid using too much memory) + @inbounds for c ∈ 1:C + # Copy grid data from global to shared memory + M = Kernels.half_support(gs[1]) + gridvalues_to_local_memory!( + u_local, us[c], ishifts_sm, Val(M); + threadidx, nthreads, + ) + + @synchronize # make sure all threads have the same shared data + + # This block will take care of non-uniform points (a + 1):b + @inbounds a = cumulative_npoints_per_block[block_n] + @inbounds b = cumulative_npoints_per_block[block_n + 1] + + for i in (a + threadidx):nthreads:b + # Interpolate at point j + j = if pointperm === nothing + i + else + @inbounds pointperm[i] + end + + # TODO: can we do this just once for all components? (if C > 1) + indvals = ntuple(Val(D)) do d + @inline + x = @inbounds points[d][j] + gdata = Kernels.evaluate_kernel(gs[d], x) + local i₀ = gdata.i - ishifts_sm[d] + local vals = gdata.values # kernel values + # @assert i₀ ≥ 0 + # @assert i₀ + 2M ≤ block_dims_padded[n] + i₀ => vals + end + + inds_start = map(first, indvals) + window_vals = map(last, indvals) + v = interpolate_from_arrays_shmem(u_local, inds_start, window_vals, prefactor) + @inbounds vp[c][j] = v + end + end + + nothing +end + +# Copy values from global to shared memory. 
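+# Each thread copies a strided subset of the local block (including its ghost cells), and
+# indices into the global array are wrapped periodically when the block touches the grid boundary.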
+@inline function gridvalues_to_local_memory!( + u_local::AbstractArray{T, D}, + u_global::AbstractArray{T, D}, + ishifts, + ::Val{M}; + threadidx, nthreads, + ) where {T, D, M} + Ns = size(u_global) + inds = CartesianIndices(axes(u_local)) + offsets = ntuple(Val(D)) do d + @inline + off = ishifts[d] - M + ifelse(off < 0, off + Ns[d], off) # make sure the offset is non-negative (to avoid some wrapping below) + end + @inbounds for n ∈ threadidx:nthreads:length(inds) + is = Tuple(inds[n]) + js = ntuple(Val(D)) do d + @inline + j = is[d] + offsets[d] + ifelse(j > Ns[d], j - Ns[d], j) + end + u_local[n] = u_global[js...] + end + nothing +end + +# Interpolate a single "component" (one transform at a time). +# Here vp is a vector instead of a tuple of vectors. +@inline function interpolate_from_arrays_shmem( + u_local::AbstractArray{Z, D}, + inds_start::NTuple{D}, + window_vals::NTuple{D}, + prefactor, + ) where {Z, D} + if @generated + gprod_init = Symbol(:gprod_, D + 1) # the name of this variable is important! + quote + $gprod_init = prefactor + v = zero(Z) + inds = map(eachindex, window_vals) # = (1:2M, 1:2M, ...) + @nloops( + $D, i, + d -> inds[d], + d -> begin + @inbounds gprod_d = gprod_{d + 1} * window_vals[d][i_d] + @inbounds j_d = inds_start[d] + i_d + end, + begin + js = @ntuple($D, j) + @inbounds v += gprod_1 * u_local[js...] + end, + ) + v + end + else + v = zero(Z) + inds = map(eachindex, window_vals) # = (1:2M, 1:2M, ...) + @inbounds for I ∈ CartesianIndices(inds) + gprod = prefactor * prod(ntuple(d -> @inbounds(window_vals[d][I[d]]), Val(D))) + js = inds_start .+ Tuple(I) + v += gprod * u_local[js...] + end + v + end +end diff --git a/src/plan.jl b/src/plan.jl index aff8c1c..3b71c78 100644 --- a/src/plan.jl +++ b/src/plan.jl @@ -98,6 +98,7 @@ The created plan contains all data needed to perform NUFFTs for non-uniform data The current default is 4096 on the CPU and around 1024 on the GPU (but depends on the number of dimensions). These may change in the future or even depend on the actual computing device. On the CPU, using block partitioning is required for running with multiple threads. + On the GPU, this option is ignored if `gpu_method = :shared_memory`. Blocking / spatial sorting can be completely disabled by passing `block_size = nothing` (but this is generally slower). @@ -108,6 +109,29 @@ The created plan contains all data needed to perform NUFFTs for non-uniform data This can improve performance if executing multiple transforms on the same non-uniform points. Note that, even when enabled, this does not modify the `points` argument passed to `set_points!`. +- `gpu_method`: allows selecting between different implementations of + GPU transforms. Possible options are: + + * `:global_memory`: directly read from and write to arrays in global memory in spreading + (type-1) and interpolation (type-2) operations; + + * `:shared_memory`: copy data between global memory and shared memory (local + to each GPU workgroup) and perform most operations in the latter, which is faster and + can help avoid some atomic operations in type-1 transforms. We try to use as much shared + memory as is typically available on current GPUs (which is not much, typically 48 KiB on + CUDA). Even so, this method can be much faster than the `:global_memory` method and may + become the default in the future. Note that this method completely ignores the + `block_size` parameter, as the actual block size is adjusted to maximise shared memory + usage.
The performance of type-1 transforms can be further adjusted using the + `gpu_batch_size` parameter described below. + + The default is `:global_memory` but this may change in the future. + +- `gpu_batch_size = Val(16)`: batch size used in type-1 transforms when `gpu_method = :shared_memory`. + The idea is that, to avoid atomic operations on shared-memory arrays, we process + non-uniform points in batches of `Np` points (16 by default). + This can have a large impact on performance. + ## Other parameters - `fftw_flags = FFTW.MEASURE`: parameters passed to the FFTW planner (only used when `backend = CPU()`). @@ -308,6 +332,8 @@ function _PlanNUFFT( backend::KA.Backend = CPU(), block_size::Union{Integer, Dims{D}, Nothing} = default_block_size(Ns, backend), synchronise::Bool = false, + gpu_method::Symbol = :global_memory, + gpu_batch_size::Val = Val(16), # currently only used in shared-memory GPU spreading ) where {T <: Number, D} ks = init_wavenumbers(T, Ns) # Determine dimensions of oversampled grid. @@ -342,7 +368,7 @@ function _PlanNUFFT( else block_dims = get_block_dims(Ñs, block_size) if backend isa GPU - blocks = BlockDataGPU(T, backend, block_dims, Ñs, sort_points) + blocks = BlockDataGPU(T, backend, block_dims, Ñs, h, sort_points; method = gpu_method, batch_size = gpu_batch_size,) else blocks = BlockData(T, block_dims, Ñs, h, num_transforms, sort_points) FFTW.set_num_threads(Threads.nthreads()) diff --git a/src/spreading/gpu.jl b/src/spreading/gpu.jl index 838aa79..4545528 100644 --- a/src/spreading/gpu.jl +++ b/src/spreading/gpu.jl @@ -21,21 +21,9 @@ Z = eltype(vp[1]) Ns_real = size(first(us)) # dimensions of raw input data @assert eltype(first(us)) <: Real # output is a real array (but may actually describe complex data) - Ns = spread_actual_dims(Z, Ns_real) # divides the Ns_real[1] by 2 if Z <: Complex + Ns = spread_actual_dims(Z, Ns_real) # drop the first dimension (of size 2) if Z <: Complex - indvals = ntuple(Val(D)) do n - @inline - @inbounds begin - g = gs[n] - x = points[n][j] - gdata = Kernels.evaluate_kernel(g, x) - vals = gdata.values - M = Kernels.half_support(gs[n]) - i₀ = gdata.i - M # active region is (i₀ + 1):(i₀ + 2M) (up to periodic wrapping) - i₀ = ifelse(i₀ < 0, i₀ + Ns[n], i₀) # make sure i₀ ≥ 0 - i₀ => vals - end - end + indvals = get_inds_vals_gpu(gs, points, Ns, j) v⃗ = map(v -> @inbounds(v[j]), vp) spread_onto_arrays_gpu!(us, indvals, v⃗, Ns) @@ -44,10 +32,10 @@ end @inline spread_actual_dims(::Type{<:Real}, Ns) = Ns -@inline spread_actual_dims(::Type{<:Complex}, Ns) = Base.setindex(Ns, Ns[1] >> 1, 1) # actual number of complex elements in first dimension +@inline spread_actual_dims(::Type{<:Complex}, Ns) = Base.tail(Ns) # actual number of complex elements: drop first dimension @inline function spread_onto_arrays_gpu!( - us::NTuple{C, AbstractArray{T, D}}, + us::NTuple{C, AbstractArray{T}}, indvals::NTuple{D, <:Pair}, vs::NTuple{C}, Ns::Dims{D}, @@ -70,7 +58,7 @@ end end, begin # gprod_1 contains the product vals[1][i_1] * vals[2][i_2] * ... - js = @ntuple $D j + js = @ntuple($D, j) @inbounds for n ∈ 1:$C w = vs[n] * gprod_1 _atomic_add!(us[n], w, js) @@ -123,21 +111,22 @@ end # so the output must be a real array `u`. @inline function _atomic_add!(u::AbstractArray{T}, v::Complex{T}, inds::Tuple) where {T <: Real} @inbounds begin - i₁ = 2 * (inds[1] - 1) # convert from logical index (equivalent complex array) to memory index (real array) - itail = Base.tail(inds) - Atomix.@atomic u[i₁ + 1, itail...] += real(v) - Atomix.@atomic u[i₁ + 2, itail...] 
+= imag(v) + Atomix.@atomic u[1, inds...] += real(v) + Atomix.@atomic u[2, inds...] += imag(v) end nothing end +to_real_array(u::AbstractArray{T}) where {T <: Real} = u +to_real_array(u::AbstractArray{T}) where {T <: Complex} = reinterpret(reshape, real(T), u) # adds an extra first dimension with size 2 + # GPU implementation. # We assume all arrays are already on the GPU. function spread_from_points!( backend::GPU, bd::Union{BlockDataGPU, NullBlockData}, gs, - us_all::NTuple{C, AbstractGPUArray}, + us::NTuple{C, AbstractGPUArray}, x⃗s::StructVector, vp_all::NTuple{C, AbstractGPUVector}, ) where {C} @@ -147,17 +136,10 @@ function spread_from_points!( xs_comp = StructArrays.components(x⃗s) - # Reinterpret `us_all` as real arrays, in case they are complex. + # Reinterpret `us` as real arrays, in case they are complex. # This is to avoid current issues with atomic operations on complex data # (https://github.com/JuliaGPU/KernelAbstractions.jl/issues/497). - Z = eltype(us_all[1]) - T = real(Z) - us_real = if Z <: Real - us_all - else # complex case - @assert Z <: Complex - map(u -> reinterpret(T, u), us_all) # note: we don't use reshape, so the first dimension has 2x elements - end + us_real = map(to_real_array, us) # doesn't change anything if data is already real (Z === T) pointperm = get_pointperm(bd) # nothing in case of NullBlockData sort_points = get_sort_points(bd)::StaticBool # False in the case of NullBlockData @@ -168,27 +150,54 @@ function spread_from_points!( # We use dynamically sized kernels to avoid recompilation, since number of points may # change from one call to another. - ndrange = size(x⃗s) # iterate through points - workgroupsize = default_workgroupsize(backend, ndrange) + ndrange_points = size(x⃗s) # iterate through points + groupsize_points = default_workgroupsize(backend, ndrange_points) if sort_points === True() vp_sorted = map(similar, vp_all) # allocate temporary arrays for sorted non-uniform data - kernel_perm! = spread_permute_kernel!(backend, workgroupsize) - kernel_perm!(vp_sorted, vp_all, pointperm; ndrange) + let ndrange = ndrange_points, groupsize = groupsize_points + kernel_perm! = spread_permute_kernel!(backend, groupsize) + kernel_perm!(vp_sorted, vp_all, pointperm; ndrange) + end pointperm_ = nothing # we don't need any further permutations (all accesses to non-uniform data will be contiguous) else vp_sorted = vp_all pointperm_ = pointperm end - kernel! = spread_from_point_naive_kernel!(backend, workgroupsize) - kernel!(us_real, gs, xs_comp, vp_sorted, pointperm_; ndrange) + method = gpu_method(bd) + + if method === :global_memory + let ndrange = ndrange_points, groupsize = groupsize_points + kernel! = spread_from_point_naive_kernel!(backend, groupsize) + kernel!(us_real, gs, xs_comp, vp_sorted, pointperm_; ndrange) + end + elseif method === :shared_memory + @assert bd isa BlockDataGPU + Z = eltype(us[1]) + M = Kernels.half_support(gs[1]) + @assert all(g -> Kernels.half_support(g) === M, gs) # check that they're all equal + block_dims_val = block_dims_gpu_shmem(Z, size(us[1]), HalfSupport(M), bd.batch_size) # this is usually a compile-time constant... + block_dims = Val(block_dims_val) # ...which means this doesn't require a dynamic dispatch + @assert block_dims_val === bd.block_dims + let ngroups = bd.nblocks_per_dir # this is the required number of workgroups (number of blocks in CUDA) + block_dims_padded = @. 
block_dims_val + 2M - 1 + shmem_size = block_dims_padded # dimensions of big shared memory array + groupsize = groupsize_shmem(ngroups, block_dims_padded, length(x⃗s)) + ndrange = groupsize .* ngroups + kernel! = spread_from_points_shmem_kernel!(backend, groupsize, ndrange) + kernel!( + us_real, gs, xs_comp, vp_sorted, pointperm_, bd.cumulative_npoints_per_block, + HalfSupport(M), block_dims, Val(shmem_size), bd.batch_size, + ) + end + end if sort_points === True() foreach(KA.unsafe_free!, vp_sorted) # manually deallocate temporary arrays end - us_all + us end @kernel function spread_permute_kernel!(vp::NTuple{N}, @Const(vp_in::NTuple{N}), @Const(perm::AbstractVector)) where {N} @@ -199,3 +208,207 @@ end end nothing end + +## ========================================================================================== ## +## Shared-memory implementation + +# Process non-uniform points in batches of Np points (or less). +# The idea is to completely avoid slow atomic writes to shared memory arrays (u_local), +# while parallelising some operations across up to Np points. +# TODO: Should Np be equal to the number of threads? or how to choose it optimally so that the block size stays not too small? +@kernel function spread_from_points_shmem_kernel!( + us::NTuple{C, AbstractArray{T}}, + @Const(gs::NTuple{D}), + @Const(points::NTuple{D}), + @Const(vp::NTuple{C, AbstractVector{Z}}), + @Const(pointperm), + @Const(cumulative_npoints_per_block::AbstractVector), + ::HalfSupport{M}, + ::Val{block_dims}, + ::Val{shmem_size}, # this is a bit redundant, but seems to be required for CPU backends (used in tests) + ::Val{Np}, # batch_size + ) where {C, D, T, Z, M, Np, block_dims, shmem_size} + + @uniform begin + groupsize = @groupsize()::Dims{D} + nthreads = prod(groupsize) + + # Determine grid dimensions. + # Note that, to avoid problems with @atomic, we currently work with real-data arrays + # even when the output is complex. So, if `Z <: Complex`, the first dimension has twice + # the actual dataset dimensions. + @assert T === real(Z) # output is a real array (but may actually describe complex data) + Ns_real = size(first(us)) # dimensions of raw input data + Ns = spread_actual_dims(Z, Ns_real) # drop the first dimension (of size 2) if Z <: Complex + end + + block_n = @index(Group, Linear) # linear index of block + block_index = @index(Group, NTuple) # workgroup index (= block index) + threadidx = @index(Local, Linear) # in 1:nthreads + + # Allocate static shared memory + u_local = @localmem(Z, shmem_size) + # @assert shmem_size == block_dims .+ (2M - 1) + @assert T <: Real + window_vals = @localmem(T, (2M, D, Np)) + points_sm = @localmem(T, (D, Np)) # points copied to shared memory + inds_start = @localmem(Int, (D, Np)) + vp_sm = @localmem(Z, Np) # input values copied to shared memory + + # Buffer for indices and lengths. + # This is needed in CPU version (used in tests), to avoid variables from being + # "forgotten" after a @synchronize barrier. 
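+    # buf_sm[1] and buf_sm[2] hold the cumulative point counts a and b delimiting the points
+    # (a + 1):b treated by this block; buf_sm[3] holds the size of the batch currently being processed.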
+ buf_sm = @localmem(Int, 3) + ishifts_sm = @localmem(Int, D) # shift between local and global array in each direction + + if threadidx == 1 + # This block (workgroup) will take care of non-uniform points (a + 1):b + @inbounds buf_sm[1] = cumulative_npoints_per_block[block_n] # = a + @inbounds buf_sm[2] = cumulative_npoints_per_block[block_n + 1] # = b + @inbounds for d ∈ 1:D + ishifts_sm[d] = (block_index[d] - 1) * block_dims[d] + 1 + end + end + + # Interpolate components one by one (to avoid using too much memory) + for c ∈ 1:C + # Reset shared memory to zero + for i ∈ threadidx:nthreads:length(u_local) + @inbounds u_local[i] = 0 + end + + @synchronize # make sure shared memory is fully set to zero + + # We operate in batches of up to `Np` non-uniform points / sources. + # The aim is to completely avoid atomic operations on shared memory: instead of + # parallelising over non-uniform points (1 point = 1 thread), we make all threads + # work on the same set of points, so that, in the end, all threads have the required + # information to spread point values onto `u_local`. That spreading operation is + # done by parallelising over the local grid of dimensions M^D (so that every thread + # writes to a different part of the array), instead of parallelising over the + # non-uniform points which would require atomics. + + # The first batch deals with points (a + 1):min(a + Np, b) + @inbounds for batch_begin in buf_sm[1]:Np:(buf_sm[2] - 1) + batch_size = min(Np, buf_sm[2] - batch_begin) # current batch size + buf_sm[3] = batch_size + + # (1) Iterate over points in the batch. + # Each active thread writes to shared memory. + @inbounds for p in threadidx:nthreads:batch_size + # Spread from point j + i = batch_begin + p # index of non-uniform point + j = if pointperm === nothing + i + else + @inbounds pointperm[i] + end + for d ∈ 1:D + points_sm[d, p] = points[d][j] + end + vp_sm[p] = vp[c][j] + end + + @synchronize + + # (2) Evaluate window functions around each non-uniform point. + inds = CartesianIndices((1:buf_sm[3], 1:D)) # parallelise over dimensions + points + @inbounds for n in threadidx:nthreads:length(inds) + p, d = Tuple(inds[n]) + g = gs[d] + x = points_sm[d, p] + gdata = Kernels.evaluate_kernel(g, x) + ishift = ishifts_sm[d] + inds_start[d, p] = gdata.i - ishift + local vals = gdata.values + for m ∈ eachindex(vals) + window_vals[m, d, p] = vals[m] + end + end + + @synchronize # make sure all threads have the same shared data + + # (3) All threads spread together onto shared memory, avoiding all collisions + # and thus not requiring atomic operations. + @inbounds for p in 1:buf_sm[3] + local istart = ntuple(d -> @inbounds(inds_start[d, p]), Val(D)) + local v = vp_sm[p] + spread_onto_array_shmem_threads!(u_local, istart, window_vals, v, p; threadidx, nthreads) + @synchronize # make sure threads don't write concurrently to the same place (since we don't use atomics) + end + end + + @synchronize # make sure we have finished writing to shared memory + + # Add values from shared memory onto global memory. + @inbounds if buf_sm[1] < buf_sm[2] # skip this step if there were actually no points in this block (if a == b) + add_from_local_to_global_memory!( + us[c], u_local, Ns, ishifts_sm, Val(M); + threadidx, nthreads, + ) + end + + # Avoid resetting u_local too early in the next iteration (c -> c + 1). + # This is mostly useful when c < C (but putting an `if` fails...). + @synchronize + end + + nothing +end + +# Spread a single "component" (one transform at a time). 
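+# (`window_vals` holds the kernel evaluations filled in step (2) above; slice `p` selects the
+# non-uniform point currently being spread.)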
+# This is parallelised across threads in a workgroup. +@inline function spread_onto_array_shmem_threads!( + u_local::AbstractArray{Z, D}, + inds_start::NTuple{D, Integer}, + window_vals::AbstractArray{T, 3}, # static-size shared-memory array (2M, D, Np) + v::Z, p::Integer; + threadidx, nthreads, + ) where {T, D, Z} + inds = CartesianIndices(ntuple(_ -> axes(window_vals, 1), Val(D))) # = (1:2M, 1:2M, ...) + Tr = real(T) + @inbounds for n ∈ threadidx:nthreads:length(inds) + I = inds[n] + js = Tuple(I) .+ inds_start + gprod = one(Tr) + for d ∈ 1:D + gprod *= window_vals[I[d], d, p] + end + w = v * gprod + u_local[js...] += w + end + nothing +end + +# Add values from shared to global memory. +@inline function add_from_local_to_global_memory!( + u_global::AbstractArray{T}, # actual data in global array is always real (and may have an extra first dimension) + u_local::AbstractArray{Z, D}, # shared-memory array can be complex + Ns::Dims{D}, + ishifts, + ::Val{M}; + threadidx, nthreads, + ) where {Z, T, D, M} + @assert T <: Real # for atomic operations + # @assert ndims(u_global) === D + (Z <: Complex) # extra dimension if data is complex + inds = CartesianIndices(axes(u_local)) + offsets = ntuple(Val(D)) do d + @inline + local off = ishifts[d] - M + ifelse(off < 0, off + Ns[d], off) # make sure the offset is non-negative (to avoid some wrapping below) + end + @inbounds for n ∈ threadidx:nthreads:length(inds) + I = inds[n] + is = Tuple(I) + js = ntuple(Val(D)) do d + @inline + j = is[d] + offsets[d] + ifelse(j > Ns[d], j - Ns[d], j) + end + w = u_local[is...] + _atomic_add!(u_global, w, js) + end + nothing +end + +## ==================================================================================================== ## diff --git a/test/near_2pi.jl b/test/near_2pi.jl index 4605ded..0b706c4 100644 --- a/test/near_2pi.jl +++ b/test/near_2pi.jl @@ -43,13 +43,13 @@ end @testset "Kernels.point_to_cell" begin Δx = 2π / 3 let x = prevfloat(L / 3) - @test Kernels.point_to_cell(x, Δx) == 1 + @test Kernels.point_to_cell(x, Δx)[1] == 1 end let x = prevfloat(2 * L / 3) - @test Kernels.point_to_cell(x, Δx) == 2 + @test Kernels.point_to_cell(x, Δx)[1] == 2 end let x = prevfloat(L) - @test Kernels.point_to_cell(x, Δx) == 3 + @test Kernels.point_to_cell(x, Δx)[1] == 3 end end end @@ -65,7 +65,7 @@ end @testset "Kernels.point_to_cell" begin Δx = T(2π) / 24 - i = Kernels.point_to_cell(x, Δx) + i, r = Kernels.point_to_cell(x, Δx) @test (i - 1) * Δx ≤ x < i * Δx end diff --git a/test/pseudo_gpu.jl b/test/pseudo_gpu.jl index e68162b..13df0a6 100644 --- a/test/pseudo_gpu.jl +++ b/test/pseudo_gpu.jl @@ -6,6 +6,9 @@ # JLArray (since it's supposed to mimic GPU arrays). Even with allowscalar(true), kernels # seem to fail for other reasons. 
+# TODO: +# - use new JLBackend in the latest GPUArrays + using NonuniformFFTs using StaticArrays: SVector # for convenience using KernelAbstractions: KernelAbstractions as KA @@ -47,7 +50,7 @@ end # ================================================================================ # -function run_plan(p::PlanNUFFT, xp_init::AbstractArray, vp_init::AbstractVector) +function run_plan(p::PlanNUFFT, xp_init::AbstractArray, vp_init::NTuple{Nc, AbstractVector}) where {Nc} (; backend,) = p xp = adapt(backend, xp_init) @@ -75,32 +78,34 @@ function run_plan(p::PlanNUFFT, xp_init::AbstractArray, vp_init::AbstractVector) T = eltype(p) # this is actually defined in AbstractNFFTs; it represents the type in Fourier space (always complex) @test T <: Complex dims = size(p) - us = KA.allocate(backend, T, dims) + us = map(_ -> KA.allocate(backend, T, dims), vp) exec_type1!(us, p, vp) - wp = similar(vp) + wp = map(similar, vp) exec_type2!(wp, p, us) (; backend, us, wp,) end -function compare_with_cpu(::Type{T}, dims; Np = prod(dims), kws...) where {T <: Number} +function compare_with_cpu(::Type{T}, dims; Np = prod(dims), ntransforms::Val{Nc} = Val(1), kws...) where {T <: Number, Nc} # Generate some non-uniform random data on the CPU rng = Xoshiro(42) N = length(dims) # number of dimensions Tr = real(T) xp_init = [rand(rng, SVector{N, Tr}) * Tr(2π) for _ ∈ 1:Np] # non-uniform points in [0, 2π]ᵈ - vp_init = randn(rng, T, Np) # random values at points + vp_init = ntuple(_ -> randn(rng, T, Np), ntransforms) - params = (; m = HalfSupport(4), kernel = KaiserBesselKernel(), σ = 1.5, kws...) + params = (; m = HalfSupport(4), kernel = KaiserBesselKernel(), σ = 1.5, ntransforms, kws...) p_cpu = PlanNUFFT(T, dims; params..., backend = CPU()) p_gpu = PlanNUFFT(T, dims; params..., backend = PseudoGPU()) r_cpu = run_plan(p_cpu, xp_init, vp_init) r_gpu = run_plan(p_gpu, xp_init, vp_init) - @test r_cpu.us ≈ r_gpu.us # output of type-1 transform - @test r_cpu.wp ≈ r_gpu.wp # output of type-2 transform + for c ∈ 1:Nc + @test r_cpu.us[c] ≈ r_gpu.us[c] # output of type-1 transform + @test r_cpu.wp[c] ≈ r_gpu.wp[c] # output of type-2 transform + end nothing end @@ -116,4 +121,13 @@ end @testset "No blocking" begin # spatial sorting disabled compare_with_cpu(ComplexF64, dims; block_size = nothing) end + @testset "gpu_method = :shared_memory" begin + @testset "Float32" compare_with_cpu(Float32, dims; gpu_method = :shared_memory) + @testset "ComplexF32" compare_with_cpu(ComplexF32, dims; gpu_method = :shared_memory) + end + @testset "Multiple transforms" begin + ntransforms = Val(2) + @testset "Global memory" compare_with_cpu(Float32, dims; ntransforms, gpu_method = :global_memory) + @testset "Shared memory" compare_with_cpu(Float32, dims; ntransforms, gpu_method = :shared_memory) + end end
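A minimal usage sketch (not part of the diff above) showing how the options introduced in this patch fit together for a type-1/type-2 transform pair with `gpu_method = :shared_memory`. It assumes a CUDA device (`CUDABackend()` from CUDA.jl) and that `adapt(backend, ::Array)` copies data to the device, as done in `test/pseudo_gpu.jl`; the grid dimensions and number of points are purely illustrative.

```julia
using NonuniformFFTs
using StaticArrays: SVector
using KernelAbstractions: KernelAbstractions as KA
using Adapt: adapt
using CUDA  # assumed GPU backend; any KernelAbstractions backend should work similarly

T = ComplexF64
dims = (256, 256)
backend = CUDABackend()

# Plan using the new shared-memory kernels; gpu_batch_size sets the number of
# non-uniform points processed per batch in type-1 (spreading) kernels.
p = PlanNUFFT(T, dims; backend, gpu_method = :shared_memory, gpu_batch_size = Val(16))

# Non-uniform points in [0, 2π)² and values at those points, generated on the CPU
# and then transferred to the device.
Np = 10_000
xp = adapt(backend, [rand(SVector{2, Float64}) * 2π for _ in 1:Np])
vp = adapt(backend, randn(T, Np))

set_points!(p, xp)

us = KA.allocate(backend, eltype(p), size(p))  # uniform data (Fourier coefficients)
exec_type1!(us, p, vp)  # type-1: non-uniform → uniform
wp = similar(vp)
exec_type2!(wp, p, us)  # type-2: uniform → non-uniform
```

With the default `gpu_method = :global_memory` the same code applies unchanged; only the spreading and interpolation kernels differ, and `gpu_batch_size` is only used by the shared-memory method.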