Automatically determine batch size in shared-memory GPU transforms #41

Merged: 5 commits (Oct 28, 2024)
15 changes: 13 additions & 2 deletions ext/NonuniformFFTsAMDGPUExt.jl
@@ -18,7 +18,18 @@
# than the BackwardsKaiserBesselKernel.
NonuniformFFTs.default_kernel(::ROCBackend) = KaiserBesselKernel()

NonuniformFFTs.available_static_shared_memory(::ROCBackend) =
AMDGPU.HIP.attribute(AMDGPU.device(), AMDGPU.HIP.hipDeviceAttributeMaxSharedMemoryPerBlock)
# We want the result of this function to be a compile-time constant to avoid some type
# instabilities, which is why we hardcode the result even though it could be obtained using
# the AMDGPU API.
# On AMDGPU, the static shared memory size (called LDS = Local Data Share) is usually 64 KiB.
# This is the case in particular for all current AMD Instinct GPUs/APUs (at least up to the
# CDNA3 architecture, i.e. MI300* models).
# See https://rocm.docs.amd.com/en/latest/reference/gpu-arch-specs.html
function NonuniformFFTs.available_static_shared_memory(::ROCBackend)
expected = Int32(64) << 10 # 64 KiB
actual = AMDGPU.HIP.attribute(AMDGPU.device(), AMDGPU.HIP.hipDeviceAttributeMaxSharedMemoryPerBlock)
expected == actual || @warn(lazy"AMDGPU device reports non-standard shared memory size: $actual bytes")
expected

end

end
12 changes: 9 additions & 3 deletions ext/NonuniformFFTsCUDAExt.jl
@@ -18,8 +18,14 @@
# The difference is not huge though.
NonuniformFFTs.default_kernel(::CUDABackend) = KaiserBesselKernel()

# This generally returns 48KiB.
NonuniformFFTs.available_static_shared_memory(::CUDABackend) =
CUDA.attribute(device(), CUDA.DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK)
# We want the result of this function to be a compile-time constant to avoid some type
# instabilities, which is why we hardcode the result even though it could be obtained using
# the CUDA API.
function NonuniformFFTs.available_static_shared_memory(::CUDABackend)
expected = Int32(48) << 10 # 48 KiB
actual = CUDA.attribute(device(), CUDA.DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK) # generally returns 48 KiB
expected == actual || @warn(lazy"CUDA device reports non-standard shared memory size: $actual bytes")
expected

end

end
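As a quick sanity check of the two extensions above (a sketch only, not part of this PR; it assumes a CUDA device and uses only the functions shown in the diff), one can verify that the hardcoded value agrees with the device attribute and that the call is type-stable. The same idea applies to the AMDGPU extension with `AMDGPU.ROCBackend()`.

using CUDA, Test
using NonuniformFFTs

backend = CUDABackend()
shmem = NonuniformFFTs.available_static_shared_memory(backend)  # hardcoded 48 KiB
@test shmem == 48 << 10
@test shmem == CUDA.attribute(device(), CUDA.DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK)  # true on most devices
Test.@inferred NonuniformFFTs.available_static_shared_memory(backend)  # result inferred as Int32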
15 changes: 12 additions & 3 deletions src/blocking.jl
@@ -111,19 +111,28 @@ function BlockDataGPU(
backend::KA.Backend, block_dims::Dims{D}, Ñs::Dims{D}, h::HalfSupport{M},
sort_points::StaticBool;
method::Symbol,
batch_size::Val = Val(16), # batch size (in number of non-uniform points) used in spreading if gpu_method == :shared_memory
batch_size::Val = Val(DEFAULT_GPU_BATCH_SIZE), # batch size (in number of non-uniform points) used in spreading if gpu_method == :shared_memory
) where {Z <: Number, D, M}
T = real(Z) # in case Z is complex
if method === :shared_memory
# Override input block size. We try to maximise the use of shared memory.
# Show warning if the determined block size is too small.
block_dims = block_dims_gpu_shmem(backend, Z, Ñs, h, batch_size; warn = true)
block_dims, Np = block_dims_gpu_shmem(backend, Z, Ñs, h, batch_size; warn = true)
Np_in = get_batch_size(batch_size)
@assert Np_in == DEFAULT_GPU_BATCH_SIZE || Np_in ≤ Np # the actual Np may be larger than the input one (to maximise shared memory usage)
else
# This is just for type stability: the type of `batch_size` below should be a
# compile-time constant.
_, Np = block_dims_gpu_shmem(backend, Z, Ñs, h, batch_size; warn = false)
end
nblocks_per_dir = map(cld, Ñs, block_dims) # basically equal to ceil(Ñ / block_dim)
L = T(2) * π # domain period
Δxs = map(N -> L / N, Ñs) # grid step (oversampled grid)
cumulative_npoints_per_block = KA.allocate(backend, Int, prod(nblocks_per_dir) + 1)
BlockDataGPU(method, Δxs, nblocks_per_dir, block_dims, cumulative_npoints_per_block, sort_points; batch_size)
BlockDataGPU(
method, Δxs, nblocks_per_dir, block_dims, cumulative_npoints_per_block, sort_points;
batch_size = Val(Np),
)
end

get_pointperm(bd::BlockDataGPU) = bd.pointperm
98 changes: 63 additions & 35 deletions src/gpu_common.jl
@@ -1,50 +1,61 @@
# Group size used in shared-memory implementation of spreading and interpolation.
# Input variables are not currently used (except for the dimension D).
function groupsize_shmem(ngroups::NTuple{D}, shmem_size::NTuple{D}, Np) where {D}
# (1) Determine the total number of threads.
# Not sure if it's worth it to define as a function of the inputs (currently unused).
# From tests, the value of 64 seems to be optimal in various situations.
groupsize = 64 # minimum group size should be equal to the warp size (usually 32 on CUDA and 64 on AMDGPU)
# (2) Determine number of threads in each direction.
# We don't really care about the Cartesian distribution of threads, since we always
# parallelise over linear indices.
gsizes = ntuple(_ -> 1, Val(D))
Base.setindex(gsizes, groupsize, 1) # = (groupsize, 1, 1, ...)
end

# Total amount of shared memory available (in bytes).
# This might be overridden in package extensions for specific backends.
available_static_shared_memory(backend::KA.Backend) = 48 << 10 # 48 KiB (usual in CUDA)
available_static_shared_memory(backend::KA.Backend) = Int32(48) << 10 # 48 KiB (usual in CUDA)

# Minimum size of a batch (in number of non-uniform points) used in the shared-memory
# implementation of GPU spreading.
const DEFAULT_GPU_BATCH_SIZE = 16

# Return ndrange parameter to be passed to KA kernels defining shared memory implementations.
gpu_shmem_ndrange_from_groupsize(groupsize::Integer, ngroups::Tuple) =
Base.setindex(ngroups, groupsize * ngroups[1], 1) # ngroups[1] -> ngroups[1] * groupsize
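# Example (illustration only, not part of this diff): with 64 threads per workgroup and
# (4, 4, 4) workgroups, the linear thread index spans the first direction of each group:
#   gpu_shmem_ndrange_from_groupsize(64, (4, 4, 4)) == (256, 4, 4)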

# Determine block size if using the shared-memory implementation.
# We try to make sure that the total block size (including 2M - 1 ghost cells in each direction)
# is not larger than the available shared memory. In CUDA the limit is usually 48 KiB.
# Note that the result is a compile-time constant (on Julia 1.11.1 at least).
# For this to be true, the available_static_shared_memory function should also return a
# compile-time constant (see CUDA and AMDGPU extensions for details).
@inline function block_dims_gpu_shmem(
backend, ::Type{Z}, ::Dims{D}, ::HalfSupport{M}, ::Val{Np};
backend, ::Type{Z}, ::Dims{D}, ::HalfSupport{M}, ::Val{Np_min} = Val(DEFAULT_GPU_BATCH_SIZE);
warn = false,
) where {Z <: Number, D, M, Np}
) where {Z <: Number, D, M, Np_min}
T = real(Z)
# These are extra shared-memory needs in spreading kernel (see spread_from_points_shmem_kernel!).
# Here Np is the batch size (gpu_batch_size parameter).
base_shmem_required = sizeof(T) * (
2M * D * Np + # window_vals
D * Np + # points_sm
D * Np # inds_start
) +
sizeof(Z) * (
Np # vp_sm
) +
sizeof(Int) * (
3 + # buf_sm
D # ishifts_sm
max_shmem = available_static_shared_memory(backend) # hopefully this should be a compile-time constant!

# We try to maximise use of the available shared memory. We split the total capacity as:
#
# max_shmem ≥ const_shmem + shmem_for_local_grid + Np * shmem_per_point
#
# where const_shmem and shmem_per_point are defined based on variables defined in
# spread_from_points_shmem_kernel! (spreading/gpu.jl).

# (1) Constant shared memory requirement (independent of Np or local grid size)
# See spread_from_points_shmem_kernel! for the meaning of each variable to the right of
# each value.
const_shmem = sizeof(Int) * (
+ 3 # buf_sm
+ D # ishifts_sm
) + 128 # extra 128 bytes for safety (CUDA seems to use slightly more shared memory than what we estimate, maybe due to memory alignment?)

# (2) Shared memory required per point in a batch
shmem_per_point = sizeof(T) * D * (
+ 2M # window_vals
+ 1 # points_sm
) + (
sizeof(Int) * D # inds_start
) + (
sizeof(Z) # vp_sm
)
max_shmem = available_static_shared_memory(backend)
max_shmem_blocks = max_shmem - base_shmem_required # maximum shared-memory for block data (local grid)
max_block_length = max_shmem_blocks ÷ sizeof(Z) # maximum number of elements in a block
m = floor(Int, max_block_length^(1/D)) # block size in each direction (including ghost cells / padding)

# (3) Determine local grid size based on above values
max_shmem_localgrid = max_shmem - const_shmem - Np_min * shmem_per_point # maximum shared memory for local grid (single block)
max_localgrid_length = max_shmem_localgrid ÷ sizeof(Z) # maximum number of elements in a block
m = floor(Int, _invpow(max_localgrid_length, Val(D))) # local grid size in each direction (including ghost cells / padding)
n = m - (2M - 1) # exclude ghost cells
block_dims = ntuple(_ -> n, Val(D)) # = (n, n, ...)

if n ≤ 0
throw(ArgumentError(
lazy"""
@@ -55,7 +66,6 @@
- spreading batch size: Np = $Np
If possible, reduce some of these parameters, or else switch to gpu_method = :global_memory."""
))
# TODO: warning if n is too small? (likely slower than global_memory method)
elseif warn && n < 4 # the n < 4 limit is completely empirical
@warn lazy"""
GPU shared memory size might be too small for the chosen problem.
@@ -68,9 +78,27 @@
This gives blocks of dimensions $block_dims (not including 2M - 1 = $(2M - 1) ghost cells in each direction).
If possible, reduce some of these parameters, or else switch to gpu_method = :global_memory."""
end
block_dims

# (4) Now determine Np according to the actual remaining shared memory.
# Note that, if Np was passed as input, we may increase it to maximise memory usage.
shmem_left = max_shmem - const_shmem - sizeof(Z) * m^D
Np_actual = shmem_left ÷ shmem_per_point
@assert Np_actual ≥ Np_min

# Actual memory requirement
# shmem_for_local_grid = m^D * sizeof(Z)
# required_shmem = const_shmem + shmem_for_local_grid + Np_actual * shmem_per_point
# @show required_shmem max_shmem

block_dims, Np_actual
end
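# Worked example (illustration only, not part of this diff): for Z = ComplexF64 (16 bytes),
# D = 3, M = 4 and the default Np_min = 16, with max_shmem = 48 KiB = 49152 bytes:
#   const_shmem         = 8 * (3 + 3) + 128            = 176 bytes
#   shmem_per_point     = 8 * 3 * (8 + 1) + 8 * 3 + 16 = 256 bytes
#   max_shmem_localgrid = 49152 - 176 - 16 * 256       = 44880 bytes → 2805 complex elements
#   m = floor(cbrt(2805)) = 14, hence n = 14 - 7 = 7 and block_dims = (7, 7, 7)
#   shmem_left = 49152 - 176 - 16 * 14^3 = 5072 bytes, hence Np_actual = 5072 ÷ 256 = 19
# i.e. the batch size is increased from 16 to 19 points to use the leftover shared memory.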

# Compute x^(1/D)
_invpow(x, ::Val{1}) = x
_invpow(x, ::Val{2}) = sqrt(x)

_invpow(x, ::Val{3}) = cbrt(x)
_invpow(x, ::Val{4}) = sqrt(sqrt(x))
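# e.g. _invpow(4096, Val(3)) == 16.0 and _invpow(4096, Val(4)) == 8.0 (up to floating-point accuracy)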

# This is called from the global memory (naive) implementation of spreading and interpolation kernels.
@inline function get_inds_vals_gpu(gs::NTuple{D}, evalmode::EvaluationMode, points::NTuple{D}, Ns::NTuple{D}, j::Integer) where {D}
ntuple(Val(D)) do n
9 changes: 5 additions & 4 deletions src/interpolation/gpu.jl
@@ -83,14 +83,15 @@ function interpolate!(
Z = eltype(us[1])
M = Kernels.half_support(gs[1])
@assert all(g -> Kernels.half_support(g) === M, gs) # check that they're all equal
block_dims_val = block_dims_gpu_shmem(backend, Z, size(us[1]), HalfSupport(M), bd.batch_size) # this is usually a compile-time constant...
block_dims_val, batch_size_actual = block_dims_gpu_shmem(backend, Z, size(us[1]), HalfSupport(M), bd.batch_size) # this is usually a compile-time constant...
@assert Val(batch_size_actual) == bd.batch_size
block_dims = Val(block_dims_val) # ...which means this doesn't require a dynamic dispatch
@assert block_dims_val === bd.block_dims
let ngroups = bd.nblocks_per_dir # this is the required number of workgroups (number of blocks in CUDA)
block_dims_padded = @. block_dims_val + 2M - 1 # dimensions of shared memory array
shmem_size = block_dims_padded
groupsize = groupsize_shmem(ngroups, block_dims_padded, length(x⃗s))
ndrange = groupsize .* ngroups
groupsize = 64
ndrange = gpu_shmem_ndrange_from_groupsize(groupsize, ngroups)
kernel! = interpolate_to_points_shmem_kernel!(backend, groupsize, ndrange)
kernel!(
vp_sorted, gs, evalmode, xs_comp, us, pointperm_, bd.cumulative_npoints_per_block,
@@ -209,7 +210,7 @@ end
) where {C, D, Z <: Number, block_dims, shmem_size}

@uniform begin
groupsize = @groupsize()::Dims{D}
groupsize = @groupsize()
nthreads = prod(groupsize)
end

46 changes: 25 additions & 21 deletions src/plan.jl
@@ -89,7 +89,7 @@ the order of ``10^{-7}`` for `Float64` or `ComplexF64` data.

- `kernel::AbstractKernel = BackwardsKaiserBesselKernel()`: convolution kernel used for NUFFTs.

## Performance parameters
## Main performance parameters

- `kernel_evalmode`: method used for kernel evaluation.
The default is [`FastApproximation`](@ref) on CPU, which will attempt to use a fast
@@ -111,13 +111,6 @@ the order of ``10^{-7}`` for `Float64` or `ComplexF64` data.
Blocking / spatial sorting can be completely disabled by passing `block_size = nothing` (but this is
generally slower).

- `sort_points = False()`: whether to internally permute the order of the non-uniform points.
This can be enabled by passing `sort_points = True()`.
Ignored when `block_size = nothing` (which disables spatial sorting).
In this case, more time will be spent in [`set_points!`](@ref) and less time on the actual transforms.
This can improve performance if executing multiple transforms on the same non-uniform points.
Note that, even when enabled, this does not modify the `points` argument passed to `set_points!`.

- `gpu_method`: allows selecting between different implementations of
GPU transforms. Possible options are:

@@ -127,23 +120,34 @@ the order of ``10^{-7}`` for `Float64` or `ComplexF64` data.
* `:shared_memory`: copy data between global memory and shared memory (local
to each GPU workgroup) and perform most operations in the latter, which is faster and
can help avoid some atomic operations in type-1 transforms. We try to use as much shared
memory as is typically available on current GPUs (which is not much, typically 48 KiB on
CUDA). But still, this method can be much faster than the `:global_memory` and may
become the default in the future. Note that this method completely ignores the
`block_size` parameter, as the actual block size is adjusted to maximise shared memory
usage. The performance of type-1 transforms can be further adjusted using the
`gpu_batch_size` parameter described below.
memory as is typically available on current GPUs (usually 48 KiB on CUDA and 64 KiB on
AMDGPU). Still, this method can be much faster than `:global_memory` and may become the
default in the future. Note that this method completely ignores the `block_size`
parameter, as the actual block size is adjusted to maximise shared memory usage.

The default is `:global_memory` but this may change in the future.

- `gpu_batch_size = Val(16)`: batch size used in type-1 transforms when `gpu_method = :shared_memory`.
The idea is that, to avoid atomic operations on shared-memory arrays, we process
non-uniform points in batches of `Np` points (16 by default).
This can have a large impact on performance.
- `fftw_flags = FFTW.MEASURE`: parameters passed to the FFTW planner when `backend = CPU()`.

## Other parameters
## Other performance parameters

These are more advanced performance parameters which may disappear or whose behaviour may
change in the future.

- `fftw_flags = FFTW.MEASURE`: parameters passed to the FFTW planner (only used when `backend = CPU()`).
- `sort_points = False()`: whether to internally permute the order of the non-uniform points.
This can be enabled by passing `sort_points = True()`.
Ignored when `block_size = nothing` (which disables spatial sorting).
In this case, more time will be spent in [`set_points!`](@ref) and less time on the actual transforms.
This can improve performance if executing multiple transforms on the same non-uniform points.
Note that, even when enabled, this does not modify the `points` argument passed to `set_points!`.

- `gpu_batch_size = Val(Np)`: batch size used in type-1 transforms when `gpu_method = :shared_memory`.
The idea is that, to avoid inefficient atomic operations on shared-memory arrays, we process
non-uniform points in batches of `Np` points.
By default, `Np` is chosen so as to maximise shared memory usage within each GPU workgroup.
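
  For example, a plan using the shared-memory GPU method might be created as follows
  (hypothetical sketch, not part of this diff; it assumes a CUDA device and the public
  `PlanNUFFT` constructor, and the exact keyword defaults may differ):

    using CUDA, NonuniformFFTs
    Ns = (256, 256, 256)  # uniform grid dimensions
    plan = PlanNUFFT(ComplexF64, Ns; backend = CUDABackend(), gpu_method = :shared_memory)
    # Np is determined automatically; passing gpu_batch_size = Val(32) would instead
    # reserve shared memory for at least 32 points per batch.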

## Other parameters

- `fftshift = false`: determines the order of wavenumbers in uniform space.
If `false` (default), the same order used by FFTW is used, with positive wavenumbers first
@@ -362,7 +366,7 @@ function _PlanNUFFT(
block_size::Union{Integer, Dims{D}, Nothing} = default_block_size(Ns, backend),
synchronise::Bool = false,
gpu_method::Symbol = :global_memory,
gpu_batch_size::Val = Val(16), # currently only used in shared-memory GPU spreading
gpu_batch_size::Val = Val(DEFAULT_GPU_BATCH_SIZE), # currently only used in shared-memory GPU spreading
) where {T <: Number, D}
ks = init_wavenumbers(T, Ns)
# Determine dimensions of oversampled grid.
22 changes: 18 additions & 4 deletions src/spreading/gpu.jl
@@ -180,14 +180,15 @@
Z = eltype(us[1])
M = Kernels.half_support(gs[1])
@assert all(g -> Kernels.half_support(g) === M, gs) # check that they're all equal
block_dims_val = block_dims_gpu_shmem(backend, Z, size(us[1]), HalfSupport(M), bd.batch_size) # this is usually a compile-time constant...
block_dims_val, batch_size_actual = block_dims_gpu_shmem(backend, Z, size(us[1]), HalfSupport(M), bd.batch_size) # this is usually a compile-time constant...
@assert Val(batch_size_actual) == bd.batch_size
block_dims = Val(block_dims_val) # ...which means this doesn't require a dynamic dispatch
@assert block_dims_val === bd.block_dims
let ngroups = bd.nblocks_per_dir # this is the required number of workgroups (number of blocks in CUDA)
block_dims_padded = @. block_dims_val + 2M - 1
shmem_size = block_dims_padded # dimensions of big shared memory array
groupsize = groupsize_shmem(ngroups, block_dims_padded, length(x⃗s))
ndrange = groupsize .* ngroups
groupsize = groupsize_spreading_gpu_shmem(batch_size_actual)
ndrange = gpu_shmem_ndrange_from_groupsize(groupsize, ngroups)
kernel! = spread_from_points_shmem_kernel!(backend, groupsize, ndrange)
kernel!(
us_real, gs, evalmode, xs_comp, vp_sorted, pointperm_, bd.cumulative_npoints_per_block,
@@ -203,6 +204,18 @@
us
end

# Determine workgroupsize based on the batch size Np.
# Try to have between 64 and 256 threads, such that the number of threads is ideally larger
# than the batch size.
function groupsize_spreading_gpu_shmem(Np::Integer)
groupsize = 64
c = min(Np, 256)
while groupsize < c
groupsize += 32
end

groupsize
end
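# For illustration (not part of this diff), the resulting workgroup sizes for a few batch sizes:
#   groupsize_spreading_gpu_shmem(16)  == 64    (never below 64 threads)
#   groupsize_spreading_gpu_shmem(100) == 128   (smallest multiple of 32 that is ≥ 100)
#   groupsize_spreading_gpu_shmem(300) == 256   (capped at 256 threads)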

@kernel function spread_permute_kernel!(vp::NTuple{N}, @Const(vp_in::NTuple{N}), @Const(perm::AbstractVector)) where {N}
i = @index(Global, Linear)
j = @inbounds perm[i]
@@ -234,7 +247,7 @@
) where {C, D, T, Z, M, Np, block_dims, shmem_size}

@uniform begin
groupsize = @groupsize()::Dims{D}
groupsize = @groupsize()
nthreads = prod(groupsize)

# Determine grid dimensions.
@@ -254,6 +267,7 @@
u_local = @localmem(Z, shmem_size)
# @assert shmem_size == block_dims .+ (2M - 1)
@assert T <: Real

window_vals = @localmem(T, (2M, D, Np))
points_sm = @localmem(T, (D, Np)) # points copied to shared memory
inds_start = @localmem(Int, (D, Np))