Automatically determine batch size in shared-memory GPU transforms (#41)
* Hardcode shared memory limits on CUDA and AMDGPU

This is to ensure that block sizes are compile-time constants.

* Determine "optimal" batch size automatically

* Add inference tests

* Adapt spreading groupsize + other changes

* Minor changes
jipolanco authored Oct 28, 2024
1 parent 47e4d28 commit c926bbd
Showing 8 changed files with 155 additions and 74 deletions.
15 changes: 13 additions & 2 deletions ext/NonuniformFFTsAMDGPUExt.jl
@@ -18,7 +18,18 @@ using AMDGPU.Device: @device_override
# than the BackwardsKaiserBesselKernel.
NonuniformFFTs.default_kernel(::ROCBackend) = KaiserBesselKernel()

NonuniformFFTs.available_static_shared_memory(::ROCBackend) =
AMDGPU.HIP.attribute(AMDGPU.device(), AMDGPU.HIP.hipDeviceAttributeMaxSharedMemoryPerBlock)
# We want the result of this function to be a compile-time constant to avoid some type
# instabilities, which is why we hardcode the result even though it could be obtained using
# the AMDGPU API.
# On AMDGPU, the static shared memory size (called LDS = Local Data Share) is usually 64 KiB.
# This is the case in particular for all current AMD Instinct GPUs/APUs (up to at least the
# CDNA3 architecture, i.e. MI300* models).
# See https://rocm.docs.amd.com/en/latest/reference/gpu-arch-specs.html
function NonuniformFFTs.available_static_shared_memory(::ROCBackend)
expected = Int32(64) << 10 # 64 KiB
actual = AMDGPU.HIP.attribute(AMDGPU.device(), AMDGPU.HIP.hipDeviceAttributeMaxSharedMemoryPerBlock)
expected == actual || @warn(lazy"AMDGPU device reports non-standard shared memory size: $actual bytes")
expected
end
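# Usage sketch (illustrative; assumes a working ROCm device is available):
#   julia> NonuniformFFTs.available_static_shared_memory(ROCBackend())
#   65536   # bytes, i.e. the hardcoded 64 KiB; a warning is emitted if the device reports a different value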

end
12 changes: 9 additions & 3 deletions ext/NonuniformFFTsCUDAExt.jl
@@ -18,8 +18,14 @@ using CUDA: @device_override
# The difference is not huge though.
NonuniformFFTs.default_kernel(::CUDABackend) = KaiserBesselKernel()

# This generally returns 48KiB.
NonuniformFFTs.available_static_shared_memory(::CUDABackend) =
CUDA.attribute(device(), CUDA.DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK)
# We want the result of this function to be a compile-time constant to avoid some type
# instabilities, which is why we hardcode the result even though it could be obtained using
# the CUDA API.
function NonuniformFFTs.available_static_shared_memory(::CUDABackend)
expected = Int32(48) << 10 # 48 KiB
actual = CUDA.attribute(device(), CUDA.DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK) # generally returns 48KiB
expected == actual || @warn(lazy"CUDA device reports non-standard shared memory size: $actual bytes")
expected
end
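# Usage sketch (illustrative; assumes a CUDA device is available):
#   julia> NonuniformFFTs.available_static_shared_memory(CUDABackend())
#   49152   # bytes, i.e. the hardcoded 48 KiB; a warning is emitted if the device reports a different value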

end
15 changes: 12 additions & 3 deletions src/blocking.jl
@@ -111,19 +111,28 @@ function BlockDataGPU(
backend::KA.Backend, block_dims::Dims{D}, Ñs::Dims{D}, h::HalfSupport{M},
sort_points::StaticBool;
method::Symbol,
batch_size::Val = Val(16), # batch size (in number of non-uniform points) used in spreading if gpu_method == :shared_memory
batch_size::Val = Val(DEFAULT_GPU_BATCH_SIZE), # batch size (in number of non-uniform points) used in spreading if gpu_method == :shared_memory
) where {Z <: Number, D, M}
T = real(Z) # in case Z is complex
if method === :shared_memory
# Override input block size. We try to maximise the use of shared memory.
# Show warning if the determined block size is too small.
block_dims = block_dims_gpu_shmem(backend, Z, Ñs, h, batch_size; warn = true)
block_dims, Np = block_dims_gpu_shmem(backend, Z, Ñs, h, batch_size; warn = true)
Np_in = get_batch_size(batch_size)
@assert Np_in == DEFAULT_GPU_BATCH_SIZE || Np_in ≤ Np # the actual Np may be larger than the input one (to maximise shared memory usage)
else
# This is just for type stability: the type of `batch_size` below should be a
# compile-time constant.
_, Np = block_dims_gpu_shmem(backend, Z, Ñs, h, batch_size; warn = false)
end
nblocks_per_dir = map(cld, Ñs, block_dims) # basically equal to ceil(Ñ / block_dim)
L = T(2) * π # domain period
Δxs = map(N -> L / N, Ñs) # grid step (oversampled grid)
cumulative_npoints_per_block = KA.allocate(backend, Int, prod(nblocks_per_dir) + 1)
BlockDataGPU(method, Δxs, nblocks_per_dir, block_dims, cumulative_npoints_per_block, sort_points; batch_size)
BlockDataGPU(
method, Δxs, nblocks_per_dir, block_dims, cumulative_npoints_per_block, sort_points;
batch_size = Val(Np),
)
end
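# Sketch of an inference check in the spirit of the tests added in this commit (the backend,
# grid size and half support below are assumed values, not the actual test code):
#   using Test, KernelAbstractions
#   block_dims, Np = @inferred block_dims_gpu_shmem(CPU(), ComplexF64, (64, 64, 64), HalfSupport(4), Val(16))
# Both outputs should be inferable, so that `batch_size = Val(Np)` above stays a compile-time constant.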

get_pointperm(bd::BlockDataGPU) = bd.pointperm
98 changes: 63 additions & 35 deletions src/gpu_common.jl
@@ -1,50 +1,61 @@
# Group size used in shared-memory implementation of spreading and interpolation.
# Input variables are not currently used (except for the dimension D).
function groupsize_shmem(ngroups::NTuple{D}, shmem_size::NTuple{D}, Np) where {D}
# (1) Determine the total number of threads.
# Not sure if it's worth it to define as a function of the inputs (currently unused).
# From tests, the value of 64 seems to be optimal in various situations.
groupsize = 64 # minimum group size should be equal to the warp size (usually 32 on CUDA and 64 on AMDGPU)
# (2) Determine number of threads in each direction.
# We don't really care about the Cartesian distribution of threads, since we always
# parallelise over linear indices.
gsizes = ntuple(_ -> 1, Val(D))
Base.setindex(gsizes, groupsize, 1) # = (groupsize, 1, 1, ...)
end

# Total amount of shared memory available (in bytes).
# This might be overridden in package extensions for specific backends.
available_static_shared_memory(backend::KA.Backend) = 48 << 10 # 48 KiB (usual in CUDA)
available_static_shared_memory(backend::KA.Backend) = Int32(48) << 10 # 48 KiB (usual in CUDA)

# Minimum size of a batch (in number of non-uniform points) used in the shared-memory
# implementation of GPU spreading.
const DEFAULT_GPU_BATCH_SIZE = 16

# Return ndrange parameter to be passed to KA kernels defining shared memory implementations.
gpu_shmem_ndrange_from_groupsize(groupsize::Integer, ngroups::Tuple) =
Base.setindex(ngroups, groupsize * ngroups[1], 1) # ngroups[1] -> ngroups[1] * groupsize
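# Example (illustrative values): with 64 threads per workgroup and ngroups = (4, 4, 4),
# gpu_shmem_ndrange_from_groupsize(64, (4, 4, 4)) == (256, 4, 4).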

# Determine block size if using the shared-memory implementation.
# We try to make sure that the total block size (including 2M - 1 ghost cells in each direction)
# is not larger than the available shared memory. In CUDA the limit is usually 48 KiB.
# Note that the result is a compile-time constant (on Julia 1.11.1 at least).
# For this to be true, the available_static_shared_memory function should also return a
# compile-time constant (see CUDA and AMDGPU extensions for details).
@inline function block_dims_gpu_shmem(
backend, ::Type{Z}, ::Dims{D}, ::HalfSupport{M}, ::Val{Np};
backend, ::Type{Z}, ::Dims{D}, ::HalfSupport{M}, ::Val{Np_min} = Val(DEFAULT_GPU_BATCH_SIZE);
warn = false,
) where {Z <: Number, D, M, Np}
) where {Z <: Number, D, M, Np_min}
T = real(Z)
# These are extra shared-memory needs in spreading kernel (see spread_from_points_shmem_kernel!).
# Here Np is the batch size (gpu_batch_size parameter).
base_shmem_required = sizeof(T) * (
2M * D * Np + # window_vals
D * Np + # points_sm
D * Np # inds_start
) +
sizeof(Z) * (
Np # vp_sm
) +
sizeof(Int) * (
3 + # buf_sm
D # ishifts_sm
max_shmem = available_static_shared_memory(backend) # hopefully this should be a compile-time constant!

# We try to maximise use of the available shared memory. We split the total capacity as:
#
# max_shmem ≥ const_shmem + shmem_for_local_grid + Np * shmem_per_point
#
# where const_shmem and shmem_per_point are defined based on variables defined in
# spread_from_points_shmem_kernel! (spreading/gpu.jl).

# (1) Constant shared memory requirement (independent of Np or local grid size)
# See spread_from_points_shmem_kernel! for the meaning of each variable to the right of
# each value.
const_shmem = sizeof(Int) * (
+ 3 # buf_sm
+ D # ishifts_sm
) + 128 # extra 128 bytes for safety (CUDA seems to use slightly more shared memory than what we estimate, maybe due to memory alignment?)

# (2) Shared memory required per point in a batch
shmem_per_point = sizeof(T) * D * (
+ 2M # window_vals
+ 1 # points_sm
) + (
sizeof(Int) * D # inds_start
) + (
sizeof(Z) # vp_sm
)
max_shmem = available_static_shared_memory(backend)
max_shmem_blocks = max_shmem - base_shmem_required # maximum shared-memory for block data (local grid)
max_block_length = max_shmem_blocks ÷ sizeof(Z) # maximum number of elements in a block
m = floor(Int, max_block_length^(1/D)) # block size in each direction (including ghost cells / padding)

# (3) Determine local grid size based on above values
max_shmem_localgrid = max_shmem - const_shmem - Np_min * shmem_per_point # maximum shared memory for local grid (single block)
max_localgrid_length = max_shmem_localgrid ÷ sizeof(Z) # maximum number of elements in a block
m = floor(Int, _invpow(max_localgrid_length, Val(D))) # local grid size in each direction (including ghost cells / padding)
n = m - (2M - 1) # exclude ghost cells
block_dims = ntuple(_ -> n, Val(D)) # = (n, n, ...)

if n ≤ 0
throw(ArgumentError(
lazy"""
@@ -55,7 +66,6 @@ available_static_shared_memory(backend::KA.Backend) = 48 << 10 # 48 KiB (usual
- spreading batch size: Np = $Np
If possible, reduce some of these parameters, or else switch to gpu_method = :global_memory."""
))
# TODO: warning if n is too small? (likely slower than global_memory method)
elseif warn && n < 4 # the n < 4 limit is completely empirical
@warn lazy"""
GPU shared memory size might be too small for the chosen problem.
@@ -68,9 +78,27 @@ available_static_shared_memory(backend::KA.Backend) = 48 << 10 # 48 KiB (usual
This gives blocks of dimensions $block_dims (not including 2M - 1 = $(2M - 1) ghost cells in each direction).
If possible, reduce some of these parameters, or else switch to gpu_method = :global_memory."""
end
block_dims

# (4) Now determine Np according to the actual remaining shared memory.
# Note that, if Np was passed as input, we may increase it to maximise memory usage.
shmem_left = max_shmem - const_shmem - sizeof(Z) * m^D
Np_actual = shmem_left ÷ shmem_per_point
@assert Np_actual ≥ Np_min

# Actual memory requirement
# shmem_for_local_grid = m^D * sizeof(Z)
# required_shmem = const_shmem + shmem_for_local_grid + Np_actual * shmem_per_point
# @show required_shmem max_shmem

block_dims, Np_actual
end

# Compute x^(1/D)
_invpow(x, ::Val{1}) = x
_invpow(x, ::Val{2}) = sqrt(x)
_invpow(x, ::Val{3}) = cbrt(x)
_invpow(x, ::Val{4}) = sqrt(sqrt(x))
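# Worked example (illustrative values, assuming 64-bit integers): for Z = ComplexF64, D = 3,
# M = 4, Np_min = 16 and max_shmem = 48 KiB = 49152 bytes, the steps above give:
#   const_shmem         = 8 * (3 + 3) + 128 = 176 bytes
#   shmem_per_point     = 8 * 3 * (2 * 4 + 1) + 8 * 3 + 16 = 256 bytes
#   max_shmem_localgrid = 49152 - 176 - 16 * 256 = 44880 bytes  →  m = floor(∛(44880 ÷ 16)) = 14
#   n = 14 - (2 * 4 - 1) = 7  →  block_dims = (7, 7, 7)
#   shmem_left = 49152 - 176 - 16 * 14^3 = 5072 bytes  →  Np_actual = 5072 ÷ 256 = 19 ≥ Np_min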

# This is called from the global memory (naive) implementation of spreading and interpolation kernels.
@inline function get_inds_vals_gpu(gs::NTuple{D}, evalmode::EvaluationMode, points::NTuple{D}, Ns::NTuple{D}, j::Integer) where {D}
ntuple(Val(D)) do n
9 changes: 5 additions & 4 deletions src/interpolation/gpu.jl
@@ -83,14 +83,15 @@ function interpolate!(
Z = eltype(us[1])
M = Kernels.half_support(gs[1])
@assert all(g -> Kernels.half_support(g) === M, gs) # check that they're all equal
block_dims_val = block_dims_gpu_shmem(backend, Z, size(us[1]), HalfSupport(M), bd.batch_size) # this is usually a compile-time constant...
block_dims_val, batch_size_actual = block_dims_gpu_shmem(backend, Z, size(us[1]), HalfSupport(M), bd.batch_size) # this is usually a compile-time constant...
@assert Val(batch_size_actual) == bd.batch_size
block_dims = Val(block_dims_val) # ...which means this doesn't require a dynamic dispatch
@assert block_dims_val === bd.block_dims
let ngroups = bd.nblocks_per_dir # this is the required number of workgroups (number of blocks in CUDA)
block_dims_padded = @. block_dims_val + 2M - 1 # dimensions of shared memory array
shmem_size = block_dims_padded
groupsize = groupsize_shmem(ngroups, block_dims_padded, length(x⃗s))
ndrange = groupsize .* ngroups
groupsize = 64
ndrange = gpu_shmem_ndrange_from_groupsize(groupsize, ngroups)
kernel! = interpolate_to_points_shmem_kernel!(backend, groupsize, ndrange)
kernel!(
vp_sorted, gs, evalmode, xs_comp, us, pointperm_, bd.cumulative_npoints_per_block,
@@ -209,7 +210,7 @@ end
) where {C, D, Z <: Number, block_dims, shmem_size}

@uniform begin
groupsize = @groupsize()::Dims{D}
groupsize = @groupsize()
nthreads = prod(groupsize)
end

46 changes: 25 additions & 21 deletions src/plan.jl
@@ -89,7 +89,7 @@ the order of ``10^{-7}`` for `Float64` or `ComplexF64` data.
- `kernel::AbstractKernel = BackwardsKaiserBesselKernel()`: convolution kernel used for NUFFTs.
## Performance parameters
## Main performance parameters
- `kernel_evalmode`: method used for kernel evaluation.
The default is [`FastApproximation`](@ref) on CPU, which will attempt to use a fast
@@ -111,13 +111,6 @@ the order of ``10^{-7}`` for `Float64` or `ComplexF64` data.
Blocking / spatial sorting can be completely disabled by passing `block_size = nothing` (but this is
generally slower).
- `sort_points = False()`: whether to internally permute the order of the non-uniform points.
This can be enabled by passing `sort_points = True()`.
Ignored when `block_size = nothing` (which disables spatial sorting).
In this case, more time will be spent in [`set_points!`](@ref) and less time on the actual transforms.
This can improve performance if executing multiple transforms on the same non-uniform points.
Note that, even when enabled, this does not modify the `points` argument passed to `set_points!`.
- `gpu_method`: allows selecting between different implementations of
GPU transforms. Possible options are:
@@ -127,23 +120,34 @@ the order of ``10^{-7}`` for `Float64` or `ComplexF64` data.
* `:shared_memory`: copy data between global memory and shared memory (local
to each GPU workgroup) and perform most operations in the latter, which is faster and
can help avoid some atomic operations in type-1 transforms. We try to use as much shared
memory as is typically available on current GPUs (which is not much, typically 48 KiB on
CUDA). But still, this method can be much faster than the `:global_memory` and may
become the default in the future. Note that this method completely ignores the
`block_size` parameter, as the actual block size is adjusted to maximise shared memory
usage. The performance of type-1 transforms can be further adjusted using the
`gpu_batch_size` parameter described below.
memory as is available on current GPUs (usually 48 KiB on CUDA and 64 KiB on AMDGPU).
Despite this limited capacity, this method can be much faster than `:global_memory`
and may become the default in the future. Note that this method
completely ignores the `block_size` parameter, as the actual block size is adjusted to
maximise shared memory usage.
The default is `:global_memory` but this may change in the future.
- `gpu_batch_size = Val(16)`: batch size used in type-1 transforms when `gpu_method = :shared_memory`.
The idea is that, to avoid atomic operations on shared-memory arrays, we process
non-uniform points in batches of `Np` points (16 by default).
This can have a large impact on performance.
- `fftw_flags = FFTW.MEASURE`: parameters passed to the FFTW planner when `backend = CPU()`.
## Other parameters
## Other performance parameters
These are more advanced performance parameters which may disappear or whose behaviour may
change in the future.
- `fftw_flags = FFTW.MEASURE`: parameters passed to the FFTW planner (only used when `backend = CPU()`).
- `sort_points = False()`: whether to internally permute the order of the non-uniform points.
This can be enabled by passing `sort_points = True()`, in which case more time will be
spent in [`set_points!`](@ref) and less time on the actual transforms.
This can improve performance if executing multiple transforms on the same non-uniform points.
It is ignored when `block_size = nothing` (which disables spatial sorting).
Note that, even when enabled, this does not modify the `points` argument passed to `set_points!`.
- `gpu_batch_size = Val(Np)`: batch size used in type-1 transforms when `gpu_method = :shared_memory`.
The idea is that, to avoid inefficient atomic operations on shared-memory arrays, we process
non-uniform points in batches of `Np` points.
By default, `Np` is chosen so as to maximise shared memory usage within each GPU workgroup.
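For example (illustrative sketch; assumes a CUDA device):
    PlanNUFFT(ComplexF64, (256, 256, 256); backend = CUDABackend(), gpu_method = :shared_memory)  # Np chosen automatically
    PlanNUFFT(ComplexF64, (256, 256, 256); backend = CUDABackend(), gpu_method = :shared_memory, gpu_batch_size = Val(32))
An explicitly passed value may still be increased internally if that allows better use of
the remaining shared memory.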
## Other parameters
- `fftshift = false`: determines the order of wavenumbers in uniform space.
If `false` (default), the same order used by FFTW is used, with positive wavenumbers first
@@ -362,7 +366,7 @@ function _PlanNUFFT(
block_size::Union{Integer, Dims{D}, Nothing} = default_block_size(Ns, backend),
synchronise::Bool = false,
gpu_method::Symbol = :global_memory,
gpu_batch_size::Val = Val(16), # currently only used in shared-memory GPU spreading
gpu_batch_size::Val = Val(DEFAULT_GPU_BATCH_SIZE), # currently only used in shared-memory GPU spreading
) where {T <: Number, D}
ks = init_wavenumbers(T, Ns)
# Determine dimensions of oversampled grid.
22 changes: 18 additions & 4 deletions src/spreading/gpu.jl
@@ -180,14 +180,15 @@ function spread_from_points!(
Z = eltype(us[1])
M = Kernels.half_support(gs[1])
@assert all(g -> Kernels.half_support(g) === M, gs) # check that they're all equal
block_dims_val = block_dims_gpu_shmem(backend, Z, size(us[1]), HalfSupport(M), bd.batch_size) # this is usually a compile-time constant...
block_dims_val, batch_size_actual = block_dims_gpu_shmem(backend, Z, size(us[1]), HalfSupport(M), bd.batch_size) # this is usually a compile-time constant...
@assert Val(batch_size_actual) == bd.batch_size
block_dims = Val(block_dims_val) # ...which means this doesn't require a dynamic dispatch
@assert block_dims_val === bd.block_dims
let ngroups = bd.nblocks_per_dir # this is the required number of workgroups (number of blocks in CUDA)
block_dims_padded = @. block_dims_val + 2M - 1
shmem_size = block_dims_padded # dimensions of big shared memory array
groupsize = groupsize_shmem(ngroups, block_dims_padded, length(x⃗s))
ndrange = groupsize .* ngroups
groupsize = groupsize_spreading_gpu_shmem(batch_size_actual)
ndrange = gpu_shmem_ndrange_from_groupsize(groupsize, ngroups)
kernel! = spread_from_points_shmem_kernel!(backend, groupsize, ndrange)
kernel!(
us_real, gs, evalmode, xs_comp, vp_sorted, pointperm_, bd.cumulative_npoints_per_block,
@@ -203,6 +204,18 @@
us
end

# Determine the workgroup size based on the batch size Np.
# We try to use between 64 and 256 threads, with the number of threads ideally at least
# as large as the batch size.
function groupsize_spreading_gpu_shmem(Np::Integer)
groupsize = 64
c = min(Np, 256)
while groupsize < c
groupsize += 32
end
groupsize
end
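# Example (illustrative): groupsize_spreading_gpu_shmem(16) == 64,
# groupsize_spreading_gpu_shmem(100) == 128 and groupsize_spreading_gpu_shmem(1000) == 256.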

@kernel function spread_permute_kernel!(vp::NTuple{N}, @Const(vp_in::NTuple{N}), @Const(perm::AbstractVector)) where {N}
i = @index(Global, Linear)
j = @inbounds perm[i]
@@ -234,7 +247,7 @@ end
) where {C, D, T, Z, M, Np, block_dims, shmem_size}

@uniform begin
groupsize = @groupsize()::Dims{D}
groupsize = @groupsize()
nthreads = prod(groupsize)

# Determine grid dimensions.
@@ -254,6 +267,7 @@
u_local = @localmem(Z, shmem_size)
# @assert shmem_size == block_dims .+ (2M - 1)
@assert T <: Real

window_vals = @localmem(T, (2M, D, Np))
points_sm = @localmem(T, (D, Np)) # points copied to shared memory
inds_start = @localmem(Int, (D, Np))
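# Together with the per-point vp_sm buffer (not shown in this excerpt), these arrays account
# for the `shmem_per_point` term used in block_dims_gpu_shmem:
# sizeof(T) * D * (2M + 1) + sizeof(Int) * D + sizeof(Z) bytes per point of a batch.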