Automatically determine batch size in shared-memory GPU transforms #41

Merged: 5 commits (Oct 28, 2024)
15 changes: 13 additions & 2 deletions ext/NonuniformFFTsAMDGPUExt.jl
@@ -18,7 +18,18 @@
# than the BackwardsKaiserBesselKernel.
NonuniformFFTs.default_kernel(::ROCBackend) = KaiserBesselKernel()

NonuniformFFTs.available_static_shared_memory(::ROCBackend) =
AMDGPU.HIP.attribute(AMDGPU.device(), AMDGPU.HIP.hipDeviceAttributeMaxSharedMemoryPerBlock)
# We want the result of this function to be a compile-time constant to avoid some type
# instabilities, which is why we hardcode the result even though it could be obtained using
# the AMDGPU API.
# On AMDGPU, the static shared memory size (called LDS = Local Data Share) is usually 64 KiB.
# This is the case in particular for all current AMD Instinct GPUs/APUs (at least up to the
# CDNA3 architecture, i.e. MI300* models).
# See https://rocm.docs.amd.com/en/latest/reference/gpu-arch-specs.html
function NonuniformFFTs.available_static_shared_memory(::ROCBackend)
expected = Int32(64) << 10 # 64 KiB
actual = AMDGPU.HIP.attribute(AMDGPU.device(), AMDGPU.HIP.hipDeviceAttributeMaxSharedMemoryPerBlock)
expected == actual || @warn(lazy"AMDGPU device reports non-standard shared memory size: $actual bytes")
expected

end

end
12 changes: 9 additions & 3 deletions ext/NonuniformFFTsCUDAExt.jl
@@ -18,8 +18,14 @@
# The difference is not huge though.
NonuniformFFTs.default_kernel(::CUDABackend) = KaiserBesselKernel()

# This generally returns 48KiB.
NonuniformFFTs.available_static_shared_memory(::CUDABackend) =
CUDA.attribute(device(), CUDA.DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK)
# We want the result of this function to be a compile-time constant to avoid some type
# instabilities, which is why we hardcode the result even though it could be obtained using
# the CUDA API.
function NonuniformFFTs.available_static_shared_memory(::CUDABackend)
expected = Int32(48) << 10 # 48 KiB
actual = CUDA.attribute(device(), CUDA.DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK) # generally returns 48 KiB
expected == actual || @warn(lazy"CUDA device reports non-standard shared memory size: $actual bytes")
expected

end

end
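As a quick sanity check of the two extensions above (a sketch only, not part of this PR; it assumes a CUDA device and uses only the functions shown in the diff), one can verify that the hardcoded value agrees with the device attribute and that the call is type-stable. The same idea applies to the AMDGPU extension with `AMDGPU.ROCBackend()`.

using CUDA, Test
using NonuniformFFTs

backend = CUDABackend()
shmem = NonuniformFFTs.available_static_shared_memory(backend)  # hardcoded 48 KiB
@test shmem == 48 << 10
@test shmem == CUDA.attribute(device(), CUDA.DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK)  # true on most devices
Test.@inferred NonuniformFFTs.available_static_shared_memory(backend)  # result inferred as Int32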
15 changes: 12 additions & 3 deletions src/blocking.jl
@@ -111,19 +111,28 @@ function BlockDataGPU(
backend::KA.Backend, block_dims::Dims{D}, Ñs::Dims{D}, h::HalfSupport{M},
sort_points::StaticBool;
method::Symbol,
batch_size::Val = Val(16), # batch size (in number of non-uniform points) used in spreading if gpu_method == :shared_memory
batch_size::Val = Val(DEFAULT_GPU_BATCH_SIZE), # batch size (in number of non-uniform points) used in spreading if gpu_method == :shared_memory
) where {Z <: Number, D, M}
T = real(Z) # in case Z is complex
if method === :shared_memory
# Override input block size. We try to maximise the use of shared memory.
# Show warning if the determined block size is too small.
block_dims = block_dims_gpu_shmem(backend, Z, Ñs, h, batch_size; warn = true)
block_dims, Np = block_dims_gpu_shmem(backend, Z, Ñs, h, batch_size; warn = true)
Np_in = get_batch_size(batch_size)
@assert Np_in == DEFAULT_GPU_BATCH_SIZE || Np_in ≤ Np # the actual Np may be larger than the input one (to maximise shared memory usage)
else
# This is just for type stability: the type of `batch_size` below should be a
# compile-time constant.
_, Np = block_dims_gpu_shmem(backend, Z, Ñs, h, batch_size; warn = false)
end
nblocks_per_dir = map(cld, Ñs, block_dims) # basically equal to ceil(Ñ / block_dim)
L = T(2) * π # domain period
Δxs = map(N -> L / N, Ñs) # grid step (oversampled grid)
cumulative_npoints_per_block = KA.allocate(backend, Int, prod(nblocks_per_dir) + 1)
BlockDataGPU(method, Δxs, nblocks_per_dir, block_dims, cumulative_npoints_per_block, sort_points; batch_size)
BlockDataGPU(
method, Δxs, nblocks_per_dir, block_dims, cumulative_npoints_per_block, sort_points;
batch_size = Val(Np),
)
end

get_pointperm(bd::BlockDataGPU) = bd.pointperm
98 changes: 63 additions & 35 deletions src/gpu_common.jl
@@ -1,50 +1,61 @@
# Group size used in shared-memory implementation of spreading and interpolation.
# Input variables are not currently used (except for the dimension D).
function groupsize_shmem(ngroups::NTuple{D}, shmem_size::NTuple{D}, Np) where {D}
# (1) Determine the total number of threads.
# Not sure if it's worth it to define as a function of the inputs (currently unused).
# From tests, the value of 64 seems to be optimal in various situations.
groupsize = 64 # minimum group size should be equal to the warp size (usually 32 on CUDA and 64 on AMDGPU)
# (2) Determine number of threads in each direction.
# We don't really care about the Cartesian distribution of threads, since we always
# parallelise over linear indices.
gsizes = ntuple(_ -> 1, Val(D))
Base.setindex(gsizes, groupsize, 1) # = (groupsize, 1, 1, ...)
end

# Total amount of shared memory available (in bytes).
# This might be overridden in package extensions for specific backends.
available_static_shared_memory(backend::KA.Backend) = 48 << 10 # 48 KiB (usual in CUDA)
available_static_shared_memory(backend::KA.Backend) = Int32(48) << 10 # 48 KiB (usual in CUDA)

# Minimum size of a batch (in number of non-uniform points) used in the shared-memory
# implementation of GPU spreading.
const DEFAULT_GPU_BATCH_SIZE = 16

# Return ndrange parameter to be passed to KA kernels defining shared memory implementations.
gpu_shmem_ndrange_from_groupsize(groupsize::Integer, ngroups::Tuple) =
Base.setindex(ngroups, groupsize * ngroups[1], 1) # ngroups[1] -> ngroups[1] * groupsize
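# Example (illustration only, not part of this diff): with 64 threads per workgroup and
# (4, 4, 4) workgroups, the linear thread index spans the first direction of each group:
#   gpu_shmem_ndrange_from_groupsize(64, (4, 4, 4)) == (256, 4, 4)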

# Determine block size if using the shared-memory implementation.
# We try to make sure that the total block size (including 2M - 1 ghost cells in each direction)
# is not larger than the available shared memory. In CUDA the limit is usually 48 KiB.
# Note that the result is a compile-time constant (on Julia 1.11.1 at least).
# For this to be true, the available_static_shared_memory function should also return a
# compile-time constant (see CUDA and AMDGPU extensions for details).
@inline function block_dims_gpu_shmem(
backend, ::Type{Z}, ::Dims{D}, ::HalfSupport{M}, ::Val{Np};
backend, ::Type{Z}, ::Dims{D}, ::HalfSupport{M}, ::Val{Np_min} = Val(DEFAULT_GPU_BATCH_SIZE);
warn = false,
) where {Z <: Number, D, M, Np}
) where {Z <: Number, D, M, Np_min}
T = real(Z)
# These are extra shared-memory needs in spreading kernel (see spread_from_points_shmem_kernel!).
# Here Np is the batch size (gpu_batch_size parameter).
base_shmem_required = sizeof(T) * (
2M * D * Np + # window_vals
D * Np + # points_sm
D * Np # inds_start
) +
sizeof(Z) * (
Np # vp_sm
) +
sizeof(Int) * (
3 + # buf_sm
D # ishifts_sm
max_shmem = available_static_shared_memory(backend) # hopefully this should be a compile-time constant!

# We try to maximise use of the available shared memory. We split the total capacity as:
#
# max_shmem ≥ const_shmem + shmem_for_local_grid + Np * shmem_per_point
#
# where const_shmem and shmem_per_point are defined based on variables defined in
# spread_from_points_shmem_kernel! (spreading/gpu.jl).

# (1) Constant shared memory requirement (independent of Np or local grid size)
# See spread_from_points_shmem_kernel! for the meaning of each variable to the right of
# each value.
const_shmem = sizeof(Int) * (
+ 3 # buf_sm
+ D # ishifts_sm
) + 128 # extra 128 bytes for safety (CUDA seems to use slightly more shared memory than what we estimate, maybe due to memory alignment?)

# (2) Shared memory required per point in a batch
shmem_per_point = sizeof(T) * D * (
+ 2M # window_vals
+ 1 # points_sm
) + (
sizeof(Int) * D # inds_start
) + (
sizeof(Z) # vp_sm
)
max_shmem = available_static_shared_memory(backend)
max_shmem_blocks = max_shmem - base_shmem_required # maximum shared-memory for block data (local grid)
max_block_length = max_shmem_blocks ÷ sizeof(Z) # maximum number of elements in a block
m = floor(Int, max_block_length^(1/D)) # block size in each direction (including ghost cells / padding)

# (3) Determine local grid size based on above values
max_shmem_localgrid = max_shmem - const_shmem - Np_min * shmem_per_point # maximum shared memory for local grid (single block)
max_localgrid_length = max_shmem_localgrid ÷ sizeof(Z) # maximum number of elements in a block
m = floor(Int, _invpow(max_localgrid_length, Val(D))) # local grid size in each direction (including ghost cells / padding)
n = m - (2M - 1) # exclude ghost cells
block_dims = ntuple(_ -> n, Val(D)) # = (n, n, ...)

if n ≤ 0
throw(ArgumentError(
lazy"""
@@ -55,7 +66,6 @@
- spreading batch size: Np = $Np
If possible, reduce some of these parameters, or else switch to gpu_method = :global_memory."""
))
# TODO: warning if n is too small? (likely slower than global_memory method)
elseif warn && n < 4 # the n < 4 limit is completely empirical
@warn lazy"""
GPU shared memory size might be too small for the chosen problem.
@@ -68,9 +78,27 @@
This gives blocks of dimensions $block_dims (not including 2M - 1 = $(2M - 1) ghost cells in each direction).
If possible, reduce some of these parameters, or else switch to gpu_method = :global_memory."""
end
block_dims

# (4) Now determine Np according to the actual remaining shared memory.
# Note that, if Np was passed as input, we may increase it to maximise memory usage.
shmem_left = max_shmem - const_shmem - sizeof(Z) * m^D
Np_actual = shmem_left ÷ shmem_per_point
@assert Np_actual ≥ Np_min

# Actual memory requirement
# shmem_for_local_grid = m^D * sizeof(Z)
# required_shmem = const_shmem + shmem_for_local_grid + Np_actual * shmem_per_point
# @show required_shmem max_shmem

block_dims, Np_actual
end
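# Worked example (illustration only, not part of this diff): for Z = ComplexF64 (16 bytes),
# D = 3, M = 4 and the default Np_min = 16, with max_shmem = 48 KiB = 49152 bytes:
#   const_shmem         = 8 * (3 + 3) + 128            = 176 bytes
#   shmem_per_point     = 8 * 3 * (8 + 1) + 8 * 3 + 16 = 256 bytes
#   max_shmem_localgrid = 49152 - 176 - 16 * 256       = 44880 bytes → 2805 complex elements
#   m = floor(cbrt(2805)) = 14, hence n = 14 - 7 = 7 and block_dims = (7, 7, 7)
#   shmem_left = 49152 - 176 - 16 * 14^3 = 5072 bytes, hence Np_actual = 5072 ÷ 256 = 19
# i.e. the batch size is increased from 16 to 19 points to use the leftover shared memory.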

# Compute x^(1/D)
_invpow(x, ::Val{1}) = x
_invpow(x, ::Val{2}) = sqrt(x)

_invpow(x, ::Val{3}) = cbrt(x)
_invpow(x, ::Val{4}) = sqrt(sqrt(x))
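# e.g. _invpow(4096, Val(3)) == 16.0 and _invpow(4096, Val(4)) == 8.0 (up to floating-point accuracy)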

# This is called from the global memory (naive) implementation of spreading and interpolation kernels.
@inline function get_inds_vals_gpu(gs::NTuple{D}, evalmode::EvaluationMode, points::NTuple{D}, Ns::NTuple{D}, j::Integer) where {D}
ntuple(Val(D)) do n
9 changes: 5 additions & 4 deletions src/interpolation/gpu.jl
@@ -83,14 +83,15 @@ function interpolate!(
Z = eltype(us[1])
M = Kernels.half_support(gs[1])
@assert all(g -> Kernels.half_support(g) === M, gs) # check that they're all equal
block_dims_val = block_dims_gpu_shmem(backend, Z, size(us[1]), HalfSupport(M), bd.batch_size) # this is usually a compile-time constant...
block_dims_val, batch_size_actual = block_dims_gpu_shmem(backend, Z, size(us[1]), HalfSupport(M), bd.batch_size) # this is usually a compile-time constant...
@assert Val(batch_size_actual) == bd.batch_size
block_dims = Val(block_dims_val) # ...which means this doesn't require a dynamic dispatch
@assert block_dims_val === bd.block_dims
let ngroups = bd.nblocks_per_dir # this is the required number of workgroups (number of blocks in CUDA)
block_dims_padded = @. block_dims_val + 2M - 1 # dimensions of shared memory array
shmem_size = block_dims_padded
groupsize = groupsize_shmem(ngroups, block_dims_padded, length(x⃗s))
ndrange = groupsize .* ngroups
groupsize = 64
ndrange = gpu_shmem_ndrange_from_groupsize(groupsize, ngroups)
kernel! = interpolate_to_points_shmem_kernel!(backend, groupsize, ndrange)
kernel!(
vp_sorted, gs, evalmode, xs_comp, us, pointperm_, bd.cumulative_npoints_per_block,
@@ -209,7 +210,7 @@ end
) where {C, D, Z <: Number, block_dims, shmem_size}

@uniform begin
groupsize = @groupsize()::Dims{D}
groupsize = @groupsize()
nthreads = prod(groupsize)
end

46 changes: 25 additions & 21 deletions src/plan.jl
@@ -89,7 +89,7 @@ the order of ``10^{-7}`` for `Float64` or `ComplexF64` data.

- `kernel::AbstractKernel = BackwardsKaiserBesselKernel()`: convolution kernel used for NUFFTs.

## Performance parameters
## Main performance parameters

- `kernel_evalmode`: method used for kernel evaluation.
The default is [`FastApproximation`](@ref) on CPU, which will attempt to use a fast
@@ -111,13 +111,6 @@ the order of ``10^{-7}`` for `Float64` or `ComplexF64` data.
Blocking / spatial sorting can be completely disabled by passing `block_size = nothing` (but this is
generally slower).

- `sort_points = False()`: whether to internally permute the order of the non-uniform points.
This can be enabled by passing `sort_points = True()`.
Ignored when `block_size = nothing` (which disables spatial sorting).
In this case, more time will be spent in [`set_points!`](@ref) and less time on the actual transforms.
This can improve performance if executing multiple transforms on the same non-uniform points.
Note that, even when enabled, this does not modify the `points` argument passed to `set_points!`.

- `gpu_method`: allows selecting between different implementations of
GPU transforms. Possible options are:

@@ -127,23 +120,34 @@ the order of ``10^{-7}`` for `Float64` or `ComplexF64` data.
* `:shared_memory`: copy data between global memory and shared memory (local
to each GPU workgroup) and perform most operations in the latter, which is faster and
can help avoid some atomic operations in type-1 transforms. We try to use as much shared
memory as is typically available on current GPUs (which is not much, typically 48 KiB on
CUDA). But still, this method can be much faster than the `:global_memory` and may
become the default in the future. Note that this method completely ignores the
`block_size` parameter, as the actual block size is adjusted to maximise shared memory
usage. The performance of type-1 transforms can be further adjusted using the
`gpu_batch_size` parameter described below.
memory as is typically available on current GPUs (usually 48 KiB on CUDA and 64 KiB on
AMDGPU). Still, this method can be much faster than `:global_memory` and may become the
default in the future. Note that this method completely ignores the `block_size`
parameter, as the actual block size is adjusted to maximise shared memory usage.

The default is `:global_memory` but this may change in the future.

- `gpu_batch_size = Val(16)`: batch size used in type-1 transforms when `gpu_method = :shared_memory`.
The idea is that, to avoid atomic operations on shared-memory arrays, we process
non-uniform points in batches of `Np` points (16 by default).
This can have a large impact on performance.
- `fftw_flags = FFTW.MEASURE`: parameters passed to the FFTW planner when `backend = CPU()`.

## Other parameters
## Other performance parameters

These are more advanced performance parameters which may disappear or whose behaviour may
change in the future.

- `fftw_flags = FFTW.MEASURE`: parameters passed to the FFTW planner (only used when `backend = CPU()`).
- `sort_points = False()`: whether to internally permute the order of the non-uniform points.
This can be enabled by passing `sort_points = True()`.
Ignored when `block_size = nothing` (which disables spatial sorting).
In this case, more time will be spent in [`set_points!`](@ref) and less time on the actual transforms.
This can improve performance if executing multiple transforms on the same non-uniform points.
Note that, even when enabled, this does not modify the `points` argument passed to `set_points!`.

- `gpu_batch_size = Val(Np)`: batch size used in type-1 transforms when `gpu_method = :shared_memory`.
The idea is that, to avoid inefficient atomic operations on shared-memory arrays, we process
non-uniform points in batches of `Np` points.
By default, `Np` is chosen so as to maximise shared memory usage within each GPU workgroup.
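
  For example, a plan using the shared-memory GPU method might be created as follows
  (hypothetical sketch, not part of this diff; it assumes a CUDA device and the public
  `PlanNUFFT` constructor, and the exact keyword defaults may differ):

    using CUDA, NonuniformFFTs
    Ns = (256, 256, 256)  # uniform grid dimensions
    plan = PlanNUFFT(ComplexF64, Ns; backend = CUDABackend(), gpu_method = :shared_memory)
    # Np is determined automatically; passing gpu_batch_size = Val(32) would instead
    # reserve shared memory for at least 32 points per batch.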

## Other parameters

- `fftshift = false`: determines the order of wavenumbers in uniform space.
If `false` (default), the same order used by FFTW is used, with positive wavenumbers first
@@ -362,7 +366,7 @@ function _PlanNUFFT(
block_size::Union{Integer, Dims{D}, Nothing} = default_block_size(Ns, backend),
synchronise::Bool = false,
gpu_method::Symbol = :global_memory,
gpu_batch_size::Val = Val(16), # currently only used in shared-memory GPU spreading
gpu_batch_size::Val = Val(DEFAULT_GPU_BATCH_SIZE), # currently only used in shared-memory GPU spreading
) where {T <: Number, D}
ks = init_wavenumbers(T, Ns)
# Determine dimensions of oversampled grid.
22 changes: 18 additions & 4 deletions src/spreading/gpu.jl
@@ -180,14 +180,15 @@
Z = eltype(us[1])
M = Kernels.half_support(gs[1])
@assert all(g -> Kernels.half_support(g) === M, gs) # check that they're all equal
block_dims_val = block_dims_gpu_shmem(backend, Z, size(us[1]), HalfSupport(M), bd.batch_size) # this is usually a compile-time constant...
block_dims_val, batch_size_actual = block_dims_gpu_shmem(backend, Z, size(us[1]), HalfSupport(M), bd.batch_size) # this is usually a compile-time constant...
@assert Val(batch_size_actual) == bd.batch_size
block_dims = Val(block_dims_val) # ...which means this doesn't require a dynamic dispatch
@assert block_dims_val === bd.block_dims
let ngroups = bd.nblocks_per_dir # this is the required number of workgroups (number of blocks in CUDA)
block_dims_padded = @. block_dims_val + 2M - 1
shmem_size = block_dims_padded # dimensions of big shared memory array
groupsize = groupsize_shmem(ngroups, block_dims_padded, length(x⃗s))
ndrange = groupsize .* ngroups
groupsize = groupsize_spreading_gpu_shmem(batch_size_actual)
ndrange = gpu_shmem_ndrange_from_groupsize(groupsize, ngroups)
kernel! = spread_from_points_shmem_kernel!(backend, groupsize, ndrange)
kernel!(
us_real, gs, evalmode, xs_comp, vp_sorted, pointperm_, bd.cumulative_npoints_per_block,
@@ -203,6 +204,18 @@
us
end

# Determine workgroupsize based on the batch size Np.
# Try to have between 64 and 256 threads, such that the number of threads is ideally larger
# than the batch size.
function groupsize_spreading_gpu_shmem(Np::Integer)
groupsize = 64
c = min(Np, 256)
while groupsize < c
groupsize += 32
end

groupsize
end
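# For illustration (not part of this diff), the resulting workgroup sizes for a few batch sizes:
#   groupsize_spreading_gpu_shmem(16)  == 64    (never below 64 threads)
#   groupsize_spreading_gpu_shmem(100) == 128   (smallest multiple of 32 that is ≥ 100)
#   groupsize_spreading_gpu_shmem(300) == 256   (capped at 256 threads)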

@kernel function spread_permute_kernel!(vp::NTuple{N}, @Const(vp_in::NTuple{N}), @Const(perm::AbstractVector)) where {N}
i = @index(Global, Linear)
j = @inbounds perm[i]
@@ -234,7 +247,7 @@
) where {C, D, T, Z, M, Np, block_dims, shmem_size}

@uniform begin
groupsize = @groupsize()::Dims{D}
groupsize = @groupsize()
nthreads = prod(groupsize)

# Determine grid dimensions.
@@ -254,6 +267,7 @@
u_local = @localmem(Z, shmem_size)
# @assert shmem_size == block_dims .+ (2M - 1)
@assert T <: Real

window_vals = @localmem(T, (2M, D, Np))
points_sm = @localmem(T, (D, Np)) # points copied to shared memory
inds_start = @localmem(Int, (D, Np))