From 5c45c7dd4e6e425971a71eb4da94e372d8b9c8d9 Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Thu, 17 Oct 2024 13:03:43 +0200 Subject: [PATCH 01/33] Start work on shared-memory implementation --- src/NonuniformFFTs.jl | 2 +- src/blocking.jl | 45 +++++++++++++++++++++++--- src/interpolation/gpu.jl | 68 +++++++++++++++++++++++++++++++++------- src/plan.jl | 3 +- 4 files changed, 101 insertions(+), 17 deletions(-) diff --git a/src/NonuniformFFTs.jl b/src/NonuniformFFTs.jl index a432663..8a9d205 100644 --- a/src/NonuniformFFTs.jl +++ b/src/NonuniformFFTs.jl @@ -5,7 +5,7 @@ using AbstractNFFTs: AbstractNFFTs, AbstractNFFTPlan using GPUArraysCore: AbstractGPUArray, AbstractGPUVector using StructArrays: StructArrays, StructVector using AbstractFFTs: AbstractFFTs -using KernelAbstractions: KernelAbstractions as KA, CPU, GPU, @kernel, @index, @Const +using KernelAbstractions: KernelAbstractions as KA, CPU, GPU, @kernel, @index, @Const, @groupsize, @localmem using Atomix: Atomix using FFTW: FFTW using LinearAlgebra: mul! diff --git a/src/blocking.jl b/src/blocking.jl index ad143af..4cffc19 100644 --- a/src/blocking.jl +++ b/src/blocking.jl @@ -7,6 +7,7 @@ with_blocking(::NullBlockData) = false # For now these are only used in the GPU implementation get_pointperm(::NullBlockData) = nothing get_sort_points(::NullBlockData) = False() +gpu_method(::NullBlockData) = :global_memory @kernel function copy_points_unblocked_kernel!(@Const(transform::F), points::NTuple, @Const(xp)) where {F} I = @index(Global, Linear) @@ -61,43 +62,79 @@ type_length(::Type{<:NTuple{N}}) where {N} = N # ================================================================================ # +# Determine block size if using the shared-memory implementation. +# We try to make sure that the total block size (including 2M ghost cells in each direction) +# is not larger than the available shared memory. In CUDA the limit is usually 48 KiB. +# Note that the result is a compile-time constant (on Julia 1.11.1 at least). +@inline function block_dims_gpu_shmem(::Type{Z}, ::Dims{D}, ::HalfSupport{M}) where {Z <: Number, D, M} + max_shmem_size = 48 << 10 # 48 KiB -- TODO: make this depend on the actual GPU? + max_block_length = max_shmem_size ÷ sizeof(Z) # maximum number of elements in a block + m = floor(Int, max_block_length^(1/D)) # block size in each direction (including ghost cells / padding) + n = m - 2M # exclude ghost cells + if n ≤ 0 + throw(ArgumentError( + lazy""" + GPU shared memory size is too small for the chosen problem: + - element type: $Z + - half-support: $M + - number of dimensions: $D + If possible, reduce some of these parameters, or else switch to gpu_method = :global_memory.""" + )) + # TODO: warning if n is too small? (likely slower than global_memory method) + # What is small? n < M? + end + ntuple(_ -> n, Val(D)) # = (n, n, ...) 
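+    # Example: for Z = ComplexF64, D = 3 and M = 4 this gives
+    # max_block_length = 49152 ÷ 16 = 3072, m = floor(3072^(1/3)) = 14 and n = 14 - 8 = 6,
+    # i.e. blocks of 6³ grid cells (14³ values once the 2M ghost cells are included).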
+end + # GPU implementation struct BlockDataGPU{ N, I <: Integer, T <: AbstractFloat, IndexVector <: AbstractVector{I}, SortPoints <: StaticBool, } <: AbstractBlockData + method :: Symbol # method used for spreading and interpolation; options: (1) :global_memory (2) :shared_memory nblocks_per_dir :: NTuple{N, I} # number of blocks in each direction + block_dims :: NTuple{N, I} # maximum dimensions of a block (excluding ghost cells) block_sizes :: NTuple{N, T} # size of each block (in units of length) cumulative_npoints_per_block :: IndexVector # cumulative sum of number of points in each block (length = num_blocks + 1, first value is 0) blockidx :: IndexVector # linear index of block associated to each point (length = Np) pointperm :: IndexVector sort_points :: SortPoints function BlockDataGPU( + method::Symbol, nblocks_per_dir::NTuple{N, I}, + block_dims::NTuple{N, I}, block_sizes::NTuple{N, T}, npoints_per_block::V, sort::S, ) where {N, I, T, V, S} + method ∈ (:global_memory, :shared_memory) || throw(ArgumentError("expected gpu_method ∈ (:global_memory, :shared_memory)")) blockidx = similar(npoints_per_block, 0) pointperm = similar(npoints_per_block, 0) new{N, I, T, V, S}( - nblocks_per_dir, block_sizes, npoints_per_block, blockidx, pointperm, sort, + method, nblocks_per_dir, block_dims, block_sizes, npoints_per_block, blockidx, pointperm, sort, ) end end +gpu_method(bd::BlockDataGPU) = bd.method with_blocking(::BlockDataGPU) = true function BlockDataGPU( ::Type{Z}, - backend::KA.Backend, block_dims::Dims{D}, Ñs::Dims{D}, sort_points, - ) where {Z <: Number, D} + backend::KA.Backend, block_dims::Dims{D}, Ñs::Dims{D}, h::HalfSupport{M}, + sort_points::StaticBool; + method::Symbol, + ) where {Z <: Number, D, M} T = real(Z) # in case Z is complex + if method === :shared_memory + # Override input block size. We try to maximise the use of shared memory. + block_dims = block_dims_gpu_shmem(Z, Ñs, h) + end nblocks_per_dir = map(cld, Ñs, block_dims) # basically equal to ceil(Ñ / block_dim) --> effective block size is ≤ block_dims L = T(2) * π block_sizes = map(nblocks -> L / nblocks, nblocks_per_dir) cumulative_npoints_per_block = KA.allocate(backend, Int, prod(nblocks_per_dir) + 1) - BlockDataGPU(nblocks_per_dir, block_sizes, cumulative_npoints_per_block, sort_points) + BlockDataGPU(method, nblocks_per_dir, block_dims, block_sizes, cumulative_npoints_per_block, sort_points) end get_pointperm(bd::BlockDataGPU) = bd.pointperm diff --git a/src/interpolation/gpu.jl b/src/interpolation/gpu.jl index 5c96c63..76a5da2 100644 --- a/src/interpolation/gpu.jl +++ b/src/interpolation/gpu.jl @@ -45,14 +45,38 @@ using StaticArrays: MVector nothing end +@kernel function interpolate_to_points_shmem_kernel!( + vp::NTuple{C}, + @Const(gs::NTuple{D}), + @Const(points::NTuple{D}), + @Const(us::NTuple{C}), + @Const(pointperm), + @Const(Δxs::NTuple{D}), # grid step in each direction (oversampled grid) + ::Val{block_dims}, + ) where {C, D, block_dims} + is_kernel = @index(Global, NTuple)::NTuple{D} + threads = @groupsize()::NTuple{D} + is = is_kernel ./ threads # this is the actual block index + + # TODO: support C > 1 (do one component at a time, to use less memory?) 
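+    # Idea: each workgroup handles one block of the oversampled grid. The block, padded
+    # with 2M ghost cells per direction so that the kernel support of points near its
+    # boundary is covered, is first copied to shared memory; interpolation for the points
+    # in this block can then read from local instead of global memory.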
+ T = eltype(us[1]) + M = Kernels.half_support(gs[1]) # assume they're all equal + block_dims_padded = block_dims .+ 2M + u_local = @localmem(T, block_dims_padded) + + # Copy data to local memory + + nothing +end + function interpolate!( backend::GPU, bd::Union{BlockDataGPU, NullBlockData}, - gs, + gs::NTuple{D}, vp_all::NTuple{C, AbstractVector}, us::NTuple{C, AbstractArray}, x⃗s::AbstractVector, - ) where {C} + ) where {C, D} # Note: the dimensions of arrays have already been checked via check_nufft_nonuniform_data. Base.require_one_based_indexing(x⃗s) # this is to make sure that all indices match foreach(Base.require_one_based_indexing, vp_all) @@ -75,17 +99,39 @@ function interpolate!( pointperm_ = pointperm end - # We use dynamically sized kernels to avoid recompilation, since number of points may - # change from one call to another. - ndrange = size(x⃗s) # iterate through points - workgroupsize = default_workgroupsize(backend, ndrange) - kernel! = interpolate_to_point_naive_kernel!(backend, workgroupsize) - kernel!(vp_sorted, gs, xs_comp, us, pointperm_, Δxs; ndrange) + method = gpu_method(bd) + + if method === :global_memory + # We use dynamically sized kernels to avoid recompilation, since number of points may + # change from one call to another. + let ndrange = size(x⃗s) # iterate through points + groupsize = default_workgroupsize(backend, ndrange) + kernel! = interpolate_to_point_naive_kernel!(backend, groupsize) + kernel!(vp_sorted, gs, xs_comp, us, pointperm_, Δxs; ndrange) + end + elseif method === :shared_memory + @assert bd isa BlockDataGPU + Z = eltype(us[1]) + M = Kernels.half_support(gs[1]) + @assert all(g -> Kernels.half_support(g) === M, gs) # check that they're all equal + block_dims_val = block_dims_gpu_shmem(Z, size(us[1]), HalfSupport(M)) # this is usually a compile-time constant... + block_dims = Val(block_dims_val) # ...which means this doesn't require a dynamic dispatch + @assert block_dims_val === bd.block_dims + let ngroups = bd.nblocks_per_dir # this is the required number of workgroups (number of blocks in CUDA) + groupsize = 64 # TODO: this should roughly be the number of non-uniform points per block (or less) + groupsize_dims = ntuple(d -> d == 1 ? groupsize : 1, D) # "augment" first dimension + ndrange = groupsize_dims .* ngroups + kernel! = interpolate_to_points_shmem_kernel!(backend, groupsize_dims, ndrange) + kernel!(vp_sorted, gs, xs_comp, us, pointperm_, Δxs, block_dims) + end + end if sort_points === True() - kernel_perm! = interp_permute_kernel!(backend, workgroupsize) - kernel_perm!(vp_all, vp_sorted, pointperm; ndrange) - foreach(KA.unsafe_free!, vp_sorted) # manually deallocate temporary arrays + let groupsize = default_workgroupsize(backend, ndrange) + kernel_perm! = interp_permute_kernel!(backend, groupsize) + kernel_perm!(vp_all, vp_sorted, pointperm; ndrange) + foreach(KA.unsafe_free!, vp_sorted) # manually deallocate temporary arrays + end end vp_all diff --git a/src/plan.jl b/src/plan.jl index aff8c1c..61fce96 100644 --- a/src/plan.jl +++ b/src/plan.jl @@ -308,6 +308,7 @@ function _PlanNUFFT( backend::KA.Backend = CPU(), block_size::Union{Integer, Dims{D}, Nothing} = default_block_size(Ns, backend), synchronise::Bool = false, + gpu_method::Symbol = :global_memory, ) where {T <: Number, D} ks = init_wavenumbers(T, Ns) # Determine dimensions of oversampled grid. 
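Note on usage: the new gpu_method keyword above selects between the two GPU implementations.
A minimal sketch of how a user would opt into the shared-memory kernels, assuming the
exported PlanNUFFT constructor forwards its keyword arguments to _PlanNUFFT and that a CUDA
device is available (none of this is part of the diff):

    using NonuniformFFTs, CUDA

    # Hypothetical example: 3D plan using the shared-memory spreading/interpolation kernels.
    plan = PlanNUFFT(ComplexF64, (256, 256, 256); backend = CUDABackend(), gpu_method = :shared_memory)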
@@ -342,7 +343,7 @@ function _PlanNUFFT( else block_dims = get_block_dims(Ñs, block_size) if backend isa GPU - blocks = BlockDataGPU(T, backend, block_dims, Ñs, sort_points) + blocks = BlockDataGPU(T, backend, block_dims, Ñs, h, sort_points; method = gpu_method) else blocks = BlockData(T, block_dims, Ñs, h, num_transforms, sort_points) FFTW.set_num_threads(Threads.nthreads()) From 6be3d8ea87733aa72653693484e0c4fe6c0b28d7 Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Thu, 17 Oct 2024 14:44:10 +0200 Subject: [PATCH 02/33] WIP --- src/NonuniformFFTs.jl | 2 +- src/interpolation/gpu.jl | 77 +++++++++++++++++++++++++++++++--------- 2 files changed, 61 insertions(+), 18 deletions(-) diff --git a/src/NonuniformFFTs.jl b/src/NonuniformFFTs.jl index 8a9d205..fd3a0fb 100644 --- a/src/NonuniformFFTs.jl +++ b/src/NonuniformFFTs.jl @@ -5,7 +5,7 @@ using AbstractNFFTs: AbstractNFFTs, AbstractNFFTPlan using GPUArraysCore: AbstractGPUArray, AbstractGPUVector using StructArrays: StructArrays, StructVector using AbstractFFTs: AbstractFFTs -using KernelAbstractions: KernelAbstractions as KA, CPU, GPU, @kernel, @index, @Const, @groupsize, @localmem +using KernelAbstractions: KernelAbstractions as KA, CPU, GPU, @kernel, @index, @Const, @groupsize, @localmem, @synchronize using Atomix: Atomix using FFTW: FFTW using LinearAlgebra: mul! diff --git a/src/interpolation/gpu.jl b/src/interpolation/gpu.jl index 76a5da2..16abfec 100644 --- a/src/interpolation/gpu.jl +++ b/src/interpolation/gpu.jl @@ -1,5 +1,23 @@ using StaticArrays: MVector +# TODO: use this also in spreading, and move to separate file +@inline function get_inds_vals_gpu(gs::NTuple{D}, points::NTuple{D}, Ns::NTuple{D}, j::Integer) where {D} + ntuple(Val(D)) do n + @inline + get_inds_vals_gpu(gs[n], points[n], Ns[n], j) + end +end + +@inline function get_inds_vals_gpu(g::AbstractKernelData, points::AbstractVector, N::Integer, j::Integer) + x = @inbounds points[j] + gdata = Kernels.evaluate_kernel(g, x) + vals = gdata.values # kernel values + M = Kernels.half_support(g) + i₀ = gdata.i - M # active region is (i₀ + 1):(i₀ + 2M) (up to periodic wrapping) + i₀ = ifelse(i₀ < 0, i₀ + N, i₀) # make sure i₀ ≥ 0 + i₀ => vals +end + # Interpolate onto a single point @kernel function interpolate_to_point_naive_kernel!( vp::NTuple{C}, @@ -22,19 +40,7 @@ using StaticArrays: MVector # don't perform atomic operations. This is why the code is simpler here. Ns = size(first(us)) # grid dimensions - indvals = ntuple(Val(D)) do n - @inline - @inbounds begin - g = gs[n] - x = points[n][j] - gdata = Kernels.evaluate_kernel(g, x) - vals = gdata.values # kernel values - M = Kernels.half_support(gs[n]) - i₀ = gdata.i - M # active region is (i₀ + 1):(i₀ + 2M) (up to periodic wrapping) - i₀ = ifelse(i₀ < 0, i₀ + Ns[n], i₀) # make sure i₀ ≥ 0 - i₀ => vals - end - end + indvals = get_inds_vals_gpu(gs, points, Ns, j) v⃗ = interpolate_from_arrays_gpu(us, indvals, Ns, Δxs) @@ -55,20 +61,57 @@ end ::Val{block_dims}, ) where {C, D, block_dims} is_kernel = @index(Global, NTuple)::NTuple{D} - threads = @groupsize()::NTuple{D} - is = is_kernel ./ threads # this is the actual block index + groupsize = @groupsize()::NTuple{D} # generally equal to (nthreads, 1, 1, ...) + threadidx = @index(Local, Linear) + block_index = is_kernel ./ groupsize + nthreads = prod(groupsize) # TODO: support C > 1 (do one component at a time, to use less memory?) 
T = eltype(us[1]) M = Kernels.half_support(gs[1]) # assume they're all equal block_dims_padded = block_dims .+ 2M - u_local = @localmem(T, block_dims_padded) + u_local = @localmem(T, block_dims_padded) # allocate shared memory + + # Copy grid data from global to shared memory + gridvalues_to_local_memory!(u_local, us[1], Val(M), block_index, block_dims, threadidx, nthreads) - # Copy data to local memory + @synchronize # make sure all threads have the same shared data nothing end +@inline function gridvalues_to_local_memory!( + u_local::AbstractArray{T, D}, + u_global::AbstractArray{T, D}, + ::Val{M}, + block_index::NTuple{D}, block_dims::NTuple{D}, + threadidx::Integer, + nthreads::Integer, + ) where {T, D, M} + Ns = size(u_global) + # inds = ntuple(Val(D)) do n + # @inline + # a = (block_index[n] - 1) * block_dims[n] + 1 # start of current block in global grid (excluding ghost cells) + # b = block_index[n] * block_dims[n] # end of current block + # (a - M):(b + M) # range including ghost cells + # end + # TODO: split work across threads + # Assume D = 3 for now + if threadidx == 1 + # for is in iter + for i_3 in axes(u_local, 3), i_2 in axes(u_local, 2), i_1 in axes(u_local, 1) + # For some reason, type assertions are needed for things to work on AMDGPU + j_1 = mod1(block_index[1] - 1 * block_dims[1] + i_1 - M, Ns[1])::Int + j_2 = mod1(block_index[2] - 1 * block_dims[2] + i_2 - M, Ns[2])::Int + j_3 = mod1(block_index[3] - 1 * block_dims[3] + i_3 - M, Ns[3])::Int + is = (i_1, i_2, i_3)::Dims{D} + js = (j_1, j_2, j_3)::Dims{D} + u_local[is...] = u_global[js...] + end + end + nothing +end + function interpolate!( backend::GPU, bd::Union{BlockDataGPU, NullBlockData}, From 678431763e5cab7c0335082226f8b35b116a2b26 Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Fri, 18 Oct 2024 09:46:39 +0200 Subject: [PATCH 03/33] Interpolation now works (in 3D, ntransforms = 1) --- src/NonuniformFFTs.jl | 3 +- src/blocking.jl | 51 ++++++++++++-------- src/interpolation/gpu.jl | 101 +++++++++++++++++++++++++++++---------- 3 files changed, 108 insertions(+), 47 deletions(-) diff --git a/src/NonuniformFFTs.jl b/src/NonuniformFFTs.jl index fd3a0fb..eeaf20f 100644 --- a/src/NonuniformFFTs.jl +++ b/src/NonuniformFFTs.jl @@ -5,7 +5,8 @@ using AbstractNFFTs: AbstractNFFTs, AbstractNFFTPlan using GPUArraysCore: AbstractGPUArray, AbstractGPUVector using StructArrays: StructArrays, StructVector using AbstractFFTs: AbstractFFTs -using KernelAbstractions: KernelAbstractions as KA, CPU, GPU, @kernel, @index, @Const, @groupsize, @localmem, @synchronize +using KernelAbstractions: KernelAbstractions as KA, CPU, GPU, @kernel, @index, @Const, + @groupsize, @localmem, @synchronize, @uniform using Atomix: Atomix using FFTW: FFTW using LinearAlgebra: mul! diff --git a/src/blocking.jl b/src/blocking.jl index 4cffc19..2b6a50b 100644 --- a/src/blocking.jl +++ b/src/blocking.jl @@ -70,7 +70,7 @@ type_length(::Type{<:NTuple{N}}) where {N} = N max_shmem_size = 48 << 10 # 48 KiB -- TODO: make this depend on the actual GPU? 
max_block_length = max_shmem_size ÷ sizeof(Z) # maximum number of elements in a block m = floor(Int, max_block_length^(1/D)) # block size in each direction (including ghost cells / padding) - n = m - 2M # exclude ghost cells + n = m - (2M - 1) # exclude ghost cells if n ≤ 0 throw(ArgumentError( lazy""" @@ -92,26 +92,26 @@ struct BlockDataGPU{ IndexVector <: AbstractVector{I}, SortPoints <: StaticBool, } <: AbstractBlockData - method :: Symbol # method used for spreading and interpolation; options: (1) :global_memory (2) :shared_memory + method :: Symbol # method used for spreading and interpolation; options: (1) :global_memory (2) :shared_memory + Δxs :: NTuple{N, T} # grid step; the size of a block in units of length is block_dims .* Δxs. nblocks_per_dir :: NTuple{N, I} # number of blocks in each direction block_dims :: NTuple{N, I} # maximum dimensions of a block (excluding ghost cells) - block_sizes :: NTuple{N, T} # size of each block (in units of length) cumulative_npoints_per_block :: IndexVector # cumulative sum of number of points in each block (length = num_blocks + 1, first value is 0) blockidx :: IndexVector # linear index of block associated to each point (length = Np) pointperm :: IndexVector sort_points :: SortPoints function BlockDataGPU( method::Symbol, + Δxs::NTuple{N, T}, nblocks_per_dir::NTuple{N, I}, block_dims::NTuple{N, I}, - block_sizes::NTuple{N, T}, npoints_per_block::V, sort::S, ) where {N, I, T, V, S} method ∈ (:global_memory, :shared_memory) || throw(ArgumentError("expected gpu_method ∈ (:global_memory, :shared_memory)")) blockidx = similar(npoints_per_block, 0) pointperm = similar(npoints_per_block, 0) new{N, I, T, V, S}( - method, nblocks_per_dir, block_dims, block_sizes, npoints_per_block, blockidx, pointperm, sort, + method, Δxs, nblocks_per_dir, block_dims, npoints_per_block, blockidx, pointperm, sort, ) end end @@ -130,11 +130,11 @@ function BlockDataGPU( # Override input block size. We try to maximise the use of shared memory. block_dims = block_dims_gpu_shmem(Z, Ñs, h) end - nblocks_per_dir = map(cld, Ñs, block_dims) # basically equal to ceil(Ñ / block_dim) --> effective block size is ≤ block_dims - L = T(2) * π - block_sizes = map(nblocks -> L / nblocks, nblocks_per_dir) + nblocks_per_dir = map(cld, Ñs, block_dims) # basically equal to ceil(Ñ / block_dim) + L = T(2) * π # domain period + Δxs = map(N -> L / N, Ñs) # grid step (oversampled grid) cumulative_npoints_per_block = KA.allocate(backend, Int, prod(nblocks_per_dir) + 1) - BlockDataGPU(method, nblocks_per_dir, block_dims, block_sizes, cumulative_npoints_per_block, sort_points) + BlockDataGPU(method, Δxs, nblocks_per_dir, block_dims, cumulative_npoints_per_block, sort_points) end get_pointperm(bd::BlockDataGPU) = bd.pointperm @@ -145,7 +145,7 @@ function set_points_impl!( transform::F = identity, synchronise, ) where {F <: Function} (; - cumulative_npoints_per_block, nblocks_per_dir, block_sizes, + Δxs, cumulative_npoints_per_block, nblocks_per_dir, block_dims, blockidx, pointperm, sort_points, ) = bd @@ -173,8 +173,8 @@ function set_points_impl!( @timeit timer "(1) Assign blocks" let local kernel! 
= assign_blocks_kernel!(backend, workgroupsize) kernel!( - blockidx, cumulative_npoints_per_block, points_comp, xp, - block_sizes, nblocks_per_dir, sort_points, transform; + blockidx, cumulative_npoints_per_block, points_comp, xp, Δxs, + block_dims, nblocks_per_dir, sort_points, transform; ndrange, ) maybe_synchronise(backend, synchronise) @@ -192,8 +192,8 @@ function set_points_impl!( @timeit timer "(3) Sort" let local kernel! = sortperm_kernel!(backend, workgroupsize) kernel!( - pointperm, cumulative_npoints_per_block, blockidx, xp, - block_sizes, nblocks_per_dir, transform; + pointperm, cumulative_npoints_per_block, blockidx, xp, Δxs, + block_dims, nblocks_per_dir, transform; ndrange, ) maybe_synchronise(backend, synchronise) @@ -214,9 +214,18 @@ end # Get index of block where x⃗ is located (assumed to be in [0, 2π)). @inline function block_index( - x⃗::NTuple{N,T}, block_sizes::NTuple{N,T}, nblocks_per_dir::NTuple{N}, + x⃗::NTuple{N,T}, Δxs::NTuple{N,T}, block_dims::NTuple{N,Integer}, nblocks_per_dir::NTuple{N}, ) where {N, T <: AbstractFloat} - is = map(Kernels.point_to_cell, x⃗, block_sizes) + is = ntuple(Val(N)) do n + @inline + # Note: we could directly use the block size Δx_block = Δx * block_dims, but this + # may lead to inconsistency with interpolation and spreading kernels (leading to + # errors) due to numerical accuracy issues. So we first obtain the index on the + # Δx grid, then we translate that to a block index by dividing by the block + # dimensions (= how many Δx's in a single block). + i = Kernels.point_to_cell(x⃗[n], Δxs[n]) # index of grid cell where point is located + cld(i, block_dims[n]) # index of block where point is located + end @inbounds LinearIndices(nblocks_per_dir)[is...] end @@ -225,7 +234,8 @@ end cumulative_npoints_per_block::AbstractVector{<:Integer}, points::NTuple, @Const(xp), - @Const(block_sizes::NTuple), + @Const(Δxs), + @Const(block_dims::NTuple), @Const(nblocks_per_dir::NTuple), @Const(sort_points), @Const(transform::F), @@ -233,7 +243,7 @@ end I = @index(Global, Linear) @inbounds x⃗ = xp[I] y⃗ = to_unit_cell_gpu(transform(Tuple(x⃗))) :: NTuple - n = block_index(y⃗, block_sizes, nblocks_per_dir) + n = block_index(y⃗, Δxs, block_dims, nblocks_per_dir) # Note: here index_within_block is the value *after* incrementing (≥ 1). S = eltype(cumulative_npoints_per_block) @@ -255,14 +265,15 @@ end @Const(cumulative_npoints_per_block), @Const(blockidx), @Const(xp), - @Const(block_sizes), + @Const(Δxs), + @Const(block_dims), @Const(nblocks_per_dir), @Const(transform::F), ) where {F} I = @index(Global, Linear) @inbounds x⃗ = xp[I] y⃗ = to_unit_cell_gpu(transform(Tuple(x⃗))) :: NTuple - n = block_index(y⃗, block_sizes, nblocks_per_dir) + n = block_index(y⃗, Δxs, block_dims, nblocks_per_dir) @inbounds J = cumulative_npoints_per_block[n] + blockidx[I] @inbounds pointperm[J] = I nothing diff --git a/src/interpolation/gpu.jl b/src/interpolation/gpu.jl index 16abfec..9d7da83 100644 --- a/src/interpolation/gpu.jl +++ b/src/interpolation/gpu.jl @@ -57,57 +57,106 @@ end @Const(points::NTuple{D}), @Const(us::NTuple{C}), @Const(pointperm), + @Const(cumulative_npoints_per_block::AbstractVector), @Const(Δxs::NTuple{D}), # grid step in each direction (oversampled grid) ::Val{block_dims}, ) where {C, D, block_dims} - is_kernel = @index(Global, NTuple)::NTuple{D} - groupsize = @groupsize()::NTuple{D} # generally equal to (nthreads, 1, 1, ...) 
- threadidx = @index(Local, Linear) - block_index = is_kernel ./ groupsize + groupsize = @groupsize()::Dims{D} # generally equal to (nthreads, 1, 1, ...) nthreads = prod(groupsize) + threadidx = @index(Local, Linear) # in 1:nthreads + block_index = @index(Group, NTuple) # workgroup index (= block index) + block_n = @index(Group, Linear) # linear index of block # TODO: support C > 1 (do one component at a time, to use less memory?) T = eltype(us[1]) M = Kernels.half_support(gs[1]) # assume they're all equal - block_dims_padded = block_dims .+ 2M + block_dims_padded = @. block_dims + 2M - 1 u_local = @localmem(T, block_dims_padded) # allocate shared memory # Copy grid data from global to shared memory gridvalues_to_local_memory!(u_local, us[1], Val(M), block_index, block_dims, threadidx, nthreads) + # This block will take care of non-uniform points (a + 1):b + @inbounds a = cumulative_npoints_per_block[block_n] + @inbounds b = cumulative_npoints_per_block[block_n + 1] + Ns = size(us[1]) # grid dimensions + ΔV = prod(Δxs) + + # Shift from indices in global array to indices in local array + idx_shift = @. (block_index - 1) * block_dims + 1 + @synchronize # make sure all threads have the same shared data + for i in (a + threadidx):nthreads:b + # Interpolate at point j + j = if pointperm === nothing + i + else + @inbounds pointperm[i] + end + + indvals = ntuple(Val(D)) do n + @inline + x = @inbounds points[n][j] + gdata = Kernels.evaluate_kernel(gs[n], x) + local vals = gdata.values # kernel values + local M = Kernels.half_support(gs[n]) + local i₀ = gdata.i - idx_shift[n] + @assert i₀ ≥ 0 + @assert i₀ + 2M ≤ block_dims_padded[n] + i₀ => vals + end + + inds_start = map(first, indvals) + vals = map(last, indvals) # evaluated kernel values in each direction + inds = map(eachindex, vals) # = (1:L, 1:L, ...) where L = 2M is the kernel width + + gprod_4 = ΔV + v = zero(T) + + @inbounds for i_3 in inds[3] + gprod_3 = gprod_4 * vals[3][i_3] + j_3 = inds_start[3] + i_3 + for i_2 in inds[2] + gprod_2 = gprod_3 * vals[2][i_2] + j_2 = inds_start[2] + i_2 + for i_1 in inds[1] + gprod_1 = gprod_2 * vals[1][i_1] + j_1 = inds_start[1] + i_1 + js = (j_1, j_2, j_3) + v += gprod_1 * u_local[js...] + end + end + end + + @inbounds vp[1][j] = v + end + nothing end +# TODO +# - generalise to all D +# - optimise @inline function gridvalues_to_local_memory!( u_local::AbstractArray{T, D}, u_global::AbstractArray{T, D}, ::Val{M}, - block_index::NTuple{D}, block_dims::NTuple{D}, + block_index::Dims{D}, block_dims::Dims{D}, threadidx::Integer, nthreads::Integer, ) where {T, D, M} Ns = size(u_global) - # inds = ntuple(Val(D)) do n - # @inline - # a = (block_index[n] - 1) * block_dims[n] + 1 # start of current block in global grid (excluding ghost cells) - # b = block_index[n] * block_dims[n] # end of current block - # (a - M):(b + M) # range including ghost cells - # end - # TODO: split work across threads - # Assume D = 3 for now - if threadidx == 1 - # for is in iter - for i_3 in axes(u_local, 3), i_2 in axes(u_local, 2), i_1 in axes(u_local, 1) - # For some reason, type assertions are needed for things to work on AMDGPU - j_1 = mod1(block_index[1] - 1 * block_dims[1] + i_1 - M, Ns[1])::Int - j_2 = mod1(block_index[2] - 1 * block_dims[2] + i_2 - M, Ns[2])::Int - j_3 = mod1(block_index[3] - 1 * block_dims[3] + i_3 - M, Ns[3])::Int - is = (i_1, i_2, i_3)::Dims{D} - js = (j_1, j_2, j_3)::Dims{D} - u_local[is...] = u_global[js...] 
- end + @assert D == 3 + inds_3 = axes(u_local, 3)[threadidx:nthreads:end] # parallelise over outermost dimension (some threads might not work here) + @inbounds for i_3 in inds_3, i_2 in axes(u_local, 2), i_1 in axes(u_local, 1) + # For some reason, type assertions are needed for things to work on AMDGPU + j_1 = mod1((block_index[1] - 1) * block_dims[1] - (M - 1) + i_1, Ns[1]) + j_2 = mod1((block_index[2] - 1) * block_dims[2] - (M - 1) + i_2, Ns[2]) + j_3 = mod1((block_index[3] - 1) * block_dims[3] - (M - 1) + i_3, Ns[3]) + is = (i_1, i_2, i_3) + js = (j_1, j_2, j_3) + u_local[is...] = u_global[js...] # TODO: use linear index instead of is? end nothing end @@ -165,7 +214,7 @@ function interpolate!( groupsize_dims = ntuple(d -> d == 1 ? groupsize : 1, D) # "augment" first dimension ndrange = groupsize_dims .* ngroups kernel! = interpolate_to_points_shmem_kernel!(backend, groupsize_dims, ndrange) - kernel!(vp_sorted, gs, xs_comp, us, pointperm_, Δxs, block_dims) + kernel!(vp_sorted, gs, xs_comp, us, pointperm_, bd.cumulative_npoints_per_block, Δxs, block_dims) end end From cec2a65db7b7b8bad74f78b4bccd86dc835b631a Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Fri, 18 Oct 2024 09:55:42 +0200 Subject: [PATCH 04/33] Minor changes --- src/interpolation/gpu.jl | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/src/interpolation/gpu.jl b/src/interpolation/gpu.jl index 9d7da83..2bf4fd0 100644 --- a/src/interpolation/gpu.jl +++ b/src/interpolation/gpu.jl @@ -79,7 +79,6 @@ end # This block will take care of non-uniform points (a + 1):b @inbounds a = cumulative_npoints_per_block[block_n] @inbounds b = cumulative_npoints_per_block[block_n + 1] - Ns = size(us[1]) # grid dimensions ΔV = prod(Δxs) # Shift from indices in global array to indices in local array @@ -137,7 +136,8 @@ end # TODO # - generalise to all D -# - optimise +# - parallelise over multiple directions? +# - choose an optimal memory access pattern @inline function gridvalues_to_local_memory!( u_local::AbstractArray{T, D}, u_global::AbstractArray{T, D}, @@ -148,15 +148,21 @@ end ) where {T, D, M} Ns = size(u_global) @assert D == 3 + inds_1 = axes(u_local, 1) + inds_2 = axes(u_local, 2) inds_3 = axes(u_local, 3)[threadidx:nthreads:end] # parallelise over outermost dimension (some threads might not work here) - @inbounds for i_3 in inds_3, i_2 in axes(u_local, 2), i_1 in axes(u_local, 1) - # For some reason, type assertions are needed for things to work on AMDGPU - j_1 = mod1((block_index[1] - 1) * block_dims[1] - (M - 1) + i_1, Ns[1]) - j_2 = mod1((block_index[2] - 1) * block_dims[2] - (M - 1) + i_2, Ns[2]) - j_3 = mod1((block_index[3] - 1) * block_dims[3] - (M - 1) + i_3, Ns[3]) - is = (i_1, i_2, i_3) - js = (j_1, j_2, j_3) - u_local[is...] = u_global[js...] # TODO: use linear index instead of is? + offsets = @. (block_index - 1) * block_dims - (M - 1) + @inbounds for i_3 in inds_3 + j_3 = mod1(offsets[3] + i_3, Ns[3]) + for i_2 in inds_2 + j_2 = mod1(offsets[2] + i_2, Ns[2]) + for i_1 in inds_1 + j_1 = mod1(offsets[1] + i_1, Ns[1]) + is = (i_1, i_2, i_3) + js = (j_1, j_2, j_3) + u_local[is...] = u_global[js...] # TODO: use linear index instead of is? 
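+                # The mod1 wrapping of j_1, j_2, j_3 makes ghost-cell reads periodic:
+                # blocks touching the domain boundary read from the opposite side of the
+                # global grid.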
+ end + end end nothing end @@ -210,7 +216,7 @@ function interpolate!( block_dims = Val(block_dims_val) # ...which means this doesn't require a dynamic dispatch @assert block_dims_val === bd.block_dims let ngroups = bd.nblocks_per_dir # this is the required number of workgroups (number of blocks in CUDA) - groupsize = 64 # TODO: this should roughly be the number of non-uniform points per block (or less) + groupsize = 64 # TODO: how to choose this? roughly equal to the number of non-uniform points per block (or less)? groupsize_dims = ntuple(d -> d == 1 ? groupsize : 1, D) # "augment" first dimension ndrange = groupsize_dims .* ngroups kernel! = interpolate_to_points_shmem_kernel!(backend, groupsize_dims, ndrange) From 0ac88869d816732139861483039c7145354ec327 Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Fri, 18 Oct 2024 11:24:34 +0200 Subject: [PATCH 05/33] Optimisations --- src/interpolation/gpu.jl | 144 ++++++++++++++++++++++++--------------- 1 file changed, 88 insertions(+), 56 deletions(-) diff --git a/src/interpolation/gpu.jl b/src/interpolation/gpu.jl index 2bf4fd0..546dca8 100644 --- a/src/interpolation/gpu.jl +++ b/src/interpolation/gpu.jl @@ -61,8 +61,9 @@ end @Const(Δxs::NTuple{D}), # grid step in each direction (oversampled grid) ::Val{block_dims}, ) where {C, D, block_dims} - groupsize = @groupsize()::Dims{D} # generally equal to (nthreads, 1, 1, ...) + groupsize = @groupsize()::Dims{D} nthreads = prod(groupsize) + threadidxs = @index(Local, NTuple) # in (1:nthreads_x, 1:nthreads_y, ...) threadidx = @index(Local, Linear) # in 1:nthreads block_index = @index(Group, NTuple) # workgroup index (= block index) block_n = @index(Group, Linear) # linear index of block @@ -73,9 +74,6 @@ end block_dims_padded = @. block_dims + 2M - 1 u_local = @localmem(T, block_dims_padded) # allocate shared memory - # Copy grid data from global to shared memory - gridvalues_to_local_memory!(u_local, us[1], Val(M), block_index, block_dims, threadidx, nthreads) - # This block will take care of non-uniform points (a + 1):b @inbounds a = cumulative_npoints_per_block[block_n] @inbounds b = cumulative_npoints_per_block[block_n + 1] @@ -84,51 +82,57 @@ end # Shift from indices in global array to indices in local array idx_shift = @. (block_index - 1) * block_dims + 1 - @synchronize # make sure all threads have the same shared data + # Interpolate components one by one (to avoid using too much memory) + for c ∈ 1:C + # Copy grid data from global to shared memory + gridvalues_to_local_memory!(u_local, us[c], Val(M), block_index, block_dims, threadidxs, groupsize) - for i in (a + threadidx):nthreads:b - # Interpolate at point j - j = if pointperm === nothing - i - else - @inbounds pointperm[i] - end + @synchronize # make sure all threads have the same shared data - indvals = ntuple(Val(D)) do n - @inline - x = @inbounds points[n][j] - gdata = Kernels.evaluate_kernel(gs[n], x) - local vals = gdata.values # kernel values - local M = Kernels.half_support(gs[n]) - local i₀ = gdata.i - idx_shift[n] - @assert i₀ ≥ 0 - @assert i₀ + 2M ≤ block_dims_padded[n] - i₀ => vals - end + for i in (a + threadidx):nthreads:b + # Interpolate at point j + j = if pointperm === nothing + i + else + @inbounds pointperm[i] + end - inds_start = map(first, indvals) - vals = map(last, indvals) # evaluated kernel values in each direction - inds = map(eachindex, vals) # = (1:L, 1:L, ...) 
where L = 2M is the kernel width - - gprod_4 = ΔV - v = zero(T) - - @inbounds for i_3 in inds[3] - gprod_3 = gprod_4 * vals[3][i_3] - j_3 = inds_start[3] + i_3 - for i_2 in inds[2] - gprod_2 = gprod_3 * vals[2][i_2] - j_2 = inds_start[2] + i_2 - for i_1 in inds[1] - gprod_1 = gprod_2 * vals[1][i_1] - j_1 = inds_start[1] + i_1 - js = (j_1, j_2, j_3) - v += gprod_1 * u_local[js...] + indvals = ntuple(Val(D)) do n + @inline + x = @inbounds points[n][j] + gdata = Kernels.evaluate_kernel(gs[n], x) + local vals = gdata.values # kernel values + local M = Kernels.half_support(gs[n]) + local i₀ = gdata.i - idx_shift[n] + @assert i₀ ≥ 0 + @assert i₀ + 2M ≤ block_dims_padded[n] + i₀ => vals + end + + inds_start = map(first, indvals) + vals = map(last, indvals) # evaluated kernel values in each direction + inds = map(eachindex, vals) # = (1:L, 1:L, ...) where L = 2M is the kernel width + + gprod_4 = ΔV + v = zero(T) + + @inbounds for i_3 in inds[3] + gprod_3 = gprod_4 * vals[3][i_3] + j_3 = inds_start[3] + i_3 + for i_2 in inds[2] + gprod_2 = gprod_3 * vals[2][i_2] + j_2 = inds_start[2] + i_2 + for i_1 in inds[1] + gprod_1 = gprod_2 * vals[1][i_1] + j_1 = inds_start[1] + i_1 + js = (j_1, j_2, j_3) + v += gprod_1 * u_local[js...] + end end end - end - @inbounds vp[1][j] = v + @inbounds vp[c][j] = v + end end nothing @@ -137,27 +141,33 @@ end # TODO # - generalise to all D # - parallelise over multiple directions? -# - choose an optimal memory access pattern +# - is it possible to optimise the memory access patterns? @inline function gridvalues_to_local_memory!( u_local::AbstractArray{T, D}, u_global::AbstractArray{T, D}, ::Val{M}, block_index::Dims{D}, block_dims::Dims{D}, - threadidx::Integer, - nthreads::Integer, + threadidxs::Dims{D}, + groupsize::Dims{D}, ) where {T, D, M} Ns = size(u_global) @assert D == 3 - inds_1 = axes(u_local, 1) - inds_2 = axes(u_local, 2) - inds_3 = axes(u_local, 3)[threadidx:nthreads:end] # parallelise over outermost dimension (some threads might not work here) + inds_1 = axes(u_local, 1)[threadidxs[1]:groupsize[1]:end] + inds_2 = axes(u_local, 2)[threadidxs[2]:groupsize[2]:end] + inds_3 = axes(u_local, 3)[threadidxs[3]:groupsize[3]:end] offsets = @. (block_index - 1) * block_dims - (M - 1) @inbounds for i_3 in inds_3 - j_3 = mod1(offsets[3] + i_3, Ns[3]) + j_3 = offsets[3] + i_3 + j_3 = ifelse(j_3 ≤ 0, j_3 + Ns[3], j_3) + j_3 = ifelse(j_3 > Ns[3], j_3 - Ns[3], j_3) for i_2 in inds_2 - j_2 = mod1(offsets[2] + i_2, Ns[2]) + j_2 = offsets[2] + i_2 + j_2 = ifelse(j_2 ≤ 0, j_2 + Ns[2], j_2) + j_2 = ifelse(j_2 > Ns[2], j_2 - Ns[2], j_2) for i_1 in inds_1 - j_1 = mod1(offsets[1] + i_1, Ns[1]) + j_1 = offsets[1] + i_1 + j_1 = ifelse(j_1 ≤ 0, j_1 + Ns[1], j_1) + j_1 = ifelse(j_1 > Ns[1], j_1 - Ns[1], j_1) is = (i_1, i_2, i_3) js = (j_1, j_2, j_3) u_local[is...] = u_global[js...] # TODO: use linear index instead of is? @@ -216,10 +226,9 @@ function interpolate!( block_dims = Val(block_dims_val) # ...which means this doesn't require a dynamic dispatch @assert block_dims_val === bd.block_dims let ngroups = bd.nblocks_per_dir # this is the required number of workgroups (number of blocks in CUDA) - groupsize = 64 # TODO: how to choose this? roughly equal to the number of non-uniform points per block (or less)? - groupsize_dims = ntuple(d -> d == 1 ? groupsize : 1, D) # "augment" first dimension - ndrange = groupsize_dims .* ngroups - kernel! 
= interpolate_to_points_shmem_kernel!(backend, groupsize_dims, ndrange) + groupsize = groupsize_shmem(ngroups, length(x⃗s)) + ndrange = groupsize .* ngroups + kernel! = interpolate_to_points_shmem_kernel!(backend, groupsize, ndrange) kernel!(vp_sorted, gs, xs_comp, us, pointperm_, bd.cumulative_npoints_per_block, Δxs, block_dims) end end @@ -235,6 +244,29 @@ function interpolate!( vp_all end +# Determine groupsize (number of threads per direction) in :shared_memory method. +# Here Np is the number of non-uniform points. +function groupsize_shmem(ngroups::NTuple{D}, Np) where {D} + points_per_group = Np / prod(ngroups) # average number of points per block + groupsize = 64 # minimum group size should be equal to the warp size (usually 32 on CUDA and 64 on AMDGPU) + # Increase groupsize if number of points is much larger (at least 4×) than the groupsize. + # (Not sure this is a good idea if the point distribution is very inhomogeneous...) + # TODO: maybe leave it at 64? + while groupsize < 1024 && 4 * groupsize ≤ points_per_group + groupsize *= 2 + end + gsizes = ntuple(_ -> 1, Val(D)) + p = 1 # product of sizes + i = 1 + while p < groupsize + gsizes = Base.setindex(gsizes, gsizes[i] * 2, i) + p *= 2 + i = mod1(i + 1, D) + end + @assert p == groupsize == prod(gsizes) + gsizes +end + @inline function interpolate_from_arrays_gpu( us::NTuple{C, AbstractArray{T, D}}, indvals::NTuple{D, <:Pair}, From ecef0383f0dc7118262725fd636bca3ca7ef5b23 Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Fri, 18 Oct 2024 12:21:47 +0200 Subject: [PATCH 06/33] Further optimisations --- src/interpolation/gpu.jl | 49 +++++++++++++++++++++++++--------------- 1 file changed, 31 insertions(+), 18 deletions(-) diff --git a/src/interpolation/gpu.jl b/src/interpolation/gpu.jl index 546dca8..e05529f 100644 --- a/src/interpolation/gpu.jl +++ b/src/interpolation/gpu.jl @@ -140,8 +140,7 @@ end # TODO # - generalise to all D -# - parallelise over multiple directions? -# - is it possible to optimise the memory access patterns? +# - how to optimise the memory access patterns? @inline function gridvalues_to_local_memory!( u_local::AbstractArray{T, D}, u_global::AbstractArray{T, D}, @@ -155,18 +154,19 @@ end inds_1 = axes(u_local, 1)[threadidxs[1]:groupsize[1]:end] inds_2 = axes(u_local, 2)[threadidxs[2]:groupsize[2]:end] inds_3 = axes(u_local, 3)[threadidxs[3]:groupsize[3]:end] - offsets = @. (block_index - 1) * block_dims - (M - 1) + offsets = ntuple(Val(D)) do n + @inline + off = (block_index[n] - 1) * block_dims[n] - (M - 1) + ifelse(off < 0, off + Ns[n], off) # make sure the offset is non-negative (to avoid some wrapping below) + end @inbounds for i_3 in inds_3 j_3 = offsets[3] + i_3 - j_3 = ifelse(j_3 ≤ 0, j_3 + Ns[3], j_3) j_3 = ifelse(j_3 > Ns[3], j_3 - Ns[3], j_3) for i_2 in inds_2 j_2 = offsets[2] + i_2 - j_2 = ifelse(j_2 ≤ 0, j_2 + Ns[2], j_2) j_2 = ifelse(j_2 > Ns[2], j_2 - Ns[2], j_2) for i_1 in inds_1 j_1 = offsets[1] + i_1 - j_1 = ifelse(j_1 ≤ 0, j_1 + Ns[1], j_1) j_1 = ifelse(j_1 > Ns[1], j_1 - Ns[1], j_1) is = (i_1, i_2, i_3) js = (j_1, j_2, j_3) @@ -226,7 +226,9 @@ function interpolate!( block_dims = Val(block_dims_val) # ...which means this doesn't require a dynamic dispatch @assert block_dims_val === bd.block_dims let ngroups = bd.nblocks_per_dir # this is the required number of workgroups (number of blocks in CUDA) - groupsize = groupsize_shmem(ngroups, length(x⃗s)) + block_dims_padded = @. 
block_dims_val + 2M - 1 # dimensions of shared memory array + groupsize = groupsize_shmem(ngroups, block_dims_padded, length(x⃗s)) + # @show ngroups block_dims block_dims_padded ndrange = groupsize .* ngroups kernel! = interpolate_to_points_shmem_kernel!(backend, groupsize, ndrange) kernel!(vp_sorted, gs, xs_comp, us, pointperm_, bd.cumulative_npoints_per_block, Δxs, block_dims) @@ -246,22 +248,33 @@ end # Determine groupsize (number of threads per direction) in :shared_memory method. # Here Np is the number of non-uniform points. -function groupsize_shmem(ngroups::NTuple{D}, Np) where {D} - points_per_group = Np / prod(ngroups) # average number of points per block +# The distribution of threads across directions mainly (only?) affects the copies between +# shared and global memories, so it can have an important influence of performance due to +# memory accesses. +function groupsize_shmem(ngroups::NTuple{D}, shmem_size::NTuple{D}, Np) where {D} + # (1) Determine the total number of threads. groupsize = 64 # minimum group size should be equal to the warp size (usually 32 on CUDA and 64 on AMDGPU) # Increase groupsize if number of points is much larger (at least 4×) than the groupsize. - # (Not sure this is a good idea if the point distribution is very inhomogeneous...) - # TODO: maybe leave it at 64? - while groupsize < 1024 && 4 * groupsize ≤ points_per_group - groupsize *= 2 - end + # Not sure this is a good idea if the point distribution is very inhomogeneous... + # TODO see if this improves performance in highly dense cases + # points_per_group = Np / prod(ngroups) # average number of points per block + # while groupsize < 1024 && 4 * groupsize ≤ points_per_group + # groupsize *= 2 + # end + # (2) Determine number of threads in each direction. + # This mainly affects the performance of global -> shared memory copies. + # It seems like it's better to parallelise the outer dimensions first. gsizes = ntuple(_ -> 1, Val(D)) p = 1 # product of sizes - i = 1 + i = D # parallelise outer dimensions first while p < groupsize - gsizes = Base.setindex(gsizes, gsizes[i] * 2, i) - p *= 2 - i = mod1(i + 1, D) + if gsizes[i] < shmem_size[i] || i == 1 + gsizes = Base.setindex(gsizes, gsizes[i] * 2, i) + p *= 2 + else + @assert i > 1 + i -= 1 + end end @assert p == groupsize == prod(gsizes) gsizes From e4aa4ca9f9a3a7564eeb44e164cc7a01bdb55151 Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Fri, 18 Oct 2024 13:24:20 +0200 Subject: [PATCH 07/33] Generalise interpolation to all dimensions --- src/interpolation/gpu.jl | 141 +++++++++++++++++++++++++-------------- 1 file changed, 91 insertions(+), 50 deletions(-) diff --git a/src/interpolation/gpu.jl b/src/interpolation/gpu.jl index e05529f..7cca3b9 100644 --- a/src/interpolation/gpu.jl +++ b/src/interpolation/gpu.jl @@ -68,7 +68,6 @@ end block_index = @index(Group, NTuple) # workgroup index (= block index) block_n = @index(Group, Linear) # linear index of block - # TODO: support C > 1 (do one component at a time, to use less memory?) T = eltype(us[1]) M = Kernels.half_support(gs[1]) # assume they're all equal block_dims_padded = @. block_dims + 2M - 1 @@ -77,7 +76,7 @@ end # This block will take care of non-uniform points (a + 1):b @inbounds a = cumulative_npoints_per_block[block_n] @inbounds b = cumulative_npoints_per_block[block_n + 1] - ΔV = prod(Δxs) + prefactor = prod(Δxs) # interpolations need to be multiplied by the volume of a grid cell # Shift from indices in global array to indices in local array idx_shift = @. 
(block_index - 1) * block_dims + 1 @@ -97,6 +96,7 @@ end @inbounds pointperm[i] end + # TODO: can we do this just once for all components? (if C > 1) indvals = ntuple(Val(D)) do n @inline x = @inbounds points[n][j] @@ -109,28 +109,7 @@ end i₀ => vals end - inds_start = map(first, indvals) - vals = map(last, indvals) # evaluated kernel values in each direction - inds = map(eachindex, vals) # = (1:L, 1:L, ...) where L = 2M is the kernel width - - gprod_4 = ΔV - v = zero(T) - - @inbounds for i_3 in inds[3] - gprod_3 = gprod_4 * vals[3][i_3] - j_3 = inds_start[3] + i_3 - for i_2 in inds[2] - gprod_2 = gprod_3 * vals[2][i_2] - j_2 = inds_start[2] + i_2 - for i_1 in inds[1] - gprod_1 = gprod_2 * vals[1][i_1] - j_1 = inds_start[1] + i_1 - js = (j_1, j_2, j_3) - v += gprod_1 * u_local[js...] - end - end - end - + v = interpolate_from_arrays_shmem(u_local, indvals, prefactor) @inbounds vp[c][j] = v end end @@ -138,9 +117,8 @@ end nothing end -# TODO -# - generalise to all D -# - how to optimise the memory access patterns? +# Copy values from global to shared memory. +# - can we optimise memory access patterns? @inline function gridvalues_to_local_memory!( u_local::AbstractArray{T, D}, u_global::AbstractArray{T, D}, @@ -149,32 +127,95 @@ end threadidxs::Dims{D}, groupsize::Dims{D}, ) where {T, D, M} - Ns = size(u_global) - @assert D == 3 - inds_1 = axes(u_local, 1)[threadidxs[1]:groupsize[1]:end] - inds_2 = axes(u_local, 2)[threadidxs[2]:groupsize[2]:end] - inds_3 = axes(u_local, 3)[threadidxs[3]:groupsize[3]:end] - offsets = ntuple(Val(D)) do n - @inline - off = (block_index[n] - 1) * block_dims[n] - (M - 1) - ifelse(off < 0, off + Ns[n], off) # make sure the offset is non-negative (to avoid some wrapping below) - end - @inbounds for i_3 in inds_3 - j_3 = offsets[3] + i_3 - j_3 = ifelse(j_3 > Ns[3], j_3 - Ns[3], j_3) - for i_2 in inds_2 - j_2 = offsets[2] + i_2 - j_2 = ifelse(j_2 > Ns[2], j_2 - Ns[2], j_2) - for i_1 in inds_1 - j_1 = offsets[1] + i_1 - j_1 = ifelse(j_1 > Ns[1], j_1 - Ns[1], j_1) - is = (i_1, i_2, i_3) - js = (j_1, j_2, j_3) - u_local[is...] = u_global[js...] # TODO: use linear index instead of is? + if @generated + quote + Ns = size(u_global) + inds = @ntuple($D, n -> axes(u_local, n)[threadidxs[n]:groupsize[n]:end]) + offsets = @ntuple( + $D, + n -> let + off = (block_index[n] - 1) * block_dims[n] - ($M - 1) + ifelse(off < 0, off + Ns[n], off) # make sure the offset is non-negative (to avoid some wrapping below) + end + ) + @nloops( + $D, i, + d -> inds[d], + d -> begin + j_d = offsets[d] + i_d + j_d = ifelse(j_d > Ns[d], j_d - Ns[d], j_d) + end, + begin + is = @ntuple($D, i) + js = @ntuple($D, j) + @inbounds u_local[is...] = u_global[js...] + end, + ) + nothing + end + else + Ns = size(u_global) + inds = ntuple(Val(D)) do n + axes(u_local, n)[threadidxs[n]:groupsize[n]:end] # this determines the parallelisation pattern + end + offsets = ntuple(Val(D)) do n + @inline + off = (block_index[n] - 1) * block_dims[n] - (M - 1) + ifelse(off < 0, off + Ns[n], off) # make sure the offset is non-negative (to avoid some wrapping below) + end + @inbounds for is ∈ Iterators.product(inds...) + js = ntuple(Val(D)) do n + @inline + j = offsets[n] + is[n] + ifelse(j > Ns[n], j - Ns[n], j) end + u_local[is...] = u_global[js...] end + nothing + end +end + +# Interpolate a single "component" (one transform at a time). +# Here vp is a vector instead of a tuple of vectors. 
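+# It returns the value interpolated at a single non-uniform point, reading only from the
+# shared-memory block u_local; indvals contains, for each direction, the starting index
+# within the local (padded) block and the 2M evaluated kernel values.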
+@inline function interpolate_from_arrays_shmem( + u_local::AbstractArray{T, D}, + indvals::NTuple{D}, + prefactor, + ) where {T, D} + if @generated + gprod_init = Symbol(:gprod_, D + 1) # the name of this variable is important! + quote + $gprod_init = prefactor + v = zero(T) + inds_start = map(first, indvals) + vals = map(last, indvals) # evaluated kernel values in each direction + inds = map(eachindex, vals) # = (1:L, 1:L, ...) where L = 2M is the kernel width + @nloops( + $D, i, + d -> inds[d], + d -> begin + @inbounds gprod_d = gprod_{d + 1} * vals[d][i_d] + @inbounds j_d = inds_start[d] + i_d + end, + begin + js = @ntuple($D, j) + @inbounds v += gprod_1 * u_local[js...] + end, + ) + v + end + else + v = zero(T) + inds_start = map(first, indvals) + vals = map(last, indvals) # evaluated kernel values in each direction + inds = map(eachindex, vals) # = (1:L, 1:L, ...) where L = 2M is the kernel width + @inbounds for I ∈ CartesianIndices(inds) + gprod = prefactor * prod(ntuple(d -> @inbounds(vals[d][I[d]]), Val(D))) + js = inds_start .+ Tuple(I) + v += gprod * u_local[js...] + end + v end - nothing end function interpolate!( From 7d1e4f6d0440dbeed12051397c4d624bc42e9843 Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Fri, 18 Oct 2024 13:26:35 +0200 Subject: [PATCH 08/33] Use get_inds_vals_gpu in spreading --- src/interpolation/gpu.jl | 3 ++- src/spreading/gpu.jl | 14 +------------- 2 files changed, 3 insertions(+), 14 deletions(-) diff --git a/src/interpolation/gpu.jl b/src/interpolation/gpu.jl index 7cca3b9..2589377 100644 --- a/src/interpolation/gpu.jl +++ b/src/interpolation/gpu.jl @@ -1,6 +1,6 @@ using StaticArrays: MVector -# TODO: use this also in spreading, and move to separate file +# Note: this is also used in spreading @inline function get_inds_vals_gpu(gs::NTuple{D}, points::NTuple{D}, Ns::NTuple{D}, j::Integer) where {D} ntuple(Val(D)) do n @inline @@ -8,6 +8,7 @@ using StaticArrays: MVector end end +# Note: this is also used in spreading @inline function get_inds_vals_gpu(g::AbstractKernelData, points::AbstractVector, N::Integer, j::Integer) x = @inbounds points[j] gdata = Kernels.evaluate_kernel(g, x) diff --git a/src/spreading/gpu.jl b/src/spreading/gpu.jl index 838aa79..3df8aad 100644 --- a/src/spreading/gpu.jl +++ b/src/spreading/gpu.jl @@ -23,19 +23,7 @@ @assert eltype(first(us)) <: Real # output is a real array (but may actually describe complex data) Ns = spread_actual_dims(Z, Ns_real) # divides the Ns_real[1] by 2 if Z <: Complex - indvals = ntuple(Val(D)) do n - @inline - @inbounds begin - g = gs[n] - x = points[n][j] - gdata = Kernels.evaluate_kernel(g, x) - vals = gdata.values - M = Kernels.half_support(gs[n]) - i₀ = gdata.i - M # active region is (i₀ + 1):(i₀ + 2M) (up to periodic wrapping) - i₀ = ifelse(i₀ < 0, i₀ + Ns[n], i₀) # make sure i₀ ≥ 0 - i₀ => vals - end - end + indvals = get_inds_vals_gpu(gs, points, Ns, j) v⃗ = map(v -> @inbounds(v[j]), vp) spread_onto_arrays_gpu!(us, indvals, v⃗, Ns) From 2a34d0b74a3e3be35560b5afc2cb6479dbf4b61a Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Fri, 18 Oct 2024 15:18:23 +0200 Subject: [PATCH 09/33] Spreading with shmem works but is slow Atomic add on shared memory is very slow. Is it Atomix's fault? 
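A possible way to answer that question is a micro-benchmark that runs the same
shared-memory accumulation with and without Atomix.@atomic and times the two variants
separately (e.g. with CUDA.@elapsed). The sketch below is purely illustrative and not part
of this patch; the kernel name, sizes and the choice of CUDA.jl are all assumptions:

    using CUDA, KernelAbstractions, Atomix

    @kernel function accum_shmem_kernel!(out, ::Val{use_atomics}) where {use_atomics}
        T = eltype(out)
        u = @localmem(T, 64)               # small shared-memory buffer per workgroup
        it = @index(Local, Linear)
        ig = @index(Group, Linear)
        u[it] = zero(T)
        @synchronize
        for k in 1:100
            j = mod1(it + k, 64)
            if use_atomics
                Atomix.@atomic u[j] += one(T)   # atomic add on shared memory
            else
                u[j] += one(T)                  # plain (racy) add, for timing comparison only
            end
        end
        @synchronize
        if it == 1
            @inbounds out[ig] = u[1]
        end
    end

    backend = CUDABackend()
    out = KernelAbstractions.zeros(backend, Float32, 1024)
    kernel! = accum_shmem_kernel!(backend, 64)      # 64 threads per workgroup
    kernel!(out, Val(true); ndrange = 64 * 1024)    # with atomics
    kernel!(out, Val(false); ndrange = 64 * 1024)   # without atomics
    KernelAbstractions.synchronize(backend)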
--- src/interpolation/gpu.jl | 40 +++---- src/spreading/gpu.jl | 229 ++++++++++++++++++++++++++++++++++++--- 2 files changed, 236 insertions(+), 33 deletions(-) diff --git a/src/interpolation/gpu.jl b/src/interpolation/gpu.jl index 2589377..41c86cd 100644 --- a/src/interpolation/gpu.jl +++ b/src/interpolation/gpu.jl @@ -26,7 +26,7 @@ end @Const(points::NTuple{D}), @Const(us::NTuple{C}), @Const(pointperm), - @Const(Δxs::NTuple{D}), # grid step in each direction (oversampled grid) + @Const(prefactor::Real), # = volume of a grid cell = prod(Δxs) ) where {C, D} i = @index(Global, Linear) @@ -43,7 +43,7 @@ end indvals = get_inds_vals_gpu(gs, points, Ns, j) - v⃗ = interpolate_from_arrays_gpu(us, indvals, Ns, Δxs) + v⃗ = interpolate_from_arrays_gpu(us, indvals, Ns, prefactor) for n ∈ eachindex(vp, v⃗) @inbounds vp[n][j] = v⃗[n] @@ -59,7 +59,7 @@ end @Const(us::NTuple{C}), @Const(pointperm), @Const(cumulative_npoints_per_block::AbstractVector), - @Const(Δxs::NTuple{D}), # grid step in each direction (oversampled grid) + @Const(prefactor::Real), # = volume of a grid cell = prod(Δxs) ::Val{block_dims}, ) where {C, D, block_dims} groupsize = @groupsize()::Dims{D} @@ -77,7 +77,6 @@ end # This block will take care of non-uniform points (a + 1):b @inbounds a = cumulative_npoints_per_block[block_n] @inbounds b = cumulative_npoints_per_block[block_n + 1] - prefactor = prod(Δxs) # interpolations need to be multiplied by the volume of a grid cell # Shift from indices in global array to indices in local array idx_shift = @. (block_index - 1) * block_dims + 1 @@ -105,8 +104,8 @@ end local vals = gdata.values # kernel values local M = Kernels.half_support(gs[n]) local i₀ = gdata.i - idx_shift[n] - @assert i₀ ≥ 0 - @assert i₀ + 2M ≤ block_dims_padded[n] + # @assert i₀ ≥ 0 + # @assert i₀ + 2M ≤ block_dims_padded[n] i₀ => vals end @@ -123,10 +122,7 @@ end @inline function gridvalues_to_local_memory!( u_local::AbstractArray{T, D}, u_global::AbstractArray{T, D}, - ::Val{M}, - block_index::Dims{D}, block_dims::Dims{D}, - threadidxs::Dims{D}, - groupsize::Dims{D}, + ::Val{M}, block_index::Dims{D}, block_dims::Dims{D}, threadidxs::Dims{D}, groupsize::Dims{D}, ) where {T, D, M} if @generated quote @@ -233,6 +229,7 @@ function interpolate!( xs_comp = StructArrays.components(x⃗s) Δxs = map(Kernels.gridstep, gs) + prefactor = prod(Δxs) # interpolations need to be multiplied by the volume of a grid cell pointperm = get_pointperm(bd) # nothing in case of NullBlockData sort_points = get_sort_points(bd)::StaticBool # False in the case of NullBlockData @@ -249,15 +246,17 @@ function interpolate!( pointperm_ = pointperm end + # We use dynamically sized kernels to avoid recompilation, since number of points may + # change from one call to another. + ndrange_points = size(x⃗s) # iterate through points + groupsize_points = default_workgroupsize(backend, ndrange_points) + method = gpu_method(bd) if method === :global_memory - # We use dynamically sized kernels to avoid recompilation, since number of points may - # change from one call to another. - let ndrange = size(x⃗s) # iterate through points - groupsize = default_workgroupsize(backend, ndrange) + let ndrange = ndrange_points, groupsize = groupsize_points kernel! 
= interpolate_to_point_naive_kernel!(backend, groupsize) - kernel!(vp_sorted, gs, xs_comp, us, pointperm_, Δxs; ndrange) + kernel!(vp_sorted, gs, xs_comp, us, pointperm_, prefactor; ndrange) end elseif method === :shared_memory @assert bd isa BlockDataGPU @@ -273,12 +272,12 @@ function interpolate!( # @show ngroups block_dims block_dims_padded ndrange = groupsize .* ngroups kernel! = interpolate_to_points_shmem_kernel!(backend, groupsize, ndrange) - kernel!(vp_sorted, gs, xs_comp, us, pointperm_, bd.cumulative_npoints_per_block, Δxs, block_dims) + kernel!(vp_sorted, gs, xs_comp, us, pointperm_, bd.cumulative_npoints_per_block, prefactor, block_dims) end end if sort_points === True() - let groupsize = default_workgroupsize(backend, ndrange) + let ndrange = ndrange_points, groupsize = groupsize_points kernel_perm! = interp_permute_kernel!(backend, groupsize) kernel_perm!(vp_all, vp_sorted, pointperm; ndrange) foreach(KA.unsafe_free!, vp_sorted) # manually deallocate temporary arrays @@ -293,6 +292,7 @@ end # The distribution of threads across directions mainly (only?) affects the copies between # shared and global memories, so it can have an important influence of performance due to # memory accesses. +# NOTE: this is also used in spreading function groupsize_shmem(ngroups::NTuple{D}, shmem_size::NTuple{D}, Np) where {D} # (1) Determine the total number of threads. groupsize = 64 # minimum group size should be equal to the warp size (usually 32 on CUDA and 64 on AMDGPU) @@ -326,7 +326,7 @@ end us::NTuple{C, AbstractArray{T, D}}, indvals::NTuple{D, <:Pair}, Ns::Dims{D}, - Δxs::NTuple{D, Tr}, + prefactor::Tr, ) where {T, Tr, C, D} if @generated gprod_init = Symbol(:gprod_, D + 1) # the name of this variable is important! @@ -336,7 +336,7 @@ end vals = map(last, indvals) # evaluated kernel values in each direction inds = map(eachindex, vals) # = (1:L, 1:L, ...) where L = 2M is the kernel width vs = zero(MVector{$C, $T}) # interpolated value (output) - $gprod_init = prod(Δxs) # product of kernel values (initially Δx[1] * Δx[2] * ...) + $gprod_init = prefactor # product of kernel values (initially Δx[1] * Δx[2] * ...) @nloops( $D, i, d -> inds[d], # for i_d ∈ 1:L @@ -368,7 +368,7 @@ end vals_first, vals_tail = first(vals), Base.tail(vals) istart_first, istart_tail = first(inds_start), Base.tail(inds_start) N, Ns_tail = first(Ns), Base.tail(Ns) - gprod_base = prod(Δxs) # this factor is needed in interpolation only + gprod_base = prefactor # this factor is needed in interpolation only @inbounds for I_tail ∈ CartesianIndices(inds_tail) is_tail = Tuple(I_tail) gs_tail = map(inbounds_getindex, vals_tail, is_tail) diff --git a/src/spreading/gpu.jl b/src/spreading/gpu.jl index 3df8aad..ed5affb 100644 --- a/src/spreading/gpu.jl +++ b/src/spreading/gpu.jl @@ -34,6 +34,9 @@ end @inline spread_actual_dims(::Type{<:Real}, Ns) = Ns @inline spread_actual_dims(::Type{<:Complex}, Ns) = Base.setindex(Ns, Ns[1] >> 1, 1) # actual number of complex elements in first dimension +@inline spread_real_dims(::Type{<:Real}, Ns) = Ns +@inline spread_real_dims(::Type{<:Complex}, Ns) = Base.setindex(Ns, Ns[1] << 1, 1) + @inline function spread_onto_arrays_gpu!( us::NTuple{C, AbstractArray{T, D}}, indvals::NTuple{D, <:Pair}, @@ -58,7 +61,7 @@ end end, begin # gprod_1 contains the product vals[1][i_1] * vals[2][i_2] * ... 
- js = @ntuple $D j + js = @ntuple($D, j) @inbounds for n ∈ 1:$C w = vs[n] * gprod_1 _atomic_add!(us[n], w, js) @@ -125,7 +128,7 @@ function spread_from_points!( backend::GPU, bd::Union{BlockDataGPU, NullBlockData}, gs, - us_all::NTuple{C, AbstractGPUArray}, + us::NTuple{C, AbstractGPUArray}, x⃗s::StructVector, vp_all::NTuple{C, AbstractGPUVector}, ) where {C} @@ -135,16 +138,16 @@ function spread_from_points!( xs_comp = StructArrays.components(x⃗s) - # Reinterpret `us_all` as real arrays, in case they are complex. + # Reinterpret `us` as real arrays, in case they are complex. # This is to avoid current issues with atomic operations on complex data # (https://github.com/JuliaGPU/KernelAbstractions.jl/issues/497). - Z = eltype(us_all[1]) + Z = eltype(us[1]) T = real(Z) us_real = if Z <: Real - us_all + us else # complex case @assert Z <: Complex - map(u -> reinterpret(T, u), us_all) # note: we don't use reshape, so the first dimension has 2x elements + map(u -> reinterpret(T, u), us) # note: we don't use reshape, so the first dimension has 2x elements end pointperm = get_pointperm(bd) # nothing in case of NullBlockData @@ -156,27 +159,50 @@ function spread_from_points!( # We use dynamically sized kernels to avoid recompilation, since number of points may # change from one call to another. - ndrange = size(x⃗s) # iterate through points - workgroupsize = default_workgroupsize(backend, ndrange) + ndrange_points = size(x⃗s) # iterate through points + groupsize_points = default_workgroupsize(backend, ndrange_points) if sort_points === True() vp_sorted = map(similar, vp_all) # allocate temporary arrays for sorted non-uniform data - kernel_perm! = spread_permute_kernel!(backend, workgroupsize) - kernel_perm!(vp_sorted, vp_all, pointperm; ndrange) + let ndrange = ndrange_points, groupsize = groupsize_points + kernel_perm! = spread_permute_kernel!(backend, groupsize) + kernel_perm!(vp_sorted, vp_all, pointperm; ndrange) + end pointperm_ = nothing # we don't need any further permutations (all accesses to non-uniform data will be contiguous) else vp_sorted = vp_all pointperm_ = pointperm end - kernel! = spread_from_point_naive_kernel!(backend, workgroupsize) - kernel!(us_real, gs, xs_comp, vp_sorted, pointperm_; ndrange) + method = gpu_method(bd) + + if method === :global_memory + let ndrange = ndrange_points, groupsize = groupsize_points + kernel! = spread_from_point_naive_kernel!(backend, groupsize) + kernel!(us_real, gs, xs_comp, vp_sorted, pointperm_; ndrange) + end + elseif method === :shared_memory + @assert bd isa BlockDataGPU + Z = eltype(us[1]) + M = Kernels.half_support(gs[1]) + @assert all(g -> Kernels.half_support(g) === M, gs) # check that they're all equal + block_dims_val = block_dims_gpu_shmem(Z, size(us[1]), HalfSupport(M)) # this is usually a compile-time constant... + block_dims = Val(block_dims_val) # ...which means this doesn't require a dynamic dispatch + @assert block_dims_val === bd.block_dims + let ngroups = bd.nblocks_per_dir # this is the required number of workgroups (number of blocks in CUDA) + block_dims_padded = @. block_dims_val + 2M - 1 # dimensions of shared memory array + groupsize = groupsize_shmem(ngroups, block_dims_padded, length(x⃗s)) + ndrange = groupsize .* ngroups + kernel! 
= spread_from_points_shmem_kernel!(backend, groupsize, ndrange) + kernel!(us_real, gs, xs_comp, vp_sorted, pointperm_, bd.cumulative_npoints_per_block, block_dims) + end + end if sort_points === True() foreach(KA.unsafe_free!, vp_sorted) # manually deallocate temporary arrays end - us_all + us end @kernel function spread_permute_kernel!(vp::NTuple{N}, @Const(vp_in::NTuple{N}), @Const(perm::AbstractVector)) where {N} @@ -187,3 +213,180 @@ end end nothing end + +## ==================================================================================================== ## +## Shared-memory implementation + +@kernel function spread_from_points_shmem_kernel!( + us::NTuple{C}, + @Const(gs::NTuple{D}), + @Const(points::NTuple{D}), + @Const(vp::NTuple{C}), + @Const(pointperm), + @Const(cumulative_npoints_per_block::AbstractVector), + ::Val{block_dims}, + ) where {C, D, block_dims} + groupsize = @groupsize()::Dims{D} + nthreads = prod(groupsize) + threadidxs = @index(Local, NTuple) # in (1:nthreads_x, 1:nthreads_y, ...) + threadidx = @index(Local, Linear) # in 1:nthreads + block_index = @index(Group, NTuple) # workgroup index (= block index) + block_n = @index(Group, Linear) # linear index of block + + # Determine grid dimensions. + # Note that, to avoid problems with @atomic, we currently work with real-data arrays + # even when the output is complex. So, if `Z <: Complex`, the first dimension has twice + # the actual dataset dimensions. + Z = eltype(vp[1]) + T = eltype(us[1]) # may be Z or real(Z) + @assert T === real(Z) # output is a real array (but may actually describe complex data) + Ns_real = size(first(us)) # dimensions of raw input data + Ns = spread_actual_dims(Z, Ns_real) # divides the Ns_real[1] by 2 if Z <: Complex + + M = Kernels.half_support(gs[1]) # assume they're all equal + block_dims_padded = @. block_dims + 2M - 1 + Ns_local = spread_real_dims(Z, block_dims_padded) # multiplies first dimension by 2 if Z <: Complex (note that T <: Real) + u_local = @localmem(T, Ns_local) # allocate shared memory + + # This block will take care of non-uniform points (a + 1):b + @inbounds a = cumulative_npoints_per_block[block_n] + @inbounds b = cumulative_npoints_per_block[block_n + 1] + + # Shift from indices in global array to indices in local array + idx_shift = @. (block_index - 1) * block_dims + 1 + + # Interpolate components one by one (to avoid using too much memory) + for c ∈ 1:C + # Reset shared memory to zero + for i ∈ threadidx:nthreads:length(u_local) + @inbounds u_local[i] = 0 + end + + @synchronize # make sure shared memory is fully set to zero + + for i in (a + threadidx):nthreads:b + # Spread from point j + j = if pointperm === nothing + i + else + @inbounds pointperm[i] + end + + # TODO: can we do this just once for all components? (if C > 1) + indvals = ntuple(Val(D)) do n + @inline + x = @inbounds points[n][j] + gdata = Kernels.evaluate_kernel(gs[n], x) + local vals = gdata.values # kernel values + local M = Kernels.half_support(gs[n]) + local i₀ = gdata.i - idx_shift[n] + # @assert i₀ ≥ 0 + # @assert i₀ + 2M ≤ block_dims_padded[n] + i₀ => vals + end + + @inbounds v = vp[c][j] + spread_onto_array_shmem!(u_local, indvals, v) + end + + @synchronize # make sure we have finished writing to shared memory + + add_from_local_to_global_memory!(Z, us[c], u_local, Ns, Val(M), block_index, block_dims, threadidxs, groupsize) + + if c < C + @synchronize # wait before jumping to next component + end + end + + nothing +end + +# Spread a single "component" (one transform at a time). 
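Note that the non-`@generated` fallback of `spread_onto_array_shmem!`, defined just below, is left unimplemented at this point of the series (the `aaa # TODO` branch). A possible dimension-agnostic sketch, mirroring the `CartesianIndices` fallback used for interpolation elsewhere in the series and reusing the function's own arguments, could look as follows (illustrative only, not part of the patch):

    inds_start = map(first, indvals)  # start of active region in the local array
    vals = map(last, indvals)         # evaluated kernel values in each direction
    @inbounds for I ∈ CartesianIndices(map(eachindex, vals))
        gprod = prod(ntuple(d -> @inbounds(vals[d][I[d]]), Val(D)))  # product of 1D kernel values
        js = inds_start .+ Tuple(I)
        w = v * gprod
        _atomic_add!(u_local, w, js)  # same atomic helper as in the @generated branch
    end
    v
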
+@inline function spread_onto_array_shmem!( + u_local::AbstractArray{T, D}, + indvals::NTuple{D}, + v::Z, + ) where {T, D, Z} + if @generated + gprod_init = Symbol(:gprod_, D + 1) # the name of this variable is important! + Tr = real(T) + quote + inds_start = map(first, indvals) # start of active region in output array + vals = map(last, indvals) # evaluated kernel values in each direction + inds = map(eachindex, vals) # = (1:L, 1:L, ...) where L = 2M is the kernel width + $gprod_init = one($Tr) # product of kernel values (initially 1) + @nloops( + $D, i, + d -> inds[d], # for i_d ∈ 1:L + d -> begin + @inbounds gprod_d = gprod_{d + 1} * vals[d][i_d] + @inbounds j_d = inds_start[d] + i_d + end, + begin + js = @ntuple($D, j) + w = v * gprod_1 + _atomic_add!(u_local, w, js) # this is slow!! (atomics in shared memory) -- is it Atomix's fault? + end, + ) + v + end + else + aaa # TODO: implement + end +end + +# Add values from shared to global memory. +@inline function add_from_local_to_global_memory!( + ::Type{Z}, # "logical" type of data (e.g. ComplexF64) + u_global::AbstractArray{T, D}, # actual data is always real + u_local::AbstractArray{T, D}, + Ns::Dims{D}, + ::Val{M}, block_index::Dims{D}, block_dims::Dims{D}, threadidxs::Dims{D}, groupsize::Dims{D}, + ) where {Z, T, D, M} + if @generated + @assert T <: Real # for atomic operations + skip = sizeof(Z) ÷ sizeof(T) # 1 (Z real) or 2 (Z complex) + quote + # @assert skip ∈ (1, 2) + skips = @ntuple($D, n -> n == 1 ? $skip : 1) + inds = @ntuple($D, n -> axes(u_local, n)[1:(end ÷ skips[n])][threadidxs[n]:groupsize[n]:end]) + offsets = @ntuple( + $D, + n -> let + off = (block_index[n] - 1) * block_dims[n] - ($M - 1) + ifelse(off < 0, off + Ns[n], off) # make sure the offset is non-negative (to avoid some wrapping below) + end + ) + @nloops( + $D, i, + d -> inds[d], + d -> begin + j_d = offsets[d] + i_d + j_d = ifelse(j_d > Ns[d], j_d - Ns[d], j_d) + end, + begin + is = @ntuple($D, i) + js = @ntuple($D, j) + @inbounds if $skip === 1 # real data (Z <: Real) + w = u_local[is...] + Atomix.@atomic u_global[js...] += w + elseif $skip === 2 # complex data (Z <: Complex) + is_tail = @ntuple($(D - 1), d -> i_{d + 1}) + js_tail = @ntuple($(D - 1), d -> j_{d + 1}) + ifirst = 2 * i_1 - 1 + jfirst = 2 * j_1 - 1 + w = u_local[ifirst, is_tail...] # real part + Atomix.@atomic u_global[jfirst, js_tail...] += w + w = u_local[ifirst + 1, is_tail...] # imaginary part + Atomix.@atomic u_global[jfirst + 1, js_tail...] 
+= w + end + end, + ) + nothing + end + else + aaa # TODO: implement + end +end + +## ==================================================================================================== ## From 87f050cd8e31bbf4cc2f973eac9f6e18e1f15cfc Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Fri, 18 Oct 2024 22:00:51 +0200 Subject: [PATCH 10/33] SM kernels now work with KA CPU backends --- src/interpolation/gpu.jl | 45 +++++++++++++------------ src/spreading/gpu.jl | 71 +++++++++++++++++++++------------------- 2 files changed, 62 insertions(+), 54 deletions(-) diff --git a/src/interpolation/gpu.jl b/src/interpolation/gpu.jl index 41c86cd..cdedd67 100644 --- a/src/interpolation/gpu.jl +++ b/src/interpolation/gpu.jl @@ -53,33 +53,28 @@ end end @kernel function interpolate_to_points_shmem_kernel!( - vp::NTuple{C}, - @Const(gs::NTuple{D}), + vp::NTuple{C, AbstractVector{Z}}, + @Const(gs::NTuple{D, AbstractKernelData{<:Any, M}}), @Const(points::NTuple{D}), - @Const(us::NTuple{C}), + @Const(us::NTuple{C, AbstractArray{Z}}), @Const(pointperm), @Const(cumulative_npoints_per_block::AbstractVector), @Const(prefactor::Real), # = volume of a grid cell = prod(Δxs) ::Val{block_dims}, - ) where {C, D, block_dims} - groupsize = @groupsize()::Dims{D} - nthreads = prod(groupsize) + ::Val{shmem_size}, # this is a bit redundant, but seems to be required for CPU backends (used in tests) + ) where {C, D, Z <: Number, M, block_dims, shmem_size} + + @uniform begin + groupsize = @groupsize()::Dims{D} + nthreads = prod(groupsize) + end + threadidxs = @index(Local, NTuple) # in (1:nthreads_x, 1:nthreads_y, ...) threadidx = @index(Local, Linear) # in 1:nthreads block_index = @index(Group, NTuple) # workgroup index (= block index) block_n = @index(Group, Linear) # linear index of block - T = eltype(us[1]) - M = Kernels.half_support(gs[1]) # assume they're all equal - block_dims_padded = @. block_dims + 2M - 1 - u_local = @localmem(T, block_dims_padded) # allocate shared memory - - # This block will take care of non-uniform points (a + 1):b - @inbounds a = cumulative_npoints_per_block[block_n] - @inbounds b = cumulative_npoints_per_block[block_n + 1] - - # Shift from indices in global array to indices in local array - idx_shift = @. (block_index - 1) * block_dims + 1 + u_local = @localmem(Z, shmem_size) # allocate shared memory # Interpolate components one by one (to avoid using too much memory) for c ∈ 1:C @@ -88,6 +83,10 @@ end @synchronize # make sure all threads have the same shared data + # This block will take care of non-uniform points (a + 1):b + @inbounds a = cumulative_npoints_per_block[block_n] + @inbounds b = cumulative_npoints_per_block[block_n + 1] + for i in (a + threadidx):nthreads:b # Interpolate at point j j = if pointperm === nothing @@ -102,8 +101,8 @@ end x = @inbounds points[n][j] gdata = Kernels.evaluate_kernel(gs[n], x) local vals = gdata.values # kernel values - local M = Kernels.half_support(gs[n]) - local i₀ = gdata.i - idx_shift[n] + local ishift = (block_index[n] - 1) * block_dims[n] + 1 + local i₀ = gdata.i - ishift # @assert i₀ ≥ 0 # @assert i₀ + 2M ≤ block_dims_padded[n] i₀ => vals @@ -268,11 +267,15 @@ function interpolate!( @assert block_dims_val === bd.block_dims let ngroups = bd.nblocks_per_dir # this is the required number of workgroups (number of blocks in CUDA) block_dims_padded = @. 
block_dims_val + 2M - 1 # dimensions of shared memory array + shmem_size = block_dims_padded groupsize = groupsize_shmem(ngroups, block_dims_padded, length(x⃗s)) - # @show ngroups block_dims block_dims_padded ndrange = groupsize .* ngroups kernel! = interpolate_to_points_shmem_kernel!(backend, groupsize, ndrange) - kernel!(vp_sorted, gs, xs_comp, us, pointperm_, bd.cumulative_npoints_per_block, prefactor, block_dims) + kernel!( + vp_sorted, gs, xs_comp, us, pointperm_, bd.cumulative_npoints_per_block, + prefactor, block_dims, + Val(shmem_size), + ) end end diff --git a/src/spreading/gpu.jl b/src/spreading/gpu.jl index ed5affb..88c798e 100644 --- a/src/spreading/gpu.jl +++ b/src/spreading/gpu.jl @@ -191,10 +191,15 @@ function spread_from_points!( @assert block_dims_val === bd.block_dims let ngroups = bd.nblocks_per_dir # this is the required number of workgroups (number of blocks in CUDA) block_dims_padded = @. block_dims_val + 2M - 1 # dimensions of shared memory array + shmem_size = spread_real_dims(Z, block_dims_padded) groupsize = groupsize_shmem(ngroups, block_dims_padded, length(x⃗s)) ndrange = groupsize .* ngroups kernel! = spread_from_points_shmem_kernel!(backend, groupsize, ndrange) - kernel!(us_real, gs, xs_comp, vp_sorted, pointperm_, bd.cumulative_npoints_per_block, block_dims) + kernel!( + us_real, gs, xs_comp, vp_sorted, pointperm_, + bd.cumulative_npoints_per_block, block_dims, + Val(shmem_size), + ) end end @@ -218,42 +223,35 @@ end ## Shared-memory implementation @kernel function spread_from_points_shmem_kernel!( - us::NTuple{C}, - @Const(gs::NTuple{D}), + us::NTuple{C, AbstractArray{T}}, + @Const(gs::NTuple{D, AbstractKernelData{<:Any, M}}), @Const(points::NTuple{D}), - @Const(vp::NTuple{C}), + @Const(vp::NTuple{C, AbstractVector{Z}}), @Const(pointperm), @Const(cumulative_npoints_per_block::AbstractVector), ::Val{block_dims}, - ) where {C, D, block_dims} - groupsize = @groupsize()::Dims{D} - nthreads = prod(groupsize) + ::Val{shmem_size}, # this is a bit redundant, but seems to be required for CPU backends (used in tests) + ) where {C, D, T <: AbstractFloat, Z <: Number, M, block_dims, shmem_size} + + @uniform begin + groupsize = @groupsize()::Dims{D} + nthreads = prod(groupsize) + + # Determine grid dimensions. + # Note that, to avoid problems with @atomic, we currently work with real-data arrays + # even when the output is complex. So, if `Z <: Complex`, the first dimension has twice + # the actual dataset dimensions. + @assert T === real(Z) # output is a real array (but may actually describe complex data) + Ns_real = size(first(us)) # dimensions of raw input data + Ns = spread_actual_dims(Z, Ns_real) # divides the Ns_real[1] by 2 if Z <: Complex + end + + block_n = @index(Group, Linear) # linear index of block + block_index = @index(Group, NTuple) # workgroup index (= block index) threadidxs = @index(Local, NTuple) # in (1:nthreads_x, 1:nthreads_y, ...) threadidx = @index(Local, Linear) # in 1:nthreads - block_index = @index(Group, NTuple) # workgroup index (= block index) - block_n = @index(Group, Linear) # linear index of block - # Determine grid dimensions. - # Note that, to avoid problems with @atomic, we currently work with real-data arrays - # even when the output is complex. So, if `Z <: Complex`, the first dimension has twice - # the actual dataset dimensions. 
- Z = eltype(vp[1]) - T = eltype(us[1]) # may be Z or real(Z) - @assert T === real(Z) # output is a real array (but may actually describe complex data) - Ns_real = size(first(us)) # dimensions of raw input data - Ns = spread_actual_dims(Z, Ns_real) # divides the Ns_real[1] by 2 if Z <: Complex - - M = Kernels.half_support(gs[1]) # assume they're all equal - block_dims_padded = @. block_dims + 2M - 1 - Ns_local = spread_real_dims(Z, block_dims_padded) # multiplies first dimension by 2 if Z <: Complex (note that T <: Real) - u_local = @localmem(T, Ns_local) # allocate shared memory - - # This block will take care of non-uniform points (a + 1):b - @inbounds a = cumulative_npoints_per_block[block_n] - @inbounds b = cumulative_npoints_per_block[block_n + 1] - - # Shift from indices in global array to indices in local array - idx_shift = @. (block_index - 1) * block_dims + 1 + u_local = @localmem(T, shmem_size) # allocate shared memory # Interpolate components one by one (to avoid using too much memory) for c ∈ 1:C @@ -264,6 +262,10 @@ end @synchronize # make sure shared memory is fully set to zero + # This block will take care of non-uniform points (a + 1):b + @inbounds a = cumulative_npoints_per_block[block_n] + @inbounds b = cumulative_npoints_per_block[block_n + 1] + for i in (a + threadidx):nthreads:b # Spread from point j j = if pointperm === nothing @@ -278,8 +280,8 @@ end x = @inbounds points[n][j] gdata = Kernels.evaluate_kernel(gs[n], x) local vals = gdata.values # kernel values - local M = Kernels.half_support(gs[n]) - local i₀ = gdata.i - idx_shift[n] + local ishift = (block_index[n] - 1) * block_dims[n] + 1 + local i₀ = gdata.i - ishift # @assert i₀ ≥ 0 # @assert i₀ + 2M ≤ block_dims_padded[n] i₀ => vals @@ -291,7 +293,10 @@ end @synchronize # make sure we have finished writing to shared memory - add_from_local_to_global_memory!(Z, us[c], u_local, Ns, Val(M), block_index, block_dims, threadidxs, groupsize) + add_from_local_to_global_memory!( + Z, us[c], u_local, Ns, Val(M), block_index, + block_dims, threadidxs, groupsize, + ) if c < C @synchronize # wait before jumping to next component From 46acbda99b64023f69303c05d9fc72268190a108 Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Fri, 18 Oct 2024 22:07:20 +0200 Subject: [PATCH 11/33] Test SM kernels on CPU --- test/pseudo_gpu.jl | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/test/pseudo_gpu.jl b/test/pseudo_gpu.jl index e68162b..6244856 100644 --- a/test/pseudo_gpu.jl +++ b/test/pseudo_gpu.jl @@ -6,6 +6,9 @@ # JLArray (since it's supposed to mimic GPU arrays). Even with allowscalar(true), kernels # seem to fail for other reasons. 
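As a side note on the pseudo-GPU approach described above: JLArrays provides a CPU-backed array type that registers as an `AbstractGPUArray`, so the GPU code paths of this package can be exercised without a physical device. A minimal sketch, assuming the JLArrays package and its exported `JLArray` type:

    using JLArrays: JLArray
    using GPUArraysCore: AbstractGPUArray

    x = JLArray(rand(Float32, 8))   # CPU-backed, but dispatches like a GPU array
    @assert x isa AbstractGPUArray  # hence the AbstractGPUArray code paths apply
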
+# TODO: +# - use new JLBackend in the latest GPUArrays + using NonuniformFFTs using StaticArrays: SVector # for convenience using KernelAbstractions: KernelAbstractions as KA @@ -116,4 +119,8 @@ end @testset "No blocking" begin # spatial sorting disabled compare_with_cpu(ComplexF64, dims; block_size = nothing) end + @testset "gpu_method = :shared_memory" begin + @testset "Float32" compare_with_cpu(Float32, dims; gpu_method = :shared_memory) + @testset "ComplexF32" compare_with_cpu(ComplexF32, dims; gpu_method = :shared_memory) + end end From 3fd04b955688a1044769b51ea4c63ad1bd146450 Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Sat, 19 Oct 2024 07:27:46 +0200 Subject: [PATCH 12/33] Fix kernel compilation on CUDA --- src/interpolation/gpu.jl | 5 +++-- src/spreading/gpu.jl | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/interpolation/gpu.jl b/src/interpolation/gpu.jl index cdedd67..9ebbe42 100644 --- a/src/interpolation/gpu.jl +++ b/src/interpolation/gpu.jl @@ -54,7 +54,7 @@ end @kernel function interpolate_to_points_shmem_kernel!( vp::NTuple{C, AbstractVector{Z}}, - @Const(gs::NTuple{D, AbstractKernelData{<:Any, M}}), + @Const(gs::NTuple{D}), @Const(points::NTuple{D}), @Const(us::NTuple{C, AbstractArray{Z}}), @Const(pointperm), @@ -62,7 +62,7 @@ end @Const(prefactor::Real), # = volume of a grid cell = prod(Δxs) ::Val{block_dims}, ::Val{shmem_size}, # this is a bit redundant, but seems to be required for CPU backends (used in tests) - ) where {C, D, Z <: Number, M, block_dims, shmem_size} + ) where {C, D, Z <: Number, block_dims, shmem_size} @uniform begin groupsize = @groupsize()::Dims{D} @@ -79,6 +79,7 @@ end # Interpolate components one by one (to avoid using too much memory) for c ∈ 1:C # Copy grid data from global to shared memory + M = Kernels.half_support(gs[1]) gridvalues_to_local_memory!(u_local, us[c], Val(M), block_index, block_dims, threadidxs, groupsize) @synchronize # make sure all threads have the same shared data diff --git a/src/spreading/gpu.jl b/src/spreading/gpu.jl index 88c798e..be8d111 100644 --- a/src/spreading/gpu.jl +++ b/src/spreading/gpu.jl @@ -224,14 +224,14 @@ end @kernel function spread_from_points_shmem_kernel!( us::NTuple{C, AbstractArray{T}}, - @Const(gs::NTuple{D, AbstractKernelData{<:Any, M}}), + @Const(gs::NTuple{D}), @Const(points::NTuple{D}), @Const(vp::NTuple{C, AbstractVector{Z}}), @Const(pointperm), @Const(cumulative_npoints_per_block::AbstractVector), ::Val{block_dims}, ::Val{shmem_size}, # this is a bit redundant, but seems to be required for CPU backends (used in tests) - ) where {C, D, T <: AbstractFloat, Z <: Number, M, block_dims, shmem_size} + ) where {C, D, T, Z, block_dims, shmem_size} @uniform begin groupsize = @groupsize()::Dims{D} @@ -293,6 +293,7 @@ end @synchronize # make sure we have finished writing to shared memory + M = Kernels.half_support(gs[1]) add_from_local_to_global_memory!( Z, us[c], u_local, Ns, Val(M), block_index, block_dims, threadidxs, groupsize, From 6b0fad69a16731f49a47b475da78f88199fa3f82 Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Sun, 20 Oct 2024 21:28:25 +0200 Subject: [PATCH 13/33] Reorganise some code --- src/interpolation/gpu.jl | 329 ++++++++++++++++++++------------------- src/spreading/gpu.jl | 2 +- 2 files changed, 167 insertions(+), 164 deletions(-) diff --git a/src/interpolation/gpu.jl b/src/interpolation/gpu.jl index 9ebbe42..cbb48f8 100644 --- a/src/interpolation/gpu.jl +++ b/src/interpolation/gpu.jl @@ -52,169 +52,6 @@ end nothing end -@kernel function 
interpolate_to_points_shmem_kernel!( - vp::NTuple{C, AbstractVector{Z}}, - @Const(gs::NTuple{D}), - @Const(points::NTuple{D}), - @Const(us::NTuple{C, AbstractArray{Z}}), - @Const(pointperm), - @Const(cumulative_npoints_per_block::AbstractVector), - @Const(prefactor::Real), # = volume of a grid cell = prod(Δxs) - ::Val{block_dims}, - ::Val{shmem_size}, # this is a bit redundant, but seems to be required for CPU backends (used in tests) - ) where {C, D, Z <: Number, block_dims, shmem_size} - - @uniform begin - groupsize = @groupsize()::Dims{D} - nthreads = prod(groupsize) - end - - threadidxs = @index(Local, NTuple) # in (1:nthreads_x, 1:nthreads_y, ...) - threadidx = @index(Local, Linear) # in 1:nthreads - block_index = @index(Group, NTuple) # workgroup index (= block index) - block_n = @index(Group, Linear) # linear index of block - - u_local = @localmem(Z, shmem_size) # allocate shared memory - - # Interpolate components one by one (to avoid using too much memory) - for c ∈ 1:C - # Copy grid data from global to shared memory - M = Kernels.half_support(gs[1]) - gridvalues_to_local_memory!(u_local, us[c], Val(M), block_index, block_dims, threadidxs, groupsize) - - @synchronize # make sure all threads have the same shared data - - # This block will take care of non-uniform points (a + 1):b - @inbounds a = cumulative_npoints_per_block[block_n] - @inbounds b = cumulative_npoints_per_block[block_n + 1] - - for i in (a + threadidx):nthreads:b - # Interpolate at point j - j = if pointperm === nothing - i - else - @inbounds pointperm[i] - end - - # TODO: can we do this just once for all components? (if C > 1) - indvals = ntuple(Val(D)) do n - @inline - x = @inbounds points[n][j] - gdata = Kernels.evaluate_kernel(gs[n], x) - local vals = gdata.values # kernel values - local ishift = (block_index[n] - 1) * block_dims[n] + 1 - local i₀ = gdata.i - ishift - # @assert i₀ ≥ 0 - # @assert i₀ + 2M ≤ block_dims_padded[n] - i₀ => vals - end - - v = interpolate_from_arrays_shmem(u_local, indvals, prefactor) - @inbounds vp[c][j] = v - end - end - - nothing -end - -# Copy values from global to shared memory. -# - can we optimise memory access patterns? -@inline function gridvalues_to_local_memory!( - u_local::AbstractArray{T, D}, - u_global::AbstractArray{T, D}, - ::Val{M}, block_index::Dims{D}, block_dims::Dims{D}, threadidxs::Dims{D}, groupsize::Dims{D}, - ) where {T, D, M} - if @generated - quote - Ns = size(u_global) - inds = @ntuple($D, n -> axes(u_local, n)[threadidxs[n]:groupsize[n]:end]) - offsets = @ntuple( - $D, - n -> let - off = (block_index[n] - 1) * block_dims[n] - ($M - 1) - ifelse(off < 0, off + Ns[n], off) # make sure the offset is non-negative (to avoid some wrapping below) - end - ) - @nloops( - $D, i, - d -> inds[d], - d -> begin - j_d = offsets[d] + i_d - j_d = ifelse(j_d > Ns[d], j_d - Ns[d], j_d) - end, - begin - is = @ntuple($D, i) - js = @ntuple($D, j) - @inbounds u_local[is...] = u_global[js...] - end, - ) - nothing - end - else - Ns = size(u_global) - inds = ntuple(Val(D)) do n - axes(u_local, n)[threadidxs[n]:groupsize[n]:end] # this determines the parallelisation pattern - end - offsets = ntuple(Val(D)) do n - @inline - off = (block_index[n] - 1) * block_dims[n] - (M - 1) - ifelse(off < 0, off + Ns[n], off) # make sure the offset is non-negative (to avoid some wrapping below) - end - @inbounds for is ∈ Iterators.product(inds...) - js = ntuple(Val(D)) do n - @inline - j = offsets[n] + is[n] - ifelse(j > Ns[n], j - Ns[n], j) - end - u_local[is...] = u_global[js...] 
- end - nothing - end -end - -# Interpolate a single "component" (one transform at a time). -# Here vp is a vector instead of a tuple of vectors. -@inline function interpolate_from_arrays_shmem( - u_local::AbstractArray{T, D}, - indvals::NTuple{D}, - prefactor, - ) where {T, D} - if @generated - gprod_init = Symbol(:gprod_, D + 1) # the name of this variable is important! - quote - $gprod_init = prefactor - v = zero(T) - inds_start = map(first, indvals) - vals = map(last, indvals) # evaluated kernel values in each direction - inds = map(eachindex, vals) # = (1:L, 1:L, ...) where L = 2M is the kernel width - @nloops( - $D, i, - d -> inds[d], - d -> begin - @inbounds gprod_d = gprod_{d + 1} * vals[d][i_d] - @inbounds j_d = inds_start[d] + i_d - end, - begin - js = @ntuple($D, j) - @inbounds v += gprod_1 * u_local[js...] - end, - ) - v - end - else - v = zero(T) - inds_start = map(first, indvals) - vals = map(last, indvals) # evaluated kernel values in each direction - inds = map(eachindex, vals) # = (1:L, 1:L, ...) where L = 2M is the kernel width - @inbounds for I ∈ CartesianIndices(inds) - gprod = prefactor * prod(ntuple(d -> @inbounds(vals[d][I[d]]), Val(D))) - js = inds_start .+ Tuple(I) - v += gprod * u_local[js...] - end - v - end -end - function interpolate!( backend::GPU, bd::Union{BlockDataGPU, NullBlockData}, @@ -406,3 +243,169 @@ end end nothing end + +## ========================================================================================== ## +## Shared-memory implementation + +@kernel function interpolate_to_points_shmem_kernel!( + vp::NTuple{C, AbstractVector{Z}}, + @Const(gs::NTuple{D}), + @Const(points::NTuple{D}), + @Const(us::NTuple{C, AbstractArray{Z}}), + @Const(pointperm), + @Const(cumulative_npoints_per_block::AbstractVector), + @Const(prefactor::Real), # = volume of a grid cell = prod(Δxs) + ::Val{block_dims}, + ::Val{shmem_size}, # this is a bit redundant, but seems to be required for CPU backends (used in tests) + ) where {C, D, Z <: Number, block_dims, shmem_size} + + @uniform begin + groupsize = @groupsize()::Dims{D} + nthreads = prod(groupsize) + end + + threadidxs = @index(Local, NTuple) # in (1:nthreads_x, 1:nthreads_y, ...) + threadidx = @index(Local, Linear) # in 1:nthreads + block_index = @index(Group, NTuple) # workgroup index (= block index) + block_n = @index(Group, Linear) # linear index of block + + u_local = @localmem(Z, shmem_size) # allocate shared memory + + # Interpolate components one by one (to avoid using too much memory) + for c ∈ 1:C + # Copy grid data from global to shared memory + M = Kernels.half_support(gs[1]) + gridvalues_to_local_memory!(u_local, us[c], Val(M), block_index, block_dims, threadidxs, groupsize) + + @synchronize # make sure all threads have the same shared data + + # This block will take care of non-uniform points (a + 1):b + @inbounds a = cumulative_npoints_per_block[block_n] + @inbounds b = cumulative_npoints_per_block[block_n + 1] + + for i in (a + threadidx):nthreads:b + # Interpolate at point j + j = if pointperm === nothing + i + else + @inbounds pointperm[i] + end + + # TODO: can we do this just once for all components? 
(if C > 1) + indvals = ntuple(Val(D)) do n + @inline + x = @inbounds points[n][j] + gdata = Kernels.evaluate_kernel(gs[n], x) + local vals = gdata.values # kernel values + local ishift = (block_index[n] - 1) * block_dims[n] + 1 + local i₀ = gdata.i - ishift + # @assert i₀ ≥ 0 + # @assert i₀ + 2M ≤ block_dims_padded[n] + i₀ => vals + end + + v = interpolate_from_arrays_shmem(u_local, indvals, prefactor) + @inbounds vp[c][j] = v + end + end + + nothing +end + +# Copy values from global to shared memory. +# - can we optimise memory access patterns? +@inline function gridvalues_to_local_memory!( + u_local::AbstractArray{T, D}, + u_global::AbstractArray{T, D}, + ::Val{M}, block_index::Dims{D}, block_dims::Dims{D}, threadidxs::Dims{D}, groupsize::Dims{D}, + ) where {T, D, M} + if @generated + quote + Ns = size(u_global) + inds = @ntuple($D, n -> axes(u_local, n)[threadidxs[n]:groupsize[n]:end]) + offsets = @ntuple( + $D, + n -> let + off = (block_index[n] - 1) * block_dims[n] - ($M - 1) + ifelse(off < 0, off + Ns[n], off) # make sure the offset is non-negative (to avoid some wrapping below) + end + ) + @nloops( + $D, i, + d -> inds[d], + d -> begin + j_d = offsets[d] + i_d + j_d = ifelse(j_d > Ns[d], j_d - Ns[d], j_d) + end, + begin + is = @ntuple($D, i) + js = @ntuple($D, j) + @inbounds u_local[is...] = u_global[js...] + end, + ) + nothing + end + else + Ns = size(u_global) + inds = ntuple(Val(D)) do n + axes(u_local, n)[threadidxs[n]:groupsize[n]:end] # this determines the parallelisation pattern + end + offsets = ntuple(Val(D)) do n + @inline + off = (block_index[n] - 1) * block_dims[n] - (M - 1) + ifelse(off < 0, off + Ns[n], off) # make sure the offset is non-negative (to avoid some wrapping below) + end + @inbounds for is ∈ Iterators.product(inds...) + js = ntuple(Val(D)) do n + @inline + j = offsets[n] + is[n] + ifelse(j > Ns[n], j - Ns[n], j) + end + u_local[is...] = u_global[js...] + end + nothing + end +end + +# Interpolate a single "component" (one transform at a time). +# Here vp is a vector instead of a tuple of vectors. +@inline function interpolate_from_arrays_shmem( + u_local::AbstractArray{T, D}, + indvals::NTuple{D}, + prefactor, + ) where {T, D} + if @generated + gprod_init = Symbol(:gprod_, D + 1) # the name of this variable is important! + quote + $gprod_init = prefactor + v = zero(T) + inds_start = map(first, indvals) + vals = map(last, indvals) # evaluated kernel values in each direction + inds = map(eachindex, vals) # = (1:L, 1:L, ...) where L = 2M is the kernel width + @nloops( + $D, i, + d -> inds[d], + d -> begin + @inbounds gprod_d = gprod_{d + 1} * vals[d][i_d] + @inbounds j_d = inds_start[d] + i_d + end, + begin + js = @ntuple($D, j) + @inbounds v += gprod_1 * u_local[js...] + end, + ) + v + end + else + v = zero(T) + inds_start = map(first, indvals) + vals = map(last, indvals) # evaluated kernel values in each direction + inds = map(eachindex, vals) # = (1:L, 1:L, ...) where L = 2M is the kernel width + @inbounds for I ∈ CartesianIndices(inds) + gprod = prefactor * prod(ntuple(d -> @inbounds(vals[d][I[d]]), Val(D))) + js = inds_start .+ Tuple(I) + v += gprod * u_local[js...] 
+ end + v + end +end diff --git a/src/spreading/gpu.jl b/src/spreading/gpu.jl index be8d111..910c47f 100644 --- a/src/spreading/gpu.jl +++ b/src/spreading/gpu.jl @@ -219,7 +219,7 @@ end nothing end -## ==================================================================================================== ## +## ========================================================================================== ## ## Shared-memory implementation @kernel function spread_from_points_shmem_kernel!( From 867b73e3c7ae30ce61e88fd135a1092ea67e0af5 Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Sun, 20 Oct 2024 23:09:12 +0200 Subject: [PATCH 14/33] [WIP] avoid atomics in shared-memory arrays --- src/Kernels/kaiser_bessel_backwards.jl | 35 +++++++ src/spreading/gpu.jl | 139 ++++++++++++++++++++++--- 2 files changed, 157 insertions(+), 17 deletions(-) diff --git a/src/Kernels/kaiser_bessel_backwards.jl b/src/Kernels/kaiser_bessel_backwards.jl index 45129cd..e894e1a 100644 --- a/src/Kernels/kaiser_bessel_backwards.jl +++ b/src/Kernels/kaiser_bessel_backwards.jl @@ -143,3 +143,38 @@ function evaluate_kernel_func(g::BackwardsKaiserBesselKernelData{M, T}) where {M (; i, values,) end end + +# TODO: remove? +function evaluate_kernel_direct( + g::BackwardsKaiserBesselKernelData{M, T}, i::Integer, x::T, + ) where {M, T} + (; Δx, β,) = g + # i = point_to_cell(x, Δx) + X = x / Δx - T(i - 1) # source position relative to xs[i] + # @assert 0 ≤ X < 1 + Xc = 2 * X - 1 # in [-1, 1) + L = 2M + ntuple(Val(L)) do j + h = 1 - 2 * (j - one(T)/2) / L # midpoint of interval + δ = 1 / L # half-width of interval + y = h + Xc * δ + @fastmath s = sqrt(1 - y^2) + @fastmath sinh(β * s)::T / (s * T(π)) + end +end + +function evaluate_kernel_direct( + g::BackwardsKaiserBesselKernelData{M, T}, i::Integer, n::Integer, x::T, + ) where {M, T} + (; Δx, β,) = g + # @assert 1 ≤ n ≤ 2M + X = x / Δx - T(i - 1) # source position relative to xs[i] + # @assert 0 ≤ X < 1 + Xc = 2 * X - 1 # in [-1, 1) + L = 2M + h = 1 - 2 * (n - one(T)/2) / L # midpoint of interval + δ = 1 / L # half-width of interval + y = h + Xc * δ + @fastmath s = sqrt(1 - y^2) + @fastmath sinh(β * s)::T / (s * T(π)) +end diff --git a/src/spreading/gpu.jl b/src/spreading/gpu.jl index 910c47f..16813db 100644 --- a/src/spreading/gpu.jl +++ b/src/spreading/gpu.jl @@ -122,6 +122,22 @@ end nothing end +# Same as _atomic_add!, but without the atomics. +@inline function _add_maybecomplex!(u::AbstractArray{T}, v::T, inds::Tuple) where {T <: Real} + @inbounds u[inds...] += v + nothing +end + +@inline function _add_maybecomplex!(u::AbstractArray{T}, v::Complex{T}, inds::Tuple) where {T <: Real} + @inbounds begin + i₁ = 2 * (inds[1] - 1) # convert from logical index (equivalent complex array) to memory index (real array) + itail = Base.tail(inds) + u[i₁ + 1, itail...] += real(v) + u[i₁ + 2, itail...] += imag(v) + end + nothing +end + # GPU implementation. # We assume all arrays are already on the GPU. function spread_from_points!( @@ -197,7 +213,7 @@ function spread_from_points!( kernel! 
= spread_from_points_shmem_kernel!(backend, groupsize, ndrange) kernel!( us_real, gs, xs_comp, vp_sorted, pointperm_, - bd.cumulative_npoints_per_block, block_dims, + bd.cumulative_npoints_per_block, HalfSupport(M), block_dims, Val(shmem_size), ) end @@ -229,9 +245,10 @@ end @Const(vp::NTuple{C, AbstractVector{Z}}), @Const(pointperm), @Const(cumulative_npoints_per_block::AbstractVector), + ::HalfSupport{M}, ::Val{block_dims}, ::Val{shmem_size}, # this is a bit redundant, but seems to be required for CPU backends (used in tests) - ) where {C, D, T, Z, block_dims, shmem_size} + ) where {C, D, T, Z, M, block_dims, shmem_size} @uniform begin groupsize = @groupsize()::Dims{D} @@ -251,7 +268,13 @@ end threadidxs = @index(Local, NTuple) # in (1:nthreads_x, 1:nthreads_y, ...) threadidx = @index(Local, Linear) # in 1:nthreads - u_local = @localmem(T, shmem_size) # allocate shared memory + # Allocate static shared memory + u_local = @localmem(T, shmem_size) + window_vals = ntuple(Val(D)) do _ + @assert T <: Real + @localmem(T, 2M) + end + inds_start = @localmem(Int, D) # Interpolate components one by one (to avoid using too much memory) for c ∈ 1:C @@ -266,7 +289,39 @@ end @inbounds a = cumulative_npoints_per_block[block_n] @inbounds b = cumulative_npoints_per_block[block_n + 1] - for i in (a + threadidx):nthreads:b + # for i in (a + threadidx):nthreads:b + # # Spread from point j + # j = if pointperm === nothing + # i + # else + # @inbounds pointperm[i] + # end + # + # # TODO: can we do this just once for all components? (if C > 1) + # indvals = ntuple(Val(D)) do n + # @inline + # local x = @inbounds points[n][j] + # local g = gs[n] + # local Δx = Kernels.gridstep(g) + # icell = @inline Kernels.point_to_cell(x, Δx) + # vals = @inline Kernels.evaluate_kernel_direct(g, icell, x) + # # gdata = Kernels.evaluate_kernel(g, x) + # # local vals = gdata.values # kernel values + # local ishift = (block_index[n] - 1) * block_dims[n] + 1 + # local i₀ = icell - ishift + # # local i₀ = gdata.i - ishift + # # @assert i₀ ≥ 0 + # # @assert i₀ + 2M ≤ block_dims_padded[n] + # i₀ => vals + # end + # + # @inbounds v = vp[c][j] + # spread_onto_array_shmem!(u_local, indvals, v) + # end + + # All threads iterate over the same non-uniform points. + # This is to avoid atomic operations on shared memory, which seems to be quite slow. + for i in (a + 1):b # Spread from point j j = if pointperm === nothing i @@ -274,26 +329,39 @@ end @inbounds pointperm[i] end - # TODO: can we do this just once for all components? (if C > 1) - indvals = ntuple(Val(D)) do n - @inline - x = @inbounds points[n][j] - gdata = Kernels.evaluate_kernel(gs[n], x) - local vals = gdata.values # kernel values - local ishift = (block_index[n] - 1) * block_dims[n] + 1 - local i₀ = gdata.i - ishift - # @assert i₀ ≥ 0 - # @assert i₀ + 2M ≤ block_dims_padded[n] - i₀ => vals + # Parallel evaluation of window functions over all dimensions + let + inds = CartesianIndices((1:D, 1:2M)) + subinds = (1:length(inds))[threadidx:nthreads:end] + @inbounds for n ∈ subinds + d, m = Tuple(inds[n]) # dimension, support point + x = points[d][j] + local g = gs[d] + local Δx = Kernels.gridstep(g) + local icell = @inline Kernels.point_to_cell(x, Δx) + local ishift = (block_index[d] - 1) * block_dims[d] + 1 + local i₀ = icell - ishift + inds_start[d] = i₀ # multiple threads may write this, but that's ok + window_vals[d][m] = @inline Kernels.evaluate_kernel_direct(g, icell, m, x) + end + end + + # TODO: + # - can we do this just once for all components? 
(if C > 1) + indvals = ntuple(Val(D)) do d + inds_start[d] => window_vals[d] end @inbounds v = vp[c][j] - spread_onto_array_shmem!(u_local, indvals, v) + + @synchronize + + # Spread in parallel + spread_onto_array_shmem_threads!(u_local, indvals, v, threadidxs, groupsize) end @synchronize # make sure we have finished writing to shared memory - M = Kernels.half_support(gs[1]) add_from_local_to_global_memory!( Z, us[c], u_local, Ns, Val(M), block_index, block_dims, threadidxs, groupsize, @@ -341,6 +409,43 @@ end end end +# Spread a single "component" (one transform at a time). +# This is parallelised across threads in a workgroup. +@inline function spread_onto_array_shmem_threads!( + u_local::AbstractArray{T, D}, + indvals::NTuple{D}, + v::Z, + threadidxs::NTuple{D}, groupsize::NTuple{D}, + ) where {T, D, Z} + if @generated + gprod_init = Symbol(:gprod_, D + 1) # the name of this variable is important! + Tr = real(T) + quote + inds_start = map(first, indvals) # start of active region in output array + vals = map(last, indvals) # evaluated kernel values in each direction + # inds = map(eachindex, vals) # = (1:L, 1:L, ...) where L = 2M is the kernel width + inds = @ntuple($D, n -> eachindex(vals[n])[threadidxs[n]:groupsize[n]:end]) + $gprod_init = one($Tr) # product of kernel values (initially 1) + @nloops( + $D, i, + d -> inds[d], # for i_d ∈ 1:L + d -> begin + @inbounds gprod_d = gprod_{d + 1} * vals[d][i_d] + @inbounds j_d = inds_start[d] + i_d + end, + begin + js = @ntuple($D, j) + w = v * gprod_1 + _add_maybecomplex!(u_local, w, js) + end, + ) + v + end + else + aaa # TODO: implement + end +end + # Add values from shared to global memory. @inline function add_from_local_to_global_memory!( ::Type{Z}, # "logical" type of data (e.g. ComplexF64) From 801eb5eb9f4f6b961ab0b4cdeb66dd475441ec84 Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Mon, 21 Oct 2024 10:25:58 +0200 Subject: [PATCH 15/33] Avoid atomic operations on shared memory --- src/spreading/gpu.jl | 119 ++++++++++++++++++++++++++----------------- 1 file changed, 72 insertions(+), 47 deletions(-) diff --git a/src/spreading/gpu.jl b/src/spreading/gpu.jl index 16813db..a5e89d5 100644 --- a/src/spreading/gpu.jl +++ b/src/spreading/gpu.jl @@ -321,7 +321,7 @@ end # All threads iterate over the same non-uniform points. # This is to avoid atomic operations on shared memory, which seems to be quite slow. - for i in (a + 1):b + @inbounds for i in (a + 1):b # Spread from point j j = if pointperm === nothing i @@ -336,36 +336,35 @@ end @inbounds for n ∈ subinds d, m = Tuple(inds[n]) # dimension, support point x = points[d][j] - local g = gs[d] - local Δx = Kernels.gridstep(g) - local icell = @inline Kernels.point_to_cell(x, Δx) - local ishift = (block_index[d] - 1) * block_dims[d] + 1 - local i₀ = icell - ishift + g = gs[d] + Δx = Kernels.gridstep(g) + icell = @inline Kernels.point_to_cell(x, Δx) + ishift = (block_index[d] - 1) * block_dims[d] + 1 + i₀ = icell - ishift inds_start[d] = i₀ # multiple threads may write this, but that's ok window_vals[d][m] = @inline Kernels.evaluate_kernel_direct(g, icell, m, x) end end - # TODO: - # - can we do this just once for all components? 
(if C > 1) - indvals = ntuple(Val(D)) do d - inds_start[d] => window_vals[d] - end - @inbounds v = vp[c][j] @synchronize + inds_start_t = ntuple(d -> @inbounds(inds_start[d]), Val(D)) # as a tuple + # Spread in parallel - spread_onto_array_shmem_threads!(u_local, indvals, v, threadidxs, groupsize) + spread_onto_array_shmem_threads!(u_local, inds_start_t, window_vals, v; threadidxs, groupsize, threadidx, nthreads) end @synchronize # make sure we have finished writing to shared memory - add_from_local_to_global_memory!( - Z, us[c], u_local, Ns, Val(M), block_index, - block_dims, threadidxs, groupsize, - ) + if a < b # skip this step if there were actually no points in this block (if a == b) + add_from_local_to_global_memory!( + Z, us[c], u_local, Ns, Val(M); + block_index, block_dims, threadidxs, groupsize, + threadidx, nthreads, + ) + end if c < C @synchronize # wait before jumping to next component @@ -413,37 +412,25 @@ end # This is parallelised across threads in a workgroup. @inline function spread_onto_array_shmem_threads!( u_local::AbstractArray{T, D}, - indvals::NTuple{D}, - v::Z, + inds_start::NTuple{D, Integer}, + vals::NTuple{D, AbstractVector}, # these are static-size shared-memory arrays + v::Z; threadidxs::NTuple{D}, groupsize::NTuple{D}, + threadidx, nthreads, ) where {T, D, Z} - if @generated - gprod_init = Symbol(:gprod_, D + 1) # the name of this variable is important! - Tr = real(T) - quote - inds_start = map(first, indvals) # start of active region in output array - vals = map(last, indvals) # evaluated kernel values in each direction - # inds = map(eachindex, vals) # = (1:L, 1:L, ...) where L = 2M is the kernel width - inds = @ntuple($D, n -> eachindex(vals[n])[threadidxs[n]:groupsize[n]:end]) - $gprod_init = one($Tr) # product of kernel values (initially 1) - @nloops( - $D, i, - d -> inds[d], # for i_d ∈ 1:L - d -> begin - @inbounds gprod_d = gprod_{d + 1} * vals[d][i_d] - @inbounds j_d = inds_start[d] + i_d - end, - begin - js = @ntuple($D, j) - w = v * gprod_1 - _add_maybecomplex!(u_local, w, js) - end, - ) - v + inds = CartesianIndices(map(eachindex, vals)) + Tr = real(T) + @inbounds for n ∈ threadidx:nthreads:length(inds) + I = inds[n] + js = Tuple(I) .+ inds_start + gprod = one(Tr) + for d ∈ 1:D + gprod *= vals[d][I[d]] end - else - aaa # TODO: implement + w = v * gprod + _add_maybecomplex!(u_local, w, js) end + nothing end # Add values from shared to global memory. @@ -452,13 +439,14 @@ end u_global::AbstractArray{T, D}, # actual data is always real u_local::AbstractArray{T, D}, Ns::Dims{D}, - ::Val{M}, block_index::Dims{D}, block_dims::Dims{D}, threadidxs::Dims{D}, groupsize::Dims{D}, + ::Val{M}; + block_index::Dims{D}, block_dims::Dims{D}, threadidxs::Dims{D}, groupsize::Dims{D}, + threadidx, nthreads, ) where {Z, T, D, M} if @generated @assert T <: Real # for atomic operations skip = sizeof(Z) ÷ sizeof(T) # 1 (Z real) or 2 (Z complex) quote - # @assert skip ∈ (1, 2) skips = @ntuple($D, n -> n == 1 ? $skip : 1) inds = @ntuple($D, n -> axes(u_local, n)[1:(end ÷ skips[n])][threadidxs[n]:groupsize[n]:end]) offsets = @ntuple( @@ -496,7 +484,44 @@ end nothing end else - aaa # TODO: implement + # Unlike the @generated case, here we parallelise over linear indices instead of + # separate Cartesian directions. Performance is very similar to the @generated case. 
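The index bookkeeping in this fallback (and in the `@generated` branch above) relies on the interleaved layout produced by `reinterpret`; a tiny standalone check of that convention, with placeholder names:

    v = ComplexF32[1 + 2im, 3 + 4im]
    u = reinterpret(Float32, v)       # [1.0, 2.0, 3.0, 4.0]: twice as long
    j = 2                             # logical (complex) index
    @assert (u[2j - 1], u[2j]) == (real(v[j]), imag(v[j]))
    skip = sizeof(ComplexF32) ÷ sizeof(Float32)  # = 2, used to halve the first axis
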
+ @assert T <: Real # for atomic operations + skip = sizeof(Z) ÷ sizeof(T) # 1 (Z real) or 2 (Z complex) + inds = CartesianIndices( + ntuple(Val(D)) do d + # The logical indices are 1:end÷skip in the first dimension. + d == 1 ? axes(u_local, 1)[1:end÷skip] : axes(u_local, d) + end + ) + offsets = ntuple(Val(D)) do n + @inline + local off = (block_index[n] - 1) * block_dims[n] - (M - 1) + ifelse(off < 0, off + Ns[n], off) # make sure the offset is non-negative (to avoid some wrapping below) + end + @inbounds for n ∈ threadidx:nthreads:length(inds) + I = inds[n] + is = Tuple(I) + js = ntuple(Val(D)) do d + @inline + j = is[d] + offsets[d] + ifelse(j > Ns[d], j - Ns[d], j) + end + @inbounds if Z <: Real + w = u_local[is...] + Atomix.@atomic u_global[js...] += w + elseif Z <: Complex + is_tail = Base.tail(is) + js_tail = Base.tail(js) + ifirst = 2 * is[1] - 1 + jfirst = 2 * js[1] - 1 + w = u_local[ifirst, is_tail...] # real part + Atomix.@atomic u_global[jfirst, js_tail...] += w + w = u_local[ifirst + 1, is_tail...] # imaginary part + Atomix.@atomic u_global[jfirst + 1, js_tail...] += w + end + end + nothing end end From 24f595629124b8670a585d7544f93e10b7ab6364 Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Mon, 21 Oct 2024 10:29:21 +0200 Subject: [PATCH 16/33] Minor improvement --- src/spreading/gpu.jl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/spreading/gpu.jl b/src/spreading/gpu.jl index a5e89d5..1a884f7 100644 --- a/src/spreading/gpu.jl +++ b/src/spreading/gpu.jl @@ -332,8 +332,7 @@ end # Parallel evaluation of window functions over all dimensions let inds = CartesianIndices((1:D, 1:2M)) - subinds = (1:length(inds))[threadidx:nthreads:end] - @inbounds for n ∈ subinds + @inbounds for n ∈ threadidx:nthreads:length(inds) d, m = Tuple(inds[n]) # dimension, support point x = points[d][j] g = gs[d] From 67e4baab54c314b0f2f2a4d48ea4bc9a0cc1c47c Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Mon, 21 Oct 2024 10:44:42 +0200 Subject: [PATCH 17/33] Make window_vals a matrix --- src/blocking.jl | 3 ++- src/spreading/gpu.jl | 14 ++++++-------- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/src/blocking.jl b/src/blocking.jl index 2b6a50b..3716218 100644 --- a/src/blocking.jl +++ b/src/blocking.jl @@ -67,7 +67,8 @@ type_length(::Type{<:NTuple{N}}) where {N} = N # is not larger than the available shared memory. In CUDA the limit is usually 48 KiB. # Note that the result is a compile-time constant (on Julia 1.11.1 at least). @inline function block_dims_gpu_shmem(::Type{Z}, ::Dims{D}, ::HalfSupport{M}) where {Z <: Number, D, M} - max_shmem_size = 48 << 10 # 48 KiB -- TODO: make this depend on the actual GPU? + free_shmem = 1 << 10 # how much memory to leave free for other allocations + max_shmem_size = (48 << 10) - free_shmem # 48 KiB -- TODO: make this depend on the actual GPU? 
max_block_length = max_shmem_size ÷ sizeof(Z) # maximum number of elements in a block m = floor(Int, max_block_length^(1/D)) # block size in each direction (including ghost cells / padding) n = m - (2M - 1) # exclude ghost cells diff --git a/src/spreading/gpu.jl b/src/spreading/gpu.jl index 1a884f7..37ad3e0 100644 --- a/src/spreading/gpu.jl +++ b/src/spreading/gpu.jl @@ -270,10 +270,8 @@ end # Allocate static shared memory u_local = @localmem(T, shmem_size) - window_vals = ntuple(Val(D)) do _ - @assert T <: Real - @localmem(T, 2M) - end + @assert T <: Real + window_vals = @localmem(T, (2M, D)) inds_start = @localmem(Int, D) # Interpolate components one by one (to avoid using too much memory) @@ -341,7 +339,7 @@ end ishift = (block_index[d] - 1) * block_dims[d] + 1 i₀ = icell - ishift inds_start[d] = i₀ # multiple threads may write this, but that's ok - window_vals[d][m] = @inline Kernels.evaluate_kernel_direct(g, icell, m, x) + window_vals[m, d] = @inline Kernels.evaluate_kernel_direct(g, icell, m, x) end end @@ -412,19 +410,19 @@ end @inline function spread_onto_array_shmem_threads!( u_local::AbstractArray{T, D}, inds_start::NTuple{D, Integer}, - vals::NTuple{D, AbstractVector}, # these are static-size shared-memory arrays + window_vals::AbstractArray{T, 2}, # static-size shared-memory array (2M, D) v::Z; threadidxs::NTuple{D}, groupsize::NTuple{D}, threadidx, nthreads, ) where {T, D, Z} - inds = CartesianIndices(map(eachindex, vals)) + inds = CartesianIndices(ntuple(_ -> axes(window_vals, 1), Val(D))) Tr = real(T) @inbounds for n ∈ threadidx:nthreads:length(inds) I = inds[n] js = Tuple(I) .+ inds_start gprod = one(Tr) for d ∈ 1:D - gprod *= vals[d][I[d]] + gprod *= window_vals[I[d], d] end w = v * gprod _add_maybecomplex!(u_local, w, js) From 765b73eb4f8fe93b88d7712272d0b9c6da45f1ad Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Mon, 21 Oct 2024 11:43:50 +0200 Subject: [PATCH 18/33] Implement hybrid parallelisation in SM spreading Much faster! --- src/blocking.jl | 30 ++++++-- src/interpolation/gpu.jl | 4 +- src/plan.jl | 3 +- src/spreading/gpu.jl | 143 ++++++++++++--------------------------- 4 files changed, 73 insertions(+), 107 deletions(-) diff --git a/src/blocking.jl b/src/blocking.jl index 3716218..9cbe6ec 100644 --- a/src/blocking.jl +++ b/src/blocking.jl @@ -66,8 +66,18 @@ type_length(::Type{<:NTuple{N}}) where {N} = N # We try to make sure that the total block size (including 2M ghost cells in each direction) # is not larger than the available shared memory. In CUDA the limit is usually 48 KiB. # Note that the result is a compile-time constant (on Julia 1.11.1 at least). -@inline function block_dims_gpu_shmem(::Type{Z}, ::Dims{D}, ::HalfSupport{M}) where {Z <: Number, D, M} - free_shmem = 1 << 10 # how much memory to leave free for other allocations +@inline function block_dims_gpu_shmem(::Type{Z}, ::Dims{D}, ::HalfSupport{M}, ::Val{Np}) where {Z <: Number, D, M, Np} + T = real(Z) + # These are extra shared-memory needs in spreading kernel (see spread_from_points_shmem_kernel!). + # Here Np is the batch size (batch_size parameter). + shmem_needs = sizeof(T) * ( + 2M * D * Np + # window_vals + D * Np + # points_sm + D * Np # inds_start + ) + ( + sizeof(Z) * Np # vp_sm + ) + free_shmem = shmem_needs # how much memory to leave free for other allocations max_shmem_size = (48 << 10) - free_shmem # 48 KiB -- TODO: make this depend on the actual GPU? 
max_block_length = max_shmem_size ÷ sizeof(Z) # maximum number of elements in a block m = floor(Int, max_block_length^(1/D)) # block size in each direction (including ghost cells / padding) @@ -92,6 +102,7 @@ struct BlockDataGPU{ N, I <: Integer, T <: AbstractFloat, IndexVector <: AbstractVector{I}, SortPoints <: StaticBool, + Np, } <: AbstractBlockData method :: Symbol # method used for spreading and interpolation; options: (1) :global_memory (2) :shared_memory Δxs :: NTuple{N, T} # grid step; the size of a block in units of length is block_dims .* Δxs. @@ -101,18 +112,22 @@ struct BlockDataGPU{ blockidx :: IndexVector # linear index of block associated to each point (length = Np) pointperm :: IndexVector sort_points :: SortPoints + batch_size :: Val{Np} # how many non-uniform points per batch in SM spreading? function BlockDataGPU( method::Symbol, Δxs::NTuple{N, T}, nblocks_per_dir::NTuple{N, I}, block_dims::NTuple{N, I}, - npoints_per_block::V, sort::S, - ) where {N, I, T, V, S} + npoints_per_block::V, sort::S; + batch_size::Val{Np}, + ) where {N, I, T, V, S, Np} + Np::Integer method ∈ (:global_memory, :shared_memory) || throw(ArgumentError("expected gpu_method ∈ (:global_memory, :shared_memory)")) blockidx = similar(npoints_per_block, 0) pointperm = similar(npoints_per_block, 0) - new{N, I, T, V, S}( + new{N, I, T, V, S, Np}( method, Δxs, nblocks_per_dir, block_dims, npoints_per_block, blockidx, pointperm, sort, + batch_size, ) end end @@ -125,17 +140,18 @@ function BlockDataGPU( backend::KA.Backend, block_dims::Dims{D}, Ñs::Dims{D}, h::HalfSupport{M}, sort_points::StaticBool; method::Symbol, + batch_size::Val = Val(16), # batch size (in number of non-uniform points) used in spreading if gpu_method == :shared_memory ) where {Z <: Number, D, M} T = real(Z) # in case Z is complex if method === :shared_memory # Override input block size. We try to maximise the use of shared memory. - block_dims = block_dims_gpu_shmem(Z, Ñs, h) + block_dims = block_dims_gpu_shmem(Z, Ñs, h, batch_size) end nblocks_per_dir = map(cld, Ñs, block_dims) # basically equal to ceil(Ñ / block_dim) L = T(2) * π # domain period Δxs = map(N -> L / N, Ñs) # grid step (oversampled grid) cumulative_npoints_per_block = KA.allocate(backend, Int, prod(nblocks_per_dir) + 1) - BlockDataGPU(method, Δxs, nblocks_per_dir, block_dims, cumulative_npoints_per_block, sort_points) + BlockDataGPU(method, Δxs, nblocks_per_dir, block_dims, cumulative_npoints_per_block, sort_points; batch_size) end get_pointperm(bd::BlockDataGPU) = bd.pointperm diff --git a/src/interpolation/gpu.jl b/src/interpolation/gpu.jl index cbb48f8..b55ed25 100644 --- a/src/interpolation/gpu.jl +++ b/src/interpolation/gpu.jl @@ -100,7 +100,7 @@ function interpolate!( Z = eltype(us[1]) M = Kernels.half_support(gs[1]) @assert all(g -> Kernels.half_support(g) === M, gs) # check that they're all equal - block_dims_val = block_dims_gpu_shmem(Z, size(us[1]), HalfSupport(M)) # this is usually a compile-time constant... + block_dims_val = block_dims_gpu_shmem(Z, size(us[1]), HalfSupport(M), bd.batch_size) # this is usually a compile-time constant... 
block_dims = Val(block_dims_val) # ...which means this doesn't require a dynamic dispatch @assert block_dims_val === bd.block_dims let ngroups = bd.nblocks_per_dir # this is the required number of workgroups (number of blocks in CUDA) @@ -108,6 +108,8 @@ function interpolate!( shmem_size = block_dims_padded groupsize = groupsize_shmem(ngroups, block_dims_padded, length(x⃗s)) ndrange = groupsize .* ngroups + # TODO: + # - try out batches in interpolation? kernel! = interpolate_to_points_shmem_kernel!(backend, groupsize, ndrange) kernel!( vp_sorted, gs, xs_comp, us, pointperm_, bd.cumulative_npoints_per_block, diff --git a/src/plan.jl b/src/plan.jl index 61fce96..09b3b3a 100644 --- a/src/plan.jl +++ b/src/plan.jl @@ -309,6 +309,7 @@ function _PlanNUFFT( block_size::Union{Integer, Dims{D}, Nothing} = default_block_size(Ns, backend), synchronise::Bool = false, gpu_method::Symbol = :global_memory, + batch_size::Val = Val(16), # currently only used in shared-memory GPU spreading ) where {T <: Number, D} ks = init_wavenumbers(T, Ns) # Determine dimensions of oversampled grid. @@ -343,7 +344,7 @@ function _PlanNUFFT( else block_dims = get_block_dims(Ñs, block_size) if backend isa GPU - blocks = BlockDataGPU(T, backend, block_dims, Ñs, h, sort_points; method = gpu_method) + blocks = BlockDataGPU(T, backend, block_dims, Ñs, h, sort_points; method = gpu_method, batch_size,) else blocks = BlockData(T, block_dims, Ñs, h, num_transforms, sort_points) FFTW.set_num_threads(Threads.nthreads()) diff --git a/src/spreading/gpu.jl b/src/spreading/gpu.jl index 37ad3e0..6c76314 100644 --- a/src/spreading/gpu.jl +++ b/src/spreading/gpu.jl @@ -202,7 +202,7 @@ function spread_from_points!( Z = eltype(us[1]) M = Kernels.half_support(gs[1]) @assert all(g -> Kernels.half_support(g) === M, gs) # check that they're all equal - block_dims_val = block_dims_gpu_shmem(Z, size(us[1]), HalfSupport(M)) # this is usually a compile-time constant... + block_dims_val = block_dims_gpu_shmem(Z, size(us[1]), HalfSupport(M), bd.batch_size) # this is usually a compile-time constant... block_dims = Val(block_dims_val) # ...which means this doesn't require a dynamic dispatch @assert block_dims_val === bd.block_dims let ngroups = bd.nblocks_per_dir # this is the required number of workgroups (number of blocks in CUDA) @@ -213,8 +213,8 @@ function spread_from_points!( kernel! = spread_from_points_shmem_kernel!(backend, groupsize, ndrange) kernel!( us_real, gs, xs_comp, vp_sorted, pointperm_, - bd.cumulative_npoints_per_block, HalfSupport(M), block_dims, - Val(shmem_size), + bd.cumulative_npoints_per_block, HalfSupport(M), + block_dims, Val(shmem_size), bd.batch_size, ) end end @@ -238,6 +238,10 @@ end ## ========================================================================================== ## ## Shared-memory implementation +# Process non-uniform points in batches of Np points (or less). +# The idea is to completely avoid slow atomic writes to shared memory arrays (u_local), +# while parallelising some operations across up to Np points. +# TODO: Should Np be equal to the number of threads? or how to choose it optimally so that the block size stays not too small? 
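To get a concrete feel for how the batch size Np enters the shared-memory budget computed by `block_dims_gpu_shmem` above, here is a worked example with assumed parameters (Z = ComplexF32, hence T = Float32, in three dimensions with half-support M = 4 and the default Np = 16):

    Z, T = ComplexF32, Float32
    D, M, Np = 3, 4, 16
    shmem_needs = sizeof(T) * (2M * D * Np + D * Np + D * Np) + sizeof(Z) * Np  # = 2048 bytes for the batch buffers
    max_shmem_size = (48 << 10) - shmem_needs       # = 47104 bytes left for u_local
    max_block_length = max_shmem_size ÷ sizeof(Z)   # = 5888 elements
    m = floor(Int, max_block_length^(1 / D))        # = 18 (padded block size per direction)
    n = m - (2M - 1)                                # = 11, i.e. block_dims = (11, 11, 11)
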
@kernel function spread_from_points_shmem_kernel!( us::NTuple{C, AbstractArray{T}}, @Const(gs::NTuple{D}), @@ -248,7 +252,8 @@ end ::HalfSupport{M}, ::Val{block_dims}, ::Val{shmem_size}, # this is a bit redundant, but seems to be required for CPU backends (used in tests) - ) where {C, D, T, Z, M, block_dims, shmem_size} + ::Val{Np}, # batch_size + ) where {C, D, T, Z, M, Np, block_dims, shmem_size} @uniform begin groupsize = @groupsize()::Dims{D} @@ -271,8 +276,10 @@ end # Allocate static shared memory u_local = @localmem(T, shmem_size) @assert T <: Real - window_vals = @localmem(T, (2M, D)) - inds_start = @localmem(Int, D) + window_vals = @localmem(T, (2M, D, Np)) + points_sm = @localmem(T, (D, Np)) # points copied to shared memory + inds_start = @localmem(Int, (D, Np)) + vp_sm = @localmem(Z, Np) # input values copied to shared memory # Interpolate components one by one (to avoid using too much memory) for c ∈ 1:C @@ -287,70 +294,45 @@ end @inbounds a = cumulative_npoints_per_block[block_n] @inbounds b = cumulative_npoints_per_block[block_n + 1] - # for i in (a + threadidx):nthreads:b - # # Spread from point j - # j = if pointperm === nothing - # i - # else - # @inbounds pointperm[i] - # end - # - # # TODO: can we do this just once for all components? (if C > 1) - # indvals = ntuple(Val(D)) do n - # @inline - # local x = @inbounds points[n][j] - # local g = gs[n] - # local Δx = Kernels.gridstep(g) - # icell = @inline Kernels.point_to_cell(x, Δx) - # vals = @inline Kernels.evaluate_kernel_direct(g, icell, x) - # # gdata = Kernels.evaluate_kernel(g, x) - # # local vals = gdata.values # kernel values - # local ishift = (block_index[n] - 1) * block_dims[n] + 1 - # local i₀ = icell - ishift - # # local i₀ = gdata.i - ishift - # # @assert i₀ ≥ 0 - # # @assert i₀ + 2M ≤ block_dims_padded[n] - # i₀ => vals - # end - # - # @inbounds v = vp[c][j] - # spread_onto_array_shmem!(u_local, indvals, v) - # end - - # All threads iterate over the same non-uniform points. - # This is to avoid atomic operations on shared memory, which seems to be quite slow. - @inbounds for i in (a + 1):b - # Spread from point j - j = if pointperm === nothing - i - else - @inbounds pointperm[i] - end - - # Parallel evaluation of window functions over all dimensions - let - inds = CartesianIndices((1:D, 1:2M)) - @inbounds for n ∈ threadidx:nthreads:length(inds) - d, m = Tuple(inds[n]) # dimension, support point + # The first batch deals with points (a + 1):min(a + Np, b) + @inbounds for batch_begin in a:Np:(b - 1) + batch_size = min(Np, b - batch_begin) + # Iterate over points in the batch (ideally 1 thread per point). + # Each thread writes to shared memory. + # TODO: can we have a two-level parallelism? e.g. 
parallelise over dimensions + @inbounds for p in threadidx:nthreads:batch_size + # Spread from point j + i = batch_begin + p # index of non-uniform point + j = if pointperm === nothing + i + else + @inbounds pointperm[i] + end + for d ∈ 1:D x = points[d][j] + points_sm[d, p] = x g = gs[d] - Δx = Kernels.gridstep(g) - icell = @inline Kernels.point_to_cell(x, Δx) + gdata = Kernels.evaluate_kernel(g, x) ishift = (block_index[d] - 1) * block_dims[d] + 1 - i₀ = icell - ishift - inds_start[d] = i₀ # multiple threads may write this, but that's ok - window_vals[m, d] = @inline Kernels.evaluate_kernel_direct(g, icell, m, x) + inds_start[d, p] = gdata.i - ishift + local vals = gdata.values + for m ∈ eachindex(vals) + window_vals[m, d, p] = vals[m] + end end + vp_sm[p] = vp[c][j] end - @inbounds v = vp[c][j] - - @synchronize + @synchronize # make sure all threads have the same shared data - inds_start_t = ntuple(d -> @inbounds(inds_start[d]), Val(D)) # as a tuple - - # Spread in parallel - spread_onto_array_shmem_threads!(u_local, inds_start_t, window_vals, v; threadidxs, groupsize, threadidx, nthreads) + # Now all threads spread together onto shared memory + @inbounds for p in 1:batch_size + local istart = ntuple(d -> @inbounds(inds_start[d, p]), Val(D)) + local vals = @view window_vals[:, :, p] # TODO: does this have a cost? + local v = vp_sm[p] + spread_onto_array_shmem_threads!(u_local, istart, vals, v; threadidx, nthreads) + @synchronize # make sure threads don't write concurrently to the same place (since we don't use atomics) + end end @synchronize # make sure we have finished writing to shared memory @@ -371,40 +353,6 @@ end nothing end -# Spread a single "component" (one transform at a time). -@inline function spread_onto_array_shmem!( - u_local::AbstractArray{T, D}, - indvals::NTuple{D}, - v::Z, - ) where {T, D, Z} - if @generated - gprod_init = Symbol(:gprod_, D + 1) # the name of this variable is important! - Tr = real(T) - quote - inds_start = map(first, indvals) # start of active region in output array - vals = map(last, indvals) # evaluated kernel values in each direction - inds = map(eachindex, vals) # = (1:L, 1:L, ...) where L = 2M is the kernel width - $gprod_init = one($Tr) # product of kernel values (initially 1) - @nloops( - $D, i, - d -> inds[d], # for i_d ∈ 1:L - d -> begin - @inbounds gprod_d = gprod_{d + 1} * vals[d][i_d] - @inbounds j_d = inds_start[d] + i_d - end, - begin - js = @ntuple($D, j) - w = v * gprod_1 - _atomic_add!(u_local, w, js) # this is slow!! (atomics in shared memory) -- is it Atomix's fault? - end, - ) - v - end - else - aaa # TODO: implement - end -end - # Spread a single "component" (one transform at a time). # This is parallelised across threads in a workgroup. 
@inline function spread_onto_array_shmem_threads!( @@ -412,7 +360,6 @@ end inds_start::NTuple{D, Integer}, window_vals::AbstractArray{T, 2}, # static-size shared-memory array (2M, D) v::Z; - threadidxs::NTuple{D}, groupsize::NTuple{D}, threadidx, nthreads, ) where {T, D, Z} inds = CartesianIndices(ntuple(_ -> axes(window_vals, 1), Val(D))) From 5bf7ae4d1c9bd7866066e28e4884d3a34407211a Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Mon, 21 Oct 2024 12:23:24 +0200 Subject: [PATCH 19/33] More optimisations --- src/spreading/gpu.jl | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/src/spreading/gpu.jl b/src/spreading/gpu.jl index 6c76314..97f1112 100644 --- a/src/spreading/gpu.jl +++ b/src/spreading/gpu.jl @@ -299,7 +299,6 @@ end batch_size = min(Np, b - batch_begin) # Iterate over points in the batch (ideally 1 thread per point). # Each thread writes to shared memory. - # TODO: can we have a two-level parallelism? e.g. parallelise over dimensions @inbounds for p in threadidx:nthreads:batch_size # Spread from point j i = batch_begin + p # index of non-uniform point @@ -309,20 +308,28 @@ end @inbounds pointperm[i] end for d ∈ 1:D - x = points[d][j] - points_sm[d, p] = x - g = gs[d] - gdata = Kernels.evaluate_kernel(g, x) - ishift = (block_index[d] - 1) * block_dims[d] + 1 - inds_start[d, p] = gdata.i - ishift - local vals = gdata.values - for m ∈ eachindex(vals) - window_vals[m, d, p] = vals[m] - end + points_sm[d, p] = points[d][j] end vp_sm[p] = vp[c][j] end + @synchronize + + # Now evaluate windows associated to each point. + local inds = CartesianIndices((1:batch_size, 1:D)) # parallelise over dimensions + points + @inbounds for n in threadidx:nthreads:length(inds) + p, d = Tuple(inds[n]) + g = gs[d] + x = points_sm[d, p] + gdata = Kernels.evaluate_kernel(g, x) + ishift = (block_index[d] - 1) * block_dims[d] + 1 + inds_start[d, p] = gdata.i - ishift + local vals = gdata.values + for m ∈ eachindex(vals) + window_vals[m, d, p] = vals[m] + end + end + @synchronize # make sure all threads have the same shared data # Now all threads spread together onto shared memory From f9e79311ec07534a24dcd268140859482d1277dd Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Mon, 21 Oct 2024 22:08:42 +0200 Subject: [PATCH 20/33] Try to fix tests (on CPU) --- src/spreading/gpu.jl | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/src/spreading/gpu.jl b/src/spreading/gpu.jl index 97f1112..8b1e6f6 100644 --- a/src/spreading/gpu.jl +++ b/src/spreading/gpu.jl @@ -281,6 +281,17 @@ end inds_start = @localmem(Int, (D, Np)) vp_sm = @localmem(Z, Np) # input values copied to shared memory + # Buffer for indices and lengths. + # This is needed in CPU version (used in tests), to avoid variables from being + # "forgotten" after a @synchronize barrier. 
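+    # (Slot usage below: buf_sm[1] = a, buf_sm[2] = b, buf_sm[3] = current batch size.)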
+ buf_sm = @localmem(Int, 3) + + # This block will take care of non-uniform points (a + 1):b + if threadidx == 1 + @inbounds buf_sm[1] = cumulative_npoints_per_block[block_n] + @inbounds buf_sm[2] = cumulative_npoints_per_block[block_n + 1] + end + # Interpolate components one by one (to avoid using too much memory) for c ∈ 1:C # Reset shared memory to zero @@ -290,13 +301,10 @@ end @synchronize # make sure shared memory is fully set to zero - # This block will take care of non-uniform points (a + 1):b - @inbounds a = cumulative_npoints_per_block[block_n] - @inbounds b = cumulative_npoints_per_block[block_n + 1] - # The first batch deals with points (a + 1):min(a + Np, b) - @inbounds for batch_begin in a:Np:(b - 1) - batch_size = min(Np, b - batch_begin) + @inbounds for batch_begin in buf_sm[1]:Np:(buf_sm[2] - 1) + batch_size = min(Np, buf_sm[2] - batch_begin) # current batch size + buf_sm[3] = batch_size # Iterate over points in the batch (ideally 1 thread per point). # Each thread writes to shared memory. @inbounds for p in threadidx:nthreads:batch_size @@ -316,7 +324,7 @@ end @synchronize # Now evaluate windows associated to each point. - local inds = CartesianIndices((1:batch_size, 1:D)) # parallelise over dimensions + points + local inds = CartesianIndices((1:buf_sm[3], 1:D)) # parallelise over dimensions + points @inbounds for n in threadidx:nthreads:length(inds) p, d = Tuple(inds[n]) g = gs[d] @@ -333,7 +341,7 @@ end @synchronize # make sure all threads have the same shared data # Now all threads spread together onto shared memory - @inbounds for p in 1:batch_size + @inbounds for p in 1:buf_sm[3] local istart = ntuple(d -> @inbounds(inds_start[d, p]), Val(D)) local vals = @view window_vals[:, :, p] # TODO: does this have a cost? local v = vp_sm[p] @@ -344,7 +352,7 @@ end @synchronize # make sure we have finished writing to shared memory - if a < b # skip this step if there were actually no points in this block (if a == b) + @inbounds if buf_sm[1] < buf_sm[2] # skip this step if there were actually no points in this block (if a == b) add_from_local_to_global_memory!( Z, us[c], u_local, Ns, Val(M); block_index, block_dims, threadidxs, groupsize, From e1c2daff36414f5613b6e6f89d77b5157c02b0b7 Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Tue, 22 Oct 2024 09:19:19 +0200 Subject: [PATCH 21/33] Shared memory array can be complex --- src/blocking.jl | 9 ++- src/spreading/gpu.jl | 141 ++++++++++++++----------------------------- 2 files changed, 52 insertions(+), 98 deletions(-) diff --git a/src/blocking.jl b/src/blocking.jl index 9cbe6ec..e1db4c4 100644 --- a/src/blocking.jl +++ b/src/blocking.jl @@ -74,8 +74,13 @@ type_length(::Type{<:NTuple{N}}) where {N} = N 2M * D * Np + # window_vals D * Np + # points_sm D * Np # inds_start - ) + ( - sizeof(Z) * Np # vp_sm + ) + + sizeof(Z) * ( + Np # vp_sm + ) + + sizeof(Int) * ( + 3 + # buf_sm + D # ishifts_sm ) free_shmem = shmem_needs # how much memory to leave free for other allocations max_shmem_size = (48 << 10) - free_shmem # 48 KiB -- TODO: make this depend on the actual GPU? diff --git a/src/spreading/gpu.jl b/src/spreading/gpu.jl index 8b1e6f6..99f9c96 100644 --- a/src/spreading/gpu.jl +++ b/src/spreading/gpu.jl @@ -206,8 +206,8 @@ function spread_from_points!( block_dims = Val(block_dims_val) # ...which means this doesn't require a dynamic dispatch @assert block_dims_val === bd.block_dims let ngroups = bd.nblocks_per_dir # this is the required number of workgroups (number of blocks in CUDA) - block_dims_padded = @. 
block_dims_val + 2M - 1 # dimensions of shared memory array - shmem_size = spread_real_dims(Z, block_dims_padded) + block_dims_padded = @. block_dims_val + 2M - 1 + shmem_size = block_dims_padded # dimensions of big shared memory array groupsize = groupsize_shmem(ngroups, block_dims_padded, length(x⃗s)) ndrange = groupsize .* ngroups kernel! = spread_from_points_shmem_kernel!(backend, groupsize, ndrange) @@ -274,7 +274,8 @@ end threadidx = @index(Local, Linear) # in 1:nthreads # Allocate static shared memory - u_local = @localmem(T, shmem_size) + u_local = @localmem(Z, shmem_size) + # @assert shmem_size == block_dims .+ (2M - 1) @assert T <: Real window_vals = @localmem(T, (2M, D, Np)) points_sm = @localmem(T, (D, Np)) # points copied to shared memory @@ -285,11 +286,15 @@ end # This is needed in CPU version (used in tests), to avoid variables from being # "forgotten" after a @synchronize barrier. buf_sm = @localmem(Int, 3) + ishifts_sm = @localmem(Int, D) # shift between local and global array in each direction # This block will take care of non-uniform points (a + 1):b if threadidx == 1 - @inbounds buf_sm[1] = cumulative_npoints_per_block[block_n] - @inbounds buf_sm[2] = cumulative_npoints_per_block[block_n + 1] + @inbounds buf_sm[1] = cumulative_npoints_per_block[block_n] # = a + @inbounds buf_sm[2] = cumulative_npoints_per_block[block_n + 1] # = b + @inbounds for d ∈ 1:D + ishifts_sm[d] = (block_index[d] - 1) * block_dims[d] + 1 + end end # Interpolate components one by one (to avoid using too much memory) @@ -330,7 +335,7 @@ end g = gs[d] x = points_sm[d, p] gdata = Kernels.evaluate_kernel(g, x) - ishift = (block_index[d] - 1) * block_dims[d] + 1 + ishift = ishifts_sm[d] inds_start[d, p] = gdata.i - ishift local vals = gdata.values for m ∈ eachindex(vals) @@ -343,9 +348,8 @@ end # Now all threads spread together onto shared memory @inbounds for p in 1:buf_sm[3] local istart = ntuple(d -> @inbounds(inds_start[d, p]), Val(D)) - local vals = @view window_vals[:, :, p] # TODO: does this have a cost? local v = vp_sm[p] - spread_onto_array_shmem_threads!(u_local, istart, vals, v; threadidx, nthreads) + spread_onto_array_shmem_threads!(u_local, istart, window_vals, v, p; threadidx, nthreads) @synchronize # make sure threads don't write concurrently to the same place (since we don't use atomics) end end @@ -354,8 +358,7 @@ end @inbounds if buf_sm[1] < buf_sm[2] # skip this step if there were actually no points in this block (if a == b) add_from_local_to_global_memory!( - Z, us[c], u_local, Ns, Val(M); - block_index, block_dims, threadidxs, groupsize, + us[c], u_local, Ns, ishifts_sm, Val(M); threadidx, nthreads, ) end @@ -371,117 +374,63 @@ end # Spread a single "component" (one transform at a time). # This is parallelised across threads in a workgroup. @inline function spread_onto_array_shmem_threads!( - u_local::AbstractArray{T, D}, + u_local::AbstractArray{Z, D}, inds_start::NTuple{D, Integer}, - window_vals::AbstractArray{T, 2}, # static-size shared-memory array (2M, D) - v::Z; + window_vals::AbstractArray{T, 3}, # static-size shared-memory array (2M, D, Np) + v::Z, p::Integer; threadidx, nthreads, ) where {T, D, Z} - inds = CartesianIndices(ntuple(_ -> axes(window_vals, 1), Val(D))) + inds = CartesianIndices(ntuple(_ -> axes(window_vals, 1), Val(D))) # = (1:2M, 1:2M, ...) 
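+    # The strided loop below assigns disjoint subsets of these (2M)^D offsets to the
+    # nthreads threads, so, for a given point, no two threads write to the same u_local
+    # entry and no atomic operations are needed.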
Tr = real(T) @inbounds for n ∈ threadidx:nthreads:length(inds) I = inds[n] js = Tuple(I) .+ inds_start gprod = one(Tr) for d ∈ 1:D - gprod *= window_vals[I[d], d] + gprod *= window_vals[I[d], d, p] end w = v * gprod - _add_maybecomplex!(u_local, w, js) + u_local[js...] += w end nothing end # Add values from shared to global memory. @inline function add_from_local_to_global_memory!( - ::Type{Z}, # "logical" type of data (e.g. ComplexF64) - u_global::AbstractArray{T, D}, # actual data is always real - u_local::AbstractArray{T, D}, + u_global::AbstractArray{T, D}, # actual data in global array is always real + u_local::AbstractArray{Z, D}, # shared-memory array can be complex Ns::Dims{D}, + ishifts, ::Val{M}; - block_index::Dims{D}, block_dims::Dims{D}, threadidxs::Dims{D}, groupsize::Dims{D}, threadidx, nthreads, ) where {Z, T, D, M} - if @generated - @assert T <: Real # for atomic operations - skip = sizeof(Z) ÷ sizeof(T) # 1 (Z real) or 2 (Z complex) - quote - skips = @ntuple($D, n -> n == 1 ? $skip : 1) - inds = @ntuple($D, n -> axes(u_local, n)[1:(end ÷ skips[n])][threadidxs[n]:groupsize[n]:end]) - offsets = @ntuple( - $D, - n -> let - off = (block_index[n] - 1) * block_dims[n] - ($M - 1) - ifelse(off < 0, off + Ns[n], off) # make sure the offset is non-negative (to avoid some wrapping below) - end - ) - @nloops( - $D, i, - d -> inds[d], - d -> begin - j_d = offsets[d] + i_d - j_d = ifelse(j_d > Ns[d], j_d - Ns[d], j_d) - end, - begin - is = @ntuple($D, i) - js = @ntuple($D, j) - @inbounds if $skip === 1 # real data (Z <: Real) - w = u_local[is...] - Atomix.@atomic u_global[js...] += w - elseif $skip === 2 # complex data (Z <: Complex) - is_tail = @ntuple($(D - 1), d -> i_{d + 1}) - js_tail = @ntuple($(D - 1), d -> j_{d + 1}) - ifirst = 2 * i_1 - 1 - jfirst = 2 * j_1 - 1 - w = u_local[ifirst, is_tail...] # real part - Atomix.@atomic u_global[jfirst, js_tail...] += w - w = u_local[ifirst + 1, is_tail...] # imaginary part - Atomix.@atomic u_global[jfirst + 1, js_tail...] += w - end - end, - ) - nothing - end - else - # Unlike the @generated case, here we parallelise over linear indices instead of - # separate Cartesian directions. Performance is very similar to the @generated case. - @assert T <: Real # for atomic operations - skip = sizeof(Z) ÷ sizeof(T) # 1 (Z real) or 2 (Z complex) - inds = CartesianIndices( - ntuple(Val(D)) do d - # The logical indices are 1:end÷skip in the first dimension. - d == 1 ? axes(u_local, 1)[1:end÷skip] : axes(u_local, d) - end - ) - offsets = ntuple(Val(D)) do n + @assert T <: Real # for atomic operations + inds = CartesianIndices(axes(u_local)) + offsets = ntuple(Val(D)) do d + @inline + local off = ishifts[d] - M + ifelse(off < 0, off + Ns[d], off) # make sure the offset is non-negative (to avoid some wrapping below) + end + @inbounds for n ∈ threadidx:nthreads:length(inds) + I = inds[n] + is = Tuple(I) + js = ntuple(Val(D)) do d @inline - local off = (block_index[n] - 1) * block_dims[n] - (M - 1) - ifelse(off < 0, off + Ns[n], off) # make sure the offset is non-negative (to avoid some wrapping below) + j = is[d] + offsets[d] + ifelse(j > Ns[d], j - Ns[d], j) end - @inbounds for n ∈ threadidx:nthreads:length(inds) - I = inds[n] - is = Tuple(I) - js = ntuple(Val(D)) do d - @inline - j = is[d] + offsets[d] - ifelse(j > Ns[d], j - Ns[d], j) - end - @inbounds if Z <: Real - w = u_local[is...] - Atomix.@atomic u_global[js...] 
+= w - elseif Z <: Complex - is_tail = Base.tail(is) - js_tail = Base.tail(js) - ifirst = 2 * is[1] - 1 - jfirst = 2 * js[1] - 1 - w = u_local[ifirst, is_tail...] # real part - Atomix.@atomic u_global[jfirst, js_tail...] += w - w = u_local[ifirst + 1, is_tail...] # imaginary part - Atomix.@atomic u_global[jfirst + 1, js_tail...] += w - end + @inbounds if Z <: Real + w = u_local[is...] + Atomix.@atomic u_global[js...] += w + elseif Z <: Complex + js_tail = Base.tail(js) + jfirst = 2 * js[1] - 1 + w = u_local[is...] + Atomix.@atomic u_global[jfirst, js_tail...] += real(w) + Atomix.@atomic u_global[jfirst + 1, js_tail...] += imag(w) end - nothing end + nothing end ## ==================================================================================================== ## From 4898e6ef9f1aedef97cdc584c57d67afafda3e03 Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Wed, 23 Oct 2024 14:02:04 +0200 Subject: [PATCH 22/33] Update interpolation based on spreading changes --- src/interpolation/gpu.jl | 87 +++++++++++++++++----------------------- src/spreading/gpu.jl | 1 - 2 files changed, 36 insertions(+), 52 deletions(-) diff --git a/src/interpolation/gpu.jl b/src/interpolation/gpu.jl index b55ed25..50c576b 100644 --- a/src/interpolation/gpu.jl +++ b/src/interpolation/gpu.jl @@ -266,18 +266,30 @@ end nthreads = prod(groupsize) end - threadidxs = @index(Local, NTuple) # in (1:nthreads_x, 1:nthreads_y, ...) threadidx = @index(Local, Linear) # in 1:nthreads block_index = @index(Group, NTuple) # workgroup index (= block index) block_n = @index(Group, Linear) # linear index of block u_local = @localmem(Z, shmem_size) # allocate shared memory + ishifts_sm = @localmem(Int, D) # shift between local and global array in each direction + + if threadidx == 1 + @inbounds for d ∈ 1:D + ishifts_sm[d] = (block_index[d] - 1) * block_dims[d] + 1 + end + end + + @synchronize + # Interpolate components one by one (to avoid using too much memory) for c ∈ 1:C # Copy grid data from global to shared memory M = Kernels.half_support(gs[1]) - gridvalues_to_local_memory!(u_local, us[c], Val(M), block_index, block_dims, threadidxs, groupsize) + gridvalues_to_local_memory!( + u_local, us[c], ishifts_sm, Val(M); + threadidx, nthreads, + ) @synchronize # make sure all threads have the same shared data @@ -294,13 +306,13 @@ end end # TODO: can we do this just once for all components? 
(if C > 1) - indvals = ntuple(Val(D)) do n + indvals = ntuple(Val(D)) do d @inline - x = @inbounds points[n][j] - gdata = Kernels.evaluate_kernel(gs[n], x) - local vals = gdata.values # kernel values - local ishift = (block_index[n] - 1) * block_dims[n] + 1 + x = @inbounds points[d][j] + gdata = Kernels.evaluate_kernel(gs[d], x) + ishift = ishifts_sm[d] local i₀ = gdata.i - ishift + local vals = gdata.values # kernel values # @assert i₀ ≥ 0 # @assert i₀ + 2M ≤ block_dims_padded[n] i₀ => vals @@ -319,54 +331,27 @@ end @inline function gridvalues_to_local_memory!( u_local::AbstractArray{T, D}, u_global::AbstractArray{T, D}, - ::Val{M}, block_index::Dims{D}, block_dims::Dims{D}, threadidxs::Dims{D}, groupsize::Dims{D}, + ishifts, + ::Val{M}; + threadidx, nthreads, ) where {T, D, M} - if @generated - quote - Ns = size(u_global) - inds = @ntuple($D, n -> axes(u_local, n)[threadidxs[n]:groupsize[n]:end]) - offsets = @ntuple( - $D, - n -> let - off = (block_index[n] - 1) * block_dims[n] - ($M - 1) - ifelse(off < 0, off + Ns[n], off) # make sure the offset is non-negative (to avoid some wrapping below) - end - ) - @nloops( - $D, i, - d -> inds[d], - d -> begin - j_d = offsets[d] + i_d - j_d = ifelse(j_d > Ns[d], j_d - Ns[d], j_d) - end, - begin - is = @ntuple($D, i) - js = @ntuple($D, j) - @inbounds u_local[is...] = u_global[js...] - end, - ) - nothing - end - else - Ns = size(u_global) - inds = ntuple(Val(D)) do n - axes(u_local, n)[threadidxs[n]:groupsize[n]:end] # this determines the parallelisation pattern - end - offsets = ntuple(Val(D)) do n + Ns = size(u_global) + inds = CartesianIndices(axes(u_local)) + offsets = ntuple(Val(D)) do d + @inline + off = ishifts[d] - M + ifelse(off < 0, off + Ns[d], off) # make sure the offset is non-negative (to avoid some wrapping below) + end + @inbounds for n ∈ threadidx:nthreads:length(inds) + is = Tuple(inds[n]) + js = ntuple(Val(D)) do d @inline - off = (block_index[n] - 1) * block_dims[n] - (M - 1) - ifelse(off < 0, off + Ns[n], off) # make sure the offset is non-negative (to avoid some wrapping below) + j = is[d] + offsets[d] + ifelse(j > Ns[d], j - Ns[d], j) end - @inbounds for is ∈ Iterators.product(inds...) - js = ntuple(Val(D)) do n - @inline - j = offsets[n] + is[n] - ifelse(j > Ns[n], j - Ns[n], j) - end - u_local[is...] = u_global[js...] - end - nothing + u_local[n] = u_global[js...] end + nothing end # Interpolate a single "component" (one transform at a time). diff --git a/src/spreading/gpu.jl b/src/spreading/gpu.jl index 99f9c96..91b9eb4 100644 --- a/src/spreading/gpu.jl +++ b/src/spreading/gpu.jl @@ -270,7 +270,6 @@ end block_n = @index(Group, Linear) # linear index of block block_index = @index(Group, NTuple) # workgroup index (= block index) - threadidxs = @index(Local, NTuple) # in (1:nthreads_x, 1:nthreads_y, ...) 
threadidx = @index(Local, Linear) # in 1:nthreads # Allocate static shared memory From df0690f4589c2dd52ae0d58283f3ad44a3ef59da Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Wed, 23 Oct 2024 14:46:01 +0200 Subject: [PATCH 23/33] Simplify setting workgroupsize --- src/interpolation/gpu.jl | 54 +++++++++++----------------------------- 1 file changed, 15 insertions(+), 39 deletions(-) diff --git a/src/interpolation/gpu.jl b/src/interpolation/gpu.jl index 50c576b..a070f1c 100644 --- a/src/interpolation/gpu.jl +++ b/src/interpolation/gpu.jl @@ -108,13 +108,11 @@ function interpolate!( shmem_size = block_dims_padded groupsize = groupsize_shmem(ngroups, block_dims_padded, length(x⃗s)) ndrange = groupsize .* ngroups - # TODO: - # - try out batches in interpolation? kernel! = interpolate_to_points_shmem_kernel!(backend, groupsize, ndrange) kernel!( vp_sorted, gs, xs_comp, us, pointperm_, bd.cumulative_npoints_per_block, - prefactor, block_dims, - Val(shmem_size), + prefactor, + block_dims, Val(shmem_size), ) end end @@ -130,41 +128,6 @@ function interpolate!( vp_all end -# Determine groupsize (number of threads per direction) in :shared_memory method. -# Here Np is the number of non-uniform points. -# The distribution of threads across directions mainly (only?) affects the copies between -# shared and global memories, so it can have an important influence of performance due to -# memory accesses. -# NOTE: this is also used in spreading -function groupsize_shmem(ngroups::NTuple{D}, shmem_size::NTuple{D}, Np) where {D} - # (1) Determine the total number of threads. - groupsize = 64 # minimum group size should be equal to the warp size (usually 32 on CUDA and 64 on AMDGPU) - # Increase groupsize if number of points is much larger (at least 4×) than the groupsize. - # Not sure this is a good idea if the point distribution is very inhomogeneous... - # TODO see if this improves performance in highly dense cases - # points_per_group = Np / prod(ngroups) # average number of points per block - # while groupsize < 1024 && 4 * groupsize ≤ points_per_group - # groupsize *= 2 - # end - # (2) Determine number of threads in each direction. - # This mainly affects the performance of global -> shared memory copies. - # It seems like it's better to parallelise the outer dimensions first. - gsizes = ntuple(_ -> 1, Val(D)) - p = 1 # product of sizes - i = D # parallelise outer dimensions first - while p < groupsize - if gsizes[i] < shmem_size[i] || i == 1 - gsizes = Base.setindex(gsizes, gsizes[i] * 2, i) - p *= 2 - else - @assert i > 1 - i -= 1 - end - end - @assert p == groupsize == prod(gsizes) - gsizes -end - @inline function interpolate_from_arrays_gpu( us::NTuple{C, AbstractArray{T, D}}, indvals::NTuple{D, <:Pair}, @@ -249,6 +212,19 @@ end ## ========================================================================================== ## ## Shared-memory implementation +# Note: this is also used in spreading. +function groupsize_shmem(ngroups::NTuple{D}, shmem_size::NTuple{D}, Np) where {D} + # (1) Determine the total number of threads. + # Not sure if it's worth it to define as a function of the inputs (currently unused). + # From tests, the value of 64 seems to be optimal in various situations. + groupsize = 64 # minimum group size should be equal to the warp size (usually 32 on CUDA and 64 on AMDGPU) + # (2) Determine number of threads in each direction. + # We don't really care about the Cartesian distribution of threads, since we always + # parallelise over linear indices. 
+ gsizes = ntuple(_ -> 1, Val(D)) + Base.setindex(gsizes, groupsize, 1) # = (groupsize, 1, 1, ...) +end + @kernel function interpolate_to_points_shmem_kernel!( vp::NTuple{C, AbstractVector{Z}}, @Const(gs::NTuple{D}), From 2b2872983a57fc3bd0d29c1a90cf367ddb06848e Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Wed, 23 Oct 2024 14:46:55 +0200 Subject: [PATCH 24/33] Minor changes --- src/interpolation/gpu.jl | 48 +++++++++++++++++----------------------- src/spreading/gpu.jl | 7 +++--- 2 files changed, 23 insertions(+), 32 deletions(-) diff --git a/src/interpolation/gpu.jl b/src/interpolation/gpu.jl index a070f1c..8bfb9d7 100644 --- a/src/interpolation/gpu.jl +++ b/src/interpolation/gpu.jl @@ -242,28 +242,22 @@ end nthreads = prod(groupsize) end - threadidx = @index(Local, Linear) # in 1:nthreads - block_index = @index(Group, NTuple) # workgroup index (= block index) block_n = @index(Group, Linear) # linear index of block + block_index = @index(Group, NTuple) # workgroup index (= block index) + threadidx = @index(Local, Linear) # in 1:nthreads u_local = @localmem(Z, shmem_size) # allocate shared memory - ishifts_sm = @localmem(Int, D) # shift between local and global array in each direction - - if threadidx == 1 - @inbounds for d ∈ 1:D - ishifts_sm[d] = (block_index[d] - 1) * block_dims[d] + 1 - end + ishifts = ntuple(Val(D)) do d + (block_index[d] - 1) * block_dims[d] + 1 end - @synchronize - # Interpolate components one by one (to avoid using too much memory) - for c ∈ 1:C + @inbounds for c ∈ 1:C # Copy grid data from global to shared memory M = Kernels.half_support(gs[1]) gridvalues_to_local_memory!( - u_local, us[c], ishifts_sm, Val(M); + u_local, us[c], ishifts, Val(M); threadidx, nthreads, ) @@ -286,15 +280,16 @@ end @inline x = @inbounds points[d][j] gdata = Kernels.evaluate_kernel(gs[d], x) - ishift = ishifts_sm[d] - local i₀ = gdata.i - ishift + local i₀ = gdata.i - ishifts[d] local vals = gdata.values # kernel values # @assert i₀ ≥ 0 # @assert i₀ + 2M ≤ block_dims_padded[n] i₀ => vals end - v = interpolate_from_arrays_shmem(u_local, indvals, prefactor) + inds_start = map(first, indvals) + window_vals = map(last, indvals) + v = interpolate_from_arrays_shmem(u_local, inds_start, window_vals, prefactor) @inbounds vp[c][j] = v end end @@ -333,23 +328,22 @@ end # Interpolate a single "component" (one transform at a time). # Here vp is a vector instead of a tuple of vectors. @inline function interpolate_from_arrays_shmem( - u_local::AbstractArray{T, D}, - indvals::NTuple{D}, + u_local::AbstractArray{Z, D}, + inds_start::NTuple{D}, + window_vals::NTuple{D}, prefactor, - ) where {T, D} + ) where {Z, D} if @generated gprod_init = Symbol(:gprod_, D + 1) # the name of this variable is important! quote $gprod_init = prefactor - v = zero(T) - inds_start = map(first, indvals) - vals = map(last, indvals) # evaluated kernel values in each direction - inds = map(eachindex, vals) # = (1:L, 1:L, ...) where L = 2M is the kernel width + v = zero(Z) + inds = map(eachindex, window_vals) # = (1:2M, 1:2M, ...) @nloops( $D, i, d -> inds[d], d -> begin - @inbounds gprod_d = gprod_{d + 1} * vals[d][i_d] + @inbounds gprod_d = gprod_{d + 1} * window_vals[d][i_d] @inbounds j_d = inds_start[d] + i_d end, begin @@ -360,12 +354,10 @@ end v end else - v = zero(T) - inds_start = map(first, indvals) - vals = map(last, indvals) # evaluated kernel values in each direction - inds = map(eachindex, vals) # = (1:L, 1:L, ...) 
where L = 2M is the kernel width + v = zero(Z) + inds = map(eachindex, window_vals) # = (1:2M, 1:2M, ...) @inbounds for I ∈ CartesianIndices(inds) - gprod = prefactor * prod(ntuple(d -> @inbounds(vals[d][I[d]]), Val(D))) + gprod = prefactor * prod(ntuple(d -> @inbounds(window_vals[d][I[d]]), Val(D))) js = inds_start .+ Tuple(I) v += gprod * u_local[js...] end diff --git a/src/spreading/gpu.jl b/src/spreading/gpu.jl index 91b9eb4..f4137ce 100644 --- a/src/spreading/gpu.jl +++ b/src/spreading/gpu.jl @@ -212,9 +212,8 @@ function spread_from_points!( ndrange = groupsize .* ngroups kernel! = spread_from_points_shmem_kernel!(backend, groupsize, ndrange) kernel!( - us_real, gs, xs_comp, vp_sorted, pointperm_, - bd.cumulative_npoints_per_block, HalfSupport(M), - block_dims, Val(shmem_size), bd.batch_size, + us_real, gs, xs_comp, vp_sorted, pointperm_, bd.cumulative_npoints_per_block, + HalfSupport(M), block_dims, Val(shmem_size), bd.batch_size, ) end end @@ -287,8 +286,8 @@ end buf_sm = @localmem(Int, 3) ishifts_sm = @localmem(Int, D) # shift between local and global array in each direction - # This block will take care of non-uniform points (a + 1):b if threadidx == 1 + # This block (workgroup) will take care of non-uniform points (a + 1):b @inbounds buf_sm[1] = cumulative_npoints_per_block[block_n] # = a @inbounds buf_sm[2] = cumulative_npoints_per_block[block_n + 1] # = b @inbounds for d ∈ 1:D From d0f439f097cc8988c677e7d4ebfdcbc12c4d4745 Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Wed, 23 Oct 2024 15:20:04 +0200 Subject: [PATCH 25/33] Fix CPU tests --- src/interpolation/gpu.jl | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/src/interpolation/gpu.jl b/src/interpolation/gpu.jl index 8bfb9d7..fd0e449 100644 --- a/src/interpolation/gpu.jl +++ b/src/interpolation/gpu.jl @@ -248,16 +248,25 @@ end u_local = @localmem(Z, shmem_size) # allocate shared memory - ishifts = ntuple(Val(D)) do d - (block_index[d] - 1) * block_dims[d] + 1 + # This needs to be in shared memory for CPU tests (otherwise it doesn't survive across + # @synchronize barriers). 
+ ishifts_sm = @localmem(Int, D) # shift between local and global array in each direction + + if threadidx == 1 + # This block (workgroup) will take care of non-uniform points (a + 1):b + @inbounds for d ∈ 1:D + ishifts_sm[d] = (block_index[d] - 1) * block_dims[d] + 1 + end end + @synchronize # synchronise ishifts_sm + # Interpolate components one by one (to avoid using too much memory) @inbounds for c ∈ 1:C # Copy grid data from global to shared memory M = Kernels.half_support(gs[1]) gridvalues_to_local_memory!( - u_local, us[c], ishifts, Val(M); + u_local, us[c], ishifts_sm, Val(M); threadidx, nthreads, ) @@ -280,7 +289,7 @@ end @inline x = @inbounds points[d][j] gdata = Kernels.evaluate_kernel(gs[d], x) - local i₀ = gdata.i - ishifts[d] + local i₀ = gdata.i - ishifts_sm[d] local vals = gdata.values # kernel values # @assert i₀ ≥ 0 # @assert i₀ + 2M ≤ block_dims_padded[n] From 2b0b4b6ff8b4a6dca30404e86a6dc3eab31ad1d5 Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Wed, 23 Oct 2024 15:23:03 +0200 Subject: [PATCH 26/33] Remove unused functions --- src/spreading/gpu.jl | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/src/spreading/gpu.jl b/src/spreading/gpu.jl index f4137ce..7b90fc1 100644 --- a/src/spreading/gpu.jl +++ b/src/spreading/gpu.jl @@ -34,9 +34,6 @@ end @inline spread_actual_dims(::Type{<:Real}, Ns) = Ns @inline spread_actual_dims(::Type{<:Complex}, Ns) = Base.setindex(Ns, Ns[1] >> 1, 1) # actual number of complex elements in first dimension -@inline spread_real_dims(::Type{<:Real}, Ns) = Ns -@inline spread_real_dims(::Type{<:Complex}, Ns) = Base.setindex(Ns, Ns[1] << 1, 1) - @inline function spread_onto_arrays_gpu!( us::NTuple{C, AbstractArray{T, D}}, indvals::NTuple{D, <:Pair}, @@ -122,22 +119,6 @@ end nothing end -# Same as _atomic_add!, but without the atomics. -@inline function _add_maybecomplex!(u::AbstractArray{T}, v::T, inds::Tuple) where {T <: Real} - @inbounds u[inds...] += v - nothing -end - -@inline function _add_maybecomplex!(u::AbstractArray{T}, v::Complex{T}, inds::Tuple) where {T <: Real} - @inbounds begin - i₁ = 2 * (inds[1] - 1) # convert from logical index (equivalent complex array) to memory index (real array) - itail = Base.tail(inds) - u[i₁ + 1, itail...] += real(v) - u[i₁ + 2, itail...] += imag(v) - end - nothing -end - # GPU implementation. # We assume all arrays are already on the GPU. function spread_from_points!( From 6bbeb8ee0458332ae6e9fa8cacdc0f7a6c58b405 Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Wed, 23 Oct 2024 15:35:53 +0200 Subject: [PATCH 27/33] Add tests for multiple transforms --- src/spreading/gpu.jl | 6 +++--- test/pseudo_gpu.jl | 23 +++++++++++++++-------- 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/src/spreading/gpu.jl b/src/spreading/gpu.jl index 7b90fc1..4490ca1 100644 --- a/src/spreading/gpu.jl +++ b/src/spreading/gpu.jl @@ -342,9 +342,9 @@ end ) end - if c < C - @synchronize # wait before jumping to next component - end + # Avoid resetting u_local too early in the next iteration (c -> c + 1). + # This is mostly useful when c < C (but putting an `if` fails...). 
+ @synchronize end nothing diff --git a/test/pseudo_gpu.jl b/test/pseudo_gpu.jl index 6244856..13df0a6 100644 --- a/test/pseudo_gpu.jl +++ b/test/pseudo_gpu.jl @@ -50,7 +50,7 @@ end # ================================================================================ # -function run_plan(p::PlanNUFFT, xp_init::AbstractArray, vp_init::AbstractVector) +function run_plan(p::PlanNUFFT, xp_init::AbstractArray, vp_init::NTuple{Nc, AbstractVector}) where {Nc} (; backend,) = p xp = adapt(backend, xp_init) @@ -78,32 +78,34 @@ function run_plan(p::PlanNUFFT, xp_init::AbstractArray, vp_init::AbstractVector) T = eltype(p) # this is actually defined in AbstractNFFTs; it represents the type in Fourier space (always complex) @test T <: Complex dims = size(p) - us = KA.allocate(backend, T, dims) + us = map(_ -> KA.allocate(backend, T, dims), vp) exec_type1!(us, p, vp) - wp = similar(vp) + wp = map(similar, vp) exec_type2!(wp, p, us) (; backend, us, wp,) end -function compare_with_cpu(::Type{T}, dims; Np = prod(dims), kws...) where {T <: Number} +function compare_with_cpu(::Type{T}, dims; Np = prod(dims), ntransforms::Val{Nc} = Val(1), kws...) where {T <: Number, Nc} # Generate some non-uniform random data on the CPU rng = Xoshiro(42) N = length(dims) # number of dimensions Tr = real(T) xp_init = [rand(rng, SVector{N, Tr}) * Tr(2π) for _ ∈ 1:Np] # non-uniform points in [0, 2π]ᵈ - vp_init = randn(rng, T, Np) # random values at points + vp_init = ntuple(_ -> randn(rng, T, Np), ntransforms) - params = (; m = HalfSupport(4), kernel = KaiserBesselKernel(), σ = 1.5, kws...) + params = (; m = HalfSupport(4), kernel = KaiserBesselKernel(), σ = 1.5, ntransforms, kws...) p_cpu = PlanNUFFT(T, dims; params..., backend = CPU()) p_gpu = PlanNUFFT(T, dims; params..., backend = PseudoGPU()) r_cpu = run_plan(p_cpu, xp_init, vp_init) r_gpu = run_plan(p_gpu, xp_init, vp_init) - @test r_cpu.us ≈ r_gpu.us # output of type-1 transform - @test r_cpu.wp ≈ r_gpu.wp # output of type-2 transform + for c ∈ 1:Nc + @test r_cpu.us[c] ≈ r_gpu.us[c] # output of type-1 transform + @test r_cpu.wp[c] ≈ r_gpu.wp[c] # output of type-2 transform + end nothing end @@ -123,4 +125,9 @@ end @testset "Float32" compare_with_cpu(Float32, dims; gpu_method = :shared_memory) @testset "ComplexF32" compare_with_cpu(ComplexF32, dims; gpu_method = :shared_memory) end + @testset "Multiple transforms" begin + ntransforms = Val(2) + @testset "Global memory" compare_with_cpu(Float32, dims; ntransforms, gpu_method = :global_memory) + @testset "Shared memory" compare_with_cpu(Float32, dims; ntransforms, gpu_method = :shared_memory) + end end From 7bc11bc06a687a152f114520f215dc715345ce03 Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Wed, 23 Oct 2024 15:49:09 +0200 Subject: [PATCH 28/33] Simplify atomic adds with complex data Doesn't change performance. 
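A standalone sketch of the underlying idea (illustrative only, not part of the diff below;
the toy array size and values are made up): a complex array can be viewed as a real array
with an extra leading dimension of size 2, so the real and imaginary parts are accumulated
with plain real-valued adds.

    u  = zeros(ComplexF32, 4, 4)
    ur = reinterpret(reshape, Float32, u)  # size (2, 4, 4): ur[1, i, j] = real part, ur[2, i, j] = imaginary part
    v  = 1.0f0 + 2.0f0im
    ur[1, 2, 3] += real(v)   # in the GPU kernel these two adds become Atomix.@atomic adds
    ur[2, 2, 3] += imag(v)
    @assert u[2, 3] == v     # the complex view sees the accumulated value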
--- src/spreading/gpu.jl | 41 ++++++++++++++--------------------------- 1 file changed, 14 insertions(+), 27 deletions(-) diff --git a/src/spreading/gpu.jl b/src/spreading/gpu.jl index 4490ca1..dbc58d1 100644 --- a/src/spreading/gpu.jl +++ b/src/spreading/gpu.jl @@ -21,7 +21,7 @@ Z = eltype(vp[1]) Ns_real = size(first(us)) # dimensions of raw input data @assert eltype(first(us)) <: Real # output is a real array (but may actually describe complex data) - Ns = spread_actual_dims(Z, Ns_real) # divides the Ns_real[1] by 2 if Z <: Complex + Ns = spread_actual_dims(Z, Ns_real) # drop the first dimension (of size 2) if Z <: Complex indvals = get_inds_vals_gpu(gs, points, Ns, j) @@ -32,10 +32,10 @@ end @inline spread_actual_dims(::Type{<:Real}, Ns) = Ns -@inline spread_actual_dims(::Type{<:Complex}, Ns) = Base.setindex(Ns, Ns[1] >> 1, 1) # actual number of complex elements in first dimension +@inline spread_actual_dims(::Type{<:Complex}, Ns) = Base.tail(Ns) # actual number of complex elements: drop first dimension @inline function spread_onto_arrays_gpu!( - us::NTuple{C, AbstractArray{T, D}}, + us::NTuple{C, AbstractArray{T}}, indvals::NTuple{D, <:Pair}, vs::NTuple{C}, Ns::Dims{D}, @@ -111,14 +111,15 @@ end # so the output must be a real array `u`. @inline function _atomic_add!(u::AbstractArray{T}, v::Complex{T}, inds::Tuple) where {T <: Real} @inbounds begin - i₁ = 2 * (inds[1] - 1) # convert from logical index (equivalent complex array) to memory index (real array) - itail = Base.tail(inds) - Atomix.@atomic u[i₁ + 1, itail...] += real(v) - Atomix.@atomic u[i₁ + 2, itail...] += imag(v) + Atomix.@atomic u[1, inds...] += real(v) + Atomix.@atomic u[2, inds...] += imag(v) end nothing end +to_real_array(u::AbstractArray{T}) where {T <: Real} = u +to_real_array(u::AbstractArray{T}) where {T <: Complex} = reinterpret(reshape, real(T), u) # adds an extra first dimension with size 2 + # GPU implementation. # We assume all arrays are already on the GPU. function spread_from_points!( @@ -138,14 +139,7 @@ function spread_from_points!( # Reinterpret `us` as real arrays, in case they are complex. # This is to avoid current issues with atomic operations on complex data # (https://github.com/JuliaGPU/KernelAbstractions.jl/issues/497). - Z = eltype(us[1]) - T = real(Z) - us_real = if Z <: Real - us - else # complex case - @assert Z <: Complex - map(u -> reinterpret(T, u), us) # note: we don't use reshape, so the first dimension has 2x elements - end + us_real = map(to_real_array, us) # doesn't change anything if data is already real (Z === T) pointperm = get_pointperm(bd) # nothing in case of NullBlockData sort_points = get_sort_points(bd)::StaticBool # False in the case of NullBlockData @@ -245,7 +239,7 @@ end # the actual dataset dimensions. @assert T === real(Z) # output is a real array (but may actually describe complex data) Ns_real = size(first(us)) # dimensions of raw input data - Ns = spread_actual_dims(Z, Ns_real) # divides the Ns_real[1] by 2 if Z <: Complex + Ns = spread_actual_dims(Z, Ns_real) # drop the first dimension (of size 2) if Z <: Complex end block_n = @index(Group, Linear) # linear index of block @@ -376,7 +370,7 @@ end # Add values from shared to global memory. 
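# Indexing example (hypothetical values, for illustration only): with Ns[d] = 64, M = 4 and
# ishifts[d] = 1, the offset computed below is 1 - 4 = -3, which wraps to 61; local indices
# is[d] = 1, 2, 3, 4 then map to global indices j = 62, 63, 64, 1 (periodic wrapping).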
@inline function add_from_local_to_global_memory!( - u_global::AbstractArray{T, D}, # actual data in global array is always real + u_global::AbstractArray{T}, # actual data in global array is always real (and may have an extra first dimension) u_local::AbstractArray{Z, D}, # shared-memory array can be complex Ns::Dims{D}, ishifts, @@ -384,6 +378,7 @@ end threadidx, nthreads, ) where {Z, T, D, M} @assert T <: Real # for atomic operations + # @assert ndims(u_global) === D + (Z <: Complex) # extra dimension if data is complex inds = CartesianIndices(axes(u_local)) offsets = ntuple(Val(D)) do d @inline @@ -398,16 +393,8 @@ end j = is[d] + offsets[d] ifelse(j > Ns[d], j - Ns[d], j) end - @inbounds if Z <: Real - w = u_local[is...] - Atomix.@atomic u_global[js...] += w - elseif Z <: Complex - js_tail = Base.tail(js) - jfirst = 2 * js[1] - 1 - w = u_local[is...] - Atomix.@atomic u_global[jfirst, js_tail...] += real(w) - Atomix.@atomic u_global[jfirst + 1, js_tail...] += imag(w) - end + w = u_local[is...] + _atomic_add!(u_global, w, js) end nothing end From 05edddf3bc87c22729c06f0e68e585b34631f430 Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Wed, 23 Oct 2024 16:35:39 +0200 Subject: [PATCH 29/33] =?UTF-8?q?point=5Fto=5Fcell=20now=20also=20returns?= =?UTF-8?q?=20x/=CE=94x?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/Kernels/Kernels.jl | 3 ++- src/Kernels/bspline.jl | 4 ++-- src/Kernels/gaussian.jl | 2 +- src/Kernels/kaiser_bessel.jl | 6 +++--- src/Kernels/kaiser_bessel_backwards.jl | 6 +++--- src/blocking.jl | 6 +++--- test/near_2pi.jl | 8 ++++---- 7 files changed, 18 insertions(+), 17 deletions(-) diff --git a/src/Kernels/Kernels.jl b/src/Kernels/Kernels.jl index 1e6155b..412f97b 100644 --- a/src/Kernels/Kernels.jl +++ b/src/Kernels/Kernels.jl @@ -71,6 +71,7 @@ function init_fourier_coefficients!(g::AbstractKernelData, ks::AbstractVector) end # Assign a cell index to a location `x`. This assumes 0 ≤ x < 2π. +# This also returns x / Δx to avoid recomputing it later. @inline function point_to_cell(x, Δx) r = x / Δx i = unsafe_trunc(Int, r) # assumes r ≥ 0 @@ -78,7 +79,7 @@ end # is very close (but slightly smaller) to i * Δx. i += (i * Δx ≤ x) # this is almost always true (so we increment by 1) # @assert (i - 1) * Δx ≤ x < i * Δx - i + i, r end # Note: evaluate_kernel_func generates a function which is callable from GPU kernels. diff --git a/src/Kernels/bspline.jl b/src/Kernels/bspline.jl index b934056..74378e1 100644 --- a/src/Kernels/bspline.jl +++ b/src/Kernels/bspline.jl @@ -98,8 +98,8 @@ function evaluate_kernel_func(g::BSplineKernelData{M}) where {M} # This can be shown using the partition of unity property. 
(; Δt,) = g function (x) - i = point_to_cell(x, Δt) # assume Δx = Δt - x′ = i - (x / Δt) # normalised coordinate, 0 < x′ ≤ 1 (this assumes Δx = Δt) + i, r = point_to_cell(x, Δt) # assume Δx = Δt + x′ = i - r # normalised coordinate, 0 < x′ ≤ 1 (this assumes Δx = Δt) # @assert 0 ≤ x′ ≤ 1 k = 2M # B-spline order values = bsplines_evaluate_all(x′, Val(k), typeof(Δt)) diff --git a/src/Kernels/gaussian.jl b/src/Kernels/gaussian.jl index a585265..9fcc326 100644 --- a/src/Kernels/gaussian.jl +++ b/src/Kernels/gaussian.jl @@ -121,7 +121,7 @@ end function evaluate_kernel_func(g::GaussianKernelData{M}) where {M} (; τ, Δx, cs,) = g @inline @fastmath function (x) - i = point_to_cell(x, Δx) + i, r = point_to_cell(x, Δx) X = x - (i - 1) * Δx # source position relative to xs[i] # @assert 0 ≤ X < i * Δx a = exp(-X^2 / τ) diff --git a/src/Kernels/kaiser_bessel.jl b/src/Kernels/kaiser_bessel.jl index b08da9f..997f169 100644 --- a/src/Kernels/kaiser_bessel.jl +++ b/src/Kernels/kaiser_bessel.jl @@ -163,10 +163,10 @@ function evaluate_fourier_func(g::KaiserBesselKernelData) end function evaluate_kernel_func(g::KaiserBesselKernelData{M, T}) where {M, T} - (; w, Δx, cs,) = g + (; Δx, cs,) = g function (x) - i = point_to_cell(x, Δx) - X = x / w - T(i - 1) / M # source position relative to xs[i] + i, r = point_to_cell(x, Δx) + X = (r - T(i - 1)) / M # source position relative to xs[i] # @assert 0 ≤ X < 1 / M values = evaluate_piecewise(X, cs) (; i, values,) diff --git a/src/Kernels/kaiser_bessel_backwards.jl b/src/Kernels/kaiser_bessel_backwards.jl index e894e1a..e6d2fd3 100644 --- a/src/Kernels/kaiser_bessel_backwards.jl +++ b/src/Kernels/kaiser_bessel_backwards.jl @@ -134,10 +134,10 @@ function evaluate_fourier_func(g::BackwardsKaiserBesselKernelData) end function evaluate_kernel_func(g::BackwardsKaiserBesselKernelData{M, T}) where {M, T} - (; w, Δx, cs,) = g + (; Δx, cs,) = g function (x) - i = point_to_cell(x, Δx) - X = x / w - T(i - 1) / M # source position relative to xs[i] + i, r = point_to_cell(x, Δx) # r = x / Δx + X = (r - T(i - 1)) / M # source position relative to xs[i] # @assert 0 ≤ X < 1 / M values = evaluate_piecewise(X, cs) (; i, values,) diff --git a/src/blocking.jl b/src/blocking.jl index e1db4c4..cb8e070 100644 --- a/src/blocking.jl +++ b/src/blocking.jl @@ -245,7 +245,7 @@ end # errors) due to numerical accuracy issues. So we first obtain the index on the # Δx grid, then we translate that to a block index by dividing by the block # dimensions (= how many Δx's in a single block). - i = Kernels.point_to_cell(x⃗[n], Δxs[n]) # index of grid cell where point is located + i, r = Kernels.point_to_cell(x⃗[n], Δxs[n]) # index of grid cell where point is located cld(i, block_dims[n]) # index of block where point is located end @inbounds LinearIndices(nblocks_per_dir)[is...] @@ -410,7 +410,7 @@ function set_points_impl!( @timeit timer "(1) Assign blocks" @inbounds for (i, x⃗) ∈ pairs(xp) # Get index of block where point x⃗ is located. y⃗ = to_unit_cell(transform(NTuple{N}(x⃗))) # converts `x⃗` to Tuple if it's an SVector - is = map(Kernels.point_to_cell, y⃗, block_sizes) + is = map(first ∘ Kernels.point_to_cell, y⃗, block_sizes) # we use first((i, r)) -> i if bd.sort_points === False() points[i] = y⃗ # copy folded point (doesn't need to be sorted) end @@ -440,7 +440,7 @@ function set_points_impl!( # We recompute the block index associated to this point. 
x⃗ = xp[i] y⃗ = to_unit_cell(transform(NTuple{N}(x⃗))) # converts `x⃗` to Tuple if it's an SVector - is = map(Kernels.point_to_cell, y⃗, block_sizes) + is = map(first ∘ Kernels.point_to_cell, y⃗, block_sizes) # we use first((i, r)) -> i n = to_linear_index[is...] # linear index of block j = cumulative_npoints_per_block[n] + blockidx[i] pointperm[j] = i diff --git a/test/near_2pi.jl b/test/near_2pi.jl index 4605ded..0b706c4 100644 --- a/test/near_2pi.jl +++ b/test/near_2pi.jl @@ -43,13 +43,13 @@ end @testset "Kernels.point_to_cell" begin Δx = 2π / 3 let x = prevfloat(L / 3) - @test Kernels.point_to_cell(x, Δx) == 1 + @test Kernels.point_to_cell(x, Δx)[1] == 1 end let x = prevfloat(2 * L / 3) - @test Kernels.point_to_cell(x, Δx) == 2 + @test Kernels.point_to_cell(x, Δx)[1] == 2 end let x = prevfloat(L) - @test Kernels.point_to_cell(x, Δx) == 3 + @test Kernels.point_to_cell(x, Δx)[1] == 3 end end end @@ -65,7 +65,7 @@ end @testset "Kernels.point_to_cell" begin Δx = T(2π) / 24 - i = Kernels.point_to_cell(x, Δx) + i, r = Kernels.point_to_cell(x, Δx) @test (i - 1) * Δx ≤ x < i * Δx end From 1f6c02a0e5cbe14fdecb45be5e00267bae772141 Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Wed, 23 Oct 2024 16:45:15 +0200 Subject: [PATCH 30/33] Remove direct evaluation functions (for now) --- src/Kernels/kaiser_bessel_backwards.jl | 35 -------------------------- 1 file changed, 35 deletions(-) diff --git a/src/Kernels/kaiser_bessel_backwards.jl b/src/Kernels/kaiser_bessel_backwards.jl index e6d2fd3..e741bbd 100644 --- a/src/Kernels/kaiser_bessel_backwards.jl +++ b/src/Kernels/kaiser_bessel_backwards.jl @@ -143,38 +143,3 @@ function evaluate_kernel_func(g::BackwardsKaiserBesselKernelData{M, T}) where {M (; i, values,) end end - -# TODO: remove? -function evaluate_kernel_direct( - g::BackwardsKaiserBesselKernelData{M, T}, i::Integer, x::T, - ) where {M, T} - (; Δx, β,) = g - # i = point_to_cell(x, Δx) - X = x / Δx - T(i - 1) # source position relative to xs[i] - # @assert 0 ≤ X < 1 - Xc = 2 * X - 1 # in [-1, 1) - L = 2M - ntuple(Val(L)) do j - h = 1 - 2 * (j - one(T)/2) / L # midpoint of interval - δ = 1 / L # half-width of interval - y = h + Xc * δ - @fastmath s = sqrt(1 - y^2) - @fastmath sinh(β * s)::T / (s * T(π)) - end -end - -function evaluate_kernel_direct( - g::BackwardsKaiserBesselKernelData{M, T}, i::Integer, n::Integer, x::T, - ) where {M, T} - (; Δx, β,) = g - # @assert 1 ≤ n ≤ 2M - X = x / Δx - T(i - 1) # source position relative to xs[i] - # @assert 0 ≤ X < 1 - Xc = 2 * X - 1 # in [-1, 1) - L = 2M - h = 1 - 2 * (n - one(T)/2) / L # midpoint of interval - δ = 1 / L # half-width of interval - y = h + Xc * δ - @fastmath s = sqrt(1 - y^2) - @fastmath sinh(β * s)::T / (s * T(π)) -end From e8ea63d76158af331eea6afd1f0de9b975990a4a Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Wed, 23 Oct 2024 17:00:03 +0200 Subject: [PATCH 31/33] Update CHANGELOG [skip ci] --- CHANGELOG.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1c8485d..b2c04c1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## Unreleased +### Added + +- Add shared-memory implementation of GPU transforms. + They are disabled by default, and can be enabled by passing `gpu_method = :shared_memory` + (default is `:global_memory`). 
+ ## [v0.5.6](https://github.com/jipolanco/NonuniformFFTs.jl/releases/tag/v0.5.6) - 2024-10-17 ### Changed From 92356d31bd8761d8696a38861335f73fb203a0a1 Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Wed, 23 Oct 2024 17:09:19 +0200 Subject: [PATCH 32/33] Add documentation --- src/plan.jl | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/src/plan.jl b/src/plan.jl index 09b3b3a..01c1674 100644 --- a/src/plan.jl +++ b/src/plan.jl @@ -98,6 +98,7 @@ The created plan contains all data needed to perform NUFFTs for non-uniform data The current default is 4096 on the CPU and around 1024 on the GPU (but depends on the number of dimensions). These may change in the future or even depend on the actual computing device. On the CPU, using block partitioning is required for running with multiple threads. + On the GPU, this option is ignored if `gpu_method = :shared_memory`. Blocking / spatial sorting can be completely disabled by passing `block_size = nothing` (but this is generally slower). @@ -108,6 +109,29 @@ The created plan contains all data needed to perform NUFFTs for non-uniform data This can improve performance if executing multiple transforms on the same non-uniform points. Note that, even when enabled, this does not modify the `points` argument passed to `set_points!`. +- `gpu_method`: allows to select between two different implementations of + GPU transforms. Possible options are: + + * `:global_memory`: directly read and write onto arrays in global memory in spreading + (type-1) and interpolation (type-2) operations; + + * `:shared_memory`: copy data between global memory and faster shared memory (memory + shared between threads/work items within a single GPU workgroup) and perform most + operations in shared-memory, which is faster and can avoid some atomic operations in + type-1 transforms. + We try to use as much shared memory as is typically available on current GPUs (which is + not much, typically 48 KiB). + But still, this method can be much faster than the `:global_memory`, and may become the + default in the future. + Note that this method completely ignores the `block_size` parameter, as the block size is adjusted + to maximise shared memory usage. + See also the `gpu_batch_size` parameter below. + + The default is currently `:global_memory` but this may change in the future. + +- `gpu_batch_size = Val(16)`: batch size used in type-1 transforms when `gpu_method = :shared_memory`. + This can have a big impact on performance. + ## Other parameters - `fftw_flags = FFTW.MEASURE`: parameters passed to the FFTW planner (only used when `backend = CPU()`). @@ -309,7 +333,7 @@ function _PlanNUFFT( block_size::Union{Integer, Dims{D}, Nothing} = default_block_size(Ns, backend), synchronise::Bool = false, gpu_method::Symbol = :global_memory, - batch_size::Val = Val(16), # currently only used in shared-memory GPU spreading + gpu_batch_size::Val = Val(16), # currently only used in shared-memory GPU spreading ) where {T <: Number, D} ks = init_wavenumbers(T, Ns) # Determine dimensions of oversampled grid. 
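For reference, a hypothetical usage of the options documented above (a sketch only: it
assumes CUDA.jl is loaded so that `CUDABackend()` is available; the remaining keywords are
the ones already used elsewhere in this package, e.g. in the tests):

    using NonuniformFFTs, CUDA

    p = PlanNUFFT(
        ComplexF64, (256, 256, 256);
        m = HalfSupport(4), σ = 1.5,
        backend = CUDABackend(),       # run transforms on the GPU
        gpu_method = :shared_memory,   # default is :global_memory
        gpu_batch_size = Val(16),      # only used by the shared-memory type-1 kernels
    )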
@@ -344,7 +368,7 @@ function _PlanNUFFT( else block_dims = get_block_dims(Ñs, block_size) if backend isa GPU - blocks = BlockDataGPU(T, backend, block_dims, Ñs, h, sort_points; method = gpu_method, batch_size,) + blocks = BlockDataGPU(T, backend, block_dims, Ñs, h, sort_points; method = gpu_method, batch_size = gpu_batch_size,) else blocks = BlockData(T, block_dims, Ñs, h, num_transforms, sort_points) FFTW.set_num_threads(Threads.nthreads()) From c9311e05da638fa9627a70c5bc749d99f0280aaf Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Wed, 23 Oct 2024 21:25:52 +0200 Subject: [PATCH 33/33] Update docs and comments --- CHANGELOG.md | 4 ++-- src/blocking.jl | 9 ++++----- src/interpolation/gpu.jl | 1 - src/plan.jl | 28 ++++++++++++++-------------- src/spreading/gpu.jl | 22 +++++++++++++++++----- 5 files changed, 37 insertions(+), 27 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b2c04c1..3a449c7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,8 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added -- Add shared-memory implementation of GPU transforms. - They are disabled by default, and can be enabled by passing `gpu_method = :shared_memory` +- Add alternative implementation of GPU transforms based on shared-memory arrays. + This is disabled by default, and can be enabled by passing `gpu_method = :shared_memory` (default is `:global_memory`). ## [v0.5.6](https://github.com/jipolanco/NonuniformFFTs.jl/releases/tag/v0.5.6) - 2024-10-17 diff --git a/src/blocking.jl b/src/blocking.jl index cb8e070..df8ee49 100644 --- a/src/blocking.jl +++ b/src/blocking.jl @@ -63,14 +63,14 @@ type_length(::Type{<:NTuple{N}}) where {N} = N # ================================================================================ # # Determine block size if using the shared-memory implementation. -# We try to make sure that the total block size (including 2M ghost cells in each direction) +# We try to make sure that the total block size (including 2M - 1 ghost cells in each direction) # is not larger than the available shared memory. In CUDA the limit is usually 48 KiB. # Note that the result is a compile-time constant (on Julia 1.11.1 at least). @inline function block_dims_gpu_shmem(::Type{Z}, ::Dims{D}, ::HalfSupport{M}, ::Val{Np}) where {Z <: Number, D, M, Np} T = real(Z) # These are extra shared-memory needs in spreading kernel (see spread_from_points_shmem_kernel!). - # Here Np is the batch size (batch_size parameter). - shmem_needs = sizeof(T) * ( + # Here Np is the batch size (gpu_batch_size parameter). + base_shmem_required = sizeof(T) * ( 2M * D * Np + # window_vals D * Np + # points_sm D * Np # inds_start @@ -82,8 +82,7 @@ type_length(::Type{<:NTuple{N}}) where {N} = N 3 + # buf_sm D # ishifts_sm ) - free_shmem = shmem_needs # how much memory to leave free for other allocations - max_shmem_size = (48 << 10) - free_shmem # 48 KiB -- TODO: make this depend on the actual GPU? + max_shmem_size = (48 << 10) - base_shmem_required # 48 KiB -- TODO: make this depend on the actual GPU? max_block_length = max_shmem_size ÷ sizeof(Z) # maximum number of elements in a block m = floor(Int, max_block_length^(1/D)) # block size in each direction (including ghost cells / padding) n = m - (2M - 1) # exclude ghost cells diff --git a/src/interpolation/gpu.jl b/src/interpolation/gpu.jl index fd0e449..bf9722a 100644 --- a/src/interpolation/gpu.jl +++ b/src/interpolation/gpu.jl @@ -307,7 +307,6 @@ end end # Copy values from global to shared memory. 
-# - can we optimise memory access patterns? @inline function gridvalues_to_local_memory!( u_local::AbstractArray{T, D}, u_global::AbstractArray{T, D}, diff --git a/src/plan.jl b/src/plan.jl index 01c1674..3b71c78 100644 --- a/src/plan.jl +++ b/src/plan.jl @@ -109,28 +109,28 @@ The created plan contains all data needed to perform NUFFTs for non-uniform data This can improve performance if executing multiple transforms on the same non-uniform points. Note that, even when enabled, this does not modify the `points` argument passed to `set_points!`. -- `gpu_method`: allows to select between two different implementations of +- `gpu_method`: allows to select between different implementations of GPU transforms. Possible options are: * `:global_memory`: directly read and write onto arrays in global memory in spreading (type-1) and interpolation (type-2) operations; - * `:shared_memory`: copy data between global memory and faster shared memory (memory - shared between threads/work items within a single GPU workgroup) and perform most - operations in shared-memory, which is faster and can avoid some atomic operations in - type-1 transforms. - We try to use as much shared memory as is typically available on current GPUs (which is - not much, typically 48 KiB). - But still, this method can be much faster than the `:global_memory`, and may become the - default in the future. - Note that this method completely ignores the `block_size` parameter, as the block size is adjusted - to maximise shared memory usage. - See also the `gpu_batch_size` parameter below. + * `:shared_memory`: copy data between global memory and shared memory (local + to each GPU workgroup) and perform most operations in the latter, which is faster and + can help avoid some atomic operations in type-1 transforms. We try to use as much shared + memory as is typically available on current GPUs (which is not much, typically 48 KiB on + CUDA). But still, this method can be much faster than the `:global_memory` and may + become the default in the future. Note that this method completely ignores the + `block_size` parameter, as the actual block size is adjusted to maximise shared memory + usage. The performance of type-1 transforms can be further adjusted using the + `gpu_batch_size` parameter described below. - The default is currently `:global_memory` but this may change in the future. + The default is `:global_memory` but this may change in the future. - `gpu_batch_size = Val(16)`: batch size used in type-1 transforms when `gpu_method = :shared_memory`. - This can have a big impact on performance. + The idea is that, to avoid atomic operations on shared-memory arrays, we process + non-uniform points in batches of `Np` points (16 by default). + This can have a large impact on performance. ## Other parameters diff --git a/src/spreading/gpu.jl b/src/spreading/gpu.jl index dbc58d1..4545528 100644 --- a/src/spreading/gpu.jl +++ b/src/spreading/gpu.jl @@ -279,12 +279,22 @@ end @synchronize # make sure shared memory is fully set to zero + # We operate in batches of up to `Np` non-uniform points / sources. + # The aim is to completely avoid atomic operations on shared memory: instead of + # parallelising over non-uniform points (1 point = 1 thread), we make all threads + # work on the same set of points, so that, in the end, all threads have the required + # information to spread point values onto `u_local`. 
That spreading operation is + # done by parallelising over the local grid of dimensions M^D (so that every thread + # writes to a different part of the array), instead of parallelising over the + # non-uniform points which would require atomics. + # The first batch deals with points (a + 1):min(a + Np, b) @inbounds for batch_begin in buf_sm[1]:Np:(buf_sm[2] - 1) batch_size = min(Np, buf_sm[2] - batch_begin) # current batch size buf_sm[3] = batch_size - # Iterate over points in the batch (ideally 1 thread per point). - # Each thread writes to shared memory. + + # (1) Iterate over points in the batch. + # Each active thread writes to shared memory. @inbounds for p in threadidx:nthreads:batch_size # Spread from point j i = batch_begin + p # index of non-uniform point @@ -301,8 +311,8 @@ end @synchronize - # Now evaluate windows associated to each point. - local inds = CartesianIndices((1:buf_sm[3], 1:D)) # parallelise over dimensions + points + # (2) Evaluate window functions around each non-uniform point. + inds = CartesianIndices((1:buf_sm[3], 1:D)) # parallelise over dimensions + points @inbounds for n in threadidx:nthreads:length(inds) p, d = Tuple(inds[n]) g = gs[d] @@ -318,7 +328,8 @@ end @synchronize # make sure all threads have the same shared data - # Now all threads spread together onto shared memory + # (3) All threads spread together onto shared memory, avoiding all collisions + # and thus not requiring atomic operations. @inbounds for p in 1:buf_sm[3] local istart = ntuple(d -> @inbounds(inds_start[d, p]), Val(D)) local v = vp_sm[p] @@ -329,6 +340,7 @@ end @synchronize # make sure we have finished writing to shared memory + # Add values from shared memory onto global memory. @inbounds if buf_sm[1] < buf_sm[2] # skip this step if there were actually no points in this block (if a == b) add_from_local_to_global_memory!( us[c], u_local, Ns, ishifts_sm, Val(M);