From 11e566e62c4cb8ee3869ba5eb8b123000aab2f72 Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Thu, 19 Sep 2024 16:32:06 +0200 Subject: [PATCH 01/10] Adapt Hilbert sorting from BijectiveHilbert.jl --- src/NonuniformFFTs.jl | 1 + src/sorting_hilbert.jl | 135 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 136 insertions(+) create mode 100644 src/sorting_hilbert.jl diff --git a/src/NonuniformFFTs.jl b/src/NonuniformFFTs.jl index 1ce94dc..2d430bb 100644 --- a/src/NonuniformFFTs.jl +++ b/src/NonuniformFFTs.jl @@ -64,6 +64,7 @@ end default_workgroupsize(::GPU, ndrange::Dims{1}) = (min(64, ndrange[1]),) include("sorting.jl") +include("sorting_hilbert.jl") include("blocking.jl") include("plan.jl") include("set_points.jl") diff --git a/src/sorting_hilbert.jl b/src/sorting_hilbert.jl new file mode 100644 index 0000000..e519c80 --- /dev/null +++ b/src/sorting_hilbert.jl @@ -0,0 +1,135 @@ +# Code adapted from the BijectiveHilbert.jl package by Andrew Dolgert (MIT license). +# +# Hilbert sorting is a method for sorting a set of points in n-dimensional space so that +# points which are physically close are also close in memory. See for instance: +# +# - https://en.wikipedia.org/wiki/Hilbert_curve +# - https://charlesreid1.com/wiki/Hilbert_Sort +# - https://doc.cgal.org/latest/Spatial_sorting/index.html +# - https://computingkitchen.com/BijectiveHilbert.jl/stable/hilbert/ +# +# We define a GlobalGrayStatic{T,B} type adapted from the BijectiveHilbert.GlobalGray{T} +# type. The differences are that GlobalGrayStatic contains the number of bits `B` as a +# static parameter, and that it simply does not include the number +# of dimensions `n`. Instead, `n` is inferred from the input to `encode_hilbert_zero`, which +# should be a (statically-sized) `Tuple` instead of a `Vector` as in BijectiveHilbert.jl. +# +# Moreover, functions for computing the Hilbert index have been adapted for working with +# tuples and have no allocations. The performance is greatly improved compared to the +# original GlobalGray type. Our code can also be called from GPU kernels, since it only uses +# tuples and MVectors. +# +# We only implement encoding as we currently don't need decoding. +# +# See also https://computingkitchen.com/BijectiveHilbert.jl/stable/globalgray/#Global-Gray +# for some details on the algorithm. + +""" + GlobalGrayStatic(T <: Unsigned, B::Int) + GlobalGrayStatic(B::Int, N::Int) + +Hilbert sorting algorithm adapted from the `GlobalGray` algorithm in BijectiveHilbert.jl. + +Here `T <: Unsigned` is the type of the Hilbert index and `B` is the required number of +bits per dimension. For `B` bits, the algorithm creates a grid of size ``2^B`` in each +direction. + +The second constructor chooses the smallest unsigned type `T` based on the required number +of bits `B` and the space dimension `N`. If they are constants, they will likely be +constant-propagated so that `T` is inferred by the compiler. + +The total number of bits required is `nbits = B * N`. For instance, if `N = 3` and `B = 4` +(corresponding to a grid of dimensions ``(2^4)^3 = 16^3``), then `nbits = 12` and the +smallest possible type `T` is `UInt16`, whose size is `8 * sizeof(UInt16) = 16` bits (as its +name suggests!). 
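+
+A minimal usage sketch following the example above (the grid size and index type are the
+same as in that example; the call is illustrative, not verified output):
+
+    alg = GlobalGrayStatic(4, 3)          # B = 4 bits per dimension, N = 3 dimensions
+    eltype(alg) === UInt16                # smallest unsigned type holding 4 * 3 = 12 bits
+    encode_hilbert_zero(alg, (0, 2, 15))  # Hilbert index of the zero-based cell (0, 2, 15)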
+""" +struct GlobalGrayStatic{T, B} end + +Base.eltype(::GlobalGrayStatic{T}) where {T} = T + +GlobalGrayStatic(::Type{T}, B::Int) where {T} = GlobalGrayStatic{T, B}() + +Base.@constprop :aggressive function GlobalGrayStatic(B::Int, N::Int) + nbits = B * N # minimal number of bits needed + T = large_enough_unsigned_static(Val(nbits)) + GlobalGrayStatic{T, B}() +end + +@inline function large_enough_unsigned_static(::Val{nbits}) where {nbits} + if @generated + unsigned_types = (UInt8, UInt16, UInt32, UInt64, UInt128) + for T ∈ unsigned_types + if 8 * sizeof(T) ≥ nbits + return :($T) + end + end + :(nothing) + else + unsigned_types = (UInt8, UInt16, UInt32, UInt64, UInt128) + for T ∈ unsigned_types + if 8 * sizeof(T) ≥ nbits + return T + end + end + nothing + end +end + +""" + encode_hilbert_zero(algorithm::GlobalGrayStatic{T}, X::NTuple{N, <:Integer}) -> T + +Return Hilbert index associated to location `X` in `N`-dimensional space. +""" +function encode_hilbert_zero(::GlobalGrayStatic{T, B}, X::NTuple) where {T, B} + X = axes_to_transpose(X, Val(B)) + interleave_transpose(T, X, Val(B))::T +end + +function axes_to_transpose(X_in::NTuple{N,T}, ::Val{b}) where {N, b, T <: Integer} + X = MVector(X_in) + M = one(T) << (b - 1) + # Inverse undo + Q = M + @inbounds while Q > one(T) + P = Q - one(T) + for io ∈ 1:N + if !iszero(X[io] & Q) + X[1] ⊻= P + else + t = (X[1] ⊻ X[io]) & P + X[1] ⊻= t + X[io] ⊻= t + end + end + Q >>= one(Q) + end + # Gray encode + for jo ∈ 2:N + @inbounds X[jo] ⊻= X[jo - 1] + end + t2 = zero(T) + Q = M + @inbounds while Q > one(T) + if !iszero(X[N] & Q) + t2 ⊻= (Q - one(T)) + end + Q >>= one(T) + end + for ko ∈ eachindex(X) + @inbounds X[ko] ⊻= t2 + end + Tuple(X) +end + +function interleave_transpose(::Type{T}, x::NTuple, ::Val{b}) where {T, b} + N = length(x) + S = eltype(x) + h = zero(T) + @inbounds for i ∈ 0:(b - 1) + for d ∈ eachindex(x) + ith_bit = Bool((x[d] & (one(S) << i)) >> i) # 0 or 1 + h |= T(ith_bit << (i * N + N - d)) + end + end + h +end From e7e05e798fea8cf388ad344b9bda88f07156de07 Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Fri, 20 Sep 2024 09:32:18 +0200 Subject: [PATCH 02/10] Implement BlockDataGPU --- src/NonuniformFFTs.jl | 12 +- src/blocking.jl | 221 ++++++++++++++++++++++++++-- src/interpolation/cpu_blocked.jl | 5 +- src/interpolation/cpu_nonblocked.jl | 1 + src/interpolation/gpu.jl | 1 + src/plan.jl | 21 ++- src/set_points.jl | 2 +- src/sorting_hilbert.jl | 13 +- src/spreading/cpu_blocked.jl | 4 +- src/spreading/cpu_nonblocked.jl | 1 + src/spreading/gpu.jl | 1 + 11 files changed, 243 insertions(+), 39 deletions(-) diff --git a/src/NonuniformFFTs.jl b/src/NonuniformFFTs.jl index 2d430bb..6e3b7e8 100644 --- a/src/NonuniformFFTs.jl +++ b/src/NonuniformFFTs.jl @@ -146,11 +146,7 @@ function exec_type1!(ûs_k::NTuple{C, AbstractArray{<:Complex}}, p::PlanNUFFT, end @timeit timer "(1) Spreading" begin - if with_blocking(blocks) - spread_from_points_blocked!(backend, kernels, blocks, us, points, vp) # CPU-only - else - spread_from_points!(backend, kernels, us, points, vp) # single-threaded case? Also GPU case. - end + spread_from_points!(backend, blocks, kernels, us, points, vp) KA.synchronize(backend) end @@ -249,11 +245,7 @@ function exec_type2!(vp::NTuple{C, AbstractVector}, p::PlanNUFFT, ûs_k::NTuple end @timeit timer "(3) Interpolation" begin - if with_blocking(blocks) - interpolate_blocked!(kernels, blocks, vp, us, points) # CPU-onlu - else - interpolate!(backend, kernels, vp, us, points) # single-threaded case? Also GPU case. 
- end + interpolate!(backend, blocks, kernels, vp, us, points) KA.synchronize(backend) end end diff --git a/src/blocking.jl b/src/blocking.jl index 961d6ad..ef65a0a 100644 --- a/src/blocking.jl +++ b/src/blocking.jl @@ -15,16 +15,14 @@ end # Here the element type of `xp` can be either an NTuple{N, <:Real}, an SVector{N, <:Real}, # or anything else which has length `N`. -function set_points!(::NullBlockData, points::StructVector, xp, timer; transform::F = identity) where {F <: Function} - # TODO: avoid implicit copy in resize!? (can be costly, especially on GPU) - length(points) == length(xp) || resize!(points, length(xp)) +function set_points!(backend, ::NullBlockData, points::StructVector, xp, timer; transform::F = identity) where {F <: Function} + length(points) == length(xp) || resize_no_copy!(points, length(xp)) Base.require_one_based_indexing(points) @timeit timer "(1) Copy + fold" begin # NOTE: we explicitly iterate through StructVector components because CUDA.jl # currently fails when implicitly writing to a StructArray (compilation fails, # tested on Julia 1.11-rc3 and CUDA.jl v5.4.3). points_comp = StructArrays.components(points) - backend = KA.get_backend(points_comp[1]) kernel! = copy_points_unblocked_kernel!(backend) kernel!(transform, points_comp, xp; ndrange = size(xp)) KA.synchronize(backend) # mostly to get accurate timings @@ -51,6 +49,205 @@ type_length(::Type{<:NTuple{N}}) where {N} = N # ================================================================================ # +# Maximum number of bits allowed for Hilbert sort. +# Hilbert sorting divides the domain in a grid of size N^d, where `d` is the dimension and +# N = 2^b. Here `b` is the number of bits used in the algorithm. +# This means that `b = 16` corresponds to a grid of dimensions 65536 *in each direction*, +# which is far larger than anything we will ever need in practice. +# (Also note that the Hilbert "grid" is generally coarser than the actual NUFFT grid where +# values are spread and interpolated.) +const MAX_BITS_HILBERT_SORT = 16 + +# GPU implementation +struct BlockDataGPU{ + PermVector <: AbstractVector{<:Integer}, + SortPoints <: StaticBool, + } <: AbstractBlockData + pointperm :: PermVector + nbits_hilbert :: Int # number of bits required in Hilbert sorting + sort_points :: SortPoints + + BlockDataGPU(pointperm::P, nbits, sort::S) where {P, S} = + new{P, S}(pointperm, nbits, sort) +end + +with_blocking(::BlockDataGPU) = true + +# In the case of Hilbert sorting, what we call "block" corresponds to the size of the +# minimal box in the Hilbert curve algorithm. +function BlockDataGPU(backend::KA.Backend, block_dims::Dims{D}, Ñs::Dims{D}, sort_points) where {D} + # We use a Hilbert sorting algorithm which requires the same number of blocks in each + # direction. In the best case, blocks will have the size required by `block_dims`. If + # that's not possible, we will prefer to use a smaller block size (so more blocks). + # This makes sense if the input block size was tuned as proportional to some memory + # limit (e.g. shared memory size on GPUs). + nblocks_per_dir_wanted = map(Ñs, block_dims) do N, B + cld(N, B) # ≥ 1 + end + nblocks = max(nblocks_per_dir_wanted...) 
# this makes the actual block size ≤ the wanted block size + nbits = exponent(nblocks - 1) + 1 + @assert UInt(2)^(nbits - 1) < nblocks ≤ UInt(2)^nbits # verify that we got the right value of nbits + nbits = min(nbits, MAX_BITS_HILBERT_SORT) # just in case; in practice nbits < MAX_BITS_HILBERT_SORT all the time + pointperm = KA.allocate(backend, Int, 0) # stores permutation indices + BlockDataGPU(pointperm, nbits, sort_points) +end + +function set_points!( + backend::GPU, bd::BlockDataGPU, points::StructVector, xp, timer; + transform::F = identity, + ) where {F <: Function} + + # Total number of bits required to hold a Hilbert index. + # The index type T must be larger than this size. + # nbits_index = N * nbits_hilbert + + # Recursively iterate over possible values of nbits_hilbert to avoid type instability. + # We start by nbits = 1, and increase until nbits == bd.nbits_hilbert. + @assert bd.nbits_hilbert ≤ MAX_BITS_HILBERT_SORT + _set_points_hilbert!(Val(1), backend, bd, points, xp, timer, transform) + + # if nbits_index ≤ 8 * sizeof(UInt8) + # _set_points!(UInt8, backend, bd, points, xp, timer, transform) + # elseif nbits_index ≤ 8 * sizeof(UInt16) + # _set_points!(UInt16, backend, bd, points, xp, timer, transform) + # elseif nbits_index ≤ 8 * sizeof(UInt32) + # _set_points!(UInt32, backend, bd, points, xp, timer, transform) + # else + # _set_points!(UInt64, backend, bd, points, xp, timer, transform) + # end + + nothing +end + +@inline function _set_points_hilbert!(::Val{nbits}, backend, bd, points, xp, args...) where {nbits} + if nbits > 16 # this is to avoid the compiler from exploding if it thinks the recursion is infinite + nothing + elseif nbits == bd.nbits_hilbert + N = type_length(eltype(xp)) # = number of dimensions + alg = GlobalGrayStatic(nbits, N) # should be inferred, since nbits is statically known + _set_points_hilbert!(alg, backend, bd, points, xp, args...) # call method defined further below + else + _set_points_hilbert!(Val(nbits + 1), backend, bd, points, xp, args...) # keep iterating + end +end + +function _set_points_hilbert!( + sortalg::HilbertSortingAlgorithm, backend, bd::BlockDataGPU, + points::StructVector, xp, timer, transform::F, + ) where {F} + (; pointperm, sort_points,) = bd + + B = nbits(sortalg) # static value + nblocks = 1 << B # same number of blocks in all directions (= 2^B) + + Np = length(xp) + resize_no_copy!(pointperm, Np) + resize_no_copy!(points, Np) + @assert eachindex(points) == eachindex(xp) + + # Allocate temporary array for holding Hilbert indices (manually deallocated later) + T = eltype(sortalg) + inds = KA.zeros(backend, T, Np) + + # We avoid passing a StructVector to the kernel, so we pass `points` as a tuple of + # vectors. The kernel might fail if `xp` is also a StructVector, which is not imposed + # nor disallowed. + points_comp = StructArrays.components(points) + + ndrange = size(points) + groupsize = default_workgroupsize(backend, ndrange) + kernel! = hilbert_sort_kernel!(backend, groupsize, ndrange) + @timeit timer "(1) Hilbert encoding" begin + kernel!(inds, points_comp, xp, sortalg, nblocks, sort_points, transform) + KA.synchronize(backend) + end + + # Compute permutation needed to sort Hilbert indices. + @timeit timer "(2) Sortperm" begin + sortperm!(pointperm, inds) + KA.synchronize(backend) + end + KA.unsafe_free!(inds) + + # `pointperm` now contains the permutation needed to sort points + if sort_points === True() + @timeit timer "(3) Permute points" let + local kernel! 
= permute_kernel!(backend, groupsize, ndrange) + kernel!(points_comp, xp, pointperm, transform) + KA.synchronize(backend) + resize!(pointperm, 0) # free some memory (we won't need it anymore) + end + end + + nothing +end + +# This kernel may do multiple things at once: +# - Compute Hilbert index associated to a point +# - Copy point from `xp` to `points`, after transformations and folding, if sort_points === False(). +@kernel function hilbert_sort_kernel!( + inds::AbstractVector{<:Unsigned}, + points::NTuple, + @Const(xp), + @Const(sortalg::HilbertSortingAlgorithm), + @Const(nblocks::Integer), # TODO: can we pass this as a compile-time constant? (Val) + @Const(sort_points), + @Const(transform::F), + ) where {F} + I = @index(Global, Linear) + @inbounds x⃗ = xp[I] + y⃗ = to_unit_cell(transform(Tuple(x⃗))) :: NTuple + T = eltype(y⃗) + @assert T <: AbstractFloat + + L = T(2) * π + block_size = L / nblocks + + is = map(y⃗) do y + i = Kernels.point_to_cell(y, block_size) + i - one(i) # Hilbert sorting requires zero-based indices + end + + @inbounds inds[I] = encode_hilbert_zero(sortalg, is) # compute Hilbert index for sorting + + # If points need to be sorted, then we fill `points` some time later (in permute_kernel!). + if sort_points === False() + for n ∈ eachindex(x⃗) + @inbounds points[n][I] = y⃗[n] + end + end + + nothing +end + +@kernel function permute_kernel!( + points::NTuple, + @Const(xp::AbstractVector), + @Const(perm::AbstractVector{<:Integer}), + @Const(transform::F), + ) where {F} + i = @index(Global, Linear) + j = @inbounds perm[i] + x⃗ = @inbounds xp[j] + y⃗ = to_unit_cell(transform(Tuple(x⃗))) :: NTuple # note: we perform the transform + folding twice per point... + for n ∈ eachindex(x⃗) + @inbounds points[n][i] = y⃗[n] + end + nothing +end + +# Resize vector trying to avoid copy when N is larger than the original length. +# In other words, we completely discard the original contents of x, which is not the +# original behaviour of resize!. This can save us some device-to-device copies. +function resize_no_copy!(x, N) + resize!(x, 0) + resize!(x, N) + x +end + +# ================================================================================ # + +# CPU only struct BlockData{ T, N, Nc, Tr, # = real(T) @@ -82,7 +279,7 @@ function BlockData( @assert N - M > 0 min(B, N ÷ 2, N - M) end - dims = block_dims .+ 2M # include padding for values outside of block + dims = block_dims .+ 2M # include padding for values outside of block (TODO: include padding in original block_dims?) Tr = real(T) block_sizes = map(Ñs, block_dims) do N, B @inline @@ -108,11 +305,9 @@ function BlockData( ) end -# Blocking is considered to be disabled if there are no allocated buffers. -with_blocking(bd::BlockData) = !isempty(bd.buffers) - -function set_points!(bd::BlockData, points::StructVector, xp, timer; transform::F = identity) where {F <: Function} - with_blocking(bd) || return set_points!(NullBlockData(), points, xp, timer) +function set_points!(::CPU, bd::BlockData, points::StructVector, xp, timer; transform::F = identity) where {F <: Function} + # This technically never happens, but we might use it as a way to disable blocking. 
+ isempty(bd.buffers) && return set_points!(NullBlockData(), points, xp, timer) (; indices, cumulative_npoints_per_block, blockidx, pointperm, block_sizes,) = bd N = type_length(eltype(xp)) # = number of dimensions @@ -120,9 +315,9 @@ function set_points!(bd::BlockData, points::StructVector, xp, timer; transform:: fill!(cumulative_npoints_per_block, 0) to_linear_index = LinearIndices(axes(indices)) # maps Cartesian to linear index of a block Np = length(xp) - resize!(blockidx, Np) - resize!(pointperm, Np) - resize!(points, Np) + resize_no_copy!(blockidx, Np) + resize_no_copy!(pointperm, Np) + resize_no_copy!(points, Np) @timeit timer "(1) Assign blocks" @inbounds for (i, x⃗) ∈ pairs(xp) # Get index of block where point x⃗ is located. diff --git a/src/interpolation/cpu_blocked.jl b/src/interpolation/cpu_blocked.jl index e8eed94..d225c25 100644 --- a/src/interpolation/cpu_blocked.jl +++ b/src/interpolation/cpu_blocked.jl @@ -89,9 +89,10 @@ function interpolate_from_arrays_blocked( end end -function interpolate_blocked!( - gs, +function interpolate!( + backend::CPU, bd::BlockData, + gs, vp_all::NTuple{C, AbstractVector}, us::NTuple{C, AbstractArray}, x⃗s::AbstractArray, diff --git a/src/interpolation/cpu_nonblocked.jl b/src/interpolation/cpu_nonblocked.jl index 1eaf54c..4ae52ca 100644 --- a/src/interpolation/cpu_nonblocked.jl +++ b/src/interpolation/cpu_nonblocked.jl @@ -1,5 +1,6 @@ function interpolate!( backend::CPU, + ::NullBlockData, gs, vp_all::NTuple{C, AbstractVector}, us::NTuple{C, AbstractArray}, diff --git a/src/interpolation/gpu.jl b/src/interpolation/gpu.jl index e9c5893..6ec829d 100644 --- a/src/interpolation/gpu.jl +++ b/src/interpolation/gpu.jl @@ -41,6 +41,7 @@ end function interpolate!( backend::GPU, + bd::Union{BlockDataGPU, NullBlockData}, gs, vp_all::NTuple{C, AbstractVector}, us::NTuple{C, AbstractArray}, diff --git a/src/plan.jl b/src/plan.jl index fdbb137..238c3c6 100644 --- a/src/plan.jl +++ b/src/plan.jl @@ -88,12 +88,12 @@ The created plan contains all data needed to perform NUFFTs for non-uniform data ## Performance parameters -- `block_size = 4096`: the linear block size (in number of elements) when using block partitioning. +- `block_size = 4096`: the linear block size (in number of elements) when using block partitioning + or when sorting is enabled. This can be tuned for maximal performance. Using block partitioning is required for running with multiple threads. Blocking can be completely disabled by passing `block_size = nothing` (but this is generally slower, even when running on a single thread). - This parameter is ignored in GPU implementations. - `sort_points = False()`: whether to internally permute the order of the non-uniform points. This can be enabled by passing `sort_points = True()`. @@ -247,7 +247,10 @@ Return the number of datasets which are simultaneously transformed by a plan. """ ntransforms(::PlanNUFFT{T, N, Nc}) where {T, N, Nc} = Nc -default_block_size() = 4096 # in number of linear elements +default_block_size(::CPU) = 4096 # in number of linear elements + +# TODO: adapt this based on size of shared memory and on element type T (and padding 2M)? 
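+# Illustration of how the GPU block size translates into a Hilbert grid resolution in
+# BlockDataGPU (hypothetical numbers; assumes get_block_dims returns roughly cubic blocks):
+#
+#     Ñs = (256, 256, 256)                      # oversampled grid
+#     block_dims = (16, 16, 16)                 # ≈ block_size = 4096 linear elements
+#     nblocks = maximum(cld.(Ñs, block_dims))   # = 16 blocks in each direction
+#     nbits = exponent(nblocks - 1) + 1         # = 4 → Hilbert grid of 2⁴ = 16 cells per direction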
+default_block_size(::GPU) = 4096 function get_block_dims(Ñs::Dims, bsize::Int) d = length(Ñs) @@ -270,9 +273,9 @@ function _PlanNUFFT( timer = TimerOutput(), fftw_flags = FFTW.MEASURE, fftshift = false, - block_size::Union{Integer, Nothing} = default_block_size(), sort_points::StaticBool = False(), backend::KA.Backend = CPU(), + block_size::Union{Integer, Nothing} = default_block_size(backend), ) where {T <: Number, D} ks = init_wavenumbers(T, Ns) # Determine dimensions of oversampled grid. @@ -301,13 +304,17 @@ function _PlanNUFFT( foreach(init_fourier_coefficients!, kernel_data, ks) end points = StructVector(ntuple(_ -> KA.allocate(backend, Tr, 0), Val(D))) # empty vector of points - if block_size === nothing || backend isa GPU + if block_size === nothing blocks = NullBlockData() # disable blocking (→ can't use multithreading when spreading) backend isa CPU && FFTW.set_num_threads(1) # also disable FFTW threading (avoids allocations) else block_dims = get_block_dims(Ñs, block_size) - blocks = BlockData(T, block_dims, Ñs, h, num_transforms, sort_points) - backend isa CPU && FFTW.set_num_threads(Threads.nthreads()) + if backend isa GPU + blocks = BlockDataGPU(backend, block_dims, Ñs, sort_points) + else + blocks = BlockData(T, block_dims, Ñs, h, num_transforms, sort_points) + FFTW.set_num_threads(Threads.nthreads()) + end end plan_kwargs = backend isa CPU ? (flags = fftw_flags,) : (;) nufft_data = init_plan_data(T, backend, Ñs, ks, num_transforms; plan_kwargs) diff --git a/src/set_points.jl b/src/set_points.jl index 2e3fc65..d34d64a 100644 --- a/src/set_points.jl +++ b/src/set_points.jl @@ -32,7 +32,7 @@ function set_points!(p::PlanNUFFT, xp::AbstractVector; kwargs...) (; points, timer,) = p N = ndims(p) type_length(eltype(xp)) == N || throw(DimensionMismatch(lazy"expected $N-dimensional points")) - @timeit timer "Set points" set_points!(p.blocks, points, xp, timer; kwargs...) + @timeit timer "Set points" set_points!(p.backend, p.blocks, points, xp, timer; kwargs...) p end diff --git a/src/sorting_hilbert.jl b/src/sorting_hilbert.jl index e519c80..4902204 100644 --- a/src/sorting_hilbert.jl +++ b/src/sorting_hilbert.jl @@ -24,8 +24,13 @@ # See also https://computingkitchen.com/BijectiveHilbert.jl/stable/globalgray/#Global-Gray # for some details on the algorithm. +abstract type HilbertSortingAlgorithm{T, B} end + +Base.eltype(::HilbertSortingAlgorithm{T}) where {T} = T +nbits(::HilbertSortingAlgorithm{T, B}) where {T, B} = B + """ - GlobalGrayStatic(T <: Unsigned, B::Int) + GlobalGrayStatic(T <: Unsigned, B::Int) <: HilbertSortingAlgorithm{T, B} GlobalGrayStatic(B::Int, N::Int) Hilbert sorting algorithm adapted from the `GlobalGray` algorithm in BijectiveHilbert.jl. @@ -43,9 +48,7 @@ The total number of bits required is `nbits = B * N`. For instance, if `N = 3` a smallest possible type `T` is `UInt16`, whose size is `8 * sizeof(UInt16) = 16` bits (as its name suggests!). """ -struct GlobalGrayStatic{T, B} end - -Base.eltype(::GlobalGrayStatic{T}) where {T} = T +struct GlobalGrayStatic{T, B} <: HilbertSortingAlgorithm{T, B} end GlobalGrayStatic(::Type{T}, B::Int) where {T} = GlobalGrayStatic{T, B}() @@ -79,6 +82,8 @@ end encode_hilbert_zero(algorithm::GlobalGrayStatic{T}, X::NTuple{N, <:Integer}) -> T Return Hilbert index associated to location `X` in `N`-dimensional space. + +Values in `X` usually correspond to indices on a grid. This function takes indices which start at 0 (!!). 
""" function encode_hilbert_zero(::GlobalGrayStatic{T, B}, X::NTuple) where {T, B} X = axes_to_transpose(X, Val(B)) diff --git a/src/spreading/cpu_blocked.jl b/src/spreading/cpu_blocked.jl index 695c5d2..8f90c7e 100644 --- a/src/spreading/cpu_blocked.jl +++ b/src/spreading/cpu_blocked.jl @@ -80,10 +80,10 @@ function fill_with_zeros_serial!(us_all::NTuple{C, A}) where {C, A <: DenseArray us_all end -function spread_from_points_blocked!( +function spread_from_points!( ::CPU, - gs, bd::BlockData, + gs, us_all::NTuple{C, AbstractArray}, xp::AbstractVector, vp_all::NTuple{C, AbstractVector}, diff --git a/src/spreading/cpu_nonblocked.jl b/src/spreading/cpu_nonblocked.jl index 2cacc67..bc81a07 100644 --- a/src/spreading/cpu_nonblocked.jl +++ b/src/spreading/cpu_nonblocked.jl @@ -39,6 +39,7 @@ end function spread_from_points!( ::CPU, + ::NullBlockData, # no blocking gs, us_all::NTuple{C, AbstractArray}, x⃗s::AbstractVector, diff --git a/src/spreading/gpu.jl b/src/spreading/gpu.jl index cdc6ee7..bf97903 100644 --- a/src/spreading/gpu.jl +++ b/src/spreading/gpu.jl @@ -109,6 +109,7 @@ end # We assume all arrays are already on the GPU. function spread_from_points!( backend::GPU, + bd::Union{BlockDataGPU, NullBlockData}, gs, us_all::NTuple{C, AbstractGPUArray}, x⃗s::StructVector, From e9d06b48069a513af0200b9767685471b490bfc8 Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Fri, 20 Sep 2024 10:19:25 +0200 Subject: [PATCH 03/10] Use permutations in spreading --- src/spreading/gpu.jl | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/src/spreading/gpu.jl b/src/spreading/gpu.jl index bf97903..1c5b2f5 100644 --- a/src/spreading/gpu.jl +++ b/src/spreading/gpu.jl @@ -3,12 +3,27 @@ us::NTuple{C}, @Const(points::NTuple{D}), @Const(vp::NTuple{C}), + @Const(pointperm), + @Const(sort_points::StaticBool), evaluate::NTuple{D, <:Function}, # can't be marked Const for some reason to_indices::NTuple{D, <:Function}, ) where {C, D} i = @index(Global, Linear) - x⃗ = map(xs -> @inbounds(xs[i]), points) - v⃗ = map(v -> @inbounds(v[i]), vp) + + j = if pointperm === nothing + i + else + @inbounds pointperm[i] + end + + i_x = if sort_points === True() + i # points have already been sorted, so we access them contiguously (should be faster, once the permutation has been done) + else + j # don't access points contiguously in memory, but spread onto a localised region in space (and GPU memory?) + end + + x⃗ = map(xs -> @inbounds(xs[i_x]), points) + v⃗ = map(v -> @inbounds(v[j]), vp) # Determine grid dimensions. Z = eltype(v⃗) @@ -135,11 +150,19 @@ function spread_from_points!( map(u -> reinterpret(T, u), us_all) # note: we don't use reshape, so the first dimension has 2x elements end - # TODO: use dynamically sized kernel? (to avoid recompilation, since number of points may change from one call to another) + pointperm = get_pointperm(bd) # nothing in case of NullBlockData + sort_points = get_sort_points(bd)::StaticBool # False in the case of NullBlockData + + if pointperm !== nothing + @assert eachindex(pointperm) == eachindex(x⃗s) + end + + # TODO: use dynamically sized kernel? (to avoid recompilation, since number of points + # may change from one call to another) ndrange = size(x⃗s) # iterate through points workgroupsize = default_workgroupsize(backend, ndrange) kernel! 
= spread_from_point_naive_kernel!(backend, workgroupsize, ndrange) - kernel!(us_real, xs_comp, vp_all, evaluate, to_indices) + kernel!(us_real, xs_comp, vp_all, pointperm, sort_points, evaluate, to_indices) us_all end From e063e6edbec8346b0bbcb1e59f8a1f9ed3b1fa9d Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Fri, 20 Sep 2024 10:25:27 +0200 Subject: [PATCH 04/10] Update interpolation code --- src/blocking.jl | 25 +++++++++---------------- src/interpolation/gpu.jl | 28 +++++++++++++++++++++++++--- 2 files changed, 34 insertions(+), 19 deletions(-) diff --git a/src/blocking.jl b/src/blocking.jl index ef65a0a..6f90e86 100644 --- a/src/blocking.jl +++ b/src/blocking.jl @@ -4,6 +4,10 @@ abstract type AbstractBlockData end struct NullBlockData <: AbstractBlockData end with_blocking(::NullBlockData) = false +# For now these are only used in the GPU implementation +get_pointperm(::NullBlockData) = nothing +get_sort_points(::NullBlockData) = False() + @kernel function copy_points_unblocked_kernel!(@Const(transform::F), points::NTuple, @Const(xp)) where {F} I = @index(Global, Linear) @inbounds x⃗ = xp[I] @@ -92,35 +96,24 @@ function BlockDataGPU(backend::KA.Backend, block_dims::Dims{D}, Ñs::Dims{D}, so BlockDataGPU(pointperm, nbits, sort_points) end +get_pointperm(bd::BlockDataGPU) = bd.pointperm +get_sort_points(bd::BlockDataGPU) = bd.sort_points + function set_points!( backend::GPU, bd::BlockDataGPU, points::StructVector, xp, timer; transform::F = identity, ) where {F <: Function} - # Total number of bits required to hold a Hilbert index. - # The index type T must be larger than this size. - # nbits_index = N * nbits_hilbert - # Recursively iterate over possible values of nbits_hilbert to avoid type instability. # We start by nbits = 1, and increase until nbits == bd.nbits_hilbert. @assert bd.nbits_hilbert ≤ MAX_BITS_HILBERT_SORT _set_points_hilbert!(Val(1), backend, bd, points, xp, timer, transform) - # if nbits_index ≤ 8 * sizeof(UInt8) - # _set_points!(UInt8, backend, bd, points, xp, timer, transform) - # elseif nbits_index ≤ 8 * sizeof(UInt16) - # _set_points!(UInt16, backend, bd, points, xp, timer, transform) - # elseif nbits_index ≤ 8 * sizeof(UInt32) - # _set_points!(UInt32, backend, bd, points, xp, timer, transform) - # else - # _set_points!(UInt64, backend, bd, points, xp, timer, transform) - # end - nothing end @inline function _set_points_hilbert!(::Val{nbits}, backend, bd, points, xp, args...) where {nbits} - if nbits > 16 # this is to avoid the compiler from exploding if it thinks the recursion is infinite + if nbits > MAX_BITS_HILBERT_SORT # this is to avoid the compiler from exploding if it thinks the recursion is infinite nothing elseif nbits == bd.nbits_hilbert N = type_length(eltype(xp)) # = number of dimensions @@ -167,6 +160,7 @@ function _set_points_hilbert!( sortperm!(pointperm, inds) KA.synchronize(backend) end + KA.unsafe_free!(inds) # `pointperm` now contains the permutation needed to sort points @@ -175,7 +169,6 @@ function _set_points_hilbert!( local kernel! 
= permute_kernel!(backend, groupsize, ndrange) kernel!(points_comp, xp, pointperm, transform) KA.synchronize(backend) - resize!(pointperm, 0) # free some memory (we won't need it anymore) end end diff --git a/src/interpolation/gpu.jl b/src/interpolation/gpu.jl index 6ec829d..1955ea6 100644 --- a/src/interpolation/gpu.jl +++ b/src/interpolation/gpu.jl @@ -5,12 +5,27 @@ using StaticArrays: MVector vp::NTuple{C}, @Const(points::NTuple{D}), @Const(us::NTuple{C}), + @Const(pointperm), + @Const(sort_points::StaticBool), @Const(Δxs::NTuple{D}), # grid step in each direction (oversampled grid) evaluate::NTuple{D, <:Function}, # can't be marked Const for some reason to_indices::NTuple{D, <:Function}, ) where {C, D} i = @index(Global, Linear) - x⃗ = map(xs -> @inbounds(xs[i]), points) + + j = if pointperm === nothing + i + else + @inbounds pointperm[i] + end + + i_x = if sort_points === True() + i # points have already been sorted, so we access them contiguously (should be faster, once the permutation has been done) + else + j # don't access points contiguously in memory, but interpolate in a localised region in space (and GPU memory?) + end + + x⃗ = map(xs -> @inbounds(xs[i_x]), points) # Determine grid dimensions. # Unlike in spreading, here `us` can be made of arrays of complex numbers, because we @@ -33,7 +48,7 @@ using StaticArrays: MVector v⃗ = interpolate_from_arrays_gpu(us, inds, vals) for (dst, v) ∈ zip(vp, v⃗) - @inbounds dst[i] = v + @inbounds dst[j] = v end nothing @@ -56,11 +71,18 @@ function interpolate!( xs_comp = StructArrays.components(x⃗s) Δxs = map(Kernels.gridstep, gs) + pointperm = get_pointperm(bd) # nothing in case of NullBlockData + sort_points = get_sort_points(bd)::StaticBool # False in the case of NullBlockData + + if pointperm !== nothing + @assert eachindex(pointperm) == eachindex(x⃗s) + end + # TODO: use dynamically sized kernel? (to avoid recompilation, since number of points may change from one call to another) ndrange = size(x⃗s) # iterate through points workgroupsize = default_workgroupsize(backend, ndrange) kernel! 
= interpolate_to_point_naive_kernel!(backend, workgroupsize, ndrange) - kernel!(vp_all, xs_comp, us, Δxs, evaluate, to_indices) + kernel!(vp_all, xs_comp, us, pointperm, sort_points, Δxs, evaluate, to_indices) vp_all end From 034574276461a5e7c1622db7bf3302ceeb66d374 Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Fri, 20 Sep 2024 10:25:43 +0200 Subject: [PATCH 05/10] Test more parameter combinations --- test/pseudo_gpu.jl | 30 ++++++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/test/pseudo_gpu.jl b/test/pseudo_gpu.jl index 0175f43..3d3f791 100644 --- a/test/pseudo_gpu.jl +++ b/test/pseudo_gpu.jl @@ -53,6 +53,23 @@ function run_plan(p::PlanNUFFT, xp_init::AbstractArray, vp_init::AbstractVector) set_points!(p, xp) + save_points_sorted = false # this can be useful for verifying Hilbert sorting graphically + + if backend isa PseudoGPU && save_points_sorted + inds = NonuniformFFTs.get_pointperm(p.blocks) + if inds !== nothing + open("points_sorted.dat", "w") do io + for i ∈ inds + x⃗ = xp[i] + for x ∈ x⃗ + print(io, "\t", x) + end + print(io, "\n") + end + end + end + end + T = eltype(p) # this is actually defined in AbstractNFFTs; it represents the type in Fourier space (always complex) @test T <: Complex dims = size(p) @@ -65,7 +82,7 @@ function run_plan(p::PlanNUFFT, xp_init::AbstractArray, vp_init::AbstractVector) (; backend, us, wp,) end -function compare_with_cpu(::Type{T}, dims; Np = prod(dims)) where {T <: Number} +function compare_with_cpu(::Type{T}, dims; Np = prod(dims), kws...) where {T <: Number} # Generate some non-uniform random data on the CPU rng = Xoshiro(42) N = length(dims) # number of dimensions @@ -73,7 +90,7 @@ function compare_with_cpu(::Type{T}, dims; Np = prod(dims)) where {T <: Number} xp_init = [rand(rng, SVector{N, Tr}) * Tr(2π) for _ ∈ 1:Np] # non-uniform points in [0, 2π]ᵈ vp_init = randn(rng, T, Np) # random values at points - params = (; m = HalfSupport(4), kernel = KaiserBesselKernel(), σ = 1.5,) + params = (; m = HalfSupport(4), kernel = KaiserBesselKernel(), σ = 1.5, kws...) p_cpu = PlanNUFFT(T, dims; params..., backend = CPU()) p_gpu = PlanNUFFT(T, dims; params..., backend = PseudoGPU()) @@ -87,9 +104,14 @@ function compare_with_cpu(::Type{T}, dims; Np = prod(dims)) where {T <: Number} end @testset "GPU implementation (using CPU)" begin - types = (Float32, ComplexF32) dims = (35, 64, 40) - @testset "T = $T" for T ∈ types + @testset "T = $T" for T ∈ (Float32, ComplexF32) compare_with_cpu(T, dims) end + @testset "sort_points = $sort_points" for sort_points ∈ (False(), True()) + compare_with_cpu(Float64, dims; sort_points) + end + @testset "No blocking" begin # Hilbert sorting disabled + compare_with_cpu(ComplexF64, dims; block_size = nothing) + end end From 42026596b5a74d727a2de2af66e2564425cd7666 Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Fri, 20 Sep 2024 10:30:03 +0200 Subject: [PATCH 06/10] Update docs --- src/plan.jl | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/plan.jl b/src/plan.jl index 238c3c6..a275640 100644 --- a/src/plan.jl +++ b/src/plan.jl @@ -92,11 +92,14 @@ The created plan contains all data needed to perform NUFFTs for non-uniform data or when sorting is enabled. This can be tuned for maximal performance. Using block partitioning is required for running with multiple threads. - Blocking can be completely disabled by passing `block_size = nothing` (but this is - generally slower, even when running on a single thread). 
+ On the GPU, this will perform spatial sorting using a Hilbert curve algorithm, whose + minimal scale is proportional to the value of `block_size`. + Blocking / spatial sorting can be completely disabled by passing `block_size = nothing` (but this is + generally slower). - `sort_points = False()`: whether to internally permute the order of the non-uniform points. This can be enabled by passing `sort_points = True()`. + Ignored when `block_size = nothing` (which disables spatial sorting). In this case, more time will be spent in [`set_points!`](@ref) and less time on the actual transforms. This can improve performance if executing multiple transforms on the same non-uniform points. Note that, even when enabled, this does not modify the `points` argument passed to `set_points!`. From ab79b29dd80a37539bf3b2db23839688329d5c6f Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Fri, 20 Sep 2024 11:29:54 +0200 Subject: [PATCH 07/10] GPU: also sort non-uniform values... ...if sort_points = True(). This slightly improves spreading performance (not much). --- src/blocking.jl | 1 + src/interpolation/gpu.jl | 36 ++++++++++++++++++++++++++---------- src/plan.jl | 8 +++++--- src/spreading/gpu.jl | 35 ++++++++++++++++++++++++++--------- 4 files changed, 58 insertions(+), 22 deletions(-) diff --git a/src/blocking.jl b/src/blocking.jl index 6f90e86..9618b96 100644 --- a/src/blocking.jl +++ b/src/blocking.jl @@ -21,6 +21,7 @@ end # or anything else which has length `N`. function set_points!(backend, ::NullBlockData, points::StructVector, xp, timer; transform::F = identity) where {F <: Function} length(points) == length(xp) || resize_no_copy!(points, length(xp)) + KA.synchronize(backend) Base.require_one_based_indexing(points) @timeit timer "(1) Copy + fold" begin # NOTE: we explicitly iterate through StructVector components because CUDA.jl diff --git a/src/interpolation/gpu.jl b/src/interpolation/gpu.jl index 1955ea6..791e265 100644 --- a/src/interpolation/gpu.jl +++ b/src/interpolation/gpu.jl @@ -6,7 +6,6 @@ using StaticArrays: MVector @Const(points::NTuple{D}), @Const(us::NTuple{C}), @Const(pointperm), - @Const(sort_points::StaticBool), @Const(Δxs::NTuple{D}), # grid step in each direction (oversampled grid) evaluate::NTuple{D, <:Function}, # can't be marked Const for some reason to_indices::NTuple{D, <:Function}, @@ -19,13 +18,7 @@ using StaticArrays: MVector @inbounds pointperm[i] end - i_x = if sort_points === True() - i # points have already been sorted, so we access them contiguously (should be faster, once the permutation has been done) - else - j # don't access points contiguously in memory, but interpolate in a localised region in space (and GPU memory?) - end - - x⃗ = map(xs -> @inbounds(xs[i_x]), points) + x⃗ = map(xs -> @inbounds(xs[j]), points) # Determine grid dimensions. 
# Unlike in spreading, here `us` can be made of arrays of complex numbers, because we @@ -44,7 +37,6 @@ using StaticArrays: MVector geval.values .* Δx end - # v⃗ = @inline interpolate_from_arrays(us, inds, vals) v⃗ = interpolate_from_arrays_gpu(us, inds, vals) for (dst, v) ∈ zip(vp, v⃗) @@ -78,11 +70,25 @@ function interpolate!( @assert eachindex(pointperm) == eachindex(x⃗s) end + if sort_points === True() + vp_sorted = map(similar, vp_all) # allocate temporary arrays for sorted non-uniform data + pointperm_ = nothing # we don't need permutations in interpolation kernel (all accesses to non-uniform data will be contiguous) + else + vp_sorted = vp_all + pointperm_ = pointperm + end + # TODO: use dynamically sized kernel? (to avoid recompilation, since number of points may change from one call to another) ndrange = size(x⃗s) # iterate through points workgroupsize = default_workgroupsize(backend, ndrange) kernel! = interpolate_to_point_naive_kernel!(backend, workgroupsize, ndrange) - kernel!(vp_all, xs_comp, us, pointperm, sort_points, Δxs, evaluate, to_indices) + kernel!(vp_sorted, xs_comp, us, pointperm_, Δxs, evaluate, to_indices) + + if sort_points === True() + kernel_perm! = interp_permute_kernel!(backend, workgroupsize, ndrange) + kernel_perm!(vp_all, vp_sorted, pointperm) + foreach(KA.unsafe_free!, vp_sorted) # manually deallocate temporary arrays + end vp_all end @@ -139,3 +145,13 @@ end Tuple(vs) end end + +# This applies the *inverse* permutation by switching i ↔ j indices (opposite of spread_permute_kernel!). +@kernel function interp_permute_kernel!(vp::NTuple{N}, @Const(vp_in::NTuple{N}), @Const(perm::AbstractVector)) where {N} + i = @index(Global, Linear) + j = @inbounds perm[i] + for n ∈ 1:N + @inbounds vp[n][j] = vp_in[n][i] + end + nothing +end diff --git a/src/plan.jl b/src/plan.jl index a275640..7224d3e 100644 --- a/src/plan.jl +++ b/src/plan.jl @@ -88,10 +88,12 @@ The created plan contains all data needed to perform NUFFTs for non-uniform data ## Performance parameters -- `block_size = 4096`: the linear block size (in number of elements) when using block partitioning +- `block_size::Int`: the linear block size (in number of elements) when using block partitioning or when sorting is enabled. This can be tuned for maximal performance. - Using block partitioning is required for running with multiple threads. + The current defaults are 4096 (CPU) and 1024 (GPU), but these may change in the future or + even depend on the actual computing device. + On the CPU, using block partitioning is required for running with multiple threads. On the GPU, this will perform spatial sorting using a Hilbert curve algorithm, whose minimal scale is proportional to the value of `block_size`. Blocking / spatial sorting can be completely disabled by passing `block_size = nothing` (but this is @@ -253,7 +255,7 @@ ntransforms(::PlanNUFFT{T, N, Nc}) where {T, N, Nc} = Nc default_block_size(::CPU) = 4096 # in number of linear elements # TODO: adapt this based on size of shared memory and on element type T (and padding 2M)? 
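# Sketch of the permutation convention used by the new spread_permute_kernel! /
# interp_permute_kernel! pair (hypothetical data; a plain-Julia equivalent of what the
# kernels do for each component):
#
#     perm = [3, 1, 2]                     # sorted point i corresponds to original point perm[i]
#     v = [10.0, 20.0, 30.0]               # non-uniform values in original order
#     v_sorted = v[perm]                   # spread_permute_kernel!: vp[i] = vp_in[perm[i]]
#     w = similar(v); w[perm] = v_sorted   # interp_permute_kernel!: vp[perm[i]] = vp_in[i]
#     w == v                               # true: the two kernels apply inverse permutations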
-default_block_size(::GPU) = 4096 +default_block_size(::GPU) = 1024 # a bit faster than 4096 on A100 (with 256³ oversampled grid) function get_block_dims(Ñs::Dims, bsize::Int) d = length(Ñs) diff --git a/src/spreading/gpu.jl b/src/spreading/gpu.jl index 1c5b2f5..75dca82 100644 --- a/src/spreading/gpu.jl +++ b/src/spreading/gpu.jl @@ -4,7 +4,6 @@ @Const(points::NTuple{D}), @Const(vp::NTuple{C}), @Const(pointperm), - @Const(sort_points::StaticBool), evaluate::NTuple{D, <:Function}, # can't be marked Const for some reason to_indices::NTuple{D, <:Function}, ) where {C, D} @@ -16,13 +15,7 @@ @inbounds pointperm[i] end - i_x = if sort_points === True() - i # points have already been sorted, so we access them contiguously (should be faster, once the permutation has been done) - else - j # don't access points contiguously in memory, but spread onto a localised region in space (and GPU memory?) - end - - x⃗ = map(xs -> @inbounds(xs[i_x]), points) + x⃗ = map(xs -> @inbounds(xs[j]), points) v⃗ = map(v -> @inbounds(v[j]), vp) # Determine grid dimensions. @@ -161,8 +154,32 @@ function spread_from_points!( # may change from one call to another) ndrange = size(x⃗s) # iterate through points workgroupsize = default_workgroupsize(backend, ndrange) + + if sort_points === True() + vp_sorted = map(similar, vp_all) # allocate temporary arrays for sorted non-uniform data + kernel_perm! = spread_permute_kernel!(backend, workgroupsize, ndrange) + kernel_perm!(vp_sorted, vp_all, pointperm) + pointperm_ = nothing # we don't need any further permutations (all accesses to non-uniform data will be contiguous) + else + vp_sorted = vp_all + pointperm_ = pointperm + end + kernel! = spread_from_point_naive_kernel!(backend, workgroupsize, ndrange) - kernel!(us_real, xs_comp, vp_all, pointperm, sort_points, evaluate, to_indices) + kernel!(us_real, xs_comp, vp_sorted, pointperm_, evaluate, to_indices) + + if sort_points === True() + foreach(KA.unsafe_free!, vp_sorted) # manually deallocate temporary arrays + end us_all end + +@kernel function spread_permute_kernel!(vp::NTuple{N}, @Const(vp_in::NTuple{N}), @Const(perm::AbstractVector)) where {N} + i = @index(Global, Linear) + j = @inbounds perm[i] + for n ∈ 1:N + @inbounds vp[n][i] = vp_in[n][j] + end + nothing +end From 4f68830f76ea9886b85e1ba7c36dc1b3e907a4ff Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Fri, 20 Sep 2024 11:56:57 +0200 Subject: [PATCH 08/10] Use dynamically sized kernels Doesn't seem to affect performance. --- src/NonuniformFFTs.jl | 9 ++------- src/blocking.jl | 10 +++++----- src/interpolation/gpu.jl | 11 ++++++----- src/spreading/gpu.jl | 12 ++++++------ 4 files changed, 19 insertions(+), 23 deletions(-) diff --git a/src/NonuniformFFTs.jl b/src/NonuniformFFTs.jl index 6e3b7e8..b29a681 100644 --- a/src/NonuniformFFTs.jl +++ b/src/NonuniformFFTs.jl @@ -55,13 +55,8 @@ function default_workgroupsize(backend, ndrange::Dims) KA.default_cpu_workgroupsize(ndrange) end -# Reducing the number of threads to 64 for 1D GPU kernels seems to improve performance -# (tested on Nvidia A100). 1D kernels are those iterating over non-uniform points, i.e. -# spreading and interpolation, which usually dominate performance. -# TODO: this seems to be only true when data is randomly located and sorting is not -# performed. With sorting, it looks like larger workgroups are faster, so we should revert -# this when sorting on GPU is implemented. 
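# For reference, this commit replaces statically sized KernelAbstractions launches by
# dynamically sized ones; an illustrative sketch (generic kernel name, not from this package):
#
#     kernel! = some_kernel!(backend, workgroupsize, ndrange)   # static: sizes fixed at construction
#     kernel!(args...)
#     kernel! = some_kernel!(backend)                           # dynamic: sizes passed at call time
#     kernel!(args...; workgroupsize, ndrange)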
-default_workgroupsize(::GPU, ndrange::Dims{1}) = (min(64, ndrange[1]),) +# Case of 1D kernels on the GPU (typically, kernels which iterate over non-uniform points). +default_workgroupsize(::GPU, ndrange::Dims{1}) = (min(512, ndrange[1]),) include("sorting.jl") include("sorting_hilbert.jl") diff --git a/src/blocking.jl b/src/blocking.jl index 9618b96..ab54354 100644 --- a/src/blocking.jl +++ b/src/blocking.jl @@ -149,10 +149,10 @@ function _set_points_hilbert!( points_comp = StructArrays.components(points) ndrange = size(points) - groupsize = default_workgroupsize(backend, ndrange) - kernel! = hilbert_sort_kernel!(backend, groupsize, ndrange) + workgroupsize = default_workgroupsize(backend, ndrange) + kernel! = hilbert_sort_kernel!(backend) @timeit timer "(1) Hilbert encoding" begin - kernel!(inds, points_comp, xp, sortalg, nblocks, sort_points, transform) + kernel!(inds, points_comp, xp, sortalg, nblocks, sort_points, transform; workgroupsize, ndrange) KA.synchronize(backend) end @@ -167,8 +167,8 @@ function _set_points_hilbert!( # `pointperm` now contains the permutation needed to sort points if sort_points === True() @timeit timer "(3) Permute points" let - local kernel! = permute_kernel!(backend, groupsize, ndrange) - kernel!(points_comp, xp, pointperm, transform) + local kernel! = permute_kernel!(backend) + kernel!(points_comp, xp, pointperm, transform; ndrange, workgroupsize) KA.synchronize(backend) end end diff --git a/src/interpolation/gpu.jl b/src/interpolation/gpu.jl index 791e265..b140773 100644 --- a/src/interpolation/gpu.jl +++ b/src/interpolation/gpu.jl @@ -78,15 +78,16 @@ function interpolate!( pointperm_ = pointperm end - # TODO: use dynamically sized kernel? (to avoid recompilation, since number of points may change from one call to another) + # We use dynamically sized kernels to avoid recompilation, since number of points may + # change from one call to another. ndrange = size(x⃗s) # iterate through points workgroupsize = default_workgroupsize(backend, ndrange) - kernel! = interpolate_to_point_naive_kernel!(backend, workgroupsize, ndrange) - kernel!(vp_sorted, xs_comp, us, pointperm_, Δxs, evaluate, to_indices) + kernel! = interpolate_to_point_naive_kernel!(backend) + kernel!(vp_sorted, xs_comp, us, pointperm_, Δxs, evaluate, to_indices; workgroupsize, ndrange) if sort_points === True() - kernel_perm! = interp_permute_kernel!(backend, workgroupsize, ndrange) - kernel_perm!(vp_all, vp_sorted, pointperm) + kernel_perm! = interp_permute_kernel!(backend) + kernel_perm!(vp_all, vp_sorted, pointperm; workgroupsize, ndrange) foreach(KA.unsafe_free!, vp_sorted) # manually deallocate temporary arrays end diff --git a/src/spreading/gpu.jl b/src/spreading/gpu.jl index 75dca82..74014ad 100644 --- a/src/spreading/gpu.jl +++ b/src/spreading/gpu.jl @@ -150,23 +150,23 @@ function spread_from_points!( @assert eachindex(pointperm) == eachindex(x⃗s) end - # TODO: use dynamically sized kernel? (to avoid recompilation, since number of points - # may change from one call to another) + # We use dynamically sized kernels to avoid recompilation, since number of points may + # change from one call to another. ndrange = size(x⃗s) # iterate through points workgroupsize = default_workgroupsize(backend, ndrange) if sort_points === True() vp_sorted = map(similar, vp_all) # allocate temporary arrays for sorted non-uniform data - kernel_perm! = spread_permute_kernel!(backend, workgroupsize, ndrange) - kernel_perm!(vp_sorted, vp_all, pointperm) + kernel_perm! 
= spread_permute_kernel!(backend) + kernel_perm!(vp_sorted, vp_all, pointperm; workgroupsize, ndrange) pointperm_ = nothing # we don't need any further permutations (all accesses to non-uniform data will be contiguous) else vp_sorted = vp_all pointperm_ = pointperm end - kernel! = spread_from_point_naive_kernel!(backend, workgroupsize, ndrange) - kernel!(us_real, xs_comp, vp_sorted, pointperm_, evaluate, to_indices) + kernel! = spread_from_point_naive_kernel!(backend) + kernel!(us_real, xs_comp, vp_sorted, pointperm_, evaluate, to_indices; workgroupsize, ndrange) if sort_points === True() foreach(KA.unsafe_free!, vp_sorted) # manually deallocate temporary arrays From 5e9ab75973627966e3a4132362d32cbf77d135bf Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Fri, 20 Sep 2024 12:13:09 +0200 Subject: [PATCH 09/10] Try to fix tests --- src/abstractNFFTs.jl | 2 +- test/accuracy.jl | 4 ++-- test/multidimensional.jl | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/abstractNFFTs.jl b/src/abstractNFFTs.jl index 4a07d1a..3c20cdd 100644 --- a/src/abstractNFFTs.jl +++ b/src/abstractNFFTs.jl @@ -91,7 +91,7 @@ Base.@constprop :aggressive function PlanNUFFT( m, σ, reltol = AbstractNFFTs.accuracyParams(; kwargs...) backend = KA.get_backend(xp) # e.g. use GPU backend if xp is a GPU array sort_points = sortNodes ? True() : False() # this is type-unstable (unless constant propagation happens) - block_size = blocking ? default_block_size() : nothing # also type-unstable + block_size = blocking ? default_block_size(backend) : nothing # also type-unstable kernel = window isa AbstractKernel ? window : convert_window_function(window) p = PlanNUFFT(Complex{Tr}, Ns, HalfSupport(m); backend, σ = Tr(σ), sort_points, fftshift, block_size, kernel, fftw_flags = fftflags) AbstractNFFTs.nodes!(p, xp) diff --git a/test/accuracy.jl b/test/accuracy.jl index 2aec8cc..cc83fcb 100644 --- a/test/accuracy.jl +++ b/test/accuracy.jl @@ -95,7 +95,7 @@ function test_nufft_type1_1d( Np = 2 * N, m = HalfSupport(8), σ = 1.25, - block_size = NonuniformFFTs.default_block_size(), + block_size = NonuniformFFTs.default_block_size(CPU()), ) where {T <: Number} if T <: Real Tr = T @@ -154,7 +154,7 @@ function test_nufft_type2_1d( Np = 2 * N, m = HalfSupport(8), σ = 1.25, - block_size = NonuniformFFTs.default_block_size(), + block_size = NonuniformFFTs.default_block_size(CPU()), ) where {T <: Number} if T <: Real Tr = T diff --git a/test/multidimensional.jl b/test/multidimensional.jl index e4c1b83..4ead8f1 100644 --- a/test/multidimensional.jl +++ b/test/multidimensional.jl @@ -32,7 +32,7 @@ function test_nufft_type1( Np = 2 * first(Ns), m = HalfSupport(8), σ = 1.25, - block_size = NonuniformFFTs.default_block_size(), + block_size = NonuniformFFTs.default_block_size(CPU()), sort_points = False(), ) where {T <: Number} Tr = real(T) @@ -79,7 +79,7 @@ function test_nufft_type2( Np = 2 * first(Ns), m = HalfSupport(8), σ = 1.25, - block_size = NonuniformFFTs.default_block_size(), + block_size = NonuniformFFTs.default_block_size(CPU()), sort_points = False(), ) where {T <: Number} Tr = real(T) From 2fc6487bb30291cdc77a07d9db4c3bcbe3a7f1b5 Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Fri, 20 Sep 2024 15:15:16 +0200 Subject: [PATCH 10/10] Fix JET tests? 
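Taken together, the options exercised by the tests above can be combined as in the
following sketch (parameter values follow compare_with_cpu; the data setup mimics the
tests, and the comments describe expected behaviour rather than verified output):

    using NonuniformFFTs, StaticArrays
    xp = [rand(SVector{3, Float64}) * 2π for _ ∈ 1:1000]  # non-uniform points in [0, 2π)³
    vp = randn(ComplexF64, length(xp))                     # values at those points
    p = PlanNUFFT(ComplexF64, (35, 64, 40); m = HalfSupport(4), kernel = KaiserBesselKernel(),
                  σ = 1.5, sort_points = True(), block_size = 1024)
    set_points!(p, xp)  # with a GPU backend, this is where the Hilbert sort happens
    # ... then exec_type1!(ûs, p, vp) / exec_type2!(vp, p, ûs) proceed as before,
    # using the stored point permutation when spreading and interpolating.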
--- src/blocking.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/blocking.jl b/src/blocking.jl index ab54354..e7b355e 100644 --- a/src/blocking.jl +++ b/src/blocking.jl @@ -273,7 +273,7 @@ function BlockData( @assert N - M > 0 min(B, N ÷ 2, N - M) end - dims = block_dims .+ 2M # include padding for values outside of block (TODO: include padding in original block_dims?) + dims = block_dims .+ 2M # include padding for values outside of block (TODO: include padding in original block_dims? requires minimal block_size in each direction) Tr = real(T) block_sizes = map(Ñs, block_dims) do N, B @inline @@ -299,9 +299,9 @@ function BlockData( ) end -function set_points!(::CPU, bd::BlockData, points::StructVector, xp, timer; transform::F = identity) where {F <: Function} +function set_points!(backend::CPU, bd::BlockData, points::StructVector, xp, timer; transform::F = identity) where {F <: Function} # This technically never happens, but we might use it as a way to disable blocking. - isempty(bd.buffers) && return set_points!(NullBlockData(), points, xp, timer) + isempty(bd.buffers) && return set_points!(backend, NullBlockData(), points, xp, timer; transform) (; indices, cumulative_npoints_per_block, blockidx, pointperm, block_sizes,) = bd N = type_length(eltype(xp)) # = number of dimensions