Use dynamically sized kernels
Doesn't seem to affect performance.
jipolanco committed Sep 20, 2024
1 parent ab79b29 commit 4f68830
Showing 4 changed files with 19 additions and 23 deletions.
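For context, here is a minimal, self-contained sketch of the two KernelAbstractions.jl launch styles this commit switches between. The kernel and the sample data are illustrative, not part of NonuniformFFTs.jl. In the statically sized style, `ndrange` and `workgroupsize` are baked into the kernel instance at construction, so a different number of points yields a new kernel type and triggers recompilation; in the dynamically sized style they are passed as keyword arguments at launch, so one compiled kernel is reused across calls.

```julia
using KernelAbstractions
const KA = KernelAbstractions

# Illustrative kernel (not from the package): scale one array into another.
@kernel function scale_kernel!(ys, @Const(xs), α)
    i = @index(Global, Linear)
    @inbounds ys[i] = α * xs[i]
end

backend = CPU()  # on GPU: CUDABackend(), ROCBackend(), ...
xs = rand(Float32, 10_000)
ys = similar(xs)
ndrange = size(xs)
workgroupsize = (min(512, ndrange[1]),)

# Statically sized (the old style): sizes fixed at kernel construction,
# so changing ndrange between calls forces recompilation.
kernel_static! = scale_kernel!(backend, workgroupsize, ndrange)
kernel_static!(ys, xs, 2.0f0)

# Dynamically sized (this commit's style): sizes are launch-time keywords,
# so the same compiled kernel is reused when the number of points changes.
kernel_dynamic! = scale_kernel!(backend)
kernel_dynamic!(ys, xs, 2.0f0; workgroupsize, ndrange)

KA.synchronize(backend)
```

As the commit message notes, the switch doesn't seem to affect runtime performance; the motivation is avoiding recompilation when the number of non-uniform points changes between calls.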
src/NonuniformFFTs.jl (2 additions & 7 deletions)

```diff
@@ -55,13 +55,8 @@ function default_workgroupsize(backend, ndrange::Dims)
     KA.default_cpu_workgroupsize(ndrange)
 end
 
-# Reducing the number of threads to 64 for 1D GPU kernels seems to improve performance
-# (tested on Nvidia A100). 1D kernels are those iterating over non-uniform points, i.e.
-# spreading and interpolation, which usually dominate performance.
-# TODO: this seems to be only true when data is randomly located and sorting is not
-# performed. With sorting, it looks like larger workgroups are faster, so we should revert
-# this when sorting on GPU is implemented.
-default_workgroupsize(::GPU, ndrange::Dims{1}) = (min(64, ndrange[1]),)
+# Case of 1D kernels on the GPU (typically, kernels which iterate over non-uniform points).
+default_workgroupsize(::GPU, ndrange::Dims{1}) = (min(512, ndrange[1]),)
 
 include("sorting.jl")
 include("sorting_hilbert.jl")
```
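As a quick illustration of the heuristic above, a hypothetical REPL session (`GPU` is the abstract supertype of GPU backends in KernelAbstractions.jl, so any concrete GPU backend, here CUDA, dispatches to this method):

```julia
# Hypothetical check of the 1D GPU heuristic; outputs follow from min(512, ndrange[1]).
julia> default_workgroupsize(CUDABackend(), (100_000,))
(512,)

julia> default_workgroupsize(CUDABackend(), (100,))  # fewer points than 512
(100,)
```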
src/blocking.jl (5 additions & 5 deletions)

```diff
@@ -149,10 +149,10 @@ function _set_points_hilbert!(
     points_comp = StructArrays.components(points)
 
     ndrange = size(points)
-    groupsize = default_workgroupsize(backend, ndrange)
-    kernel! = hilbert_sort_kernel!(backend, groupsize, ndrange)
+    workgroupsize = default_workgroupsize(backend, ndrange)
+    kernel! = hilbert_sort_kernel!(backend)
     @timeit timer "(1) Hilbert encoding" begin
-        kernel!(inds, points_comp, xp, sortalg, nblocks, sort_points, transform)
+        kernel!(inds, points_comp, xp, sortalg, nblocks, sort_points, transform; workgroupsize, ndrange)
         KA.synchronize(backend)
     end
 
@@ -167,8 +167,8 @@ function _set_points_hilbert!(
     # `pointperm` now contains the permutation needed to sort points
     if sort_points === True()
         @timeit timer "(3) Permute points" let
-            local kernel! = permute_kernel!(backend, groupsize, ndrange)
-            kernel!(points_comp, xp, pointperm, transform)
+            local kernel! = permute_kernel!(backend)
+            kernel!(points_comp, xp, pointperm, transform; ndrange, workgroupsize)
             KA.synchronize(backend)
         end
     end
```
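For reference, a permutation kernel of the kind launched above might look like the following sketch. `permute_sketch!` is illustrative only; the package's actual `permute_kernel!` presumably also applies the coordinate `transform` seen in its argument list.

```julia
using KernelAbstractions

# Illustrative sketch (not the package's implementation): gather `src` into
# `dst` according to a precomputed permutation, so that later kernels access
# the point data contiguously.
@kernel function permute_sketch!(dst, @Const(src), @Const(perm))
    i = @index(Global, Linear)
    @inbounds dst[i] = src[perm[i]]
end

# Dynamically sized launch, matching the style in the diff above:
# kernel! = permute_sketch!(backend)
# kernel!(dst, src, perm; ndrange = size(src), workgroupsize)
```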
src/interpolation/gpu.jl (6 additions & 5 deletions)

```diff
@@ -78,15 +78,16 @@ function interpolate!(
         pointperm_ = pointperm
     end
 
-    # TODO: use dynamically sized kernel? (to avoid recompilation, since number of points may change from one call to another)
+    # We use dynamically sized kernels to avoid recompilation, since number of points may
+    # change from one call to another.
     ndrange = size(x⃗s)  # iterate through points
     workgroupsize = default_workgroupsize(backend, ndrange)
-    kernel! = interpolate_to_point_naive_kernel!(backend, workgroupsize, ndrange)
-    kernel!(vp_sorted, xs_comp, us, pointperm_, Δxs, evaluate, to_indices)
+    kernel! = interpolate_to_point_naive_kernel!(backend)
+    kernel!(vp_sorted, xs_comp, us, pointperm_, Δxs, evaluate, to_indices; workgroupsize, ndrange)
 
     if sort_points === True()
-        kernel_perm! = interp_permute_kernel!(backend, workgroupsize, ndrange)
-        kernel_perm!(vp_all, vp_sorted, pointperm)
+        kernel_perm! = interp_permute_kernel!(backend)
+        kernel_perm!(vp_all, vp_sorted, pointperm; workgroupsize, ndrange)
         foreach(KA.unsafe_free!, vp_sorted)  # manually deallocate temporary arrays
     end
 
```
src/spreading/gpu.jl (6 additions & 6 deletions)

```diff
@@ -150,23 +150,23 @@ function spread_from_points!(
         @assert eachindex(pointperm) == eachindex(x⃗s)
     end
 
-    # TODO: use dynamically sized kernel? (to avoid recompilation, since number of points
-    # may change from one call to another)
+    # We use dynamically sized kernels to avoid recompilation, since number of points may
+    # change from one call to another.
     ndrange = size(x⃗s)  # iterate through points
    workgroupsize = default_workgroupsize(backend, ndrange)
 
     if sort_points === True()
         vp_sorted = map(similar, vp_all)  # allocate temporary arrays for sorted non-uniform data
-        kernel_perm! = spread_permute_kernel!(backend, workgroupsize, ndrange)
-        kernel_perm!(vp_sorted, vp_all, pointperm)
+        kernel_perm! = spread_permute_kernel!(backend)
+        kernel_perm!(vp_sorted, vp_all, pointperm; workgroupsize, ndrange)
         pointperm_ = nothing  # we don't need any further permutations (all accesses to non-uniform data will be contiguous)
     else
         vp_sorted = vp_all
         pointperm_ = pointperm
     end
 
-    kernel! = spread_from_point_naive_kernel!(backend, workgroupsize, ndrange)
-    kernel!(us_real, xs_comp, vp_sorted, pointperm_, evaluate, to_indices)
+    kernel! = spread_from_point_naive_kernel!(backend)
+    kernel!(us_real, xs_comp, vp_sorted, pointperm_, evaluate, to_indices; workgroupsize, ndrange)
 
     if sort_points === True()
         foreach(KA.unsafe_free!, vp_sorted)  # manually deallocate temporary arrays
```
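Both GPU files share the same clean-up pattern: when points are sorted, temporary sorted copies of the non-uniform values are allocated with `similar` and then released eagerly with `KA.unsafe_free!` instead of waiting for Julia's garbage collector. A minimal sketch with hypothetical data:

```julia
using KernelAbstractions
const KA = KernelAbstractions

# Sketch of the scratch-buffer lifecycle used above (data is hypothetical).
vp_all = (rand(Float32, 10_000), rand(Float32, 10_000))  # one array per component
vp_sorted = map(similar, vp_all)  # temporary arrays for sorted non-uniform data
# ... permute vp_all into vp_sorted, launch the main kernel on vp_sorted ...
foreach(KA.unsafe_free!, vp_sorted)  # eager deallocation (a no-op on CPU arrays)
```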
