Faster point sorting + performance tuning (#31)
* Avoid sortperm! in CPU blocking / sorting

* Initialise array in parallel

* Remove old comments

* We don't need to zero out vector!

* Avoid sortperm in GPU code

* GPU: use simple block sorting

Faster and simpler than Hilbert sorting.

* Minor changes

* Fix tests?

* Use mod2pi on GPU

* Minor changes

* Tune block dims on GPU + accept tuple as input

* Fix tests?

* More tuning

* Reorder code

* Update CHANGELOG
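Several of the bullets above (avoiding `sortperm!`, "simple block sorting" on the GPU) boil down to grouping points by the spatial block they fall in, which a counting sort does in O(N + number of blocks) without ever running a comparison sort. A minimal sketch of the idea — `sort_into_blocks` is a hypothetical name for illustration, not the function actually added in `src/sorting.jl`:

```julia
# Sketch: build a block-grouped permutation of point indices without sortperm!.
# `block_of[i]` is the (1-based) block index of point i; the result `perm`
# lists original point indices, grouped block by block.
function sort_into_blocks(block_of::Vector{Int}, nblocks::Int)
    counts = zeros(Int, nblocks + 1)
    for b in block_of
        counts[b + 1] += 1       # histogram of points per block (shifted by one)
    end
    cumsum!(counts, counts)      # counts[b] = number of points in blocks < b
    perm = similar(block_of)
    for (i, b) in enumerate(block_of)
        counts[b] += 1
        perm[counts[b]] = i      # scatter point i into its block's next free slot
    end
    return perm
end
```

Both passes (histogram and scatter) parallelise well with per-thread or atomic counters, which is what makes this approach attractive on the GPU compared to a Hilbert-curve sort.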
jipolanco authored Sep 24, 2024
1 parent 1ca1b4d commit d893e9a
Showing 12 changed files with 201 additions and 377 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -5,6 +5,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## Unreleased

### Changed

- Faster spatial sorting of non-uniform points (CPU and GPU).

- Tune GPU parameters: kernel workgroupsize; block size for spatial sorting.

## [v0.5.2] - 2024-09-23

### Changed
17 changes: 11 additions & 6 deletions src/NonuniformFFTs.jl
@@ -46,20 +46,25 @@ export

default_kernel() = BackwardsKaiserBesselKernel()

-# This is used at several places instead of getindex (inside of a `map`) to make sure that
-# the @inbounds is applied.
-inbounds_getindex(v, i) = @inbounds v[i]
+default_block_size(::Dims, ::CPU) = 4096 # in number of linear elements
+default_block_size(::Dims, ::GPU) = 1024 # except in 2D and 3D (see below)
+
+# TODO: adapt this based on size of shared memory and on element type T (and padding 2M)?
+default_block_size(::Dims{2}, ::GPU) = (32, 32)
+default_block_size(::Dims{3}, ::GPU) = (16, 16, 4) # tuned on A100 with 256³ non-oversampled grid, σ = 2 and m = HalfSupport(4)

function default_workgroupsize(backend, ndrange::Dims)
# This currently assumes 1024 available threads, which might fail in some GPUs (AMD?).
KA.default_cpu_workgroupsize(ndrange)
end

# Case of 1D kernels on the GPU (typically, kernels which iterate over non-uniform points).
-default_workgroupsize(::GPU, ndrange::Dims{1}) = (512,)
+default_workgroupsize(::GPU, ndrange::Dims{1}) = (1024,)

+# This is used at several places instead of getindex (inside of a `map`) to make sure that
+# the @inbounds is applied.
+inbounds_getindex(v, i) = @inbounds v[i]

include("sorting.jl")
include("sorting_hilbert.jl")
include("blocking.jl")
include("plan.jl")
include("set_points.jl")
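The `default_block_size` methods in the hunk above pick defaults by dispatching on the grid's dimensionality (`Dims{2}`, `Dims{3}`) in addition to the backend. A self-contained sketch of that dispatch pattern, with toy `CPU`/`GPU` structs standing in for the KernelAbstractions backend types:

```julia
# Toy backend types standing in for KernelAbstractions' CPU/GPU backends.
struct CPU end
struct GPU end

# Generic fallbacks return a linear element count; the more specific
# Dims{2}/Dims{3} methods win and return per-dimension tuples.
block_size(::Dims, ::CPU) = 4096
block_size(::Dims, ::GPU) = 1024
block_size(::Dims{2}, ::GPU) = (32, 32)
block_size(::Dims{3}, ::GPU) = (16, 16, 4)

block_size((256, 256), GPU())       # -> (32, 32): Dims{2} method is more specific
block_size((256, 256, 256), CPU())  # -> 4096: CPU only has the generic fallback
```

Because the tuned GPU defaults are per-dimension tuples while the CPU default is a scalar, downstream code must accept both forms — which matches the commit bullet "Tune block dims on GPU + accept tuple as input".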
2 changes: 1 addition & 1 deletion src/abstractNFFTs.jl
@@ -91,7 +91,7 @@ Base.@constprop :aggressive function PlanNUFFT(
m, σ, reltol = AbstractNFFTs.accuracyParams(; kwargs...)
backend = KA.get_backend(xp) # e.g. use GPU backend if xp is a GPU array
sort_points = sortNodes ? True() : False() # this is type-unstable (unless constant propagation happens)
-block_size = blocking ? default_block_size(backend) : nothing # also type-unstable
+block_size = blocking ? default_block_size(Ns, backend) : nothing # also type-unstable
kernel = window isa AbstractKernel ? window : convert_window_function(window)
p = PlanNUFFT(Complex{Tr}, Ns, HalfSupport(m); backend, σ = Tr(σ), sort_points, fftshift, block_size, kernel, fftw_flags = fftflags)
AbstractNFFTs.nodes!(p, xp)
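The `sort_points` and `block_size` lines are type-unstable because each branch of the ternary returns a different type; the `Base.@constprop :aggressive` annotation lets the compiler fold a literal `Bool` argument through the branch and recover a concrete return type. A simplified, self-contained illustration (toy `True`/`False` singletons, not the package's own types):

```julia
struct True end
struct False end

# The return type depends on a runtime Bool, so in general this
# infers as Union{True, False}: type-unstable.
pick(flag::Bool) = flag ? True() : False()

# Requesting aggressive constant propagation encourages the compiler to
# resolve the branch at compile time when callers pass a literal
# `true`/`false`, so a concrete type can be inferred at the call site.
Base.@constprop :aggressive pick_aggressive(flag::Bool) = flag ? True() : False()
```

As the comments in the diff note, this kind of instability is harmless as long as constant propagation (or a function barrier further down) removes it before any hot loop runs.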
