diff --git a/benchmarks/Project.toml b/benchmarks/Project.toml
new file mode 100644
index 0000000..97f42a7
--- /dev/null
+++ b/benchmarks/Project.toml
@@ -0,0 +1,16 @@
+[deps]
+Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
+BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341"
+FINUFFT = "d8beea63-0952-562e-9c6a-8e8ef7364055"
+KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
+LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+NonuniformFFTs = "cd96f58b-6017-4a02-bb9e-f4d81626177f"
+Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
+Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
+ThreadPinning = "811555cd-349b-4f26-b7bc-1f208b848042"
+TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
+
+[extras]
+CUDA_Runtime_jll = "76a88914-d11a-5bdc-97e0-2f5a05c973a2"
diff --git a/benchmarks/README.md b/benchmarks/README.md
new file mode 100644
index 0000000..fbf52b8
--- /dev/null
+++ b/benchmarks/README.md
@@ -0,0 +1,44 @@
+# Benchmarks
+
+The benchmark consists in type-1 and type-2 NUFFTs on a uniform 3D grid of
+fixed dimensions $M^3 = 256^3$ (excluding oversampling). We vary the number of
+non-uniform points $N$, so that the point density $ρ = N / M^3$ takes values
+between $10^{-4}$ (very few points) and $10^1$ (very dense).
+Points are randomly located in $[0, 2π)^3$.
+Moreover, the relative tolerance is fixed to $10^{-6}$.
+In NonuniformFFTs.jl, this can be achieved with the parameters `σ = 1.5`
+(oversampling factor) and $m = HalfSupport(4)$ (see Accuracy page in the docs).
+
+The tests were run on a cluster with an AMD EPYC 7302 CPU (32 threads) and an
+NVIDIA A100 GPU.
+
+The benchmarks compare NonuniformFFTs.jl v0.6.7 (26/11/2024) and FINUFFT v2.3.1.
+
+Each reported time includes (1) the time spent processing non-uniform points
+(`set_points!` / `(cu)finufft_setpts!`) and (2) the time spent on the actual transform (`exec_type{1,2}!` / `(cu)finufft_exec!`).
+
+## FINUFFT set-up
+
+We used FINUFFT via its Julia wrapper [FINUFFT.jl](https://github.com/ludvigak/FINUFFT.jl) v3.3.0. For
+performance reasons, the (Cu)FINUFFT libraries were compiled locally and the
+FINUFFT.jl sources were modified accordingly as described
+[here](https://github.com/ludvigak/FINUFFT.jl?tab=readme-ov-file#advanced-installation-and-locally-compiling-binaries).
+FINUFFT was compiled with GCC 10.2.0 using CMake with its default flags in `Release` mode, which include `-fPIC -funroll-loops -O3 -march=native`.
+Moreover, we set `CMAKE_CUDA_ARCHITECTURES=80` (for an NVIDIA A100) and used the `nvcc` compiler included in CUDA 12.3.
+
+All FINUFFT benchmarks were run with relative tolerance `1e-6`.
+Moreover, the following options were used:
+
+- `modeord = 1` (use FFTW ordering, for consistency with NonuniformFFTs)
+- `spread_sort = 1` (enable point sorting in CPU plans)
+- `spread_kerevalmeth = 1` (use the recommended piecewise polynomial evaluation)
+- `fftw = FFTW.ESTIMATE` (CPU plans)
+
+and for GPU plans:
+
+- `gpu_sort = 1` (enable point sorting)
+- `gpu_kerevalmeth = 1` (use piecewise polynomial evaluation)
+- `gpu_method = 1` (global memory method, non-uniform point driven)
+
+We also tried `gpu_method = 2` (based on shared memory) but found it to be
+considerably slower in almost all cases (in three dimensions, at the requested tolerance).
diff --git a/benchmarks/plots/Project.toml b/benchmarks/plots/Project.toml
new file mode 100644
index 0000000..bbc29bb
--- /dev/null
+++ b/benchmarks/plots/Project.toml
@@ -0,0 +1,4 @@
+[deps]
+CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0"
+DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab"
+GLMakie = "e9467ef8-e4e7-5192-8a1a-b1aee30e663a"
diff --git a/benchmarks/plots/benchmark_ComplexF64_type1.svg b/benchmarks/plots/benchmark_ComplexF64_type1.svg
new file mode 100644
index 0000000..ac0a25a
--- /dev/null
+++ b/benchmarks/plots/benchmark_ComplexF64_type1.svg
@@ -0,0 +1,1756 @@
+
+
diff --git a/benchmarks/plots/benchmark_ComplexF64_type2.svg b/benchmarks/plots/benchmark_ComplexF64_type2.svg
new file mode 100644
index 0000000..4769683
--- /dev/null
+++ b/benchmarks/plots/benchmark_ComplexF64_type2.svg
@@ -0,0 +1,1756 @@
+
+
diff --git a/benchmarks/plots/benchmark_Float64_type1.svg b/benchmarks/plots/benchmark_Float64_type1.svg
new file mode 100644
index 0000000..bd7db6f
--- /dev/null
+++ b/benchmarks/plots/benchmark_Float64_type1.svg
@@ -0,0 +1,1731 @@
+
+
diff --git a/benchmarks/plots/benchmark_Float64_type2.svg b/benchmarks/plots/benchmark_Float64_type2.svg
new file mode 100644
index 0000000..f12b342
--- /dev/null
+++ b/benchmarks/plots/benchmark_Float64_type2.svg
@@ -0,0 +1,1731 @@
+
+
diff --git a/benchmarks/plots/plot_benchmarks.jl b/benchmarks/plots/plot_benchmarks.jl
new file mode 100644
index 0000000..0533d7f
--- /dev/null
+++ b/benchmarks/plots/plot_benchmarks.jl
@@ -0,0 +1,142 @@
+using GLMakie
+using CairoMakie
+using DelimitedFiles
+
+GLMakie.activate!()
+
+function read_timings(io::IO; nufft_type::Int)
+ while Char(peek(io)) == '#'
+ readline(io)
+ end
+ data = readdlm(io, Float64)
+ (; Np = @view(data[:, 1]), times = @view(data[:, 1 + nufft_type]),)
+end
+
+read_timings(fname::AbstractString; kws...) = open(io -> read_timings(io; kws...), fname)
+
+function plot_from_file!(ax::Axis, filename; nufft_type, zorder = nothing, kws...)
+ data = read_timings(filename; nufft_type)
+ (; Np, times,) = data
+ l = scatterlines!(ax, Np, times; kws...)
+ l.strokecolor[] = l.color[]
+ if zorder !== nothing
+ translate!(l, 0, 0, zorder)
+ end
+ l
+end
+
+function plot_benchmark(::Type{T}; nufft_type = 1, save_svg = false,) where {T <: Number}
+ Ns = (256, 256, 256)
+ N = first(Ns)
+ Ngrid = prod(Ns)
+ Z = complex(T) # for FINUFFT (complex data only)
+
+ fig = Figure(size = (750, 440))
+ ax = Axis(
+ fig[1, 1];
+ xscale = log10, yscale = log10,
+ xlabel = L"Number of nonuniform points $N$", xlabelsize = 16,
+ ylabel = "Time (s)",
+ xticks = LogTicks(0:20), xminorticks = IntervalsBetween(9), xminorticksvisible = true,
+ yticks = LogTicks(-8:3), yminorticks = IntervalsBetween(9), yminorticksvisible = true,
+ xgridvisible = false, ygridvisible = false,
+ limits = (nothing, (1e-3, 1e1)),
+ )
+ limits_top = lift(ax.finallimits) do lims
+ xlims = Makie.xlimits(lims) ./ Ngrid
+ ylims = Makie.ylimits(lims)
+ xlims, ylims
+ end
+ ax_top = Axis(
+ fig[1, 1];
+ xscale = ax.xscale, yscale = ax.yscale,
+ xaxisposition = :top,
+ # xgridvisible = true, ygridvisible = false,
+ xticks = LogTicks(-8:2), xminorticks = IntervalsBetween(9), xminorticksvisible = true,
+ xlabel = L"Point density $ρ = N / M^3$", xlabelsize = 16,
+ limits = limits_top,
+ )
+ hidespines!(ax_top)
+ hideydecorations!(ax_top; grid = false)
+
+ colours = Makie.wong_colors()
+
+ kws_all = (; nufft_type,)
+ kws_cpu = (; marker = :x, markersize = 16, strokewidth = 0,)
+ kws_gpu = (; marker = :circle, markersize = 10, strokewidth = 2,)
+ kws_gpu_sm = (; kws_gpu..., markercolor = :transparent,) # open symbols
+ kws_nonuniform = (; linestyle = :solid, color = colours[1], zorder = 10,)
+ kws_finufft = (; linestyle = :dash, color = colours[2],)
+
+ # Leftmost point of all CPU/GPU curves (for annotating later)
+ first_points_cpu = Point2{Float64}[]
+ first_points_gpu = Point2{Float64}[]
+ last_points_gpu = Point2{Float64}[]
+
+ l = plot_from_file!(ax, "../results/NonuniformFFTs_$(N)_$(T)_CPU.dat"; label = "NonuniformFFTs CPU", kws_nonuniform..., kws_cpu..., kws_all...)
+ push!(first_points_cpu, l[1][][1]) # get first datapoint in line
+
+ l = plot_from_file!(ax, "../results/NonuniformFFTs_$(N)_$(T)_CUDABackend_global_memory.dat"; label = "NonuniformFFTs GPU", kws_nonuniform..., kws_gpu..., kws_all...)
+ push!(first_points_gpu, l[1][][1]) # get first datapoint in line
+ push!(last_points_gpu, l[1][][end]) # get last datapoint in line
+
+ l = plot_from_file!(ax, "../results/NonuniformFFTs_$(N)_$(T)_CUDABackend_shared_memory.dat"; label = "NonuniformFFTs GPU (SM)", kws_nonuniform..., kws_gpu_sm..., kws_all...)
+ push!(first_points_gpu, l[1][][1]) # get first datapoint in line
+ push!(last_points_gpu, l[1][][end]) # get last datapoint in line
+
+ l = plot_from_file!(ax, "../results/FINUFFT_$(N)_$(Z)_CPU.dat"; label = "FINUFFT CPU", kws_finufft..., kws_cpu..., kws_all...)
+ push!(first_points_cpu, l[1][][1]) # get first datapoint in line
+
+ l = plot_from_file!(ax, "../results/CuFINUFFT_$(N)_$(Z)_global_memory.dat"; label = "CuFINUFFT GPU", kws_finufft..., kws_gpu..., kws_all...)
+ push!(first_points_gpu, l[1][][1]) # get first datapoint in line
+ push!(last_points_gpu, l[1][][end]) # get last datapoint in line
+
+ l = plot_from_file!(ax, "../results/CuFINUFFT_$(N)_$(Z)_shared_memory.dat"; label = "CuFINUFFT GPU (SM)", kws_finufft..., kws_gpu_sm..., kws_all...)
+ push!(first_points_gpu, l[1][][1]) # get first datapoint in line
+ push!(last_points_gpu, l[1][][end]) # get last datapoint in line
+
+ let points = first_points_cpu, text = "CPU"
+ x = minimum(p -> p[1], points) # generally all points x are the same
+ y = maximum(p -> p[2], points)
+ text!(ax, x, y; text = rich(text; font = :bold), align = (:left, :bottom), offset = (0, 8), fontsize = 16)
+ end
+
+ let points = first_points_gpu, text = "GPU"
+ x = minimum(p -> p[1], points) # generally all points x are the same
+ y = maximum(p -> p[2], points)
+ text!(ax, x, y; text = rich(text; font = :bold), align = (:left, :bottom), offset = (0, 8), fontsize = 16)
+ end
+
+ let xs = logrange(0.5, 10.0; length = 3)
+ ymin, n = findmin(p -> p[2], last_points_gpu)
+ xmin = last_points_gpu[n][1] / Ngrid # as a density ρ
+ scale = ymin / xmin * 0.7
+ ys = @. xs * scale
+ lines!(ax_top, xs, ys; linestyle = :dash, color = :black, linewidth = 3)
+ text!(ax_top, xs[2], ys[2]; text = L"∼N", align = (:left, :top), fontsize = 18)
+ end
+
+ Label(
+ fig[1, 2][1, 1],
+ """
+ Type-$(nufft_type) NUFFT
+ $(N)³ Fourier modes
+ $T data
+ 6-digit accuracy
+ """;
+ justification = :left, fontsize = 16, lineheight = 1.2,
+ )
+ Legend(fig[1, 2][2, 1], ax; framevisible = false, rowgap = 8, labelsize = 14)
+
+ if save_svg
+ save("benchmark_$(T)_type$(nufft_type).svg", fig; backend = CairoMakie)
+ end
+
+ fig
+end
+
+for T ∈ (Float64, ComplexF64), nufft_type ∈ (1, 2)
+ plot_benchmark(T; nufft_type, save_svg = true)
+end
+
+fig = plot_benchmark(ComplexF64; nufft_type = 1)
diff --git a/benchmarks/results/CuFINUFFT_256_ComplexF64_global_memory.dat b/benchmarks/results/CuFINUFFT_256_ComplexF64_global_memory.dat
new file mode 100644
index 0000000..63f5a22
--- /dev/null
+++ b/benchmarks/results/CuFINUFFT_256_ComplexF64_global_memory.dat
@@ -0,0 +1,22 @@
+# CuFINUFFT
+# Benchmark: NUFFT of scalar data
+# - Device: NVIDIA A100 80GB PCIe
+# - Element type: ComplexF64
+# - Grid size: (256, 256, 256)
+# - Relative tolerance: 1.0e-6
+# - GPU sort: 1
+# - GPU kernel evaluation method: 1
+# - Order of Fourier modes (modeord): 1
+# - GPU method: 1 (global_memory)
+# (1) Number of points (2) Type 1 (median, s) (3) Type 2 (median, s) (4) Relative error type 1 (5) Relative error type 2
+1678 0.010049466 0.010189318 1.3870120840117604e-6 3.1010784379441936e-7
+5305 0.010149413 0.0102016 1.3863655070435886e-6 3.1806259113920807e-7
+16777 0.010432813 0.0102544095 1.3870558057308985e-6 3.4374717946022066e-7
+53054 0.01117762 0.010611818 1.3888666894696818e-6 4.022040070391591e-7
+167772 0.01320493 0.011400021 1.3864687098220945e-6 5.396416394710086e-7
+530542 0.017729944 0.013321508 1.3879307982260866e-6 8.077188843675696e-7
+1677722 0.030144937 0.0174889445 1.3883252357972235e-6 1.1943118827017768e-6
+5305422 0.071517286 0.031759459 1.3881328727539812e-6 1.57036542866348e-6
+16777216 0.2098746265 0.070962529 1.3868072524375616e-6 1.803777670735224e-6
+53054215 0.741148733 0.182403937 1.388143599891377e-6 1.9030418314879922e-6
+167772160 2.606663643 0.5031452625 1.387107956477105e-6 1.9385382768826457e-6
diff --git a/benchmarks/results/CuFINUFFT_256_ComplexF64_shared_memory.dat b/benchmarks/results/CuFINUFFT_256_ComplexF64_shared_memory.dat
new file mode 100644
index 0000000..0d98c58
--- /dev/null
+++ b/benchmarks/results/CuFINUFFT_256_ComplexF64_shared_memory.dat
@@ -0,0 +1,22 @@
+# CuFINUFFT
+# Benchmark: NUFFT of scalar data
+# - Device: NVIDIA A100 80GB PCIe
+# - Element type: ComplexF64
+# - Grid size: (256, 256, 256)
+# - Relative tolerance: 1.0e-6
+# - GPU sort: 1
+# - GPU kernel evaluation method: 1
+# - Order of Fourier modes (modeord): 1
+# - GPU method: 2 (shared_memory)
+# (1) Number of points (2) Type 1 (median, s) (3) Type 2 (median, s) (4) Relative error type 1 (5) Relative error type 2
+1678 0.011692011 0.010875831 1.387012084011762e-6 3.101078437955394e-7
+5305 0.014491329 0.011954708 1.3863655070436594e-6 3.180625911384002e-7
+16777 0.022250966 0.014949265 1.3870558057312356e-6 3.437471794564674e-7
+53054 0.0403717795 0.021438011 1.3888666894692987e-6 4.0220400703767747e-7
+167772 0.076147689 0.032283476 1.3864687098221654e-6 5.39641639470162e-7
+530542 0.1424077785 0.046160006 1.3879307982261694e-6 8.07718884367156e-7
+1677722 0.24217535 0.061705163 1.3883252357980463e-6 1.1943118827025802e-6
+5305422 0.3729179835 0.077908472 1.3881328727557852e-6 1.5703654286660455e-6
+16777216 0.59926713 0.099986922 1.3868072524389908e-6 1.8037776707365977e-6
+53054215 1.140496802 0.152142785 1.3881435998922012e-6 1.9030418314885585e-6
+167772160 2.7159741345 0.309835595 1.3871079564877375e-6 1.9385382768972833e-6
diff --git a/benchmarks/results/FINUFFT_256_ComplexF64_CPU.dat b/benchmarks/results/FINUFFT_256_ComplexF64_CPU.dat
new file mode 100644
index 0000000..633fe33
--- /dev/null
+++ b/benchmarks/results/FINUFFT_256_ComplexF64_CPU.dat
@@ -0,0 +1,22 @@
+# FINUFFT (CPU)
+# Benchmark: NUFFT of scalar data
+# - Device: AMD EPYC 7302 16-Core Processor (znver2, 32 threads
+# - Number of threads: 32
+# - Element type: ComplexF64
+# - Grid size: (256, 256, 256)
+# - Relative tolerance: 1.0e-6
+# - Sort (spread_sort): 1
+# - Kernel evaluation method (spread_kerevalmeth): 1
+# - Order of Fourier modes (modeord): 1
+# (1) Number of points (2) Type 1 (median, s) (3) Type 2 (median, s) (4) Relative error type 1 (5) Relative error type 2
+1678 0.5005381625 0.253770516 3.731950120546327e-6 6.646216249772271e-7
+5305 0.552043342 0.239666509 3.7290631645038357e-6 6.72066525127654e-7
+16777 0.577708378 0.261397829 3.734294780588139e-6 7.464022115781271e-7
+53054 0.673337338 0.274950167 3.7239928569652815e-6 9.119287514989875e-7
+167772 0.729330451 0.283593921 3.729391822696644e-6 1.3275103721977031e-6
+530542 0.738622538 0.32117297 3.7359429392970702e-6 2.0583360832214665e-6
+1677722 0.800928979 0.368688791 3.7286290440589845e-6 3.078078920315979e-6
+5305422 0.9500394215 0.5086966405 3.7276066153558594e-6 4.075858583640244e-6
+16777216 1.219703752 0.916775313 3.7307501803740638e-6 4.693500207626813e-6
+53054215 2.09705826 1.437311869 3.724478960801549e-6 4.947480478356764e-6
+167772160 4.7336734635 3.0110193405 3.7328363128680543e-6 5.056647405745642e-6
diff --git a/benchmarks/results/NonuniformFFTs_256_ComplexF64_CPU.dat b/benchmarks/results/NonuniformFFTs_256_ComplexF64_CPU.dat
new file mode 100644
index 0000000..f2c9ce2
--- /dev/null
+++ b/benchmarks/results/NonuniformFFTs_256_ComplexF64_CPU.dat
@@ -0,0 +1,21 @@
+# NonuniformFFTs.jl using CPU
+# Benchmark: NUFFT of scalar data
+# - Backend: CPU
+# - Device: AMD EPYC 7302 16-Core Processor (znver2, 32 threads
+# - Element type: ComplexF64
+# - Grid size: (256, 256, 256)
+# - Oversampling factor: 1.5
+# - Half support: HalfSupport{4}()
+# - Number of threads: 32
+# (1) Number of points (2) Type 1 (median, s) (3) Type 2 (median, s) (4) Relative error type 1 (5) Relative error type 2
+1678 0.401702684 0.217824823 1.281824057186365e-6 3.011939154273146e-7
+5305 0.478089718 0.226900624 1.277407211115232e-6 3.060539615508211e-7
+16777 0.556644054 0.226762416 1.2776077397706963e-6 3.201846369029206e-7
+53054 0.6612915435 0.24357653 1.2839845138440086e-6 3.7866796040564127e-7
+167772 0.705163086 0.264036265 1.286857150143559e-6 5.066732411096722e-7
+530542 0.791394353 0.282972265 1.2874042722043801e-6 7.458672638907114e-7
+1677722 0.803317388 0.3235546355 1.2875233459096014e-6 1.0983844251011532e-6
+5305422 0.890094409 0.4209180075 1.285417860634058e-6 1.4413882842227143e-6
+16777216 1.108990892 0.875500309 1.2870935078768847e-6 1.65653108658738e-6
+53054215 1.807307983 1.752785424 1.2862609599745603e-6 1.745912493773992e-6
+167772160 5.076364579 4.7441131055 1.2854795078661256e-6 1.7777658759107162e-6
diff --git a/benchmarks/results/NonuniformFFTs_256_ComplexF64_CUDABackend_global_memory.dat b/benchmarks/results/NonuniformFFTs_256_ComplexF64_CUDABackend_global_memory.dat
new file mode 100644
index 0000000..bf9ebe7
--- /dev/null
+++ b/benchmarks/results/NonuniformFFTs_256_ComplexF64_CUDABackend_global_memory.dat
@@ -0,0 +1,21 @@
+# NonuniformFFTs.jl using CUDABackend
+# Benchmark: NUFFT of scalar data
+# - Backend: CUDABackend
+# - Device: NVIDIA A100 80GB PCIe
+# - Element type: ComplexF64
+# - Grid size: (256, 256, 256)
+# - Oversampling factor: 1.5
+# - Half support: HalfSupport{4}()
+# - GPU method: global_memory
+# (1) Number of points (2) Type 1 (median, s) (3) Type 2 (median, s) (4) Relative error type 1 (5) Relative error type 2
+1678 0.007174481 0.0069437445 1.7461299210416866e-6 3.51735849884217e-7
+5305 0.007340271 0.006967824 1.7448299030724668e-6 3.5935428480849213e-7
+16777 0.007364155 0.006959589 1.7441440240783446e-6 3.8526407361623966e-7
+53054 0.008016956 0.0071371565 1.7459043209583552e-6 4.681070024635566e-7
+167772 0.01129348 0.0080823985 1.744026447406056e-6 6.545796093721018e-7
+530542 0.018176627 0.0098429595 1.7433317941794872e-6 9.95894287098444e-7
+1677722 0.036135638 0.013180793 1.7455342688942634e-6 1.485682919911522e-6
+5305422 0.095645657 0.023485831 1.743499377366084e-6 1.961321173351077e-6
+16777216 0.308876142 0.057238519 1.7457314354108132e-6 2.2579240564636625e-6
+53054215 1.102950989 0.166080969 1.742616495241337e-6 2.3789078434566558e-6
+167772160 3.9515899275 0.51076222 1.743980969237664e-6 2.4261231727166925e-6
diff --git a/benchmarks/results/NonuniformFFTs_256_ComplexF64_CUDABackend_shared_memory.dat b/benchmarks/results/NonuniformFFTs_256_ComplexF64_CUDABackend_shared_memory.dat
new file mode 100644
index 0000000..510feb8
--- /dev/null
+++ b/benchmarks/results/NonuniformFFTs_256_ComplexF64_CUDABackend_shared_memory.dat
@@ -0,0 +1,21 @@
+# NonuniformFFTs.jl using CUDABackend
+# Benchmark: NUFFT of scalar data
+# - Backend: CUDABackend
+# - Device: NVIDIA A100 80GB PCIe
+# - Element type: ComplexF64
+# - Grid size: (256, 256, 256)
+# - Oversampling factor: 1.5
+# - Half support: HalfSupport{4}()
+# - GPU method: shared_memory
+# (1) Number of points (2) Type 1 (median, s) (3) Type 2 (median, s) (4) Relative error type 1 (5) Relative error type 2
+1678 0.017566898 0.013264335 1.7461299210424742e-6 3.517358498891171e-7
+5305 0.018137655 0.013418704 1.7448299030716066e-6 3.59354284813164e-7
+16777 0.019053637 0.013943315 1.7441440240776934e-6 3.852640736163668e-7
+53054 0.020030834 0.014643423 1.7459043209587112e-6 4.681070024594046e-7
+167772 0.021095564 0.015789195 1.7440264474057985e-6 6.545796093692288e-7
+530542 0.022440168 0.0175126215 1.7433317941788803e-6 9.958942870956675e-7
+1677722 0.026032583 0.020145207 1.7455342688943773e-6 1.4856829199119722e-6
+5305422 0.0378931535 0.0263968385 1.7434993773628165e-6 1.9613211733482794e-6
+16777216 0.078095254 0.046994287 1.7457314354124795e-6 2.2579240564680298e-6
+53054215 0.2090714075 0.114592533 1.7426164952268956e-6 2.3789078434342945e-6
+167772160 0.6257227505 0.3285903345 1.743980969245702e-6 2.426123172726822e-6
diff --git a/benchmarks/results/NonuniformFFTs_256_Float64_CPU.dat b/benchmarks/results/NonuniformFFTs_256_Float64_CPU.dat
new file mode 100644
index 0000000..b40da43
--- /dev/null
+++ b/benchmarks/results/NonuniformFFTs_256_Float64_CPU.dat
@@ -0,0 +1,20 @@
+# NonuniformFFTs.jl using CPU
+# Benchmark: NUFFT of scalar data
+# - Backend: CPU
+# - Device: AMD EPYC 7302 16-Core Processor (znver2, 32 threads
+# - Element type: Float64
+# - Grid size: (256, 256, 256)
+# - Oversampling factor: 1.5
+# - Half support: HalfSupport{4}()
+# - Number of threads: 32
+# (1) Number of points (2) Type 1 (median, s) (3) Type 2 (median, s) (4) Relative error type 1 (5) Relative error type 2
+1678 0.157921409 0.057483433 1.2992421887135856e-6 3.0117054596563075e-7
+5305 0.185425857 0.054794236 1.2840271425855802e-6 2.9623411061857803e-7
+16777 0.2107738795 0.060259554 1.2883263838044824e-6 3.169193033522629e-7
+53054 0.259005555 0.067185219 1.2977247862698859e-6 3.801684137736015e-7
+167772 0.299163513 0.082973939 1.2934667124620822e-6 5.032368971604971e-7
+530542 0.295613095 0.093088267 1.2963805116600738e-6 7.476721873945614e-7
+1677722 0.338260935 0.122719703 1.296796876868053e-6 1.1055503659183577e-6
+5305422 0.4419285785 0.217629578 1.2973073932578797e-6 1.4521190807869027e-6
+16777216 0.650143119 0.586211104 1.2963264136956878e-6 1.6655878400956973e-6
+53054215 1.364857875 1.474264205 1.297942847667679e-6 1.7599064232383652e-6
diff --git a/benchmarks/results/NonuniformFFTs_256_Float64_CUDABackend_global_memory.dat b/benchmarks/results/NonuniformFFTs_256_Float64_CUDABackend_global_memory.dat
new file mode 100644
index 0000000..a0c7be7
--- /dev/null
+++ b/benchmarks/results/NonuniformFFTs_256_Float64_CUDABackend_global_memory.dat
@@ -0,0 +1,21 @@
+# NonuniformFFTs.jl using CUDABackend
+# Benchmark: NUFFT of scalar data
+# - Backend: CUDABackend
+# - Device: NVIDIA A100 80GB PCIe
+# - Element type: Float64
+# - Grid size: (256, 256, 256)
+# - Oversampling factor: 1.5
+# - Half support: HalfSupport{4}()
+# - GPU method: global_memory
+# (1) Number of points (2) Type 1 (median, s) (3) Type 2 (median, s) (4) Relative error type 1 (5) Relative error type 2
+1678 0.003320917 0.003230889 1.7468019419364787e-6 3.5090636814544796e-7
+5305 0.003324334 0.003240768 1.7436883516306644e-6 3.569900784629132e-7
+16777 0.003340074 0.003251238 1.7428919558873212e-6 3.933815099546998e-7
+53054 0.003735293 0.003365211 1.74393734550516e-6 4.7633118362671775e-7
+167772 0.00541354 0.00402824 1.7443022952469846e-6 6.543784470139837e-7
+530542 0.008519906 0.006309474 1.7447103632569267e-6 9.96697648602348e-7
+1677722 0.01911457 0.008761147 1.7430259600594823e-6 1.4888903328036116e-6
+5305422 0.051812251 0.015686412 1.7458544701717223e-6 1.9669560205696403e-6
+16777216 0.176896208 0.039465031 1.7465137652756605e-6 2.2609508058937542e-6
+53054215 0.6754469325 0.1147949335 1.7441674213003973e-6 2.3841887129635894e-6
+167772160 2.503391267 0.351748967 1.7445912696046888e-6 2.428869915799653e-6
diff --git a/benchmarks/results/NonuniformFFTs_256_Float64_CUDABackend_shared_memory.dat b/benchmarks/results/NonuniformFFTs_256_Float64_CUDABackend_shared_memory.dat
new file mode 100644
index 0000000..3a06ed4
--- /dev/null
+++ b/benchmarks/results/NonuniformFFTs_256_Float64_CUDABackend_shared_memory.dat
@@ -0,0 +1,21 @@
+# NonuniformFFTs.jl using CUDABackend
+# Benchmark: NUFFT of scalar data
+# - Backend: CUDABackend
+# - Device: NVIDIA A100 80GB PCIe
+# - Element type: Float64
+# - Grid size: (256, 256, 256)
+# - Oversampling factor: 1.5
+# - Half support: HalfSupport{4}()
+# - GPU method: shared_memory
+# (1) Number of points (2) Type 1 (median, s) (3) Type 2 (median, s) (4) Relative error type 1 (5) Relative error type 2
+1678 0.008353976 0.0070949475 1.7468019419382034e-6 3.5090636817904113e-7
+5305 0.008659046 0.0072636425 1.7436883516277096e-6 3.5699007845828123e-7
+16777 0.009093549 0.007541187 1.7428919558873898e-6 3.9338150996385125e-7
+53054 0.009523572 0.007862112 1.7439373455033794e-6 4.7633118363122153e-7
+167772 0.0100324315 0.008284588 1.7443022952456069e-6 6.543784470131989e-7
+530542 0.011138013 0.008922181 1.744710363257602e-6 9.966976486010409e-7
+1677722 0.014576812 0.010465275 1.743025960053548e-6 1.4888903327977372e-6
+5305422 0.025735163 0.0149117685 1.7458544701617326e-6 1.9669560205599723e-6
+16777216 0.063339967 0.030928607 1.7465137652730675e-6 2.26095080589055e-6
+53054215 0.1827624215 0.08283842 1.7441674213202952e-6 2.3841887129839585e-6
+167772160 0.560682647 0.246603554 1.7445912696151422e-6 2.4288699158223415e-6
diff --git a/benchmarks/run_benchmarks.jl b/benchmarks/run_benchmarks.jl
new file mode 100644
index 0000000..8377b48
--- /dev/null
+++ b/benchmarks/run_benchmarks.jl
@@ -0,0 +1,428 @@
+using NonuniformFFTs
+using FINUFFT
+using Adapt
+using CUDA
+using FFTW
+using LinearAlgebra: norm
+using KernelAbstractions
+using KernelAbstractions: KernelAbstractions as KA
+using ThreadPinning
+using Random: randn!, Xoshiro
+using BenchmarkTools
+using Statistics
+using TimerOutputs
+
+# NOTE: FINUFFT doesn't play well with ThreadPinning, so pinthreads should *not* be used
+# for FINUFFT (CPU) benchmarks! Seems to be a conflict between ThreadPinning and OpenMP.
+if haskey(ENV, "SLURM_JOB_ID")
+ pinthreads(:affinitymask)
+else
+ pinthreads(:cores)
+end
+
+# Useful for FINUFFT (CPU):
+ENV["OMP_PROC_BIND"] = "true"
+# ENV["OMP_PLACES"] = "cores"
+ENV["OMP_NUM_THREADS"] = Threads.nthreads()
+
+get_device_name(::CPU) = string(Sys.cpu_info()[1].model, " ($(Sys.CPU_NAME), $(Sys.CPU_THREADS) threads)")
+
+function get_device_name(::CUDABackend)
+ dev = CUDA.device()
+ io = IOBuffer()
+ show(io, MIME"text/plain"(), dev) # e.g. "CuDevice(1): NVIDIA A100 80GB PCIe"
+ s = String(take!(io))
+ split(s, " "; limit = 2)[2]
+end
+
+function bench_nonuniformffts(
+ ::Type{Z}, backend::KA.Backend, Ns::Dims, Np;
+ σ, m, gpu_method = :global_memory, kws...,
+ ) where {Z}
+ D = length(Ns)
+ T = real(Z)
+ rng = Xoshiro(42)
+ xp = ntuple(D) do _
+ adapt(backend, randn(rng, T, Np))
+ end
+ vp = adapt(backend, randn(rng, Z, Np))
+ wp = similar(vp)
+
+ p = PlanNUFFT(Z, Ns; backend, σ, m, gpu_method, kws...)
+ ûs = similar(vp, complex(T), size(p))
+
+ # Execute plan once
+ set_points!(p, xp)
+ exec_type1!(ûs, p, vp)
+ exec_type2!(wp, p, ûs)
+
+ # Compare with reference case (high accuracy: rtol = 1e-14)
+ relative_errors = let p_ref = PlanNUFFT(Z, Ns; backend, kws..., gpu_method = :global_memory, m = HalfSupport(8), σ = T(2.0))
+ ûs_ref = similar(ûs)
+ wp_ref = similar(wp)
+ set_points!(p_ref, xp)
+ exec_type1!(ûs_ref, p_ref, vp)
+ exec_type2!(wp_ref, p_ref, ûs_ref)
+ local type1 = norm(ûs_ref - ûs) / norm(ûs_ref)
+ local type2 = norm(wp_ref - wp) / norm(wp_ref)
+ KA.unsafe_free!(ûs_ref)
+ KA.unsafe_free!(wp_ref)
+ (; type1, type2,)
+ end
+
+ if backend isa KA.CPU
+ reset_timer!(p.timer)
+ end
+
+ type1 = @benchmark let
+ set_points!($p, $xp)
+ exec_type1!($ûs, $p, $vp)
+ KA.synchronize($backend)
+ end
+
+ type2 = @benchmark let
+ set_points!($p, $xp)
+ exec_type2!($wp, $p, $ûs)
+ KA.synchronize($backend)
+ end
+
+ if backend isa KA.CPU
+ println(p.timer)
+ end
+
+ (; type1, type2, relative_errors,)
+end
+
+function bench_finufft_cpu(::Type{Z}, Ns::Dims, Np; tol, opts...) where {Z <: Complex}
+ backend = CPU()
+ T = real(Z)
+ D = length(Ns)
+ rng = Xoshiro(42)
+ xp = ntuple(D) do _
+ adapt(backend, randn(rng, T, Np))
+ end
+ vp = adapt(backend, randn(rng, Z, Np))
+ ûs = similar(vp, complex(T), Ns)
+ ntrans = 1
+ n_modes = collect(Ns)
+ plan_type1 = finufft_makeplan(1, n_modes, -1, ntrans, tol; dtype = T, opts...)
+ finalizer(finufft_destroy!, plan_type1)
+ type1 = @benchmark let
+ finufft_setpts!($plan_type1, $xp...)
+ finufft_exec!($plan_type1, $vp, $ûs)
+ KA.synchronize($backend)
+ end
+ (; type1,)
+end
+
+function bench_cufinufft(::Type{Z}, Ns::Dims, Np; tol, opts...) where {Z <: Complex}
+ backend = CUDABackend()
+ T = real(Z)
+ D = length(Ns)
+ rng = Xoshiro(42)
+ xp = ntuple(D) do _
+ adapt(backend, randn(rng, T, Np))
+ end
+ vp = adapt(backend, randn(rng, Z, Np))
+ wp = similar(vp)
+ ûs = similar(vp, complex(T), Ns)
+ ntrans = 1
+ n_modes = collect(Ns)
+
+ plan_type1 = cufinufft_makeplan(1, n_modes, -1, ntrans, tol; dtype = T, opts...)
+ plan_type2 = cufinufft_makeplan(2, n_modes, +1, ntrans, tol; dtype = T, opts...)
+ finalizer(cufinufft_destroy!, plan_type1)
+ finalizer(cufinufft_destroy!, plan_type2)
+
+ # Execute plans once
+ cufinufft_setpts!(plan_type1, xp...)
+ cufinufft_setpts!(plan_type2, xp...)
+ cufinufft_exec!(plan_type1, vp, ûs)
+ cufinufft_exec!(plan_type2, ûs, wp)
+
+ # Compare with reference case (high accuracy: rtol = 1e-14)
+ relative_errors = let tol = 1e-14
+ p1_ref = cufinufft_makeplan(1, n_modes, -1, ntrans, tol; dtype = T, opts..., gpu_method = 1, upsampfac = 2.0)
+ p2_ref = cufinufft_makeplan(2, n_modes, +1, ntrans, tol; dtype = T, opts..., gpu_method = 1, upsampfac = 2.0)
+
+ ûs_ref = similar(ûs)
+ wp_ref = similar(wp)
+
+ cufinufft_setpts!(p1_ref, xp...)
+ cufinufft_setpts!(p2_ref, xp...)
+ cufinufft_exec!(p1_ref, vp, ûs_ref)
+ cufinufft_exec!(p2_ref, ûs_ref, wp_ref)
+
+ cufinufft_destroy!(p1_ref)
+ cufinufft_destroy!(p2_ref)
+
+ local type1 = norm(ûs_ref - ûs) / norm(ûs_ref)
+ local type2 = norm(wp_ref - wp) / norm(wp_ref)
+
+ KA.unsafe_free!(ûs_ref)
+ KA.unsafe_free!(wp_ref)
+
+ (; type1, type2,)
+ end
+
+ type1 = @benchmark let
+ cufinufft_setpts!($plan_type1, $xp...)
+ cufinufft_exec!($plan_type1, $vp, $ûs)
+ KA.synchronize($backend)
+ end
+
+ type2 = @benchmark let
+ cufinufft_setpts!($plan_type2, $xp...)
+ cufinufft_exec!($plan_type2, $ûs, $wp)
+ KA.synchronize($backend)
+ end
+
+ (; type1, type2, relative_errors,)
+end
+
+function bench_finufft_cpu(::Type{Z}, Ns::Dims, Np; tol, opts...) where {Z <: Complex}
+ T = real(Z)
+ D = length(Ns)
+ rng = Xoshiro(42)
+ xp = ntuple(D) do _
+ randn(rng, T, Np)
+ end
+ vp = randn(rng, Z, Np)
+ wp = similar(vp)
+ ûs = similar(vp, complex(T), Ns)
+ ntrans = 1
+ n_modes = collect(Ns)
+
+ plan_type1 = finufft_makeplan(1, n_modes, -1, ntrans, tol; dtype = T, opts...)
+ plan_type2 = finufft_makeplan(2, n_modes, +1, ntrans, tol; dtype = T, opts...)
+ finalizer(finufft_destroy!, plan_type1)
+ finalizer(finufft_destroy!, plan_type2)
+
+ # Execute plans once
+ finufft_setpts!(plan_type1, xp...)
+ finufft_setpts!(plan_type2, xp...)
+ finufft_exec!(plan_type1, vp, ûs)
+ finufft_exec!(plan_type2, ûs, wp)
+
+ # Compare with reference case (high accuracy: rtol = 1e-14)
+ relative_errors = let tol = 1e-14
+ p1_ref = finufft_makeplan(1, n_modes, -1, ntrans, tol; dtype = T, opts..., upsampfac = 2.0)
+ p2_ref = finufft_makeplan(2, n_modes, +1, ntrans, tol; dtype = T, opts..., upsampfac = 2.0)
+
+ ûs_ref = similar(ûs)
+ wp_ref = similar(wp)
+
+ finufft_setpts!(p1_ref, xp...)
+ finufft_setpts!(p2_ref, xp...)
+ finufft_exec!(p1_ref, vp, ûs_ref)
+ finufft_exec!(p2_ref, ûs_ref, wp_ref)
+
+ finufft_destroy!(p1_ref)
+ finufft_destroy!(p2_ref)
+
+ local type1 = norm(ûs_ref - ûs) / norm(ûs_ref)
+ local type2 = norm(wp_ref - wp) / norm(wp_ref)
+
+ (; type1, type2,)
+ end
+
+ type1 = @benchmark let
+ finufft_setpts!($plan_type1, $xp...)
+ finufft_exec!($plan_type1, $vp, $ûs)
+ end
+
+ type2 = @benchmark let
+ finufft_setpts!($plan_type2, $xp...)
+ finufft_exec!($plan_type2, $ûs, $wp)
+ end
+
+ (; type1, type2, relative_errors,)
+end
+
+function run_benchmark_nonuniformffts(
+ ::Type{Z},
+ backend,
+ Ns::Dims,
+ Nps::AbstractVector; # number of points per test case
+ σ = 1.5, m = HalfSupport(4), # rtol = 1e-6
+ gpu_method = :global_memory,
+ fftw_flags = FFTW.ESTIMATE,
+ params...,
+ ) where {Z <: Number}
+ N = first(Ns)
+ suffix = if backend isa GPU
+ "_$gpu_method"
+ else
+ ""
+ end
+ device_name = get_device_name(backend)
+ filename = "NonuniformFFTs_$(N)_$(Z)_$(typeof(backend))$(suffix).dat"
+ open(filename, "w") do io
+ println(io, "# NonuniformFFTs.jl using $(typeof(backend))")
+ println(io, "# Benchmark: NUFFT of scalar data")
+ println(io, "# - Backend: ", typeof(backend))
+ println(io, "# - Device: ", device_name)
+ println(io, "# - Element type: ", Z)
+ println(io, "# - Grid size: ", Ns)
+ println(io, "# - Oversampling factor: ", σ)
+ println(io, "# - Half support: ", m)
+ if backend isa KA.GPU
+ println(io, "# - GPU method: ", gpu_method)
+ else
+ println(io, "# - Number of threads: ", Threads.nthreads())
+ end
+ println(io, "# (1) Number of points (2) Type 1 (median, s) (3) Type 2 (median, s) (4) Relative error type 1 (5) Relative error type 2")
+ for Np ∈ Nps
+ bench = bench_nonuniformffts(Z, backend, Ns, Np; params..., σ, m, gpu_method, fftw_flags)
+ type1 = median(bench.type1.times) / 1e9 # in seconds
+ type2 = median(bench.type2.times) / 1e9 # in seconds
+ @show Np, type1, type2, bench.relative_errors.type1, bench.relative_errors.type2
+ join(io, (Np, type1, type2, bench.relative_errors.type1, bench.relative_errors.type2), '\t')
+ print(io, '\n')
+ flush(io)
+ end
+ end
+ filename
+end
+
+function run_benchmark_cufinufft(
+ ::Type{Z},
+ Ns::Dims,
+ Nps::AbstractVector; # number of points per test case
+ tol = 1e-6,
+ modeord = 1, # convention used by default in NonuniformFFTs
+ gpu_method = 1, # 1: global memory (-sort) method | 2: shared memory | method 0 or 2 is much slower, at least with Float64 data
+ gpu_sort = 1, # needed for good performance
+ gpu_kerevalmeth = 1, # setting it to 0 (direct evaluation) doesn't make a big difference?
+ params...,
+ ) where {Z <: Complex}
+ N = first(Ns)
+ method = gpu_method == 1 ? :global_memory : :shared_memory
+ # Make sure we use the current CUDA device and stream (otherwise KA.synchronize won't
+ # work as expected).
+ params_cuda = (
+ gpu_device_id = CUDA.deviceid(CUDA.device()),
+ gpu_stream = Base.unsafe_convert(Ptr{CUDA.CUstream_st}, CUDA.stream()),
+ )
+ device_name = get_device_name(CUDABackend())
+ filename = "CuFINUFFT_$(N)_$(Z)_$(method).dat"
+ open(filename, "w") do io
+ println(io, "# CuFINUFFT")
+ println(io, "# Benchmark: NUFFT of scalar data")
+ println(io, "# - Device: ", device_name)
+ println(io, "# - Element type: ", Z)
+ println(io, "# - Grid size: ", Ns)
+ println(io, "# - Relative tolerance: ", tol)
+ println(io, "# - GPU sort: ", gpu_sort)
+ println(io, "# - GPU kernel evaluation method: ", gpu_kerevalmeth)
+ println(io, "# - Order of Fourier modes (modeord): ", modeord)
+ println(io, "# - GPU method: ", gpu_method, " ($method)")
+ println(io, "# (1) Number of points (2) Type 1 (median, s) (3) Type 2 (median, s) (4) Relative error type 1 (5) Relative error type 2")
+ for Np ∈ Nps
+ bench = bench_cufinufft(Z, Ns, Np; params..., params_cuda..., tol, modeord, gpu_method, gpu_sort, gpu_kerevalmeth)
+ type1 = median(bench.type1.times) / 1e9 # in seconds
+ type2 = median(bench.type2.times) / 1e9 # in seconds
+ @show Np, type1, type2, bench.relative_errors.type1, bench.relative_errors.type2
+ join(io, (Np, type1, type2, bench.relative_errors.type1, bench.relative_errors.type2), '\t')
+ print(io, '\n')
+ flush(io)
+ end
+ end
+ filename
+end
+
+function run_benchmark_finufft_cpu(
+ ::Type{Z},
+ Ns::Dims,
+ Nps::AbstractVector; # number of points per test case
+ tol = 1e-6,
+ modeord = 1, # convention used by default in NonuniformFFTs
+ spread_sort = 1, # needed for good performance
+ spread_kerevalmeth = 1,
+ fftw = FFTW.ESTIMATE,
+ nthreads = Threads.nthreads(),
+ params...,
+ ) where {Z <: Complex}
+ N = first(Ns)
+ device_name = get_device_name(CPU())
+ filename = "FINUFFT_$(N)_$(Z)_CPU.dat"
+ open(filename, "w") do io
+ println(io, "# FINUFFT (CPU)")
+ println(io, "# Benchmark: NUFFT of scalar data")
+ println(io, "# - Device: ", device_name)
+ println(io, "# - Number of threads: ", nthreads)
+ println(io, "# - Element type: ", Z)
+ println(io, "# - Grid size: ", Ns)
+ println(io, "# - Relative tolerance: ", tol)
+ println(io, "# - Sort (spread_sort): ", spread_sort)
+ println(io, "# - Kernel evaluation method (spread_kerevalmeth): ", spread_kerevalmeth)
+ println(io, "# - Order of Fourier modes (modeord): ", modeord)
+ println(io, "# (1) Number of points (2) Type 1 (median, s) (3) Type 2 (median, s) (4) Relative error type 1 (5) Relative error type 2")
+ for Np ∈ Nps
+ bench = bench_finufft_cpu(Z, Ns, Np; params..., tol, fftw, nthreads, modeord, spread_sort, spread_kerevalmeth)
+ type1 = median(bench.type1.times) / 1e9 # in seconds
+ type2 = median(bench.type2.times) / 1e9 # in seconds
+ @show Np, type1, type2, bench.relative_errors.type1, bench.relative_errors.type2
+ join(io, (Np, type1, type2, bench.relative_errors.type1, bench.relative_errors.type2), '\t')
+ print(io, '\n')
+ flush(io)
+ end
+ end
+ filename
+end
+
+function run_all_benchmarks()
+ T = Float64
+ Z = complex(T)
+ Ngrid = 256
+ Ns = (1, 1, 1) .* Ngrid
+
+ # Parameters for 1e-6 accuracy
+ σ = 1.5
+ m = HalfSupport(4)
+
+ ρs = exp10.(-4:0.5:1) # point densities
+ Nprod = prod(Ns)
+ Nps = map(ρs) do ρ
+ round(Int, ρ * Nprod)
+ end
+
+ # Real data
+ run_benchmark_nonuniformffts(T, CUDABackend(), Ns, Nps; σ, m, gpu_method = :global_memory)
+ run_benchmark_nonuniformffts(T, CUDABackend(), Ns, Nps; σ, m, gpu_method = :shared_memory)
+ run_benchmark_nonuniformffts(T, CPU(), Ns, Nps; σ, m)
+
+ # Complex data
+ run_benchmark_nonuniformffts(Z, CUDABackend(), Ns, Nps; σ, m, gpu_method = :global_memory)
+ run_benchmark_nonuniformffts(Z, CUDABackend(), Ns, Nps; σ, m, gpu_method = :shared_memory)
+ run_benchmark_nonuniformffts(Z, CPU(), Ns, Nps; σ, m)
+
+ # CuFINUFFT
+ params_cufinufft = (;
+ gpu_sort = 1, # needed for good performance
+ gpu_kerevalmeth = 1, # setting it to 0 (direct evaluation) doesn't make a big difference?
+ tol = 1e-6,
+ )
+
+ gpu_method = 1 # global memory (GM-sort)
+ run_benchmark_cufinufft(Z, Ns, Nps; gpu_method, params_cufinufft...)
+
+ gpu_method = 2 # shared memory (SM)
+ run_benchmark_cufinufft(Z, Ns, Nps; gpu_method, params_cufinufft...)
+
+ # FINUFFT CPU
+ params_finufft = (;
+ spread_sort = 1,
+ spread_kerevalmeth = 1,
+ tol = 1e-6,
+ )
+ run_benchmark_finufft_cpu(Z, Ns, Nps; params_finufft...)
+
+ nothing
+end
+
+outdir = "results"
+mkpath(outdir)
+cd(outdir) do
+ run_all_benchmarks()
+end