Add benchmark data and plots

jipolanco · Nov 29, 2024 · 9d2e544 · 9d2e544
1 parent 5e13c96
commit 9d2e544
Show file tree

Hide file tree

Showing 18 changed files with 7,799 additions and 0 deletions.
diff --git a/benchmarks/Project.toml b/benchmarks/Project.toml
@@ -0,0 +1,16 @@
+[deps]
+Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
+BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341"
+FINUFFT = "d8beea63-0952-562e-9c6a-8e8ef7364055"
+KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
+LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+NonuniformFFTs = "cd96f58b-6017-4a02-bb9e-f4d81626177f"
+Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
+Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
+ThreadPinning = "811555cd-349b-4f26-b7bc-1f208b848042"
+TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
+
+[extras]
+CUDA_Runtime_jll = "76a88914-d11a-5bdc-97e0-2f5a05c973a2"
diff --git a/benchmarks/README.md b/benchmarks/README.md
@@ -0,0 +1,44 @@
+# Benchmarks
+
+The benchmark consists in type-1 and type-2 NUFFTs on a uniform 3D grid of
+fixed dimensions $M^3 = 256^3$ (excluding oversampling). We vary the number of
+non-uniform points $N$, so that the point density $ρ = N / M^3$ takes values
+between $10^{-4}$ (very few points) and $10^1$ (very dense).
+Points are randomly located in $[0, 2π)^3$.
+Moreover, the relative tolerance is fixed to $10^{-6}$.
+In NonuniformFFTs.jl, this can be achieved with the parameters `σ = 1.5`
+(oversampling factor) and $m = HalfSupport(4)$ (see Accuracy page in the docs).
+
+The tests were run on a cluster with an AMD EPYC 7302 CPU (32 threads) and an
+NVIDIA A100 GPU.
+
+The benchmarks compare NonuniformFFTs.jl v0.6.7 (26/11/2024) and FINUFFT v2.3.1.
+
+Each reported time includes (1) the time spent processing non-uniform points
+(`set_points!` / `(cu)finufft_setpts!`) and (2) the time spent on the actual transform (`exec_type{1,2}!` / `(cu)finufft_exec!`).
+
+## FINUFFT set-up
+
+We used FINUFFT via its Julia wrapper [FINUFFT.jl](https://github.com/ludvigak/FINUFFT.jl) v3.3.0. For
+performance reasons, the (Cu)FINUFFT libraries were compiled locally and the
+FINUFFT.jl sources were modified accordingly as described
+[here](https://github.com/ludvigak/FINUFFT.jl?tab=readme-ov-file#advanced-installation-and-locally-compiling-binaries).
+FINUFFT was compiled with GCC 10.2.0 using CMake with its default flags in `Release` mode, which include `-fPIC -funroll-loops -O3 -march=native`.
+Moreover, we set `CMAKE_CUDA_ARCHITECTURES=80` (for an NVIDIA A100) and used the `nvcc` compiler included in CUDA 12.3.
+
+All FINUFFT benchmarks were run with relative tolerance `1e-6`.
+Moreover, the following options were used:
+
+- `modeord = 1` (use FFTW ordering, for consistency with NonuniformFFTs)
+- `spread_sort = 1` (enable point sorting in CPU plans)
+- `spread_kerevalmeth = 1` (use the recommended piecewise polynomial evaluation)
+- `fftw = FFTW.ESTIMATE` (CPU plans)
+
+and for GPU plans:
+
+- `gpu_sort = 1` (enable point sorting)
+- `gpu_kerevalmeth = 1` (use piecewise polynomial evaluation)
+- `gpu_method = 1` (global memory method, non-uniform point driven)
+
+We also tried `gpu_method = 2` (based on shared memory) but found it to be
+considerably slower in almost all cases (in three dimensions, at the requested tolerance).
diff --git a/benchmarks/plots/Project.toml b/benchmarks/plots/Project.toml
@@ -0,0 +1,4 @@
+[deps]
+CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0"
+DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab"
+GLMakie = "e9467ef8-e4e7-5192-8a1a-b1aee30e663a"
diff --git a/benchmarks/plots/benchmark_ComplexF64_type1.svg b/benchmarks/plots/benchmark_ComplexF64_type1.svg
diff --git a/benchmarks/plots/benchmark_ComplexF64_type2.svg b/benchmarks/plots/benchmark_ComplexF64_type2.svg
diff --git a/benchmarks/plots/benchmark_Float64_type1.svg b/benchmarks/plots/benchmark_Float64_type1.svg
diff --git a/benchmarks/plots/benchmark_Float64_type2.svg b/benchmarks/plots/benchmark_Float64_type2.svg
diff --git a/benchmarks/plots/plot_benchmarks.jl b/benchmarks/plots/plot_benchmarks.jl
@@ -0,0 +1,142 @@
+using GLMakie
+using CairoMakie
+using DelimitedFiles
+
+GLMakie.activate!()
+
+function read_timings(io::IO; nufft_type::Int)
+    while Char(peek(io)) == '#'
+        readline(io)
+    end
+    data = readdlm(io, Float64)
+    (; Np = @view(data[:, 1]), times = @view(data[:, 1 + nufft_type]),)
+end
+
+read_timings(fname::AbstractString; kws...) = open(io -> read_timings(io; kws...), fname)
+
+function plot_from_file!(ax::Axis, filename; nufft_type, zorder = nothing, kws...)
+    data = read_timings(filename; nufft_type)
+    (; Np, times,) = data
+    l = scatterlines!(ax, Np, times; kws...)
+    l.strokecolor[] = l.color[]
+    if zorder !== nothing
+        translate!(l, 0, 0, zorder)
+    end
+    l
+end
+
+function plot_benchmark(::Type{T}; nufft_type = 1, save_svg = false,) where {T <: Number}
+    Ns = (256, 256, 256)
+    N = first(Ns)
+    Ngrid = prod(Ns)
+    Z = complex(T)  # for FINUFFT (complex data only)
+
+    fig = Figure(size = (750, 440))
+    ax = Axis(
+        fig[1, 1];
+        xscale = log10, yscale = log10,
+        xlabel = L"Number of nonuniform points $N$", xlabelsize = 16,
+        ylabel = "Time (s)",
+        xticks = LogTicks(0:20), xminorticks = IntervalsBetween(9), xminorticksvisible = true,
+        yticks = LogTicks(-8:3), yminorticks = IntervalsBetween(9), yminorticksvisible = true,
+        xgridvisible = false, ygridvisible = false,
+        limits = (nothing, (1e-3, 1e1)),
+    )
+    limits_top = lift(ax.finallimits) do lims
+        xlims = Makie.xlimits(lims) ./ Ngrid
+        ylims = Makie.ylimits(lims)
+        xlims, ylims
+    end
+    ax_top = Axis(
+        fig[1, 1];
+        xscale = ax.xscale, yscale = ax.yscale,
+        xaxisposition = :top,
+        # xgridvisible = true, ygridvisible = false,
+        xticks = LogTicks(-8:2), xminorticks = IntervalsBetween(9), xminorticksvisible = true,
+        xlabel = L"Point density $ρ = N / M^3$", xlabelsize = 16,
+        limits = limits_top,
+    )
+    hidespines!(ax_top)
+    hideydecorations!(ax_top; grid = false)
+
+    colours = Makie.wong_colors()
+
+    kws_all = (; nufft_type,)
+    kws_cpu = (; marker = :x, markersize = 16, strokewidth = 0,)
+    kws_gpu = (; marker = :circle, markersize = 10, strokewidth = 2,)
+    kws_gpu_sm = (; kws_gpu..., markercolor = :transparent,)  # open symbols
+    kws_nonuniform = (; linestyle = :solid, color = colours[1], zorder = 10,)
+    kws_finufft = (; linestyle = :dash, color = colours[2],)
+
+    # Leftmost point of all CPU/GPU curves (for annotating later)
+    first_points_cpu = Point2{Float64}[]
+    first_points_gpu = Point2{Float64}[]
+    last_points_gpu = Point2{Float64}[]
+
+    l = plot_from_file!(ax, "../results/NonuniformFFTs_$(N)_$(T)_CPU.dat"; label = "NonuniformFFTs CPU", kws_nonuniform..., kws_cpu..., kws_all...)
+    push!(first_points_cpu, l[1][][1])  # get first datapoint in line
+
+    l = plot_from_file!(ax, "../results/NonuniformFFTs_$(N)_$(T)_CUDABackend_global_memory.dat"; label = "NonuniformFFTs GPU", kws_nonuniform..., kws_gpu..., kws_all...)
+    push!(first_points_gpu, l[1][][1])  # get first datapoint in line
+    push!(last_points_gpu, l[1][][end])  # get last datapoint in line
+
+    l = plot_from_file!(ax, "../results/NonuniformFFTs_$(N)_$(T)_CUDABackend_shared_memory.dat"; label = "NonuniformFFTs GPU (SM)", kws_nonuniform..., kws_gpu_sm..., kws_all...)
+    push!(first_points_gpu, l[1][][1])  # get first datapoint in line
+    push!(last_points_gpu, l[1][][end])  # get last datapoint in line
+
+    l = plot_from_file!(ax, "../results/FINUFFT_$(N)_$(Z)_CPU.dat"; label = "FINUFFT CPU", kws_finufft..., kws_cpu..., kws_all...)
+    push!(first_points_cpu, l[1][][1])  # get first datapoint in line
+
+    l = plot_from_file!(ax, "../results/CuFINUFFT_$(N)_$(Z)_global_memory.dat"; label = "CuFINUFFT GPU", kws_finufft..., kws_gpu..., kws_all...)
+    push!(first_points_gpu, l[1][][1])  # get first datapoint in line
+    push!(last_points_gpu, l[1][][end])  # get last datapoint in line
+
+    l = plot_from_file!(ax, "../results/CuFINUFFT_$(N)_$(Z)_shared_memory.dat"; label = "CuFINUFFT GPU (SM)", kws_finufft..., kws_gpu_sm..., kws_all...)
+    push!(first_points_gpu, l[1][][1])  # get first datapoint in line
+    push!(last_points_gpu, l[1][][end])  # get last datapoint in line
+
+    let points = first_points_cpu, text = "CPU"
+        x = minimum(p -> p[1], points)  # generally all points x are the same
+        y = maximum(p -> p[2], points)
+        text!(ax, x, y; text = rich(text; font = :bold), align = (:left, :bottom), offset = (0, 8), fontsize = 16)
+    end
+
+    let points = first_points_gpu, text = "GPU"
+        x = minimum(p -> p[1], points)  # generally all points x are the same
+        y = maximum(p -> p[2], points)
+        text!(ax, x, y; text = rich(text; font = :bold), align = (:left, :bottom), offset = (0, 8), fontsize = 16)
+    end
+
+    let xs = logrange(0.5, 10.0; length = 3)
+        ymin, n = findmin(p -> p[2], last_points_gpu)
+        xmin = last_points_gpu[n][1] / Ngrid  # as a density ρ
+        scale = ymin / xmin * 0.7
+        ys = @. xs * scale
+        lines!(ax_top, xs, ys; linestyle = :dash, color = :black, linewidth = 3)
+        text!(ax_top, xs[2], ys[2]; text = L"∼N", align = (:left, :top), fontsize = 18)
+    end
+
+    Label(
+        fig[1, 2][1, 1],
+        """
+        Type-$(nufft_type) NUFFT
+        $(N)³ Fourier modes
+        $T data
+        6-digit accuracy
+        """;
+        justification = :left, fontsize = 16, lineheight = 1.2,
+    )
+    Legend(fig[1, 2][2, 1], ax; framevisible = false, rowgap = 8, labelsize = 14)
+
+    if save_svg
+        save("benchmark_$(T)_type$(nufft_type).svg", fig; backend = CairoMakie)
+    end
+
+    fig
+end
+
+for T ∈ (Float64, ComplexF64), nufft_type ∈ (1, 2)
+    plot_benchmark(T; nufft_type, save_svg = true)
+end
+
+fig = plot_benchmark(ComplexF64; nufft_type = 1)
diff --git a/benchmarks/results/CuFINUFFT_256_ComplexF64_global_memory.dat b/benchmarks/results/CuFINUFFT_256_ComplexF64_global_memory.dat
@@ -0,0 +1,22 @@
+# CuFINUFFT
+# Benchmark: NUFFT of scalar data
+#  - Device: NVIDIA A100 80GB PCIe
+#  - Element type: ComplexF64
+#  - Grid size: (256, 256, 256)
+#  - Relative tolerance: 1.0e-6
+#  - GPU sort: 1
+#  - GPU kernel evaluation method: 1
+#  - Order of Fourier modes (modeord): 1
+#  - GPU method: 1 (global_memory)
+# (1) Number of points  (2) Type 1 (median, s)  (3) Type 2 (median, s)  (4) Relative error type 1  (5) Relative error type 2
+1678	0.010049466	0.010189318	1.3870120840117604e-6	3.1010784379441936e-7
+5305	0.010149413	0.0102016	1.3863655070435886e-6	3.1806259113920807e-7
+16777	0.010432813	0.0102544095	1.3870558057308985e-6	3.4374717946022066e-7
+53054	0.01117762	0.010611818	1.3888666894696818e-6	4.022040070391591e-7
+167772	0.01320493	0.011400021	1.3864687098220945e-6	5.396416394710086e-7
+530542	0.017729944	0.013321508	1.3879307982260866e-6	8.077188843675696e-7
+1677722	0.030144937	0.0174889445	1.3883252357972235e-6	1.1943118827017768e-6
+5305422	0.071517286	0.031759459	1.3881328727539812e-6	1.57036542866348e-6
+16777216	0.2098746265	0.070962529	1.3868072524375616e-6	1.803777670735224e-6
+53054215	0.741148733	0.182403937	1.388143599891377e-6	1.9030418314879922e-6
+167772160	2.606663643	0.5031452625	1.387107956477105e-6	1.9385382768826457e-6
diff --git a/benchmarks/results/CuFINUFFT_256_ComplexF64_shared_memory.dat b/benchmarks/results/CuFINUFFT_256_ComplexF64_shared_memory.dat
@@ -0,0 +1,22 @@
+# CuFINUFFT
+# Benchmark: NUFFT of scalar data
+#  - Device: NVIDIA A100 80GB PCIe
+#  - Element type: ComplexF64
+#  - Grid size: (256, 256, 256)
+#  - Relative tolerance: 1.0e-6
+#  - GPU sort: 1
+#  - GPU kernel evaluation method: 1
+#  - Order of Fourier modes (modeord): 1
+#  - GPU method: 2 (shared_memory)
+# (1) Number of points  (2) Type 1 (median, s)  (3) Type 2 (median, s)  (4) Relative error type 1  (5) Relative error type 2
+1678	0.011692011	0.010875831	1.387012084011762e-6	3.101078437955394e-7
+5305	0.014491329	0.011954708	1.3863655070436594e-6	3.180625911384002e-7
+16777	0.022250966	0.014949265	1.3870558057312356e-6	3.437471794564674e-7
+53054	0.0403717795	0.021438011	1.3888666894692987e-6	4.0220400703767747e-7
+167772	0.076147689	0.032283476	1.3864687098221654e-6	5.39641639470162e-7
+530542	0.1424077785	0.046160006	1.3879307982261694e-6	8.07718884367156e-7
+1677722	0.24217535	0.061705163	1.3883252357980463e-6	1.1943118827025802e-6
+5305422	0.3729179835	0.077908472	1.3881328727557852e-6	1.5703654286660455e-6
+16777216	0.59926713	0.099986922	1.3868072524389908e-6	1.8037776707365977e-6
+53054215	1.140496802	0.152142785	1.3881435998922012e-6	1.9030418314885585e-6
+167772160	2.7159741345	0.309835595	1.3871079564877375e-6	1.9385382768972833e-6
diff --git a/benchmarks/results/FINUFFT_256_ComplexF64_CPU.dat b/benchmarks/results/FINUFFT_256_ComplexF64_CPU.dat
@@ -0,0 +1,22 @@
+# FINUFFT (CPU)
+# Benchmark: NUFFT of scalar data
+#  - Device: AMD EPYC 7302 16-Core Processor (znver2, 32 threads
+#  - Number of threads: 32
+#  - Element type: ComplexF64
+#  - Grid size: (256, 256, 256)
+#  - Relative tolerance: 1.0e-6
+#  - Sort (spread_sort): 1
+#  - Kernel evaluation method (spread_kerevalmeth): 1
+#  - Order of Fourier modes (modeord): 1
+# (1) Number of points  (2) Type 1 (median, s)  (3) Type 2 (median, s)  (4) Relative error type 1  (5) Relative error type 2
+1678	0.5005381625	0.253770516	3.731950120546327e-6	6.646216249772271e-7
+5305	0.552043342	0.239666509	3.7290631645038357e-6	6.72066525127654e-7
+16777	0.577708378	0.261397829	3.734294780588139e-6	7.464022115781271e-7
+53054	0.673337338	0.274950167	3.7239928569652815e-6	9.119287514989875e-7
+167772	0.729330451	0.283593921	3.729391822696644e-6	1.3275103721977031e-6
+530542	0.738622538	0.32117297	3.7359429392970702e-6	2.0583360832214665e-6
+1677722	0.800928979	0.368688791	3.7286290440589845e-6	3.078078920315979e-6
+5305422	0.9500394215	0.5086966405	3.7276066153558594e-6	4.075858583640244e-6
+16777216	1.219703752	0.916775313	3.7307501803740638e-6	4.693500207626813e-6
+53054215	2.09705826	1.437311869	3.724478960801549e-6	4.947480478356764e-6
+167772160	4.7336734635	3.0110193405	3.7328363128680543e-6	5.056647405745642e-6
diff --git a/benchmarks/results/NonuniformFFTs_256_ComplexF64_CPU.dat b/benchmarks/results/NonuniformFFTs_256_ComplexF64_CPU.dat
@@ -0,0 +1,21 @@
+# NonuniformFFTs.jl using CPU
+# Benchmark: NUFFT of scalar data
+#  - Backend: CPU
+#  - Device: AMD EPYC 7302 16-Core Processor (znver2, 32 threads
+#  - Element type: ComplexF64
+#  - Grid size: (256, 256, 256)
+#  - Oversampling factor: 1.5
+#  - Half support: HalfSupport{4}()
+#  - Number of threads: 32
+# (1) Number of points  (2) Type 1 (median, s)  (3) Type 2 (median, s)  (4) Relative error type 1  (5) Relative error type 2
+1678	0.401702684	0.217824823	1.281824057186365e-6	3.011939154273146e-7
+5305	0.478089718	0.226900624	1.277407211115232e-6	3.060539615508211e-7
+16777	0.556644054	0.226762416	1.2776077397706963e-6	3.201846369029206e-7
+53054	0.6612915435	0.24357653	1.2839845138440086e-6	3.7866796040564127e-7
+167772	0.705163086	0.264036265	1.286857150143559e-6	5.066732411096722e-7
+530542	0.791394353	0.282972265	1.2874042722043801e-6	7.458672638907114e-7
+1677722	0.803317388	0.3235546355	1.2875233459096014e-6	1.0983844251011532e-6
+5305422	0.890094409	0.4209180075	1.285417860634058e-6	1.4413882842227143e-6
+16777216	1.108990892	0.875500309	1.2870935078768847e-6	1.65653108658738e-6
+53054215	1.807307983	1.752785424	1.2862609599745603e-6	1.745912493773992e-6
+167772160	5.076364579	4.7441131055	1.2854795078661256e-6	1.7777658759107162e-6
diff --git a/benchmarks/results/NonuniformFFTs_256_ComplexF64_CUDABackend_global_memory.dat b/benchmarks/results/NonuniformFFTs_256_ComplexF64_CUDABackend_global_memory.dat
@@ -0,0 +1,21 @@
+# NonuniformFFTs.jl using CUDABackend
+# Benchmark: NUFFT of scalar data
+#  - Backend: CUDABackend
+#  - Device: NVIDIA A100 80GB PCIe
+#  - Element type: ComplexF64
+#  - Grid size: (256, 256, 256)
+#  - Oversampling factor: 1.5
+#  - Half support: HalfSupport{4}()
+#  - GPU method: global_memory
+# (1) Number of points  (2) Type 1 (median, s)  (3) Type 2 (median, s)  (4) Relative error type 1  (5) Relative error type 2
+1678	0.007174481	0.0069437445	1.7461299210416866e-6	3.51735849884217e-7
+5305	0.007340271	0.006967824	1.7448299030724668e-6	3.5935428480849213e-7
+16777	0.007364155	0.006959589	1.7441440240783446e-6	3.8526407361623966e-7
+53054	0.008016956	0.0071371565	1.7459043209583552e-6	4.681070024635566e-7
+167772	0.01129348	0.0080823985	1.744026447406056e-6	6.545796093721018e-7
+530542	0.018176627	0.0098429595	1.7433317941794872e-6	9.95894287098444e-7
+1677722	0.036135638	0.013180793	1.7455342688942634e-6	1.485682919911522e-6
+5305422	0.095645657	0.023485831	1.743499377366084e-6	1.961321173351077e-6
+16777216	0.308876142	0.057238519	1.7457314354108132e-6	2.2579240564636625e-6
+53054215	1.102950989	0.166080969	1.742616495241337e-6	2.3789078434566558e-6
+167772160	3.9515899275	0.51076222	1.743980969237664e-6	2.4261231727166925e-6
diff --git a/benchmarks/results/NonuniformFFTs_256_ComplexF64_CUDABackend_shared_memory.dat b/benchmarks/results/NonuniformFFTs_256_ComplexF64_CUDABackend_shared_memory.dat
@@ -0,0 +1,21 @@
+# NonuniformFFTs.jl using CUDABackend
+# Benchmark: NUFFT of scalar data
+#  - Backend: CUDABackend
+#  - Device: NVIDIA A100 80GB PCIe
+#  - Element type: ComplexF64
+#  - Grid size: (256, 256, 256)
+#  - Oversampling factor: 1.5
+#  - Half support: HalfSupport{4}()
+#  - GPU method: shared_memory
+# (1) Number of points  (2) Type 1 (median, s)  (3) Type 2 (median, s)  (4) Relative error type 1  (5) Relative error type 2
+1678	0.017566898	0.013264335	1.7461299210424742e-6	3.517358498891171e-7
+5305	0.018137655	0.013418704	1.7448299030716066e-6	3.59354284813164e-7
+16777	0.019053637	0.013943315	1.7441440240776934e-6	3.852640736163668e-7
+53054	0.020030834	0.014643423	1.7459043209587112e-6	4.681070024594046e-7
+167772	0.021095564	0.015789195	1.7440264474057985e-6	6.545796093692288e-7
+530542	0.022440168	0.0175126215	1.7433317941788803e-6	9.958942870956675e-7
+1677722	0.026032583	0.020145207	1.7455342688943773e-6	1.4856829199119722e-6
+5305422	0.0378931535	0.0263968385	1.7434993773628165e-6	1.9613211733482794e-6
+16777216	0.078095254	0.046994287	1.7457314354124795e-6	2.2579240564680298e-6
+53054215	0.2090714075	0.114592533	1.7426164952268956e-6	2.3789078434342945e-6
+167772160	0.6257227505	0.3285903345	1.743980969245702e-6	2.426123172726822e-6
diff --git a/benchmarks/results/NonuniformFFTs_256_Float64_CPU.dat b/benchmarks/results/NonuniformFFTs_256_Float64_CPU.dat
@@ -0,0 +1,20 @@
+# NonuniformFFTs.jl using CPU
+# Benchmark: NUFFT of scalar data
+#  - Backend: CPU
+#  - Device: AMD EPYC 7302 16-Core Processor (znver2, 32 threads
+#  - Element type: Float64
+#  - Grid size: (256, 256, 256)
+#  - Oversampling factor: 1.5
+#  - Half support: HalfSupport{4}()
+#  - Number of threads: 32
+# (1) Number of points  (2) Type 1 (median, s)  (3) Type 2 (median, s)  (4) Relative error type 1  (5) Relative error type 2
+1678	0.157921409	0.057483433	1.2992421887135856e-6	3.0117054596563075e-7
+5305	0.185425857	0.054794236	1.2840271425855802e-6	2.9623411061857803e-7
+16777	0.2107738795	0.060259554	1.2883263838044824e-6	3.169193033522629e-7
+53054	0.259005555	0.067185219	1.2977247862698859e-6	3.801684137736015e-7
+167772	0.299163513	0.082973939	1.2934667124620822e-6	5.032368971604971e-7
+530542	0.295613095	0.093088267	1.2963805116600738e-6	7.476721873945614e-7
+1677722	0.338260935	0.122719703	1.296796876868053e-6	1.1055503659183577e-6
+5305422	0.4419285785	0.217629578	1.2973073932578797e-6	1.4521190807869027e-6
+16777216	0.650143119	0.586211104	1.2963264136956878e-6	1.6655878400956973e-6
+53054215	1.364857875	1.474264205	1.297942847667679e-6	1.7599064232383652e-6
diff --git a/benchmarks/results/NonuniformFFTs_256_Float64_CUDABackend_global_memory.dat b/benchmarks/results/NonuniformFFTs_256_Float64_CUDABackend_global_memory.dat
@@ -0,0 +1,21 @@
+# NonuniformFFTs.jl using CUDABackend
+# Benchmark: NUFFT of scalar data
+#  - Backend: CUDABackend
+#  - Device: NVIDIA A100 80GB PCIe
+#  - Element type: Float64
+#  - Grid size: (256, 256, 256)
+#  - Oversampling factor: 1.5
+#  - Half support: HalfSupport{4}()
+#  - GPU method: global_memory
+# (1) Number of points  (2) Type 1 (median, s)  (3) Type 2 (median, s)  (4) Relative error type 1  (5) Relative error type 2
+1678	0.003320917	0.003230889	1.7468019419364787e-6	3.5090636814544796e-7
+5305	0.003324334	0.003240768	1.7436883516306644e-6	3.569900784629132e-7
+16777	0.003340074	0.003251238	1.7428919558873212e-6	3.933815099546998e-7
+53054	0.003735293	0.003365211	1.74393734550516e-6	4.7633118362671775e-7
+167772	0.00541354	0.00402824	1.7443022952469846e-6	6.543784470139837e-7
+530542	0.008519906	0.006309474	1.7447103632569267e-6	9.96697648602348e-7
+1677722	0.01911457	0.008761147	1.7430259600594823e-6	1.4888903328036116e-6
+5305422	0.051812251	0.015686412	1.7458544701717223e-6	1.9669560205696403e-6
+16777216	0.176896208	0.039465031	1.7465137652756605e-6	2.2609508058937542e-6
+53054215	0.6754469325	0.1147949335	1.7441674213003973e-6	2.3841887129635894e-6
+167772160	2.503391267	0.351748967	1.7445912696046888e-6	2.428869915799653e-6