From 9d2e544bb59db6ced67bd5f4bde7cf59ce5df8a5 Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Fri, 29 Nov 2024 13:47:10 +0100 Subject: [PATCH 1/4] Add benchmark data and plots --- benchmarks/Project.toml | 16 + benchmarks/README.md | 44 + benchmarks/plots/Project.toml | 4 + .../plots/benchmark_ComplexF64_type1.svg | 1756 +++++++++++++++++ .../plots/benchmark_ComplexF64_type2.svg | 1756 +++++++++++++++++ benchmarks/plots/benchmark_Float64_type1.svg | 1731 ++++++++++++++++ benchmarks/plots/benchmark_Float64_type2.svg | 1731 ++++++++++++++++ benchmarks/plots/plot_benchmarks.jl | 142 ++ ...CuFINUFFT_256_ComplexF64_global_memory.dat | 22 + ...CuFINUFFT_256_ComplexF64_shared_memory.dat | 22 + .../results/FINUFFT_256_ComplexF64_CPU.dat | 22 + .../NonuniformFFTs_256_ComplexF64_CPU.dat | 21 + ...6_ComplexF64_CUDABackend_global_memory.dat | 21 + ...6_ComplexF64_CUDABackend_shared_memory.dat | 21 + .../NonuniformFFTs_256_Float64_CPU.dat | 20 + ..._256_Float64_CUDABackend_global_memory.dat | 21 + ..._256_Float64_CUDABackend_shared_memory.dat | 21 + benchmarks/run_benchmarks.jl | 428 ++++ 18 files changed, 7799 insertions(+) create mode 100644 benchmarks/Project.toml create mode 100644 benchmarks/README.md create mode 100644 benchmarks/plots/Project.toml create mode 100644 benchmarks/plots/benchmark_ComplexF64_type1.svg create mode 100644 benchmarks/plots/benchmark_ComplexF64_type2.svg create mode 100644 benchmarks/plots/benchmark_Float64_type1.svg create mode 100644 benchmarks/plots/benchmark_Float64_type2.svg create mode 100644 benchmarks/plots/plot_benchmarks.jl create mode 100644 benchmarks/results/CuFINUFFT_256_ComplexF64_global_memory.dat create mode 100644 benchmarks/results/CuFINUFFT_256_ComplexF64_shared_memory.dat create mode 100644 benchmarks/results/FINUFFT_256_ComplexF64_CPU.dat create mode 100644 benchmarks/results/NonuniformFFTs_256_ComplexF64_CPU.dat create mode 100644 benchmarks/results/NonuniformFFTs_256_ComplexF64_CUDABackend_global_memory.dat create mode 100644 benchmarks/results/NonuniformFFTs_256_ComplexF64_CUDABackend_shared_memory.dat create mode 100644 benchmarks/results/NonuniformFFTs_256_Float64_CPU.dat create mode 100644 benchmarks/results/NonuniformFFTs_256_Float64_CUDABackend_global_memory.dat create mode 100644 benchmarks/results/NonuniformFFTs_256_Float64_CUDABackend_shared_memory.dat create mode 100644 benchmarks/run_benchmarks.jl diff --git a/benchmarks/Project.toml b/benchmarks/Project.toml new file mode 100644 index 0000000..97f42a7 --- /dev/null +++ b/benchmarks/Project.toml @@ -0,0 +1,16 @@ +[deps] +Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" +BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341" +FINUFFT = "d8beea63-0952-562e-9c6a-8e8ef7364055" +KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +NonuniformFFTs = "cd96f58b-6017-4a02-bb9e-f4d81626177f" +Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" +ThreadPinning = "811555cd-349b-4f26-b7bc-1f208b848042" +TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" + +[extras] +CUDA_Runtime_jll = "76a88914-d11a-5bdc-97e0-2f5a05c973a2" diff --git a/benchmarks/README.md b/benchmarks/README.md new file mode 100644 index 0000000..fbf52b8 --- /dev/null +++ b/benchmarks/README.md @@ -0,0 +1,44 @@ +# Benchmarks + +The benchmark consists in type-1 and type-2 NUFFTs on a uniform 3D grid of +fixed dimensions $M^3 = 256^3$ (excluding oversampling). We vary the number of +non-uniform points $N$, so that the point density $ρ = N / M^3$ takes values +between $10^{-4}$ (very few points) and $10^1$ (very dense). +Points are randomly located in $[0, 2π)^3$. +Moreover, the relative tolerance is fixed to $10^{-6}$. +In NonuniformFFTs.jl, this can be achieved with the parameters `σ = 1.5` +(oversampling factor) and $m = HalfSupport(4)$ (see Accuracy page in the docs). + +The tests were run on a cluster with an AMD EPYC 7302 CPU (32 threads) and an +NVIDIA A100 GPU. + +The benchmarks compare NonuniformFFTs.jl v0.6.7 (26/11/2024) and FINUFFT v2.3.1. + +Each reported time includes (1) the time spent processing non-uniform points +(`set_points!` / `(cu)finufft_setpts!`) and (2) the time spent on the actual transform (`exec_type{1,2}!` / `(cu)finufft_exec!`). + +## FINUFFT set-up + +We used FINUFFT via its Julia wrapper [FINUFFT.jl](https://github.com/ludvigak/FINUFFT.jl) v3.3.0. For +performance reasons, the (Cu)FINUFFT libraries were compiled locally and the +FINUFFT.jl sources were modified accordingly as described +[here](https://github.com/ludvigak/FINUFFT.jl?tab=readme-ov-file#advanced-installation-and-locally-compiling-binaries). +FINUFFT was compiled with GCC 10.2.0 using CMake with its default flags in `Release` mode, which include `-fPIC -funroll-loops -O3 -march=native`. +Moreover, we set `CMAKE_CUDA_ARCHITECTURES=80` (for an NVIDIA A100) and used the `nvcc` compiler included in CUDA 12.3. + +All FINUFFT benchmarks were run with relative tolerance `1e-6`. +Moreover, the following options were used: + +- `modeord = 1` (use FFTW ordering, for consistency with NonuniformFFTs) +- `spread_sort = 1` (enable point sorting in CPU plans) +- `spread_kerevalmeth = 1` (use the recommended piecewise polynomial evaluation) +- `fftw = FFTW.ESTIMATE` (CPU plans) + +and for GPU plans: + +- `gpu_sort = 1` (enable point sorting) +- `gpu_kerevalmeth = 1` (use piecewise polynomial evaluation) +- `gpu_method = 1` (global memory method, non-uniform point driven) + +We also tried `gpu_method = 2` (based on shared memory) but found it to be +considerably slower in almost all cases (in three dimensions, at the requested tolerance). diff --git a/benchmarks/plots/Project.toml b/benchmarks/plots/Project.toml new file mode 100644 index 0000000..bbc29bb --- /dev/null +++ b/benchmarks/plots/Project.toml @@ -0,0 +1,4 @@ +[deps] +CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0" +DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab" +GLMakie = "e9467ef8-e4e7-5192-8a1a-b1aee30e663a" diff --git a/benchmarks/plots/benchmark_ComplexF64_type1.svg b/benchmarks/plots/benchmark_ComplexF64_type1.svg new file mode 100644 index 0000000..ac0a25a --- /dev/null +++ b/benchmarks/plots/benchmark_ComplexF64_type1.svg @@ -0,0 +1,1756 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/benchmarks/plots/benchmark_ComplexF64_type2.svg b/benchmarks/plots/benchmark_ComplexF64_type2.svg new file mode 100644 index 0000000..4769683 --- /dev/null +++ b/benchmarks/plots/benchmark_ComplexF64_type2.svg @@ -0,0 +1,1756 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/benchmarks/plots/benchmark_Float64_type1.svg b/benchmarks/plots/benchmark_Float64_type1.svg new file mode 100644 index 0000000..bd7db6f --- /dev/null +++ b/benchmarks/plots/benchmark_Float64_type1.svg @@ -0,0 +1,1731 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/benchmarks/plots/benchmark_Float64_type2.svg b/benchmarks/plots/benchmark_Float64_type2.svg new file mode 100644 index 0000000..f12b342 --- /dev/null +++ b/benchmarks/plots/benchmark_Float64_type2.svg @@ -0,0 +1,1731 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/benchmarks/plots/plot_benchmarks.jl b/benchmarks/plots/plot_benchmarks.jl new file mode 100644 index 0000000..0533d7f --- /dev/null +++ b/benchmarks/plots/plot_benchmarks.jl @@ -0,0 +1,142 @@ +using GLMakie +using CairoMakie +using DelimitedFiles + +GLMakie.activate!() + +function read_timings(io::IO; nufft_type::Int) + while Char(peek(io)) == '#' + readline(io) + end + data = readdlm(io, Float64) + (; Np = @view(data[:, 1]), times = @view(data[:, 1 + nufft_type]),) +end + +read_timings(fname::AbstractString; kws...) = open(io -> read_timings(io; kws...), fname) + +function plot_from_file!(ax::Axis, filename; nufft_type, zorder = nothing, kws...) + data = read_timings(filename; nufft_type) + (; Np, times,) = data + l = scatterlines!(ax, Np, times; kws...) + l.strokecolor[] = l.color[] + if zorder !== nothing + translate!(l, 0, 0, zorder) + end + l +end + +function plot_benchmark(::Type{T}; nufft_type = 1, save_svg = false,) where {T <: Number} + Ns = (256, 256, 256) + N = first(Ns) + Ngrid = prod(Ns) + Z = complex(T) # for FINUFFT (complex data only) + + fig = Figure(size = (750, 440)) + ax = Axis( + fig[1, 1]; + xscale = log10, yscale = log10, + xlabel = L"Number of nonuniform points $N$", xlabelsize = 16, + ylabel = "Time (s)", + xticks = LogTicks(0:20), xminorticks = IntervalsBetween(9), xminorticksvisible = true, + yticks = LogTicks(-8:3), yminorticks = IntervalsBetween(9), yminorticksvisible = true, + xgridvisible = false, ygridvisible = false, + limits = (nothing, (1e-3, 1e1)), + ) + limits_top = lift(ax.finallimits) do lims + xlims = Makie.xlimits(lims) ./ Ngrid + ylims = Makie.ylimits(lims) + xlims, ylims + end + ax_top = Axis( + fig[1, 1]; + xscale = ax.xscale, yscale = ax.yscale, + xaxisposition = :top, + # xgridvisible = true, ygridvisible = false, + xticks = LogTicks(-8:2), xminorticks = IntervalsBetween(9), xminorticksvisible = true, + xlabel = L"Point density $ρ = N / M^3$", xlabelsize = 16, + limits = limits_top, + ) + hidespines!(ax_top) + hideydecorations!(ax_top; grid = false) + + colours = Makie.wong_colors() + + kws_all = (; nufft_type,) + kws_cpu = (; marker = :x, markersize = 16, strokewidth = 0,) + kws_gpu = (; marker = :circle, markersize = 10, strokewidth = 2,) + kws_gpu_sm = (; kws_gpu..., markercolor = :transparent,) # open symbols + kws_nonuniform = (; linestyle = :solid, color = colours[1], zorder = 10,) + kws_finufft = (; linestyle = :dash, color = colours[2],) + + # Leftmost point of all CPU/GPU curves (for annotating later) + first_points_cpu = Point2{Float64}[] + first_points_gpu = Point2{Float64}[] + last_points_gpu = Point2{Float64}[] + + l = plot_from_file!(ax, "../results/NonuniformFFTs_$(N)_$(T)_CPU.dat"; label = "NonuniformFFTs CPU", kws_nonuniform..., kws_cpu..., kws_all...) + push!(first_points_cpu, l[1][][1]) # get first datapoint in line + + l = plot_from_file!(ax, "../results/NonuniformFFTs_$(N)_$(T)_CUDABackend_global_memory.dat"; label = "NonuniformFFTs GPU", kws_nonuniform..., kws_gpu..., kws_all...) + push!(first_points_gpu, l[1][][1]) # get first datapoint in line + push!(last_points_gpu, l[1][][end]) # get last datapoint in line + + l = plot_from_file!(ax, "../results/NonuniformFFTs_$(N)_$(T)_CUDABackend_shared_memory.dat"; label = "NonuniformFFTs GPU (SM)", kws_nonuniform..., kws_gpu_sm..., kws_all...) + push!(first_points_gpu, l[1][][1]) # get first datapoint in line + push!(last_points_gpu, l[1][][end]) # get last datapoint in line + + l = plot_from_file!(ax, "../results/FINUFFT_$(N)_$(Z)_CPU.dat"; label = "FINUFFT CPU", kws_finufft..., kws_cpu..., kws_all...) + push!(first_points_cpu, l[1][][1]) # get first datapoint in line + + l = plot_from_file!(ax, "../results/CuFINUFFT_$(N)_$(Z)_global_memory.dat"; label = "CuFINUFFT GPU", kws_finufft..., kws_gpu..., kws_all...) + push!(first_points_gpu, l[1][][1]) # get first datapoint in line + push!(last_points_gpu, l[1][][end]) # get last datapoint in line + + l = plot_from_file!(ax, "../results/CuFINUFFT_$(N)_$(Z)_shared_memory.dat"; label = "CuFINUFFT GPU (SM)", kws_finufft..., kws_gpu_sm..., kws_all...) + push!(first_points_gpu, l[1][][1]) # get first datapoint in line + push!(last_points_gpu, l[1][][end]) # get last datapoint in line + + let points = first_points_cpu, text = "CPU" + x = minimum(p -> p[1], points) # generally all points x are the same + y = maximum(p -> p[2], points) + text!(ax, x, y; text = rich(text; font = :bold), align = (:left, :bottom), offset = (0, 8), fontsize = 16) + end + + let points = first_points_gpu, text = "GPU" + x = minimum(p -> p[1], points) # generally all points x are the same + y = maximum(p -> p[2], points) + text!(ax, x, y; text = rich(text; font = :bold), align = (:left, :bottom), offset = (0, 8), fontsize = 16) + end + + let xs = logrange(0.5, 10.0; length = 3) + ymin, n = findmin(p -> p[2], last_points_gpu) + xmin = last_points_gpu[n][1] / Ngrid # as a density ρ + scale = ymin / xmin * 0.7 + ys = @. xs * scale + lines!(ax_top, xs, ys; linestyle = :dash, color = :black, linewidth = 3) + text!(ax_top, xs[2], ys[2]; text = L"∼N", align = (:left, :top), fontsize = 18) + end + + Label( + fig[1, 2][1, 1], + """ + Type-$(nufft_type) NUFFT + $(N)³ Fourier modes + $T data + 6-digit accuracy + """; + justification = :left, fontsize = 16, lineheight = 1.2, + ) + Legend(fig[1, 2][2, 1], ax; framevisible = false, rowgap = 8, labelsize = 14) + + if save_svg + save("benchmark_$(T)_type$(nufft_type).svg", fig; backend = CairoMakie) + end + + fig +end + +for T ∈ (Float64, ComplexF64), nufft_type ∈ (1, 2) + plot_benchmark(T; nufft_type, save_svg = true) +end + +fig = plot_benchmark(ComplexF64; nufft_type = 1) diff --git a/benchmarks/results/CuFINUFFT_256_ComplexF64_global_memory.dat b/benchmarks/results/CuFINUFFT_256_ComplexF64_global_memory.dat new file mode 100644 index 0000000..63f5a22 --- /dev/null +++ b/benchmarks/results/CuFINUFFT_256_ComplexF64_global_memory.dat @@ -0,0 +1,22 @@ +# CuFINUFFT +# Benchmark: NUFFT of scalar data +# - Device: NVIDIA A100 80GB PCIe +# - Element type: ComplexF64 +# - Grid size: (256, 256, 256) +# - Relative tolerance: 1.0e-6 +# - GPU sort: 1 +# - GPU kernel evaluation method: 1 +# - Order of Fourier modes (modeord): 1 +# - GPU method: 1 (global_memory) +# (1) Number of points (2) Type 1 (median, s) (3) Type 2 (median, s) (4) Relative error type 1 (5) Relative error type 2 +1678 0.010049466 0.010189318 1.3870120840117604e-6 3.1010784379441936e-7 +5305 0.010149413 0.0102016 1.3863655070435886e-6 3.1806259113920807e-7 +16777 0.010432813 0.0102544095 1.3870558057308985e-6 3.4374717946022066e-7 +53054 0.01117762 0.010611818 1.3888666894696818e-6 4.022040070391591e-7 +167772 0.01320493 0.011400021 1.3864687098220945e-6 5.396416394710086e-7 +530542 0.017729944 0.013321508 1.3879307982260866e-6 8.077188843675696e-7 +1677722 0.030144937 0.0174889445 1.3883252357972235e-6 1.1943118827017768e-6 +5305422 0.071517286 0.031759459 1.3881328727539812e-6 1.57036542866348e-6 +16777216 0.2098746265 0.070962529 1.3868072524375616e-6 1.803777670735224e-6 +53054215 0.741148733 0.182403937 1.388143599891377e-6 1.9030418314879922e-6 +167772160 2.606663643 0.5031452625 1.387107956477105e-6 1.9385382768826457e-6 diff --git a/benchmarks/results/CuFINUFFT_256_ComplexF64_shared_memory.dat b/benchmarks/results/CuFINUFFT_256_ComplexF64_shared_memory.dat new file mode 100644 index 0000000..0d98c58 --- /dev/null +++ b/benchmarks/results/CuFINUFFT_256_ComplexF64_shared_memory.dat @@ -0,0 +1,22 @@ +# CuFINUFFT +# Benchmark: NUFFT of scalar data +# - Device: NVIDIA A100 80GB PCIe +# - Element type: ComplexF64 +# - Grid size: (256, 256, 256) +# - Relative tolerance: 1.0e-6 +# - GPU sort: 1 +# - GPU kernel evaluation method: 1 +# - Order of Fourier modes (modeord): 1 +# - GPU method: 2 (shared_memory) +# (1) Number of points (2) Type 1 (median, s) (3) Type 2 (median, s) (4) Relative error type 1 (5) Relative error type 2 +1678 0.011692011 0.010875831 1.387012084011762e-6 3.101078437955394e-7 +5305 0.014491329 0.011954708 1.3863655070436594e-6 3.180625911384002e-7 +16777 0.022250966 0.014949265 1.3870558057312356e-6 3.437471794564674e-7 +53054 0.0403717795 0.021438011 1.3888666894692987e-6 4.0220400703767747e-7 +167772 0.076147689 0.032283476 1.3864687098221654e-6 5.39641639470162e-7 +530542 0.1424077785 0.046160006 1.3879307982261694e-6 8.07718884367156e-7 +1677722 0.24217535 0.061705163 1.3883252357980463e-6 1.1943118827025802e-6 +5305422 0.3729179835 0.077908472 1.3881328727557852e-6 1.5703654286660455e-6 +16777216 0.59926713 0.099986922 1.3868072524389908e-6 1.8037776707365977e-6 +53054215 1.140496802 0.152142785 1.3881435998922012e-6 1.9030418314885585e-6 +167772160 2.7159741345 0.309835595 1.3871079564877375e-6 1.9385382768972833e-6 diff --git a/benchmarks/results/FINUFFT_256_ComplexF64_CPU.dat b/benchmarks/results/FINUFFT_256_ComplexF64_CPU.dat new file mode 100644 index 0000000..633fe33 --- /dev/null +++ b/benchmarks/results/FINUFFT_256_ComplexF64_CPU.dat @@ -0,0 +1,22 @@ +# FINUFFT (CPU) +# Benchmark: NUFFT of scalar data +# - Device: AMD EPYC 7302 16-Core Processor (znver2, 32 threads +# - Number of threads: 32 +# - Element type: ComplexF64 +# - Grid size: (256, 256, 256) +# - Relative tolerance: 1.0e-6 +# - Sort (spread_sort): 1 +# - Kernel evaluation method (spread_kerevalmeth): 1 +# - Order of Fourier modes (modeord): 1 +# (1) Number of points (2) Type 1 (median, s) (3) Type 2 (median, s) (4) Relative error type 1 (5) Relative error type 2 +1678 0.5005381625 0.253770516 3.731950120546327e-6 6.646216249772271e-7 +5305 0.552043342 0.239666509 3.7290631645038357e-6 6.72066525127654e-7 +16777 0.577708378 0.261397829 3.734294780588139e-6 7.464022115781271e-7 +53054 0.673337338 0.274950167 3.7239928569652815e-6 9.119287514989875e-7 +167772 0.729330451 0.283593921 3.729391822696644e-6 1.3275103721977031e-6 +530542 0.738622538 0.32117297 3.7359429392970702e-6 2.0583360832214665e-6 +1677722 0.800928979 0.368688791 3.7286290440589845e-6 3.078078920315979e-6 +5305422 0.9500394215 0.5086966405 3.7276066153558594e-6 4.075858583640244e-6 +16777216 1.219703752 0.916775313 3.7307501803740638e-6 4.693500207626813e-6 +53054215 2.09705826 1.437311869 3.724478960801549e-6 4.947480478356764e-6 +167772160 4.7336734635 3.0110193405 3.7328363128680543e-6 5.056647405745642e-6 diff --git a/benchmarks/results/NonuniformFFTs_256_ComplexF64_CPU.dat b/benchmarks/results/NonuniformFFTs_256_ComplexF64_CPU.dat new file mode 100644 index 0000000..f2c9ce2 --- /dev/null +++ b/benchmarks/results/NonuniformFFTs_256_ComplexF64_CPU.dat @@ -0,0 +1,21 @@ +# NonuniformFFTs.jl using CPU +# Benchmark: NUFFT of scalar data +# - Backend: CPU +# - Device: AMD EPYC 7302 16-Core Processor (znver2, 32 threads +# - Element type: ComplexF64 +# - Grid size: (256, 256, 256) +# - Oversampling factor: 1.5 +# - Half support: HalfSupport{4}() +# - Number of threads: 32 +# (1) Number of points (2) Type 1 (median, s) (3) Type 2 (median, s) (4) Relative error type 1 (5) Relative error type 2 +1678 0.401702684 0.217824823 1.281824057186365e-6 3.011939154273146e-7 +5305 0.478089718 0.226900624 1.277407211115232e-6 3.060539615508211e-7 +16777 0.556644054 0.226762416 1.2776077397706963e-6 3.201846369029206e-7 +53054 0.6612915435 0.24357653 1.2839845138440086e-6 3.7866796040564127e-7 +167772 0.705163086 0.264036265 1.286857150143559e-6 5.066732411096722e-7 +530542 0.791394353 0.282972265 1.2874042722043801e-6 7.458672638907114e-7 +1677722 0.803317388 0.3235546355 1.2875233459096014e-6 1.0983844251011532e-6 +5305422 0.890094409 0.4209180075 1.285417860634058e-6 1.4413882842227143e-6 +16777216 1.108990892 0.875500309 1.2870935078768847e-6 1.65653108658738e-6 +53054215 1.807307983 1.752785424 1.2862609599745603e-6 1.745912493773992e-6 +167772160 5.076364579 4.7441131055 1.2854795078661256e-6 1.7777658759107162e-6 diff --git a/benchmarks/results/NonuniformFFTs_256_ComplexF64_CUDABackend_global_memory.dat b/benchmarks/results/NonuniformFFTs_256_ComplexF64_CUDABackend_global_memory.dat new file mode 100644 index 0000000..bf9ebe7 --- /dev/null +++ b/benchmarks/results/NonuniformFFTs_256_ComplexF64_CUDABackend_global_memory.dat @@ -0,0 +1,21 @@ +# NonuniformFFTs.jl using CUDABackend +# Benchmark: NUFFT of scalar data +# - Backend: CUDABackend +# - Device: NVIDIA A100 80GB PCIe +# - Element type: ComplexF64 +# - Grid size: (256, 256, 256) +# - Oversampling factor: 1.5 +# - Half support: HalfSupport{4}() +# - GPU method: global_memory +# (1) Number of points (2) Type 1 (median, s) (3) Type 2 (median, s) (4) Relative error type 1 (5) Relative error type 2 +1678 0.007174481 0.0069437445 1.7461299210416866e-6 3.51735849884217e-7 +5305 0.007340271 0.006967824 1.7448299030724668e-6 3.5935428480849213e-7 +16777 0.007364155 0.006959589 1.7441440240783446e-6 3.8526407361623966e-7 +53054 0.008016956 0.0071371565 1.7459043209583552e-6 4.681070024635566e-7 +167772 0.01129348 0.0080823985 1.744026447406056e-6 6.545796093721018e-7 +530542 0.018176627 0.0098429595 1.7433317941794872e-6 9.95894287098444e-7 +1677722 0.036135638 0.013180793 1.7455342688942634e-6 1.485682919911522e-6 +5305422 0.095645657 0.023485831 1.743499377366084e-6 1.961321173351077e-6 +16777216 0.308876142 0.057238519 1.7457314354108132e-6 2.2579240564636625e-6 +53054215 1.102950989 0.166080969 1.742616495241337e-6 2.3789078434566558e-6 +167772160 3.9515899275 0.51076222 1.743980969237664e-6 2.4261231727166925e-6 diff --git a/benchmarks/results/NonuniformFFTs_256_ComplexF64_CUDABackend_shared_memory.dat b/benchmarks/results/NonuniformFFTs_256_ComplexF64_CUDABackend_shared_memory.dat new file mode 100644 index 0000000..510feb8 --- /dev/null +++ b/benchmarks/results/NonuniformFFTs_256_ComplexF64_CUDABackend_shared_memory.dat @@ -0,0 +1,21 @@ +# NonuniformFFTs.jl using CUDABackend +# Benchmark: NUFFT of scalar data +# - Backend: CUDABackend +# - Device: NVIDIA A100 80GB PCIe +# - Element type: ComplexF64 +# - Grid size: (256, 256, 256) +# - Oversampling factor: 1.5 +# - Half support: HalfSupport{4}() +# - GPU method: shared_memory +# (1) Number of points (2) Type 1 (median, s) (3) Type 2 (median, s) (4) Relative error type 1 (5) Relative error type 2 +1678 0.017566898 0.013264335 1.7461299210424742e-6 3.517358498891171e-7 +5305 0.018137655 0.013418704 1.7448299030716066e-6 3.59354284813164e-7 +16777 0.019053637 0.013943315 1.7441440240776934e-6 3.852640736163668e-7 +53054 0.020030834 0.014643423 1.7459043209587112e-6 4.681070024594046e-7 +167772 0.021095564 0.015789195 1.7440264474057985e-6 6.545796093692288e-7 +530542 0.022440168 0.0175126215 1.7433317941788803e-6 9.958942870956675e-7 +1677722 0.026032583 0.020145207 1.7455342688943773e-6 1.4856829199119722e-6 +5305422 0.0378931535 0.0263968385 1.7434993773628165e-6 1.9613211733482794e-6 +16777216 0.078095254 0.046994287 1.7457314354124795e-6 2.2579240564680298e-6 +53054215 0.2090714075 0.114592533 1.7426164952268956e-6 2.3789078434342945e-6 +167772160 0.6257227505 0.3285903345 1.743980969245702e-6 2.426123172726822e-6 diff --git a/benchmarks/results/NonuniformFFTs_256_Float64_CPU.dat b/benchmarks/results/NonuniformFFTs_256_Float64_CPU.dat new file mode 100644 index 0000000..b40da43 --- /dev/null +++ b/benchmarks/results/NonuniformFFTs_256_Float64_CPU.dat @@ -0,0 +1,20 @@ +# NonuniformFFTs.jl using CPU +# Benchmark: NUFFT of scalar data +# - Backend: CPU +# - Device: AMD EPYC 7302 16-Core Processor (znver2, 32 threads +# - Element type: Float64 +# - Grid size: (256, 256, 256) +# - Oversampling factor: 1.5 +# - Half support: HalfSupport{4}() +# - Number of threads: 32 +# (1) Number of points (2) Type 1 (median, s) (3) Type 2 (median, s) (4) Relative error type 1 (5) Relative error type 2 +1678 0.157921409 0.057483433 1.2992421887135856e-6 3.0117054596563075e-7 +5305 0.185425857 0.054794236 1.2840271425855802e-6 2.9623411061857803e-7 +16777 0.2107738795 0.060259554 1.2883263838044824e-6 3.169193033522629e-7 +53054 0.259005555 0.067185219 1.2977247862698859e-6 3.801684137736015e-7 +167772 0.299163513 0.082973939 1.2934667124620822e-6 5.032368971604971e-7 +530542 0.295613095 0.093088267 1.2963805116600738e-6 7.476721873945614e-7 +1677722 0.338260935 0.122719703 1.296796876868053e-6 1.1055503659183577e-6 +5305422 0.4419285785 0.217629578 1.2973073932578797e-6 1.4521190807869027e-6 +16777216 0.650143119 0.586211104 1.2963264136956878e-6 1.6655878400956973e-6 +53054215 1.364857875 1.474264205 1.297942847667679e-6 1.7599064232383652e-6 diff --git a/benchmarks/results/NonuniformFFTs_256_Float64_CUDABackend_global_memory.dat b/benchmarks/results/NonuniformFFTs_256_Float64_CUDABackend_global_memory.dat new file mode 100644 index 0000000..a0c7be7 --- /dev/null +++ b/benchmarks/results/NonuniformFFTs_256_Float64_CUDABackend_global_memory.dat @@ -0,0 +1,21 @@ +# NonuniformFFTs.jl using CUDABackend +# Benchmark: NUFFT of scalar data +# - Backend: CUDABackend +# - Device: NVIDIA A100 80GB PCIe +# - Element type: Float64 +# - Grid size: (256, 256, 256) +# - Oversampling factor: 1.5 +# - Half support: HalfSupport{4}() +# - GPU method: global_memory +# (1) Number of points (2) Type 1 (median, s) (3) Type 2 (median, s) (4) Relative error type 1 (5) Relative error type 2 +1678 0.003320917 0.003230889 1.7468019419364787e-6 3.5090636814544796e-7 +5305 0.003324334 0.003240768 1.7436883516306644e-6 3.569900784629132e-7 +16777 0.003340074 0.003251238 1.7428919558873212e-6 3.933815099546998e-7 +53054 0.003735293 0.003365211 1.74393734550516e-6 4.7633118362671775e-7 +167772 0.00541354 0.00402824 1.7443022952469846e-6 6.543784470139837e-7 +530542 0.008519906 0.006309474 1.7447103632569267e-6 9.96697648602348e-7 +1677722 0.01911457 0.008761147 1.7430259600594823e-6 1.4888903328036116e-6 +5305422 0.051812251 0.015686412 1.7458544701717223e-6 1.9669560205696403e-6 +16777216 0.176896208 0.039465031 1.7465137652756605e-6 2.2609508058937542e-6 +53054215 0.6754469325 0.1147949335 1.7441674213003973e-6 2.3841887129635894e-6 +167772160 2.503391267 0.351748967 1.7445912696046888e-6 2.428869915799653e-6 diff --git a/benchmarks/results/NonuniformFFTs_256_Float64_CUDABackend_shared_memory.dat b/benchmarks/results/NonuniformFFTs_256_Float64_CUDABackend_shared_memory.dat new file mode 100644 index 0000000..3a06ed4 --- /dev/null +++ b/benchmarks/results/NonuniformFFTs_256_Float64_CUDABackend_shared_memory.dat @@ -0,0 +1,21 @@ +# NonuniformFFTs.jl using CUDABackend +# Benchmark: NUFFT of scalar data +# - Backend: CUDABackend +# - Device: NVIDIA A100 80GB PCIe +# - Element type: Float64 +# - Grid size: (256, 256, 256) +# - Oversampling factor: 1.5 +# - Half support: HalfSupport{4}() +# - GPU method: shared_memory +# (1) Number of points (2) Type 1 (median, s) (3) Type 2 (median, s) (4) Relative error type 1 (5) Relative error type 2 +1678 0.008353976 0.0070949475 1.7468019419382034e-6 3.5090636817904113e-7 +5305 0.008659046 0.0072636425 1.7436883516277096e-6 3.5699007845828123e-7 +16777 0.009093549 0.007541187 1.7428919558873898e-6 3.9338150996385125e-7 +53054 0.009523572 0.007862112 1.7439373455033794e-6 4.7633118363122153e-7 +167772 0.0100324315 0.008284588 1.7443022952456069e-6 6.543784470131989e-7 +530542 0.011138013 0.008922181 1.744710363257602e-6 9.966976486010409e-7 +1677722 0.014576812 0.010465275 1.743025960053548e-6 1.4888903327977372e-6 +5305422 0.025735163 0.0149117685 1.7458544701617326e-6 1.9669560205599723e-6 +16777216 0.063339967 0.030928607 1.7465137652730675e-6 2.26095080589055e-6 +53054215 0.1827624215 0.08283842 1.7441674213202952e-6 2.3841887129839585e-6 +167772160 0.560682647 0.246603554 1.7445912696151422e-6 2.4288699158223415e-6 diff --git a/benchmarks/run_benchmarks.jl b/benchmarks/run_benchmarks.jl new file mode 100644 index 0000000..8377b48 --- /dev/null +++ b/benchmarks/run_benchmarks.jl @@ -0,0 +1,428 @@ +using NonuniformFFTs +using FINUFFT +using Adapt +using CUDA +using FFTW +using LinearAlgebra: norm +using KernelAbstractions +using KernelAbstractions: KernelAbstractions as KA +using ThreadPinning +using Random: randn!, Xoshiro +using BenchmarkTools +using Statistics +using TimerOutputs + +# NOTE: FINUFFT doesn't play well with ThreadPinning, so pinthreads should *not* be used +# for FINUFFT (CPU) benchmarks! Seems to be a conflict between ThreadPinning and OpenMP. +if haskey(ENV, "SLURM_JOB_ID") + pinthreads(:affinitymask) +else + pinthreads(:cores) +end + +# Useful for FINUFFT (CPU): +ENV["OMP_PROC_BIND"] = "true" +# ENV["OMP_PLACES"] = "cores" +ENV["OMP_NUM_THREADS"] = Threads.nthreads() + +get_device_name(::CPU) = string(Sys.cpu_info()[1].model, " ($(Sys.CPU_NAME), $(Sys.CPU_THREADS) threads)") + +function get_device_name(::CUDABackend) + dev = CUDA.device() + io = IOBuffer() + show(io, MIME"text/plain"(), dev) # e.g. "CuDevice(1): NVIDIA A100 80GB PCIe" + s = String(take!(io)) + split(s, " "; limit = 2)[2] +end + +function bench_nonuniformffts( + ::Type{Z}, backend::KA.Backend, Ns::Dims, Np; + σ, m, gpu_method = :global_memory, kws..., + ) where {Z} + D = length(Ns) + T = real(Z) + rng = Xoshiro(42) + xp = ntuple(D) do _ + adapt(backend, randn(rng, T, Np)) + end + vp = adapt(backend, randn(rng, Z, Np)) + wp = similar(vp) + + p = PlanNUFFT(Z, Ns; backend, σ, m, gpu_method, kws...) + ûs = similar(vp, complex(T), size(p)) + + # Execute plan once + set_points!(p, xp) + exec_type1!(ûs, p, vp) + exec_type2!(wp, p, ûs) + + # Compare with reference case (high accuracy: rtol = 1e-14) + relative_errors = let p_ref = PlanNUFFT(Z, Ns; backend, kws..., gpu_method = :global_memory, m = HalfSupport(8), σ = T(2.0)) + ûs_ref = similar(ûs) + wp_ref = similar(wp) + set_points!(p_ref, xp) + exec_type1!(ûs_ref, p_ref, vp) + exec_type2!(wp_ref, p_ref, ûs_ref) + local type1 = norm(ûs_ref - ûs) / norm(ûs_ref) + local type2 = norm(wp_ref - wp) / norm(wp_ref) + KA.unsafe_free!(ûs_ref) + KA.unsafe_free!(wp_ref) + (; type1, type2,) + end + + if backend isa KA.CPU + reset_timer!(p.timer) + end + + type1 = @benchmark let + set_points!($p, $xp) + exec_type1!($ûs, $p, $vp) + KA.synchronize($backend) + end + + type2 = @benchmark let + set_points!($p, $xp) + exec_type2!($wp, $p, $ûs) + KA.synchronize($backend) + end + + if backend isa KA.CPU + println(p.timer) + end + + (; type1, type2, relative_errors,) +end + +function bench_finufft_cpu(::Type{Z}, Ns::Dims, Np; tol, opts...) where {Z <: Complex} + backend = CPU() + T = real(Z) + D = length(Ns) + rng = Xoshiro(42) + xp = ntuple(D) do _ + adapt(backend, randn(rng, T, Np)) + end + vp = adapt(backend, randn(rng, Z, Np)) + ûs = similar(vp, complex(T), Ns) + ntrans = 1 + n_modes = collect(Ns) + plan_type1 = finufft_makeplan(1, n_modes, -1, ntrans, tol; dtype = T, opts...) + finalizer(finufft_destroy!, plan_type1) + type1 = @benchmark let + finufft_setpts!($plan_type1, $xp...) + finufft_exec!($plan_type1, $vp, $ûs) + KA.synchronize($backend) + end + (; type1,) +end + +function bench_cufinufft(::Type{Z}, Ns::Dims, Np; tol, opts...) where {Z <: Complex} + backend = CUDABackend() + T = real(Z) + D = length(Ns) + rng = Xoshiro(42) + xp = ntuple(D) do _ + adapt(backend, randn(rng, T, Np)) + end + vp = adapt(backend, randn(rng, Z, Np)) + wp = similar(vp) + ûs = similar(vp, complex(T), Ns) + ntrans = 1 + n_modes = collect(Ns) + + plan_type1 = cufinufft_makeplan(1, n_modes, -1, ntrans, tol; dtype = T, opts...) + plan_type2 = cufinufft_makeplan(2, n_modes, +1, ntrans, tol; dtype = T, opts...) + finalizer(cufinufft_destroy!, plan_type1) + finalizer(cufinufft_destroy!, plan_type2) + + # Execute plans once + cufinufft_setpts!(plan_type1, xp...) + cufinufft_setpts!(plan_type2, xp...) + cufinufft_exec!(plan_type1, vp, ûs) + cufinufft_exec!(plan_type2, ûs, wp) + + # Compare with reference case (high accuracy: rtol = 1e-14) + relative_errors = let tol = 1e-14 + p1_ref = cufinufft_makeplan(1, n_modes, -1, ntrans, tol; dtype = T, opts..., gpu_method = 1, upsampfac = 2.0) + p2_ref = cufinufft_makeplan(2, n_modes, +1, ntrans, tol; dtype = T, opts..., gpu_method = 1, upsampfac = 2.0) + + ûs_ref = similar(ûs) + wp_ref = similar(wp) + + cufinufft_setpts!(p1_ref, xp...) + cufinufft_setpts!(p2_ref, xp...) + cufinufft_exec!(p1_ref, vp, ûs_ref) + cufinufft_exec!(p2_ref, ûs_ref, wp_ref) + + cufinufft_destroy!(p1_ref) + cufinufft_destroy!(p2_ref) + + local type1 = norm(ûs_ref - ûs) / norm(ûs_ref) + local type2 = norm(wp_ref - wp) / norm(wp_ref) + + KA.unsafe_free!(ûs_ref) + KA.unsafe_free!(wp_ref) + + (; type1, type2,) + end + + type1 = @benchmark let + cufinufft_setpts!($plan_type1, $xp...) + cufinufft_exec!($plan_type1, $vp, $ûs) + KA.synchronize($backend) + end + + type2 = @benchmark let + cufinufft_setpts!($plan_type2, $xp...) + cufinufft_exec!($plan_type2, $ûs, $wp) + KA.synchronize($backend) + end + + (; type1, type2, relative_errors,) +end + +function bench_finufft_cpu(::Type{Z}, Ns::Dims, Np; tol, opts...) where {Z <: Complex} + T = real(Z) + D = length(Ns) + rng = Xoshiro(42) + xp = ntuple(D) do _ + randn(rng, T, Np) + end + vp = randn(rng, Z, Np) + wp = similar(vp) + ûs = similar(vp, complex(T), Ns) + ntrans = 1 + n_modes = collect(Ns) + + plan_type1 = finufft_makeplan(1, n_modes, -1, ntrans, tol; dtype = T, opts...) + plan_type2 = finufft_makeplan(2, n_modes, +1, ntrans, tol; dtype = T, opts...) + finalizer(finufft_destroy!, plan_type1) + finalizer(finufft_destroy!, plan_type2) + + # Execute plans once + finufft_setpts!(plan_type1, xp...) + finufft_setpts!(plan_type2, xp...) + finufft_exec!(plan_type1, vp, ûs) + finufft_exec!(plan_type2, ûs, wp) + + # Compare with reference case (high accuracy: rtol = 1e-14) + relative_errors = let tol = 1e-14 + p1_ref = finufft_makeplan(1, n_modes, -1, ntrans, tol; dtype = T, opts..., upsampfac = 2.0) + p2_ref = finufft_makeplan(2, n_modes, +1, ntrans, tol; dtype = T, opts..., upsampfac = 2.0) + + ûs_ref = similar(ûs) + wp_ref = similar(wp) + + finufft_setpts!(p1_ref, xp...) + finufft_setpts!(p2_ref, xp...) + finufft_exec!(p1_ref, vp, ûs_ref) + finufft_exec!(p2_ref, ûs_ref, wp_ref) + + finufft_destroy!(p1_ref) + finufft_destroy!(p2_ref) + + local type1 = norm(ûs_ref - ûs) / norm(ûs_ref) + local type2 = norm(wp_ref - wp) / norm(wp_ref) + + (; type1, type2,) + end + + type1 = @benchmark let + finufft_setpts!($plan_type1, $xp...) + finufft_exec!($plan_type1, $vp, $ûs) + end + + type2 = @benchmark let + finufft_setpts!($plan_type2, $xp...) + finufft_exec!($plan_type2, $ûs, $wp) + end + + (; type1, type2, relative_errors,) +end + +function run_benchmark_nonuniformffts( + ::Type{Z}, + backend, + Ns::Dims, + Nps::AbstractVector; # number of points per test case + σ = 1.5, m = HalfSupport(4), # rtol = 1e-6 + gpu_method = :global_memory, + fftw_flags = FFTW.ESTIMATE, + params..., + ) where {Z <: Number} + N = first(Ns) + suffix = if backend isa GPU + "_$gpu_method" + else + "" + end + device_name = get_device_name(backend) + filename = "NonuniformFFTs_$(N)_$(Z)_$(typeof(backend))$(suffix).dat" + open(filename, "w") do io + println(io, "# NonuniformFFTs.jl using $(typeof(backend))") + println(io, "# Benchmark: NUFFT of scalar data") + println(io, "# - Backend: ", typeof(backend)) + println(io, "# - Device: ", device_name) + println(io, "# - Element type: ", Z) + println(io, "# - Grid size: ", Ns) + println(io, "# - Oversampling factor: ", σ) + println(io, "# - Half support: ", m) + if backend isa KA.GPU + println(io, "# - GPU method: ", gpu_method) + else + println(io, "# - Number of threads: ", Threads.nthreads()) + end + println(io, "# (1) Number of points (2) Type 1 (median, s) (3) Type 2 (median, s) (4) Relative error type 1 (5) Relative error type 2") + for Np ∈ Nps + bench = bench_nonuniformffts(Z, backend, Ns, Np; params..., σ, m, gpu_method, fftw_flags) + type1 = median(bench.type1.times) / 1e9 # in seconds + type2 = median(bench.type2.times) / 1e9 # in seconds + @show Np, type1, type2, bench.relative_errors.type1, bench.relative_errors.type2 + join(io, (Np, type1, type2, bench.relative_errors.type1, bench.relative_errors.type2), '\t') + print(io, '\n') + flush(io) + end + end + filename +end + +function run_benchmark_cufinufft( + ::Type{Z}, + Ns::Dims, + Nps::AbstractVector; # number of points per test case + tol = 1e-6, + modeord = 1, # convention used by default in NonuniformFFTs + gpu_method = 1, # 1: global memory (-sort) method | 2: shared memory | method 0 or 2 is much slower, at least with Float64 data + gpu_sort = 1, # needed for good performance + gpu_kerevalmeth = 1, # setting it to 0 (direct evaluation) doesn't make a big difference? + params..., + ) where {Z <: Complex} + N = first(Ns) + method = gpu_method == 1 ? :global_memory : :shared_memory + # Make sure we use the current CUDA device and stream (otherwise KA.synchronize won't + # work as expected). + params_cuda = ( + gpu_device_id = CUDA.deviceid(CUDA.device()), + gpu_stream = Base.unsafe_convert(Ptr{CUDA.CUstream_st}, CUDA.stream()), + ) + device_name = get_device_name(CUDABackend()) + filename = "CuFINUFFT_$(N)_$(Z)_$(method).dat" + open(filename, "w") do io + println(io, "# CuFINUFFT") + println(io, "# Benchmark: NUFFT of scalar data") + println(io, "# - Device: ", device_name) + println(io, "# - Element type: ", Z) + println(io, "# - Grid size: ", Ns) + println(io, "# - Relative tolerance: ", tol) + println(io, "# - GPU sort: ", gpu_sort) + println(io, "# - GPU kernel evaluation method: ", gpu_kerevalmeth) + println(io, "# - Order of Fourier modes (modeord): ", modeord) + println(io, "# - GPU method: ", gpu_method, " ($method)") + println(io, "# (1) Number of points (2) Type 1 (median, s) (3) Type 2 (median, s) (4) Relative error type 1 (5) Relative error type 2") + for Np ∈ Nps + bench = bench_cufinufft(Z, Ns, Np; params..., params_cuda..., tol, modeord, gpu_method, gpu_sort, gpu_kerevalmeth) + type1 = median(bench.type1.times) / 1e9 # in seconds + type2 = median(bench.type2.times) / 1e9 # in seconds + @show Np, type1, type2, bench.relative_errors.type1, bench.relative_errors.type2 + join(io, (Np, type1, type2, bench.relative_errors.type1, bench.relative_errors.type2), '\t') + print(io, '\n') + flush(io) + end + end + filename +end + +function run_benchmark_finufft_cpu( + ::Type{Z}, + Ns::Dims, + Nps::AbstractVector; # number of points per test case + tol = 1e-6, + modeord = 1, # convention used by default in NonuniformFFTs + spread_sort = 1, # needed for good performance + spread_kerevalmeth = 1, + fftw = FFTW.ESTIMATE, + nthreads = Threads.nthreads(), + params..., + ) where {Z <: Complex} + N = first(Ns) + device_name = get_device_name(CPU()) + filename = "FINUFFT_$(N)_$(Z)_CPU.dat" + open(filename, "w") do io + println(io, "# FINUFFT (CPU)") + println(io, "# Benchmark: NUFFT of scalar data") + println(io, "# - Device: ", device_name) + println(io, "# - Number of threads: ", nthreads) + println(io, "# - Element type: ", Z) + println(io, "# - Grid size: ", Ns) + println(io, "# - Relative tolerance: ", tol) + println(io, "# - Sort (spread_sort): ", spread_sort) + println(io, "# - Kernel evaluation method (spread_kerevalmeth): ", spread_kerevalmeth) + println(io, "# - Order of Fourier modes (modeord): ", modeord) + println(io, "# (1) Number of points (2) Type 1 (median, s) (3) Type 2 (median, s) (4) Relative error type 1 (5) Relative error type 2") + for Np ∈ Nps + bench = bench_finufft_cpu(Z, Ns, Np; params..., tol, fftw, nthreads, modeord, spread_sort, spread_kerevalmeth) + type1 = median(bench.type1.times) / 1e9 # in seconds + type2 = median(bench.type2.times) / 1e9 # in seconds + @show Np, type1, type2, bench.relative_errors.type1, bench.relative_errors.type2 + join(io, (Np, type1, type2, bench.relative_errors.type1, bench.relative_errors.type2), '\t') + print(io, '\n') + flush(io) + end + end + filename +end + +function run_all_benchmarks() + T = Float64 + Z = complex(T) + Ngrid = 256 + Ns = (1, 1, 1) .* Ngrid + + # Parameters for 1e-6 accuracy + σ = 1.5 + m = HalfSupport(4) + + ρs = exp10.(-4:0.5:1) # point densities + Nprod = prod(Ns) + Nps = map(ρs) do ρ + round(Int, ρ * Nprod) + end + + # Real data + run_benchmark_nonuniformffts(T, CUDABackend(), Ns, Nps; σ, m, gpu_method = :global_memory) + run_benchmark_nonuniformffts(T, CUDABackend(), Ns, Nps; σ, m, gpu_method = :shared_memory) + run_benchmark_nonuniformffts(T, CPU(), Ns, Nps; σ, m) + + # Complex data + run_benchmark_nonuniformffts(Z, CUDABackend(), Ns, Nps; σ, m, gpu_method = :global_memory) + run_benchmark_nonuniformffts(Z, CUDABackend(), Ns, Nps; σ, m, gpu_method = :shared_memory) + run_benchmark_nonuniformffts(Z, CPU(), Ns, Nps; σ, m) + + # CuFINUFFT + params_cufinufft = (; + gpu_sort = 1, # needed for good performance + gpu_kerevalmeth = 1, # setting it to 0 (direct evaluation) doesn't make a big difference? + tol = 1e-6, + ) + + gpu_method = 1 # global memory (GM-sort) + run_benchmark_cufinufft(Z, Ns, Nps; gpu_method, params_cufinufft...) + + gpu_method = 2 # shared memory (SM) + run_benchmark_cufinufft(Z, Ns, Nps; gpu_method, params_cufinufft...) + + # FINUFFT CPU + params_finufft = (; + spread_sort = 1, + spread_kerevalmeth = 1, + tol = 1e-6, + ) + run_benchmark_finufft_cpu(Z, Ns, Nps; params_finufft...) + + nothing +end + +outdir = "results" +mkpath(outdir) +cd(outdir) do + run_all_benchmarks() +end From c3f26440215b0d8a8b30da661217aee5425351f6 Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Fri, 29 Nov 2024 16:31:43 +0100 Subject: [PATCH 2/4] Add benchmark page to docs --- benchmarks/README.md | 23 +++++-- docs/Project.toml | 1 + docs/make.jl | 19 ++++++ docs/src/accuracy.md | 2 +- docs/src/assets/benchmarks.css | 9 +++ docs/src/assets/citations.css | 18 ++++++ docs/src/benchmarks.md | 115 +++++++++++++++++++++++++++++++++ docs/src/benchmarks/.gitignore | 2 + docs/src/index.md | 5 ++ docs/src/refs.bib | 9 +++ 10 files changed, 198 insertions(+), 5 deletions(-) create mode 100644 docs/src/assets/benchmarks.css create mode 100644 docs/src/assets/citations.css create mode 100644 docs/src/benchmarks.md create mode 100644 docs/src/benchmarks/.gitignore create mode 100644 docs/src/refs.bib diff --git a/benchmarks/README.md b/benchmarks/README.md index fbf52b8..469bc73 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -1,13 +1,14 @@ # Benchmarks -The benchmark consists in type-1 and type-2 NUFFTs on a uniform 3D grid of +The benchmarks consist in type-1 and type-2 NUFFTs on a uniform 3D grid of fixed dimensions $M^3 = 256^3$ (excluding oversampling). We vary the number of non-uniform points $N$, so that the point density $ρ = N / M^3$ takes values between $10^{-4}$ (very few points) and $10^1$ (very dense). -Points are randomly located in $[0, 2π)^3$. -Moreover, the relative tolerance is fixed to $10^{-6}$. +Points are randomly located in $[0, 2π)^3$ using a uniform distribution. +The relative tolerance is fixed to $10^{-6}$. In NonuniformFFTs.jl, this can be achieved with the parameters `σ = 1.5` -(oversampling factor) and $m = HalfSupport(4)$ (see Accuracy page in the docs). +(oversampling factor) and $m = HalfSupport(4)$ (see [Accuracy](@ref accuracy)). +All tests are run in double precision (`Float64` or `ComplexF64` non-uniform data). The tests were run on a cluster with an AMD EPYC 7302 CPU (32 threads) and an NVIDIA A100 GPU. @@ -42,3 +43,17 @@ and for GPU plans: We also tried `gpu_method = 2` (based on shared memory) but found it to be considerably slower in almost all cases (in three dimensions, at the requested tolerance). + +## Results + +### Complex data + +![](plots/benchmark_ComplexF64_type1.svg) + +![](plots/benchmark_ComplexF64_type2.svg) + +### Real data + +![](plots/benchmark_Float64_type1.svg) + +![](plots/benchmark_Float64_type2.svg) diff --git a/docs/Project.toml b/docs/Project.toml index a1ebc9a..83249d7 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -2,5 +2,6 @@ AbstractFFTs = "621f4979-c628-5d54-868e-fcf4e3e8185c" CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0" Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" +DocumenterCitations = "daee34ce-89f3-4625-b898-19384cb65244" NonuniformFFTs = "cd96f58b-6017-4a02-bb9e-f4d81626177f" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" diff --git a/docs/make.jl b/docs/make.jl index 5d78fd7..1b87236 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -1,17 +1,36 @@ using Documenter +using DocumenterCitations using NonuniformFFTs +# Copy benchmark results to docs/src/benchmarks/ +dstdir = joinpath(@__DIR__, "src", "benchmarks") +srcdir = joinpath(@__DIR__, "..", "benchmarks", "plots") +@assert isdir(dstdir) +for fname ∈ readdir(srcdir) + endswith(".svg")(fname) || continue + cp(joinpath(srcdir, fname), joinpath(dstdir, fname); force = true) +end + +# Bibliography +bib = CitationBibliography(joinpath(@__DIR__, "src", "refs.bib"); style = :authoryear) + makedocs(; sitename = "NonuniformFFTs", format = Documenter.HTML( prettyurls = get(ENV, "CI", nothing) == "true", + assets = [ + "assets/citations.css", + "assets/benchmarks.css", + ], ), modules = [NonuniformFFTs], pages = [ "index.md", "accuracy.md", + "benchmarks.md", "API.md", ], + plugins = [bib], warnonly = [:missing_docs], # TODO fix this? ) diff --git a/docs/src/accuracy.md b/docs/src/accuracy.md index 258d422..9af1793 100644 --- a/docs/src/accuracy.md +++ b/docs/src/accuracy.md @@ -1,4 +1,4 @@ -# Accuracy +# [Accuracy](@id accuracy) Here we document the accuracy of the NUFFTs implemented in this package, and how it varies as a function of the kernel half-width ``M``, the oversampling factor ``σ`` and the choice diff --git a/docs/src/assets/benchmarks.css b/docs/src/assets/benchmarks.css new file mode 100644 index 0000000..d149a59 --- /dev/null +++ b/docs/src/assets/benchmarks.css @@ -0,0 +1,9 @@ +.NonuniformFFTs { + color: rgb(0.0% 44.705883% 69.803923%); /* Makie blue - (0.0f0, 0.44705883f0, 0.69803923f0) */ + font-weight: bold; +} + +.FINUFFT { + color: rgb(90.19608% 62.352943% 0.0%); /* Makie orange - 0.9019608f0, 0.62352943f0, 0.0f0 */ + font-weight: bold; +} diff --git a/docs/src/assets/citations.css b/docs/src/assets/citations.css new file mode 100644 index 0000000..39eba03 --- /dev/null +++ b/docs/src/assets/citations.css @@ -0,0 +1,18 @@ +.citation dl { + display: grid; + grid-template-columns: max-content auto; } +.citation dt { + grid-column-start: 1; } +.citation dd { + grid-column-start: 2; + margin-bottom: 0.75em; } +.citation ul { + padding: 0 0 2.25em 0; + margin: 0; + list-style: none !important;} +.citation ul li { + text-indent: -2.25em; + margin: 0.33em 0.5em 0.5em 2.25em;} +.citation ol li { + padding-left:0.75em;} + diff --git a/docs/src/benchmarks.md b/docs/src/benchmarks.md new file mode 100644 index 0000000..f5ed214 --- /dev/null +++ b/docs/src/benchmarks.md @@ -0,0 +1,115 @@ +# Benchmarks + +```@contents +Pages = ["benchmarks.md"] +Depth = 2:3 +``` + +## Introduction + +The benchmarks consist in type-1 and type-2 NUFFTs on a uniform 3D grid of +fixed dimensions $M^3 = 256^3$ (excluding oversampling). We vary the number of +non-uniform points $N$, so that the point density $ρ = N / M^3$ takes values +between $10^{-4}$ (very few points) and $10^1$ (very dense). +Points are randomly located in $[0, 2π)^3$ using a uniform distribution. +The relative tolerance is fixed to $10^{-6}$. +In NonuniformFFTs.jl, this can be achieved with the parameters `σ = 1.5` +(oversampling factor) and `m = HalfSupport(4)` (see [Accuracy](@ref accuracy)). +All tests are run in double precision (`Float64` or `ComplexF64` non-uniform data). + +The tests were run on a cluster with an AMD EPYC 7302 CPU (32 threads) and an +NVIDIA A100 GPU. + +The benchmarks compare NonuniformFFTs.jl v0.6.7 (26/11/2024) and FINUFFT v2.3.1 +(see [FINUFFT set-up](@ref) for details). + +Each reported time includes (1) the time spent processing non-uniform points +(`set_points!` / `(cu)finufft_setpts!`) and (2) the time spent on the actual transform (`exec_type{1,2}!` / `(cu)finufft_exec!`). + +## [Complex non-uniform data](@id benchmarks-complex) + +```@raw html +

+Libraries like FINUFFT or NFFT.jl only support complex non-uniform data. +Therefore, these tests provide a direct comparison of the performance of different libraries. +On the CPU (crosses), the performance of the multi-threaded NonuniformFFTs.jl (blue) and +FINUFFT (orange) implementations is quite comparable over a wide range of problem sizes. +

+``` + +On the GPU, we test two different implementations which are heavily inspired by the CuFINUFFT paper [Shih2021](@cite). +The default one (**filled circles**) corresponds to setting +`gpu_method = :global_memory` in [`PlanNUFFT`](@ref). +This method is slightly faster than CuFINUFFT at low point densities, but +slightly slower at large ones. +However, at large densities it is actually faster to use the non-default +`gpu_method = :shared_memory` option (**open circles**, labelled "SM" in the figures). + +The `:shared_memory` method performs some operations on GPU [shared +memory](https://developer.nvidia.com/blog/using-shared-memory-cuda-cc/) (also called [local data share](https://rocm.docs.amd.com/projects/HIP/en/latest/understand/hardware_implementation.html#local-data-share)), which is small but much faster than the GPU's global memory. +During spreading (type-1 transforms), this approach allows to reduce the number +of atomic operations performed in global memory. +Our implementation is inspired by the CuFINUFFT one [Shih2021](@cite) with +a few differences. +In particular, we completely avoid atomic operations on shared memory, which +seems to speed up things quite a bit and might explain the important gains +with respect to the CuFINUFFT implementation.[^1] +Another difference is that we also provide a shared-memory implementation of type-2 transforms +(interpolation). +As seen [below](@ref benchmarks-complex-type2), this can enable some minor gains +at large point densities. + +[^1]: In fact the CuFINUFFT shared-memory implementation is surprisingly slow + for the considered problem. It might perform better for two-dimensional or low-accuracy problems. + +### Type-1 transforms + +![Benchmark of type-1 transforms of ComplexF64 data.](benchmarks/benchmark_ComplexF64_type1.svg) + +### [Type-2 transforms](@id benchmarks-complex-type2) + +![Benchmark of type-2 transforms of ComplexF64 data.](benchmarks/benchmark_ComplexF64_type2.svg) + +## [Real non-uniform data](@id benchmarks-real) + +These tests are of interest for applications where **non-uniform data is +real-valued** (imaginary part is zero). +This enables the use of real-to-complex (type-1) and complex-to-real (type-2) +FFTs and also allows to halve the amount of data processed during the spreading +(type-1) and interpolation (type-2) procedures. +The benchmarks showcase the important gains which can be obtained by using real-data +transforms, which are not available in other libraries like FINUFFT or NFFT.jl. + +### Type-1 transforms + +![](benchmarks/benchmark_Float64_type1.svg) + +### Type-2 transforms + +![](benchmarks/benchmark_Float64_type2.svg) + +## FINUFFT set-up + +We used FINUFFT via its Julia wrapper [FINUFFT.jl](https://github.com/ludvigak/FINUFFT.jl) v3.3.0. For +performance reasons, the (Cu)FINUFFT libraries were compiled locally and the +FINUFFT.jl sources were modified accordingly as described +[here](https://github.com/ludvigak/FINUFFT.jl?tab=readme-ov-file#advanced-installation-and-locally-compiling-binaries). +FINUFFT was compiled with GCC 10.2.0 using CMake with its default flags in `Release` mode, which include `-fPIC -funroll-loops -O3 -march=native`. +Moreover, we set `CMAKE_CUDA_ARCHITECTURES=80` (for an NVIDIA A100) and used the `nvcc` compiler included in CUDA 12.3. + +All FINUFFT benchmarks were run with relative tolerance `1e-6`. +Moreover, the following options were used: + +- `modeord = 1` (use FFTW ordering, for consistency with NonuniformFFTs) +- `spread_sort = 1` (enable point sorting in CPU plans) +- `spread_kerevalmeth = 1` (use the recommended piecewise polynomial evaluation) +- `fftw = FFTW.ESTIMATE` (CPU plans) + +and for GPU plans: + +- `gpu_sort = 1` (enable point sorting) +- `gpu_kerevalmeth = 1` (use piecewise polynomial evaluation) +- `gpu_method = 1` (global memory method, "non-uniform points driven") + +We also tried `gpu_method = 2` (open symbols, labelled SM) which seems to be +considerably slower in nearly all cases (in three dimensions, at the requested tolerance). diff --git a/docs/src/benchmarks/.gitignore b/docs/src/benchmarks/.gitignore new file mode 100644 index 0000000..c8d6c16 --- /dev/null +++ b/docs/src/benchmarks/.gitignore @@ -0,0 +1,2 @@ +# SVG files are copied from /benchmarks/plots, so local files shouldn't be added to git. +*.svg diff --git a/docs/src/index.md b/docs/src/index.md index 5ca0d0c..4b3a8df 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -345,3 +345,8 @@ the opposite sign convention is used for Fourier transforms. - A different smoothing kernel function is used (backwards Kaiser–Bessel kernel by default on CPUs; Kaiser–Bessel kernel on GPUs). - It is possible to use the same plan for type-1 and type-2 transforms, reducing memory requirements in cases where one wants to perform both. + +## Bibliography + +```@bibliography +``` diff --git a/docs/src/refs.bib b/docs/src/refs.bib new file mode 100644 index 0000000..47975b8 --- /dev/null +++ b/docs/src/refs.bib @@ -0,0 +1,9 @@ +@misc{Shih2021, + title={cuFINUFFT: a load-balanced GPU library for general-purpose nonuniform FFTs}, + author={Yu-hsuan Shih and Garrett Wright and Joakim Andén and Johannes Blaschke and Alex H. Barnett}, + year={2021}, + eprint={2102.08463}, + archivePrefix={arXiv}, + primaryClass={cs.DC}, + url={https://arxiv.org/abs/2102.08463}, +} From 3634e7e176dfee42fd1dfa67a6ae48f0efe0075f Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Fri, 29 Nov 2024 16:34:59 +0100 Subject: [PATCH 3/4] Update benchmarks/README.md --- benchmarks/README.md | 60 +++----------------------------------------- 1 file changed, 4 insertions(+), 56 deletions(-) diff --git a/benchmarks/README.md b/benchmarks/README.md index 469bc73..fc4a0bc 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -1,59 +1,7 @@ # Benchmarks -The benchmarks consist in type-1 and type-2 NUFFTs on a uniform 3D grid of -fixed dimensions $M^3 = 256^3$ (excluding oversampling). We vary the number of -non-uniform points $N$, so that the point density $ρ = N / M^3$ takes values -between $10^{-4}$ (very few points) and $10^1$ (very dense). -Points are randomly located in $[0, 2π)^3$ using a uniform distribution. -The relative tolerance is fixed to $10^{-6}$. -In NonuniformFFTs.jl, this can be achieved with the parameters `σ = 1.5` -(oversampling factor) and $m = HalfSupport(4)$ (see [Accuracy](@ref accuracy)). -All tests are run in double precision (`Float64` or `ComplexF64` non-uniform data). +This directory contains scripts for executing benchmarks (`run_benchmarks.jl`) +and generated plotting results (`plots/plot_benchmarks.jl`). -The tests were run on a cluster with an AMD EPYC 7302 CPU (32 threads) and an -NVIDIA A100 GPU. - -The benchmarks compare NonuniformFFTs.jl v0.6.7 (26/11/2024) and FINUFFT v2.3.1. - -Each reported time includes (1) the time spent processing non-uniform points -(`set_points!` / `(cu)finufft_setpts!`) and (2) the time spent on the actual transform (`exec_type{1,2}!` / `(cu)finufft_exec!`). - -## FINUFFT set-up - -We used FINUFFT via its Julia wrapper [FINUFFT.jl](https://github.com/ludvigak/FINUFFT.jl) v3.3.0. For -performance reasons, the (Cu)FINUFFT libraries were compiled locally and the -FINUFFT.jl sources were modified accordingly as described -[here](https://github.com/ludvigak/FINUFFT.jl?tab=readme-ov-file#advanced-installation-and-locally-compiling-binaries). -FINUFFT was compiled with GCC 10.2.0 using CMake with its default flags in `Release` mode, which include `-fPIC -funroll-loops -O3 -march=native`. -Moreover, we set `CMAKE_CUDA_ARCHITECTURES=80` (for an NVIDIA A100) and used the `nvcc` compiler included in CUDA 12.3. - -All FINUFFT benchmarks were run with relative tolerance `1e-6`. -Moreover, the following options were used: - -- `modeord = 1` (use FFTW ordering, for consistency with NonuniformFFTs) -- `spread_sort = 1` (enable point sorting in CPU plans) -- `spread_kerevalmeth = 1` (use the recommended piecewise polynomial evaluation) -- `fftw = FFTW.ESTIMATE` (CPU plans) - -and for GPU plans: - -- `gpu_sort = 1` (enable point sorting) -- `gpu_kerevalmeth = 1` (use piecewise polynomial evaluation) -- `gpu_method = 1` (global memory method, non-uniform point driven) - -We also tried `gpu_method = 2` (based on shared memory) but found it to be -considerably slower in almost all cases (in three dimensions, at the requested tolerance). - -## Results - -### Complex data - -![](plots/benchmark_ComplexF64_type1.svg) - -![](plots/benchmark_ComplexF64_type2.svg) - -### Real data - -![](plots/benchmark_Float64_type1.svg) - -![](plots/benchmark_Float64_type2.svg) +It also contains raw benchmark results (in `results`) and their associated +plots (`plots/*.svg`) which are discussed in the Benchmarks section of the docs. From bf79dea3a70242c3083fe20bb7534f4ddc33b5a2 Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Fri, 29 Nov 2024 16:38:33 +0100 Subject: [PATCH 4/4] Add alt text --- docs/src/benchmarks.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/src/benchmarks.md b/docs/src/benchmarks.md index f5ed214..4b38bcc 100644 --- a/docs/src/benchmarks.md +++ b/docs/src/benchmarks.md @@ -82,11 +82,11 @@ transforms, which are not available in other libraries like FINUFFT or NFFT.jl. ### Type-1 transforms -![](benchmarks/benchmark_Float64_type1.svg) +![Benchmark of type-1 transforms of Float64 data.](benchmarks/benchmark_Float64_type1.svg) ### Type-2 transforms -![](benchmarks/benchmark_Float64_type2.svg) +![Benchmark of type-2 transforms of Float64 data.](benchmarks/benchmark_Float64_type2.svg) ## FINUFFT set-up