diff --git a/benchmarks/README.md b/benchmarks/README.md
index fbf52b8..469bc73 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -1,13 +1,14 @@
 # Benchmarks
 
-The benchmark consists in type-1 and type-2 NUFFTs on a uniform 3D grid of
+The benchmarks consist of type-1 and type-2 NUFFTs on a uniform 3D grid of
 fixed dimensions $M^3 = 256^3$ (excluding oversampling). We vary the number of
 non-uniform points $N$, so that the point density $ρ = N / M^3$ takes values
 between $10^{-4}$ (very few points) and $10^1$ (very dense).
-Points are randomly located in $[0, 2π)^3$.
-Moreover, the relative tolerance is fixed to $10^{-6}$.
+Points are randomly located in $[0, 2π)^3$ using a uniform distribution.
+The relative tolerance is fixed to $10^{-6}$.
 In NonuniformFFTs.jl, this can be achieved with the parameters `σ = 1.5`
-(oversampling factor) and $m = HalfSupport(4)$ (see Accuracy page in the docs).
+(oversampling factor) and `m = HalfSupport(4)` (see [Accuracy](@ref accuracy)).
+All tests are run in double precision (`Float64` or `ComplexF64` non-uniform data).
 
 The tests were run on a cluster with an AMD EPYC 7302 CPU (32 threads) and an
 NVIDIA A100 GPU.
@@ -42,3 +43,17 @@ and for GPU plans:
 We also tried `gpu_method = 2` (based on shared memory) but found it to be
 considerably slower in almost all cases (in three dimensions, at the requested
 tolerance).
+
+## Results
+
+### Complex data
+
+![](plots/benchmark_ComplexF64_type1.svg)
+
+![](plots/benchmark_ComplexF64_type2.svg)
+
+### Real data
+
+![](plots/benchmark_Float64_type1.svg)
+
+![](plots/benchmark_Float64_type2.svg)
diff --git a/docs/Project.toml b/docs/Project.toml
index a1ebc9a..83249d7 100644
--- a/docs/Project.toml
+++ b/docs/Project.toml
@@ -2,5 +2,6 @@
 AbstractFFTs = "621f4979-c628-5d54-868e-fcf4e3e8185c"
 CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0"
 Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
+DocumenterCitations = "daee34ce-89f3-4625-b898-19384cb65244"
 NonuniformFFTs = "cd96f58b-6017-4a02-bb9e-f4d81626177f"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
diff --git a/docs/make.jl b/docs/make.jl
index 5d78fd7..1b87236 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -1,17 +1,36 @@
 using Documenter
+using DocumenterCitations
 using NonuniformFFTs
 
+# Copy benchmark results to docs/src/benchmarks/
+dstdir = joinpath(@__DIR__, "src", "benchmarks")
+srcdir = joinpath(@__DIR__, "..", "benchmarks", "plots")
+@assert isdir(dstdir)
+for fname ∈ readdir(srcdir)
+    endswith(".svg")(fname) || continue
+    cp(joinpath(srcdir, fname), joinpath(dstdir, fname); force = true)
+end
+
+# Bibliography
+bib = CitationBibliography(joinpath(@__DIR__, "src", "refs.bib"); style = :authoryear)
+
 makedocs(;
     sitename = "NonuniformFFTs",
     format = Documenter.HTML(
         prettyurls = get(ENV, "CI", nothing) == "true",
+        assets = [
+            "assets/citations.css",
+            "assets/benchmarks.css",
+        ],
     ),
     modules = [NonuniformFFTs],
     pages = [
         "index.md",
         "accuracy.md",
+        "benchmarks.md",
         "API.md",
     ],
+    plugins = [bib],
     warnonly = [:missing_docs],  # TODO fix this?
 )
 
diff --git a/docs/src/accuracy.md b/docs/src/accuracy.md
index 258d422..9af1793 100644
--- a/docs/src/accuracy.md
+++ b/docs/src/accuracy.md
@@ -1,4 +1,4 @@
-# Accuracy
+# [Accuracy](@id accuracy)
 
 Here we document the accuracy of the NUFFTs implemented in this package, and how it varies
 as a function of the kernel half-width ``M``, the oversampling factor ``σ`` and the choice
diff --git a/docs/src/assets/benchmarks.css b/docs/src/assets/benchmarks.css
new file mode 100644
index 0000000..d149a59
--- /dev/null
+++ b/docs/src/assets/benchmarks.css
@@ -0,0 +1,9 @@
+.NonuniformFFTs {
+    color: rgb(0.0% 44.705883% 69.803923%); /* Makie blue - (0.0f0, 0.44705883f0, 0.69803923f0) */
+    font-weight: bold;
+}
+
+.FINUFFT {
+    color: rgb(90.19608% 62.352943% 0.0%); /* Makie orange - 0.9019608f0, 0.62352943f0, 0.0f0 */
+    font-weight: bold;
+}
diff --git a/docs/src/assets/citations.css b/docs/src/assets/citations.css
new file mode 100644
index 0000000..39eba03
--- /dev/null
+++ b/docs/src/assets/citations.css
@@ -0,0 +1,18 @@
+.citation dl {
+    display: grid;
+    grid-template-columns: max-content auto; }
+.citation dt {
+    grid-column-start: 1; }
+.citation dd {
+    grid-column-start: 2;
+    margin-bottom: 0.75em; }
+.citation ul {
+    padding: 0 0 2.25em 0;
+    margin: 0;
+    list-style: none !important;}
+.citation ul li {
+    text-indent: -2.25em;
+    margin: 0.33em 0.5em 0.5em 2.25em;}
+.citation ol li {
+    padding-left: 0.75em;}
+
diff --git a/docs/src/benchmarks.md b/docs/src/benchmarks.md
new file mode 100644
index 0000000..f5ed214
--- /dev/null
+++ b/docs/src/benchmarks.md
@@ -0,0 +1,115 @@
+# Benchmarks
+
+```@contents
+Pages = ["benchmarks.md"]
+Depth = 2:3
+```
+
+## Introduction
+
+The benchmarks consist of type-1 and type-2 NUFFTs on a uniform 3D grid of
+fixed dimensions $M^3 = 256^3$ (excluding oversampling). We vary the number of
+non-uniform points $N$, so that the point density $ρ = N / M^3$ takes values
+between $10^{-4}$ (very few points) and $10^1$ (very dense).
+Points are randomly located in $[0, 2π)^3$ using a uniform distribution.
+The relative tolerance is fixed to $10^{-6}$.
+In NonuniformFFTs.jl, this can be achieved with the parameters `σ = 1.5`
+(oversampling factor) and `m = HalfSupport(4)` (see [Accuracy](@ref accuracy)).
+All tests are run in double precision (`Float64` or `ComplexF64` non-uniform data).
+
+The tests were run on a cluster with an AMD EPYC 7302 CPU (32 threads) and an
+NVIDIA A100 GPU.
+
+The benchmarks compare NonuniformFFTs.jl v0.6.7 (26/11/2024) and FINUFFT v2.3.1
+(see [FINUFFT set-up](@ref) for details).
+
+Each reported time includes (1) the time spent processing non-uniform points
+(`set_points!` / `(cu)finufft_setpts!`) and (2) the time spent on the actual transform (`exec_type{1,2}!` / `(cu)finufft_exec!`).
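+
+For reference, a single CPU measurement corresponds roughly to the following
+sketch. This is an illustration rather than the actual benchmark script; in
+particular, the vector-of-tuples point format and the `size(plan)`-shaped output
+array follow the package README and are assumptions here.
+
+```julia
+using NonuniformFFTs
+
+M = 256                      # number of Fourier modes per direction
+ρ = 1.0                      # example point density
+N = round(Int, ρ * M^3)      # number of non-uniform points
+
+xp = [ntuple(_ -> 2π * rand(), 3) for _ ∈ 1:N]  # uniform random points in [0, 2π)³
+vp = randn(ComplexF64, N)                       # non-uniform values (type-1 input)
+
+plan = PlanNUFFT(ComplexF64, (M, M, M); m = HalfSupport(4), σ = 1.5)
+ûs = Array{ComplexF64}(undef, size(plan))       # Fourier coefficients (type-1 output)
+
+# The reported time is the sum of both steps:
+t_points = @elapsed set_points!(plan, xp)
+t_exec   = @elapsed exec_type1!(ûs, plan, vp)
+```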
+
+## [Complex non-uniform data](@id benchmarks-complex)
+
+```@raw html
+<p>
+Libraries like <span class="FINUFFT">FINUFFT</span> or NFFT.jl only support complex non-uniform data.
+Therefore, these tests provide a direct comparison of the performance of different libraries.
+On the CPU (crosses), the performance of the multi-threaded <span class="NonuniformFFTs">NonuniformFFTs.jl</span> (blue) and
+<span class="FINUFFT">FINUFFT</span> (orange) implementations is quite comparable over a wide range of problem sizes.
+</p>
+```
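+
+For reference, the two GPU methods discussed below are selected when creating
+the plan. The following sketch assumes a CUDA device chosen through a `backend`
+argument as in the package README; it is not taken from the benchmark scripts.
+
+```julia
+using NonuniformFFTs
+using CUDA  # assumed GPU backend for this sketch
+
+dims = (256, 256, 256)
+
+# Default GPU method (atomic updates in global memory):
+p_global = PlanNUFFT(ComplexF64, dims; m = HalfSupport(4), σ = 1.5,
+                     backend = CUDABackend(), gpu_method = :global_memory)
+
+# Shared-memory variant (labelled "SM" in the figures below):
+p_shared = PlanNUFFT(ComplexF64, dims; m = HalfSupport(4), σ = 1.5,
+                     backend = CUDABackend(), gpu_method = :shared_memory)
+```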
+
+On the GPU, we test two different implementations, which are heavily inspired by the CuFINUFFT paper [Shih2021](@cite).
+The default one (**filled circles**) corresponds to setting
+`gpu_method = :global_memory` in [`PlanNUFFT`](@ref).
+This method is slightly faster than CuFINUFFT at low point densities, but
+slightly slower at large ones.
+However, at large densities it is actually faster to use the non-default
+`gpu_method = :shared_memory` option (**open circles**, labelled "SM" in the figures).
+
+The `:shared_memory` method performs some operations on GPU [shared
+memory](https://developer.nvidia.com/blog/using-shared-memory-cuda-cc/) (also called [local data share](https://rocm.docs.amd.com/projects/HIP/en/latest/understand/hardware_implementation.html#local-data-share)), which is small but much faster than the GPU's global memory.
+During spreading (type-1 transforms), this approach allows us to reduce the number
+of atomic operations performed in global memory.
+Our implementation is inspired by the CuFINUFFT one [Shih2021](@cite), with
+a few differences.
+In particular, we completely avoid atomic operations on shared memory, which
+seems to speed things up quite a bit and might explain the significant gains
+with respect to the CuFINUFFT implementation.[^1]
+Another difference is that we also provide a shared-memory implementation of type-2 transforms
+(interpolation).
+As seen [below](@ref benchmarks-complex-type2), this can enable some minor gains
+at large point densities.
+
+[^1]: In fact, the CuFINUFFT shared-memory implementation is surprisingly slow
+    for the considered problem. It might perform better for two-dimensional or low-accuracy problems.
+
+### Type-1 transforms
+
+![Benchmark of type-1 transforms of ComplexF64 data.](benchmarks/benchmark_ComplexF64_type1.svg)
+
+### [Type-2 transforms](@id benchmarks-complex-type2)
+
+![Benchmark of type-2 transforms of ComplexF64 data.](benchmarks/benchmark_ComplexF64_type2.svg)
+
+## [Real non-uniform data](@id benchmarks-real)
+
+These tests are of interest for applications where **non-uniform data is
+real-valued** (imaginary part is zero).
+This enables the use of real-to-complex (type-1) and complex-to-real (type-2)
+FFTs, and allows halving the amount of data processed during the spreading
+(type-1) and interpolation (type-2) procedures.
+The benchmarks showcase the significant gains that can be obtained by using real-data
+transforms, which are not available in other libraries like FINUFFT or NFFT.jl.
+
+### Type-1 transforms
+
+![Benchmark of type-1 transforms of Float64 data.](benchmarks/benchmark_Float64_type1.svg)
+
+### Type-2 transforms
+
+![Benchmark of type-2 transforms of Float64 data.](benchmarks/benchmark_Float64_type2.svg)
+
+## FINUFFT set-up
+
+We used FINUFFT via its Julia wrapper [FINUFFT.jl](https://github.com/ludvigak/FINUFFT.jl) v3.3.0. For
+performance reasons, the (Cu)FINUFFT libraries were compiled locally and the
+FINUFFT.jl sources were modified accordingly, as described
+[here](https://github.com/ludvigak/FINUFFT.jl?tab=readme-ov-file#advanced-installation-and-locally-compiling-binaries).
+FINUFFT was compiled with GCC 10.2.0 using CMake with its default flags in `Release` mode, which include `-fPIC -funroll-loops -O3 -march=native`.
+Moreover, we set `CMAKE_CUDA_ARCHITECTURES=80` (for an NVIDIA A100) and used the `nvcc` compiler included in CUDA 12.3.
+
+All FINUFFT benchmarks were run with relative tolerance `1e-6`.
+In addition, the following options were used:
+
+- `modeord = 1` (use FFTW ordering, for consistency with NonuniformFFTs)
+- `spread_sort = 1` (enable point sorting in CPU plans)
+- `spread_kerevalmeth = 1` (use the recommended piecewise polynomial evaluation)
+- `fftw = FFTW.ESTIMATE` (CPU plans)
+
+and for GPU plans:
+
+- `gpu_sort = 1` (enable point sorting)
+- `gpu_kerevalmeth = 1` (use piecewise polynomial evaluation)
+- `gpu_method = 1` (global memory method, "non-uniform points driven")
+
+We also tried `gpu_method = 2` (open symbols, labelled "SM"), which seems to be
+considerably slower in nearly all cases (in three dimensions, at the requested tolerance).
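+
+Putting these settings together, a CPU plan with the options above might be
+created through the FINUFFT.jl plan interface roughly as follows. This is a
+simplified sketch rather than the actual benchmark script; argument order and
+option handling should be checked against the FINUFFT.jl documentation.
+
+```julia
+using FINUFFT
+using FFTW
+
+M = 256             # number of Fourier modes per direction
+N = 1_000_000       # example number of non-uniform points
+tol = 1e-6          # relative tolerance
+iflag = -1          # sign of the exponent (±1)
+
+# Type-1 plan with the CPU options listed above:
+plan = finufft_makeplan(1, [M, M, M], iflag, 1, tol;
+                        modeord = 1, spread_sort = 1,
+                        spread_kerevalmeth = 1, fftw = FFTW.ESTIMATE)
+
+x, y, z = 2π .* rand(N), 2π .* rand(N), 2π .* rand(N)  # non-uniform points
+vp = randn(ComplexF64, N)                              # values at the points
+ûs = Array{ComplexF64}(undef, M, M, M)                 # output Fourier modes
+
+finufft_setpts!(plan, x, y, z)   # timed as "point processing"
+finufft_exec!(plan, vp, ûs)      # timed as "transform execution"
+finufft_destroy!(plan)
+```
+
+GPU plans can presumably be set up analogously through the corresponding
+`cufinufft_*` functions, passing the `gpu_*` options listed above.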
diff --git a/docs/src/benchmarks/.gitignore b/docs/src/benchmarks/.gitignore
new file mode 100644
index 0000000..c8d6c16
--- /dev/null
+++ b/docs/src/benchmarks/.gitignore
@@ -0,0 +1,2 @@
+# SVG files are copied from /benchmarks/plots, so local files shouldn't be added to git.
+*.svg
diff --git a/docs/src/index.md b/docs/src/index.md
index 5ca0d0c..4b3a8df 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -345,3 +345,8 @@ the opposite sign convention is used for Fourier transforms.
 - A different smoothing kernel function is used (backwards Kaiser–Bessel kernel by default on CPUs; Kaiser–Bessel kernel on GPUs).
 - It is possible to use the same plan for type-1 and type-2 transforms, reducing memory
   requirements in cases where one wants to perform both.
+
+## Bibliography
+
+```@bibliography
+```
diff --git a/docs/src/refs.bib b/docs/src/refs.bib
new file mode 100644
index 0000000..47975b8
--- /dev/null
+++ b/docs/src/refs.bib
@@ -0,0 +1,9 @@
+@misc{Shih2021,
+    title={cuFINUFFT: a load-balanced GPU library for general-purpose nonuniform FFTs},
+    author={Yu-hsuan Shih and Garrett Wright and Joakim Andén and Johannes Blaschke and Alex H. Barnett},
+    year={2021},
+    eprint={2102.08463},
+    archivePrefix={arXiv},
+    primaryClass={cs.DC},
+    url={https://arxiv.org/abs/2102.08463},
+}