From c2efc935fb64175a6e03c2e864551055c6ed1de3 Mon Sep 17 00:00:00 2001
From: Juan Ignacio Polanco <juan-ignacio.polanco@cnrs.fr>
Date: Tue, 29 Oct 2024 14:16:44 +0100
Subject: [PATCH] Fix type-2 GPU shared memory with ntransforms > 1 (#43)

* Fix type-2 GPU shared memory with ntransforms > 1

Fix synchronisation issue in type-2 NUFFTs on GPU when
`gpu_method = :shared_memory` and when multiple transforms are performed
at once.

* Allow testing on actual GPUs

* Disable tests on Julia 'pre' (= 1.11 currently)
---
 .github/workflows/ci.yml | 10 +++----
 src/interpolation/gpu.jl |  4 +++
 test/Project.toml        |  1 +
 test/pseudo_gpu.jl       | 63 +++++++++++++++++++++++++---------------
 4 files changed, 49 insertions(+), 29 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index fc95d16..4969150 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -28,11 +28,11 @@ jobs:
           - ubuntu-latest
         arch:
           - x64
-        include:
-          - version: 'pre'
-            os: ubuntu-latest
-            experimental: true
-            arch: x64
+        # include:
+        #   - version: 'pre'
+        #     os: ubuntu-latest
+        #     experimental: true
+        #     arch: x64
     steps:
       - uses: actions/checkout@v4
       - uses: julia-actions/setup-julia@v2
diff --git a/src/interpolation/gpu.jl b/src/interpolation/gpu.jl
index ad6712a..fdb1562 100644
--- a/src/interpolation/gpu.jl
+++ b/src/interpolation/gpu.jl
@@ -273,6 +273,10 @@ end
             v = interpolate_from_arrays_shmem(u_local, inds_start, window_vals, prefactor)
             @inbounds vp[c][j] = v
         end
+
+        # Avoid copying data to u_local too early in the next iteration (c -> c + 1).
+        # This is mostly useful when c < C (but putting an `if` fails...).
+        @synchronize
     end
 
     nothing
diff --git a/test/Project.toml b/test/Project.toml
index 7a1a8c0..5ac49de 100644
--- a/test/Project.toml
+++ b/test/Project.toml
@@ -8,6 +8,7 @@ JET = "c3a54625-cd67-489e-a8e7-0a5a0ff4e31b"
 KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 NFFT = "efe261a4-0d2b-5849-be55-fc731d526b0d"
+Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
diff --git a/test/pseudo_gpu.jl b/test/pseudo_gpu.jl
index 24c21b4..64684dd 100644
--- a/test/pseudo_gpu.jl
+++ b/test/pseudo_gpu.jl
@@ -1,10 +1,19 @@
-# Test GPU code using CPU arrays. Everything is run on the CPU.
+# Test GPU code using CPU arrays. By default everything is run on the CPU.
 #
 # We define a minimal custom array type, so that it runs the kernels the GPU would run
 # instead of using the alternative CPU branches.
 # We also tried to use JLArrays instead, but we need scalar indexing which is disallowed by
 # JLArray (since it's supposed to mimic GPU arrays). Even with allowscalar(true), kernels
 # seem to fail for other reasons.
+#
+# To test an actual GPU backend, set the environment variable JULIA_GPU_BACKEND before
+# launching this script. Possible values are:
+#  - PseudoGPU (default)
+#  - CUDA
+#  - AMDGPU
+# The required packages (e.g. CUDA.jl) will automatically be installed in the current
+# environment.
+
 
 # TODO:
 # - use new JLBackend in the latest GPUArrays
@@ -18,6 +27,10 @@ using GPUArraysCore: AbstractGPUArray
 using Random
 using Test
 
+# Select the backend to test from the JULIA_GPU_BACKEND environment variable. Packages
+# needed by real GPU backends (e.g. CUDA.jl) are installed on demand via Pkg.add.
+const GPU_BACKEND = get(ENV, "JULIA_GPU_BACKEND", "PseudoGPU")
+
 # ================================================================================ #
 
 # Definition of custom "GPU" array type and custom KA backend.
@@ -51,6 +64,25 @@ end
 
 # ================================================================================ #
 
+@static if GPU_BACKEND == "PseudoGPU"  # default: CPU-backed stand-in defined in this file
+    const GPUBackend = PseudoGPU
+    const GPUArrayType = PseudoGPUArray
+elseif GPU_BACKEND == "CUDA"
+    using Pkg; Pkg.add("CUDA")  # adds CUDA.jl to the active environment on every run
+    using CUDA
+    const GPUBackend = CUDABackend
+    const GPUArrayType = CuArray
+elseif GPU_BACKEND == "AMDGPU"
+    using Pkg; Pkg.add("AMDGPU")  # adds AMDGPU.jl to the active environment on every run
+    using AMDGPU
+    const GPUBackend = ROCBackend
+    const GPUArrayType = ROCArray
+else
+    error("unknown value of JULIA_GPU_BACKEND: $GPU_BACKEND")  # reject unsupported names
+end
+
+@info "GPU tests - using:" GPU_BACKEND GPUBackend GPUArrayType
+
 function run_plan(p::PlanNUFFT, xp_init::AbstractArray, vp_init::NTuple{Nc, AbstractVector}) where {Nc}
     (; backend,) = p
 
@@ -59,23 +91,6 @@ function run_plan(p::PlanNUFFT, xp_init::AbstractArray, vp_init::NTuple{Nc, Abst
 
     set_points!(p, xp)
 
-    save_points_sorted = false # this can be useful for verifying spatial sorting graphically
-
-    if backend isa PseudoGPU && save_points_sorted
-        inds = NonuniformFFTs.get_pointperm(p.blocks)
-        if inds !== nothing
-            open("points_sorted.dat", "w") do io
-                for i ∈ inds
-                    x⃗ = xp[i]
-                    for x ∈ x⃗
-                        print(io, "\t", x)
-                    end
-                    print(io, "\n")
-                end
-            end
-        end
-    end
-
     T = eltype(p)  # type in Fourier space (always complex) - for compatibility with AbstractNFFTs plans
     @test T <: Complex
     dims = size(p)
@@ -102,17 +117,17 @@ function compare_with_cpu(::Type{T}, dims; Np = prod(dims), ntransforms::Val{Nc}
     xp_init = [rand(rng, SVector{N, Tr}) * Tr(2π) for _ ∈ 1:Np]  # non-uniform points in [0, 2π]ᵈ
     vp_init = ntuple(_ -> randn(rng, T, Np), ntransforms)
 
-    @inferred test_inference_block_dims_shmem(PseudoGPU(), T, dims, HalfSupport(4))
+    @inferred test_inference_block_dims_shmem(GPUBackend(), T, dims, HalfSupport(4))
 
     params = (; m = HalfSupport(4), kernel = KaiserBesselKernel(), σ = 1.5, ntransforms, kws...)
     p_cpu = @inferred PlanNUFFT(T, dims; params..., backend = CPU())
-    p_gpu = @inferred PlanNUFFT(T, dims; params..., backend = PseudoGPU())
+    p_gpu = @inferred PlanNUFFT(T, dims; params..., backend = GPUBackend())
 
     # Test that plan_nfft interface works.
     @testset "AbstractNFFTs.plan_nfft" begin
         xmat = reinterpret(reshape, Tr, xp_init)
-        p_nfft = @inferred AbstractNFFTs.plan_nfft(PseudoGPUArray, xmat, dims)
-        @test p_nfft.p.backend isa PseudoGPU
+        p_nfft = @inferred AbstractNFFTs.plan_nfft(GPUArrayType, xmat, dims)
+        @test p_nfft.p.backend isa GPUBackend
         # Test without the initial argument (type of array)
         p_nfft_cpu = @inferred AbstractNFFTs.plan_nfft(xmat, dims)
         @test p_nfft_cpu.p.backend isa CPU
@@ -127,8 +142,8 @@ function compare_with_cpu(::Type{T}, dims; Np = prod(dims), ntransforms::Val{Nc}
         # approximation of the KB kernel, while the GPU evaluates it "exactly" from its
         # definition (based on Bessel functions for KB).
         rtol = Tr === Float64 ? 1e-7 : Tr === Float32 ? 1f-5 : nothing
-        @test r_cpu.us[c] ≈ r_gpu.us[c] rtol=rtol  # output of type-1 transform
-        @test r_cpu.wp[c] ≈ r_gpu.wp[c] rtol=rtol  # output of type-2 transform
+        @test r_cpu.us[c] ≈ Array(r_gpu.us[c]) rtol=rtol  # output of type-1 transform
+        @test r_cpu.wp[c] ≈ Array(r_gpu.wp[c]) rtol=rtol  # output of type-2 transform
     end
 
     nothing