From c2efc935fb64175a6e03c2e864551055c6ed1de3 Mon Sep 17 00:00:00 2001 From: Juan Ignacio Polanco Date: Tue, 29 Oct 2024 14:16:44 +0100 Subject: [PATCH] Fix type-2 GPU shared memory with ntransforms > 1 (#43) * Fix type-2 GPU shared memory with ntransforms > 1 Fix synchronisation issue in type-2 NUFFTs on GPU when `gpu_method = :shared_memory` and when multiple transforms are performed at once. * Allow testing on actual GPUs * Disable tests on Julia 'pre' (= 1.11 currently) --- .github/workflows/ci.yml | 10 +++---- src/interpolation/gpu.jl | 4 +++ test/Project.toml | 1 + test/pseudo_gpu.jl | 63 +++++++++++++++++++++++++--------------- 4 files changed, 49 insertions(+), 29 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index fc95d16..4969150 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -28,11 +28,11 @@ jobs: - ubuntu-latest arch: - x64 - include: - - version: 'pre' - os: ubuntu-latest - experimental: true - arch: x64 + # include: + # - version: 'pre' + # os: ubuntu-latest + # experimental: true + # arch: x64 steps: - uses: actions/checkout@v4 - uses: julia-actions/setup-julia@v2 diff --git a/src/interpolation/gpu.jl b/src/interpolation/gpu.jl index ad6712a..fdb1562 100644 --- a/src/interpolation/gpu.jl +++ b/src/interpolation/gpu.jl @@ -273,6 +273,10 @@ end v = interpolate_from_arrays_shmem(u_local, inds_start, window_vals, prefactor) @inbounds vp[c][j] = v end + + # Avoid copying data to u_local too early in the next iteration (c -> c + 1). + # This is mostly useful when c < C (but putting an `if` fails...). + @synchronize end nothing diff --git a/test/Project.toml b/test/Project.toml index 7a1a8c0..5ac49de 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -8,6 +8,7 @@ JET = "c3a54625-cd67-489e-a8e7-0a5a0ff4e31b" KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" NFFT = "efe261a4-0d2b-5849-be55-fc731d526b0d" +Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/test/pseudo_gpu.jl b/test/pseudo_gpu.jl index 24c21b4..64684dd 100644 --- a/test/pseudo_gpu.jl +++ b/test/pseudo_gpu.jl @@ -1,10 +1,19 @@ -# Test GPU code using CPU arrays. Everything is run on the CPU. +# Test GPU code using CPU arrays. By default everything is run on the CPU. # # We define a minimal custom array type, so that it runs the kernels the GPU would run # instead of using the alternative CPU branches. # We also tried to use JLArrays instead, but we need scalar indexing which is disallowed by # JLArray (since it's supposed to mimic GPU arrays). Even with allowscalar(true), kernels # seem to fail for other reasons. +# +# To test an actual GPU backend, set the environment variable JULIA_GPU_BACKEND before +# launching this script. Possible values are: +# - PseudoGPU (default) +# - CUDA +# - AMDGPU +# The required packages (e.g. CUDA.jl) will automatically be installed in the current +# environment. + # TODO: # - use new JLBackend in the latest GPUArrays @@ -18,6 +27,10 @@ using GPUArraysCore: AbstractGPUArray using Random using Test +# Allow testing on actual GPU arrays if the right environment variable is passed (and the +# right package is installed). +const GPU_BACKEND = get(ENV, "JULIA_GPU_BACKEND", "PseudoGPU") + # ================================================================================ # # Definition of custom "GPU" array type and custom KA backend. @@ -51,6 +64,25 @@ end # ================================================================================ # +@static if GPU_BACKEND == "PseudoGPU" + const GPUBackend = PseudoGPU + const GPUArrayType = PseudoGPUArray +elseif GPU_BACKEND == "CUDA" + using Pkg; Pkg.add("CUDA") + using CUDA + const GPUBackend = CUDABackend + const GPUArrayType = CuArray +elseif GPU_BACKEND == "AMDGPU" + using Pkg; Pkg.add("AMDGPU") + using AMDGPU + const GPUBackend = ROCBackend + const GPUArrayType = ROCArray +else + error("unknown value of JULIA_GPU_BACKEND: $GPU_BACKEND") +end + +@info "GPU tests - using:" GPU_BACKEND GPUBackend GPUArrayType + function run_plan(p::PlanNUFFT, xp_init::AbstractArray, vp_init::NTuple{Nc, AbstractVector}) where {Nc} (; backend,) = p @@ -59,23 +91,6 @@ function run_plan(p::PlanNUFFT, xp_init::AbstractArray, vp_init::NTuple{Nc, Abst set_points!(p, xp) - save_points_sorted = false # this can be useful for verifying spatial sorting graphically - - if backend isa PseudoGPU && save_points_sorted - inds = NonuniformFFTs.get_pointperm(p.blocks) - if inds !== nothing - open("points_sorted.dat", "w") do io - for i ∈ inds - x⃗ = xp[i] - for x ∈ x⃗ - print(io, "\t", x) - end - print(io, "\n") - end - end - end - end - T = eltype(p) # type in Fourier space (always complex) - for compatibility with AbstractNFFTs plans @test T <: Complex dims = size(p) @@ -102,17 +117,17 @@ function compare_with_cpu(::Type{T}, dims; Np = prod(dims), ntransforms::Val{Nc} xp_init = [rand(rng, SVector{N, Tr}) * Tr(2π) for _ ∈ 1:Np] # non-uniform points in [0, 2π]ᵈ vp_init = ntuple(_ -> randn(rng, T, Np), ntransforms) - @inferred test_inference_block_dims_shmem(PseudoGPU(), T, dims, HalfSupport(4)) + @inferred test_inference_block_dims_shmem(GPUBackend(), T, dims, HalfSupport(4)) params = (; m = HalfSupport(4), kernel = KaiserBesselKernel(), σ = 1.5, ntransforms, kws...) p_cpu = @inferred PlanNUFFT(T, dims; params..., backend = CPU()) - p_gpu = @inferred PlanNUFFT(T, dims; params..., backend = PseudoGPU()) + p_gpu = @inferred PlanNUFFT(T, dims; params..., backend = GPUBackend()) # Test that plan_nfft interface works. @testset "AbstractNFFTs.plan_nfft" begin xmat = reinterpret(reshape, Tr, xp_init) - p_nfft = @inferred AbstractNFFTs.plan_nfft(PseudoGPUArray, xmat, dims) - @test p_nfft.p.backend isa PseudoGPU + p_nfft = @inferred AbstractNFFTs.plan_nfft(GPUArrayType, xmat, dims) + @test p_nfft.p.backend isa GPUBackend # Test without the initial argument (type of array) p_nfft_cpu = @inferred AbstractNFFTs.plan_nfft(xmat, dims) @test p_nfft_cpu.p.backend isa CPU @@ -127,8 +142,8 @@ function compare_with_cpu(::Type{T}, dims; Np = prod(dims), ntransforms::Val{Nc} # approximation of the KB kernel, while the GPU evaluates it "exactly" from its # definition (based on Bessel functions for KB). rtol = Tr === Float64 ? 1e-7 : Tr === Float32 ? 1f-5 : nothing - @test r_cpu.us[c] ≈ r_gpu.us[c] rtol=rtol # output of type-1 transform - @test r_cpu.wp[c] ≈ r_gpu.wp[c] rtol=rtol # output of type-2 transform + @test r_cpu.us[c] ≈ Array(r_gpu.us[c]) rtol=rtol # output of type-1 transform + @test r_cpu.wp[c] ≈ Array(r_gpu.wp[c]) rtol=rtol # output of type-2 transform end nothing