Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix type-2 GPU shared memory with ntransforms > 1 #43

Merged
merged 3 commits into from
Oct 29, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
@@ -28,11 +28,11 @@ jobs:
- ubuntu-latest
arch:
- x64
include:
- version: 'pre'
os: ubuntu-latest
experimental: true
arch: x64
# include:
# - version: 'pre'
# os: ubuntu-latest
# experimental: true
# arch: x64
steps:
- uses: actions/checkout@v4
- uses: julia-actions/setup-julia@v2
4 changes: 4 additions & 0 deletions src/interpolation/gpu.jl
Original file line number Diff line number Diff line change
@@ -273,6 +273,10 @@
v = interpolate_from_arrays_shmem(u_local, inds_start, window_vals, prefactor)
@inbounds vp[c][j] = v
end

# Avoid copying data to u_local too early in the next iteration (c -> c + 1).
# This is mostly useful when c < C (but putting an `if` fails...).
@synchronize

Check warning on line 279 in src/interpolation/gpu.jl

Codecov / codecov/patch

src/interpolation/gpu.jl#L279

Added line #L279 was not covered by tests
end

nothing
1 change: 1 addition & 0 deletions test/Project.toml
Original file line number Diff line number Diff line change
@@ -8,6 +8,7 @@ JET = "c3a54625-cd67-489e-a8e7-0a5a0ff4e31b"
KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
NFFT = "efe261a4-0d2b-5849-be55-fc731d526b0d"
Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
63 changes: 39 additions & 24 deletions test/pseudo_gpu.jl
Original file line number Diff line number Diff line change
@@ -1,10 +1,19 @@
# Test GPU code using CPU arrays. Everything is run on the CPU.
# Test GPU code using CPU arrays. By default everything is run on the CPU.
#
# We define a minimal custom array type, so that it runs the kernels the GPU would run
# instead of using the alternative CPU branches.
# We also tried to use JLArrays instead, but we need scalar indexing which is disallowed by
# JLArray (since it's supposed to mimic GPU arrays). Even with allowscalar(true), kernels
# seem to fail for other reasons.
#
# To test an actual GPU backend, set the environment variable JULIA_GPU_BACKEND before
# launching this script. Possible values are:
# - PseudoGPU (default)
# - CUDA
# - AMDGPU
# The required packages (e.g. CUDA.jl) will automatically be installed in the current
# environment.


# TODO:
# - use new JLBackend in the latest GPUArrays
@@ -18,6 +27,10 @@ using GPUArraysCore: AbstractGPUArray
using Random
using Test

# Allow testing on actual GPU arrays if the right environment variable is passed (the
# corresponding package is then installed automatically further below via Pkg.add).
const GPU_BACKEND = get(ENV, "JULIA_GPU_BACKEND", "PseudoGPU")

# ================================================================================ #

# Definition of custom "GPU" array type and custom KA backend.
@@ -51,6 +64,25 @@ end

# ================================================================================ #

# Select the backend type and the array type used by the tests, based on the value of
# GPU_BACKEND (taken from the JULIA_GPU_BACKEND environment variable, default "PseudoGPU").
# `@static` evaluates the condition at macro-expansion time, so only the selected branch
# ends up in the script; the other backends' packages are never installed or loaded.
@static if GPU_BACKEND == "PseudoGPU"
# Default: the custom pseudo-GPU types of this file (everything runs on the CPU).
const GPUBackend = PseudoGPU
const GPUArrayType = PseudoGPUArray
elseif GPU_BACKEND == "CUDA"
# Add CUDA.jl to the currently active environment before loading it.
using Pkg; Pkg.add("CUDA")
using CUDA
const GPUBackend = CUDABackend
const GPUArrayType = CuArray
elseif GPU_BACKEND == "AMDGPU"
# Add AMDGPU.jl to the currently active environment before loading it.
using Pkg; Pkg.add("AMDGPU")
using AMDGPU
const GPUBackend = ROCBackend
const GPUArrayType = ROCArray
else
# Any other value is rejected up front instead of silently falling back to a default.
error("unknown value of JULIA_GPU_BACKEND: $GPU_BACKEND")
end

# Log the selection so that test output shows which backend was actually exercised.
@info "GPU tests - using:" GPU_BACKEND GPUBackend GPUArrayType

function run_plan(p::PlanNUFFT, xp_init::AbstractArray, vp_init::NTuple{Nc, AbstractVector}) where {Nc}
(; backend,) = p

@@ -59,23 +91,6 @@ function run_plan(p::PlanNUFFT, xp_init::AbstractArray, vp_init::NTuple{Nc, Abst

set_points!(p, xp)

save_points_sorted = false # this can be useful for verifying spatial sorting graphically

if backend isa PseudoGPU && save_points_sorted
inds = NonuniformFFTs.get_pointperm(p.blocks)
if inds !== nothing
open("points_sorted.dat", "w") do io
for i ∈ inds
x⃗ = xp[i]
for x ∈ x⃗
print(io, "\t", x)
end
print(io, "\n")
end
end
end
end

T = eltype(p) # type in Fourier space (always complex) - for compatibility with AbstractNFFTs plans
@test T <: Complex
dims = size(p)
@@ -102,17 +117,17 @@ function compare_with_cpu(::Type{T}, dims; Np = prod(dims), ntransforms::Val{Nc}
xp_init = [rand(rng, SVector{N, Tr}) * Tr(2π) for _ ∈ 1:Np] # non-uniform points in [0, 2π]ᵈ
vp_init = ntuple(_ -> randn(rng, T, Np), ntransforms)

@inferred test_inference_block_dims_shmem(PseudoGPU(), T, dims, HalfSupport(4))
@inferred test_inference_block_dims_shmem(GPUBackend(), T, dims, HalfSupport(4))

params = (; m = HalfSupport(4), kernel = KaiserBesselKernel(), σ = 1.5, ntransforms, kws...)
p_cpu = @inferred PlanNUFFT(T, dims; params..., backend = CPU())
p_gpu = @inferred PlanNUFFT(T, dims; params..., backend = PseudoGPU())
p_gpu = @inferred PlanNUFFT(T, dims; params..., backend = GPUBackend())

# Test that plan_nfft interface works.
@testset "AbstractNFFTs.plan_nfft" begin
xmat = reinterpret(reshape, Tr, xp_init)
p_nfft = @inferred AbstractNFFTs.plan_nfft(PseudoGPUArray, xmat, dims)
@test p_nfft.p.backend isa PseudoGPU
p_nfft = @inferred AbstractNFFTs.plan_nfft(GPUArrayType, xmat, dims)
@test p_nfft.p.backend isa GPUBackend
# Test without the initial argument (type of array)
p_nfft_cpu = @inferred AbstractNFFTs.plan_nfft(xmat, dims)
@test p_nfft_cpu.p.backend isa CPU
@@ -127,8 +142,8 @@ function compare_with_cpu(::Type{T}, dims; Np = prod(dims), ntransforms::Val{Nc}
# approximation of the KB kernel, while the GPU evaluates it "exactly" from its
# definition (based on Bessel functions for KB).
rtol = Tr === Float64 ? 1e-7 : Tr === Float32 ? 1f-5 : nothing
@test r_cpu.us[c] ≈ r_gpu.us[c] rtol=rtol # output of type-1 transform
@test r_cpu.wp[c] ≈ r_gpu.wp[c] rtol=rtol # output of type-2 transform
@test r_cpu.us[c] ≈ Array(r_gpu.us[c]) rtol=rtol # output of type-1 transform
@test r_cpu.wp[c] ≈ Array(r_gpu.wp[c]) rtol=rtol # output of type-2 transform
end

nothing