From c57cb19f7ae99901d7f1a9d074121c9244fa9e29 Mon Sep 17 00:00:00 2001
From: Korbinian Eckstein
Date: Fri, 15 Dec 2023 14:49:08 +1000
Subject: [PATCH] don't use CUDA as default

---
 Project.toml                                |  4 +--
 README.md                                   | 13 ++++----
 src/QuantitativeSusceptibilityMappingTGV.jl |  2 +-
 src/tgv.jl                                  | 22 ++----------
 test/Project.toml                           |  1 +
 test/runtests.jl                            | 22 ++----------
 tgv_qsm.jl                                  |  2 +-
 tgv_qsm_cuda.jl                             | 37 +++++++++++++++++++++
 8 files changed, 52 insertions(+), 51 deletions(-)
 create mode 100644 tgv_qsm_cuda.jl

diff --git a/Project.toml b/Project.toml
index d47c34b..cde6523 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,10 +1,9 @@
 name = "QuantitativeSusceptibilityMappingTGV"
 uuid = "bd393529-335a-4aed-902f-5de61cc7ff49"
 authors = ["Korbinian Eckstein korbinian90@gmail.com"]
-version = "0.2.1"
+version = "0.3.0"
 
 [deps]
-CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 ImageFiltering = "6a3955dd-da59-5b1f-98d4-e7296123deb5"
 ImageMorphology = "787d08f9-d448-5407-9aad-5290dd7ab264"
 Interpolations = "a98d9a8b-a2ab-59e6-89dd-64a1c18fca59"
@@ -18,7 +17,6 @@ StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 
 [compat]
-CUDA = "4,5"
 ImageFiltering = "0.7"
 ImageMorphology = "0.4"
 Interpolations = "0.14"
diff --git a/README.md b/README.md
index f458c99..c51a9a7 100644
--- a/README.md
+++ b/README.md
@@ -72,7 +72,6 @@ Under Linux: Make the file executable with `chmod +x tgv_qsm.jl` and run via
     TE = 0.004 # in [s]
     fieldstrength = 3 # in [T]
 
-    # Automatically runs on GPU, if a CUDA device is detected
     chi = qsm_tgv(phase, mask, res; TE=TE, fieldstrength=fieldstrength);
 
     savenii(chi, "chi", "")
@@ -97,11 +96,6 @@ The first execution might take some time to compile the kernels (~1min).
 
 The default settings were optimized for brain QSM and should lead to good results independently of the acquired resolution.
 
-```julia
-# Run on CPU in parallel
-chi = qsm_tgv(phase, mask, res; TE, fieldstrength, gpu=false);
-```
-
 It uses the number of CPU threads julia was started with. You can use `julia --threads=auto` or set it to a specific number of threads.
 
 ### Other optional keyword arguments
@@ -146,10 +140,15 @@ qsm = qsm_tgv(phase, mask, res; TE, fieldstrength, laplacian=get_laplace_phase3,
 
 The parallel CPU version is about twice as fast as the Cython version, the GPU version is about 10x faster than the Cython version (on a RTX 3060 Laptop GPU 6GB)
 
-## Run with other GPU types
+## Run on GPU
 
 Other GPU types don't work with the command line script. They have to be accessed via Julia (or the command line script modified).
 
+```julia
+using CUDA
+chi = qsm_tgv(phase, mask, res; TE, fieldstrength, gpu=CUDA);
+```
+
 ```julia
 using AMDGPU
 chi = qsm_tgv(phase, mask, res; TE, fieldstrength, gpu=AMDGPU);
diff --git a/src/QuantitativeSusceptibilityMappingTGV.jl b/src/QuantitativeSusceptibilityMappingTGV.jl
index e1ffb75..18923ec 100644
--- a/src/QuantitativeSusceptibilityMappingTGV.jl
+++ b/src/QuantitativeSusceptibilityMappingTGV.jl
@@ -1,6 +1,6 @@
 module QuantitativeSusceptibilityMappingTGV
 
-using CUDA, KernelAbstractions, PaddedViews, ImageMorphology, Interpolations, Rotations, OffsetArrays, StaticArrays, ProgressMeter, Statistics, ImageFiltering, ROMEO
+using KernelAbstractions, PaddedViews, ImageMorphology, Interpolations, Rotations, OffsetArrays, StaticArrays, ProgressMeter, Statistics, ImageFiltering, ROMEO
 
 include("tgv.jl")
 include("tgv_helper.jl")
diff --git a/src/tgv.jl b/src/tgv.jl
index cfe401b..db79c62 100644
--- a/src/tgv.jl
+++ b/src/tgv.jl
@@ -7,7 +7,7 @@ Example:
     chi = qsm_tgv(randn(20,20,20), trues(20,20,20), [1,1,2]; TE=0.025, fieldstrength=3)
 
 """
-function qsm_tgv(phase, mask, res; TE, B0_dir=[0, 0, 1], fieldstrength=3, regularization=2, alpha=get_default_alpha(regularization), step_size=3, iterations=get_default_iterations(res, step_size), erosions=3, dedimensionalize=false, correct_laplacian=true, laplacian=get_laplace_phase_del, type=Float32, gpu=CUDA.functional(), nblocks=32)
+function qsm_tgv(phase, mask, res; TE, B0_dir=[0, 0, 1], fieldstrength=3, regularization=2, alpha=get_default_alpha(regularization), step_size=3, iterations=get_default_iterations(res, step_size), erosions=3, dedimensionalize=false, correct_laplacian=true, laplacian=get_laplace_phase_del, type=Float32, gpu=nothing, nblocks=32)
     device, cu = select_device(gpu)
 
     phase, res, alpha, fieldstrength, mask = adjust_types(type, phase, res, alpha, fieldstrength, mask)
@@ -163,15 +163,7 @@ function reduce_to_mask_box(laplace_phi0, mask)
     return laplace_phi0, mask, box_indices, original_size
 end
 
-function default_backend()
-    if CUDA.functional()
-        return CUDA.CUDAKernels.CUDABackend()
-    else
-        return CPU()
-    end
-end
-
-function select_device(library::Module)
+function select_device(library)
     if Symbol(library) == :CUDA
         println("Using the GPU via CUDA")
         return library.CUDAKernels.CUDABackend(), library.cu
@@ -190,16 +182,6 @@
     end
 end
 
-function select_device(gpu)
-    if gpu
-        println("Using the GPU")
-        return CUDA.CUDAKernels.CUDABackend(), CUDA.cu
-    else
-        println("Using $(Threads.nthreads()) CPU threads")
-        return CPU(), identity
-    end
-end
-
 function erode_mask(mask)
     SE = strel(CartesianIndex, strel_diamond(mask))
     erode_vox(I) = minimum(mask[I+J] for J in SE if checkbounds(Bool, mask, I + J))
diff --git a/test/Project.toml b/test/Project.toml
index 5788300..b2bcf2b 100644
--- a/test/Project.toml
+++ b/test/Project.toml
@@ -1,4 +1,5 @@
 [deps]
 Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 TestItemRunner = "f8b46487-2199-4994-9208-9a1283c18c0a"
diff --git a/test/runtests.jl b/test/runtests.jl
index af44e9c..ed32e66 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -17,25 +17,9 @@ using TestItemRunner
     @test size(chi) == sz
 end
 
-@testitem "GPU" begin
-    if QuantitativeSusceptibilityMappingTGV.CUDA.functional()
-        sz = (20, 20, 20)
-        phase = randn(sz)
-        mask = trues(sz)
-        res = [1, 1, 1]
-        TE = 1
-
-        iterations = 10
-        chi_cpu = qsm_tgv(phase, mask, res; TE, iterations, gpu=false)
-        chi_gpu = qsm_tgv(phase, mask, res; TE, iterations, gpu=true)
-
-        relative_diff(A, B) = sum(abs.(A .- B)) / sum(abs.(B))
-        @test relative_diff(Array(chi_gpu), chi_cpu) < 1e-7
-    end
-end
-
 @testitem "CUDA" begin
-    if QuantitativeSusceptibilityMappingTGV.CUDA.functional()
+    using CUDA
+    if CUDA.functional()
         sz = (20, 20, 20)
         phase = randn(sz)
         mask = trues(sz)
@@ -44,7 +28,7 @@ end
 
         iterations = 10
         chi_cpu = qsm_tgv(phase, mask, res; TE, iterations, gpu=false)
-        chi_gpu = qsm_tgv(phase, mask, res; TE, iterations, gpu=QuantitativeSusceptibilityMappingTGV.CUDA)
+        chi_gpu = qsm_tgv(phase, mask, res; TE, iterations, gpu=CUDA)
 
         relative_diff(A, B) = sum(abs.(A .- B)) / sum(abs.(B))
         @test relative_diff(Array(chi_gpu), chi_cpu) < 1e-7
diff --git a/tgv_qsm.jl b/tgv_qsm.jl
index c02defa..d3f0e5b 100644
--- a/tgv_qsm.jl
+++ b/tgv_qsm.jl
@@ -25,7 +25,7 @@ end
 version = Comonicon.get_version(QuantitativeSusceptibilityMappingTGV)
 Comonicon.get_version(::Module) = version
 
-@main function tgv_qsm(fn_phase, fn_mask; TE::Float64, output::String="output.nii.gz", fieldstrength::Float64=3.0, regularization::Float64=2.0, erosions::Int=3, B0_dir::Array{Int}=[0,0,1], dedimensionalize::Bool=false, no_laplacian_correction::Bool=false, step_size::Float64=3.0, no_gpu::Bool=false, type::DataType=Float32, nblocks::Int=32)
+@main function tgv_qsm(fn_phase, fn_mask; TE::Float64, output::String="output.nii.gz", fieldstrength::Float64=3.0, regularization::Float64=2.0, erosions::Int=3, B0_dir::Array{Int}=[0,0,1], dedimensionalize::Bool=false, no_laplacian_correction::Bool=false, step_size::Float64=3.0, type::DataType=Float32, nblocks::Int=32)
     println("Starting calculation...")
     phase = readphase(fn_phase)
     mask = niread(fn_mask) .!= 0
diff --git a/tgv_qsm_cuda.jl b/tgv_qsm_cuda.jl
new file mode 100644
index 0000000..5745afe
--- /dev/null
+++ b/tgv_qsm_cuda.jl
@@ -0,0 +1,37 @@
+#!/usr/bin/env -S julia --color=yes --startup-file=no --threads=auto
+
+## Usage
+
+# Call with: `./tgv_qsm_cuda.jl ARGS`
+# On Windows use: `julia --threads=auto tgv_qsm_cuda.jl ARGS`
+
+# Example call:
+# `./tgv_qsm_cuda.jl phase.nii.gz mask.nii.gz --TE 0.025 --output output.nii.gz`
+
+import Pkg
+
+## Uncomment to use a local julia package directory instead of the global one
+# package_dir = joinpath(@__DIR__, ".tgv_cmd_packages")
+# mkpath(package_dir)
+# Pkg.activate(package_dir)
+
+try
+    using CUDA, QuantitativeSusceptibilityMappingTGV, MriResearchTools, Comonicon
+catch
+    Pkg.add(["CUDA", "QuantitativeSusceptibilityMappingTGV", "MriResearchTools", "Comonicon"])
+    using CUDA, QuantitativeSusceptibilityMappingTGV, MriResearchTools, Comonicon
+end
+
+version = Comonicon.get_version(QuantitativeSusceptibilityMappingTGV)
+Comonicon.get_version(::Module) = version
+
+@main function tgv_qsm(fn_phase, fn_mask; TE::Float64, output::String="output.nii.gz", fieldstrength::Float64=3.0, regularization::Float64=2.0, erosions::Int=3, B0_dir::Array{Int}=[0,0,1], dedimensionalize::Bool=false, no_laplacian_correction::Bool=false, step_size::Float64=3.0, type::DataType=Float32, nblocks::Int=32)
+    println("Starting calculation...")
+    phase = readphase(fn_phase)
+    mask = niread(fn_mask) .!= 0
+    res = header(phase).pixdim[2:4]
+    println("Resolution from NIfTI header [mm]: $(round.(Float64.(res); digits=2))")
+    chi = qsm_tgv(phase, mask, res; TE, B0_dir, fieldstrength, regularization, erosions, dedimensionalize, correct_laplacian=!no_laplacian_correction, gpu=CUDA, step_size, type, nblocks)
+    println("Writing output")
+    savenii(chi, output; header=header(phase))
+end
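
For reference, a minimal sketch (not part of the patch) of how the reworked `gpu` keyword behaves after this change. The `phase`, `mask`, `res`, and `TE` values below are synthetic placeholders; the GPU package is loaded by the caller and passed in, since CUDA is no longer a dependency of the package itself:

```julia
using QuantitativeSusceptibilityMappingTGV

# Placeholder inputs, for illustration only
phase = randn(20, 20, 20)
mask = trues(20, 20, 20)
res = [1, 1, 1]   # voxel size in mm
TE = 0.025        # echo time in s

# Default (gpu=nothing): runs on the CPU using the available Julia threads
chi_cpu = qsm_tgv(phase, mask, res; TE)

# Opt in to a GPU by passing the loaded GPU package as the `gpu` keyword,
# guarded as in the test suite
using CUDA
if CUDA.functional()
    chi_gpu = qsm_tgv(phase, mask, res; TE, gpu=CUDA)
end
```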