diff --git a/Project.toml b/Project.toml
index d47c34b..cde6523 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,10 +1,9 @@
 name = "QuantitativeSusceptibilityMappingTGV"
 uuid = "bd393529-335a-4aed-902f-5de61cc7ff49"
 authors = ["Korbinian Eckstein korbinian90@gmail.com"]
-version = "0.2.1"
+version = "0.3.0"
 
 [deps]
-CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 ImageFiltering = "6a3955dd-da59-5b1f-98d4-e7296123deb5"
 ImageMorphology = "787d08f9-d448-5407-9aad-5290dd7ab264"
 Interpolations = "a98d9a8b-a2ab-59e6-89dd-64a1c18fca59"
@@ -18,7 +17,6 @@ StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 
 [compat]
-CUDA = "4,5"
 ImageFiltering = "0.7"
 ImageMorphology = "0.4"
 Interpolations = "0.14"
diff --git a/README.md b/README.md
index f458c99..c51a9a7 100644
--- a/README.md
+++ b/README.md
@@ -72,7 +72,6 @@ Under Linux: Make the file executable with `chmod +x tgv_qsm.jl` and run via
     TE = 0.004 # in [s]
     fieldstrength = 3 # in [T]
     
-    # Automatically runs on GPU, if a CUDA device is detected
     chi = qsm_tgv(phase, mask, res; TE=TE, fieldstrength=fieldstrength);
 
     savenii(chi, "chi", "<folder-to-save>")
@@ -97,11 +96,6 @@ The first execution might take some time to compile the kernels (~1min).
 
 The default settings were optimized for brain QSM and should lead to good results independently of the acquired resolution.
 
-```julia
-# Run on CPU in parallel
-chi = qsm_tgv(phase, mask, res; TE, fieldstrength, gpu=false);
-```
-
 It uses the number of CPU threads julia was started with. You can use `julia --threads=auto` or set it to a specific number of threads.
 
 ### Other optional keyword arguments
@@ -146,10 +140,15 @@ qsm = qsm_tgv(phase, mask, res; TE, fieldstrength, laplacian=get_laplace_phase3,
 
 The parallel CPU version is about twice as fast as the Cython version, the GPU version is about 10x faster than the Cython version (on a RTX 3060 Laptop GPU 6GB)  
 
-## Run with other GPU types
+## Run on GPU
 
 Other GPU types don't work with the command line script. They have to be accessed via Julia (or the command line script modified).
 
+```julia
+using CUDA
+chi = qsm_tgv(phase, mask, res; TE, fieldstrength, gpu=CUDA);
+```
+
 ```julia
 using AMDGPU
 chi = qsm_tgv(phase, mask, res; TE, fieldstrength, gpu=AMDGPU);
diff --git a/src/QuantitativeSusceptibilityMappingTGV.jl b/src/QuantitativeSusceptibilityMappingTGV.jl
index e1ffb75..18923ec 100644
--- a/src/QuantitativeSusceptibilityMappingTGV.jl
+++ b/src/QuantitativeSusceptibilityMappingTGV.jl
@@ -1,6 +1,6 @@
 module QuantitativeSusceptibilityMappingTGV
 
-using CUDA, KernelAbstractions, PaddedViews, ImageMorphology, Interpolations, Rotations, OffsetArrays, StaticArrays, ProgressMeter, Statistics, ImageFiltering, ROMEO
+using KernelAbstractions, PaddedViews, ImageMorphology, Interpolations, Rotations, OffsetArrays, StaticArrays, ProgressMeter, Statistics, ImageFiltering, ROMEO
 
 include("tgv.jl")
 include("tgv_helper.jl")
diff --git a/src/tgv.jl b/src/tgv.jl
index cfe401b..db79c62 100644
--- a/src/tgv.jl
+++ b/src/tgv.jl
@@ -7,7 +7,7 @@ Example:
     chi = qsm_tgv(randn(20,20,20), trues(20,20,20), [1,1,2]; TE=0.025, fieldstrength=3)
 
 """
-function qsm_tgv(phase, mask, res; TE, B0_dir=[0, 0, 1], fieldstrength=3, regularization=2, alpha=get_default_alpha(regularization), step_size=3, iterations=get_default_iterations(res, step_size), erosions=3, dedimensionalize=false, correct_laplacian=true, laplacian=get_laplace_phase_del, type=Float32, gpu=CUDA.functional(), nblocks=32)
+function qsm_tgv(phase, mask, res; TE, B0_dir=[0, 0, 1], fieldstrength=3, regularization=2, alpha=get_default_alpha(regularization), step_size=3, iterations=get_default_iterations(res, step_size), erosions=3, dedimensionalize=false, correct_laplacian=true, laplacian=get_laplace_phase_del, type=Float32, gpu=nothing, nblocks=32)
     device, cu = select_device(gpu)
     phase, res, alpha, fieldstrength, mask = adjust_types(type, phase, res, alpha, fieldstrength, mask)
 
@@ -163,15 +163,7 @@ function reduce_to_mask_box(laplace_phi0, mask)
     return laplace_phi0, mask, box_indices, original_size
 end
 
-function default_backend()
-    if CUDA.functional()
-        return CUDA.CUDAKernels.CUDABackend()
-    else
-        return CPU()
-    end
-end
-
-function select_device(library::Module)
+function select_device(library)
     if Symbol(library) == :CUDA
         println("Using the GPU via CUDA")
         return library.CUDAKernels.CUDABackend(), library.cu
@@ -190,16 +182,6 @@ function select_device(library::Module)
     end
 end
 
-function select_device(gpu)
-    if gpu
-        println("Using the GPU")
-        return CUDA.CUDAKernels.CUDABackend(), CUDA.cu
-    else
-        println("Using $(Threads.nthreads()) CPU threads")
-        return CPU(), identity
-    end
-end
-
 function erode_mask(mask)
     SE = strel(CartesianIndex, strel_diamond(mask))
     erode_vox(I) = minimum(mask[I+J] for J in SE if checkbounds(Bool, mask, I + J))
diff --git a/test/Project.toml b/test/Project.toml
index 5788300..b2bcf2b 100644
--- a/test/Project.toml
+++ b/test/Project.toml
@@ -1,4 +1,5 @@
 [deps]
 Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 TestItemRunner = "f8b46487-2199-4994-9208-9a1283c18c0a"
diff --git a/test/runtests.jl b/test/runtests.jl
index af44e9c..ed32e66 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -17,25 +17,9 @@ using TestItemRunner
     @test size(chi) == sz
 end
 
-@testitem "GPU" begin
-    if QuantitativeSusceptibilityMappingTGV.CUDA.functional()
-        sz = (20, 20, 20)
-        phase = randn(sz)
-        mask = trues(sz)
-        res = [1, 1, 1]
-        TE = 1
-
-        iterations = 10
-        chi_cpu = qsm_tgv(phase, mask, res; TE, iterations, gpu=false)
-        chi_gpu = qsm_tgv(phase, mask, res; TE, iterations, gpu=true)
-
-        relative_diff(A, B) = sum(abs.(A .- B)) / sum(abs.(B))
-        @test relative_diff(Array(chi_gpu), chi_cpu) < 1e-7
-    end
-end
-
 @testitem "CUDA" begin
-    if QuantitativeSusceptibilityMappingTGV.CUDA.functional()
+    using CUDA
+    if CUDA.functional()
         sz = (20, 20, 20)
         phase = randn(sz)
         mask = trues(sz)
@@ -44,7 +28,7 @@ end
 
         iterations = 10
         chi_cpu = qsm_tgv(phase, mask, res; TE, iterations, gpu=false)
-        chi_gpu = qsm_tgv(phase, mask, res; TE, iterations, gpu=QuantitativeSusceptibilityMappingTGV.CUDA)
+        chi_gpu = qsm_tgv(phase, mask, res; TE, iterations, gpu=CUDA)
 
         relative_diff(A, B) = sum(abs.(A .- B)) / sum(abs.(B))
         @test relative_diff(Array(chi_gpu), chi_cpu) < 1e-7
diff --git a/tgv_qsm.jl b/tgv_qsm.jl
index c02defa..d3f0e5b 100644
--- a/tgv_qsm.jl
+++ b/tgv_qsm.jl
@@ -25,7 +25,7 @@ end
 version = Comonicon.get_version(QuantitativeSusceptibilityMappingTGV)
 Comonicon.get_version(::Module) = version
 
-@main function tgv_qsm(fn_phase, fn_mask; TE::Float64, output::String="output.nii.gz", fieldstrength::Float64=3.0, regularization::Float64=2.0, erosions::Int=3, B0_dir::Array{Int}=[0,0,1], dedimensionalize::Bool=false, no_laplacian_correction::Bool=false, step_size::Float64=3.0, no_gpu::Bool=false, type::DataType=Float32, nblocks::Int=32)
+@main function tgv_qsm(fn_phase, fn_mask; TE::Float64, output::String="output.nii.gz", fieldstrength::Float64=3.0, regularization::Float64=2.0, erosions::Int=3, B0_dir::Array{Int}=[0,0,1], dedimensionalize::Bool=false, no_laplacian_correction::Bool=false, step_size::Float64=3.0, type::DataType=Float32, nblocks::Int=32)
     println("Starting calculation...")
     phase = readphase(fn_phase)
     mask = niread(fn_mask) .!= 0
diff --git a/tgv_qsm_cuda.jl b/tgv_qsm_cuda.jl
new file mode 100644
index 0000000..5745afe
--- /dev/null
+++ b/tgv_qsm_cuda.jl
@@ -0,0 +1,37 @@
+#!/usr/bin/env -S julia --color=yes --startup-file=no --threads=auto
+
+## Usage
+
+# Call with: `<path-to-file>/tgv_qsm.jl ARGS`
+# On windows use: `julia --threads=auto <path-to-file>/tgv_qsm.jl ARGS`
+
+# Example call:
+# `./tgv_qsm.jl phase.nii.gz mask.nii.gz --TE 0.025 --output output.nii.gz --no-gpu
+
+import Pkg
+
+## Uncomment to use a local julia package directory instead of the global one
+# package_dir = joinpath(@__DIR__, ".tgv_cmd_packages")
+# mkpath(package_dir)
+# Pkg.activate(package_dir)
+
+try
+    using CUDA, QuantitativeSusceptibilityMappingTGV, MriResearchTools, Comonicon
+catch
+    Pkg.add(["CUDA", "QuantitativeSusceptibilityMappingTGV", "MriResearchTools", "Comonicon"])
+    using CUDA, QuantitativeSusceptibilityMappingTGV, MriResearchTools, Comonicon
+end
+
+version = Comonicon.get_version(QuantitativeSusceptibilityMappingTGV)
+Comonicon.get_version(::Module) = version
+
+@main function tgv_qsm(fn_phase, fn_mask; TE::Float64, output::String="output.nii.gz", fieldstrength::Float64=3.0, regularization::Float64=2.0, erosions::Int=3, B0_dir::Array{Int}=[0,0,1], dedimensionalize::Bool=false, no_laplacian_correction::Bool=false, step_size::Float64=3.0, type::DataType=Float32, nblocks::Int=32)
+    println("Starting calculation...")
+    phase = readphase(fn_phase)
+    mask = niread(fn_mask) .!= 0
+    res = header(phase).pixdim[2:4]
+    println("Resolution from NIfTI header [mm]: $(round.(Float64.(res); digits=2))")
+    chi = qsm_tgv(phase, mask, res; TE, B0_dir, fieldstrength, regularization, erosions, dedimensionalize, correct_laplacian=!no_laplacian_correction, gpu=!no_gpu, step_size, type, nblocks, gpu=CUDA)
+    println("Writing output")
+    savenii(chi, output; header=header(phase))
+end