Cuda heat example w quaditer #913

Draft pull request, wants to merge 144 commits into base: master

Commits (144); the diff shown below is the changes from 1 commit.
a979fb2
Initial ideas
KnutAM Jan 11, 2024
298158c
Working implementation
KnutAM Jan 11, 2024
1794db3
Merge branch 'master' into kam/QuadraturePointIterator
KnutAM Feb 24, 2024
51ab4f2
Add static values version and improve interface
KnutAM Feb 24, 2024
22a7377
Add dev example and test
KnutAM Feb 24, 2024
18377f3
Merge branch 'master' into kam/QuadraturePointIterator
KnutAM Feb 28, 2024
27a3a96
Add StaticCellValues without stored cell coordinates
KnutAM Feb 28, 2024
95b5729
initial ideas
Abdelrahman912 May 7, 2024
d4e881d
minor changes
Abdelrahman912 May 14, 2024
f55b878
Merge branch 'Ferrite-FEM:master' into cuda-heat-example-w-quaditer
Abdelrahman912 May 14, 2024
c1ef6ad
add some abstractions
Abdelrahman912 May 23, 2024
394ac6a
add minor comment
Abdelrahman912 May 23, 2024
1f0df67
add z direction for numerical integration
Abdelrahman912 May 30, 2024
3152042
add Float32
Abdelrahman912 Jun 4, 2024
aac5994
minor fix
Abdelrahman912 Jun 4, 2024
142f89a
init coloring implementation
Abdelrahman912 Jun 18, 2024
eaff534
init working on the assembler
Abdelrahman912 Jun 19, 2024
ffdc341
init gpu_assembler
Abdelrahman912 Jun 20, 2024
59595e8
implement naive gpu_assembler
Abdelrahman912 Jun 20, 2024
0e3cb21
minor fix
Abdelrahman912 Jun 20, 2024
687141d
use CuSparseMatrixCSC in assembler
Abdelrahman912 Jun 26, 2024
11d5a01
minor fix
Abdelrahman912 Jun 26, 2024
d5c951c
minor fix
Abdelrahman912 Jun 26, 2024
f4272a6
hoist dh, cellvalues, assembler outside the cuda loop
Abdelrahman912 Jun 26, 2024
d5cf949
add run_gpu macro
Abdelrahman912 Jun 26, 2024
2e52de1
init using int32 instead of int64 to reduce number of registers
Abdelrahman912 Jul 3, 2024
2cd0168
finish use int32
Abdelrahman912 Jul 3, 2024
54922ab
stupid way to circumvent rubbish values
Abdelrahman912 Jul 4, 2024
9406ff9
add discourse ref
Abdelrahman912 Jul 4, 2024
8fedba5
add ncu benchmark
Abdelrahman912 Jul 4, 2024
8bd417a
fix error in benchmark and add ref.
Abdelrahman912 Jul 4, 2024
abf11b6
set the code for debugging
Abdelrahman912 Jul 8, 2024
4f85cf5
init test
Abdelrahman912 Jul 8, 2024
4935b70
fix adapt issue
Abdelrahman912 Jul 8, 2024
188cceb
remove unnecessary cushow
Abdelrahman912 Jul 8, 2024
9c904e4
add heat equation main test set
Abdelrahman912 Jul 8, 2024
06432db
remove unnecessary comments
Abdelrahman912 Jul 8, 2024
a67caaa
add nsys benchmark
Abdelrahman912 Jul 8, 2024
ecee17f
Merge branch 'master' into cuda-heat-example-w-quaditer
Abdelrahman912 Jul 8, 2024
60edda9
fix some issues regarding the merge
Abdelrahman912 Jul 8, 2024
063ff7a
minor fix
Abdelrahman912 Jul 8, 2024
9206be3
remove nsight files
Abdelrahman912 Jul 8, 2024
1eeb568
minor fix
Abdelrahman912 Jul 8, 2024
5e339a0
add comments
Abdelrahman912 Jul 8, 2024
204f3be
minor fix
Abdelrahman912 Jul 8, 2024
0f2e6b7
add comments
Abdelrahman912 Jul 8, 2024
7100e0a
fix for CI
Abdelrahman912 Jul 8, 2024
f129449
fix for CI
Abdelrahman912 Jul 8, 2024
618adb5
CI fix
Abdelrahman912 Jul 8, 2024
78f120c
ci
Abdelrahman912 Jul 8, 2024
4971cba
minor fix
Abdelrahman912 Jul 8, 2024
ea8451c
fix ci
Abdelrahman912 Jul 8, 2024
986c5db
remove file
Abdelrahman912 Jul 8, 2024
f93fdfb
add CUDA to docs project
Abdelrahman912 Jul 8, 2024
f442ae2
add v2 for gpu_heat_equation
Abdelrahman912 Jul 15, 2024
81274d5
add adapt to docs
Abdelrahman912 Jul 15, 2024
fbc05ed
minor fix
Abdelrahman912 Jul 22, 2024
506328c
init assemble per dof
Abdelrahman912 Jul 22, 2024
b505189
assemble global v3
Abdelrahman912 Jul 22, 2024
b0a94aa
minor fix
Abdelrahman912 Jul 22, 2024
aa3d1ae
add comment + start in v4
Abdelrahman912 Jul 31, 2024
c8cf6fe
add map dof to elements
Abdelrahman912 Jul 31, 2024
8a4523d
add 3d array for local matrices
Abdelrahman912 Aug 1, 2024
9617a4f
init code for v4
Abdelrahman912 Aug 1, 2024
427a6b0
fix bug w assemble global in v4
Abdelrahman912 Aug 5, 2024
bbed047
precommit fix
Abdelrahman912 Aug 5, 2024
85c055c
add preserve ref
Abdelrahman912 Aug 5, 2024
2b77613
fix precommit
Abdelrahman912 Aug 5, 2024
f9c70ab
fix logic error in v4
Abdelrahman912 Sep 7, 2024
0519016
init shared array usage
Abdelrahman912 Sep 9, 2024
5752676
optimize threads for dynamic shared memory threshold
Abdelrahman912 Sep 10, 2024
0fe023c
fix bug in dynamic shared mem
Abdelrahman912 Sep 11, 2024
a352612
minor fix
Abdelrahman912 Sep 11, 2024
2a6120a
init kernel abstractions
Abdelrahman912 Sep 16, 2024
67face7
add local matrix kernel
Abdelrahman912 Sep 16, 2024
aca8a6f
add global matrix kernel with CUDA dependency
Abdelrahman912 Sep 16, 2024
9e4d592
minor change
Abdelrahman912 Sep 16, 2024
6114495
init working KS implementation (still CUDA dependent)
Abdelrahman912 Sep 17, 2024
2a8abeb
remove cuda dependency
Abdelrahman912 Sep 18, 2024
630017c
add reference to
Abdelrahman912 Sep 18, 2024
fc26670
use Atomix.jl
Abdelrahman912 Sep 20, 2024
ae7bc93
init v4 ks
Abdelrahman912 Sep 20, 2024
0e28f14
init cell cache prototype
Abdelrahman912 Sep 23, 2024
0eb376d
working gpu cell cache
Abdelrahman912 Sep 23, 2024
8f7a182
fix types
Abdelrahman912 Sep 23, 2024
9b1567d
init gpu cell iterator
Abdelrahman912 Sep 23, 2024
a08ab97
add iterator
Abdelrahman912 Sep 25, 2024
b34c43b
add stride kernel
Abdelrahman912 Sep 26, 2024
b289b69
minor fix
Abdelrahman912 Sep 26, 2024
b2c0347
fix blocks, threads for kernel launch
Abdelrahman912 Sep 27, 2024
b87d78b
minor fix for thread, blocks
Abdelrahman912 Sep 27, 2024
e10e2f6
Merge branch 'master' into cuda-heat-example-w-quaditer
Abdelrahman912 Oct 3, 2024
42a28e1
add gpu as extension
Abdelrahman912 Oct 4, 2024
e59b8b8
add some documentation and remove unnecessary implementations.
Abdelrahman912 Oct 7, 2024
e7157e4
Merge branch 'master' into cuda-heat-example-w-quaditer
Abdelrahman912 Oct 10, 2024
e4b194d
init unit test
Abdelrahman912 Oct 10, 2024
a613107
init test for iterators
Abdelrahman912 Oct 11, 2024
113a7a2
Merge branch 'master' into cuda-heat-example-w-quaditer
Abdelrahman912 Oct 11, 2024
d1e831e
add tests in GPU/
Abdelrahman912 Oct 11, 2024
7f8fa3c
add test local ke and fe
Abdelrahman912 Oct 11, 2024
c38419c
minor fix
Abdelrahman912 Oct 11, 2024
190e43e
fix ci - 1
Abdelrahman912 Oct 11, 2024
763c6b5
fix ci-2
Abdelrahman912 Oct 11, 2024
1b6060d
minor edit
Abdelrahman912 Oct 11, 2024
d767668
fix ci
Abdelrahman912 Oct 12, 2024
726ea9e
ci
Abdelrahman912 Oct 12, 2024
8590aa4
fix ci
Abdelrahman912 Oct 12, 2024
39e1f0c
minor edit
Abdelrahman912 Oct 12, 2024
f0cd305
add validation for cuda, minor fix, separate unit tests into multiple…
Abdelrahman912 Oct 14, 2024
9d4e8b9
fix precommit shit
Abdelrahman912 Oct 14, 2024
12f64bb
try documentation test fix
Abdelrahman912 Oct 14, 2024
361333b
documentation test fix
Abdelrahman912 Oct 14, 2024
e31c6e3
make ci happy
Abdelrahman912 Oct 14, 2024
626dec2
change kernel launch, init adapt test
Abdelrahman912 Oct 15, 2024
fbc1b4b
minor fix
Abdelrahman912 Oct 15, 2024
ea83925
add test_adapt, some comments
Abdelrahman912 Oct 15, 2024
a356d8d
fix precommit
Abdelrahman912 Oct 15, 2024
ee1f77c
init cpu multi threading
Abdelrahman912 Nov 4, 2024
fb7e1fc
Merge branch 'master' into cuda-heat-example-w-quaditer
Abdelrahman912 Nov 4, 2024
b38ab72
hot fix for buggy assembly logic
Abdelrahman912 Nov 5, 2024
adb166a
minor fix
Abdelrahman912 Nov 6, 2024
6300a4a
test sth
Abdelrahman912 Nov 6, 2024
b7301c2
precommit fix
Abdelrahman912 Nov 6, 2024
18f47b8
fix explicit imports
Abdelrahman912 Nov 6, 2024
f6e9cc6
add fillzero
Abdelrahman912 Nov 6, 2024
8a796de
Merge branch 'master' into cuda-heat-example-w-quaditer
Abdelrahman912 Nov 6, 2024
75e89ed
minor fix for gpu assembly
Abdelrahman912 Nov 6, 2024
a77c347
minor minor fix
Abdelrahman912 Nov 6, 2024
7338788
make cache mutable
Abdelrahman912 Nov 12, 2024
cbab665
put the coloring stuff in the init
Abdelrahman912 Nov 12, 2024
1c81281
minor fix
Abdelrahman912 Nov 12, 2024
d42bcab
code for benchmarking (to be removed)
Abdelrahman912 Nov 13, 2024
1ab1650
rm cpu multithreading benchmark code
Abdelrahman912 Nov 13, 2024
bc8ec95
init fix for higher order approximations in gpu
Abdelrahman912 Nov 18, 2024
c7f4b0f
add working imp for global gpu mem
Abdelrahman912 Nov 18, 2024
d4d5967
add some comments
Abdelrahman912 Nov 18, 2024
3b2196b
trying to make the ci happy
Abdelrahman912 Nov 19, 2024
825d257
minor fix
Abdelrahman912 Nov 19, 2024
6109bd1
comment gpu related stuff in eg to pass ci
Abdelrahman912 Nov 19, 2024
9caa60b
some review fixes
Abdelrahman912 Nov 25, 2024
868d559
some review fixes
Abdelrahman912 Nov 25, 2024
a4637b6
add allocate_matrix for CuSparseMatrix
Abdelrahman912 Nov 26, 2024
1619986
init first ideas for cuda mem allocator
Abdelrahman912 Nov 28, 2024
69eb55a
add cuda mem interface
Abdelrahman912 Nov 28, 2024
191 changes: 139 additions & 52 deletions docs/src/literate-tutorials/gpu_qp_heat_equation.jl
@@ -1,15 +1,18 @@
using Ferrite, CUDA

-left = Tensor{1,2,Float64}((0,-0)) # define the left bottom corner of the grid.
-right = Tensor{1,2,Float64}((400.0,400.0)) # define the right top corner of the grid.
+left = Tensor{1,2,Float32}((0,-0)) # define the left bottom corner of the grid.
+right = Tensor{1,2,Float32}((100.0,100.0)) # define the right top corner of the grid.

grid = generate_grid(Quadrilateral, (100, 100), left, right);

ip = Lagrange{RefQuadrilateral, 1}() # define the interpolation function (i.e. bilinear Lagrange)

# define the numerical integration rule
# (i.e. integrating over quad shape with two quadrature points per direction)
-qr = QuadratureRule{RefQuadrilateral}(2)
+qr = QuadratureRule{RefQuadrilateral,Float32}(2)
cellvalues = CellValues(qr, ip);


@@ -20,6 +23,7 @@ close!(dh);




# Standard assembly of the element.
function assemble_element_std!(Ke::Matrix, fe::Vector, cellvalues::CellValues)
n_basefuncs = getnbasefunctions(cellvalues)
@@ -46,29 +50,28 @@ function assemble_element_std!(Ke::Matrix, fe::Vector, cellvalues::CellValues)
end

# Element assembly by using static cell (PR #883)
-function assemble_element_qpiter!(Ke::Matrix, fe::Vector, cellvalues,cell_coords::AbstractVector)
-    n_basefuncs = getnbasefunctions(cellvalues)
-    ## Loop over quadrature points
-    for qv in Ferrite.QuadratureValuesIterator(cellvalues,cell_coords)
-        ## Get the quadrature weight
-        dΩ = getdetJdV(qv)
-        ## Loop over test shape functions
-        for i in 1:n_basefuncs
-            δu = shape_value(qv, i)
-            ∇δu = shape_gradient(qv, i)
-            ## Add contribution to fe
-            fe[i] += δu * dΩ
-            ## Loop over trial shape functions
-            for j in 1:n_basefuncs
-                ∇u = shape_gradient(qv, j)
-                ## Add contribution to Ke
-                Ke[i, j] += (∇δu ⋅ ∇u) * dΩ
-            end
-        end
-    end
-    return Ke, fe
-end
-K = create_sparsity_pattern(dh)
+# function assemble_element_qpiter!(Ke::Matrix, fe::Vector, cellvalues,cell_coords::AbstractVector)
+#     n_basefuncs = getnbasefunctions(cellvalues)
+#     ## Loop over quadrature points
+#     for qv in Ferrite.QuadratureValuesIterator(cellvalues,cell_coords)
+#         ## Get the quadrature weight
+#         dΩ = getdetJdV(qv)
+#         ## Loop over test shape functions
+#         for i in 1:n_basefuncs
+#             δu = shape_value(qv, i)
+#             ∇δu = shape_gradient(qv, i)
+#             ## Add contribution to fe
+#             fe[i] += δu * dΩ
+#             ## Loop over trial shape functions
+#             for j in 1:n_basefuncs
+#                 ∇u = shape_gradient(qv, j)
+#                 ## Add contribution to Ke
+#                 Ke[i, j] += (∇δu ⋅ ∇u) * dΩ
+#             end
+#         end
+#     end
+#     return Ke, fe
+# end

function create_buffers(cellvalues, dh)
f = zeros(ndofs(dh))
@@ -105,41 +108,122 @@ function assemble_global!(cellvalues, dh::DofHandler,qp_iter::Val{QPiter}) where
end



### Old implementation, in which each thread loops over the quadrature points.
# function assemble_element_gpu!(Kgpu,cv,dh)
# i = threadIdx().x
# j = threadIdx().y
# bx = threadIdx().z + (blockIdx().x - Int32(1)) * blockDim().z # element number
# bx <= length(dh.grid.cells) || return nothing
# #bx = blockIdx().x
# cell_coords = getcoordinates(dh.grid)
# n_basefuncs = Int32(getnbasefunctions(cv))
# #Ke = CuStaticSharedArray(Float32, (n_basefuncs, n_basefuncs)) # We don't need shared memory
# keij = 0.0f0
# for qv in Ferrite.QuadratureValuesIterator(cv,cell_coords)
# # Get the quadrature weight
# dΩ = getdetJdV(qv)
# ## Get test function gradient
# ∇δu = shape_gradient(qv, i)
# ## Get shape function gradient
# ∇u = shape_gradient(qv, j)

# keij += (∇δu ⋅ ∇u) * dΩ
# end
# #Ke[i,j] = keij # We don't need shared memory

# # TODO: Assemble local matrix here in Kgpu
# # TODO: Add abstraction; additionally, use the assembler to assemble the local matrix into the global matrix.
# dofs = dh.cell_dofs
# @inbounds ig = Int32(dofs[Int32((bx-Int32(1))*n_basefuncs+i)])
# @inbounds jg = Int32(dofs[Int32((bx-Int32(1))*n_basefuncs+j)] )

# ## Sparse Addition ##
# col_start = Kgpu.colptr[jg]
# col_end = Kgpu.colptr[jg + 1] - 1

# for k in col_start:col_end
# if Kgpu.rowval[k] == ig
# # Update the existing element
# CUDA.@atomic Kgpu.nzval[k] += keij
# return
# end
# end
# ##custom_atomic_add!(Kgpu, keij, ig, jg)
# #Kgpu[ig, jg] += Float32(keij)

# return nothing
# end


function assemble_element_gpu!(Kgpu,cv,dh)
    i = threadIdx().x
    j = threadIdx().y
-   bx = blockIdx().x
+   q_point = Int64(threadIdx().z) # quadrature point
+
+   bx = blockIdx().x # element number

    cell_coords = getcoordinates(dh.grid)
    n_basefuncs = getnbasefunctions(cv)
-   #Ke = CuStaticSharedArray(Float32, (n_basefuncs, n_basefuncs)) # We don't need shared memory
-   keij = 0.0
-   for qv in Ferrite.QuadratureValuesIterator(cv,cell_coords)
-       # Get the quadrature weight
-       dΩ = getdetJdV(qv)
-       ## Get test function gradient
-       ∇δu = shape_gradient(qv, i)
-       ## Get shape function gradient
-       ∇u = shape_gradient(qv, j)
-
-       keij += (∇δu ⋅ ∇u) * dΩ
-   end
-   #Ke[i,j] = keij # We don't need shared memory
-
-   # TODO: Assemble local matrix here in Kgpu
-   # TODO: Add abstraction; additionally, use the assembler to assemble the local matrix into the global matrix.
+   # Zero the shared-memory local stiffness matrix for this element (block)
+   Ke = CuStaticSharedArray(Float32, (n_basefuncs, n_basefuncs))
+   Ke[i,j] = 0.0f0
+
+   sync_threads()
+
+   qv = Ferrite.quadrature_point_values(cv, q_point, cell_coords)
+   # Get the quadrature weight
+   dΩ = getdetJdV(qv)
+   ## Get test function gradient
+   ∇δu = shape_gradient(qv, i)
+   ## Get shape function gradient
+   ∇u = shape_gradient(qv, j)
+
+   sync_threads()
+
+   # Threads for different quadrature points accumulate into the same Ke[i,j]
+   CUDA.@atomic Ke[i,j] += (∇δu ⋅ ∇u) * dΩ

    dofs = dh.cell_dofs
-   @inbounds ig = dofs[(bx-1)*n_basefuncs+i]
-   @inbounds jg = dofs[(bx-1)*n_basefuncs+j]
-   CUDA.@atomic Kgpu[ig, jg] += keij
+   ig = dofs[(bx-1)*n_basefuncs+i]
+   jg = dofs[(bx-1)*n_basefuncs+j]
+
+   sync_threads()
+
+   ## Sparse Addition ##
+   # col_start = Kgpu.colptr[jg]
+   # col_end = Kgpu.colptr[jg + 1] - 1
+
+   # for k in col_start:col_end
+   #     if Kgpu.rowval[k] == ig
+   #         # Update the existing element
+   #         CUDA.@atomic Kgpu.nzval[Int32(k)] += Ke[i,j]
+   #         return
+   #     end
+   # end
+   # Only one thread per (i,j) writes the accumulated entry to the global matrix
+   q_point == 1 || return nothing
+   CUDA.@atomic Kgpu[ig, jg] += Ke[i,j]

    return nothing
end





function assemble_global_gpu(cellvalues,dh)
-   Kgpu = CUDA.zeros(dh.ndofs.x,dh.ndofs.x)
    n_base_funcs = getnbasefunctions(cellvalues)
+   K = create_sparsity_pattern(dh)
+   Kgpu = GPUSparseMatrixCSC(Int32(K.m), Int32(K.n), cu(Int32.(K.colptr)), cu(Int32.(K.rowval)), cu(Float32.(K.nzval)))
    # each block represents a cell, and every (i,j) in the 2D threads represents an entry of the local stiffness matrix.
-   @cuda blocks=length(dh.grid.cells) threads = (n_base_funcs,n_base_funcs) assemble_element_gpu!(Kgpu,cellvalues,dh)
+   #n_blocks = cld(length(dh.grid.cells), 16) # 16 threads in z direction
+   @cuda blocks=length(dh.grid.cells) threads = (n_base_funcs,n_base_funcs,length(cellvalues.qr.weights)) assemble_element_gpu!(Kgpu,cellvalues,dh)
    return Kgpu
end
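
A quick launch-configuration sanity check (a sketch, not part of the PR): with bilinear Lagrange on quadrilaterals there are 4 basis functions, and the 2x2 rule gives 4 quadrature points, so each block launches 4 * 4 * 4 = 64 threads, well below CUDA's limit of 1024 threads per block. Higher-order interpolations can exceed that limit, which a check like the following would catch:

n_base_funcs = getnbasefunctions(cellvalues)   # 4 for bilinear quadrilaterals
n_qpoints = length(cellvalues.qr.weights)      # 4 for the 2x2 rule
@assert n_base_funcs^2 * n_qpoints <= 1024 "threads per block exceed the CUDA limit"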

@@ -149,20 +233,23 @@ end
stassy(cv,dh) = assemble_global!(cv,dh,Val(false))


-qpassy(cv,dh) = assemble_global!(cv,dh,Val(true))
+# qpassy(cv,dh) = assemble_global!(cv,dh,Val(true))

using BenchmarkTools
-using LinearAlgebra
+# using LinearAlgebra

-Kgpu = @btime assemble_global_gpu($cellvalues,$dh);
+Kgpu = @btime CUDA.@sync assemble_global_gpu($cellvalues,$dh)
+#Kgpu = assemble_global_gpu(cellvalues,dh)

+# sqrt(sum(abs2, Kgpu.nzval))
norm(Kgpu)

-Kstd , Fstd = @btime stassy($cellvalues,$dh);
+#std , Fstd = @benchmark stassy(cellvalues,dh)
+Kstd , Fstd = stassy(cellvalues,dh);
+# Kstd[2,6]
norm(Kstd)
+Kstd[1:5,1:5]


Review comment (Member):
I think we are missing the analogue benchmark using QuadraturePointIterator
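
For reference, a minimal sketch of what that analogue benchmark could look like, assuming the commented-out qpassy path above is restored (it reuses assemble_global! with Val(true), i.e. the QuadratureValuesIterator path; names taken from the tutorial):

using BenchmarkTools
qpassy(cv, dh) = assemble_global!(cv, dh, Val(true))  # quadrature-point-iterator path
Kqp, Fqp = @btime qpassy($cellvalues, $dh);
norm(Kqp)  # compare against norm(Kstd) and the GPU result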

1 change: 1 addition & 0 deletions src/FEValues/StaticCellValues.jl
@@ -115,6 +115,7 @@ end
end



function _quadrature_point_values(fe_v::StaticCellValues, q_point::Int, cell_coords::AbstractVector, neg_detJ_err_fun::Function)
    # q_point bounds checked, ok to use @inbounds
    @inbounds begin
1 change: 1 addition & 0 deletions src/Ferrite.jl
@@ -146,6 +146,7 @@ include("deprecations.jl")
include("docs.jl")

# GPU support
include("GPU/sparsematrix.jl")
include("GPU/adapt.jl")
include("Grid/gpu_grid.jl")
include("Dofs/GPUDofHandler.jl")
26 changes: 24 additions & 2 deletions src/GPU/adapt.jl
@@ -29,8 +29,30 @@ function Adapt.adapt_structure(to, grid::Grid)
end

function Adapt.adapt_structure(to, dh::DofHandler)
-   cell_dofs = Adapt.adapt_structure(to, cu(dh.cell_dofs))
+   cell_dofs = Adapt.adapt_structure(to, Int32.(dh.cell_dofs) |> cu)
    cells = Adapt.adapt_structure(to, cu(dh.grid.cells))
    nodes = Adapt.adapt_structure(to, cu(dh.grid.nodes))
    GPUDofHandler(cell_dofs, GPUGrid(cells,nodes))
end


+function Adapt.adapt_structure(to, K::SparseMatrixCSC)
+    m = Adapt.adapt_structure(to, Int32(K.m))
+    n = Adapt.adapt_structure(to, Int32(K.n))
+    colptr = Adapt.adapt_structure(to, Int32.(K.colptr) |> cu)
+    rowval = Adapt.adapt_structure(to, Int32.(K.rowval) |> cu)
+    nzval = Adapt.adapt_structure(to, Float32.(K.nzval) |> cu)
+    GPUSparseMatrixCSC(m, n, colptr, rowval, nzval)
+end
+
+function Adapt.adapt_structure(to, K::GPUSparseMatrixCSC)
+    m = Adapt.adapt_structure(to, K.m)
+    n = Adapt.adapt_structure(to, K.n)
+    colptr = Adapt.adapt_structure(to, K.colptr)
+    rowval = Adapt.adapt_structure(to, K.rowval)
+    nzval = Adapt.adapt_structure(to, K.nzval)
+    GPUSparseMatrixCSC(m, n, colptr, rowval, nzval)
+end
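
As a small sketch of how these methods are exercised (assuming the tutorial's dh is in scope): the to argument is unused by the methods above, so they can be called directly with nothing; at kernel launch, CUDA.jl applies them automatically when it converts arguments via cudaconvert.

using Adapt, SparseArrays
Kcpu = create_sparsity_pattern(dh)           # CPU sparsity pattern (SparseMatrixCSC)
Kdev = Adapt.adapt_structure(nothing, Kcpu)  # GPUSparseMatrixCSC backed by Int32/Float32 CuArrays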



88 changes: 88 additions & 0 deletions src/GPU/sparsematrix.jl
@@ -0,0 +1,88 @@
using SparseArrays
using CUDA

struct GPUSparseMatrixCSC{Tv, VEC <: AbstractVector{Int32}, NZVEC <: AbstractVector{Tv}}
    m::Int32      # Number of rows
    n::Int32      # Number of columns
    colptr::VEC   # Column i is in colptr[i]:(colptr[i+1]-1)
    rowval::VEC   # Row indices of stored values
    nzval::NZVEC  # Stored values, typically nonzeros
end

function GPUSparseMatrixCSC{Tv}(m::Int32, n::Int32, colptr::AbstractVector{Int32},
        rowval::AbstractVector{Int32}, nzval::AbstractVector{Tv}) where {Tv}
    return GPUSparseMatrixCSC(m, n, colptr, rowval, nzval)
end

function GPUSparseMatrixCSC{Tv}(A::SparseMatrixCSC{Tv}) where {Tv}
    # Convert index data to Int32 to match the field types
    return GPUSparseMatrixCSC(Int32(A.m), Int32(A.n), Int32.(A.colptr), Int32.(A.rowval), A.nzval)
end


function Base.getindex(A::GPUSparseMatrixCSC{Tv}, i::Int32, j::Int32) where {Tv}
    # TODO: Add bounds checking
    col_start = A.colptr[j]
    col_end = A.colptr[j + 1] - 1

    # Linear search over the stored entries of column j
    for k in col_start:col_end
        if A.rowval[k] == i
            return A.nzval[k]
        end
    end

    return zero(Tv)  # (i,j) is not stored
end

function Base.setindex!(A::GPUSparseMatrixCSC{Tv}, v::Tv, i::Int32, j::Int32) where {Tv}
    col_start = A.colptr[j]
    col_end = A.colptr[j + 1] - 1

    for k in col_start:col_end
        if A.rowval[k] == i
            # Update the existing element
            A.nzval[k] = v
            return v
        end
    end
    # Note: writes outside the sparsity pattern are silently dropped
    return v
end


function custom_atomic_add!(A::GPUSparseMatrixCSC{Tv}, v::Tv, i::Int32, j::Int32) where {Tv}
    col_start = A.colptr[j]
    col_end = A.colptr[j + 1] - 1

    for k in col_start:col_end
        if A.rowval[k] == i
            # Atomically update the existing element (device-side only)
            CUDA.@atomic A.nzval[k] += v
            return
        end
    end
    return
end
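
A hypothetical device-side usage sketch (not part of the PR): scattering precomputed COO-style triplets into the sparse matrix, one entry per thread, with vals a Float32 device vector and is, js Int32 device vectors.

function scatter_add_kernel!(A::GPUSparseMatrixCSC, vals, is, js)
    t = threadIdx().x + (blockIdx().x - Int32(1)) * blockDim().x
    t <= length(vals) || return nothing
    # Each thread adds vals[t] at (is[t], js[t]); the atomic update inside
    # custom_atomic_add! makes concurrent writes to the same entry safe.
    custom_atomic_add!(A, vals[t], is[t], js[t])
    return nothing
end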

function gpu_sparse_norm(A::GPUSparseMatrixCSC{T}, p::Real = 2) where {T}
    if p == 2 # Frobenius norm
        return sqrt(sum(abs2, A.nzval))
    elseif p == 1 # induced 1-norm: maximum absolute column sum
        col_sums = zeros(T, A.n)
        for j in 1:A.n
            for k in A.colptr[j]:(A.colptr[j + 1] - 1)
                col_sums[j] += abs(A.nzval[k])
            end
        end
        return maximum(col_sums)
    elseif p == Inf # induced ∞-norm: maximum absolute row sum
        row_sums = zeros(T, A.m)
        for j in 1:A.n
            for k in A.colptr[j]:(A.colptr[j + 1] - 1)
                i = A.rowval[k]
                row_sums[i] += abs(A.nzval[k])
            end
        end
        return maximum(row_sums)
    else
        return -1.0f0 # unsupported p
    end
end
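
A small CPU-backed check of the Frobenius branch (a sketch: the struct fields accept any AbstractVector, so plain Vectors work here, while the p = 1 and p = Inf branches rely on scalar indexing and are CPU-only as written):

using SparseArrays
K = sprand(Float32, 10, 10, 0.3)
Kg = GPUSparseMatrixCSC(Int32(K.m), Int32(K.n), Int32.(K.colptr), Int32.(K.rowval), K.nzval)
gpu_sparse_norm(Kg) ≈ sqrt(sum(abs2, K.nzval))  # true: Frobenius norm over stored values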
