Ferrite-FEM · Abdelrahman912 · Jan 11, 2024 · Jan 11, 2024 · Feb 24, 2024 · Feb 24, 2024
diff --git a/docs/src/literate-tutorials/gpu_qp_heat_equation.jl b/docs/src/literate-tutorials/gpu_qp_heat_equation.jl
@@ -1,15 +1,18 @@
 using Ferrite, CUDA
 
-left = Tensor{1,2,Float64}((0,-0)) # define the left bottom corner of the grid.
-right = Tensor{1,2,Float64}((400.0,400.0)) # define the right top corner of the grid.
+
+
+left = Tensor{1,2,Float32}((0,-0)) # define the left bottom corner of the grid.
+right = Tensor{1,2,Float32}((100.0,100.0)) # define the right top corner of the grid.
+
 grid = generate_grid(Quadrilateral, (100, 100),left,right); 
 
 
 ip = Lagrange{RefQuadrilateral, 1}() # define the interpolation function (i.e. Bilinear lagrange)
 
 # define the numerical integration rule 
 # (i.e. integrating over quad shape with two quadrature points per direction)
-qr = QuadratureRule{RefQuadrilateral}(2) 
+qr = QuadratureRule{RefQuadrilateral,Float32}(2) 
 cellvalues = CellValues(qr, ip);
 
 
@@ -20,6 +23,7 @@ close!(dh);
 
 
 
+
 # Standard assembly of the element.
 function assemble_element_std!(Ke::Matrix, fe::Vector, cellvalues::CellValues)
     n_basefuncs = getnbasefunctions(cellvalues)
@@ -46,29 +50,28 @@ function assemble_element_std!(Ke::Matrix, fe::Vector, cellvalues::CellValues)
 end
 
 # Element assembly by using static cell (PR #883)
-function assemble_element_qpiter!(Ke::Matrix, fe::Vector, cellvalues,cell_coords::AbstractVector)
-    n_basefuncs = getnbasefunctions(cellvalues)
-    ## Loop over quadrature points
-    for qv in Ferrite.QuadratureValuesIterator(cellvalues,cell_coords)
-        ## Get the quadrature weight
-         dΩ = getdetJdV(qv)
-        ## Loop over test shape functions
-        for i in 1:n_basefuncs
-            δu  = shape_value(qv, i)
-            ∇δu = shape_gradient(qv, i)
-            ## Add contribution to fe
-            fe[i] += δu * dΩ
-            ## Loop over trial shape functions
-            for j in 1:n_basefuncs
-                ∇u = shape_gradient(qv, j)
-                ## Add contribution to Ke
-                Ke[i, j] += (∇δu ⋅ ∇u) * dΩ
-            end
-        end
-    end
-    return Ke, fe
-end
-K = create_sparsity_pattern(dh)
+# function assemble_element_qpiter!(Ke::Matrix, fe::Vector, cellvalues,cell_coords::AbstractVector)
+#     n_basefuncs = getnbasefunctions(cellvalues)
+#     ## Loop over quadrature points
+#     for qv in Ferrite.QuadratureValuesIterator(cellvalues,cell_coords)
+#         ## Get the quadrature weight
+#          dΩ = getdetJdV(qv)
+#         ## Loop over test shape functions
+#         for i in 1:n_basefuncs
+#             δu  = shape_value(qv, i)
+#             ∇δu = shape_gradient(qv, i)
+#             ## Add contribution to fe
+#             fe[i] += δu * dΩ
+#             ## Loop over trial shape functions
+#             for j in 1:n_basefuncs
+#                 ∇u = shape_gradient(qv, j)
+#                 ## Add contribution to Ke
+#                 Ke[i, j] += (∇δu ⋅ ∇u) * dΩ
+#             end
+#         end
+#     end
+#     return Ke, fe
+# end
 
 function create_buffers(cellvalues, dh)
     f = zeros(ndofs(dh))
@@ -105,41 +108,122 @@ function assemble_global!(cellvalues, dh::DofHandler,qp_iter::Val{QPiter}) where
 end
 
 
+
+### Old impelementation that makes each threads over the quadrature points.
+# function assemble_element_gpu!(Kgpu,cv,dh) 
+#     i = threadIdx().x 
+#     j = threadIdx().y 
+#     bx = threadIdx().z + (blockIdx().x - Int32(1)) * blockDim().z # element number
+#     bx <= length(dh.grid.cells) || return nothing
+#     #bx = blockIdx().x
+#     cell_coords = getcoordinates(dh.grid)
+#     n_basefuncs = Int32(getnbasefunctions(cv))
+#     #Ke = CuStaticSharedArray(Float32, (n_basefuncs, n_basefuncs)) # We don't need shared memory
+#     keij = 0.0f0
+#     for qv in Ferrite.QuadratureValuesIterator(cv,cell_coords) 
+#         # Get the quadrature weight
+#         dΩ = getdetJdV(qv)
+#         ## Get test function gradient
+#         ∇δu = shape_gradient(qv, i)
+#         ## Get shape function gradient
+#         ∇u = shape_gradient(qv, j)
+
+#         keij += (∇δu ⋅ ∇u) * dΩ 
+#     end 
+#     #Ke[i,j] = keij # We don't need shared memory
+
+#     # TODO: Assemble local matrix here in Kgpu
+#     # TODO: Add abstraction, Addittionally use assembler to assemble the local matrix into global matrix.
+#     dofs = dh.cell_dofs
+#     @inbounds ig = Int32(dofs[Int32((bx-Int32(1))*n_basefuncs+i)])
+#     @inbounds jg = Int32(dofs[Int32((bx-Int32(1))*n_basefuncs+j)] )
+
+#     ## Sparse Addition ##
+#     col_start = Kgpu.colptr[jg]
+#     col_end = Kgpu.colptr[jg + 1] - 1
+
+#     for k in col_start:col_end
+#         if Kgpu.rowval[k] == ig
+#             # Update the existing element
+#             CUDA.@atomic Kgpu.nzval[k] += keij
+#             return
+#         end
+#     end
+#     ##custom_atomic_add!(Kgpu, keij, ig, jg)
+#     #Kgpu[ig, jg] += Float32(keij)
+
+#     return nothing
+# end
+
+
 function assemble_element_gpu!(Kgpu,cv,dh) 
     i = threadIdx().x 
     j = threadIdx().y 
-    bx = blockIdx().x
+    q_point = Int64(threadIdx().z) # quadrature point
+
+    bx = blockIdx().x # element number
+
     cell_coords = getcoordinates(dh.grid)
     n_basefuncs = getnbasefunctions(cv)
-    #Ke = CuStaticSharedArray(Float32, (n_basefuncs, n_basefuncs)) # We don't need shared memory
-    keij = 0.0
-    for qv in Ferrite.QuadratureValuesIterator(cv,cell_coords) 
-        # Get the quadrature weight
-        dΩ = getdetJdV(qv)
-        ## Get test function gradient
-        ∇δu = shape_gradient(qv, i)
-        ## Get shape function gradient
-        ∇u = shape_gradient(qv, j)
-
-        keij += (∇δu ⋅ ∇u) * dΩ 
-    end 
-    #Ke[i,j] = keij # We don't need shared memory
 
-    # TODO: Assemble local matrix here in Kgpu
-    # TODO: Add abstraction, Addittionally use assembler to assemble the local matrix into global matrix.
+    Ke = CuStaticSharedArray(Float32, (n_basefuncs, n_basefuncs))
+    Ke[i,j] = 0.0f0
+
+    sync_threads()
+
+    qv = Ferrite.quadrature_point_values(cv, q_point, cell_coords)
+    # Get the quadrature weight
+    dΩ = getdetJdV(qv)
+    ## Get test function gradient
+    ∇δu = shape_gradient(qv, i)
+    ## Get shape function gradient
+    ∇u = shape_gradient(qv, j)
+
+    sync_threads()
+
+
+    CUDA.@atomic Ke[i,j] += (∇δu ⋅ ∇u) * dΩ 
+
+    #Ke[i,j] = keij # We don't need shared memory
+
+
     dofs = dh.cell_dofs
-    @inbounds ig = dofs[(bx-1)*n_basefuncs+i]
-    @inbounds jg = dofs[(bx-1)*n_basefuncs+j] 
-    CUDA.@atomic Kgpu[ig, jg] += keij
+    ig = dofs[(bx-1)*n_basefuncs+i]
+    jg = dofs[(bx-1)*n_basefuncs+j] 
+
+
+    sync_threads()
+
+    ## Sparse Addition ##
+    # col_start = Kgpu.colptr[jg]
+    # col_end = Kgpu.colptr[jg + 1] - 1
+
+    # for k in col_start:col_end
+    #     if Kgpu.rowval[k] == ig
+    #         # Update the existing element
+    #         CUDA.@atomic Kgpu.nzval[Int32(k)] += Ke[i,j]
+    #         return
+    #     end
+    # end
+    q_point == 1 || return nothing
+    CUDA.@atomic Kgpu[ig, jg] += Ke[i,j]
+
+
     return nothing
 end
 
 
+
+
+
 function assemble_global_gpu(cellvalues,dh)
     Kgpu =   CUDA.zeros(dh.ndofs.x,dh.ndofs.x)
     n_base_funcs = getnbasefunctions(cellvalues) 
+    K = create_sparsity_pattern(dh)
+    Kgpu = GPUSparseMatrixCSC( Int32(K.m), Int32(K.n), cu(Int32.(K.colptr)), cu(Int32.(K.rowval)), cu(Float32.(K.nzval)))
     # each block represents a cell, and every (i,j) in the 2D threads represents an element in the local stiffness matrix. 
-    @cuda blocks=length(dh.grid.cells) threads = (n_base_funcs,n_base_funcs) assemble_element_gpu!(Kgpu,cellvalues,dh)
+    #n_blocks = cld(length(dh.grid.cells), 16) # 16 threads in z direction
+    @cuda blocks=length(dh.grid.cells) threads = (n_base_funcs,n_base_funcs,length(cellvalues.qr.weights)) assemble_element_gpu!(Kgpu,cellvalues,dh)
     return Kgpu
 end
 
@@ -149,20 +233,23 @@ end
 stassy(cv,dh) = assemble_global!(cv,dh,Val(false))
 
 
-qpassy(cv,dh) = assemble_global!(cv,dh,Val(true))
+# qpassy(cv,dh) = assemble_global!(cv,dh,Val(true))
 
 
 using BenchmarkTools
-using LinearAlgebra
+# using LinearAlgebra
 
 
-Kgpu =@btime assemble_global_gpu($cellvalues,$dh);
-
+Kgpu = @btime CUDA.@sync   assemble_global_gpu($cellvalues,$dh)
+#Kgpu =    assemble_global_gpu(cellvalues,dh)
 
+# sqrt(sum(abs2, Kgpu.nzval))
 norm(Kgpu)
 
-Kstd , Fstd = @btime stassy($cellvalues,$dh);
-
+#std , Fstd = @benchmark stassy(cellvalues,dh)
+Kstd , Fstd = stassy(cellvalues,dh);
+# Kstd[2,6]
 norm(Kstd)
+Kstd[1:5,1:5] 
 
 
diff --git a/src/FEValues/StaticCellValues.jl b/src/FEValues/StaticCellValues.jl
@@ -115,6 +115,7 @@ end
 end
 
 
+
 function _quadrature_point_values(fe_v::StaticCellValues, q_point::Int, cell_coords::AbstractVector,neg_detJ_err_fun::Function)
     #q_point bounds checked, ok to use @inbounds
     @inbounds begin

diff --git a/src/Ferrite.jl b/src/Ferrite.jl
@@ -146,6 +146,7 @@ include("deprecations.jl")
 include("docs.jl")
 
 # GPU support
+include("GPU/sparsematrix.jl")
 include("GPU/adapt.jl")
 include("Grid/gpu_grid.jl")
 include("Dofs/GPUDofHandler.jl")

diff --git a/src/GPU/adapt.jl b/src/GPU/adapt.jl
@@ -29,8 +29,30 @@ function Adapt.adapt_structure(to, grid::Grid)
 end
 
 function Adapt.adapt_structure(to, dh::DofHandler)
-    cell_dofs = Adapt.adapt_structure(to, cu(dh.cell_dofs))
+    cell_dofs = Adapt.adapt_structure(to, Int32.(dh.cell_dofs) |> cu)
     cells = Adapt.adapt_structure(to, cu(dh.grid.cells))
     nodes = Adapt.adapt_structure(to, cu(dh.grid.nodes))
     GPUDofHandler(cell_dofs, GPUGrid(cells,nodes))
-end
+end
+
+
+function Adapt.adapt_structure(to, K::SparseMatrixCSC)
+    m = Adapt.adapt_structure(to, Int32(K.m))
+    n = Adapt.adapt_structure(to, Int32(K.n))
+    colptr = Adapt.adapt_structure(to, Int32.(K.colptr)|>cu)
+    rowval = Adapt.adapt_structure(to, Int32.(K.rowval)|>cu)
+    nzval = Adapt.adapt_structure(to, Float32.(K.nzval) |> cu)
+    GPUSparseMatrixCSC(m, n, colptr, rowval, nzval)
+end
+
+function Adapt.adapt_structure(to, K::GPUSparseMatrixCSC)
+    m = Adapt.adapt_structure(to, K.m)
+    n = Adapt.adapt_structure(to, K.n)
+    colptr = Adapt.adapt_structure(to, K.colptr)
+    rowval = Adapt.adapt_structure(to, K.rowval)
+    nzval = Adapt.adapt_structure(to, K.nzval)
+    GPUSparseMatrixCSC(m, n, colptr, rowval, nzval)
+end
+
+
+
diff --git a/src/GPU/sparsematrix.jl b/src/GPU/sparsematrix.jl
@@ -0,0 +1,88 @@
+using SparseArrays
+using CUDA
+struct GPUSparseMatrixCSC{Tv,VEC <: AbstractVector{Int32} , NZVEC <: AbstractVector{Tv}} 
+    m::Int32                  # Number of rows
+    n::Int32                  # Number of columns
+    colptr::VEC     # Column i is in colptr[i]:(colptr[i+1]-1)
+    rowval::VEC     # Row indices of stored values
+    nzval::NZVEC       # Stored values, typically nonzeros
+end
+
+function GPUSparseMatrixCSC{Tv}(m::Int32, n::Int32, colptr::AbstractVector{Int32},
+                        rowval:: AbstractVector{Int32}, nzval::AbstractVector{Tv}) where {Tv}
+    new(m, n, colptr, rowval, nzval)
+end
+
+function GPUSparseMatrixCSC{Tv}(A::SparseMatrixCSC{Tv}) where {Tv}
+    GPUSparseMatrixCSC(A.m, A.n, A.colptr, A.rowval, A.nzval)
+end
+
+
+function Base.getindex(A::GPUSparseMatrixCSC{Tv}, i::Int32, j::Int32) where Tv 
+    # TODO: Add bounds checking
+
+    col_start = A.colptr[j] 
+    col_end = A.colptr[j + 1] - 1
+
+    for k in col_start:col_end
+        if A.rowval[k] == i
+            return A.nzval[k]
+        end
+    end
+
+    return zero(Tv)
+end
+
+function Base.setindex!(A::GPUSparseMatrixCSC{T}, v::Float32, i::Int32, j::Int32) where T
+    col_start = A.colptr[j]
+    col_end = A.colptr[j + 1] - 1
+
+    for k in col_start:col_end
+        if A.rowval[k] == i
+            # Update the existing element
+                A.nzval[k] = v
+            return
+        end
+    end
+end
+
+
+function custom_atomic_add!(A::GPUSparseMatrixCSC{T}, v::Float32, i::Int32, j::Int32) where T
+    col_start = A.colptr[j]
+    col_end = A.colptr[j + 1] - 1
+
+    for k in col_start:col_end
+        if A.rowval[k] == i
+            # Update the existing element
+            CUDA.@atomic A.nzval[k] += v
+            return
+        end
+    end
+
+end
+
+function gpu_sparse_norm(A::GPUSparseMatrixCSC{T}, p::Real=2) where T
+    if p == 2  # Frobenius norm
+        return sqrt(sum(abs2, A.nzval))
+    elseif p == 1  # L1 norm
+        col_sums = zeros(T, A.n)
+        for j in 1:A.n
+            for k in A.colptr[j]:(A.colptr[j + 1] - 1)
+                col_sums[j] += abs(A.nzval[k])
+            end
+        end
+        return maximum(col_sums)
+    elseif p == Inf  # L∞ norm
+        row_sums = zeros(T, A.m)
+        for j in 1:A.n
+            for k in A.colptr[j]:(A.colptr[j + 1] - 1)
+                i = A.rowval[k]
+                row_sums[i] += abs(A.nzval[k])
+            end
+        end
+        return maximum(row_sums)
+    else
+        return -1.0f0
+    end
+end
+