From 2a914ce75ec9326f74bc9666fbc504b5aa1cfff0 Mon Sep 17 00:00:00 2001
From: Francesc Verdugo <f.verdugo.rojano@vu.nl>
Date: Mon, 5 Feb 2024 14:29:42 +0100
Subject: [PATCH 1/9] Starting to add example003

---
 extensions/GalerkinToolkitExamples/src/example003.jl        | 0
 extensions/GalerkinToolkitExamples/test/example003_tests.jl | 0
 2 files changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 extensions/GalerkinToolkitExamples/src/example003.jl
 create mode 100644 extensions/GalerkinToolkitExamples/test/example003_tests.jl

diff --git a/extensions/GalerkinToolkitExamples/src/example003.jl b/extensions/GalerkinToolkitExamples/src/example003.jl
new file mode 100644
index 00000000..e69de29b
diff --git a/extensions/GalerkinToolkitExamples/test/example003_tests.jl b/extensions/GalerkinToolkitExamples/test/example003_tests.jl
new file mode 100644
index 00000000..e69de29b

From 55656fc25503d4f179b76312cdf505ebfa16cfe7 Mon Sep 17 00:00:00 2001
From: Francesc Verdugo <f.verdugo.rojano@vu.nl>
Date: Mon, 5 Feb 2024 16:19:33 +0100
Subject: [PATCH 2/9] Drafting example003

---
 .../GalerkinToolkitExamples/src/example003.jl | 623 ++++++++++++++++++
 1 file changed, 623 insertions(+)

diff --git a/extensions/GalerkinToolkitExamples/src/example003.jl b/extensions/GalerkinToolkitExamples/src/example003.jl
index e69de29b..8d635446 100644
--- a/extensions/GalerkinToolkitExamples/src/example003.jl
+++ b/extensions/GalerkinToolkitExamples/src/example003.jl
@@ -0,0 +1,623 @@
+module Example003
+
+import GalerkinToolkit as gk
+import ForwardDiff
+using StaticArrays
+using LinearAlgebra
+using SparseArrays
+using WriteVTK
+
+# This one implements a vanilla sequential iso-parametric Poisson solver
+# with FE assembly on GPUs
+
+function main(params_in)
+
+    # Process params
+    params_default = default_params()
+    params = add_default_params(params_in,params_default)
+    results = Dict{Symbol,Any}()
+
+    # Setup main data structures
+    state = setup(params)
+    add_basic_info(results,params,state)
+
+    # Assemble system and solve it
+    A,b = assemble_system(state)
+    x = solve_system(A,b,params)
+
+    # Post process
+    uh = setup_uh(x,state)
+    integrate_error_norms(results,uh,state)
+    export_results(uh,params,state)
+
+    results
+end
+
+function add_default_params(params_in,params_default)
+    UmD = setdiff(keys(params_in),keys(params_default))
+    for k in UmD
+        @warn "Parameter with key :$k is unused"
+    end
+    UiD = intersect(keys(params_default),keys(params_in))
+    DmU = setdiff(keys(params_default),keys(params_in))
+    a = [ k=>params_in[k] for k in UiD]
+    b = [ k=>params_default[k] for k in DmU]
+    Dict(vcat(a,b))
+end
+
+function default_params()
+    outdir = mkpath(joinpath(@__DIR__,"..","output"))
+    params = Dict{Symbol,Any}()
+    params[:mesh] = gk.cartesian_mesh((0,1,0,1),(10,10))
+    params[:u] = (x) -> sum(x)
+    params[:f] = (x) -> 0.0
+    params[:g] = (x) -> 0.0
+    params[:dirichlet_tags] = ["boundary"]
+    params[:neumann_tags] = String[]
+    params[:integration_degree] = 2
+    params[:solver] = lu_solver()
+    params[:example_path] = joinpath(outdir,"example003")
+    params
+end
+
+function lu_solver()
+    setup(x,A,b) = lu(A)
+    setup! = lu!
+    solve! = ldiv!
+    finalize!(S) = nothing
+    (;setup,setup!,solve!,finalize!)
+end
+
+function setup_dirichlet_bcs(params)
+    mesh = params[:mesh]
+    u = params[:u]
+    dirichlet_tags = params[:dirichlet_tags]
+    node_to_tag = zeros(gk.num_nodes(mesh))
+    tag_to_name = dirichlet_tags
+    gk.classify_mesh_nodes!(node_to_tag,mesh,tag_to_name)
+    free_and_dirichlet_nodes = gk.partition_from_mask(i->i==0,node_to_tag)
+    node_to_x = gk.node_coordinates(mesh)
+    dirichlet_nodes = last(free_and_dirichlet_nodes)
+    x_dirichlet = view(node_to_x,dirichlet_nodes)
+    u_dirichlet = u.(x_dirichlet)
+    dirichlet_bcs = (;free_and_dirichlet_nodes,u_dirichlet,node_to_tag)
+end
+
+function setup_integration(params,objects)
+    mesh = params[:mesh]
+    degree = params[:integration_degree]
+    D = gk.num_dims(mesh)
+    if objects === :cells
+        d = D
+    elseif objects === :faces
+        d = D-1
+    else
+        error("")
+    end
+    # Integration
+    ref_cells = gk.reference_faces(mesh,d)
+    face_to_rid = gk.face_reference_id(mesh,d)
+    integration_rules = map(ref_cells) do ref_cell
+        gk.default_quadrature(gk.geometry(ref_cell),degree)
+    end
+    rid_to_weights = map(gk.weights,integration_rules)
+    rid_to_coords = map(gk.coordinates,integration_rules)
+    integration = (;rid_to_weights,rid_to_coords,face_to_rid,d)
+end
+
+function setup_isomap(params,integration)
+    rid_to_coords = integration.rid_to_coords
+    face_to_rid = integration.face_to_rid
+    d = integration.d
+    mesh = params[:mesh]
+    ref_cells = gk.reference_faces(mesh,d)
+    shape_funs = map(rid_to_coords,ref_cells) do q,ref_cell
+        shape_vals = gk.tabulator(ref_cell)(gk.value,q)
+        shape_grads = gk.tabulator(ref_cell)(ForwardDiff.gradient,q)
+        shape_vals, shape_grads
+    end
+    rid_to_shape_vals = map(first,shape_funs)
+    rid_to_shape_grads = map(last,shape_funs)
+    face_to_nodes = gk.face_nodes(mesh,d)
+    node_to_coords = gk.node_coordinates(mesh)
+    isomap = (;face_to_nodes,node_to_coords,face_to_rid,rid_to_shape_vals,rid_to_shape_grads,d)
+end
+
+function setup_neumann_bcs(params)
+    mesh = params[:mesh]
+    neumann_tags = params[:neumann_tags]
+    neum_face_to_face = Int[]
+    if length(neumann_tags) != 0
+        D = gk.num_dims(mesh)
+        tag_to_groups = gk.physical_faces(mesh,D-1)
+        neum_face_to_face = reduce(union,map(tag->tag_to_groups[tag],neumann_tags))
+    end
+    (;neum_face_to_face)
+end
+
+function setup_user_funs(params)
+    u = params[:u]
+    f = params[:f]
+    g = params[:g]
+    user_funs = (;u,f,g)
+end
+
+function setup(params)
+    dirichlet_bcs = setup_dirichlet_bcs(params)
+    neumann_bcs = setup_neumann_bcs(params)
+    cell_integration = setup_integration(params,:cells)
+    face_integration = setup_integration(params,:faces)
+    cell_isomap = setup_isomap(params,cell_integration)
+    face_isomap = setup_isomap(params,face_integration)
+    user_funs = setup_user_funs(params)
+    solver = params[:solver]
+    state = (;solver,dirichlet_bcs,neumann_bcs,cell_integration,cell_isomap,face_integration,face_isomap,user_funs)
+end
+
+function add_basic_info(results,params,state)
+    node_to_x = state.cell_isomap.node_to_coords
+    free_and_dirichlet_nodes = state.dirichlet_bcs.free_and_dirichlet_nodes
+    cell_to_rid = state.cell_isomap.face_to_rid
+    nfree = length(first(free_and_dirichlet_nodes))
+    nnodes = length(node_to_x)
+    ncells = length(cell_to_rid)
+    results[:nnodes] = nnodes
+    results[:nfree] = nfree
+    results[:ncells] = ncells
+    results
+end
+
+function integrate_matrix(state)
+
+    # We assume a single cell type
+    @assert length(state.cell_isomap.rid_to_shape_grads) == 1
+
+    cell_to_nodes = state.cell_isomap.face_to_nodes
+    node_to_x = state.cell_isomap.node_to_coords
+    ∇se = state.cell_isomap.rid_to_shape_grads[1]
+    se = state.cell_isomap.rid_to_shape_vals[1]
+    we = state.cell_integration.rid_to_weights[1]
+    d = state.cell_integration.d
+    free_and_dirichlet_nodes = state.dirichlet_bcs.free_and_dirichlet_nodes
+    node_to_free_node = gk.permutation(free_and_dirichlet_nodes)
+    nfree = length(first(free_and_dirichlet_nodes))
+
+    cell
+
+    n_local_nodes = size(∇se,2)
+    n_int_points = size(∇se,1)
+    n_cells = length(cell_to_nodes)
+    n_coo = n_cells*n_local_nodes
+
+    Tx = SVector{d,Float64}
+    TJ = typeof(zero(Tx)*zero(Tx)')
+
+    # Assemble
+    # This one should be good enough on the CPU
+    Ie = zeros(Float64,n_local_nodes*n_cells)
+    Je = zeros(Float64,n_local_nodes*n_cells)
+    for cell in 1:n_cells
+        nodes = cell_to_nodes[cell]
+        for i in 1:n_local_nodes
+            for j in 1:n_local_nodes
+                p_ij = (cell-1)*n_local_nodes*n_local_nodes + (j-1)*n_local_nodes + i
+                Ie[p_ij] = 
+                Je[p_ij] = nodes[J]
+            end
+        end
+    end
+
+    # Convert data to a format more suitable for gpus
+    # Perhaps more date is needed, and with appropriate reordering
+    nodese = JaggedArray(cell_to_nodes).data
+    cell_to_is_dirichlet = fill(false,n_cells)
+    for cell in 1:n_cells
+        nodes = cell_to_nodes[cell]
+        if any(node-> node_to_free_node[node]>nfree,nodes)
+            cell_to_is_dirichlet[cell] = true
+        end
+    end
+
+    # TODO move to the GPU
+    # The code below runs on the CPU
+    # but it is implemented in a way close of what
+    # one will implement on the GPU
+    Ae = zeros(Float64,n_local_nodes*n_local_nodes*n_cells)
+    fe = zeros(Float64,n_local_nodes*n_cells)
+
+    # Auxiliary memory
+    # can we get rid of this?
+    ∇xe = zeros(Tx,n_local_nodes*n_cells)
+    for cell in 1:n_cells
+        for iq in 1:n_int_points
+            Jt = zero(TJ) 
+            xint = zero(Tx)
+            for k in 1:n_local_nodes
+                p_k = (cell-1)*n_local_nodes + k
+                node = nodese[p_k]
+                x = node_to_x[nodes[k]]
+                ∇sqx = ∇se[iq,k]
+                sqx = se[iq,k]
+                Jt += ∇sqx*x'
+                xint += sqx*x
+            end
+            detJt = det(Jt)
+            invJt = inv(Jt)
+            dV = abs(detJt)*we[iq]
+            for k in 1:n_local_nodes
+                p_k = (cell-1)*n_local_nodes + k
+                ∇xe[p_k] = invJt*∇se[iq,k]
+            end
+            # Matrix
+            for j in 1:n_local_nodes
+                p_j = (cell-1)*n_local_nodes + j
+                for i in 1:n_local_nodes
+                    p_ij = (cell-1)*n_local_nodes*n_local_nodes + (j-1)*n_local_nodes + i
+                    p_i = (cell-1)*n_local_nodes + i
+                    V_coo[p_ij] += ∇xe[p_i]⋅∇xe[p_j]*dV
+                end
+            end
+            # Vector
+            fx = f(xint)
+            for k in 1:n_local_nodes
+                p_k = (cell-1)*n_local_nodes + k
+                b_coo[p_k] +=  fx*se[iq,k]*dV
+            end
+        end
+    end
+
+    # Add boundary conditions to fe
+    # We provably don't want to merge this and the previous one
+    # since this will lead to branching but it will be shorter
+    for cell in 1:n_cells
+        if !cell_to_is_dirichlet[cell]
+            continue
+        end
+        for i in 1:n_local_nodes
+            p_i = (cell-1)*n_local_nodes + i
+            ni = nodese[p_i]
+            gi = node_to_free_node[ni]
+            if gi > nfree # Not really needed
+                continue
+            end
+            for j in 1:n_local_nodes
+                p_j = (cell-1)*n_local_nodes + j
+                nj = nodese[p_j]
+                gj = node_to_free_node[nj]
+                if gj <= nfree
+                    continue
+                end
+                p_ij = (cell-1)*n_local_nodes*n_local_nodes + (j-1)*n_local_nodes + i
+                Aij = Ae[p_ij]
+                uj = u_dirichlet[gj-nfree]
+                be[p_i] -= Aij*uj
+            end
+        end
+    end
+
+
+
+
+
+   
+
+
+
+
+end
+
+function assemble_system_loop(state)
+
+    cell_to_nodes = state.cell_isomap.face_to_nodes
+    cell_to_rid = state.cell_isomap.face_to_rid
+    free_and_dirichlet_nodes = state.dirichlet_bcs.free_and_dirichlet_nodes
+    node_to_x = state.cell_isomap.node_to_coords
+    ∇s = state.cell_isomap.rid_to_shape_grads
+    s = state.cell_isomap.rid_to_shape_vals
+    u_dirichlet = state.dirichlet_bcs.u_dirichlet
+    f = state.user_funs.f
+    w = state.cell_integration.rid_to_weights
+    d = state.cell_integration.d
+
+    # Count coo entries
+    n_coo = 0
+    ncells = length(cell_to_nodes)
+    node_to_free_node = gk.permutation(free_and_dirichlet_nodes)
+    nfree = length(first(free_and_dirichlet_nodes))
+    for cell in 1:ncells
+        nodes = cell_to_nodes[cell]
+        for nj in nodes
+            if node_to_free_node[nj] > nfree
+                continue
+            end
+            for ni in nodes
+                if node_to_free_node[ni] > nfree
+                    continue
+                end
+                n_coo += 1
+            end
+        end
+    end
+
+    # Allocate coo values
+    I_coo = zeros(Int32,n_coo)
+    J_coo = zeros(Int32,n_coo)
+    V_coo = zeros(Float64,n_coo)
+    b = zeros(nfree)
+
+    # Allocate auxiliary buffers
+    ∇x = map(i->similar(i,size(i,2)),∇s)
+    Aes = map(i->zeros(size(i,2),size(i,2)),∇s)
+    fes = map(i->zeros(size(i,2)),∇s)
+    ues = map(i->zeros(size(i,2)),∇s)
+
+    Tx = SVector{d,Float64}
+    TJ = typeof(zero(Tx)*zero(Tx)')
+
+    # Fill coo values
+    i_coo = 0
+    for cell in 1:ncells
+        rid = cell_to_rid[cell]
+        nodes = cell_to_nodes[cell]
+        Ae = Aes[rid]
+        ue = ues[rid]
+        fe = fes[rid]
+        nl = length(nodes)
+        ∇se = ∇s[rid]
+        ∇xe = ∇x[rid]
+        se = s[rid]
+        we = w[rid]
+        nq = length(we)
+        fill!(Ae,zero(eltype(Ae)))
+        fill!(fe,zero(eltype(Ae)))
+        fill!(ue,zero(eltype(Ae)))
+        for k in 1:nl
+            nk = nodes[k]
+            gk = node_to_free_node[nk]
+            if  gk <= nfree
+                continue
+            end
+            ue[k] = u_dirichlet[gk-nfree]
+        end
+        for iq in 1:nq
+            Jt = zero(TJ) 
+            xint = zero(Tx)
+            for k in 1:nl
+                x = node_to_x[nodes[k]]
+                ∇sqx = ∇se[iq,k]
+                sqx = se[iq,k]
+                Jt += ∇sqx*x'
+                xint += sqx*x
+            end
+            detJt = det(Jt)
+            invJt = inv(Jt)
+            dV = abs(detJt)*we[iq]
+            for k in 1:nl
+                ∇xe[k] = invJt*∇se[iq,k]
+            end
+            for j in 1:nl
+                for i in 1:nl
+                    Ae[i,j] += ∇xe[i]⋅∇xe[j]*dV
+                end
+            end
+            fx = f(xint)
+            for k in 1:nl
+                fe[k] +=  fx*se[iq,k]*dV
+            end
+        end
+        for i in 1:nl
+            for j in 1:nl
+                fe[i] -= Ae[i,j]*ue[j]
+            end
+        end
+        for i in 1:nl
+            ni = nodes[i]
+            gi = node_to_free_node[ni]
+            if gi > nfree
+                continue
+            end
+            b[gi] += fe[i]
+        end
+        for j in 1:nl
+            nj = nodes[j]
+            gj = node_to_free_node[nj]
+            if  gj > nfree
+                continue
+            end
+            for i in 1:nl
+                ni = nodes[i]
+                gi = node_to_free_node[ni]
+                if gi > nfree
+                    continue
+                end
+                i_coo += 1
+                I_coo[i_coo] = gi
+                J_coo[i_coo] = gj
+                V_coo[i_coo] = Ae[i,j]
+            end
+        end
+    end
+    add_neumann_bcs!(b,state)
+    I_coo,J_coo,V_coo,b
+end
+
+function add_neumann_bcs!(b,state)
+
+    neum_face_to_face = state.neumann_bcs.neum_face_to_face
+    if length(neum_face_to_face) == 0
+        return nothing
+    end
+    face_to_rid = state.face_isomap.face_to_rid
+    face_to_nodes = state.face_isomap.face_to_nodes
+    ∇s_f = state.face_isomap.rid_to_shape_grads
+    s_f = state.face_isomap.rid_to_shape_vals
+    node_to_x = state.face_isomap.node_to_coords
+    free_and_dirichlet_nodes = state.dirichlet_bcs.free_and_dirichlet_nodes
+    g = state.user_funs.g
+    w_f = state.face_integration.rid_to_weights
+    d = state.cell_integration.d
+
+    fes = map(i->zeros(size(i,2)),s_f)
+    Tx = SVector{d,Float64}
+    Ts = typeof(zero(SVector{d-1,Float64})*zero(Tx)')
+    node_to_free_node = gk.permutation(free_and_dirichlet_nodes)
+    nfree = length(first(free_and_dirichlet_nodes))
+
+    for face in neum_face_to_face
+        rid = face_to_rid[face]
+        nodes = face_to_nodes[face]
+        ∇se = ∇s_f[rid]
+        se = s_f[rid]
+        we = w_f[rid]
+        nq = length(we)
+        fe = fes[rid]
+        fill!(fe,zero(eltype(fe)))
+        nl = length(nodes)
+        for iq in 1:nq
+            Jt = zero(Ts) 
+            xint = zero(Tx)
+            for k in 1:nl
+                x = node_to_x[nodes[k]]
+                ∇sqx = ∇se[iq,k]
+                sqx = se[iq,k]
+                Jt += ∇sqx*x'
+                xint += sqx*x
+            end
+            J = transpose(Jt)
+            dS = sqrt(det(Jt*J))*we[iq]
+            gx = g(xint)
+            for k in 1:nl
+                fe[k] +=  gx*se[iq,k]*dS
+            end
+        end
+        for i in 1:nl
+            ni = nodes[i]
+            gi = node_to_free_node[ni]
+            if gi > nfree
+                continue
+            end
+            b[gi] += fe[i]
+        end
+    end
+
+    nothing
+end
+
+function assemble_system(state)
+    I,J,V,b = assemble_system_loop(state)
+    nfree = length(b)
+    A = sparse(I,J,V,nfree,nfree)
+    A,b
+end
+
+function solve_system(A,b,params)
+    solver = params[:solver]
+    x = similar(b,axes(A,2))
+    setup = solver.setup(x,A,b)
+    solver.solve!(x,setup,b)
+    solver.finalize!(setup)
+    x
+end
+
+function setup_uh(x,state)
+    free_and_dirichlet_nodes = state.dirichlet_bcs.free_and_dirichlet_nodes
+    node_to_x = state.cell_isomap.node_to_coords
+    u_free = x
+    u_dirichlet = state.dirichlet_bcs.u_dirichlet
+    nnodes = length(node_to_x)
+    uh = zeros(nnodes)
+    uh[first(free_and_dirichlet_nodes)] = u_free
+    uh[last(free_and_dirichlet_nodes)] = u_dirichlet
+    uh
+end
+
+function integrate_error_norms_loop(uh,state)
+
+    cell_to_nodes = state.cell_isomap.face_to_nodes
+    cell_to_rid = state.cell_isomap.face_to_rid
+    free_and_dirichlet_nodes = state.dirichlet_bcs.free_and_dirichlet_nodes
+    node_to_x = state.cell_isomap.node_to_coords
+    ∇s = state.cell_isomap.rid_to_shape_grads
+    s = state.cell_isomap.rid_to_shape_vals
+    u_dirichlet = state.dirichlet_bcs.u_dirichlet
+    u = state.user_funs.u
+    w = state.cell_integration.rid_to_weights
+    d = state.cell_integration.d
+
+    eh1 = 0.0
+    el2 = 0.0
+    ncells = length(cell_to_rid)
+    ues = map(i->zeros(size(i,2)),∇s)
+    ∇x = map(i->similar(i,size(i,2)),∇s)
+    Tx = SVector{d,Float64}
+    TJ = typeof(zero(Tx)*zero(Tx)')
+    for cell in 1:ncells
+        rid = cell_to_rid[cell]
+        nodes = cell_to_nodes[cell]
+        ue = ues[rid]
+        nl = length(nodes)
+        ∇se = ∇s[rid]
+        ∇xe = ∇x[rid]
+        se = s[rid]
+        we = w[rid]
+        nq = length(we)
+        for k in 1:nl
+            nk = nodes[k]
+            ue[k] = uh[nk]
+        end
+        for iq in 1:nq
+            Jt = zero(TJ) 
+            xint = zero(Tx)
+            for k in 1:nl
+                x = node_to_x[nodes[k]]
+                ∇sqx = ∇se[iq,k]
+                sqx = se[iq,k]
+                Jt += ∇sqx*x'
+                xint += sqx*x
+            end
+            detJt = det(Jt)
+            invJt = inv(Jt)
+            dV = abs(detJt)*we[iq]
+            for k in 1:nl
+                ∇xe[k] = invJt*∇se[iq,k]
+            end
+            ux = u(xint)
+            ∇ux = ForwardDiff.gradient(u,xint)
+            ∇uhx = zero(∇ux)
+            uhx = zero(ux)
+            for k in 1:nl
+                uek = ue[k]
+                ∇uhx += uek*∇xe[k]
+                uhx += uek*se[iq,k]
+            end
+            ∇ex = ∇ux - ∇uhx
+            ex =  ux - uhx
+            eh1 += (∇ex⋅∇ex + ex*ex)*dV
+            el2 += ex*ex*dV
+        end
+    end
+    eh1, el2
+end
+
+function integrate_error_norms(results,uh,state)
+    eh1², el2² = integrate_error_norms_loop(uh,state)
+    eh1 = sqrt(eh1²)
+    el2 = sqrt(el2²)
+    results[:eh1] = eh1
+    results[:el2] = el2
+    results
+end
+
+function export_results(uh,params,state)
+    mesh = params[:mesh]
+    example_path = params[:example_path]
+    node_to_tag = state.dirichlet_bcs.node_to_tag
+    vtk_grid(example_path,gk.vtk_args(mesh)...) do vtk
+        gk.vtk_physical_faces!(vtk,mesh)
+        vtk["tag"] = node_to_tag
+        vtk["uh"] = uh
+        vtk["dim"] = gk.face_dim(mesh)
+    end
+end
+
+end # module

From d700833e70692646c24d2b4d5a446179aa2ece93 Mon Sep 17 00:00:00 2001
From: Francesc Verdugo <f.verdugo.rojano@vu.nl>
Date: Thu, 8 Feb 2024 15:39:26 +0100
Subject: [PATCH 3/9] First version of p-Laplacian working

---
 .../GalerkinToolkitExamples/Project.toml      |   2 +
 .../src/GalerkinToolkitExamples.jl            |   1 +
 .../GalerkinToolkitExamples/src/example003.jl | 554 +++++++++++-------
 .../test/example003_tests.jl                  |  27 +
 4 files changed, 361 insertions(+), 223 deletions(-)

diff --git a/extensions/GalerkinToolkitExamples/Project.toml b/extensions/GalerkinToolkitExamples/Project.toml
index d295d8ba..b4757349 100644
--- a/extensions/GalerkinToolkitExamples/Project.toml
+++ b/extensions/GalerkinToolkitExamples/Project.toml
@@ -9,9 +9,11 @@ GalerkinToolkit = "5e3ba9c4-dd81-444d-b69a-0e7bd7bf60a4"
 IterativeSolvers = "42fd0dbc-a981-5370-80f2-aaf504508153"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195"
+NLsolve = "2774e3e8-f4cf-5e23-947b-6d7e65073b56"
 PartitionedArrays = "5a9dfac6-5c52-46f7-8278-5e2210713be9"
 PetscCall = "1194c000-87c4-4102-b4a0-a6217ec4849e"
 Preconditioners = "af69fa37-3177-5a40-98ee-561f696e4fcd"
+Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
 TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
diff --git a/extensions/GalerkinToolkitExamples/src/GalerkinToolkitExamples.jl b/extensions/GalerkinToolkitExamples/src/GalerkinToolkitExamples.jl
index 03dd517e..aa1d3e99 100644
--- a/extensions/GalerkinToolkitExamples/src/GalerkinToolkitExamples.jl
+++ b/extensions/GalerkinToolkitExamples/src/GalerkinToolkitExamples.jl
@@ -2,5 +2,6 @@ module GalerkinToolkitExamples
 
 include("example001.jl")
 include("example002.jl")
+include("example003.jl")
 
 end # module
diff --git a/extensions/GalerkinToolkitExamples/src/example003.jl b/extensions/GalerkinToolkitExamples/src/example003.jl
index 8d635446..a3f6f34e 100644
--- a/extensions/GalerkinToolkitExamples/src/example003.jl
+++ b/extensions/GalerkinToolkitExamples/src/example003.jl
@@ -6,29 +6,38 @@ using StaticArrays
 using LinearAlgebra
 using SparseArrays
 using WriteVTK
+using PartitionedArrays: JaggedArray, nzindex
+using TimerOutputs
+using NLsolve
+using Random
 
-# This one implements a vanilla sequential iso-parametric Poisson solver
-# with FE assembly on GPUs
+using Preconditioners
+using IterativeSolvers: cg!
+
+# This one implements a vanilla sequential iso-parametric p-Laplacian solver by only
+# using the mesh interface.
 
 function main(params_in)
 
+    timer = TimerOutput()
+
     # Process params
     params_default = default_params()
     params = add_default_params(params_in,params_default)
     results = Dict{Symbol,Any}()
 
     # Setup main data structures
-    state = setup(params)
+    @timeit timer "setup" state = setup(params,timer)
     add_basic_info(results,params,state)
 
-    # Assemble system and solve it
-    A,b = assemble_system(state)
-    x = solve_system(A,b,params)
+    @timeit timer "solve_problem" x = solve_problem(params,state)
 
     # Post process
-    uh = setup_uh(x,state)
-    integrate_error_norms(results,uh,state)
-    export_results(uh,params,state)
+    @timeit timer "setup_uh" uh = setup_uh(x,state)
+    @timeit timer "integrate_error_norms" integrate_error_norms(results,uh,state)
+    @timeit timer "export_results" export_results(uh,params,state)
+
+    display(timer)
 
     results
 end
@@ -55,8 +64,10 @@ function default_params()
     params[:dirichlet_tags] = ["boundary"]
     params[:neumann_tags] = String[]
     params[:integration_degree] = 2
-    params[:solver] = lu_solver()
+    params[:p] = 2
+    params[:solver] = nlsolve_solver()
     params[:example_path] = joinpath(outdir,"example003")
+    params[:export_vtu] = true
     params
 end
 
@@ -68,6 +79,60 @@ function lu_solver()
     (;setup,setup!,solve!,finalize!)
 end
 
+function cg_amg_solver(;reltol=1.0e-6,verbose=false)
+    function setup(x,A,b)
+        Pl = AMGPreconditioner{SmoothedAggregation}(A)
+        cache = (;Pl,A)
+        cache
+    end
+    function solve!(x,setup,b)
+        (;Pl,A) = setup
+        fill!(x,0)
+        cg!(x, A, b;Pl,reltol,verbose)
+    end
+    function setup!(setup,A)
+        error("not implemented")
+    end
+    function finalize!(setup)
+        nothing
+    end
+    (;setup,solve!,setup!,finalize!)
+end
+
+function nlsolve_solver(;linear_solver=lu_solver(),options...)
+    function setup(x0,nlp)
+        r0,J0,cache = nlp.linearize(x0)
+        dx = similar(r0,axes(J0,2)) # TODO is there any way of reusing this in the nonlinear solve?
+        ls_setup = linear_solver.setup(dx,J0,r0)
+        function linsolve(x,A,b)
+            # TODO we dont need to re-setup for the first
+            # linear solve.
+            # This can be avoided with a Ref{Bool} shared
+            # between this function and j!
+            linear_solver.setup!(ls_setup,A)
+            linear_solver.solve!(x,ls_setup,b)
+            x
+        end
+        f!(r,x) = nlp.residual!(r,x,cache)
+        j!(J,x) = nlp.jacobian!(J,x,cache)
+        df = OnceDifferentiable(f!,j!,x0,r0,J0)
+        (;df,linsolve,J0,cache,ls_setup,linear_solver)
+    end
+    function solve!(x,setup)
+        (;df,linsolve) = setup
+        result = nlsolve(df,x;linsolve,options...)
+        x .= result.zero
+        nothing
+    end
+    function setup!(setup,x0,nlp)
+        error("todo")
+    end
+    function finalize!(setup)
+        setup.linear_solver.finalize!(setup)
+    end
+    (;setup,solve!,setup!,finalize!)
+end
+
 function setup_dirichlet_bcs(params)
     mesh = params[:mesh]
     u = params[:u]
@@ -118,7 +183,7 @@ function setup_isomap(params,integration)
     end
     rid_to_shape_vals = map(first,shape_funs)
     rid_to_shape_grads = map(last,shape_funs)
-    face_to_nodes = gk.face_nodes(mesh,d)
+    face_to_nodes = JaggedArray(gk.face_nodes(mesh,d))
     node_to_coords = gk.node_coordinates(mesh)
     isomap = (;face_to_nodes,node_to_coords,face_to_rid,rid_to_shape_vals,rid_to_shape_grads,d)
 end
@@ -142,16 +207,40 @@ function setup_user_funs(params)
     user_funs = (;u,f,g)
 end
 
-function setup(params)
-    dirichlet_bcs = setup_dirichlet_bcs(params)
-    neumann_bcs = setup_neumann_bcs(params)
-    cell_integration = setup_integration(params,:cells)
-    face_integration = setup_integration(params,:faces)
-    cell_isomap = setup_isomap(params,cell_integration)
-    face_isomap = setup_isomap(params,face_integration)
-    user_funs = setup_user_funs(params)
+function setup_dofs(params,dirichlet_bcs,cell_isomap,face_isomap)
+    free_and_dirichlet_nodes = dirichlet_bcs.free_and_dirichlet_nodes
+    node_to_free_node = gk.permutation(free_and_dirichlet_nodes)
+    n_free = length(first(free_and_dirichlet_nodes))
+    n_dofs = n_free
+    cell_to_nodes = cell_isomap.face_to_nodes
+    face_to_nodes = face_isomap.face_to_nodes
+    function node_to_dof(node)
+        free_node = node_to_free_node[node]
+        if free_node <= n_free
+            return Int(free_node)
+        end
+        Int(-(free_node-n_free))
+    end
+    cell_to_dofs_data = node_to_dof.(cell_to_nodes.data)
+    face_to_dofs_data = node_to_dof.(face_to_nodes.data)
+    cell_to_dofs = JaggedArray(cell_to_dofs_data,cell_to_nodes.ptrs)
+    face_to_dofs = JaggedArray(face_to_dofs_data,face_to_nodes.ptrs)
+    dofs = (;cell_to_dofs,face_to_dofs,n_dofs)
+    dofs
+end
+
+function setup(params,timer)
+    @timeit timer "dirichlet_bcs" dirichlet_bcs = setup_dirichlet_bcs(params)
+    @timeit timer "neumann_bcs" neumann_bcs = setup_neumann_bcs(params)
+    @timeit timer "cell_integration" cell_integration = setup_integration(params,:cells)
+    @timeit timer "face_integration" face_integration = setup_integration(params,:faces)
+    @timeit timer "cell_isomap" cell_isomap = setup_isomap(params,cell_integration)
+    @timeit timer "face_isomap" face_isomap = setup_isomap(params,face_integration)
+    @timeit timer "dofs" dofs = setup_dofs(params,dirichlet_bcs,cell_isomap,face_isomap)
+    @timeit timer "user_funs" user_funs = setup_user_funs(params)
     solver = params[:solver]
-    state = (;solver,dirichlet_bcs,neumann_bcs,cell_integration,cell_isomap,face_integration,face_isomap,user_funs)
+    p = params[:p]
+    state = (;p,timer,solver,dirichlet_bcs,neumann_bcs,cell_integration,cell_isomap,face_integration,face_isomap,user_funs,dofs)
 end
 
 function add_basic_info(results,params,state)
@@ -167,147 +256,119 @@ function add_basic_info(results,params,state)
     results
 end
 
-function integrate_matrix(state)
-
-    # We assume a single cell type
-    @assert length(state.cell_isomap.rid_to_shape_grads) == 1
-
-    cell_to_nodes = state.cell_isomap.face_to_nodes
-    node_to_x = state.cell_isomap.node_to_coords
-    ∇se = state.cell_isomap.rid_to_shape_grads[1]
-    se = state.cell_isomap.rid_to_shape_vals[1]
-    we = state.cell_integration.rid_to_weights[1]
-    d = state.cell_integration.d
-    free_and_dirichlet_nodes = state.dirichlet_bcs.free_and_dirichlet_nodes
-    node_to_free_node = gk.permutation(free_and_dirichlet_nodes)
-    nfree = length(first(free_and_dirichlet_nodes))
-
-    cell
-
-    n_local_nodes = size(∇se,2)
-    n_int_points = size(∇se,1)
-    n_cells = length(cell_to_nodes)
-    n_coo = n_cells*n_local_nodes
-
-    Tx = SVector{d,Float64}
-    TJ = typeof(zero(Tx)*zero(Tx)')
-
-    # Assemble
-    # This one should be good enough on the CPU
-    Ie = zeros(Float64,n_local_nodes*n_cells)
-    Je = zeros(Float64,n_local_nodes*n_cells)
-    for cell in 1:n_cells
-        nodes = cell_to_nodes[cell]
-        for i in 1:n_local_nodes
-            for j in 1:n_local_nodes
-                p_ij = (cell-1)*n_local_nodes*n_local_nodes + (j-1)*n_local_nodes + i
-                Ie[p_ij] = 
-                Je[p_ij] = nodes[J]
-            end
-        end
+function nonlinear_problem(state)
+    function initial()
+        n = state.dofs.n_dofs
+        Random.seed!(1)
+        rand(Float64,n)
     end
-
-    # Convert data to a format more suitable for gpus
-    # Perhaps more date is needed, and with appropriate reordering
-    nodese = JaggedArray(cell_to_nodes).data
-    cell_to_is_dirichlet = fill(false,n_cells)
-    for cell in 1:n_cells
-        nodes = cell_to_nodes[cell]
-        if any(node-> node_to_free_node[node]>nfree,nodes)
-            cell_to_is_dirichlet[cell] = true
-        end
+    function linearize(u_dofs)
+        r,J,V,K = assemble_sybmolic(state)
+        cache = (V,K)
+        residual_cells!(r,u_dofs,state)
+        residual_faces!(r,u_dofs,state)
+        jacobian_cells!(V,u_dofs,state)
+        setcoofast!(J,V,K)
+        r,J,cache
+    end
+    function residual!(r,u_dofs,cache)
+        fill!(r,0)
+        residual_cells!(r,u_dofs,state)
+        residual_faces!(r,u_dofs,state)
+        r
+    end
+    function jacobian!(J,u_dofs,cache)
+        (V,K) = cache 
+        LinearAlgebra.fillstored!(J,0)
+        jacobian_cells!(V,u_dofs,state)
+        setcoofast!(J,V,K)
+        J
     end
+    (;initial,linearize,residual!,jacobian!)
+end
 
-    # TODO move to the GPU
-    # The code below runs on the CPU
-    # but it is implemented in a way close of what
-    # one will implement on the GPU
-    Ae = zeros(Float64,n_local_nodes*n_local_nodes*n_cells)
-    fe = zeros(Float64,n_local_nodes*n_cells)
-
-    # Auxiliary memory
-    # can we get rid of this?
-    ∇xe = zeros(Tx,n_local_nodes*n_cells)
-    for cell in 1:n_cells
-        for iq in 1:n_int_points
-            Jt = zero(TJ) 
-            xint = zero(Tx)
-            for k in 1:n_local_nodes
-                p_k = (cell-1)*n_local_nodes + k
-                node = nodese[p_k]
-                x = node_to_x[nodes[k]]
-                ∇sqx = ∇se[iq,k]
-                sqx = se[iq,k]
-                Jt += ∇sqx*x'
-                xint += sqx*x
-            end
-            detJt = det(Jt)
-            invJt = inv(Jt)
-            dV = abs(detJt)*we[iq]
-            for k in 1:n_local_nodes
-                p_k = (cell-1)*n_local_nodes + k
-                ∇xe[p_k] = invJt*∇se[iq,k]
+function assemble_sybmolic(state)
+    cell_to_dofs = state.dofs.cell_to_dofs
+    n_dofs = state.dofs.n_dofs
+
+    n_coo = 0
+    ncells = length(cell_to_dofs)
+    for cell in 1:ncells
+        dofs = cell_to_dofs[cell]
+        ndofs = length(dofs)
+        for i in 1:ndofs
+            if !(dofs[i]>0)
+                continue
             end
-            # Matrix
-            for j in 1:n_local_nodes
-                p_j = (cell-1)*n_local_nodes + j
-                for i in 1:n_local_nodes
-                    p_ij = (cell-1)*n_local_nodes*n_local_nodes + (j-1)*n_local_nodes + i
-                    p_i = (cell-1)*n_local_nodes + i
-                    V_coo[p_ij] += ∇xe[p_i]⋅∇xe[p_j]*dV
+            for j in 1:ndofs
+                if !(dofs[j]>0)
+                    continue
                 end
-            end
-            # Vector
-            fx = f(xint)
-            for k in 1:n_local_nodes
-                p_k = (cell-1)*n_local_nodes + k
-                b_coo[p_k] +=  fx*se[iq,k]*dV
+                n_coo += 1
             end
         end
     end
 
-    # Add boundary conditions to fe
-    # We provably don't want to merge this and the previous one
-    # since this will lead to branching but it will be shorter
-    for cell in 1:n_cells
-        if !cell_to_is_dirichlet[cell]
-            continue
-        end
-        for i in 1:n_local_nodes
-            p_i = (cell-1)*n_local_nodes + i
-            ni = nodese[p_i]
-            gi = node_to_free_node[ni]
-            if gi > nfree # Not really needed
+    I_coo = Vector{Int32}(undef,n_coo)
+    J_coo = Vector{Int32}(undef,n_coo)
+    V_coo = Vector{Float64}(undef,n_coo)
+    r = zeros(Float64,state.dofs.n_dofs)
+
+    n_coo = 0
+    for cell in 1:ncells
+        dofs = cell_to_dofs[cell]
+        ndofs = length(dofs)
+        for i in 1:ndofs
+            if !(dofs[i]>0)
                 continue
             end
-            for j in 1:n_local_nodes
-                p_j = (cell-1)*n_local_nodes + j
-                nj = nodese[p_j]
-                gj = node_to_free_node[nj]
-                if gj <= nfree
+            dofs_i = dofs[i]
+            for j in 1:ndofs
+                if !(dofs[j]>0)
                     continue
                 end
-                p_ij = (cell-1)*n_local_nodes*n_local_nodes + (j-1)*n_local_nodes + i
-                Aij = Ae[p_ij]
-                uj = u_dirichlet[gj-nfree]
-                be[p_i] -= Aij*uj
+                n_coo += 1
+                I_coo[n_coo] = dofs_i
+                J_coo[n_coo] = dofs[j]
             end
         end
     end
 
+    J = sparse(I_coo,J_coo,V_coo,n_dofs,n_dofs)
+    K = precompute_nzindex(J,I_coo,J_coo)
+    r,J,V_coo,K
+end
 
+function precompute_nzindex(A,I,J)
+    K = zeros(Int32,length(I))
+    for (p,(i,j)) in enumerate(zip(I,J))
+        K[p] = nzindex(A,i,j)
+    end
+    K
+end
 
+function setcoofast!(A,V,K)
+    LinearAlgebra.fillstored!(A,0)
+    A_nz = nonzeros(A)
+    for (k,v) in zip(K,V)
+        A_nz[k] += v
+    end
+    A
+end
 
+@inline function p_laplace_residual(∇u,∇dv,dv,f,p)
+    ∇dv⋅((norm(∇u)^(p-2))*∇u) - f*dv
+end
 
-   
-
-
-
-
+@inline function p_laplace_jacobian(∇u,∇du,∇dv,p)
+    pm2 = p-2
+    pm4 = p-4
+    ∇dv⋅((norm(∇u)^pm2)*∇du) + ∇dv⋅(pm2*(norm(∇u)^pm4)*(∇u⋅∇du)*∇u)
 end
 
-function assemble_system_loop(state)
+function residual_cells!(r,u_dofs,state)
 
+    cell_to_dofs = state.dofs.cell_to_dofs
     cell_to_nodes = state.cell_isomap.face_to_nodes
     cell_to_rid = state.cell_isomap.face_to_rid
     free_and_dirichlet_nodes = state.dirichlet_bcs.free_and_dirichlet_nodes
@@ -318,141 +379,194 @@ function assemble_system_loop(state)
     f = state.user_funs.f
     w = state.cell_integration.rid_to_weights
     d = state.cell_integration.d
-
-    # Count coo entries
-    n_coo = 0
+    u_dirichlet = state.dirichlet_bcs.u_dirichlet
     ncells = length(cell_to_nodes)
-    node_to_free_node = gk.permutation(free_and_dirichlet_nodes)
-    nfree = length(first(free_and_dirichlet_nodes))
+    p = state.p
+
+    # Allocate auxiliary buffers
+    ∇x = map(i->similar(i,size(i,2)),∇s)
+    Aes = map(i->zeros(size(i,2),size(i,2)),∇s)
+    fes = map(i->zeros(size(i,2)),∇s)
+    ues = map(i->zeros(size(i,2)),∇s)
+    ∇st = map(m->collect(permutedims(m)),∇s)
+    st = map(m->collect(permutedims(m)),s)
+
+    Tx = eltype(node_to_x)
+    TJ = typeof(zero(Tx)*zero(Tx)')
+    T = eltype(Tx)
+
+    i_coo = 0
     for cell in 1:ncells
+        rid = cell_to_rid[cell]
         nodes = cell_to_nodes[cell]
-        for nj in nodes
-            if node_to_free_node[nj] > nfree
-                continue
+        dofs = cell_to_dofs[cell]
+        Ae = Aes[rid]
+        fe = fes[rid]
+        ue = ues[rid]
+        nl = length(nodes)
+        ∇ste = ∇st[rid]
+        ∇xe = ∇x[rid]
+        ste = st[rid]
+        we = w[rid]
+        nq = length(we)
+        fill!(Ae,zero(eltype(Ae)))
+        fill!(fe,zero(eltype(Ae)))
+        # Integrate
+        for iq in 1:nq
+            Jt = zero(TJ) 
+            xint = zero(Tx)
+            for k in 1:nl
+                x = node_to_x[nodes[k]]
+                ∇sqx = ∇ste[k,iq]
+                sqx = ste[k,iq]
+                Jt += ∇sqx*x'
+                xint += sqx*x
             end
-            for ni in nodes
-                if node_to_free_node[ni] > nfree
+            detJt = det(Jt)
+            invJt = inv(Jt)
+            dV = abs(detJt)*we[iq]
+            for k in 1:nl
+                dof_k = dofs[k]
+                if dof_k < 1
+                    uk = u_dirichlet[-dof_k]
+                    ue[k] = uk
                     continue
                 end
-                n_coo += 1
+                ue[k] = u_dofs[dof_k]
+            end
+            ∇u = zero(Tx)
+            for k in 1:nl
+                uek = ue[k] 
+                ∇xek = invJt*∇ste[k,iq]
+                ∇u += ∇xek*uek
+                ∇xe[k] = ∇xek
+            end
+            fx = f(xint)
+            for k in 1:nl
+                dv = ste[k,iq]
+                ∇dv = ∇xe[k]
+                fe[k] += ( p_laplace_residual(∇u,∇dv,dv,fx,p) )*dV
+            end
+        end
+        for i in 1:nl
+            if !(dofs[i]>0)
+                continue
             end
+            r[dofs[i]] += fe[i]
         end
     end
+end
+
+function jacobian_cells!(V_coo,u_dofs,state)
 
-    # Allocate coo values
-    I_coo = zeros(Int32,n_coo)
-    J_coo = zeros(Int32,n_coo)
-    V_coo = zeros(Float64,n_coo)
-    b = zeros(nfree)
+    cell_to_dofs = state.dofs.cell_to_dofs
+    cell_to_nodes = state.cell_isomap.face_to_nodes
+    cell_to_rid = state.cell_isomap.face_to_rid
+    free_and_dirichlet_nodes = state.dirichlet_bcs.free_and_dirichlet_nodes
+    node_to_x = state.cell_isomap.node_to_coords
+    ∇s = state.cell_isomap.rid_to_shape_grads
+    s = state.cell_isomap.rid_to_shape_vals
+    u_dirichlet = state.dirichlet_bcs.u_dirichlet
+    f = state.user_funs.f
+    w = state.cell_integration.rid_to_weights
+    d = state.cell_integration.d
+    u_dirichlet = state.dirichlet_bcs.u_dirichlet
+    ncells = length(cell_to_nodes)
+    p = state.p
 
     # Allocate auxiliary buffers
     ∇x = map(i->similar(i,size(i,2)),∇s)
     Aes = map(i->zeros(size(i,2),size(i,2)),∇s)
     fes = map(i->zeros(size(i,2)),∇s)
     ues = map(i->zeros(size(i,2)),∇s)
+    ∇st = map(m->collect(permutedims(m)),∇s)
+    st = map(m->collect(permutedims(m)),s)
 
-    Tx = SVector{d,Float64}
+    Tx = eltype(node_to_x)
     TJ = typeof(zero(Tx)*zero(Tx)')
+    T = eltype(Tx)
 
-    # Fill coo values
     i_coo = 0
     for cell in 1:ncells
         rid = cell_to_rid[cell]
         nodes = cell_to_nodes[cell]
+        dofs = cell_to_dofs[cell]
         Ae = Aes[rid]
-        ue = ues[rid]
         fe = fes[rid]
+        ue = ues[rid]
         nl = length(nodes)
-        ∇se = ∇s[rid]
+        ∇ste = ∇st[rid]
         ∇xe = ∇x[rid]
-        se = s[rid]
+        ste = st[rid]
         we = w[rid]
         nq = length(we)
         fill!(Ae,zero(eltype(Ae)))
         fill!(fe,zero(eltype(Ae)))
-        fill!(ue,zero(eltype(Ae)))
-        for k in 1:nl
-            nk = nodes[k]
-            gk = node_to_free_node[nk]
-            if  gk <= nfree
-                continue
-            end
-            ue[k] = u_dirichlet[gk-nfree]
-        end
+        # Integrate
         for iq in 1:nq
             Jt = zero(TJ) 
-            xint = zero(Tx)
             for k in 1:nl
                 x = node_to_x[nodes[k]]
-                ∇sqx = ∇se[iq,k]
-                sqx = se[iq,k]
+                ∇sqx = ∇ste[k,iq]
+                sqx = ste[k,iq]
                 Jt += ∇sqx*x'
-                xint += sqx*x
             end
             detJt = det(Jt)
             invJt = inv(Jt)
             dV = abs(detJt)*we[iq]
             for k in 1:nl
-                ∇xe[k] = invJt*∇se[iq,k]
-            end
-            for j in 1:nl
-                for i in 1:nl
-                    Ae[i,j] += ∇xe[i]⋅∇xe[j]*dV
+                dof_k = dofs[k]
+                if dof_k < 1
+                    uk = u_dirichlet[-dof_k]
+                    ue[k] = uk
+                    continue
                 end
+                ue[k] = u_dofs[dof_k]
             end
-            fx = f(xint)
+            ∇u = zero(Tx)
             for k in 1:nl
-                fe[k] +=  fx*se[iq,k]*dV
+                uek = ue[k]
+                ∇xek = invJt*∇ste[k,iq]
+                ∇u += ∇xek*uek
+                ∇xe[k] = ∇xek
             end
-        end
-        for i in 1:nl
             for j in 1:nl
-                fe[i] -= Ae[i,j]*ue[j]
+                ∇du = ∇xe[j]
+                for i in 1:nl
+                    ∇dv = ∇xe[i]
+                    Ae[i,j] += ( p_laplace_jacobian(∇u,∇du,∇dv,p) )*dV
+                end
             end
         end
+        # Set the result in the output array
         for i in 1:nl
-            ni = nodes[i]
-            gi = node_to_free_node[ni]
-            if gi > nfree
+            if !(dofs[i]>0)
                 continue
             end
-            b[gi] += fe[i]
-        end
-        for j in 1:nl
-            nj = nodes[j]
-            gj = node_to_free_node[nj]
-            if  gj > nfree
-                continue
-            end
-            for i in 1:nl
-                ni = nodes[i]
-                gi = node_to_free_node[ni]
-                if gi > nfree
+            for j in 1:nl
+                if !(dofs[j]>0)
                     continue
                 end
                 i_coo += 1
-                I_coo[i_coo] = gi
-                J_coo[i_coo] = gj
                 V_coo[i_coo] = Ae[i,j]
             end
         end
     end
-    add_neumann_bcs!(b,state)
-    I_coo,J_coo,V_coo,b
 end
 
-function add_neumann_bcs!(b,state)
+function residual_faces!(r,u_dofs,state)
 
     neum_face_to_face = state.neumann_bcs.neum_face_to_face
     if length(neum_face_to_face) == 0
         return nothing
     end
+
     face_to_rid = state.face_isomap.face_to_rid
+    face_to_dofs = state.dofs.face_to_dofs
     face_to_nodes = state.face_isomap.face_to_nodes
     ∇s_f = state.face_isomap.rid_to_shape_grads
     s_f = state.face_isomap.rid_to_shape_vals
     node_to_x = state.face_isomap.node_to_coords
-    free_and_dirichlet_nodes = state.dirichlet_bcs.free_and_dirichlet_nodes
     g = state.user_funs.g
     w_f = state.face_integration.rid_to_weights
     d = state.cell_integration.d
@@ -460,12 +574,11 @@ function add_neumann_bcs!(b,state)
     fes = map(i->zeros(size(i,2)),s_f)
     Tx = SVector{d,Float64}
     Ts = typeof(zero(SVector{d-1,Float64})*zero(Tx)')
-    node_to_free_node = gk.permutation(free_and_dirichlet_nodes)
-    nfree = length(first(free_and_dirichlet_nodes))
 
     for face in neum_face_to_face
         rid = face_to_rid[face]
         nodes = face_to_nodes[face]
+        dofs = face_to_dofs[face]
         ∇se = ∇s_f[rid]
         se = s_f[rid]
         we = w_f[rid]
@@ -491,31 +604,22 @@ function add_neumann_bcs!(b,state)
             end
         end
         for i in 1:nl
-            ni = nodes[i]
-            gi = node_to_free_node[ni]
-            if gi > nfree
+            if !(dofs[i]>0)
                 continue
             end
-            b[gi] += fe[i]
+            r[dofs[i]] -= fe[i]
         end
     end
-
-    nothing
 end
 
-function assemble_system(state)
-    I,J,V,b = assemble_system_loop(state)
-    nfree = length(b)
-    A = sparse(I,J,V,nfree,nfree)
-    A,b
-end
-
-function solve_system(A,b,params)
+function solve_problem(params,state)
+    timer = state.timer
+    problem = nonlinear_problem(state)
     solver = params[:solver]
-    x = similar(b,axes(A,2))
-    setup = solver.setup(x,A,b)
-    solver.solve!(x,setup,b)
-    solver.finalize!(setup)
+    x = problem.initial()
+    @timeit timer "solver.setup" setup = solver.setup(x,problem)
+    @timeit timer "solver.solve!" solver.solve!(x,setup)
+    @timeit timer "solver.finalize!" solver.finalize!(setup)
     x
 end
 
@@ -549,7 +653,7 @@ function integrate_error_norms_loop(uh,state)
     ncells = length(cell_to_rid)
     ues = map(i->zeros(size(i,2)),∇s)
     ∇x = map(i->similar(i,size(i,2)),∇s)
-    Tx = SVector{d,Float64}
+    Tx = eltype(node_to_x)
     TJ = typeof(zero(Tx)*zero(Tx)')
     for cell in 1:ncells
         rid = cell_to_rid[cell]
@@ -609,6 +713,9 @@ function integrate_error_norms(results,uh,state)
 end
 
 function export_results(uh,params,state)
+    if ! params[:export_vtu]
+        return nothing
+    end
     mesh = params[:mesh]
     example_path = params[:example_path]
     node_to_tag = state.dirichlet_bcs.node_to_tag
@@ -618,6 +725,7 @@ function export_results(uh,params,state)
         vtk["uh"] = uh
         vtk["dim"] = gk.face_dim(mesh)
     end
+    nothing
 end
 
 end # module
diff --git a/extensions/GalerkinToolkitExamples/test/example003_tests.jl b/extensions/GalerkinToolkitExamples/test/example003_tests.jl
index e69de29b..1d6e0cf6 100644
--- a/extensions/GalerkinToolkitExamples/test/example003_tests.jl
+++ b/extensions/GalerkinToolkitExamples/test/example003_tests.jl
@@ -0,0 +1,27 @@
+module Example003Tests
+
+import GalerkinToolkit as gk
+using GalerkinToolkitExamples: Example003
+using Test
+
+tol = 1.0e-10
+
+params = Dict{Symbol,Any}()
+params[:mesh] = gk.cartesian_mesh((0,1,0,1),(3,3))
+results = Example003.main(params)
+@test results[:eh1] < tol
+@test results[:el2] < tol
+
+params = Dict{Symbol,Any}()
+params[:mesh] = gk.cartesian_mesh((0,3,0,2),(5,10),complexify=false)
+params[:dirichlet_tags] = ["1-face-1","1-face-3","1-face-4"]
+params[:neumann_tags] = ["1-face-2"]
+params[:u] = (x) -> sum(x)
+params[:f] = (x) -> 0.0
+params[:g] = (x) -> 1.0
+results = Example003.main(params)
+@test results[:eh1] < tol
+@test results[:el2] < tol
+@test results[:ncells] == 10*5
+
+end # module

From db11f0c950b3302582315db7933567811ff37456 Mon Sep 17 00:00:00 2001
From: Francesc Verdugo <f.verdugo.rojano@vu.nl>
Date: Thu, 8 Feb 2024 17:20:09 +0100
Subject: [PATCH 4/9] More enhancements in example003

---
 .../GalerkinToolkitExamples/src/example001.jl | 10 +--
 .../GalerkinToolkitExamples/src/example003.jl | 69 ++++++-------------
 .../test/example003_tests.jl                  | 21 +++++-
 3 files changed, 48 insertions(+), 52 deletions(-)

diff --git a/extensions/GalerkinToolkitExamples/src/example001.jl b/extensions/GalerkinToolkitExamples/src/example001.jl
index f58cb7dc..80ad1a6a 100644
--- a/extensions/GalerkinToolkitExamples/src/example001.jl
+++ b/extensions/GalerkinToolkitExamples/src/example001.jl
@@ -78,19 +78,21 @@ function lu_solver()
     (;setup,setup!,solve!,finalize!)
 end
 
-function cg_amg_solver(;reltol=1.0e-6,verbose=false)
+function cg_amg_solver(;reltol=1.0e-6,verbose=false,timer=TimerOutput())
     function setup(x,A,b)
         Pl = AMGPreconditioner{SmoothedAggregation}(A)
-        cache = (;Pl,A)
+        cache = Ref((;Pl,A))
         cache
     end
     function solve!(x,setup,b)
-        (;Pl,A) = setup
+        (;Pl,A) = setup[]
         fill!(x,0)
         cg!(x, A, b;Pl,reltol,verbose)
     end
     function setup!(setup,A)
-        error("not implemented")
+        Pl = AMGPreconditioner{SmoothedAggregation}(A)
+        setup[] = (;Pl,A)
+        setup
     end
     function finalize!(setup)
         nothing
diff --git a/extensions/GalerkinToolkitExamples/src/example003.jl b/extensions/GalerkinToolkitExamples/src/example003.jl
index a3f6f34e..efee5e0c 100644
--- a/extensions/GalerkinToolkitExamples/src/example003.jl
+++ b/extensions/GalerkinToolkitExamples/src/example003.jl
@@ -10,6 +10,7 @@ using PartitionedArrays: JaggedArray, nzindex
 using TimerOutputs
 using NLsolve
 using Random
+using GalerkinToolkitExamples: Example001
 
 using Preconditioners
 using IterativeSolvers: cg!
@@ -19,15 +20,15 @@ using IterativeSolvers: cg!
 
 function main(params_in)
 
-    timer = TimerOutput()
 
     # Process params
     params_default = default_params()
     params = add_default_params(params_in,params_default)
     results = Dict{Symbol,Any}()
+    timer = params[:timer]
 
     # Setup main data structures
-    @timeit timer "setup" state = setup(params,timer)
+    @timeit timer "setup" state = setup(params)
     add_basic_info(results,params,state)
 
     @timeit timer "solve_problem" x = solve_problem(params,state)
@@ -37,7 +38,7 @@ function main(params_in)
     @timeit timer "integrate_error_norms" integrate_error_norms(results,uh,state)
     @timeit timer "export_results" export_results(uh,params,state)
 
-    display(timer)
+    print_timer(timer,allocations=false)
 
     results
 end
@@ -68,49 +69,22 @@ function default_params()
     params[:solver] = nlsolve_solver()
     params[:example_path] = joinpath(outdir,"example003")
     params[:export_vtu] = true
+    params[:timer] = TimerOutput()
     params
 end
 
-function lu_solver()
-    setup(x,A,b) = lu(A)
-    setup! = lu!
-    solve! = ldiv!
-    finalize!(S) = nothing
-    (;setup,setup!,solve!,finalize!)
-end
-
-function cg_amg_solver(;reltol=1.0e-6,verbose=false)
-    function setup(x,A,b)
-        Pl = AMGPreconditioner{SmoothedAggregation}(A)
-        cache = (;Pl,A)
-        cache
-    end
-    function solve!(x,setup,b)
-        (;Pl,A) = setup
-        fill!(x,0)
-        cg!(x, A, b;Pl,reltol,verbose)
-    end
-    function setup!(setup,A)
-        error("not implemented")
-    end
-    function finalize!(setup)
-        nothing
-    end
-    (;setup,solve!,setup!,finalize!)
-end
-
-function nlsolve_solver(;linear_solver=lu_solver(),options...)
+function nlsolve_solver(;linear_solver=Example001.lu_solver(),timer=TimerOutput(),options...)
     function setup(x0,nlp)
-        r0,J0,cache = nlp.linearize(x0)
+        @timeit timer "linearize" r0,J0,cache = nlp.linearize(x0)
         dx = similar(r0,axes(J0,2)) # TODO is there any way of reusing this in the nonlinear solve?
-        ls_setup = linear_solver.setup(dx,J0,r0)
+        @timeit timer "linear_solver_setup" ls_setup = linear_solver.setup(dx,J0,r0)
         function linsolve(x,A,b)
             # TODO we dont need to re-setup for the first
             # linear solve.
             # This can be avoided with a Ref{Bool} shared
             # between this function and j!
-            linear_solver.setup!(ls_setup,A)
-            linear_solver.solve!(x,ls_setup,b)
+            @timeit timer "linear_solver_setup!"  linear_solver.setup!(ls_setup,A)
+            @timeit timer "linear_solver_solve!" linear_solver.solve!(x,ls_setup,b)
             x
         end
         f!(r,x) = nlp.residual!(r,x,cache)
@@ -229,7 +203,8 @@ function setup_dofs(params,dirichlet_bcs,cell_isomap,face_isomap)
     dofs
 end
 
-function setup(params,timer)
+function setup(params)
+    timer = params[:timer]
     @timeit timer "dirichlet_bcs" dirichlet_bcs = setup_dirichlet_bcs(params)
     @timeit timer "neumann_bcs" neumann_bcs = setup_neumann_bcs(params)
     @timeit timer "cell_integration" cell_integration = setup_integration(params,:cells)
@@ -257,31 +232,31 @@ function add_basic_info(results,params,state)
 end
 
 function nonlinear_problem(state)
+    timer = state.timer
     function initial()
         n = state.dofs.n_dofs
         Random.seed!(1)
         rand(Float64,n)
     end
     function linearize(u_dofs)
-        r,J,V,K = assemble_sybmolic(state)
+        @timeit timer "assemble_sybmolic" r,J,V,K = assemble_sybmolic(state)
         cache = (V,K)
-        residual_cells!(r,u_dofs,state)
-        residual_faces!(r,u_dofs,state)
-        jacobian_cells!(V,u_dofs,state)
-        setcoofast!(J,V,K)
+        @timeit timer "residual_cells!" residual_cells!(r,u_dofs,state)
+        @timeit timer "residual_faces!" residual_faces!(r,u_dofs,state)
+        @timeit timer "jacobian_cells!" jacobian_cells!(V,u_dofs,state)
+        @timeit timer "sparse_matrix!" setcoofast!(J,V,K)
         r,J,cache
     end
     function residual!(r,u_dofs,cache)
         fill!(r,0)
-        residual_cells!(r,u_dofs,state)
-        residual_faces!(r,u_dofs,state)
+        @timeit timer "residual_cells!" residual_cells!(r,u_dofs,state)
+        @timeit timer "residual_faces!" residual_faces!(r,u_dofs,state)
         r
     end
     function jacobian!(J,u_dofs,cache)
         (V,K) = cache 
-        LinearAlgebra.fillstored!(J,0)
-        jacobian_cells!(V,u_dofs,state)
-        setcoofast!(J,V,K)
+        @timeit timer "jacobian_cells!" jacobian_cells!(V,u_dofs,state)
+        @timeit timer "sparse_matrix!" setcoofast!(J,V,K)
         J
     end
     (;initial,linearize,residual!,jacobian!)
diff --git a/extensions/GalerkinToolkitExamples/test/example003_tests.jl b/extensions/GalerkinToolkitExamples/test/example003_tests.jl
index 1d6e0cf6..a9b09be3 100644
--- a/extensions/GalerkinToolkitExamples/test/example003_tests.jl
+++ b/extensions/GalerkinToolkitExamples/test/example003_tests.jl
@@ -1,8 +1,9 @@
 module Example003Tests
 
 import GalerkinToolkit as gk
-using GalerkinToolkitExamples: Example003
+using GalerkinToolkitExamples: Example003, Example001
 using Test
+    using TimerOutputs
 
 tol = 1.0e-10
 
@@ -24,4 +25,22 @@ results = Example003.main(params)
 @test results[:el2] < tol
 @test results[:ncells] == 10*5
 
+params = Dict{Symbol,Any}()
+timer = TimerOutput()
+@timeit timer "cartesian_mesh" params[:mesh] = gk.cartesian_mesh((0,3,0,2,0,1),(5,5,5))
+linear_solver = Example001.cg_amg_solver(;verbose=true)
+params[:solver] = Example003.nlsolve_solver(;timer,linear_solver,show_trace=true,method=:newton)
+params[:export_vtu] = false
+params[:timer] = timer
+Example003.main(params)
+
+params = Dict{Symbol,Any}()
+timer = TimerOutput()
+@timeit timer "cartesian_mesh" params[:mesh] = gk.cartesian_mesh((0,3,0,2,0,1),(70,70,70))
+linear_solver = Example001.cg_amg_solver(;verbose=true)
+params[:solver] = Example003.nlsolve_solver(;timer,linear_solver,show_trace=true,method=:newton)
+params[:export_vtu] = false
+params[:timer] = timer
+Example003.main(params)
+
 end # module

From 6ccf2e7f936fbd3c78ed175790d2c5b99f0df4b2 Mon Sep 17 00:00:00 2001
From: Francesc Verdugo <f.verdugo.rojano@vu.nl>
Date: Thu, 8 Feb 2024 18:29:41 +0100
Subject: [PATCH 5/9] More enhancements in example003

---
 .../GalerkinToolkitExamples/src/example003.jl | 66 +++++++++++++++++--
 1 file changed, 60 insertions(+), 6 deletions(-)

diff --git a/extensions/GalerkinToolkitExamples/src/example003.jl b/extensions/GalerkinToolkitExamples/src/example003.jl
index efee5e0c..623f398c 100644
--- a/extensions/GalerkinToolkitExamples/src/example003.jl
+++ b/extensions/GalerkinToolkitExamples/src/example003.jl
@@ -6,7 +6,7 @@ using StaticArrays
 using LinearAlgebra
 using SparseArrays
 using WriteVTK
-using PartitionedArrays: JaggedArray, nzindex
+using PartitionedArrays: JaggedArray, nzindex, val_parameter
 using TimerOutputs
 using NLsolve
 using Random
@@ -244,7 +244,7 @@ function nonlinear_problem(state)
         @timeit timer "residual_cells!" residual_cells!(r,u_dofs,state)
         @timeit timer "residual_faces!" residual_faces!(r,u_dofs,state)
         @timeit timer "jacobian_cells!" jacobian_cells!(V,u_dofs,state)
-        @timeit timer "sparse_matrix!" setcoofast!(J,V,K)
+        @timeit timer "sparse_matrix!" sparse_matrix!(J,V,K)
         r,J,cache
     end
     function residual!(r,u_dofs,cache)
@@ -256,7 +256,7 @@ function nonlinear_problem(state)
     function jacobian!(J,u_dofs,cache)
         (V,K) = cache 
         @timeit timer "jacobian_cells!" jacobian_cells!(V,u_dofs,state)
-        @timeit timer "sparse_matrix!" setcoofast!(J,V,K)
+        @timeit timer "sparse_matrix!" sparse_matrix!(J,V,K)
         J
     end
     (;initial,linearize,residual!,jacobian!)
@@ -309,23 +309,77 @@ function assemble_sybmolic(state)
         end
     end
 
-    J = sparse(I_coo,J_coo,V_coo,n_dofs,n_dofs)
-    K = precompute_nzindex(J,I_coo,J_coo)
+    J,K = sparse_matrix(I_coo,J_coo,V_coo,n_dofs,n_dofs;reuse=true,skip_out_of_bounds=true)
     r,J,V_coo,K
 end
 
+struct FilteredCooVector{F,A,B,C,T} <: AbstractVector{T}
+    f::F
+    I::A
+    J::B
+    V::C
+    function FilteredCooVector(f::F,I::A,J::B,V::C) where {F,A,B,C}
+        T = eltype(C)
+        new{F,A,B,C,T}(f,I,J,V)
+    end
+end
+Base.size(a::FilteredCooVector) = size(a.V)
+Base.IndexStyle(::Type{<:FilteredCooVector}) = IndexLinear()
+Base.@propagate_inbounds function Base.getindex(a::FilteredCooVector,k::Int)
+    i = a.I[k]
+    j = a.J[k]
+    v = a.V[k]
+    if i < 1 || j < 1
+        return a.f(v)
+    end
+    v
+end
+
+function sparse_matrix(I,J,V,m,n;kwargs...)
+    sparse_matrix(sparse,I,J,V,m,n;kwargs...)
+end
+function sparse_matrix(f,I,J,V,m,n;reuse=Val(false),skip_out_of_bounds=true)
+    if !skip_out_of_bounds
+        I2 = I
+        J2 = J
+        V2 = V
+    elseif m*n == 0
+        Ti = eltype(I)
+        T = eltype(V)
+        I2 = Ti[]
+        J2 = Ti[]
+        V2 = Tv[]
+    else
+        I2 = FilteredCooVector(one,I,J,I)
+        J2 = FilteredCooVector(one,I,J,J)
+        V2 = FilteredCooVector(zero,I,J,V)
+    end
+    A = f(I2,J2,V2,m,n)
+    if val_parameter(reuse)
+        K = precompute_nzindex(A,I,J)
+        return A,K
+    end
+    A
+end
+
 function precompute_nzindex(A,I,J)
     K = zeros(Int32,length(I))
     for (p,(i,j)) in enumerate(zip(I,J))
+        if i < 1 || j < 1
+            continue
+        end
         K[p] = nzindex(A,i,j)
     end
     K
 end
 
-function setcoofast!(A,V,K)
+function sparse_matrix!(A,V,K)
     LinearAlgebra.fillstored!(A,0)
     A_nz = nonzeros(A)
     for (k,v) in zip(K,V)
+        if k < 1
+            continue
+        end
         A_nz[k] += v
     end
     A

From 2c7a0620aca00f5d12b9edde1fd2db1d833a01dd Mon Sep 17 00:00:00 2001
From: Francesc Verdugo <f.verdugo.rojano@vu.nl>
Date: Fri, 9 Feb 2024 09:28:43 +0100
Subject: [PATCH 6/9] Allowing negative indices in coo vectors

---
 .../GalerkinToolkitExamples/src/example003.jl | 26 ++-----------------
 1 file changed, 2 insertions(+), 24 deletions(-)

diff --git a/extensions/GalerkinToolkitExamples/src/example003.jl b/extensions/GalerkinToolkitExamples/src/example003.jl
index 623f398c..84348d9a 100644
--- a/extensions/GalerkinToolkitExamples/src/example003.jl
+++ b/extensions/GalerkinToolkitExamples/src/example003.jl
@@ -271,17 +271,7 @@ function assemble_sybmolic(state)
     for cell in 1:ncells
         dofs = cell_to_dofs[cell]
         ndofs = length(dofs)
-        for i in 1:ndofs
-            if !(dofs[i]>0)
-                continue
-            end
-            for j in 1:ndofs
-                if !(dofs[j]>0)
-                    continue
-                end
-                n_coo += 1
-            end
-        end
+        n_coo += ndofs*ndofs
     end
 
     I_coo = Vector{Int32}(undef,n_coo)
@@ -294,14 +284,8 @@ function assemble_sybmolic(state)
         dofs = cell_to_dofs[cell]
         ndofs = length(dofs)
         for i in 1:ndofs
-            if !(dofs[i]>0)
-                continue
-            end
             dofs_i = dofs[i]
             for j in 1:ndofs
-                if !(dofs[j]>0)
-                    continue
-                end
                 n_coo += 1
                 I_coo[n_coo] = dofs_i
                 J_coo[n_coo] = dofs[j]
@@ -309,7 +293,7 @@ function assemble_sybmolic(state)
         end
     end
 
-    J,K = sparse_matrix(I_coo,J_coo,V_coo,n_dofs,n_dofs;reuse=true,skip_out_of_bounds=true)
+    J,K = sparse_matrix(I_coo,J_coo,V_coo,n_dofs,n_dofs;reuse=true)
     r,J,V_coo,K
 end
 
@@ -569,13 +553,7 @@ function jacobian_cells!(V_coo,u_dofs,state)
         end
         # Set the result in the output array
         for i in 1:nl
-            if !(dofs[i]>0)
-                continue
-            end
             for j in 1:nl
-                if !(dofs[j]>0)
-                    continue
-                end
                 i_coo += 1
                 V_coo[i_coo] = Ae[i,j]
             end

From c71110ec225fb3150ba75e96d5f2caf316f60466 Mon Sep 17 00:00:00 2001
From: Francesc Verdugo <f.verdugo.rojano@vu.nl>
Date: Fri, 9 Feb 2024 09:38:59 +0100
Subject: [PATCH 7/9] Moving code to PartitionedArrays

---
 .../GalerkinToolkitExamples/src/example003.jl | 75 +------------------
 .../test/example003_tests.jl                  |  1 +
 2 files changed, 2 insertions(+), 74 deletions(-)

diff --git a/extensions/GalerkinToolkitExamples/src/example003.jl b/extensions/GalerkinToolkitExamples/src/example003.jl
index 84348d9a..c7934bf1 100644
--- a/extensions/GalerkinToolkitExamples/src/example003.jl
+++ b/extensions/GalerkinToolkitExamples/src/example003.jl
@@ -6,7 +6,7 @@ using StaticArrays
 using LinearAlgebra
 using SparseArrays
 using WriteVTK
-using PartitionedArrays: JaggedArray, nzindex, val_parameter
+using PartitionedArrays: JaggedArray, sparse_matrix, sparse_matrix!
 using TimerOutputs
 using NLsolve
 using Random
@@ -20,7 +20,6 @@ using IterativeSolvers: cg!
 
 function main(params_in)
 
-
     # Process params
     params_default = default_params()
     params = add_default_params(params_in,params_default)
@@ -297,78 +296,6 @@ function assemble_sybmolic(state)
     r,J,V_coo,K
 end
 
-struct FilteredCooVector{F,A,B,C,T} <: AbstractVector{T}
-    f::F
-    I::A
-    J::B
-    V::C
-    function FilteredCooVector(f::F,I::A,J::B,V::C) where {F,A,B,C}
-        T = eltype(C)
-        new{F,A,B,C,T}(f,I,J,V)
-    end
-end
-Base.size(a::FilteredCooVector) = size(a.V)
-Base.IndexStyle(::Type{<:FilteredCooVector}) = IndexLinear()
-Base.@propagate_inbounds function Base.getindex(a::FilteredCooVector,k::Int)
-    i = a.I[k]
-    j = a.J[k]
-    v = a.V[k]
-    if i < 1 || j < 1
-        return a.f(v)
-    end
-    v
-end
-
-function sparse_matrix(I,J,V,m,n;kwargs...)
-    sparse_matrix(sparse,I,J,V,m,n;kwargs...)
-end
-function sparse_matrix(f,I,J,V,m,n;reuse=Val(false),skip_out_of_bounds=true)
-    if !skip_out_of_bounds
-        I2 = I
-        J2 = J
-        V2 = V
-    elseif m*n == 0
-        Ti = eltype(I)
-        T = eltype(V)
-        I2 = Ti[]
-        J2 = Ti[]
-        V2 = Tv[]
-    else
-        I2 = FilteredCooVector(one,I,J,I)
-        J2 = FilteredCooVector(one,I,J,J)
-        V2 = FilteredCooVector(zero,I,J,V)
-    end
-    A = f(I2,J2,V2,m,n)
-    if val_parameter(reuse)
-        K = precompute_nzindex(A,I,J)
-        return A,K
-    end
-    A
-end
-
-function precompute_nzindex(A,I,J)
-    K = zeros(Int32,length(I))
-    for (p,(i,j)) in enumerate(zip(I,J))
-        if i < 1 || j < 1
-            continue
-        end
-        K[p] = nzindex(A,i,j)
-    end
-    K
-end
-
-function sparse_matrix!(A,V,K)
-    LinearAlgebra.fillstored!(A,0)
-    A_nz = nonzeros(A)
-    for (k,v) in zip(K,V)
-        if k < 1
-            continue
-        end
-        A_nz[k] += v
-    end
-    A
-end
-
 @inline function p_laplace_residual(∇u,∇dv,dv,f,p)
     ∇dv⋅((norm(∇u)^(p-2))*∇u) - f*dv
 end
diff --git a/extensions/GalerkinToolkitExamples/test/example003_tests.jl b/extensions/GalerkinToolkitExamples/test/example003_tests.jl
index a9b09be3..e6c5527e 100644
--- a/extensions/GalerkinToolkitExamples/test/example003_tests.jl
+++ b/extensions/GalerkinToolkitExamples/test/example003_tests.jl
@@ -41,6 +41,7 @@ linear_solver = Example001.cg_amg_solver(;verbose=true)
 params[:solver] = Example003.nlsolve_solver(;timer,linear_solver,show_trace=true,method=:newton)
 params[:export_vtu] = false
 params[:timer] = timer
+params[:p] = 2
 Example003.main(params)
 
 end # module

From d7b56fb7754f76db9868ae871cc8b30304b69763 Mon Sep 17 00:00:00 2001
From: Francesc Verdugo <f.verdugo.rojano@vu.nl>
Date: Fri, 9 Feb 2024 10:31:16 +0100
Subject: [PATCH 8/9] Implemented autodiff in example003

---
 .../GalerkinToolkitExamples/src/example003.jl | 52 ++++++++++++++---
 .../test/example003_tests.jl                  | 57 +++++++++++++++----
 2 files changed, 88 insertions(+), 21 deletions(-)

diff --git a/extensions/GalerkinToolkitExamples/src/example003.jl b/extensions/GalerkinToolkitExamples/src/example003.jl
index c7934bf1..e606fe08 100644
--- a/extensions/GalerkinToolkitExamples/src/example003.jl
+++ b/extensions/GalerkinToolkitExamples/src/example003.jl
@@ -68,6 +68,7 @@ function default_params()
     params[:solver] = nlsolve_solver()
     params[:example_path] = joinpath(outdir,"example003")
     params[:export_vtu] = true
+    params[:autodiff] = :hand
     params[:timer] = TimerOutput()
     params
 end
@@ -214,7 +215,19 @@ function setup(params)
     @timeit timer "user_funs" user_funs = setup_user_funs(params)
     solver = params[:solver]
     p = params[:p]
-    state = (;p,timer,solver,dirichlet_bcs,neumann_bcs,cell_integration,cell_isomap,face_integration,face_isomap,user_funs,dofs)
+    if params[:autodiff] === :hand
+        flux = flux_hand
+        dflux = dflux_hand
+    elseif params[:autodiff] === :flux
+        flux = flux_hand
+        dflux = dflux_from_flux
+    elseif params[:autodiff] === :energy
+        flux = flux_from_energy
+        dflux = dflux_from_energy
+    else
+        error("not implemented: autodiff == $autodiff")
+    end
+    state = (;p,timer,flux,dflux,solver,dirichlet_bcs,neumann_bcs,cell_integration,cell_isomap,face_integration,face_isomap,user_funs,dofs)
 end
 
 function add_basic_info(results,params,state)
@@ -296,18 +309,38 @@ function assemble_sybmolic(state)
     r,J,V_coo,K
 end
 
-@inline function p_laplace_residual(∇u,∇dv,dv,f,p)
-    ∇dv⋅((norm(∇u)^(p-2))*∇u) - f*dv
-end
-
-@inline function p_laplace_jacobian(∇u,∇du,∇dv,p)
+@inline flux_hand(∇u,p) = (norm(∇u)^(p-2))*∇u
+@inline function dflux_hand(∇u,∇du,p)
     pm2 = p-2
     pm4 = p-4
-    ∇dv⋅((norm(∇u)^pm2)*∇du) + ∇dv⋅(pm2*(norm(∇u)^pm4)*(∇u⋅∇du)*∇u)
+    ((norm(∇u)^pm2)*∇du) + (pm2*(norm(∇u)^pm4)*(∇u⋅∇du)*∇u)
+end
+@inline function dflux_from_flux(∇u,∇du,p)
+    f(x) = flux_hand(x,p)
+    x = ∇u
+    dx = ∇du
+    dfdx = ForwardDiff.jacobian(f,x)
+    dfdx*dx
+end
+
+@inline energy(∇u,p) = (1/p)*(norm(∇u)^p)
+@inline function flux_from_energy(∇u,p)
+    f(x) = energy(x,p)
+    x = ∇u
+    dfdx = ForwardDiff.gradient(f,x)
+    dfdx
+end
+@inline function dflux_from_energy(∇u,∇du,p)
+    f(x) = energy(x,p)
+    x = ∇u
+    dx = ∇du
+    dfdx = ForwardDiff.hessian(f,x)
+    dfdx*dx
 end
 
 function residual_cells!(r,u_dofs,state)
 
+    flux = state.flux
     cell_to_dofs = state.dofs.cell_to_dofs
     cell_to_nodes = state.cell_isomap.face_to_nodes
     cell_to_rid = state.cell_isomap.face_to_rid
@@ -385,7 +418,7 @@ function residual_cells!(r,u_dofs,state)
             for k in 1:nl
                 dv = ste[k,iq]
                 ∇dv = ∇xe[k]
-                fe[k] += ( p_laplace_residual(∇u,∇dv,dv,fx,p) )*dV
+                fe[k] += ( ∇dv⋅flux(∇u,p) - fx*dv )*dV
             end
         end
         for i in 1:nl
@@ -399,6 +432,7 @@ end
 
 function jacobian_cells!(V_coo,u_dofs,state)
 
+    dflux = state.dflux
     cell_to_dofs = state.dofs.cell_to_dofs
     cell_to_nodes = state.cell_isomap.face_to_nodes
     cell_to_rid = state.cell_isomap.face_to_rid
@@ -474,7 +508,7 @@ function jacobian_cells!(V_coo,u_dofs,state)
                 ∇du = ∇xe[j]
                 for i in 1:nl
                     ∇dv = ∇xe[i]
-                    Ae[i,j] += ( p_laplace_jacobian(∇u,∇du,∇dv,p) )*dV
+                    Ae[i,j] += ( ∇dv⋅dflux(∇u,∇du,p) )*dV
                 end
             end
         end
diff --git a/extensions/GalerkinToolkitExamples/test/example003_tests.jl b/extensions/GalerkinToolkitExamples/test/example003_tests.jl
index e6c5527e..5e9dfbcc 100644
--- a/extensions/GalerkinToolkitExamples/test/example003_tests.jl
+++ b/extensions/GalerkinToolkitExamples/test/example003_tests.jl
@@ -13,6 +13,21 @@ results = Example003.main(params)
 @test results[:eh1] < tol
 @test results[:el2] < tol
 
+params[:autodiff] = :hand
+results = Example003.main(params)
+@test results[:eh1] < tol
+@test results[:el2] < tol
+
+params[:autodiff] = :flux
+results = Example003.main(params)
+@test results[:eh1] < tol
+@test results[:el2] < tol
+
+params[:autodiff] = :energy
+results = Example003.main(params)
+@test results[:eh1] < tol
+@test results[:el2] < tol
+
 params = Dict{Symbol,Any}()
 params[:mesh] = gk.cartesian_mesh((0,3,0,2),(5,10),complexify=false)
 params[:dirichlet_tags] = ["1-face-1","1-face-3","1-face-4"]
@@ -25,23 +40,41 @@ results = Example003.main(params)
 @test results[:el2] < tol
 @test results[:ncells] == 10*5
 
-params = Dict{Symbol,Any}()
+mesh = gk.cartesian_mesh((0,3,0,2,0,1),(5,5,5))
 timer = TimerOutput()
-@timeit timer "cartesian_mesh" params[:mesh] = gk.cartesian_mesh((0,3,0,2,0,1),(5,5,5))
 linear_solver = Example001.cg_amg_solver(;verbose=true)
-params[:solver] = Example003.nlsolve_solver(;timer,linear_solver,show_trace=true,method=:newton)
-params[:export_vtu] = false
-params[:timer] = timer
-Example003.main(params)
+solver = Example003.nlsolve_solver(;timer,linear_solver,show_trace=true,method=:newton)
 
 params = Dict{Symbol,Any}()
-timer = TimerOutput()
-@timeit timer "cartesian_mesh" params[:mesh] = gk.cartesian_mesh((0,3,0,2,0,1),(70,70,70))
-linear_solver = Example001.cg_amg_solver(;verbose=true)
-params[:solver] = Example003.nlsolve_solver(;timer,linear_solver,show_trace=true,method=:newton)
-params[:export_vtu] = false
 params[:timer] = timer
-params[:p] = 2
+params[:solver] = solver
+params[:mesh] = mesh
+params[:export_vtu] = false
+params[:autodiff] = :hand
+Example003.main(params)
+
+reset_timer!(timer)
+params[:autodiff] = :flux
+Example003.main(params)
+
+reset_timer!(timer)
+params[:autodiff] = :energy
 Example003.main(params)
 
+mesh = gk.cartesian_mesh((0,3,0,2,0,1),(70,70,70))
+reset_timer!(timer)
+params[:mesh] = mesh
+
+params[:autodiff] = :hand
+Example003.main(params)
+
+reset_timer!(timer)
+params[:autodiff] = :flux
+Example003.main(params)
+
+reset_timer!(timer)
+params[:autodiff] = :energy
+Example003.main(params)
+
+
 end # module

From 992f193653919609b188a060cbc9012fd8080ca8 Mon Sep 17 00:00:00 2001
From: Francesc Verdugo <f.verdugo.rojano@vu.nl>
Date: Fri, 9 Feb 2024 10:39:54 +0100
Subject: [PATCH 9/9] Depend on PartitionedArrays 0.4.2

---
 .../GalerkinToolkitExamples/src/example003.jl | 75 ++++++++++++++++++-
 1 file changed, 74 insertions(+), 1 deletion(-)

diff --git a/extensions/GalerkinToolkitExamples/src/example003.jl b/extensions/GalerkinToolkitExamples/src/example003.jl
index e606fe08..b5145137 100644
--- a/extensions/GalerkinToolkitExamples/src/example003.jl
+++ b/extensions/GalerkinToolkitExamples/src/example003.jl
@@ -6,7 +6,7 @@ using StaticArrays
 using LinearAlgebra
 using SparseArrays
 using WriteVTK
-using PartitionedArrays: JaggedArray, sparse_matrix, sparse_matrix!
+using PartitionedArrays: JaggedArray, val_parameter, nzindex
 using TimerOutputs
 using NLsolve
 using Random
@@ -696,4 +696,77 @@ function export_results(uh,params,state)
     nothing
 end
 
+# TODO move this to partitioned arrays
+struct FilteredCooVector{F,A,B,C,T} <: AbstractVector{T}
+    f::F
+    I::A
+    J::B
+    V::C
+    function FilteredCooVector(f::F,I::A,J::B,V::C) where {F,A,B,C}
+        T = eltype(C)
+        new{F,A,B,C,T}(f,I,J,V)
+    end
+end
+Base.size(a::FilteredCooVector) = size(a.V)
+Base.IndexStyle(::Type{<:FilteredCooVector}) = IndexLinear()
+Base.@propagate_inbounds function Base.getindex(a::FilteredCooVector,k::Int)
+    i = a.I[k]
+    j = a.J[k]
+    v = a.V[k]
+    if i < 1 || j < 1
+        return a.f(v)
+    end
+    v
+end
+
+function sparse_matrix(I,J,V,m,n;kwargs...)
+    sparse_matrix(sparse,I,J,V,m,n;kwargs...)
+end
+function sparse_matrix(f,I,J,V,m,n;reuse=Val(false),skip_out_of_bounds=true)
+    if !skip_out_of_bounds
+        I2 = I
+        J2 = J
+        V2 = V
+    elseif m*n == 0
+        Ti = eltype(I)
+        T = eltype(V)
+        I2 = Ti[]
+        J2 = Ti[]
+        V2 = Tv[]
+    else
+        I2 = FilteredCooVector(one,I,J,I)
+        J2 = FilteredCooVector(one,I,J,J)
+        V2 = FilteredCooVector(zero,I,J,V)
+    end
+    A = f(I2,J2,V2,m,n)
+    if val_parameter(reuse)
+        K = precompute_nzindex(A,I,J)
+        return A,K
+    end
+    A
+end
+
+function precompute_nzindex(A,I,J)
+    K = zeros(Int32,length(I))
+    for (p,(i,j)) in enumerate(zip(I,J))
+        if i < 1 || j < 1
+            continue
+        end
+        K[p] = nzindex(A,i,j)
+    end
+    K
+end
+
+function sparse_matrix!(A,V,K)
+    LinearAlgebra.fillstored!(A,0)
+    A_nz = nonzeros(A)
+    for (k,v) in zip(K,V)
+        if k < 1
+            continue
+        end
+        A_nz[k] += v
+    end
+    A
+end
+
 end # module