diff --git a/extensions/GalerkinToolkitExamples/Project.toml b/extensions/GalerkinToolkitExamples/Project.toml
index 94faabe8..d295d8ba 100644
--- a/extensions/GalerkinToolkitExamples/Project.toml
+++ b/extensions/GalerkinToolkitExamples/Project.toml
@@ -8,7 +8,9 @@ ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
 GalerkinToolkit = "5e3ba9c4-dd81-444d-b69a-0e7bd7bf60a4"
 IterativeSolvers = "42fd0dbc-a981-5370-80f2-aaf504508153"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195"
 PartitionedArrays = "5a9dfac6-5c52-46f7-8278-5e2210713be9"
+PetscCall = "1194c000-87c4-4102-b4a0-a6217ec4849e"
 Preconditioners = "af69fa37-3177-5a40-98ee-561f696e4fcd"
 SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
diff --git a/extensions/GalerkinToolkitExamples/src/example001.jl b/extensions/GalerkinToolkitExamples/src/example001.jl
index 2102326c..f58cb7dc 100644
--- a/extensions/GalerkinToolkitExamples/src/example001.jl
+++ b/extensions/GalerkinToolkitExamples/src/example001.jl
@@ -222,14 +222,14 @@ end
 
 function assemble_system(state)
     timer = state.timer
-    @timeit timer "assemble_sytem_coo" args = assemble_sytem_coo(state)
+    @timeit timer "assemble_system_coo" args = assemble_system_coo(state)
     I,J,V,b = args
     n_dofs = state.dofs.n_dofs
     @timeit timer "sparse" A = sparse(I,J,V,n_dofs,n_dofs)
     A,b
 end
 
-function assemble_sytem_coo(state)
+function assemble_system_coo(state)
     timer = state.timer
     @timeit timer "assemble_sybmolic" args = assemble_sybmolic(state)
     @timeit timer "assemble_numeric_cells!" assemble_numeric_cells!(args...,state)
@@ -548,211 +548,4 @@ function export_results(uh,params,state)
     nothing
 end
 
-### From here, it is old stuff to to be deleted
-
-function assemble_system_loop(state)
-
-    cell_to_nodes = state.cell_isomap.face_to_nodes
-    cell_to_rid = state.cell_isomap.face_to_rid
-    free_and_dirichlet_nodes = state.dirichlet_bcs.free_and_dirichlet_nodes
-    node_to_x = state.cell_isomap.node_to_coords
-    ∇s = state.cell_isomap.rid_to_shape_grads
-    s = state.cell_isomap.rid_to_shape_vals
-    u_dirichlet = state.dirichlet_bcs.u_dirichlet
-    f = state.user_funs.f
-    w = state.cell_integration.rid_to_weights
-    d = state.cell_integration.d
-
-    # Count coo entries
-    n_coo = 0
-    ncells = length(cell_to_nodes)
-    node_to_free_node = gk.permutation(free_and_dirichlet_nodes)
-    nfree = length(first(free_and_dirichlet_nodes))
-    for cell in 1:ncells
-        nodes = cell_to_nodes[cell]
-        for nj in nodes
-            if node_to_free_node[nj] > nfree
-                continue
-            end
-            for ni in nodes
-                if node_to_free_node[ni] > nfree
-                    continue
-                end
-                n_coo += 1
-            end
-        end
-    end
-
-    # Allocate coo values
-    I_coo = zeros(Int32,n_coo)
-    J_coo = zeros(Int32,n_coo)
-    V_coo = zeros(Float64,n_coo)
-    b = zeros(nfree)
-
-    # Allocate auxiliary buffers
-    ∇x = map(i->similar(i,size(i,2)),∇s)
-    Aes = map(i->zeros(size(i,2),size(i,2)),∇s)
-    fes = map(i->zeros(size(i,2)),∇s)
-    ues = map(i->zeros(size(i,2)),∇s)
-
-    Tx = SVector{d,Float64}
-    TJ = typeof(zero(Tx)*zero(Tx)')
-
-    # Fill coo values
-    i_coo = 0
-    for cell in 1:ncells
-        rid = cell_to_rid[cell]
-        nodes = cell_to_nodes[cell]
-        Ae = Aes[rid]
-        ue = ues[rid]
-        fe = fes[rid]
-        nl = length(nodes)
-        ∇se = ∇s[rid]
-        ∇xe = ∇x[rid]
-        se = s[rid]
-        we = w[rid]
-        nq = length(we)
-        fill!(Ae,zero(eltype(Ae)))
-        fill!(fe,zero(eltype(Ae)))
-        fill!(ue,zero(eltype(Ae)))
-        for k in 1:nl
-            nk = nodes[k]
-            gk = node_to_free_node[nk]
-            if  gk <= nfree
-                continue
-            end
-            ue[k] = u_dirichlet[gk-nfree]
-        end
-        for iq in 1:nq
-            Jt = zero(TJ) 
-            xint = zero(Tx)
-            for k in 1:nl
-                x = node_to_x[nodes[k]]
-                ∇sqx = ∇se[iq,k]
-                sqx = se[iq,k]
-                Jt += ∇sqx*x'
-                xint += sqx*x
-            end
-            detJt = det(Jt)
-            invJt = inv(Jt)
-            dV = abs(detJt)*we[iq]
-            for k in 1:nl
-                ∇xe[k] = invJt*∇se[iq,k]
-            end
-            for j in 1:nl
-                for i in 1:nl
-                    Ae[i,j] += ∇xe[i]⋅∇xe[j]*dV
-                end
-            end
-            fx = f(xint)
-            for k in 1:nl
-                fe[k] +=  fx*se[iq,k]*dV
-            end
-        end
-        for i in 1:nl
-            for j in 1:nl
-                fe[i] -= Ae[i,j]*ue[j]
-            end
-        end
-        for i in 1:nl
-            ni = nodes[i]
-            gi = node_to_free_node[ni]
-            if gi > nfree
-                continue
-            end
-            b[gi] += fe[i]
-        end
-        for j in 1:nl
-            nj = nodes[j]
-            gj = node_to_free_node[nj]
-            if  gj > nfree
-                continue
-            end
-            for i in 1:nl
-                ni = nodes[i]
-                gi = node_to_free_node[ni]
-                if gi > nfree
-                    continue
-                end
-                i_coo += 1
-                I_coo[i_coo] = gi
-                J_coo[i_coo] = gj
-                V_coo[i_coo] = Ae[i,j]
-            end
-        end
-    end
-    add_neumann_bcs!(b,state)
-    I_coo,J_coo,V_coo,b
-end
-
-function add_neumann_bcs!(b,state)
-
-    neum_face_to_face = state.neumann_bcs.neum_face_to_face
-    if length(neum_face_to_face) == 0
-        return nothing
-    end
-    face_to_rid = state.face_isomap.face_to_rid
-    face_to_nodes = state.face_isomap.face_to_nodes
-    ∇s_f = state.face_isomap.rid_to_shape_grads
-    s_f = state.face_isomap.rid_to_shape_vals
-    node_to_x = state.face_isomap.node_to_coords
-    free_and_dirichlet_nodes = state.dirichlet_bcs.free_and_dirichlet_nodes
-    g = state.user_funs.g
-    w_f = state.face_integration.rid_to_weights
-    d = state.cell_integration.d
-
-    fes = map(i->zeros(size(i,2)),s_f)
-    Tx = SVector{d,Float64}
-    Ts = typeof(zero(SVector{d-1,Float64})*zero(Tx)')
-    node_to_free_node = gk.permutation(free_and_dirichlet_nodes)
-    nfree = length(first(free_and_dirichlet_nodes))
-
-    for face in neum_face_to_face
-        rid = face_to_rid[face]
-        nodes = face_to_nodes[face]
-        ∇se = ∇s_f[rid]
-        se = s_f[rid]
-        we = w_f[rid]
-        nq = length(we)
-        fe = fes[rid]
-        fill!(fe,zero(eltype(fe)))
-        nl = length(nodes)
-        for iq in 1:nq
-            Jt = zero(Ts) 
-            xint = zero(Tx)
-            for k in 1:nl
-                x = node_to_x[nodes[k]]
-                ∇sqx = ∇se[iq,k]
-                sqx = se[iq,k]
-                Jt += ∇sqx*x'
-                xint += sqx*x
-            end
-            J = transpose(Jt)
-            dS = sqrt(det(Jt*J))*we[iq]
-            gx = g(xint)
-            for k in 1:nl
-                fe[k] +=  gx*se[iq,k]*dS
-            end
-        end
-        for i in 1:nl
-            ni = nodes[i]
-            gi = node_to_free_node[ni]
-            if gi > nfree
-                continue
-            end
-            b[gi] += fe[i]
-        end
-    end
-
-    nothing
-end
-
-function assemble_system_old(state)
-    I,J,V,b = assemble_system_loop(state)
-    nfree = length(b)
-    A = sparse(I,J,V,nfree,nfree)
-    A,b
-end
-
-
 end # module
diff --git a/extensions/GalerkinToolkitExamples/src/example002.jl b/extensions/GalerkinToolkitExamples/src/example002.jl
index 9b816805..02b31e53 100644
--- a/extensions/GalerkinToolkitExamples/src/example002.jl
+++ b/extensions/GalerkinToolkitExamples/src/example002.jl
@@ -1,7 +1,6 @@
 module Example002
 
 import GalerkinToolkit as gk
-import GalerkinToolkitExamples: Example001
 import ForwardDiff
 using StaticArrays
 using LinearAlgebra
@@ -9,30 +8,36 @@ using SparseArrays
 using WriteVTK
 using PartitionedArrays
 using TimerOutputs
+using PetscCall
 
 function main(params_in)
 
-    timer = TimerOutput()
-
     # Dict to collect results
     results = Dict{Symbol,Any}()
 
     # Process params
     params_default = default_params()
     params = add_default_params(params_in,params_default)
+    parts = linear_indices(partition(params[:mesh]))
+
+    timer = TimerOutput()
 
     # Setup main data structures
-    state = setup(params,timer)
+    @timeit timer "setup"  state = setup(params,timer)
     add_basic_info(results,params,state)
 
     # Assemble system and solve it
-    A,b = assemble_system(state)
-    x = solve_system(A,b,params)
+    @timeit timer "assemble_system" A,b = assemble_system(state)
+    @timeit timer "solve_system" x = solve_system(A,b,params,state)
 
     # Post process
-    uh = setup_uh(x,state)
-    integrate_error_norms(results,uh,state)
-    export_results(uh,params,state)
+    @timeit timer "setup_uh"  uh = setup_uh(x,state)
+    @timeit timer "integrate_error_norms" integrate_error_norms(results,uh,state)
+    @timeit timer "export_results"  export_results(uh,params,state)
+
+    map_main(state.local_states) do local_state
+        display(local_state.timer)
+    end
 
     results
 end
@@ -58,20 +63,65 @@ function default_params()
     ghost_layers = 0
     mesh = gk.cartesian_mesh(domain,cells_per_dir,parts_per_dir;parts,ghost_layers)
     outdir = mkpath(joinpath(@__DIR__,"..","output"))
-    params = Example001.default_params()
+    params = Dict{Symbol,Any}()
+    params[:u] = (x) -> sum(x)
+    params[:f] = (x) -> 0.0
+    params[:g] = (x) -> 0.0
+    params[:dirichlet_tags] = ["boundary"]
+    params[:neumann_tags] = String[]
+    params[:integration_degree] = 2
+    params[:solver] = lu_solver()
+    params[:export_vtu] = true
     params[:mesh] = mesh
     params[:example_path] = joinpath(outdir,"example002")
     params
 end
 
+# Not a parallel solver
+# just for debugging purposes
+function lu_solver()
+    setup(x,A,b) = lu(A)
+    setup! = lu!
+    solve! = ldiv!
+    finalize!(S) = nothing
+    (;setup,setup!,solve!,finalize!)
+end
+
+function ksp_solver()
+    setup = PetscCall.ksp_setup
+    setup! = PetscCall.ksp_setup!
+    solve! = PetscCall.ksp_solve!
+    finalize! = PetscCall.ksp_finalize!
+    (;setup,setup!,solve!,finalize!)
+end
+
 function setup(params,timer)
     mesh = params[:mesh]
-    local_states = map(partition(mesh)) do mesh
+    local_states_0 = map(partition(mesh)) do mesh
         local_params = copy(params)
         local_params[:mesh] = mesh
-        Example001.setup(local_params,timer)
+        local_setup(local_params,timer)
     end
+    @timeit timer "setup_dofs" dofs,local_dofs = setup_dofs(params,local_states_0)
+    local_states = map((a,dofs)->(;dofs,a...),local_states_0,local_dofs)
+    state = (;local_states,dofs,timer)
+    state
+end
 
+function local_setup(params,timer)
+    @timeit timer "dirichlet_bcs" dirichlet_bcs = setup_dirichlet_bcs(params)
+    @timeit timer "neumann_bcs" neumann_bcs = setup_neumann_bcs(params)
+    @timeit timer "cell_integration" cell_integration = setup_integration(params,:cells)
+    @timeit timer "face_integration" face_integration = setup_integration(params,:faces)
+    @timeit timer "cell_isomap" cell_isomap = setup_isomap(params,cell_integration)
+    @timeit timer "face_isomap" face_isomap = setup_isomap(params,face_integration)
+    @timeit timer "user_funs" user_funs = setup_user_funs(params)
+    solver = params[:solver]
+    state = (;timer,solver,dirichlet_bcs,neumann_bcs,cell_integration,cell_isomap,face_integration,face_isomap,user_funs)
+end
+
+function setup_dofs(params,local_states)
+    mesh = params[:mesh]
     node_partition = gk.node_partition(mesh)
     global_node_to_mask = pfill(false,node_partition)
     function fillmask!(node_to_mask,state)
@@ -81,13 +131,108 @@ function setup(params,timer)
     map(fillmask!,partition(global_node_to_mask),local_states)
     global_dof_to_node, global_node_to_dof = find_local_indices(global_node_to_mask)
     dof_partition = partition(axes(global_dof_to_node,1))
-    state = (;local_states,global_dof_to_node, global_node_to_dof)
-    state
+    function setup_local_dofs(state,node_to_dof)
+        free_and_dirichlet_nodes = state.dirichlet_bcs.free_and_dirichlet_nodes
+        node_to_free_node = gk.permutation(free_and_dirichlet_nodes)
+        n_free = length(first(free_and_dirichlet_nodes))
+        n_dofs = n_free
+        cell_to_nodes = state.cell_isomap.face_to_nodes
+        face_to_nodes = state.face_isomap.face_to_nodes
+        function map_node_to_dof(node)
+            free_node = node_to_free_node[node]
+            if free_node > n_free
+                return Int(-(free_node-n_free))
+            end
+            dof = node_to_dof[node]
+            Int(dof)
+        end
+        cell_to_dofs_data = map_node_to_dof.(cell_to_nodes.data)
+        face_to_dofs_data = map_node_to_dof.(face_to_nodes.data)
+        cell_to_dofs = JaggedArray(cell_to_dofs_data,cell_to_nodes.ptrs)
+        face_to_dofs = JaggedArray(face_to_dofs_data,face_to_nodes.ptrs)
+        (;cell_to_dofs,face_to_dofs,n_dofs)
+    end
+    local_dofs = map(setup_local_dofs,local_states,partition(global_node_to_dof))
+    (;dof_partition,global_node_to_dof,global_dof_to_node), local_dofs
+end
+
+function setup_dirichlet_bcs(params)
+    mesh = params[:mesh]
+    u = params[:u]
+    dirichlet_tags = params[:dirichlet_tags]
+    node_to_tag = zeros(gk.num_nodes(mesh))
+    tag_to_name = dirichlet_tags
+    gk.classify_mesh_nodes!(node_to_tag,mesh,tag_to_name)
+    free_and_dirichlet_nodes = gk.partition_from_mask(i->i==0,node_to_tag)
+    node_to_x = gk.node_coordinates(mesh)
+    dirichlet_nodes = last(free_and_dirichlet_nodes)
+    x_dirichlet = view(node_to_x,dirichlet_nodes)
+    u_dirichlet = u.(x_dirichlet)
+    dirichlet_bcs = (;free_and_dirichlet_nodes,u_dirichlet,node_to_tag)
+end
+
+function setup_integration(params,objects)
+    mesh = params[:mesh]
+    degree = params[:integration_degree]
+    D = gk.num_dims(mesh)
+    if objects === :cells
+        d = D
+    elseif objects === :faces
+        d = D-1
+    else
+        error("")
+    end
+    # Integration
+    ref_cells = gk.reference_faces(mesh,d)
+    face_to_rid = gk.face_reference_id(mesh,d)
+    integration_rules = map(ref_cells) do ref_cell
+        gk.default_quadrature(gk.geometry(ref_cell),degree)
+    end
+    rid_to_weights = map(gk.weights,integration_rules)
+    rid_to_coords = map(gk.coordinates,integration_rules)
+    integration = (;rid_to_weights,rid_to_coords,face_to_rid,d)
+end
+
+function setup_isomap(params,integration)
+    rid_to_coords = integration.rid_to_coords
+    face_to_rid = integration.face_to_rid
+    d = integration.d
+    mesh = params[:mesh]
+    ref_cells = gk.reference_faces(mesh,d)
+    shape_funs = map(rid_to_coords,ref_cells) do q,ref_cell
+        shape_vals = gk.tabulator(ref_cell)(gk.value,q)
+        shape_grads = gk.tabulator(ref_cell)(ForwardDiff.gradient,q)
+        shape_vals, shape_grads
+    end
+    rid_to_shape_vals = map(first,shape_funs)
+    rid_to_shape_grads = map(last,shape_funs)
+    face_to_nodes = JaggedArray(gk.face_nodes(mesh,d))
+    node_to_coords = gk.node_coordinates(mesh)
+    isomap = (;face_to_nodes,node_to_coords,face_to_rid,rid_to_shape_vals,rid_to_shape_grads,d)
+end
+
+function setup_neumann_bcs(params)
+    mesh = params[:mesh]
+    neumann_tags = params[:neumann_tags]
+    neum_face_to_face = Int[]
+    if length(neumann_tags) != 0
+        D = gk.num_dims(mesh)
+        tag_to_groups = gk.physical_faces(mesh,D-1)
+        neum_face_to_face = reduce(union,map(tag->tag_to_groups[tag],neumann_tags))
+    end
+    (;neum_face_to_face)
+end
+
+function setup_user_funs(params)
+    u = params[:u]
+    f = params[:f]
+    g = params[:g]
+    user_funs = (;u,f,g)
 end
 
 function add_basic_info(results,params,state)
     mesh = params[:mesh]
-    nfree = length(state.global_dof_to_node)
+    nfree = length(state.dofs.global_dof_to_node)
     nnodes = gk.num_nodes(mesh)
     ncells = gk.num_faces(mesh,gk.num_dims(mesh))
     results[:nfree] = nfree
@@ -97,70 +242,255 @@ function add_basic_info(results,params,state)
 end
 
 function assemble_system(state)
-    dof_partition = partition(axes(state.global_dof_to_node,1))
-    b = pzeros(Float64,dof_partition)
-    I,J,V,free_node_to_b = map(Example001.assemble_system_loop,state.local_states) |> tuple_of_arrays
-    function map_dofs(state,node_to_dof,dofs,I,J,free_node_to_b,dof_to_b)
-        free_node_to_node = first(state.dirichlet_bcs.free_and_dirichlet_nodes)
-        dof_to_global_dof = local_to_global(dofs)
-        function free_node_to_global_dof(free_node)
-            # TODO this free_node -> dof should not be needed of the order free dofs conveniently
-            node = free_node_to_node[free_node]
-            dof = node_to_dof[node]
-            global_dof = dof_to_global_dof[dof]
-            global_dof
-        end
-        I .= free_node_to_global_dof.(I)
-        J .= free_node_to_global_dof.(J)
-        free_node_to_dof = view(node_to_dof,free_node_to_node)
-        # TODO this one should not be needed of the order free dofs conveniently
-        dof_to_b[free_node_to_dof] = free_node_to_b
-        nothing
-    end
-    map(map_dofs,state.local_states,partition(state.global_node_to_dof),dof_partition,I,J,free_node_to_b,partition(b))
-    A = psparse(I,J,V,dof_partition,dof_partition;assemble=false) |> fetch
-    # TODO
-    A = assemble(A) |> fetch
-    b = assemble(b,partition(axes(A,1))) |> fetch
+    timer = state.timer
+    dof_partition = partition(axes(state.dofs.global_dof_to_node,1))
+    @timeit timer "assemble_system_coo" I,J,V,b_partition = map(assemble_system_coo,state.local_states) |> tuple_of_arrays
+    @timeit timer "psparse" A = psparse(I,J,V,dof_partition,dof_partition;subassembled=true,indices=:local) |> fetch
+    row_partition = partition(axes(A,1))
+    b = PVector(b_partition,dof_partition)
+    @timeit timer "assemble_b" b = assemble(b,row_partition) |> fetch
     A,b
 end
 
-function solve_system(A,b,params)
+function assemble_system_coo(state)
+    timer = state.timer
+    @timeit timer "assemble_sybmolic" args = assemble_sybmolic(state)
+    @timeit timer "assemble_numeric_cells!" assemble_numeric_cells!(args...,state)
+    @timeit timer "assemble_numeric_faces!" assemble_numeric_faces!(args...,state)
+    args
+end
+
+function assemble_sybmolic(state)
+    cell_to_dofs = state.dofs.cell_to_dofs
+
+    n_coo = 0
+    ncells = length(cell_to_dofs)
+    for cell in 1:ncells
+        dofs = cell_to_dofs[cell]
+        ndofs = length(dofs)
+        for i in 1:ndofs
+            if !(dofs[i]>0)
+                continue
+            end
+            for j in 1:ndofs
+                if !(dofs[j]>0)
+                    continue
+                end
+                n_coo += 1
+            end
+        end
+    end
+
+    I_coo = Vector{Int32}(undef,n_coo)
+    J_coo = Vector{Int32}(undef,n_coo)
+    V_coo = Vector{Float64}(undef,n_coo)
+    b = zeros(Float64,state.dofs.n_dofs)
+
+    n_coo = 0
+    for cell in 1:ncells
+        dofs = cell_to_dofs[cell]
+        ndofs = length(dofs)
+        for i in 1:ndofs
+            if !(dofs[i]>0)
+                continue
+            end
+            dofs_i = dofs[i]
+            for j in 1:ndofs
+                if !(dofs[j]>0)
+                    continue
+                end
+                n_coo += 1
+                I_coo[n_coo] = dofs_i
+                J_coo[n_coo] = dofs[j]
+            end
+        end
+    end
+
+    I_coo,J_coo,V_coo,b
+end
+
+function assemble_numeric_cells!(I_coo,J_coo,V_coo,b,state)
+
+    cell_to_dofs = state.dofs.cell_to_dofs
+    cell_to_nodes = state.cell_isomap.face_to_nodes
+    cell_to_rid = state.cell_isomap.face_to_rid
+    free_and_dirichlet_nodes = state.dirichlet_bcs.free_and_dirichlet_nodes
+    node_to_x = state.cell_isomap.node_to_coords
+    ∇s = state.cell_isomap.rid_to_shape_grads
+    s = state.cell_isomap.rid_to_shape_vals
+    u_dirichlet = state.dirichlet_bcs.u_dirichlet
+    f = state.user_funs.f
+    w = state.cell_integration.rid_to_weights
+    d = state.cell_integration.d
+    u_dirichlet = state.dirichlet_bcs.u_dirichlet
+    ncells = length(cell_to_nodes)
+
+    # Allocate auxiliary buffers
+    ∇x = map(i->similar(i,size(i,2)),∇s)
+    Aes = map(i->zeros(size(i,2),size(i,2)),∇s)
+    fes = map(i->zeros(size(i,2)),∇s)
+    ∇st = map(m->collect(permutedims(m)),∇s)
+    st = map(m->collect(permutedims(m)),s)
+
+    Tx = eltype(node_to_x)
+    TJ = typeof(zero(Tx)*zero(Tx)')
+
+    i_coo = 0
+    for cell in 1:ncells
+        rid = cell_to_rid[cell]
+        nodes = cell_to_nodes[cell]
+        dofs = cell_to_dofs[cell]
+        Ae = Aes[rid]
+        fe = fes[rid]
+        nl = length(nodes)
+        ∇ste = ∇st[rid]
+        ∇xe = ∇x[rid]
+        ste = st[rid]
+        we = w[rid]
+        nq = length(we)
+        fill!(Ae,zero(eltype(Ae)))
+        fill!(fe,zero(eltype(Ae)))
+        # Integrate
+        for iq in 1:nq
+            Jt = zero(TJ) 
+            xint = zero(Tx)
+            for k in 1:nl
+                x = node_to_x[nodes[k]]
+                ∇sqx = ∇ste[k,iq]
+                sqx = ste[k,iq]
+                Jt += ∇sqx*x'
+                xint += sqx*x
+            end
+            detJt = det(Jt)
+            invJt = inv(Jt)
+            dV = abs(detJt)*we[iq]
+            for k in 1:nl
+                ∇xe[k] = invJt*∇ste[k,iq]
+            end
+            for j in 1:nl
+                ∇xej = ∇xe[j]
+                for i in 1:nl
+                    Ae[i,j] += ∇xe[i]⋅∇xej*dV
+                end
+            end
+            fx = f(xint)
+            for k in 1:nl
+                fe[k] +=  fx*ste[k,iq]*dV
+            end
+        end
+        # Set the result in the output array
+        for i in 1:nl
+            if !(dofs[i]>0)
+                continue
+            end
+            b[dofs[i]] += fe[i]
+            for j in 1:nl
+                if !(dofs[j]>0)
+                    uj = u_dirichlet[-dofs[j]]
+                    b[dofs[i]] -= Ae[i,j]*uj
+                    continue
+                end
+                i_coo += 1
+                V_coo[i_coo] = Ae[i,j]
+            end
+        end
+    end
+
+end
+
+function assemble_numeric_faces!(I_coo,J_coo,V_coo,b,state)
+
+    neum_face_to_face = state.neumann_bcs.neum_face_to_face
+    if length(neum_face_to_face) == 0
+        return nothing
+    end
+
+    face_to_rid = state.face_isomap.face_to_rid
+    face_to_dofs = state.dofs.face_to_dofs
+    face_to_nodes = state.face_isomap.face_to_nodes
+    ∇s_f = state.face_isomap.rid_to_shape_grads
+    s_f = state.face_isomap.rid_to_shape_vals
+    node_to_x = state.face_isomap.node_to_coords
+    g = state.user_funs.g
+    w_f = state.face_integration.rid_to_weights
+    d = state.cell_integration.d
+
+    fes = map(i->zeros(size(i,2)),s_f)
+    Tx = SVector{d,Float64}
+    Ts = typeof(zero(SVector{d-1,Float64})*zero(Tx)')
+
+    for face in neum_face_to_face
+        rid = face_to_rid[face]
+        nodes = face_to_nodes[face]
+        dofs = face_to_dofs[face]
+        ∇se = ∇s_f[rid]
+        se = s_f[rid]
+        we = w_f[rid]
+        nq = length(we)
+        fe = fes[rid]
+        fill!(fe,zero(eltype(fe)))
+        nl = length(nodes)
+        for iq in 1:nq
+            Jt = zero(Ts) 
+            xint = zero(Tx)
+            for k in 1:nl
+                x = node_to_x[nodes[k]]
+                ∇sqx = ∇se[iq,k]
+                sqx = se[iq,k]
+                Jt += ∇sqx*x'
+                xint += sqx*x
+            end
+            J = transpose(Jt)
+            dS = sqrt(det(Jt*J))*we[iq]
+            gx = g(xint)
+            for k in 1:nl
+                fe[k] +=  gx*se[iq,k]*dS
+            end
+        end
+        for i in 1:nl
+            if !(dofs[i]>0)
+                continue
+            end
+            b[dofs[i]] += fe[i]
+        end
+    end
+end
+
+function solve_system(A,b,params,state)
+    timer = state.timer
     solver = params[:solver]
     x = similar(b,axes(A,2))
-    setup = solver.setup(x,A,b)
-    solver.solve!(x,setup,b)
-    solver.finalize!(setup)
+    @timeit timer "solver.setup" setup = solver.setup(x,A,b)
+    @timeit timer "solver.solve!" solver.solve!(x,setup,b)
+    @timeit timer "solver.finalize!" solver.finalize!(setup)
     x
 end
 
 function setup_uh(x,state)
-    dofs = axes(state.global_dof_to_node,1)
+    dofs = axes(state.dofs.global_dof_to_node,1)
     local_states = state.local_states
-
-    # TODO this similar can be avoided when using
-    # a sub-assembled system
     global_dof_to_x = similar(x,dofs)
     global_dof_to_x .= x
     consistent!(global_dof_to_x) |> wait
-
-    # TODO this map can be avoided
-    # when the local numeration of free nodes coincides
-    # with local dofs.
-    function map_dofs(dof_to_x,node_to_dof,state)
-        free_node_to_node = first(state.dirichlet_bcs.free_and_dirichlet_nodes)
-        free_node_to_dof = view(node_to_dof,free_node_to_node)
-        view(dof_to_x,free_node_to_dof)
+    function setup_local_uh(dof_to_x,state,node_to_dof)
+        free_and_dirichlet_nodes = state.dirichlet_bcs.free_and_dirichlet_nodes
+        node_to_x = state.cell_isomap.node_to_coords
+        free_node_to_node = first(free_and_dirichlet_nodes)
+        @views u_free = dof_to_x[node_to_dof[free_node_to_node]]
+        u_dirichlet = state.dirichlet_bcs.u_dirichlet
+        nnodes = length(node_to_x)
+        local_uh = zeros(nnodes)
+        local_uh[first(free_and_dirichlet_nodes)] = u_free
+        local_uh[last(free_and_dirichlet_nodes)] = u_dirichlet
+        local_uh
     end
-    free_node_to_x = map(map_dofs,partition(global_dof_to_x),partition(state.global_node_to_dof),state.local_states)
-
-    uh = map(Example001.setup_uh,free_node_to_x,local_states)
+    uh = map(setup_local_uh,partition(global_dof_to_x),local_states,partition(state.dofs.global_node_to_dof))
     uh
 end
 
 function integrate_error_norms(results,uh,state)
     local_states = state.local_states
-    eh1², el2² = map(Example001.integrate_error_norms_loop,uh,local_states) |> tuple_of_arrays
+    eh1², el2² = map(integrate_error_norms_loop,uh,local_states) |> tuple_of_arrays
     eh1 = sqrt(sum(eh1²))
     el2 = sqrt(sum(el2²))
     results[:eh1] = eh1
@@ -168,7 +498,78 @@ function integrate_error_norms(results,uh,state)
     results
 end
 
+function integrate_error_norms_loop(uh,state)
+
+    cell_to_nodes = state.cell_isomap.face_to_nodes
+    cell_to_rid = state.cell_isomap.face_to_rid
+    free_and_dirichlet_nodes = state.dirichlet_bcs.free_and_dirichlet_nodes
+    node_to_x = state.cell_isomap.node_to_coords
+    ∇s = state.cell_isomap.rid_to_shape_grads
+    s = state.cell_isomap.rid_to_shape_vals
+    u_dirichlet = state.dirichlet_bcs.u_dirichlet
+    u = state.user_funs.u
+    w = state.cell_integration.rid_to_weights
+    d = state.cell_integration.d
+
+    eh1 = 0.0
+    el2 = 0.0
+    ncells = length(cell_to_rid)
+    ues = map(i->zeros(size(i,2)),∇s)
+    ∇x = map(i->similar(i,size(i,2)),∇s)
+    Tx = eltype(node_to_x)
+    TJ = typeof(zero(Tx)*zero(Tx)')
+    for cell in 1:ncells
+        rid = cell_to_rid[cell]
+        nodes = cell_to_nodes[cell]
+        ue = ues[rid]
+        nl = length(nodes)
+        ∇se = ∇s[rid]
+        ∇xe = ∇x[rid]
+        se = s[rid]
+        we = w[rid]
+        nq = length(we)
+        for k in 1:nl
+            nk = nodes[k]
+            ue[k] = uh[nk]
+        end
+        for iq in 1:nq
+            Jt = zero(TJ) 
+            xint = zero(Tx)
+            for k in 1:nl
+                x = node_to_x[nodes[k]]
+                ∇sqx = ∇se[iq,k]
+                sqx = se[iq,k]
+                Jt += ∇sqx*x'
+                xint += sqx*x
+            end
+            detJt = det(Jt)
+            invJt = inv(Jt)
+            dV = abs(detJt)*we[iq]
+            for k in 1:nl
+                ∇xe[k] = invJt*∇se[iq,k]
+            end
+            ux = u(xint)
+            ∇ux = ForwardDiff.gradient(u,xint)
+            ∇uhx = zero(∇ux)
+            uhx = zero(ux)
+            for k in 1:nl
+                uek = ue[k]
+                ∇uhx += uek*∇xe[k]
+                uhx += uek*se[iq,k]
+            end
+            ∇ex = ∇ux - ∇uhx
+            ex =  ux - uhx
+            eh1 += (∇ex⋅∇ex + ex*ex)*dV
+            el2 += ex*ex*dV
+        end
+    end
+    eh1, el2
+end
+
 function export_results(uh,params,state)
+    if ! params[:export_vtu]
+        return nothing
+    end
     example_path = params[:example_path]
     mesh = params[:mesh]
     ranks = linear_indices(gk.node_partition(mesh))
diff --git a/extensions/GalerkinToolkitExamples/test/example002_defs.jl b/extensions/GalerkinToolkitExamples/test/example002_defs.jl
new file mode 100644
index 00000000..4e6eb1c5
--- /dev/null
+++ b/extensions/GalerkinToolkitExamples/test/example002_defs.jl
@@ -0,0 +1,39 @@
+
+import GalerkinToolkit as gk
+using GalerkinToolkitExamples: Example002
+using Test
+using PartitionedArrays
+using PetscCall
+
+function example002_tests_np_4(distribute)
+    tol = 1.0e-8
+    params = Dict{Symbol,Any}()
+    domain = (0,10,0,10)
+    cells_per_dir = (20,20)
+    parts_per_dir = (2,2)
+    np = prod(parts_per_dir)
+    parts = distribute(LinearIndices((np,)))
+    ghost_layers = 0
+    mesh = gk.cartesian_mesh(domain,cells_per_dir,parts_per_dir;parts,ghost_layers)
+    params[:mesh] = mesh
+    results = Example002.main(params)
+    results = Example002.main(params)
+    @test results[:eh1] < tol
+    @test results[:el2] < tol
+
+    options = "-pc_type gamg -ksp_type cg -ksp_error_if_not_converged true -ksp_converged_reason -ksp_rtol 1.0e-6"
+    PetscCall.init(args=split(options))
+    params = Dict{Symbol,Any}()
+    domain = (0,10,0,10,0,10)
+    cells_per_dir = (40,40,40)
+    parts_per_dir = (2,2,1)
+    np = prod(parts_per_dir)
+    parts = distribute(LinearIndices((np,)))
+    ghost_layers = 0
+    mesh = gk.cartesian_mesh(domain,cells_per_dir,parts_per_dir;parts,ghost_layers)
+    params[:mesh] = mesh
+    params[:export_vtu] = false
+    params[:solver] = Example002.ksp_solver()
+    results = Example002.main(params)
+    results = Example002.main(params)
+end
diff --git a/extensions/GalerkinToolkitExamples/test/example002_tests.jl b/extensions/GalerkinToolkitExamples/test/example002_tests.jl
index 27149066..c6cc72cb 100644
--- a/extensions/GalerkinToolkitExamples/test/example002_tests.jl
+++ b/extensions/GalerkinToolkitExamples/test/example002_tests.jl
@@ -1,28 +1,6 @@
 module Example002Tests
 
-import GalerkinToolkit as gk
-using GalerkinToolkitExamples: Example002
-using Test
-using PartitionedArrays
-
-tol = 1.0e-10
-
-params = Dict{Symbol,Any}()
-results = Example002.main(params)
-@test results[:eh1] < tol
-@test results[:el2] < tol
-
-domain = (0,10,0,10)
-cells_per_dir = (20,20)
-parts_per_dir = (2,2)
-np = prod(parts_per_dir)
-parts = DebugArray(LinearIndices((np,)))
-ghost_layers = 0
-mesh = gk.cartesian_mesh(domain,cells_per_dir,parts_per_dir;parts,ghost_layers)
-
-params = Dict{Symbol,Any}()
-results = Example002.main(params)
-@test results[:eh1] < tol
-@test results[:el2] < tol
+include("example002_defs.jl")
+with_debug(example002_tests_np_4)
 
 end # module
diff --git a/extensions/GalerkinToolkitExamples/test/mpi_array/example002_driver_np_4.jl b/extensions/GalerkinToolkitExamples/test/mpi_array/example002_driver_np_4.jl
new file mode 100644
index 00000000..e29ef5c9
--- /dev/null
+++ b/extensions/GalerkinToolkitExamples/test/mpi_array/example002_driver_np_4.jl
@@ -0,0 +1,8 @@
+module TMP
+
+using PartitionedArrays
+
+include(joinpath("..","example002_defs.jl"))
+with_mpi(example002_tests_np_4)
+
+end # module
diff --git a/extensions/GalerkinToolkitExamples/test/mpi_array/example002_tests.jl b/extensions/GalerkinToolkitExamples/test/mpi_array/example002_tests.jl
new file mode 100644
index 00000000..897646a6
--- /dev/null
+++ b/extensions/GalerkinToolkitExamples/test/mpi_array/example002_tests.jl
@@ -0,0 +1,11 @@
+module Example002Tests
+
+using MPI
+
+include("run_mpi_driver.jl")
+
+file = joinpath(@__DIR__,"example002_driver_np_4.jl")
+run_mpi_driver(file;procs=4)
+
+end # module
+
diff --git a/extensions/GalerkinToolkitExamples/test/mpi_array/run_mpi_driver.jl b/extensions/GalerkinToolkitExamples/test/mpi_array/run_mpi_driver.jl
new file mode 100644
index 00000000..0d85dcaf
--- /dev/null
+++ b/extensions/GalerkinToolkitExamples/test/mpi_array/run_mpi_driver.jl
@@ -0,0 +1,15 @@
+using MPI
+using Test
+function run_mpi_driver(file;procs)
+  repodir = normpath(joinpath(@__DIR__,"..",".."))
+  mpiexec() do cmd
+    if MPI.MPI_LIBRARY == "OpenMPI" || (isdefined(MPI, :OpenMPI) && MPI.MPI_LIBRARY == MPI.OpenMPI)
+      run(`$cmd -n $procs --oversubscribe $(Base.julia_cmd()) --project=$repodir $(file)`)
+    else
+      run(`$cmd -n $procs $(Base.julia_cmd()) --project=$repodir $(file)`)
+    end
+    # This line will be reached if and only if the command launched by `run` runs without errors.
+    # Then, if we arrive here, the test has succeeded.
+    @test true  
+  end
+end
diff --git a/extensions/GalerkinToolkitExamples/test/mpi_array/runtests.jl b/extensions/GalerkinToolkitExamples/test/mpi_array/runtests.jl
new file mode 100644
index 00000000..c7602f63
--- /dev/null
+++ b/extensions/GalerkinToolkitExamples/test/mpi_array/runtests.jl
@@ -0,0 +1,9 @@
+
+module MPIArrayRunTests
+
+using Test
+using PartitionedArrays
+
+@testset "example002" begin include("example002_tests.jl") end
+
+end #module
diff --git a/extensions/GalerkinToolkitExamples/test/runtests.jl b/extensions/GalerkinToolkitExamples/test/runtests.jl
index b448524c..c7d1e6a4 100644
--- a/extensions/GalerkinToolkitExamples/test/runtests.jl
+++ b/extensions/GalerkinToolkitExamples/test/runtests.jl
@@ -7,4 +7,6 @@ using Test
     @testset "example002" begin include("example002_tests.jl") end
 end
 
+include(joinpath("mpi_array","runtests.jl"))
+
 end # module