First round of reviews
JordiManyer committed Aug 21, 2024
1 parent 1c79863 commit afe9e61
Showing 3 changed files with 40 additions and 154 deletions.
20 changes: 0 additions & 20 deletions joss_paper/demo.pvtu

This file was deleted.

65 changes: 35 additions & 30 deletions joss_paper/paper.bib
@@ -13,11 +13,11 @@ @techreport{petsc-user-ref
}

@manual{mpi40,
author = "{Message Passing Interface Forum}",
title = "{MPI}: A Message-Passing Interface Standard Version 4.0",
url = "https://www.mpi-forum.org/docs/mpi-4.0/mpi40-report.pdf",
year = 2021,
month = jun
author = "{Message Passing Interface Forum}",
title = "{MPI}: A Message-Passing Interface Standard Version 4.0",
url = "https://www.mpi-forum.org/docs/mpi-4.0/mpi40-report.pdf",
year = 2021,
month = jun
}

@article{Verdugo2021,
@@ -61,7 +61,7 @@ @misc{parrays
}

@article{Bezanson2017,
abstract = {Bridging cultures that have often been distant, Julia combines expertise from the diverse fields of computer science and computational science to create a new approach to numerical computing. Julia...},
abstract = {Bridging cultures that have often been distant, Julia combines expertise from the diverse fields of computer science and computational science to create a new approach to numerical computing. {J}ulia...},
archivePrefix = {arXiv},
arxivId = {1411.1607},
author = {Bezanson, Jeff and Edelman, Alan and Karpinski, Stefan and Shah, Viral B.},
@@ -74,7 +74,7 @@ @article{Bezanson2017
number = {1},
pages = {65--98},
publisher = {Society for Industrial and Applied Mathematics},
title = {{Julia: a fresh approach to numerical computing}},
title = {{J}ulia: a fresh approach to numerical computing},
volume = {59},
year = {2017}
}
@@ -89,7 +89,7 @@ @article{Badia2020
number = {52},
pages = {2520},
publisher = {The Open Journal},
title = {{Gridap: an extensible finite element toolbox in Julia}},
title = {{G}ridap: an extensible finite element toolbox in {J}ulia},
url = {https://joss.theoj.org/papers/10.21105/joss.02520},
volume = {5},
year = {2020}
@@ -108,7 +108,7 @@ @article{Badia2020a
number = {6},
pages = {C436--C468},
publisher = {Society for Industrial and Applied Mathematics},
title = {{A generic finite element framework on parallel tree-based adaptive meshes}},
title = {A generic finite element framework on parallel tree-based adaptive meshes},
volume = {42},
year = {2020}
}
@@ -124,7 +124,7 @@ @article{p4est
number = {3},
pages = {1103--1133},
publisher = {Society for Industrial and Applied Mathematics},
title = {{p4est: scalable algorithms for parallel adaptive mesh refinement on forests of octrees}},
title = {{p4est}: scalable algorithms for parallel adaptive mesh refinement on forests of octrees},
volume = {33},
year = {2011}
}
@@ -153,7 +153,7 @@ @article {freefem
@Article{libMeshPaper,
doi = {10.1007/s00366-006-0049-3},
author = {B.~S.~Kirk and J.~W.~Peterson and R.~H.~Stogner and G.~F.~Carey},
title = {{\texttt{libMesh}: A C++ library for parallel adaptive mesh refinement/coarsening simulations}},
title = {{\texttt{libMesh}}: A {C}++ library for parallel adaptive mesh refinement/coarsening simulations},
journal = {Engineering with Computers},
volume = 22,
number = {3--4},
@@ -192,7 +192,7 @@ @article{Kirby2006
keywords = {Automation,Compiler,Finite element,Variational form},
number = {3},
pages = {417--444},
title = {{A compiler for variational forms}},
title = {A compiler for variational forms},
volume = {32},
year = {2006}
}
@@ -258,24 +258,26 @@ @Inbook{PARDISO
}

@article{MUMPS1,
title = {A Fully Asynchronous Multifrontal Solver Using Distributed Dynamic Scheduling},
author = {P.R. Amestoy and I. S. Duff and J. Koster and J.-Y. L'Excellent},
journal = {SIAM Journal on Matrix Analysis and Applications},
volume = {23},
number = {1},
year = {2001},
pages = {15-41}
}
title = {A Fully Asynchronous Multifrontal Solver Using Distributed Dynamic Scheduling},
author = {P.R. Amestoy and I. S. Duff and J. Koster and J.-Y. L'Excellent},
journal = {SIAM Journal on Matrix Analysis and Applications},
volume = {23},
number = {1},
year = {2001},
pages = {15-41},
doi = {10.1137/s0895479899358194}
}

@article{MUMPS2,
title = {{Performance and Scalability of the Block Low-Rank Multifrontal
Factorization on Multicore Architectures}},
title = {Performance and Scalability of the Block Low-Rank Multifrontal
Factorization on Multicore Architectures},
author = {P.R. Amestoy and A. Buttari and J.-Y. L'Excellent and T. Mary},
journal = {ACM Transactions on Mathematical Software},
volume = 45,
issue = 1,
pages = {2:1--2:26},
year={2019},
year = {2019},
doi = {10.1145/3242094}
}

@article{gridapdistributed,
@@ -287,29 +289,31 @@ @article{gridapdistributed
number = {74},
pages = {4157},
author = {Santiago Badia and Alberto F. Martín and Francesc Verdugo},
title = {GridapDistributed: a massively parallel finite element toolbox in Julia},
title = {GridapDistributed: a massively parallel finite element toolbox in {J}ulia},
journal = {Journal of Open Source Software}
}
@inproceedings{Arnold1,
title={Preconditioning in H(div) and applications},
author={D. N. Arnold and Richard S. Falk and Ragnar Winther},
year={1997},
url={https://api.semanticscholar.org/CorpusID:12559456}
url={https://api.semanticscholar.org/CorpusID:12559456},
doi={10.1090/S0025-5718-97-00826-0}
}

@article{Arnold2,
title={Multigrid in H (div) and H (curl)},
title={Multigrid in H(div) and H(curl)},
author={D. N. Arnold and Richard S. Falk and Ragnar Winther},
journal={Numerische Mathematik},
year={2000},
volume={85},
pages={197-217},
url={https://api.semanticscholar.org/CorpusID:14301688}
url={https://api.semanticscholar.org/CorpusID:14301688},
doi={10.1007/PL00005386}
}

@article{fenics-patch,
title={PCPATCH: Software for the Topological Construction of Multigrid Relaxation Methods},
title={{PCPATCH}: Software for the Topological Construction of Multigrid Relaxation Methods},
volume={47},
ISSN={1557-7295},
url={http://dx.doi.org/10.1145/3445791},
Expand All @@ -323,11 +327,12 @@ @article{fenics-patch
}

@misc{dealII-patch,
title={An implementation of tensor product patch smoothers on GPU},
title={An implementation of tensor product patch smoothers on {GPU}},
author={Cu Cui and Paul Grosse-Bley and Guido Kanschat and Robert Strzodka},
year={2024},
eprint={2405.19004},
archivePrefix={arXiv},
primaryClass={math.NA},
url={https://arxiv.org/abs/2405.19004},
url={https://arxiv.org/abs/2405.19004},
doi={10.48550/arXiv.2405.19004}
}
109 changes: 5 additions & 104 deletions joss_paper/paper.md
@@ -35,13 +35,13 @@ aas-journal: Journal of Open Source Software
The ever-increasing demand for resolution and accuracy in mathematical models of physical processes governed by systems of Partial Differential Equations (PDEs) can only be addressed using fully-parallel advanced numerical discretization methods and scalable solvers that are able to exploit the vast computational resources of state-of-the-art supercomputers.

One of the biggest scalability bottlenecks within Finite Element (FE) parallel codes is the solution of linear systems arising from the discretization of PDEs.
The implementation of exact factorization-based solvers in parallel environments is an extremely challenging task, and even state-of-the-art libraries such as MUMPS [@MUMPS1] [@MUMPS2] or PARDISO [@PARDISO] have severe limitations in terms of scalability and memory consumption above a certain number of CPU cores.
The implementation of exact factorization-based solvers in parallel environments is an extremely challenging task, and even state-of-the-art libraries such as MUMPS [@MUMPS1; @MUMPS2] or PARDISO [@PARDISO] have severe limitations in terms of scalability and memory consumption above a certain number of CPU cores.
Hence the use of iterative methods is crucial to maintain the scalability of FE codes. Unfortunately, the convergence of iterative methods is not guaranteed and rapidly deteriorates as the size of the linear system increases. To retain performance, the use of highly scalable preconditioners is mandatory.
For simple problems, algebraic solvers and preconditioners (i.e., based solely on the algebraic system) are enough to obtain robust convergence. Many well-known libraries providing algebraic solvers already exist, such as PETSc [@petsc-user-ref], Trilinos [@trilinos], or Hypre [@hypre]. However, algebraic solvers are not always suited to deal with more challenging problems.

In these cases, solvers that exploit the physics and mathematical discretization of the particular problem are required. This is the case for many multiphysics problems involving differential operators with a large kernel, such as the divergence [@Arnold1] and the curl [@Arnold2]. Examples can be found amongst highly relevant problems such as the Navier-Stokes, Maxwell, or Darcy equations. Scalable solvers for this type of multiphysics problem rely on exploiting the block structure of such systems to find a spectrally equivalent block preconditioner, and are often tied to a specific discretization of the underlying equations.
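
As a generic illustration (not taken from the cited works), many such discretizations lead to $2\times 2$ block systems of saddle-point type, and the preconditioners in question exploit this structure:

$$
\begin{bmatrix} A & B^T \\ B & 0 \end{bmatrix}
\begin{bmatrix} u \\ p \end{bmatrix}
=
\begin{bmatrix} f \\ g \end{bmatrix},
\qquad
P =
\begin{bmatrix} \tilde{A} & B^T \\ 0 & -\tilde{S} \end{bmatrix},
$$

where $\tilde{A}$ and $\tilde{S}$ are spectrally equivalent (and cheap to apply) approximations of the $(1,1)$ block $A$ and of the Schur complement $S = B A^{-1} B^T$, respectively.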

As a consequence, high-quality open-source parallel finite element packages like FEniCS [@fenics-book] or deal.II [@dealII93] already provide implementations of several state-of-the-art physics-informed solvers [@fenics-patch] [@dealII-patch]. The Gridap ecosystem [@Badia2020] aims to provide a similar level of functionality within the Julia programming language [@Bezanson2017].
As a consequence, high-quality open-source parallel finite element packages like FEniCS [@fenics-book] or deal.II [@dealII93] already provide implementations of several state-of-the-art physics-informed solvers [@fenics-patch; @dealII-patch]. The Gridap ecosystem [@Badia2020] aims to provide a similar level of functionality within the Julia programming language [@Bezanson2017].

To this end, GridapSolvers is a registered Julia software package which provides highly scalable physics-informed solvers tailored for the FE numerical solution of PDEs on parallel computers within the Gridap ecosystem of packages. Emphasis is put on the modular design of the library, which allows new preconditioners to be easily tailored to the user's specific problem.
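
As a minimal sketch of this modularity (using only solver names that appear in the demo below, and assuming, as the demo does with its block preconditioner, that any solver object can be passed to `FGMRESSolver` as a preconditioner):

```julia
using GridapSolvers.LinearSolvers

# Solvers are plain Julia objects that can be nested: a Jacobi-preconditioned
# CG solver is itself used as a building block of an outer FGMRES solver.
inner  = CGSolver(JacobiLinearSolver(); maxiter=20, rtol=1.e-6)
solver = FGMRESSolver(10, inner; rtol=1.e-8)
```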

@@ -89,116 +89,17 @@
with $A$ the velocity Laplacian block and $M$ a pressure mass matrix.

The mass matrix is approximated by a Conjugate Gradient (CG) solver with a Jacobi preconditioner. The eliminated velocity block is approximated by a 2-level V-cycle GMG solver, where the coarsest level is solved exactly on a single processor.
The code is set up to run in parallel with 4 MPI tasks and can be executed with the following command: `mpiexec -n 4 julia --project=. demo.jl`.

```julia
using Gridap, Gridap.Algebra, Gridap.MultiField
using PartitionedArrays, GridapDistributed, GridapSolvers

using GridapSolvers.LinearSolvers
using GridapSolvers.MultilevelTools
using GridapSolvers.BlockSolvers

function get_bilinear_form(mh_lev,biform,qdegree)
  model = get_model(mh_lev)
  Ω = Triangulation(model)
  dΩ = Measure(Ω,qdegree)
  return (u,v) -> biform(u,v,dΩ)
end

function add_labels!(labels)
  add_tag_from_tags!(labels,"top",[6])
  add_tag_from_tags!(labels,"walls",[1,2,3,4,5,7,8])
end

with_mpi() do distribute
  np_per_level = [(2,2),(1,1)] # Number of processors per GMG level
  parts = distribute(LinearIndices((prod(np_per_level[1]),)))

  # Create multi-level mesh
  domain = (0,1,0,1) # Cartesian domain (xmin,xmax,ymin,ymax)
  ncells = (10,10) # Number of cells
  mh = CartesianModelHierarchy(parts,np_per_level,domain,ncells;add_labels!)
  model = get_model(mh,1) # Finest mesh

  # Create FESpaces
  fe_order = 2
  reffe_u = ReferenceFE(lagrangian,VectorValue{2,Float64},fe_order)
  reffe_p = ReferenceFE(lagrangian,Float64,fe_order-1;space=:P)

  tests_u = TestFESpace(mh,reffe_u,dirichlet_tags=["walls","top"])
  trials_u = TrialFESpace(tests_u,[VectorValue(0.0,0.0),VectorValue(1.0,0.0)])
  U, V = get_fe_space(trials_u,1), get_fe_space(tests_u,1)
  Q = TestFESpace(model,reffe_p;conformity=:L2,constraint=:zeromean)

  mfs = BlockMultiFieldStyle() # Activates block-wise assembly
  X = MultiFieldFESpace([U,Q];style=mfs)
  Y = MultiFieldFESpace([V,Q];style=mfs)

  # Weak formulation
  f = VectorValue(1.0,1.0)
  biform_u(u,v,dΩ) = ∫(∇(v)⊙∇(u))dΩ
  biform((u,p),(v,q),dΩ) = biform_u(u,v,dΩ) - ∫((∇⋅v)*p)dΩ - ∫((∇⋅u)*q)dΩ
  liform((v,q),dΩ) = ∫(v⋅f)dΩ

  # Assemble linear system
  qdegree = 2*(fe_order+1) # Quadrature degree
  Ω = Triangulation(model)
  dΩ = Measure(Ω,qdegree)
  op = AffineFEOperator((u,v)->biform(u,v,dΩ),v->liform(v,dΩ),X,Y)
  A, b = get_matrix(op), get_vector(op)

  # GMG preconditioner for the velocity block
  biforms = map(mh) do mhl
    get_bilinear_form(mhl,biform_u,qdegree)
  end
  smoothers = map(view(mh,1:num_levels(mh)-1)) do mhl
    RichardsonSmoother(JacobiLinearSolver(),10,2.0/3.0)
  end
  restrictions, prolongations = setup_transfer_operators(
    trials_u, qdegree; mode=:residual, solver=CGSolver(JacobiLinearSolver())
  )
  solver_u = GMGLinearSolver(
    mh,trials_u,tests_u,biforms,
    prolongations,restrictions,
    pre_smoothers=smoothers,
    post_smoothers=smoothers,
    coarsest_solver=LUSolver(),
    maxiter=4,mode=:solver
  )

  # PCG solver for the pressure block
  solver_p = CGSolver(JacobiLinearSolver();maxiter=20,atol=1e-14,rtol=1.e-6)

  # Block triangular preconditioner
  blocks = [LinearSystemBlock() LinearSystemBlock();
            LinearSystemBlock() BiformBlock((p,q) -> ∫(p*q)dΩ,Q,Q)]
  P = BlockTriangularSolver(blocks,[solver_u,solver_p])

  # Global solver
  solver = FGMRESSolver(10,P;rtol=1.e-8,verbose=i_am_main(parts))
  ns = numerical_setup(symbolic_setup(solver,A),A)

  # Solve
  x = allocate_in_domain(A)
  fill!(x,0.0)
  solve!(x,ns,b)

  # Postprocess
  uh, ph = FEFunction(X,x)
  writevtk(Ω,"demo",cellfields=["uh"=>uh,"ph"=>ph])
end
```
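
For reference, the bilinear and linear forms in the listing above implement (a sketch in the code's own sign convention, with `⊙` denoting the full contraction $\nabla v : \nabla u$) the weak Stokes problem: find $(u,p) \in X$ such that, for all $(v,q) \in Y$,

$$
\int_\Omega \nabla v : \nabla u \,\mathrm{d}\Omega
- \int_\Omega (\nabla \cdot v)\, p \,\mathrm{d}\Omega
- \int_\Omega (\nabla \cdot u)\, q \,\mathrm{d}\Omega
= \int_\Omega v \cdot f \,\mathrm{d}\Omega .
$$
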
The code for this example can be found [here](https://github.com/gridap/GridapSolvers.jl/tree/joss-paper/joss_paper/demo.jl). It is set up to run in parallel with 4 MPI tasks and can be executed with the following command: `mpiexec -n 4 julia --project=. demo.jl`.

## Parallel scaling benchmark

This section shows scalability results for the demo problem discussed above. We run our code on the Gadi supercomputer, which is part of the Australian National Computational Infrastructure (NCI). We use Intel's Cascade Lake 2x24-core Xeon Platinum 8274 nodes. Scalability is shown for up to 64 nodes, for a fixed local problem size of 48x64 quadrangle cells per processor. This amounts to a maximum size of approximately 37M cells and 415M degrees of freedom distributed amongst 3072 processors. Within the GMG solver, the number of coarsening levels is progressively increased to keep the global size of the coarsest solve (approximately) constant. The coarsest solve is then performed by a CG solver preconditioned by an Algebraic MultiGrid (AMG) solver, provided by PETSc [@petsc-user-ref] through the package GridapPETSc.jl.

The results show that the code scales relatively well up to 3072 processors, with loss in performance mostly tied to the number of GMG levels used for the velocity solver. The number of F-GMRES iterations required for convergence is also shown to be relatively constant (and even decreasing for bigger problem sizes), indicating that the preconditioner is robust with respect to the problem size.
The results in \autoref{fig:scalability} show that the code scales relatively well up to 3072 processors, with loss in performance mostly tied to the number of GMG levels used for the velocity solver. The number of F-GMRES iterations required for convergence is also shown to be relatively constant (and even decreasing for bigger problem sizes), indicating that the preconditioner is robust with respect to the problem size.

The code used to create these results can be found [here](https://github.com/gridap/GridapSolvers.jl/tree/joss-paper/joss_paper/scalability). The exact releases for the packages used are provided by Julia's `Manifest.toml` file.

![**Top**: Weak scalability for a Stokes problem in 2D. Time is given per F-GMRES iteration, as a function of the number of processors. **Middle**: Number of coarsening levels for the GMG solver, as a function of the number of processors. **Bottom**: Number of F-GMRES iterations required for convergence. \label{fig:packages}](weakScalability.png){ width=80% }
![**Top**: Weak scalability for a Stokes problem in 2D. Time is given per F-GMRES iteration, as a function of the number of processors. **Middle**: Number of coarsening levels for the GMG solver, as a function of the number of processors. **Bottom**: Number of F-GMRES iterations required for convergence. \label{fig:scalability}](weakScalability.png){ width=80% }

## Acknowledgements

