WIP, think double diagonal is working, broken float MP

OPM · Oct 22, 2024 · b6d2394 · b6d2394
1 parent 276f6ca
commit b6d2394
Show file tree

Hide file tree

Showing 8 changed files with 122 additions and 89 deletions.
diff --git a/opm/simulators/linalg/PreconditionerFactory_impl.hpp b/opm/simulators/linalg/PreconditionerFactory_impl.hpp
@@ -362,10 +362,10 @@ struct StandardPreconditioners {
         F::addCreator("OPMGPUILU0", [](const O& op, [[maybe_unused]] const P& prm, const std::function<V()>&, std::size_t, const C& comm) {
             const bool split_matrix = prm.get<bool>("split_matrix", true);
             const bool tune_gpu_kernels = prm.get<bool>("tune_gpu_kernels", true);
-            const bool store_factorization_as_float = prm.get<bool>("store_factorization_as_float", false);
+            const int mixed_precision_scheme = prm.get<int>("mixed_precision_scheme", 0);
             using field_type = typename V::field_type;
             using OpmGpuILU0 = typename gpuistl::OpmGpuILU0<M, gpuistl::GpuVector<field_type>, gpuistl::GpuVector<field_type>>;
-            auto gpuilu0 = std::make_shared<OpmGpuILU0>(op.getmat(), split_matrix, tune_gpu_kernels, store_factorization_as_float);
+            auto gpuilu0 = std::make_shared<OpmGpuILU0>(op.getmat(), split_matrix, tune_gpu_kernels, mixed_precision_scheme);
 
             auto adapted = std::make_shared<gpuistl::PreconditionerAdapter<V, V, OpmGpuILU0>>(gpuilu0);
             auto wrapped = std::make_shared<gpuistl::GpuBlockPreconditioner<V, V, Comm>>(adapted, comm);
@@ -621,12 +621,12 @@ struct StandardPreconditioners<Operator, Dune::Amg::SequentialInformation> {
         F::addCreator("OPMGPUILU0", [](const O& op, [[maybe_unused]] const P& prm, const std::function<V()>&, std::size_t) {
             const bool split_matrix = prm.get<bool>("split_matrix", true);
             const bool tune_gpu_kernels = prm.get<bool>("tune_gpu_kernels", true);
-            const bool store_factorization_as_float = prm.get<bool>("store_factorization_as_float", false);
+            const int mixed_precision_scheme = prm.get<int>("mixed_precision_scheme", 0);
 
             using field_type = typename V::field_type;
             using GPUILU0 = typename gpuistl::OpmGpuILU0<M, gpuistl::GpuVector<field_type>, gpuistl::GpuVector<field_type>>;
 
-            return std::make_shared<gpuistl::PreconditionerAdapter<V, V, GPUILU0>>(std::make_shared<GPUILU0>(op.getmat(), split_matrix, tune_gpu_kernels, store_factorization_as_float));
+            return std::make_shared<gpuistl::PreconditionerAdapter<V, V, GPUILU0>>(std::make_shared<GPUILU0>(op.getmat(), split_matrix, tune_gpu_kernels, mixed_precision_scheme));
         });
 
         F::addCreator("GPUDILU", [](const O& op, [[maybe_unused]] const P& prm, const std::function<V()>&, std::size_t) {

diff --git a/opm/simulators/linalg/gpuistl/GpuDILU.cpp b/opm/simulators/linalg/gpuistl/GpuDILU.cpp
@@ -129,7 +129,7 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
         const int numOfRowsInLevel = m_levelSets[level].size();
         if (m_splitMatrix) {
             if (m_mixedPrecisionScheme == MixedPrecisionScheme::STORE_ENTIRE_FACTORIZATION_AS_FLOAT) {
-                detail::DILU::solveLowerLevelSetSplit<blocksize_, field_type, float>(
+                detail::DILU::solveLowerLevelSetSplit<blocksize_, field_type, float, float>(
                     m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(),
                     m_gpuMatrixReorderedLowerFloat->getRowIndices().data(),
                     m_gpuMatrixReorderedLowerFloat->getColumnIndices().data(),
@@ -141,7 +141,7 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
                     v.data(),
                     lowerSolveThreadBlockSize);
             }else if (m_mixedPrecisionScheme == MixedPrecisionScheme::STORE_ONLY_FACTORIZED_DIAGONAL_AS_DOUBLE) {
-                detail::DILU::solveLowerLevelSetSplit<blocksize_, field_type, float>(
+                detail::DILU::solveLowerLevelSetSplit<blocksize_, field_type, float, field_type>(
                     m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(),
                     m_gpuMatrixReorderedLowerFloat->getRowIndices().data(),
                     m_gpuMatrixReorderedLowerFloat->getColumnIndices().data(),
@@ -153,7 +153,7 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
                     v.data(),
                     lowerSolveThreadBlockSize);
             } else {
-                detail::DILU::solveLowerLevelSetSplit<blocksize_, field_type, field_type>(
+                detail::DILU::solveLowerLevelSetSplit<blocksize_, field_type, field_type, field_type>(
                     m_gpuMatrixReorderedLower->getNonZeroValues().data(),
                     m_gpuMatrixReorderedLower->getRowIndices().data(),
                     m_gpuMatrixReorderedLower->getColumnIndices().data(),

diff --git a/opm/simulators/linalg/gpuistl/GpuDILU.hpp b/opm/simulators/linalg/gpuistl/GpuDILU.hpp
@@ -63,7 +63,7 @@ class GpuDILU : public Dune::PreconditionerWithUpdate<X, Y>
     //! \param A The matrix to operate on.
     //! \param w The relaxation factor.
     //!
-    explicit GpuDILU(const M& A, bool splitMatrix, bool tuneKernels, int storeFactorizationAsFloat);
+    explicit GpuDILU(const M& A, bool splitMatrix, bool tuneKernels, int mixedPrecisionScheme);
 
     //! \brief Prepare the preconditioner.
     //! \note Does nothing at the time being.

diff --git a/opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp b/opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp
@@ -41,7 +41,7 @@ namespace Opm::gpuistl
 {
 
 template <class M, class X, class Y, int l>
-OpmGpuILU0<M, X, Y, l>::OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernels, bool storeFactorizationAsFloat)
+OpmGpuILU0<M, X, Y, l>::OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernels, int mixedPrecisionScheme)
     : m_cpuMatrix(A)
     , m_levelSets(Opm::getMatrixRowColoring(m_cpuMatrix, Opm::ColoringType::LOWER))
     , m_reorderedToNatural(detail::createReorderedToNatural(m_levelSets))
@@ -56,8 +56,12 @@ OpmGpuILU0<M, X, Y, l>::OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernel
     , m_gpuDInv(m_gpuMatrix.N() * m_gpuMatrix.blockSize() * m_gpuMatrix.blockSize())
     , m_splitMatrix(splitMatrix)
     , m_tuneThreadBlockSizes(tuneKernels)
-    , m_storeFactorizationAsFloat(storeFactorizationAsFloat)
 {
+
+    OPM_ERROR_IF(!isValidMixedPrecisionScheme(mixedPrecisionScheme),
+                 fmt::format("Invalid mixed precision scheme chosen: {}", mixedPrecisionScheme));
+    m_mixedPrecisionScheme = static_cast<MixedPrecisionScheme>(mixedPrecisionScheme);
+
     // TODO: Should in some way verify that this matrix is symmetric, only do it debug mode?
     // Some sanity check
     OPM_ERROR_IF(A.N() != m_gpuMatrix.N(),
@@ -85,13 +89,16 @@ OpmGpuILU0<M, X, Y, l>::OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernel
             m_cpuMatrix, m_reorderedToNatural);
     }
 
-    if (m_storeFactorizationAsFloat){
+    if (!m_mixedPrecisionScheme == MixedPrecisionScheme::DEFAULT){
         OPM_ERROR_IF(!m_splitMatrix, "Mixed precision GpuILU0 is currently only supported when using split_matrix=true");
 
         // initialize mixed precision datastructures
         m_gpuMatrixReorderedLowerFloat = std::make_unique<FloatMat>(m_gpuMatrixReorderedLower->getRowIndices(), m_gpuMatrixReorderedLower->getColumnIndices(), blocksize_);
         m_gpuMatrixReorderedUpperFloat = std::make_unique<FloatMat>(m_gpuMatrixReorderedUpper->getRowIndices(), m_gpuMatrixReorderedUpper->getColumnIndices(), blocksize_);
-        m_gpuMatrixReorderedDiagFloat.emplace(GpuVector<float>(m_gpuMatrix.N() * m_gpuMatrix.blockSize() * m_gpuMatrix.blockSize()));
+        // The MixedPrecisionScheme::STORE_ONLY_FACTORIZED_DIAGONAL_AS_DOUBLE does not need to allocate this float vector
+        if (m_mixedPrecisionScheme == MixedPrecisionScheme::STORE_ENTIRE_FACTORIZATION_AS_FLOAT) {
+            m_gpuMatrixReorderedDiagFloat.emplace(GpuVector<float>(m_gpuMatrix.N() * m_gpuMatrix.blockSize() * m_gpuMatrix.blockSize()));
+        }
     }
 
     LUFactorizeAndMoveData(m_moveThreadBlockSize, m_ILU0FactorizationThreadBlockSize);
@@ -128,7 +135,7 @@ OpmGpuILU0<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i
     for (int level = 0; level < m_levelSets.size(); ++level) {
         const int numOfRowsInLevel = m_levelSets[level].size();
         if (m_splitMatrix) {
-            if (m_storeFactorizationAsFloat){
+            if (m_mixedPrecisionScheme != MixedPrecisionScheme::DEFAULT) {
                 detail::ILU0::solveLowerLevelSetSplit<blocksize_, field_type, float>(
                     m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(),
                     m_gpuMatrixReorderedLowerFloat->getRowIndices().data(),
@@ -171,8 +178,8 @@ OpmGpuILU0<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i
         const int numOfRowsInLevel = m_levelSets[level].size();
         levelStartIdx -= numOfRowsInLevel;
         if (m_splitMatrix) {
-            if (m_storeFactorizationAsFloat) {
-                    detail::ILU0::solveUpperLevelSetSplit<blocksize_, field_type, float>(
+            if (m_mixedPrecisionScheme == MixedPrecisionScheme::STORE_ENTIRE_FACTORIZATION_AS_FLOAT) {
+                    detail::ILU0::solveUpperLevelSetSplit<blocksize_, field_type, float, float>(
                         m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(),
                         m_gpuMatrixReorderedUpperFloat->getRowIndices().data(),
                         m_gpuMatrixReorderedUpperFloat->getColumnIndices().data(),
@@ -183,8 +190,20 @@ OpmGpuILU0<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i
                         v.data(),
                         upperSolveThreadBlockSize);
             }
+            if (m_mixedPrecisionScheme == MixedPrecisionScheme::STORE_ENTIRE_FACTORIZATION_AS_FLOAT) {
+                    detail::ILU0::solveUpperLevelSetSplit<blocksize_, field_type, float, field_type>(
+                        m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(),
+                        m_gpuMatrixReorderedUpperFloat->getRowIndices().data(),
+                        m_gpuMatrixReorderedUpperFloat->getColumnIndices().data(),
+                        m_gpuReorderToNatural.data(),
+                        levelStartIdx,
+                        numOfRowsInLevel,
+                        m_gpuMatrixReorderedDiag.value().data(),
+                        v.data(),
+                        upperSolveThreadBlockSize);
+            }
             else{
-                detail::ILU0::solveUpperLevelSetSplit<blocksize_, field_type, field_type>(
+                detail::ILU0::solveUpperLevelSetSplit<blocksize_, field_type, field_type, field_type>(
                     m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
                     m_gpuMatrixReorderedUpper->getRowIndices().data(),
                     m_gpuMatrixReorderedUpper->getColumnIndices().data(),
@@ -272,8 +291,8 @@ OpmGpuILU0<M, X, Y, l>::LUFactorizeAndMoveData(int moveThreadBlockSize, int fact
         const int numOfRowsInLevel = m_levelSets[level].size();
 
         if (m_splitMatrix) {
-            if (m_storeFactorizationAsFloat){
-                detail::ILU0::LUFactorizationSplit<blocksize_, field_type, float, true>(
+            if (m_mixedPrecisionScheme == MixedPrecisionScheme::STORE_ENTIRE_FACTORIZATION_AS_FLOAT){
+                detail::ILU0::LUFactorizationSplit<blocksize_, field_type, float, MixedPrecisionScheme::STORE_ENTIRE_FACTORIZATION_AS_FLOAT>(
                     m_gpuMatrixReorderedLower->getNonZeroValues().data(),
                     m_gpuMatrixReorderedLower->getRowIndices().data(),
                     m_gpuMatrixReorderedLower->getColumnIndices().data(),
@@ -290,8 +309,26 @@ OpmGpuILU0<M, X, Y, l>::LUFactorizeAndMoveData(int moveThreadBlockSize, int fact
                     numOfRowsInLevel,
                     factorizationThreadBlockSize);
             }
+            else if (m_mixedPrecisionScheme == MixedPrecisionScheme::STORE_ONLY_FACTORIZED_DIAGONAL_AS_DOUBLE){
+                detail::ILU0::LUFactorizationSplit<blocksize_, field_type, float, MixedPrecisionScheme::STORE_ONLY_FACTORIZED_DIAGONAL_AS_DOUBLE>(
+                    m_gpuMatrixReorderedLower->getNonZeroValues().data(),
+                    m_gpuMatrixReorderedLower->getRowIndices().data(),
+                    m_gpuMatrixReorderedLower->getColumnIndices().data(),
+                    m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
+                    m_gpuMatrixReorderedUpper->getRowIndices().data(),
+                    m_gpuMatrixReorderedUpper->getColumnIndices().data(),
+                    m_gpuMatrixReorderedDiag.value().data(),
+                    m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(),
+                    m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(),
+                    nullptr,
+                    m_gpuReorderToNatural.data(),
+                    m_gpuNaturalToReorder.data(),
+                    levelStartIdx,
+                    numOfRowsInLevel,
+                    factorizationThreadBlockSize);
+            }
             else{
-                detail::ILU0::LUFactorizationSplit<blocksize_, field_type, float, false>(
+                detail::ILU0::LUFactorizationSplit<blocksize_, field_type, float, MixedPrecisionScheme::DEFAULT>(
                     m_gpuMatrixReorderedLower->getNonZeroValues().data(),
                     m_gpuMatrixReorderedLower->getRowIndices().data(),
                     m_gpuMatrixReorderedLower->getColumnIndices().data(),

diff --git a/opm/simulators/linalg/gpuistl/OpmGpuILU0.hpp b/opm/simulators/linalg/gpuistl/OpmGpuILU0.hpp
@@ -24,6 +24,7 @@
 #include <opm/simulators/linalg/PreconditionerWithUpdate.hpp>
 #include <opm/simulators/linalg/gpuistl/GpuSparseMatrix.hpp>
 #include <opm/simulators/linalg/gpuistl/GpuVector.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/kernelEnums.hpp>
 #include <optional>
 #include <type_traits>
 #include <vector>
@@ -64,7 +65,7 @@ class OpmGpuILU0 : public Dune::PreconditionerWithUpdate<X, Y>
     //! \param A The matrix to operate on.
     //! \param w The relaxation factor.
     //!
-    explicit OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernels, bool storeFactorizationAsFloat);
+    explicit OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernels, int mixedPrecisionScheme);
 
     //! \brief Prepare the preconditioner.
     //! \note Does nothing at the time being.
@@ -143,9 +144,8 @@ class OpmGpuILU0 : public Dune::PreconditionerWithUpdate<X, Y>
     bool m_splitMatrix;
     //! \brief Bool storing whether or not we will tune the threadblock sizes. Only used for AMD cards
     bool m_tuneThreadBlockSizes;
-    //! \brief Bool storing whether or not we should store the ILU factorization in a float datastructure.
-    //! This uses a mixed precision preconditioner to trade numerical accuracy for memory transfer speed.
-    bool m_storeFactorizationAsFloat;
+    //! \brief Bool storing whether or not we will store the factorization as float. Only used for mixed precision
+    MixedPrecisionScheme m_mixedPrecisionScheme;
     //! \brief variables storing the threadblocksizes to use if using the tuned sizes and AMD cards
     //! The default value of -1 indicates that we have not calibrated and selected a value yet
     int m_upperSolveThreadBlockSize = -1;

diff --git a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.cu b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.cu
@@ -473,41 +473,19 @@ INSTANTIATE_KERNEL_WRAPPERS(double, 6);
         MatrixScalar*, int*, int*, int*, int, int, const DiagonalScalar*, const LinearSolverScalar*, LinearSolverScalar*, int);
 
 // TODO: be smarter about this... Surely this instantiates many more combinations that are actually needed
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(1, float, float, float);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(2, float, float, float);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(3, float, float, float);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(4, float, float, float);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(5, float, float, float);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(6, float, float, float);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(1, double, double, float);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(2, double, double, float);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(3, double, double, float);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(4, double, double, float);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(5, double, double, float);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(6, double, double, float);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(1, double, float, float);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(2, double, float, float);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(3, double, float, float);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(4, double, float, float);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(5, double, float, float);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(6, double, float, float);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(1, float, float, double);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(2, float, float, double);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(3, float, float, double);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(4, float, float, double);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(5, float, float, double);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(6, float, float, double);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(1, double, double, double);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(2, double, double, double);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(3, double, double, double);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(4, double, double, double);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(5, double, double, double);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(6, double, double, double);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(1, double, float, double);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(2, double, float, double);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(3, double, float, double);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(4, double, float, double);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(5, double, float, double);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(6, double, float, double);
+#define INSTANTIATE_SOLVE_LEVEL_SET_SPLIT_ALL(blocksize) \
+    INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, float, float, float); \
+    INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, double, float); \
+    INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, float, float); \
+    INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, float, float, double); \
+    INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, double, double); \
+    INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, float, double);
+
+INSTANTIATE_SOLVE_LEVEL_SET_SPLIT_ALL(1);
+INSTANTIATE_SOLVE_LEVEL_SET_SPLIT_ALL(2);
+INSTANTIATE_SOLVE_LEVEL_SET_SPLIT_ALL(3);
+INSTANTIATE_SOLVE_LEVEL_SET_SPLIT_ALL(4);
+INSTANTIATE_SOLVE_LEVEL_SET_SPLIT_ALL(5);
+INSTANTIATE_SOLVE_LEVEL_SET_SPLIT_ALL(6);
 
 } // namespace Opm::gpuistl::detail::DILU