From c57f5d83d35d3d01c26e20dff59e1c3849c77358 Mon Sep 17 00:00:00 2001 From: Tobias Meyer Andersen Date: Mon, 14 Oct 2024 16:17:56 +0200 Subject: [PATCH 1/6] add mixed precision option to gpudilu class --- opm/simulators/linalg/PreconditionerFactory_impl.hpp | 9 ++++++--- opm/simulators/linalg/gpuistl/GpuDILU.cpp | 11 ++++++++++- opm/simulators/linalg/gpuistl/GpuDILU.hpp | 10 +++++++++- tests/gpuistl/test_GpuDILU.cpp | 6 +++--- 4 files changed, 28 insertions(+), 8 deletions(-) diff --git a/opm/simulators/linalg/PreconditionerFactory_impl.hpp b/opm/simulators/linalg/PreconditionerFactory_impl.hpp index c53c069e7cd..0c905e9b2fb 100644 --- a/opm/simulators/linalg/PreconditionerFactory_impl.hpp +++ b/opm/simulators/linalg/PreconditionerFactory_impl.hpp @@ -349,9 +349,10 @@ struct StandardPreconditioners { F::addCreator("GPUDILU", [](const O& op, [[maybe_unused]] const P& prm, const std::function&, std::size_t, const C& comm) { const bool split_matrix = prm.get("split_matrix", true); const bool tune_gpu_kernels = prm.get("tune_gpu_kernels", true); + const bool store_factorization_as_float = prm.get("store_factorization_as_float", false); using field_type = typename V::field_type; using GpuDILU = typename gpuistl::GpuDILU, gpuistl::GpuVector>; - auto gpuDILU = std::make_shared(op.getmat(), split_matrix, tune_gpu_kernels); + auto gpuDILU = std::make_shared(op.getmat(), split_matrix, tune_gpu_kernels, store_factorization_as_float); auto adapted = std::make_shared>(gpuDILU); auto wrapped = std::make_shared>(adapted, comm); @@ -631,14 +632,16 @@ struct StandardPreconditioners { F::addCreator("GPUDILU", [](const O& op, [[maybe_unused]] const P& prm, const std::function&, std::size_t) { const bool split_matrix = prm.get("split_matrix", true); const bool tune_gpu_kernels = prm.get("tune_gpu_kernels", true); + const bool store_factorization_as_float = prm.get("store_factorization_as_float", false); using field_type = typename V::field_type; using GPUDILU = typename gpuistl::GpuDILU, gpuistl::GpuVector>; - return std::make_shared>(std::make_shared(op.getmat(), split_matrix, tune_gpu_kernels)); + return std::make_shared>(std::make_shared(op.getmat(), split_matrix, tune_gpu_kernels, store_factorization_as_float)); }); F::addCreator("GPUDILUFloat", [](const O& op, [[maybe_unused]] const P& prm, const std::function&, std::size_t) { const bool split_matrix = prm.get("split_matrix", true); const bool tune_gpu_kernels = prm.get("tune_gpu_kernels", true); + const bool store_factorization_as_float = prm.get("store_factorization_as_float", false); using block_type = typename V::block_type; using VTo = Dune::BlockVector>; @@ -647,7 +650,7 @@ struct StandardPreconditioners { using Adapter = typename gpuistl::PreconditionerAdapter; using Converter = typename gpuistl::PreconditionerConvertFieldTypeAdapter; auto converted = std::make_shared(op.getmat()); - auto adapted = std::make_shared(std::make_shared(converted->getConvertedMatrix(), split_matrix, tune_gpu_kernels)); + auto adapted = std::make_shared(std::make_shared(converted->getConvertedMatrix(), split_matrix, tune_gpu_kernels, store_factorization_as_float)); converted->setUnderlyingPreconditioner(adapted); return converted; }); diff --git a/opm/simulators/linalg/gpuistl/GpuDILU.cpp b/opm/simulators/linalg/gpuistl/GpuDILU.cpp index cba3be2c993..53e13dfba5d 100644 --- a/opm/simulators/linalg/gpuistl/GpuDILU.cpp +++ b/opm/simulators/linalg/gpuistl/GpuDILU.cpp @@ -41,7 +41,7 @@ namespace Opm::gpuistl { template -GpuDILU::GpuDILU(const M& A, bool splitMatrix, bool 
tuneKernels) +GpuDILU::GpuDILU(const M& A, bool splitMatrix, bool tuneKernels, bool storeFactorizationAsFloat) : m_cpuMatrix(A) , m_levelSets(Opm::getMatrixRowColoring(m_cpuMatrix, Opm::ColoringType::LOWER)) , m_reorderedToNatural(detail::createReorderedToNatural(m_levelSets)) @@ -52,6 +52,7 @@ GpuDILU::GpuDILU(const M& A, bool splitMatrix, bool tuneKernels) , m_gpuDInv(m_gpuMatrix.N() * m_gpuMatrix.blockSize() * m_gpuMatrix.blockSize()) , m_splitMatrix(splitMatrix) , m_tuneThreadBlockSizes(tuneKernels) + , m_storeFactorizationAsFloat(storeFactorizationAsFloat) { // TODO: Should in some way verify that this matrix is symmetric, only do it debug mode? @@ -80,6 +81,14 @@ GpuDILU::GpuDILU(const M& A, bool splitMatrix, bool tuneKernels) m_gpuMatrixReordered = detail::createReorderedMatrix>( m_cpuMatrix, m_reorderedToNatural); } + + if (m_storeFactorizationAsFloat) { + OPM_THROW(std::runtime_error, "Matrix must be split when storing as float."); + m_gpuMatrixReorderedLowerFloat = std::make_unique(m_gpuMatrixReorderedLower->getRowIndices(), m_gpuMatrixReorderedLower->getColumnIndices(), blocksize_); + m_gpuMatrixReorderedUpperFloat = std::make_unique(m_gpuMatrixReorderedUpper->getRowIndices(), m_gpuMatrixReorderedUpper->getColumnIndices(), blocksize_); + m_gpuMatrixReorderedDiagFloat = std::make_unique(m_gpuMatrix.N() * m_gpuMatrix.blockSize() * m_gpuMatrix.blockSize()); + } + computeDiagAndMoveReorderedData(m_moveThreadBlockSize, m_DILUFactorizationThreadBlockSize); if (m_tuneThreadBlockSizes) { diff --git a/opm/simulators/linalg/gpuistl/GpuDILU.hpp b/opm/simulators/linalg/gpuistl/GpuDILU.hpp index 665bd667b2f..edea8c70792 100644 --- a/opm/simulators/linalg/gpuistl/GpuDILU.hpp +++ b/opm/simulators/linalg/gpuistl/GpuDILU.hpp @@ -53,6 +53,8 @@ class GpuDILU : public Dune::PreconditionerWithUpdate using field_type = typename X::field_type; //! \brief The GPU matrix type using CuMat = GpuSparseMatrix; + using FloatMat = GpuSparseMatrix; + using FloatVec = GpuVector; //! \brief Constructor. //! @@ -60,7 +62,7 @@ class GpuDILU : public Dune::PreconditionerWithUpdate //! \param A The matrix to operate on. //! \param w The relaxation factor. //! - explicit GpuDILU(const M& A, bool splitMatrix, bool tuneKernels); + explicit GpuDILU(const M& A, bool splitMatrix, bool tuneKernels, bool storeFactorizationAsFloat); //! \brief Prepare the preconditioner. //! \note Does nothing at the time being. @@ -127,6 +129,10 @@ class GpuDILU : public Dune::PreconditionerWithUpdate std::unique_ptr m_gpuMatrixReorderedUpper; //! \brief If matrix splitting is enabled, we also store the diagonal separately std::unique_ptr> m_gpuMatrixReorderedDiag; + //! \brief If mixed precision is enabled, store a float matrix + std::unique_ptr m_gpuMatrixReorderedLowerFloat; + std::unique_ptr m_gpuMatrixReorderedUpperFloat; + std::unique_ptr m_gpuMatrixReorderedDiagFloat; //! row conversion from natural to reordered matrix indices stored on the GPU GpuVector m_gpuNaturalToReorder; //! row conversion from reordered to natural matrix indices stored on the GPU @@ -137,6 +143,8 @@ class GpuDILU : public Dune::PreconditionerWithUpdate bool m_splitMatrix; //! \brief Bool storing whether or not we will tune the threadblock sizes. Only used for AMD cards bool m_tuneThreadBlockSizes; + //! \brief Bool storing whether or not we will store the factorization as float. Only used for mixed precision + bool m_storeFactorizationAsFloat; //! \brief variables storing the threadblocksizes to use if using the tuned sizes and AMD cards //! 
The default value of -1 indicates that we have not calibrated and selected a value yet int m_upperSolveThreadBlockSize = -1; diff --git a/tests/gpuistl/test_GpuDILU.cpp b/tests/gpuistl/test_GpuDILU.cpp index 5c796651fa8..ac9044f429e 100644 --- a/tests/gpuistl/test_GpuDILU.cpp +++ b/tests/gpuistl/test_GpuDILU.cpp @@ -211,7 +211,7 @@ BOOST_AUTO_TEST_CASE(TestDiluApply) // Initialize preconditioner objects Dune::MultithreadDILU cpudilu(matA); - auto gpudilu = GpuDilu1x1(matA, true, true); + auto gpudilu = GpuDilu1x1(matA, true, true, false); // Use the apply gpudilu.apply(d_output, d_input); @@ -235,7 +235,7 @@ BOOST_AUTO_TEST_CASE(TestDiluApplyBlocked) // init matrix with 2x2 blocks Sp2x2BlockMatrix matA = get2x2BlockTestMatrix(); - auto gpudilu = GpuDilu2x2(matA, true, true); + auto gpudilu = GpuDilu2x2(matA, true, true, false); Dune::MultithreadDILU cpudilu(matA); // create input/output buffers for the apply @@ -275,7 +275,7 @@ BOOST_AUTO_TEST_CASE(TestDiluInitAndUpdateLarge) { // create gpu dilu preconditioner Sp1x1BlockMatrix matA = get1x1BlockTestMatrix(); - auto gpudilu = GpuDilu1x1(matA, true, true); + auto gpudilu = GpuDilu1x1(matA, true, true, false); matA[0][0][0][0] = 11.0; matA[0][1][0][0] = 12.0; From fee3f4f09229480bc08712fb6b8d4b8a79fc8311 Mon Sep 17 00:00:00 2001 From: Tobias Meyer Andersen Date: Tue, 15 Oct 2024 13:22:55 +0200 Subject: [PATCH 2/6] working MP for gpu DILU --- opm/simulators/linalg/gpuistl/GpuDILU.cpp | 126 +++++++++---- opm/simulators/linalg/gpuistl/GpuDILU.hpp | 1 + .../gpuistl/detail/deviceBlockOperations.hpp | 15 +- .../preconditionerKernels/DILUKernels.cu | 166 ++++++++++++------ .../preconditionerKernels/DILUKernels.hpp | 31 ++-- 5 files changed, 231 insertions(+), 108 deletions(-) diff --git a/opm/simulators/linalg/gpuistl/GpuDILU.cpp b/opm/simulators/linalg/gpuistl/GpuDILU.cpp index 53e13dfba5d..d1bcdf1bf77 100644 --- a/opm/simulators/linalg/gpuistl/GpuDILU.cpp +++ b/opm/simulators/linalg/gpuistl/GpuDILU.cpp @@ -83,10 +83,13 @@ GpuDILU::GpuDILU(const M& A, bool splitMatrix, bool tuneKernels, boo } if (m_storeFactorizationAsFloat) { - OPM_THROW(std::runtime_error, "Matrix must be split when storing as float."); + if (!m_splitMatrix){ + OPM_THROW(std::runtime_error, "Matrix must be split when storing as float."); + } m_gpuMatrixReorderedLowerFloat = std::make_unique(m_gpuMatrixReorderedLower->getRowIndices(), m_gpuMatrixReorderedLower->getColumnIndices(), blocksize_); m_gpuMatrixReorderedUpperFloat = std::make_unique(m_gpuMatrixReorderedUpper->getRowIndices(), m_gpuMatrixReorderedUpper->getColumnIndices(), blocksize_); m_gpuMatrixReorderedDiagFloat = std::make_unique(m_gpuMatrix.N() * m_gpuMatrix.blockSize() * m_gpuMatrix.blockSize()); + m_gpuDInvFloat = std::make_unique(m_gpuMatrix.N() * m_gpuMatrix.blockSize() * m_gpuMatrix.blockSize()); } computeDiagAndMoveReorderedData(m_moveThreadBlockSize, m_DILUFactorizationThreadBlockSize); @@ -120,17 +123,31 @@ GpuDILU::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int for (int level = 0; level < m_levelSets.size(); ++level) { const int numOfRowsInLevel = m_levelSets[level].size(); if (m_splitMatrix) { - detail::DILU::solveLowerLevelSetSplit( - m_gpuMatrixReorderedLower->getNonZeroValues().data(), - m_gpuMatrixReorderedLower->getRowIndices().data(), - m_gpuMatrixReorderedLower->getColumnIndices().data(), - m_gpuReorderToNatural.data(), - levelStartIdx, - numOfRowsInLevel, - m_gpuDInv.data(), - d.data(), - v.data(), - lowerSolveThreadBlockSize); + if (m_storeFactorizationAsFloat) { + 
detail::DILU::solveLowerLevelSetSplit( + m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(), + m_gpuMatrixReorderedLowerFloat->getRowIndices().data(), + m_gpuMatrixReorderedLowerFloat->getColumnIndices().data(), + m_gpuReorderToNatural.data(), + levelStartIdx, + numOfRowsInLevel, + m_gpuDInvFloat->data(), + d.data(), + v.data(), + lowerSolveThreadBlockSize); + } else { + detail::DILU::solveLowerLevelSetSplit( + m_gpuMatrixReorderedLower->getNonZeroValues().data(), + m_gpuMatrixReorderedLower->getRowIndices().data(), + m_gpuMatrixReorderedLower->getColumnIndices().data(), + m_gpuReorderToNatural.data(), + levelStartIdx, + numOfRowsInLevel, + m_gpuDInv.data(), + d.data(), + v.data(), + lowerSolveThreadBlockSize); + } } else { detail::DILU::solveLowerLevelSet( m_gpuMatrixReordered->getNonZeroValues().data(), @@ -153,16 +170,29 @@ GpuDILU::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int const int numOfRowsInLevel = m_levelSets[level].size(); levelStartIdx -= numOfRowsInLevel; if (m_splitMatrix) { - detail::DILU::solveUpperLevelSetSplit( - m_gpuMatrixReorderedUpper->getNonZeroValues().data(), - m_gpuMatrixReorderedUpper->getRowIndices().data(), - m_gpuMatrixReorderedUpper->getColumnIndices().data(), - m_gpuReorderToNatural.data(), - levelStartIdx, - numOfRowsInLevel, - m_gpuDInv.data(), - v.data(), - upperSolveThreadBlockSize); + if (m_storeFactorizationAsFloat){ + detail::DILU::solveUpperLevelSetSplit( + m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(), + m_gpuMatrixReorderedUpperFloat->getRowIndices().data(), + m_gpuMatrixReorderedUpperFloat->getColumnIndices().data(), + m_gpuReorderToNatural.data(), + levelStartIdx, + numOfRowsInLevel, + m_gpuDInvFloat->data(), + v.data(), + upperSolveThreadBlockSize); + } else { + detail::DILU::solveUpperLevelSetSplit( + m_gpuMatrixReorderedUpper->getNonZeroValues().data(), + m_gpuMatrixReorderedUpper->getRowIndices().data(), + m_gpuMatrixReorderedUpper->getColumnIndices().data(), + m_gpuReorderToNatural.data(), + levelStartIdx, + numOfRowsInLevel, + m_gpuDInv.data(), + v.data(), + upperSolveThreadBlockSize); + } } else { detail::DILU::solveUpperLevelSet( m_gpuMatrixReordered->getNonZeroValues().data(), @@ -241,20 +271,44 @@ GpuDILU::computeDiagAndMoveReorderedData(int moveThreadBlockSize, in for (int level = 0; level < m_levelSets.size(); ++level) { const int numOfRowsInLevel = m_levelSets[level].size(); if (m_splitMatrix) { - detail::DILU::computeDiluDiagonalSplit( - m_gpuMatrixReorderedLower->getNonZeroValues().data(), - m_gpuMatrixReorderedLower->getRowIndices().data(), - m_gpuMatrixReorderedLower->getColumnIndices().data(), - m_gpuMatrixReorderedUpper->getNonZeroValues().data(), - m_gpuMatrixReorderedUpper->getRowIndices().data(), - m_gpuMatrixReorderedUpper->getColumnIndices().data(), - m_gpuMatrixReorderedDiag->data(), - m_gpuReorderToNatural.data(), - m_gpuNaturalToReorder.data(), - levelStartIdx, - numOfRowsInLevel, - m_gpuDInv.data(), - factorizationBlockSize); + if (m_storeFactorizationAsFloat) { + detail::DILU::computeDiluDiagonalSplit( + m_gpuMatrixReorderedLower->getNonZeroValues().data(), + m_gpuMatrixReorderedLower->getRowIndices().data(), + m_gpuMatrixReorderedLower->getColumnIndices().data(), + m_gpuMatrixReorderedUpper->getNonZeroValues().data(), + m_gpuMatrixReorderedUpper->getRowIndices().data(), + m_gpuMatrixReorderedUpper->getColumnIndices().data(), + m_gpuMatrixReorderedDiag->data(), + m_gpuReorderToNatural.data(), + m_gpuNaturalToReorder.data(), + levelStartIdx, + numOfRowsInLevel, + m_gpuDInv.data(), + 
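+                    // (clarifying comment) The pointers that follow are the float
+                    // destinations; the factorization itself runs in field_type and
+                    // each block is cast to float only once it is fully computed.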
m_gpuDInvFloat->data(), + m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(), + m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(), + factorizationBlockSize); + } else { + // TODO: should this be field type twice or field type then float in the template? + detail::DILU::computeDiluDiagonalSplit( + m_gpuMatrixReorderedLower->getNonZeroValues().data(), + m_gpuMatrixReorderedLower->getRowIndices().data(), + m_gpuMatrixReorderedLower->getColumnIndices().data(), + m_gpuMatrixReorderedUpper->getNonZeroValues().data(), + m_gpuMatrixReorderedUpper->getRowIndices().data(), + m_gpuMatrixReorderedUpper->getColumnIndices().data(), + m_gpuMatrixReorderedDiag->data(), + m_gpuReorderToNatural.data(), + m_gpuNaturalToReorder.data(), + levelStartIdx, + numOfRowsInLevel, + m_gpuDInv.data(), + nullptr, + nullptr, + nullptr, + factorizationBlockSize); + } } else { detail::DILU::computeDiluDiagonal( m_gpuMatrixReordered->getNonZeroValues().data(), diff --git a/opm/simulators/linalg/gpuistl/GpuDILU.hpp b/opm/simulators/linalg/gpuistl/GpuDILU.hpp index edea8c70792..623e151322e 100644 --- a/opm/simulators/linalg/gpuistl/GpuDILU.hpp +++ b/opm/simulators/linalg/gpuistl/GpuDILU.hpp @@ -133,6 +133,7 @@ class GpuDILU : public Dune::PreconditionerWithUpdate std::unique_ptr m_gpuMatrixReorderedLowerFloat; std::unique_ptr m_gpuMatrixReorderedUpperFloat; std::unique_ptr m_gpuMatrixReorderedDiagFloat; + std::unique_ptr m_gpuDInvFloat; //! row conversion from natural to reordered matrix indices stored on the GPU GpuVector m_gpuNaturalToReorder; //! row conversion from reordered to natural matrix indices stored on the GPU diff --git a/opm/simulators/linalg/gpuistl/detail/deviceBlockOperations.hpp b/opm/simulators/linalg/gpuistl/detail/deviceBlockOperations.hpp index 5ad30abb27e..ee22db3ae72 100644 --- a/opm/simulators/linalg/gpuistl/detail/deviceBlockOperations.hpp +++ b/opm/simulators/linalg/gpuistl/detail/deviceBlockOperations.hpp @@ -159,7 +159,7 @@ mmv(const T* a, const T* b, T* c) // dst -= A*B*C template __device__ __forceinline__ void -mmx2Subtraction(T* A, T* B, T* C, T* dst) +mmx2Subtraction(const T* A, const T* B, const T* C, T* dst) { T tmp[blocksize * blocksize] = {0}; @@ -256,6 +256,19 @@ mvMixedGeneral(const MatrixScalar* A, const VectorScalar* b, ResultScalar* c) } } +// TODO: consider merging with existing block operations +// mixed precision general version of c += Ab +template +__device__ __forceinline__ void +umvMixedGeneral(const MatrixScalar* A, const VectorScalar* b, ResultScalar* c) +{ + for (int i = 0; i < blocksize; ++i) { + for (int j = 0; j < blocksize; ++j) { + c[i] += ResultScalar(ComputeScalar(A[i * blocksize + j]) * ComputeScalar(b[j])); + } + } +} + // TODO: consider merging with existing block operations // Mixed precision general version of c -= Ab template diff --git a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.cu b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.cu index f0379d2dccc..050de298938 100644 --- a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.cu +++ b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.cu @@ -59,16 +59,16 @@ namespace } } - template - __global__ void cuSolveLowerLevelSetSplit(T* mat, + template + __global__ void cuSolveLowerLevelSetSplit(MatrixScalar* mat, int* rowIndices, int* colIndices, int* indexConversion, int startIdx, int rowsInLevelSet, - const T* dInv, - const T* d, - T* v) + const MatrixScalar* dInv, + const LinearSolverScalar* d, + 
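+                                              // (clarifying comment) d and v keep the
+                                              // linear solver's scalar type; only mat
+                                              // and dInv use MatrixScalar.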
LinearSolverScalar* v) { const auto reorderedRowIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x); if (reorderedRowIdx < rowsInLevelSet + startIdx) { @@ -77,7 +77,7 @@ namespace const size_t nnzIdxLim = rowIndices[reorderedRowIdx + 1]; const int naturalRowIdx = indexConversion[reorderedRowIdx]; - T rhs[blocksize]; + LinearSolverScalar rhs[blocksize]; for (int i = 0; i < blocksize; i++) { rhs[i] = d[naturalRowIdx * blocksize + i]; } @@ -85,10 +85,10 @@ namespace // TODO: removce the first condition in the for loop for (int block = nnzIdx; block < nnzIdxLim; ++block) { const int col = colIndices[block]; - mmv(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs); + mmvMixedGeneral(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs); } - mv(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]); + mvMixedGeneral(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]); } } @@ -118,15 +118,15 @@ namespace } } - template - __global__ void cuSolveUpperLevelSetSplit(T* mat, + template + __global__ void cuSolveUpperLevelSetSplit(MatrixScalar* mat, int* rowIndices, int* colIndices, int* indexConversion, int startIdx, int rowsInLevelSet, - const T* dInv, - T* v) + const MatrixScalar* dInv, + LinearSolverScalar* v) { const auto reorderedRowIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x); if (reorderedRowIdx < rowsInLevelSet + startIdx) { @@ -134,13 +134,13 @@ namespace const size_t nnzIdxLim = rowIndices[reorderedRowIdx + 1]; const int naturalRowIdx = indexConversion[reorderedRowIdx]; - T rhs[blocksize] = {0}; + LinearSolverScalar rhs[blocksize] = {0}; for (int block = nnzIdx; block < nnzIdxLim; ++block) { const int col = colIndices[block]; - umv(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs); + umvMixedGeneral(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs); } - mmv(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]); + mmvMixedGeneral(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]); } } @@ -211,19 +211,24 @@ namespace } } - template - __global__ void cuComputeDiluDiagonalSplit(T* reorderedLowerMat, + // TODO: rewrite such that during the factorization there is a dInv of InputScalar type that stores intermediate results + // TOOD: The important part is to only cast after that is fully computed + template + __global__ void cuComputeDiluDiagonalSplit(const InputScalar* srcReorderedLowerMat, int* lowerRowIndices, int* lowerColIndices, - T* reorderedUpperMat, + const InputScalar* srcReorderedUpperMat, int* upperRowIndices, int* upperColIndices, - T* diagonal, + const InputScalar* srcDiagonal, int* reorderedToNatural, int* naturalToReordered, const int startIdx, int rowsInLevelSet, - T* dInv) + InputScalar* dInv, + OutputScalar* dstDiag, // TODO: should this be diag or dInv? 
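+                                               // (clarifying comment, cf. the kernel
+                                               // body: dstDiag receives the factorized
+                                               // inverse diagonal, i.e. the same blocks
+                                               // written to dInv, cast to OutputScalar)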
+ OutputScalar* dstLowerMat, + OutputScalar* dstUpperMat) { const auto reorderedRowIdx = startIdx + blockDim.x * blockIdx.x + threadIdx.x; if (reorderedRowIdx < rowsInLevelSet + startIdx) { @@ -231,10 +236,10 @@ namespace const size_t lowerRowStart = lowerRowIndices[reorderedRowIdx]; const size_t lowerRowEnd = lowerRowIndices[reorderedRowIdx + 1]; - T dInvTmp[blocksize * blocksize]; + InputScalar dInvTmp[blocksize * blocksize]; for (int i = 0; i < blocksize; ++i) { for (int j = 0; j < blocksize; ++j) { - dInvTmp[i * blocksize + j] = diagonal[reorderedRowIdx * blocksize * blocksize + i * blocksize + j]; + dInvTmp[i * blocksize + j] = srcDiagonal[reorderedRowIdx * blocksize * blocksize + i * blocksize + j]; } } @@ -250,18 +255,28 @@ namespace const int symOppositeBlock = symOppositeIdx; - mmx2Subtraction(&reorderedLowerMat[block * blocksize * blocksize], + if constexpr (copyResultToOtherMatrix) { + // TODO: think long and hard about whether this performs only the wanted memory transfers + moveBlock(&srcReorderedLowerMat[block * blocksize * blocksize], &dstLowerMat[block * blocksize * blocksize]); + moveBlock(&srcReorderedUpperMat[symOppositeBlock * blocksize * blocksize], &dstUpperMat[symOppositeBlock * blocksize * blocksize]); + } + + mmx2Subtraction(&srcReorderedLowerMat[block * blocksize * blocksize], &dInv[col * blocksize * blocksize], - &reorderedUpperMat[symOppositeBlock * blocksize * blocksize], + &srcReorderedUpperMat[symOppositeBlock * blocksize * blocksize], dInvTmp); } - invBlockInPlace(dInvTmp); + invBlockInPlace(dInvTmp); - for (int i = 0; i < blocksize; ++i) { - for (int j = 0; j < blocksize; ++j) { - dInv[reorderedRowIdx * blocksize * blocksize + i * blocksize + j] = dInvTmp[i * blocksize + j]; - } + // for (int i = 0; i < blocksize; ++i) { + // for (int j = 0; j < blocksize; ++j) { + // dInv[reorderedRowIdx * blocksize * blocksize + i * blocksize + j] = dInvTmp[i * blocksize + j]; + // } + // } + moveBlock(dInvTmp, &dInv[reorderedRowIdx * blocksize * blocksize]); + if constexpr (copyResultToOtherMatrix) { + moveBlock(dInvTmp, &dstDiag[reorderedRowIdx * blocksize * blocksize]); // important! 
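+                // (clarifying comment) dInvTmp is cast only here, after the full
+                // elimination, so every intermediate update happens in InputScalar
+                // precision; the OutputScalar copies exist purely to make the solve
+                // phase cheaper.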
} } } @@ -289,23 +304,23 @@ solveLowerLevelSet(T* reorderedMat, } -template +template void -solveLowerLevelSetSplit(T* reorderedMat, +solveLowerLevelSetSplit(MatrixScalar* reorderedMat, int* rowIndices, int* colIndices, int* indexConversion, int startIdx, int rowsInLevelSet, - const T* dInv, - const T* d, - T* v, + const MatrixScalar* dInv, + const LinearSolverScalar* d, + LinearSolverScalar* v, int thrBlockSize) { int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize( - cuSolveLowerLevelSetSplit, thrBlockSize); + cuSolveLowerLevelSetSplit, thrBlockSize); int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize); - cuSolveLowerLevelSetSplit<<>>( + cuSolveLowerLevelSetSplit<<>>( reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, d, v); } // perform the upper solve for all rows in the same level set @@ -328,22 +343,22 @@ solveUpperLevelSet(T* reorderedMat, reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v); } -template +template void -solveUpperLevelSetSplit(T* reorderedMat, +solveUpperLevelSetSplit(MatrixScalar* reorderedMat, int* rowIndices, int* colIndices, int* indexConversion, int startIdx, int rowsInLevelSet, - const T* dInv, - T* v, + const MatrixScalar* dInv, + LinearSolverScalar* v, int thrBlockSize) { int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize( - cuSolveUpperLevelSetSplit, thrBlockSize); + cuSolveUpperLevelSetSplit, thrBlockSize); int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize); - cuSolveUpperLevelSetSplit<<>>( + cuSolveUpperLevelSetSplit<<>>( reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v); } @@ -376,51 +391,62 @@ computeDiluDiagonal(T* reorderedMat, } } -template +template void -computeDiluDiagonalSplit(T* reorderedLowerMat, +computeDiluDiagonalSplit(const InputScalar* srcReorderedLowerMat, int* lowerRowIndices, int* lowerColIndices, - T* reorderedUpperMat, + const InputScalar* srcReorderedUpperMat, int* upperRowIndices, int* upperColIndices, - T* diagonal, + const InputScalar* srcDiagonal, int* reorderedToNatural, int* naturalToReordered, const int startIdx, int rowsInLevelSet, - T* dInv, + InputScalar* dInv, + OutputScalar* dstDiag, + OutputScalar* dstLowerMat, + OutputScalar* dstUpperMat, int thrBlockSize) { if (blocksize <= 3) { int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize( - cuComputeDiluDiagonalSplit, thrBlockSize); + cuComputeDiluDiagonalSplit, thrBlockSize); int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize); - cuComputeDiluDiagonalSplit<<>>(reorderedLowerMat, + cuComputeDiluDiagonalSplit<<>>(srcReorderedLowerMat, lowerRowIndices, lowerColIndices, - reorderedUpperMat, + srcReorderedUpperMat, upperRowIndices, upperColIndices, - diagonal, + srcDiagonal, reorderedToNatural, naturalToReordered, startIdx, rowsInLevelSet, - dInv); + dInv, + dstDiag, + dstLowerMat, + dstUpperMat); } else { OPM_THROW(std::invalid_argument, "Inverting diagonal is not implemented for blocksizes > 3"); } } +// TODO: format #define INSTANTIATE_KERNEL_WRAPPERS(T, blocksize) \ template void computeDiluDiagonal(T*, int*, int*, int*, int*, const int, int, T*, int); \ - template void computeDiluDiagonalSplit( \ - T*, int*, int*, T*, int*, int*, T*, int*, int*, const int, int, T*, int); \ + template void computeDiluDiagonalSplit( \ + const T*, int*, int*, const T*, int*, 
int*, const T*, int*, int*, const int, int, T*, double*, double*, double*, int); \ + template void computeDiluDiagonalSplit( \ + const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, float*, float*, float*, int); \ + template void computeDiluDiagonalSplit( \ + const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, float*, float*, float*, int); \ + template void computeDiluDiagonalSplit( \ + const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, double*, double*, double*, int); \ template void solveUpperLevelSet(T*, int*, int*, int*, int, int, const T*, T*, int); \ - template void solveLowerLevelSet(T*, int*, int*, int*, int, int, const T*, const T*, T*, int); \ - template void solveUpperLevelSetSplit(T*, int*, int*, int*, int, int, const T*, T*, int); \ - template void solveLowerLevelSetSplit(T*, int*, int*, int*, int, int, const T*, const T*, T*, int); + template void solveLowerLevelSet(T*, int*, int*, int*, int, int, const T*, const T*, T*, int); INSTANTIATE_KERNEL_WRAPPERS(float, 1); INSTANTIATE_KERNEL_WRAPPERS(float, 2); @@ -434,4 +460,30 @@ INSTANTIATE_KERNEL_WRAPPERS(double, 3); INSTANTIATE_KERNEL_WRAPPERS(double, 4); INSTANTIATE_KERNEL_WRAPPERS(double, 5); INSTANTIATE_KERNEL_WRAPPERS(double, 6); + +#define INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, LinearSolverScalar, MatrixScalar) \ + template void solveUpperLevelSetSplit( \ + MatrixScalar*, int*, int*, int*, int, int, const MatrixScalar*, LinearSolverScalar*, int); \ + template void solveLowerLevelSetSplit( \ + MatrixScalar*, int*, int*, int*, int, int, const MatrixScalar*, const LinearSolverScalar*, LinearSolverScalar*, int); + +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(1, float, float); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(2, float, float); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(3, float, float); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(4, float, float); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(5, float, float); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(6, float, float); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(1, double, double); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(2, double, double); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(3, double, double); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(4, double, double); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(5, double, double); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(6, double, double); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(1, double, float); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(2, double, float); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(3, double, float); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(4, double, float); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(5, double, float); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(6, double, float); + } // namespace Opm::gpuistl::detail::DILU diff --git a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.hpp b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.hpp index 74a7e78fe95..44b94c2f178 100644 --- a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.hpp +++ b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.hpp @@ -71,16 +71,16 @@ void solveLowerLevelSet(T* reorderedMat, * @param d Stores the defect * @param [out] v Will store the results of the lower solve */ -template -void solveLowerLevelSetSplit(T* reorderedUpperMat, +template +void solveLowerLevelSetSplit(MatrixScalar* reorderedUpperMat, int* rowIndices, int* colIndices, int* indexConversion, int startIdx, int rowsInLevelSet, - const T* dInv, - const T* d, - T* v, + const MatrixScalar* dInv, + const 
LinearSolverScalar* d, + LinearSolverScalar* v, int threadBlockSize); /** @@ -124,15 +124,15 @@ void solveUpperLevelSet(T* reorderedMat, * @param [out] v Will store the results of the lower solve. To begin with it should store the output from the lower * solve */ -template -void solveUpperLevelSetSplit(T* reorderedUpperMat, +template +void solveUpperLevelSetSplit(MatrixScalar* reorderedUpperMat, int* rowIndices, int* colIndices, int* indexConversion, int startIdx, int rowsInLevelSet, - const T* dInv, - T* v, + const MatrixScalar* dInv, + LinearSolverScalar* v, int threadBlockSize); /** @@ -161,7 +161,6 @@ void computeDiluDiagonal(T* reorderedMat, int rowsInLevelSet, T* dInv, int threadBlockSize); -template /** * @brief Computes the ILU0 of the diagonal elements of the split reordered matrix and stores it in a reordered vector @@ -184,18 +183,22 @@ template * function * @param [out] dInv The diagonal matrix used by the Diagonal ILU preconditioner */ -void computeDiluDiagonalSplit(T* reorderedLowerMat, +template +void computeDiluDiagonalSplit(const InputScalar* srcReorderedLowerMat, int* lowerRowIndices, int* lowerColIndices, - T* reorderedUpperMat, + const InputScalar* srcReorderedUpperMat, int* upperRowIndices, int* upperColIndices, - T* diagonal, + const InputScalar* srcDiagonal, int* reorderedToNatural, int* naturalToReordered, int startIdx, int rowsInLevelSet, - T* dInv, + InputScalar* dInv, + OutputScalar* dstDiagonal, + OutputScalar* dstLowerMat, + OutputScalar* dstUpperMat, int threadBlockSize); } // namespace Opm::gpuistl::detail::DILU From 76d68e7f471ea666a25e995f6a5fabaca145600d Mon Sep 17 00:00:00 2001 From: Tobias Meyer Andersen Date: Wed, 16 Oct 2024 15:37:16 +0200 Subject: [PATCH 3/6] Add MP version of DILU with double on diagonal --- CMakeLists_files.cmake | 1 + .../linalg/PreconditionerFactory_impl.hpp | 12 +- opm/simulators/linalg/gpuistl/GpuDILU.cpp | 71 +++++++++-- opm/simulators/linalg/gpuistl/GpuDILU.hpp | 5 +- .../linalg/gpuistl/detail/kernelEnums.hpp | 46 +++++++ .../preconditionerKernels/DILUKernels.cu | 120 +++++++++++------- .../preconditionerKernels/DILUKernels.hpp | 11 +- 7 files changed, 194 insertions(+), 72 deletions(-) create mode 100644 opm/simulators/linalg/gpuistl/detail/kernelEnums.hpp diff --git a/CMakeLists_files.cmake b/CMakeLists_files.cmake index 94c5bb4e4ab..81ba61f4aa2 100644 --- a/CMakeLists_files.cmake +++ b/CMakeLists_files.cmake @@ -253,6 +253,7 @@ if (HAVE_CUDA) ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/preconditionerKernels/DILUKernels.cu) ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/preconditionerKernels/ILU0Kernels.cu) ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/preconditionerKernels/JacKernels.cu) + ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/kernelEnums.hpp) ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg GpuVector.cpp) ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg GpuView.cpp) ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/vector_operations.cu) diff --git a/opm/simulators/linalg/PreconditionerFactory_impl.hpp b/opm/simulators/linalg/PreconditionerFactory_impl.hpp index 0c905e9b2fb..e5aba601726 100644 --- a/opm/simulators/linalg/PreconditionerFactory_impl.hpp +++ b/opm/simulators/linalg/PreconditionerFactory_impl.hpp @@ -349,10 +349,10 @@ struct StandardPreconditioners { F::addCreator("GPUDILU", [](const O& op, [[maybe_unused]] const P& prm, const std::function&, std::size_t, const C& comm) { 
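+                // (clarifying comment) mixed_precision_scheme maps onto the
+                // MixedPrecisionScheme enum from kernelEnums.hpp: 0 = DEFAULT,
+                // 1 = STORE_ENTIRE_FACTORIZATION_AS_FLOAT,
+                // 2 = STORE_ONLY_FACTORIZED_DIAGONAL_AS_DOUBLE.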
const bool split_matrix = prm.get("split_matrix", true); const bool tune_gpu_kernels = prm.get("tune_gpu_kernels", true); - const bool store_factorization_as_float = prm.get("store_factorization_as_float", false); + const int mixed_precision_scheme = prm.get("mixed_precision_scheme", 0); using field_type = typename V::field_type; using GpuDILU = typename gpuistl::GpuDILU, gpuistl::GpuVector>; - auto gpuDILU = std::make_shared(op.getmat(), split_matrix, tune_gpu_kernels, store_factorization_as_float); + auto gpuDILU = std::make_shared(op.getmat(), split_matrix, tune_gpu_kernels, mixed_precision_scheme); auto adapted = std::make_shared>(gpuDILU); auto wrapped = std::make_shared>(adapted, comm); @@ -632,16 +632,16 @@ struct StandardPreconditioners { F::addCreator("GPUDILU", [](const O& op, [[maybe_unused]] const P& prm, const std::function&, std::size_t) { const bool split_matrix = prm.get("split_matrix", true); const bool tune_gpu_kernels = prm.get("tune_gpu_kernels", true); - const bool store_factorization_as_float = prm.get("store_factorization_as_float", false); + const int mixed_precision_scheme = prm.get("mixed_precision_scheme", 0); using field_type = typename V::field_type; using GPUDILU = typename gpuistl::GpuDILU, gpuistl::GpuVector>; - return std::make_shared>(std::make_shared(op.getmat(), split_matrix, tune_gpu_kernels, store_factorization_as_float)); + return std::make_shared>(std::make_shared(op.getmat(), split_matrix, tune_gpu_kernels, mixed_precision_scheme)); }); F::addCreator("GPUDILUFloat", [](const O& op, [[maybe_unused]] const P& prm, const std::function&, std::size_t) { const bool split_matrix = prm.get("split_matrix", true); const bool tune_gpu_kernels = prm.get("tune_gpu_kernels", true); - const bool store_factorization_as_float = prm.get("store_factorization_as_float", false); + const int mixed_precision_scheme = prm.get("mixed_precision_scheme", 0); using block_type = typename V::block_type; using VTo = Dune::BlockVector>; @@ -650,7 +650,7 @@ struct StandardPreconditioners { using Adapter = typename gpuistl::PreconditionerAdapter; using Converter = typename gpuistl::PreconditionerConvertFieldTypeAdapter; auto converted = std::make_shared(op.getmat()); - auto adapted = std::make_shared(std::make_shared(converted->getConvertedMatrix(), split_matrix, tune_gpu_kernels, store_factorization_as_float)); + auto adapted = std::make_shared(std::make_shared(converted->getConvertedMatrix(), split_matrix, tune_gpu_kernels, mixed_precision_scheme)); converted->setUnderlyingPreconditioner(adapted); return converted; }); diff --git a/opm/simulators/linalg/gpuistl/GpuDILU.cpp b/opm/simulators/linalg/gpuistl/GpuDILU.cpp index d1bcdf1bf77..2fc96cc819c 100644 --- a/opm/simulators/linalg/gpuistl/GpuDILU.cpp +++ b/opm/simulators/linalg/gpuistl/GpuDILU.cpp @@ -41,7 +41,7 @@ namespace Opm::gpuistl { template -GpuDILU::GpuDILU(const M& A, bool splitMatrix, bool tuneKernels, bool storeFactorizationAsFloat) +GpuDILU::GpuDILU(const M& A, bool splitMatrix, bool tuneKernels, int mixedPrecisionScheme) : m_cpuMatrix(A) , m_levelSets(Opm::getMatrixRowColoring(m_cpuMatrix, Opm::ColoringType::LOWER)) , m_reorderedToNatural(detail::createReorderedToNatural(m_levelSets)) @@ -52,9 +52,12 @@ GpuDILU::GpuDILU(const M& A, bool splitMatrix, bool tuneKernels, boo , m_gpuDInv(m_gpuMatrix.N() * m_gpuMatrix.blockSize() * m_gpuMatrix.blockSize()) , m_splitMatrix(splitMatrix) , m_tuneThreadBlockSizes(tuneKernels) - , m_storeFactorizationAsFloat(storeFactorizationAsFloat) - { + + 
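+    // (clarifying comment) Validate the integer read from the property tree
+    // before casting it to the MixedPrecisionScheme enum.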
OPM_ERROR_IF(!isValidMixedPrecisionScheme(mixedPrecisionScheme), + fmt::format("Invalid mixed precision scheme chosen: {}", mixedPrecisionScheme)); + m_mixedPrecisionScheme = static_cast(mixedPrecisionScheme); + // TODO: Should in some way verify that this matrix is symmetric, only do it debug mode? // Some sanity check OPM_ERROR_IF(A.N() != m_gpuMatrix.N(), @@ -82,14 +85,16 @@ GpuDILU::GpuDILU(const M& A, bool splitMatrix, bool tuneKernels, boo m_cpuMatrix, m_reorderedToNatural); } - if (m_storeFactorizationAsFloat) { + if (!m_mixedPrecisionScheme == MixedPrecisionScheme::DEFAULT) { if (!m_splitMatrix){ OPM_THROW(std::runtime_error, "Matrix must be split when storing as float."); } m_gpuMatrixReorderedLowerFloat = std::make_unique(m_gpuMatrixReorderedLower->getRowIndices(), m_gpuMatrixReorderedLower->getColumnIndices(), blocksize_); m_gpuMatrixReorderedUpperFloat = std::make_unique(m_gpuMatrixReorderedUpper->getRowIndices(), m_gpuMatrixReorderedUpper->getColumnIndices(), blocksize_); - m_gpuMatrixReorderedDiagFloat = std::make_unique(m_gpuMatrix.N() * m_gpuMatrix.blockSize() * m_gpuMatrix.blockSize()); - m_gpuDInvFloat = std::make_unique(m_gpuMatrix.N() * m_gpuMatrix.blockSize() * m_gpuMatrix.blockSize()); + // The MixedPrecisionScheme::STORE_ONLY_FACTORIZED_DIAGONAL_AS_DOUBLE does not need to allocate this float vector + if (m_mixedPrecisionScheme == MixedPrecisionScheme::STORE_ENTIRE_FACTORIZATION_AS_FLOAT) { + m_gpuDInvFloat = std::make_unique(m_gpuMatrix.N() * m_gpuMatrix.blockSize() * m_gpuMatrix.blockSize()); + } } computeDiagAndMoveReorderedData(m_moveThreadBlockSize, m_DILUFactorizationThreadBlockSize); @@ -123,7 +128,7 @@ GpuDILU::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int for (int level = 0; level < m_levelSets.size(); ++level) { const int numOfRowsInLevel = m_levelSets[level].size(); if (m_splitMatrix) { - if (m_storeFactorizationAsFloat) { + if (m_mixedPrecisionScheme == MixedPrecisionScheme::STORE_ENTIRE_FACTORIZATION_AS_FLOAT) { detail::DILU::solveLowerLevelSetSplit( m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(), m_gpuMatrixReorderedLowerFloat->getRowIndices().data(), @@ -135,6 +140,18 @@ GpuDILU::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int d.data(), v.data(), lowerSolveThreadBlockSize); + }else if (m_mixedPrecisionScheme == MixedPrecisionScheme::STORE_ONLY_FACTORIZED_DIAGONAL_AS_DOUBLE) { + detail::DILU::solveLowerLevelSetSplit( + m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(), + m_gpuMatrixReorderedLowerFloat->getRowIndices().data(), + m_gpuMatrixReorderedLowerFloat->getColumnIndices().data(), + m_gpuReorderToNatural.data(), + levelStartIdx, + numOfRowsInLevel, + m_gpuDInv.data(), + d.data(), + v.data(), + lowerSolveThreadBlockSize); } else { detail::DILU::solveLowerLevelSetSplit( m_gpuMatrixReorderedLower->getNonZeroValues().data(), @@ -170,7 +187,7 @@ GpuDILU::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int const int numOfRowsInLevel = m_levelSets[level].size(); levelStartIdx -= numOfRowsInLevel; if (m_splitMatrix) { - if (m_storeFactorizationAsFloat){ + if (m_mixedPrecisionScheme == MixedPrecisionScheme::STORE_ENTIRE_FACTORIZATION_AS_FLOAT){ detail::DILU::solveUpperLevelSetSplit( m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(), m_gpuMatrixReorderedUpperFloat->getRowIndices().data(), @@ -181,6 +198,17 @@ GpuDILU::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int m_gpuDInvFloat->data(), v.data(), upperSolveThreadBlockSize); + } else if (m_mixedPrecisionScheme == 
MixedPrecisionScheme::STORE_ONLY_FACTORIZED_DIAGONAL_AS_DOUBLE){ + detail::DILU::solveUpperLevelSetSplit( + m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(), + m_gpuMatrixReorderedUpperFloat->getRowIndices().data(), + m_gpuMatrixReorderedUpperFloat->getColumnIndices().data(), + m_gpuReorderToNatural.data(), + levelStartIdx, + numOfRowsInLevel, + m_gpuDInv.data(), + v.data(), + upperSolveThreadBlockSize); } else { detail::DILU::solveUpperLevelSetSplit( m_gpuMatrixReorderedUpper->getNonZeroValues().data(), @@ -271,8 +299,9 @@ GpuDILU::computeDiagAndMoveReorderedData(int moveThreadBlockSize, in for (int level = 0; level < m_levelSets.size(); ++level) { const int numOfRowsInLevel = m_levelSets[level].size(); if (m_splitMatrix) { - if (m_storeFactorizationAsFloat) { - detail::DILU::computeDiluDiagonalSplit( + if (m_mixedPrecisionScheme == MixedPrecisionScheme::STORE_ENTIRE_FACTORIZATION_AS_FLOAT) { + // printf("update: MixedPrecisionScheme::STORE_ENTIRE_FACTORIZATION_AS_FLOAT\n"); + detail::DILU::computeDiluDiagonalSplit( m_gpuMatrixReorderedLower->getNonZeroValues().data(), m_gpuMatrixReorderedLower->getRowIndices().data(), m_gpuMatrixReorderedLower->getColumnIndices().data(), @@ -289,9 +318,29 @@ GpuDILU::computeDiagAndMoveReorderedData(int moveThreadBlockSize, in m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(), m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(), factorizationBlockSize); + } else if (m_mixedPrecisionScheme == MixedPrecisionScheme::STORE_ONLY_FACTORIZED_DIAGONAL_AS_DOUBLE) { + // printf("update: MixedPrecisionScheme::STORE_ONLY_FACTORIZED_DIAGONAL_AS_DOUBLE\n"); + detail::DILU::computeDiluDiagonalSplit( + m_gpuMatrixReorderedLower->getNonZeroValues().data(), + m_gpuMatrixReorderedLower->getRowIndices().data(), + m_gpuMatrixReorderedLower->getColumnIndices().data(), + m_gpuMatrixReorderedUpper->getNonZeroValues().data(), + m_gpuMatrixReorderedUpper->getRowIndices().data(), + m_gpuMatrixReorderedUpper->getColumnIndices().data(), + m_gpuMatrixReorderedDiag->data(), + m_gpuReorderToNatural.data(), + m_gpuNaturalToReorder.data(), + levelStartIdx, + numOfRowsInLevel, + m_gpuDInv.data(), + nullptr, + m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(), + m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(), + factorizationBlockSize); } else { + // printf("update: MixedPrecisionScheme::DEFAULT\n"); // TODO: should this be field type twice or field type then float in the template? - detail::DILU::computeDiluDiagonalSplit( + detail::DILU::computeDiluDiagonalSplit( m_gpuMatrixReorderedLower->getNonZeroValues().data(), m_gpuMatrixReorderedLower->getRowIndices().data(), m_gpuMatrixReorderedLower->getColumnIndices().data(), diff --git a/opm/simulators/linalg/gpuistl/GpuDILU.hpp b/opm/simulators/linalg/gpuistl/GpuDILU.hpp index 623e151322e..40209f17b4b 100644 --- a/opm/simulators/linalg/gpuistl/GpuDILU.hpp +++ b/opm/simulators/linalg/gpuistl/GpuDILU.hpp @@ -23,6 +23,7 @@ #include #include #include +#include #include @@ -62,7 +63,7 @@ class GpuDILU : public Dune::PreconditionerWithUpdate //! \param A The matrix to operate on. //! \param w The relaxation factor. //! - explicit GpuDILU(const M& A, bool splitMatrix, bool tuneKernels, bool storeFactorizationAsFloat); + explicit GpuDILU(const M& A, bool splitMatrix, bool tuneKernels, int storeFactorizationAsFloat); //! \brief Prepare the preconditioner. //! \note Does nothing at the time being. @@ -145,7 +146,7 @@ class GpuDILU : public Dune::PreconditionerWithUpdate //! 
\brief Bool storing whether or not we will tune the threadblock sizes. Only used for AMD cards bool m_tuneThreadBlockSizes; //! \brief Bool storing whether or not we will store the factorization as float. Only used for mixed precision - bool m_storeFactorizationAsFloat; + MixedPrecisionScheme m_mixedPrecisionScheme; //! \brief variables storing the threadblocksizes to use if using the tuned sizes and AMD cards //! The default value of -1 indicates that we have not calibrated and selected a value yet int m_upperSolveThreadBlockSize = -1; diff --git a/opm/simulators/linalg/gpuistl/detail/kernelEnums.hpp b/opm/simulators/linalg/gpuistl/detail/kernelEnums.hpp new file mode 100644 index 00000000000..fea8220f65e --- /dev/null +++ b/opm/simulators/linalg/gpuistl/detail/kernelEnums.hpp @@ -0,0 +1,46 @@ +/* + Copyright 2024 SINTEF AS + + This file is part of the Open Porous Media project (OPM). + + OPM is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OPM is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with OPM. If not, see . +*/ + +#ifndef OPM_GPUISTL_PRECONDITIONER_STORAGE_OPTION_HPP +#define OPM_GPUISTL_PRECONDITIONER_STORAGE_OPTION_HPP + +/* + This file is here to organize a growing amount of different mixed precision options for the preconditioners. +*/ + +namespace Opm::gpuistl { + enum MixedPrecisionScheme { + DEFAULT, + STORE_ENTIRE_FACTORIZATION_AS_FLOAT, + STORE_ONLY_FACTORIZED_DIAGONAL_AS_DOUBLE + }; + + inline bool isValidMixedPrecisionScheme(int scheme) { + switch (scheme) { + case DEFAULT: + case STORE_ENTIRE_FACTORIZATION_AS_FLOAT: + case STORE_ONLY_FACTORIZED_DIAGONAL_AS_DOUBLE: + return true; + default: + return false; + } + } +} + +#endif // OPM_GPUISTL_PRECONDITIONER_STORAGE_OPTION_HPP diff --git a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.cu b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.cu index 050de298938..31231bfce47 100644 --- a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.cu +++ b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.cu @@ -59,14 +59,14 @@ namespace } } - template + template __global__ void cuSolveLowerLevelSetSplit(MatrixScalar* mat, int* rowIndices, int* colIndices, int* indexConversion, int startIdx, int rowsInLevelSet, - const MatrixScalar* dInv, + const DiagonalScalar* dInv, const LinearSolverScalar* d, LinearSolverScalar* v) { @@ -88,7 +88,7 @@ namespace mmvMixedGeneral(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs); } - mvMixedGeneral(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]); + mvMixedGeneral(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]); } } @@ -118,14 +118,14 @@ namespace } } - template + template __global__ void cuSolveUpperLevelSetSplit(MatrixScalar* mat, int* rowIndices, int* colIndices, int* indexConversion, int startIdx, int rowsInLevelSet, - const MatrixScalar* dInv, + const DiagonalScalar* dInv, LinearSolverScalar* v) { const auto reorderedRowIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x); @@ -140,7 
+140,7 @@ namespace umvMixedGeneral(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs); } - mmvMixedGeneral(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]); + mmvMixedGeneral(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]); } } @@ -213,7 +213,7 @@ namespace // TODO: rewrite such that during the factorization there is a dInv of InputScalar type that stores intermediate results // TOOD: The important part is to only cast after that is fully computed - template + template __global__ void cuComputeDiluDiagonalSplit(const InputScalar* srcReorderedLowerMat, int* lowerRowIndices, int* lowerColIndices, @@ -255,7 +255,7 @@ namespace const int symOppositeBlock = symOppositeIdx; - if constexpr (copyResultToOtherMatrix) { + if constexpr (mixedPrecisionScheme != MixedPrecisionScheme::DEFAULT) { // TODO: think long and hard about whether this performs only the wanted memory transfers moveBlock(&srcReorderedLowerMat[block * blocksize * blocksize], &dstLowerMat[block * blocksize * blocksize]); moveBlock(&srcReorderedUpperMat[symOppositeBlock * blocksize * blocksize], &dstUpperMat[symOppositeBlock * blocksize * blocksize]); @@ -275,7 +275,8 @@ namespace // } // } moveBlock(dInvTmp, &dInv[reorderedRowIdx * blocksize * blocksize]); - if constexpr (copyResultToOtherMatrix) { + // only if we store the entire matrix as a float should we be moving the block to the float diagonal + if constexpr (mixedPrecisionScheme == MixedPrecisionScheme::STORE_ENTIRE_FACTORIZATION_AS_FLOAT) { moveBlock(dInvTmp, &dstDiag[reorderedRowIdx * blocksize * blocksize]); // important! } } @@ -304,7 +305,7 @@ solveLowerLevelSet(T* reorderedMat, } -template +template void solveLowerLevelSetSplit(MatrixScalar* reorderedMat, int* rowIndices, @@ -312,15 +313,15 @@ solveLowerLevelSetSplit(MatrixScalar* reorderedMat, int* indexConversion, int startIdx, int rowsInLevelSet, - const MatrixScalar* dInv, + const DiagonalScalar* dInv, const LinearSolverScalar* d, LinearSolverScalar* v, int thrBlockSize) { int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize( - cuSolveLowerLevelSetSplit, thrBlockSize); + cuSolveLowerLevelSetSplit, thrBlockSize); int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize); - cuSolveLowerLevelSetSplit<<>>( + cuSolveLowerLevelSetSplit<<>>( reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, d, v); } // perform the upper solve for all rows in the same level set @@ -343,7 +344,7 @@ solveUpperLevelSet(T* reorderedMat, reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v); } -template +template void solveUpperLevelSetSplit(MatrixScalar* reorderedMat, int* rowIndices, @@ -351,14 +352,14 @@ solveUpperLevelSetSplit(MatrixScalar* reorderedMat, int* indexConversion, int startIdx, int rowsInLevelSet, - const MatrixScalar* dInv, + const DiagonalScalar* dInv, LinearSolverScalar* v, int thrBlockSize) { int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize( - cuSolveUpperLevelSetSplit, thrBlockSize); + cuSolveUpperLevelSetSplit, thrBlockSize); int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize); - cuSolveUpperLevelSetSplit<<>>( + cuSolveUpperLevelSetSplit<<>>( reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v); } @@ -391,7 +392,7 @@ computeDiluDiagonal(T* reorderedMat, } } -template +template void 
computeDiluDiagonalSplit(const InputScalar* srcReorderedLowerMat, int* lowerRowIndices, @@ -412,9 +413,9 @@ computeDiluDiagonalSplit(const InputScalar* srcReorderedLowerMat, { if (blocksize <= 3) { int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize( - cuComputeDiluDiagonalSplit, thrBlockSize); + cuComputeDiluDiagonalSplit, thrBlockSize); int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize); - cuComputeDiluDiagonalSplit<<>>(srcReorderedLowerMat, + cuComputeDiluDiagonalSplit<<>>(srcReorderedLowerMat, lowerRowIndices, lowerColIndices, srcReorderedUpperMat, @@ -437,13 +438,17 @@ computeDiluDiagonalSplit(const InputScalar* srcReorderedLowerMat, // TODO: format #define INSTANTIATE_KERNEL_WRAPPERS(T, blocksize) \ template void computeDiluDiagonal(T*, int*, int*, int*, int*, const int, int, T*, int); \ - template void computeDiluDiagonalSplit( \ + template void computeDiluDiagonalSplit( \ const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, double*, double*, double*, int); \ - template void computeDiluDiagonalSplit( \ + template void computeDiluDiagonalSplit( \ const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, float*, float*, float*, int); \ - template void computeDiluDiagonalSplit( \ + template void computeDiluDiagonalSplit( \ const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, float*, float*, float*, int); \ - template void computeDiluDiagonalSplit( \ + template void computeDiluDiagonalSplit( \ + const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, double*, double*, double*, int); \ + template void computeDiluDiagonalSplit( \ + const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, float*, float*, float*, int); \ + template void computeDiluDiagonalSplit( \ const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, double*, double*, double*, int); \ template void solveUpperLevelSet(T*, int*, int*, int*, int, int, const T*, T*, int); \ template void solveLowerLevelSet(T*, int*, int*, int*, int, int, const T*, const T*, T*, int); @@ -461,29 +466,48 @@ INSTANTIATE_KERNEL_WRAPPERS(double, 4); INSTANTIATE_KERNEL_WRAPPERS(double, 5); INSTANTIATE_KERNEL_WRAPPERS(double, 6); -#define INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, LinearSolverScalar, MatrixScalar) \ - template void solveUpperLevelSetSplit( \ - MatrixScalar*, int*, int*, int*, int, int, const MatrixScalar*, LinearSolverScalar*, int); \ - template void solveLowerLevelSetSplit( \ - MatrixScalar*, int*, int*, int*, int, int, const MatrixScalar*, const LinearSolverScalar*, LinearSolverScalar*, int); - -INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(1, float, float); -INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(2, float, float); -INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(3, float, float); -INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(4, float, float); -INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(5, float, float); -INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(6, float, float); -INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(1, double, double); -INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(2, double, double); -INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(3, double, double); -INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(4, double, double); -INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(5, double, double); -INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(6, double, double); -INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(1, double, float); -INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(2, double, float); -INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(3, 
double, float); -INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(4, double, float); -INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(5, double, float); -INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(6, double, float); +#define INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar) \ + template void solveUpperLevelSetSplit( \ + MatrixScalar*, int*, int*, int*, int, int, const DiagonalScalar*, LinearSolverScalar*, int); \ + template void solveLowerLevelSetSplit( \ + MatrixScalar*, int*, int*, int*, int, int, const DiagonalScalar*, const LinearSolverScalar*, LinearSolverScalar*, int); + +// TODO: be smarter about this... Surely this instantiates many more combinations that are actually needed +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(1, float, float, float); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(2, float, float, float); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(3, float, float, float); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(4, float, float, float); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(5, float, float, float); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(6, float, float, float); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(1, double, double, float); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(2, double, double, float); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(3, double, double, float); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(4, double, double, float); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(5, double, double, float); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(6, double, double, float); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(1, double, float, float); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(2, double, float, float); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(3, double, float, float); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(4, double, float, float); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(5, double, float, float); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(6, double, float, float); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(1, float, float, double); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(2, float, float, double); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(3, float, float, double); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(4, float, float, double); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(5, float, float, double); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(6, float, float, double); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(1, double, double, double); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(2, double, double, double); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(3, double, double, double); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(4, double, double, double); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(5, double, double, double); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(6, double, double, double); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(1, double, float, double); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(2, double, float, double); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(3, double, float, double); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(4, double, float, double); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(5, double, float, double); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(6, double, float, double); } // namespace Opm::gpuistl::detail::DILU diff --git a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.hpp b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.hpp index 44b94c2f178..6dc8f61eb93 100644 --- a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.hpp +++ b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.hpp @@ -22,6 +22,7 @@ #include #include #include +#include #include namespace Opm::gpuistl::detail::DILU @@ -71,14 +72,14 @@ void solveLowerLevelSet(T* reorderedMat, * @param d Stores the defect * @param [out] v Will store the results of 
the lower solve
 */
-template <int blocksize, class LinearSolverScalar, class MatrixScalar>
+template <int blocksize, class LinearSolverScalar, class MatrixScalar, class DiagonalScalar>
 void solveLowerLevelSetSplit(MatrixScalar* reorderedLowerMat,
 int* rowIndices,
 int* colIndices,
 int* indexConversion,
 int startIdx,
 int rowsInLevelSet,
- const MatrixScalar* dInv,
+ const DiagonalScalar* dInv,
 const LinearSolverScalar* d,
 LinearSolverScalar* v,
 int threadBlockSize);
@@ -124,14 +125,14 @@ void solveUpperLevelSet(T* reorderedMat,
 * @param [out] v Will store the results of the upper solve. To begin with it should store the output from the lower
 * solve
 */
-template <int blocksize, class LinearSolverScalar, class MatrixScalar>
+template <int blocksize, class LinearSolverScalar, class MatrixScalar, class DiagonalScalar>
 void solveUpperLevelSetSplit(MatrixScalar* reorderedUpperMat,
 int* rowIndices,
 int* colIndices,
 int* indexConversion,
 int startIdx,
 int rowsInLevelSet,
- const MatrixScalar* dInv,
+ const DiagonalScalar* dInv,
 LinearSolverScalar* v,
 int threadBlockSize);
@@ -183,7 +184,7 @@ void computeDiluDiagonal(T* reorderedMat,
 * function
 * @param [out] dInv The diagonal matrix used by the Diagonal ILU preconditioner
 */
-template
+template
 void computeDiluDiagonalSplit(const InputScalar* srcReorderedLowerMat,
 int* lowerRowIndices,
 int* lowerColIndices,

From f7da4ffe4e4a5a4f66b3b33e2c43731db4dd6fd5 Mon Sep 17 00:00:00 2001
From: Tobias Meyer Andersen
Date: Tue, 22 Oct 2024 15:19:16 +0200
Subject: [PATCH 4/6] WIP, think double diagonal is working, broken float MP

---
 .../linalg/PreconditionerFactory_impl.hpp |  8 +--
 opm/simulators/linalg/gpuistl/GpuDILU.cpp |  6 +-
 opm/simulators/linalg/gpuistl/GpuDILU.hpp |  2 +-
 opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp | 59 ++++++++++++---
 opm/simulators/linalg/gpuistl/OpmGpuILU0.hpp |  8 +--
 .../preconditionerKernels/DILUKernels.cu | 50 ++++---------
 .../preconditionerKernels/ILU0Kernels.cu | 71 ++++++++++++-------
 .../preconditionerKernels/ILU0Kernels.hpp |  7 +-
 8 files changed, 122 insertions(+), 89 deletions(-)

diff --git a/opm/simulators/linalg/PreconditionerFactory_impl.hpp b/opm/simulators/linalg/PreconditionerFactory_impl.hpp
index e5aba601726..46d8712094b 100644
--- a/opm/simulators/linalg/PreconditionerFactory_impl.hpp
+++ b/opm/simulators/linalg/PreconditionerFactory_impl.hpp
@@ -362,10 +362,10 @@ struct StandardPreconditioners {
 F::addCreator("OPMGPUILU0", [](const O& op, [[maybe_unused]] const P& prm, const std::function&, std::size_t, const C& comm) {
 const bool split_matrix = prm.get("split_matrix", true);
 const bool tune_gpu_kernels = prm.get("tune_gpu_kernels", true);
- const bool store_factorization_as_float = prm.get("store_factorization_as_float", false);
+ const int mixed_precision_scheme = prm.get("mixed_precision_scheme", 0);
 using field_type = typename V::field_type;
 using OpmGpuILU0 = typename gpuistl::OpmGpuILU0, gpuistl::GpuVector>;
- auto gpuilu0 = std::make_shared(op.getmat(), split_matrix, tune_gpu_kernels, store_factorization_as_float);
+ auto gpuilu0 = std::make_shared(op.getmat(), split_matrix, tune_gpu_kernels, mixed_precision_scheme);
 auto adapted = std::make_shared>(gpuilu0);
 auto wrapped = std::make_shared>(adapted, comm);
@@ -621,12 +621,12 @@ struct StandardPreconditioners {
 F::addCreator("OPMGPUILU0", [](const O& op, [[maybe_unused]] const P& prm, const std::function&, std::size_t) {
 const bool split_matrix = prm.get("split_matrix", true);
 const bool tune_gpu_kernels = prm.get("tune_gpu_kernels", true);
- const bool store_factorization_as_float = prm.get("store_factorization_as_float", false);
+ const int mixed_precision_scheme = prm.get("mixed_precision_scheme", 0);
 using field_type = typename V::field_type;
 using GPUILU0 = typename gpuistl::OpmGpuILU0, gpuistl::GpuVector>;
- return
std::make_shared>(std::make_shared(op.getmat(), split_matrix, tune_gpu_kernels, store_factorization_as_float));
+ return std::make_shared>(std::make_shared(op.getmat(), split_matrix, tune_gpu_kernels, mixed_precision_scheme));
 });

 F::addCreator("GPUDILU", [](const O& op, [[maybe_unused]] const P& prm, const std::function&, std::size_t) {
diff --git a/opm/simulators/linalg/gpuistl/GpuDILU.cpp b/opm/simulators/linalg/gpuistl/GpuDILU.cpp
index 2fc96cc819c..0e0f969fbcb 100644
--- a/opm/simulators/linalg/gpuistl/GpuDILU.cpp
+++ b/opm/simulators/linalg/gpuistl/GpuDILU.cpp
@@ -129,7 +129,7 @@ GpuDILU::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
 const int numOfRowsInLevel = m_levelSets[level].size();
 if (m_splitMatrix) {
 if (m_mixedPrecisionScheme == MixedPrecisionScheme::STORE_ENTIRE_FACTORIZATION_AS_FLOAT) {
- detail::DILU::solveLowerLevelSetSplit(
+ detail::DILU::solveLowerLevelSetSplit(
 m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(),
 m_gpuMatrixReorderedLowerFloat->getRowIndices().data(),
 m_gpuMatrixReorderedLowerFloat->getColumnIndices().data(),
@@ -141,7 +141,7 @@ GpuDILU::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
 v.data(),
 lowerSolveThreadBlockSize);
 }else if (m_mixedPrecisionScheme == MixedPrecisionScheme::STORE_ONLY_FACTORIZED_DIAGONAL_AS_DOUBLE) {
- detail::DILU::solveLowerLevelSetSplit(
+ detail::DILU::solveLowerLevelSetSplit(
 m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(),
 m_gpuMatrixReorderedLowerFloat->getRowIndices().data(),
 m_gpuMatrixReorderedLowerFloat->getColumnIndices().data(),
@@ -153,7 +153,7 @@ GpuDILU::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
 v.data(),
 lowerSolveThreadBlockSize);
 } else {
- detail::DILU::solveLowerLevelSetSplit(
+ detail::DILU::solveLowerLevelSetSplit(
 m_gpuMatrixReorderedLower->getNonZeroValues().data(),
 m_gpuMatrixReorderedLower->getRowIndices().data(),
 m_gpuMatrixReorderedLower->getColumnIndices().data(),
diff --git a/opm/simulators/linalg/gpuistl/GpuDILU.hpp b/opm/simulators/linalg/gpuistl/GpuDILU.hpp
index 40209f17b4b..86ad4aa4dc4 100644
--- a/opm/simulators/linalg/gpuistl/GpuDILU.hpp
+++ b/opm/simulators/linalg/gpuistl/GpuDILU.hpp
@@ -63,7 +63,7 @@ class GpuDILU : public Dune::PreconditionerWithUpdate
 //! \param A The matrix to operate on.
 //! \param splitMatrix Whether to store the lower, diagonal and upper parts of the matrix separately.
 //! \param tuneKernels Whether to tune the GPU kernel thread block sizes.
 //! \param mixedPrecisionScheme Which mixed precision scheme to use when storing the factorization.
 //!
- explicit GpuDILU(const M& A, bool splitMatrix, bool tuneKernels, int storeFactorizationAsFloat);
+ explicit GpuDILU(const M& A, bool splitMatrix, bool tuneKernels, int mixedPrecisionScheme);
 //! \brief Prepare the preconditioner.
 //! \note Does nothing at the time being.
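A note on the MixedPrecisionScheme values used throughout these hunks: the enum and the isValidMixedPrecisionScheme() helper live in a header that is not part of this series (it is pulled in by the #include lines added to the hpp files). The following is a minimal sketch of what that header must provide, inferred purely from the call sites in GpuDILU.cpp above and OpmGpuILU0.cpp below; the exact header path, namespace, and integer values are assumptions:

    // Sketch only -- inferred from usage such as
    //   m_mixedPrecisionScheme = static_cast<MixedPrecisionScheme>(mixedPrecisionScheme);
    // and the OPM_ERROR_IF validation in the OpmGpuILU0 constructor below.
    enum class MixedPrecisionScheme : int {
        DEFAULT = 0,                                  // factorization kept entirely in the matrix scalar type
        STORE_ENTIRE_FACTORIZATION_AS_FLOAT = 1,      // L, U and the inverted diagonal all stored as float
        STORE_ONLY_FACTORIZED_DIAGONAL_AS_DOUBLE = 2  // L and U as float, inverted diagonal kept in double
    };

    // Rejects integers that do not correspond to one of the three schemes above.
    inline bool isValidMixedPrecisionScheme(int scheme)
    {
        return scheme >= 0 && scheme <= 2;
    }

Since the factory reads mixed_precision_scheme with a default of 0, DEFAULT preserves the previous full-precision behaviour, while 1 and 2 select the two mixed precision variants exercised in apply() and in the factorization kernels.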
diff --git a/opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp b/opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp index 24ba06ecb06..99022b2f6db 100644 --- a/opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp +++ b/opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp @@ -41,7 +41,7 @@ namespace Opm::gpuistl { template -OpmGpuILU0::OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernels, bool storeFactorizationAsFloat) +OpmGpuILU0::OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernels, int mixedPrecisionScheme) : m_cpuMatrix(A) , m_levelSets(Opm::getMatrixRowColoring(m_cpuMatrix, Opm::ColoringType::LOWER)) , m_reorderedToNatural(detail::createReorderedToNatural(m_levelSets)) @@ -56,8 +56,12 @@ OpmGpuILU0::OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernel , m_gpuDInv(m_gpuMatrix.N() * m_gpuMatrix.blockSize() * m_gpuMatrix.blockSize()) , m_splitMatrix(splitMatrix) , m_tuneThreadBlockSizes(tuneKernels) - , m_storeFactorizationAsFloat(storeFactorizationAsFloat) { + + OPM_ERROR_IF(!isValidMixedPrecisionScheme(mixedPrecisionScheme), + fmt::format("Invalid mixed precision scheme chosen: {}", mixedPrecisionScheme)); + m_mixedPrecisionScheme = static_cast(mixedPrecisionScheme); + // TODO: Should in some way verify that this matrix is symmetric, only do it debug mode? // Some sanity check OPM_ERROR_IF(A.N() != m_gpuMatrix.N(), @@ -85,13 +89,16 @@ OpmGpuILU0::OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernel m_cpuMatrix, m_reorderedToNatural); } - if (m_storeFactorizationAsFloat){ + if (!m_mixedPrecisionScheme == MixedPrecisionScheme::DEFAULT){ OPM_ERROR_IF(!m_splitMatrix, "Mixed precision GpuILU0 is currently only supported when using split_matrix=true"); // initialize mixed precision datastructures m_gpuMatrixReorderedLowerFloat = std::make_unique(m_gpuMatrixReorderedLower->getRowIndices(), m_gpuMatrixReorderedLower->getColumnIndices(), blocksize_); m_gpuMatrixReorderedUpperFloat = std::make_unique(m_gpuMatrixReorderedUpper->getRowIndices(), m_gpuMatrixReorderedUpper->getColumnIndices(), blocksize_); - m_gpuMatrixReorderedDiagFloat.emplace(GpuVector(m_gpuMatrix.N() * m_gpuMatrix.blockSize() * m_gpuMatrix.blockSize())); + // The MixedPrecisionScheme::STORE_ONLY_FACTORIZED_DIAGONAL_AS_DOUBLE does not need to allocate this float vector + if (m_mixedPrecisionScheme == MixedPrecisionScheme::STORE_ENTIRE_FACTORIZATION_AS_FLOAT) { + m_gpuMatrixReorderedDiagFloat.emplace(GpuVector(m_gpuMatrix.N() * m_gpuMatrix.blockSize() * m_gpuMatrix.blockSize())); + } } LUFactorizeAndMoveData(m_moveThreadBlockSize, m_ILU0FactorizationThreadBlockSize); @@ -128,7 +135,7 @@ OpmGpuILU0::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i for (int level = 0; level < m_levelSets.size(); ++level) { const int numOfRowsInLevel = m_levelSets[level].size(); if (m_splitMatrix) { - if (m_storeFactorizationAsFloat){ + if (m_mixedPrecisionScheme != MixedPrecisionScheme::DEFAULT) { detail::ILU0::solveLowerLevelSetSplit( m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(), m_gpuMatrixReorderedLowerFloat->getRowIndices().data(), @@ -171,8 +178,8 @@ OpmGpuILU0::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i const int numOfRowsInLevel = m_levelSets[level].size(); levelStartIdx -= numOfRowsInLevel; if (m_splitMatrix) { - if (m_storeFactorizationAsFloat) { - detail::ILU0::solveUpperLevelSetSplit( + if (m_mixedPrecisionScheme == MixedPrecisionScheme::STORE_ENTIRE_FACTORIZATION_AS_FLOAT) { + detail::ILU0::solveUpperLevelSetSplit( m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(), 
m_gpuMatrixReorderedUpperFloat->getRowIndices().data(),
 m_gpuMatrixReorderedUpperFloat->getColumnIndices().data(),
@@ -183,8 +190,20 @@ OpmGpuILU0::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i
 v.data(),
 upperSolveThreadBlockSize);
 }
+ if (m_mixedPrecisionScheme == MixedPrecisionScheme::STORE_ENTIRE_FACTORIZATION_AS_FLOAT) {
+ detail::ILU0::solveUpperLevelSetSplit(
+ m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(),
+ m_gpuMatrixReorderedUpperFloat->getRowIndices().data(),
+ m_gpuMatrixReorderedUpperFloat->getColumnIndices().data(),
+ m_gpuReorderToNatural.data(),
+ levelStartIdx,
+ numOfRowsInLevel,
+ m_gpuMatrixReorderedDiag.value().data(),
+ v.data(),
+ upperSolveThreadBlockSize);
+ }
 else{
- detail::ILU0::solveUpperLevelSetSplit(
+ detail::ILU0::solveUpperLevelSetSplit(
 m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
 m_gpuMatrixReorderedUpper->getRowIndices().data(),
 m_gpuMatrixReorderedUpper->getColumnIndices().data(),
@@ -272,8 +291,8 @@ OpmGpuILU0::LUFactorizeAndMoveData(int moveThreadBlockSize, int fact
 const int numOfRowsInLevel = m_levelSets[level].size();
 if (m_splitMatrix) {
- if (m_storeFactorizationAsFloat){
- detail::ILU0::LUFactorizationSplit(
+ if (m_mixedPrecisionScheme == MixedPrecisionScheme::STORE_ENTIRE_FACTORIZATION_AS_FLOAT){
+ detail::ILU0::LUFactorizationSplit(
 m_gpuMatrixReorderedLower->getNonZeroValues().data(),
 m_gpuMatrixReorderedLower->getRowIndices().data(),
 m_gpuMatrixReorderedLower->getColumnIndices().data(),
@@ -290,8 +309,26 @@ OpmGpuILU0::LUFactorizeAndMoveData(int moveThreadBlockSize, int fact
 numOfRowsInLevel,
 factorizationThreadBlockSize);
 }
+ else if (m_mixedPrecisionScheme == MixedPrecisionScheme::STORE_ONLY_FACTORIZED_DIAGONAL_AS_DOUBLE){
+ detail::ILU0::LUFactorizationSplit(
+ m_gpuMatrixReorderedLower->getNonZeroValues().data(),
+ m_gpuMatrixReorderedLower->getRowIndices().data(),
+ m_gpuMatrixReorderedLower->getColumnIndices().data(),
+ m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
+ m_gpuMatrixReorderedUpper->getRowIndices().data(),
+ m_gpuMatrixReorderedUpper->getColumnIndices().data(),
+ m_gpuMatrixReorderedDiag.value().data(),
+ m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(),
+ m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(),
+ nullptr,
+ m_gpuReorderToNatural.data(),
+ m_gpuNaturalToReorder.data(),
+ levelStartIdx,
+ numOfRowsInLevel,
+ factorizationThreadBlockSize);
+ }
 else{
- detail::ILU0::LUFactorizationSplit(
+ detail::ILU0::LUFactorizationSplit(
 m_gpuMatrixReorderedLower->getNonZeroValues().data(),
 m_gpuMatrixReorderedLower->getRowIndices().data(),
 m_gpuMatrixReorderedLower->getColumnIndices().data(),
diff --git a/opm/simulators/linalg/gpuistl/OpmGpuILU0.hpp b/opm/simulators/linalg/gpuistl/OpmGpuILU0.hpp
index 025be1f5bf5..c9453a25a9c 100644
--- a/opm/simulators/linalg/gpuistl/OpmGpuILU0.hpp
+++ b/opm/simulators/linalg/gpuistl/OpmGpuILU0.hpp
@@ -24,6 +24,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -64,7 +65,7 @@ class OpmGpuILU0 : public Dune::PreconditionerWithUpdate
 //! \param A The matrix to operate on.
 //! \param splitMatrix Whether to store the lower, diagonal and upper parts of the matrix separately.
 //! \param tuneKernels Whether to tune the GPU kernel thread block sizes.
 //! \param mixedPrecisionScheme Which mixed precision scheme to use when storing the factorization.
 //!
- explicit OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernels, bool storeFactorizationAsFloat);
+ explicit OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernels, int mixedPrecisionScheme);
 //! \brief Prepare the preconditioner.
 //! \note Does nothing at the time being.
@@ -143,9 +144,8 @@ class OpmGpuILU0 : public Dune::PreconditionerWithUpdate
 bool m_splitMatrix;
 //!
\brief Bool storing whether or not we will tune the threadblock sizes. Only used for AMD cards
 bool m_tuneThreadBlockSizes;
- //! \brief Bool storing whether or not we should store the ILU factorization in a float datastructure.
- //! This uses a mixed precision preconditioner to trade numerical accuracy for memory transfer speed.
- bool m_storeFactorizationAsFloat;
+ //! \brief The mixed precision scheme used when storing the factorization (DEFAULT means no mixed precision)
+ MixedPrecisionScheme m_mixedPrecisionScheme;
 //! \brief variables storing the threadblocksizes to use if using the tuned sizes and AMD cards
 //! The default value of -1 indicates that we have not calibrated and selected a value yet
 int m_upperSolveThreadBlockSize = -1;
diff --git a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.cu b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.cu
index 31231bfce47..eb997a7e987 100644
--- a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.cu
+++ b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.cu
@@ -473,41 +473,19 @@ INSTANTIATE_KERNEL_WRAPPERS(double, 6);
 MatrixScalar*, int*, int*, int*, int, int, const DiagonalScalar*, const LinearSolverScalar*, LinearSolverScalar*, int);
 // TODO: be smarter about this... Surely this instantiates many more combinations than are actually needed
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(1, float, float, float);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(2, float, float, float);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(3, float, float, float);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(4, float, float, float);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(5, float, float, float);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(6, float, float, float);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(1, double, double, float);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(2, double, double, float);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(3, double, double, float);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(4, double, double, float);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(5, double, double, float);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(6, double, double, float);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(1, double, float, float);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(2, double, float, float);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(3, double, float, float);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(4, double, float, float);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(5, double, float, float);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(6, double, float, float);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(1, float, float, double);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(2, float, float, double);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(3, float, float, double);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(4, float, float, double);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(5, float, float, double);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(6, float, float, double);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(1, double, double, double);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(2, double, double, double);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(3, double, double, double);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(4, double, double, double);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(5, double, double, double);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(6, double, double, double);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(1, double, float, double);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(2, double, float, double);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(3, double, float, double);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(4, double, float, double);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(5, double, float, double);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(6, double, float, double);
+#define INSTANTIATE_SOLVE_LEVEL_SET_SPLIT_ALL(blocksize) \
+ INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, float, float, float); \
+ INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, double, float); \
+ INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, float, float); \
+ INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, float, float, double); \
+ INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, double, double); \
+ INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, float, double);
+
+INSTANTIATE_SOLVE_LEVEL_SET_SPLIT_ALL(1);
+INSTANTIATE_SOLVE_LEVEL_SET_SPLIT_ALL(2);
+INSTANTIATE_SOLVE_LEVEL_SET_SPLIT_ALL(3);
+INSTANTIATE_SOLVE_LEVEL_SET_SPLIT_ALL(4);
+INSTANTIATE_SOLVE_LEVEL_SET_SPLIT_ALL(5);
+INSTANTIATE_SOLVE_LEVEL_SET_SPLIT_ALL(6);
 } // namespace Opm::gpuistl::detail::DILU
diff --git a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.cu b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.cu
index 5d304e3660f..9d028c92351 100644
--- a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.cu
+++ b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.cu
@@ -120,7 +120,7 @@
 }
 }
- template
+ template
 __global__ void cuLUFactorizationSplit(InputScalar* srcReorderedLowerMat,
 int* lowerRowIndices,
 int* lowerColIndices,
@@ -161,7 +161,7 @@
 mmOverlap(&srcReorderedLowerMat[ij * scalarsInBlock],
 &srcDiagonal[j * scalarsInBlock],
 &srcReorderedLowerMat[ij * scalarsInBlock]);
- if (copyResultToOtherMatrix) {
+ if constexpr (mixedPrecisionScheme != MixedPrecisionScheme::DEFAULT) {
 moveBlock(&srcReorderedLowerMat[ij * scalarsInBlock],
 &dstReorderedLowerMat[ij * scalarsInBlock]);
 }
@@ -184,17 +184,17 @@
 int ikColumn;
 if (ikState == POSITION_TYPE::UNDER_DIAG) {
 ikBlockPtr = &srcReorderedLowerMat[ik * scalarsInBlock];
- if (copyResultToOtherMatrix)
+ if (mixedPrecisionScheme != MixedPrecisionScheme::DEFAULT)
 dstIkBlockPtr = &dstReorderedLowerMat[ik * scalarsInBlock];
 ikColumn = lowerColIndices[ik];
 } else if (ikState == POSITION_TYPE::ON_DIAG) {
 ikBlockPtr = &srcDiagonal[reorderedIdx * scalarsInBlock];
- if (copyResultToOtherMatrix)
+ if (mixedPrecisionScheme != MixedPrecisionScheme::DEFAULT)
 dstIkBlockPtr = &dstDiagonal[reorderedIdx * scalarsInBlock];
 ikColumn = naturalIdx;
 } else { // ikState == POSITION_TYPE::ABOVE_DIAG
 ikBlockPtr = &srcReorderedUpperMat[ik * scalarsInBlock];
- if (copyResultToOtherMatrix)
+ if (mixedPrecisionScheme != MixedPrecisionScheme::DEFAULT)
 dstIkBlockPtr = &dstReorderedUpperMat[ik * scalarsInBlock];
 ikColumn = upperColIndices[ik];
 }
@@ -219,10 +219,13 @@
 }
 }
 invBlockInPlace(&srcDiagonal[reorderedIdx * scalarsInBlock]);
- if (copyResultToOtherMatrix) {
- moveBlock(&srcDiagonal[reorderedIdx * scalarsInBlock],
- &dstDiagonal[reorderedIdx * scalarsInBlock]);
-
+ if constexpr (mixedPrecisionScheme != MixedPrecisionScheme::DEFAULT) {
+ // If we want to store the entire factorization as float, we must also move the diagonal block from double to float;
+ // otherwise we keep using the double diagonal that is already stored in srcDiagonal.
+ if constexpr (mixedPrecisionScheme == MixedPrecisionScheme::STORE_ENTIRE_FACTORIZATION_AS_FLOAT){
+ moveBlock(&srcDiagonal[reorderedIdx * scalarsInBlock],
+ &dstDiagonal[reorderedIdx * scalarsInBlock]);
+ }
 // also move all values above the diagonal on this row
 for (int block = startOfRowIUpper; block < endOfRowIUpper; ++block) {
moveBlock(&srcReorderedUpperMat[block * scalarsInBlock],
@@ -321,14 +324,14 @@
 }
 }
-    template <int blocksize, class LinearSolverScalar, class MatrixScalar>
+    template <int blocksize, class LinearSolverScalar, class MatrixScalar, class DiagonalScalar>
 __global__ void cuSolveUpperLevelSetSplit(MatrixScalar* mat,
 int* rowIndices,
 int* colIndices,
 int* indexConversion,
 int startIdx,
 int rowsInLevelSet,
- const MatrixScalar* dInv,
+ const DiagonalScalar* dInv,
 LinearSolverScalar* v)
 {
 auto reorderedIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x);
@@ -348,7 +351,7 @@
 &mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
 }
- mvMixedGeneral(
+ mvMixedGeneral(
 &dInv[reorderedIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
 }
 }
@@ -410,7 +413,7 @@ solveLowerLevelSetSplit(MatrixScalar* reorderedMat,
 reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, d, v);
 }
 // perform the upper solve for all rows in the same level set
-template <int blocksize, class LinearSolverScalar, class MatrixScalar>
+template <int blocksize, class LinearSolverScalar, class MatrixScalar, class DiagonalScalar>
 void
 solveUpperLevelSetSplit(MatrixScalar* reorderedMat,
 int* rowIndices,
@@ -418,14 +421,14 @@ solveUpperLevelSetSplit(MatrixScalar* reorderedMat,
 int* indexConversion,
 int startIdx,
 int rowsInLevelSet,
- const MatrixScalar* dInv,
+ const DiagonalScalar* dInv,
 LinearSolverScalar* v,
 int thrBlockSize)
 {
 int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(
- cuSolveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar>, thrBlockSize);
+ cuSolveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar>, thrBlockSize);
 int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
- cuSolveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar><<<nThreadBlocks, threadBlockSize>>>(
+ cuSolveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar><<<nThreadBlocks, threadBlockSize>>>(
 reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
 }
@@ -447,7 +450,7 @@ LUFactorization(T* srcMatrix,
 srcMatrix, srcRowIndices, srcColumnIndices, naturalToReordered, reorderedToNatual, rowsInLevelSet, startIdx);
 }
-template
+template
 void
 LUFactorizationSplit(InputScalar* srcReorderedLowerMat,
 int* lowerRowIndices,
 int* lowerColIndices,
@@ -466,9 +469,9 @@ LUFactorizationSplit(InputScalar* srcReorderedLowerMat,
 int thrBlockSize)
 {
 int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(
- cuLUFactorizationSplit, thrBlockSize);
+ cuLUFactorizationSplit, thrBlockSize);
 int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
- cuLUFactorizationSplit
+ cuLUFactorizationSplit
 <<<nThreadBlocks, threadBlockSize>>>(srcReorderedLowerMat,
 lowerRowIndices,
 lowerColIndices,
@@ -489,14 +492,18 @@ LUFactorizationSplit(InputScalar* srcReorderedLowerMat,
 template void solveUpperLevelSet(T*, int*, int*, int*, int, int, T*, int); \
 template void solveLowerLevelSet(T*, int*, int*, int*, int, int, const T*, T*, int); \
 template void LUFactorization(T*, int*, int*, int*, int*, size_t, int, int); \
- template void LUFactorizationSplit( \
+ template void LUFactorizationSplit( \
 T*, int*, int*, T*, int*, int*, T*, float*, float*, float*, int*, int*, const int, int, int); \
- template void LUFactorizationSplit( \
+ template void LUFactorizationSplit( \
 T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int); \
- template void LUFactorizationSplit( \
+ template void LUFactorizationSplit( \
+ T*, int*, int*, T*, int*, int*, T*, float*, float*, float*, int*, int*, const int, int, int); \
+ template void LUFactorizationSplit( \
+ T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int); \
+ template void LUFactorizationSplit( \
 T*, int*, int*, T*, int*, int*, T*, float*, float*, float*, int*, int*, const int, int, int); \
- template void LUFactorizationSplit( \
- T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int);
+ template void LUFactorizationSplit( \
+ T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int);

 INSTANTIATE_KERNEL_WRAPPERS(float, 1);
 INSTANTIATE_KERNEL_WRAPPERS(float, 2);
 INSTANTIATE_KERNEL_WRAPPERS(float, 3);
 INSTANTIATE_KERNEL_WRAPPERS(float, 4);
 INSTANTIATE_KERNEL_WRAPPERS(float, 5);
 INSTANTIATE_KERNEL_WRAPPERS(float, 6);
 INSTANTIATE_KERNEL_WRAPPERS(double, 1);
 INSTANTIATE_KERNEL_WRAPPERS(double, 2);
 INSTANTIATE_KERNEL_WRAPPERS(double, 3);
 INSTANTIATE_KERNEL_WRAPPERS(double, 4);
 INSTANTIATE_KERNEL_WRAPPERS(double, 5);
 INSTANTIATE_KERNEL_WRAPPERS(double, 6);
@@ -523,15 +530,25 @@ INSTANTIATE_KERNEL_WRAPPERS(double, 6);
 float*, int*, int*, int*, int, int, const float*, float*, int); \
 \
 /* double preconditioner */ \
- template void solveUpperLevelSetSplit<blocksize, double, double>( \
+ template void solveUpperLevelSetSplit<blocksize, double, double, double>( \
 double*, int*, int*, int*, int, int, const double*, double*, int); \
 /* float matrix, double compute preconditioner */ \
- template void solveUpperLevelSetSplit<blocksize, double, float>( \
+ template void solveUpperLevelSetSplit<blocksize, double, float, double>( \
+ float*, int*, int*, int*, int, int, const double*, double*, int); \
+ /* float preconditioner */ \
+ template void solveUpperLevelSetSplit<blocksize, float, float, double>( \
+ float*, int*, int*, int*, int, int, const double*, float*, int); \
+ /* double preconditioner */ \
+ template void solveUpperLevelSetSplit<blocksize, double, double, float>( \
+ double*, int*, int*, int*, int, int, const float*, double*, int); \
+ /* float matrix, double compute preconditioner */ \
+ template void solveUpperLevelSetSplit<blocksize, double, float, float>( \
 float*, int*, int*, int*, int, int, const float*, double*, int); \
 /* float preconditioner */ \
- template void solveUpperLevelSetSplit<blocksize, float, float>( \
+ template void solveUpperLevelSetSplit<blocksize, float, float, float>( \
 float*, int*, int*, int*, int, int, const float*, float*, int);
+
 INSTANTIATE_MIXED_PRECISION_KERNEL_WRAPPERS(1);
 INSTANTIATE_MIXED_PRECISION_KERNEL_WRAPPERS(2);
 INSTANTIATE_MIXED_PRECISION_KERNEL_WRAPPERS(3);
diff --git a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.hpp b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.hpp
index 03dbbe0c9ec..f88a47fc495 100644
--- a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.hpp
+++ b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.hpp
@@ -20,6 +20,7 @@
 #define OPM_ILU0_KERNELS_HPP
 #include
 #include
+#include
 namespace Opm::gpuistl::detail::ILU0
 {
@@ -89,14 +90,14 @@
 * solve
 * @param threadBlockSize The number of threads per threadblock. Leave as -1 if no blocksize is already chosen
 */
-template <int blocksize, class LinearSolverScalar, class MatrixScalar>
+template <int blocksize, class LinearSolverScalar, class MatrixScalar, class DiagonalScalar>
 void solveUpperLevelSetSplit(MatrixScalar* reorderedMat,
 int* rowIndices,
 int* colIndices,
 int* indexConversion,
 int startIdx,
 int rowsInLevelSet,
- const MatrixScalar* dInv,
+ const DiagonalScalar* dInv,
 LinearSolverScalar* v,
 int threadBlockSize);
@@ -176,7 +177,7 @@ void LUFactorization(T* reorderedMat,
 * function
 * @param threadBlockSize The number of threads per threadblock.
Leave as -1 if no blocksize is already chosen */ -template +template void LUFactorizationSplit(InputScalar* srcReorderedLowerMat, int* lowerRowIndices, int* lowerColIndices, From 5f62f48ae3caac48d846968a4d2a356dd0f28049 Mon Sep 17 00:00:00 2001 From: Tobias Meyer Andersen Date: Tue, 22 Oct 2024 16:12:45 +0200 Subject: [PATCH 5/6] found one bug, store_factorization_as_float still not converging --- opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp | 2 +- .../preconditionerKernels/ILU0Kernels.cu | 22 +++++++++---------- 2 files changed, 11 insertions(+), 13 deletions(-) diff --git a/opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp b/opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp index 99022b2f6db..3958ac75fe4 100644 --- a/opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp +++ b/opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp @@ -190,7 +190,7 @@ OpmGpuILU0::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i v.data(), upperSolveThreadBlockSize); } - if (m_mixedPrecisionScheme == MixedPrecisionScheme::STORE_ENTIRE_FACTORIZATION_AS_FLOAT) { + if (m_mixedPrecisionScheme == MixedPrecisionScheme::STORE_ONLY_FACTORIZED_DIAGONAL_AS_DOUBLE) { detail::ILU0::solveUpperLevelSetSplit( m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(), m_gpuMatrixReorderedUpperFloat->getRowIndices().data(), diff --git a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.cu b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.cu index 9d028c92351..b3617a66b15 100644 --- a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.cu +++ b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.cu @@ -505,18 +505,16 @@ LUFactorizationSplit(InputScalar* srcReorderedLowerMat, template void LUFactorizationSplit( \ T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int); -INSTANTIATE_KERNEL_WRAPPERS(float, 1); -INSTANTIATE_KERNEL_WRAPPERS(float, 2); -INSTANTIATE_KERNEL_WRAPPERS(float, 3); -INSTANTIATE_KERNEL_WRAPPERS(float, 4); -INSTANTIATE_KERNEL_WRAPPERS(float, 5); -INSTANTIATE_KERNEL_WRAPPERS(float, 6); -INSTANTIATE_KERNEL_WRAPPERS(double, 1); -INSTANTIATE_KERNEL_WRAPPERS(double, 2); -INSTANTIATE_KERNEL_WRAPPERS(double, 3); -INSTANTIATE_KERNEL_WRAPPERS(double, 4); -INSTANTIATE_KERNEL_WRAPPERS(double, 5); -INSTANTIATE_KERNEL_WRAPPERS(double, 6); +#define INSTANTIATE_BLOCK_SIZED_KERNEL_WRAPPERS(T) \ + INSTANTIATE_KERNEL_WRAPPERS(T, 1); \ + INSTANTIATE_KERNEL_WRAPPERS(T, 2); \ + INSTANTIATE_KERNEL_WRAPPERS(T, 3); \ + INSTANTIATE_KERNEL_WRAPPERS(T, 4); \ + INSTANTIATE_KERNEL_WRAPPERS(T, 5); \ + INSTANTIATE_KERNEL_WRAPPERS(T, 6); + +INSTANTIATE_BLOCK_SIZED_KERNEL_WRAPPERS(float) +INSTANTIATE_BLOCK_SIZED_KERNEL_WRAPPERS(double) #define INSTANTIATE_MIXED_PRECISION_KERNEL_WRAPPERS(blocksize) \ /* double preconditioner */ \ From b88fcedca04ce7ddd1f758397eb9311aac13c51a Mon Sep 17 00:00:00 2001 From: Tobias Meyer Andersen Date: Wed, 23 Oct 2024 09:20:27 +0200 Subject: [PATCH 6/6] fix crucial if-else bug --- opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp b/opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp index 3958ac75fe4..b2dd32f12f5 100644 --- a/opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp +++ b/opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp @@ -89,7 +89,7 @@ OpmGpuILU0::OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernel m_cpuMatrix, m_reorderedToNatural); } - if (!m_mixedPrecisionScheme == MixedPrecisionScheme::DEFAULT){ + 
if (m_mixedPrecisionScheme != MixedPrecisionScheme::DEFAULT){ OPM_ERROR_IF(!m_splitMatrix, "Mixed precision GpuILU0 is currently only supported when using split_matrix=true"); // initialize mixed precision datastructures @@ -97,6 +97,7 @@ OpmGpuILU0::OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernel m_gpuMatrixReorderedUpperFloat = std::make_unique(m_gpuMatrixReorderedUpper->getRowIndices(), m_gpuMatrixReorderedUpper->getColumnIndices(), blocksize_); // The MixedPrecisionScheme::STORE_ONLY_FACTORIZED_DIAGONAL_AS_DOUBLE does not need to allocate this float vector if (m_mixedPrecisionScheme == MixedPrecisionScheme::STORE_ENTIRE_FACTORIZATION_AS_FLOAT) { + printf("now using m_gpuMatrixReorderedDiagFloat\n"); m_gpuMatrixReorderedDiagFloat.emplace(GpuVector(m_gpuMatrix.N() * m_gpuMatrix.blockSize() * m_gpuMatrix.blockSize())); } } @@ -190,7 +191,7 @@ OpmGpuILU0::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i v.data(), upperSolveThreadBlockSize); } - if (m_mixedPrecisionScheme == MixedPrecisionScheme::STORE_ONLY_FACTORIZED_DIAGONAL_AS_DOUBLE) { + else if (m_mixedPrecisionScheme == MixedPrecisionScheme::STORE_ONLY_FACTORIZED_DIAGONAL_AS_DOUBLE) { detail::ILU0::solveUpperLevelSetSplit( m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(), m_gpuMatrixReorderedUpperFloat->getRowIndices().data(),