From c57f5d83d35d3d01c26e20dff59e1c3849c77358 Mon Sep 17 00:00:00 2001 From: Tobias Meyer Andersen Date: Mon, 14 Oct 2024 16:17:56 +0200 Subject: [PATCH 1/6] add mixed precision option to gpudilu class --- opm/simulators/linalg/PreconditionerFactory_impl.hpp | 9 ++++++--- opm/simulators/linalg/gpuistl/GpuDILU.cpp | 11 ++++++++++- opm/simulators/linalg/gpuistl/GpuDILU.hpp | 10 +++++++++- tests/gpuistl/test_GpuDILU.cpp | 6 +++--- 4 files changed, 28 insertions(+), 8 deletions(-) diff --git a/opm/simulators/linalg/PreconditionerFactory_impl.hpp b/opm/simulators/linalg/PreconditionerFactory_impl.hpp index c53c069e7cd..0c905e9b2fb 100644 --- a/opm/simulators/linalg/PreconditionerFactory_impl.hpp +++ b/opm/simulators/linalg/PreconditionerFactory_impl.hpp @@ -349,9 +349,10 @@ struct StandardPreconditioners { F::addCreator("GPUDILU", [](const O& op, [[maybe_unused]] const P& prm, const std::function&, std::size_t, const C& comm) { const bool split_matrix = prm.get("split_matrix", true); const bool tune_gpu_kernels = prm.get("tune_gpu_kernels", true); + const bool store_factorization_as_float = prm.get("store_factorization_as_float", false); using field_type = typename V::field_type; using GpuDILU = typename gpuistl::GpuDILU, gpuistl::GpuVector>; - auto gpuDILU = std::make_shared(op.getmat(), split_matrix, tune_gpu_kernels); + auto gpuDILU = std::make_shared(op.getmat(), split_matrix, tune_gpu_kernels, store_factorization_as_float); auto adapted = std::make_shared>(gpuDILU); auto wrapped = std::make_shared>(adapted, comm); @@ -631,14 +632,16 @@ struct StandardPreconditioners { F::addCreator("GPUDILU", [](const O& op, [[maybe_unused]] const P& prm, const std::function&, std::size_t) { const bool split_matrix = prm.get("split_matrix", true); const bool tune_gpu_kernels = prm.get("tune_gpu_kernels", true); + const bool store_factorization_as_float = prm.get("store_factorization_as_float", false); using field_type = typename V::field_type; using GPUDILU = typename gpuistl::GpuDILU, gpuistl::GpuVector>; - return std::make_shared>(std::make_shared(op.getmat(), split_matrix, tune_gpu_kernels)); + return std::make_shared>(std::make_shared(op.getmat(), split_matrix, tune_gpu_kernels, store_factorization_as_float)); }); F::addCreator("GPUDILUFloat", [](const O& op, [[maybe_unused]] const P& prm, const std::function&, std::size_t) { const bool split_matrix = prm.get("split_matrix", true); const bool tune_gpu_kernels = prm.get("tune_gpu_kernels", true); + const bool store_factorization_as_float = prm.get("store_factorization_as_float", false); using block_type = typename V::block_type; using VTo = Dune::BlockVector>; @@ -647,7 +650,7 @@ struct StandardPreconditioners { using Adapter = typename gpuistl::PreconditionerAdapter; using Converter = typename gpuistl::PreconditionerConvertFieldTypeAdapter; auto converted = std::make_shared(op.getmat()); - auto adapted = std::make_shared(std::make_shared(converted->getConvertedMatrix(), split_matrix, tune_gpu_kernels)); + auto adapted = std::make_shared(std::make_shared(converted->getConvertedMatrix(), split_matrix, tune_gpu_kernels, store_factorization_as_float)); converted->setUnderlyingPreconditioner(adapted); return converted; }); diff --git a/opm/simulators/linalg/gpuistl/GpuDILU.cpp b/opm/simulators/linalg/gpuistl/GpuDILU.cpp index cba3be2c993..53e13dfba5d 100644 --- a/opm/simulators/linalg/gpuistl/GpuDILU.cpp +++ b/opm/simulators/linalg/gpuistl/GpuDILU.cpp @@ -41,7 +41,7 @@ namespace Opm::gpuistl { template -GpuDILU::GpuDILU(const M& A, bool splitMatrix, bool 
tuneKernels) +GpuDILU::GpuDILU(const M& A, bool splitMatrix, bool tuneKernels, bool storeFactorizationAsFloat) : m_cpuMatrix(A) , m_levelSets(Opm::getMatrixRowColoring(m_cpuMatrix, Opm::ColoringType::LOWER)) , m_reorderedToNatural(detail::createReorderedToNatural(m_levelSets)) @@ -52,6 +52,7 @@ GpuDILU::GpuDILU(const M& A, bool splitMatrix, bool tuneKernels) , m_gpuDInv(m_gpuMatrix.N() * m_gpuMatrix.blockSize() * m_gpuMatrix.blockSize()) , m_splitMatrix(splitMatrix) , m_tuneThreadBlockSizes(tuneKernels) + , m_storeFactorizationAsFloat(storeFactorizationAsFloat) { // TODO: Should in some way verify that this matrix is symmetric, only do it debug mode? @@ -80,6 +81,14 @@ GpuDILU::GpuDILU(const M& A, bool splitMatrix, bool tuneKernels) m_gpuMatrixReordered = detail::createReorderedMatrix>( m_cpuMatrix, m_reorderedToNatural); } + + if (m_storeFactorizationAsFloat) { + OPM_THROW(std::runtime_error, "Matrix must be split when storing as float."); + m_gpuMatrixReorderedLowerFloat = std::make_unique(m_gpuMatrixReorderedLower->getRowIndices(), m_gpuMatrixReorderedLower->getColumnIndices(), blocksize_); + m_gpuMatrixReorderedUpperFloat = std::make_unique(m_gpuMatrixReorderedUpper->getRowIndices(), m_gpuMatrixReorderedUpper->getColumnIndices(), blocksize_); + m_gpuMatrixReorderedDiagFloat = std::make_unique(m_gpuMatrix.N() * m_gpuMatrix.blockSize() * m_gpuMatrix.blockSize()); + } + computeDiagAndMoveReorderedData(m_moveThreadBlockSize, m_DILUFactorizationThreadBlockSize); if (m_tuneThreadBlockSizes) { diff --git a/opm/simulators/linalg/gpuistl/GpuDILU.hpp b/opm/simulators/linalg/gpuistl/GpuDILU.hpp index 665bd667b2f..edea8c70792 100644 --- a/opm/simulators/linalg/gpuistl/GpuDILU.hpp +++ b/opm/simulators/linalg/gpuistl/GpuDILU.hpp @@ -53,6 +53,8 @@ class GpuDILU : public Dune::PreconditionerWithUpdate using field_type = typename X::field_type; //! \brief The GPU matrix type using CuMat = GpuSparseMatrix; + using FloatMat = GpuSparseMatrix; + using FloatVec = GpuVector; //! \brief Constructor. //! @@ -60,7 +62,7 @@ class GpuDILU : public Dune::PreconditionerWithUpdate //! \param A The matrix to operate on. //! \param w The relaxation factor. //! - explicit GpuDILU(const M& A, bool splitMatrix, bool tuneKernels); + explicit GpuDILU(const M& A, bool splitMatrix, bool tuneKernels, bool storeFactorizationAsFloat); //! \brief Prepare the preconditioner. //! \note Does nothing at the time being. @@ -127,6 +129,10 @@ class GpuDILU : public Dune::PreconditionerWithUpdate std::unique_ptr m_gpuMatrixReorderedUpper; //! \brief If matrix splitting is enabled, we also store the diagonal separately std::unique_ptr> m_gpuMatrixReorderedDiag; + //! \brief If mixed precision is enabled, store a float matrix + std::unique_ptr m_gpuMatrixReorderedLowerFloat; + std::unique_ptr m_gpuMatrixReorderedUpperFloat; + std::unique_ptr m_gpuMatrixReorderedDiagFloat; //! row conversion from natural to reordered matrix indices stored on the GPU GpuVector m_gpuNaturalToReorder; //! row conversion from reordered to natural matrix indices stored on the GPU @@ -137,6 +143,8 @@ class GpuDILU : public Dune::PreconditionerWithUpdate bool m_splitMatrix; //! \brief Bool storing whether or not we will tune the threadblock sizes. Only used for AMD cards bool m_tuneThreadBlockSizes; + //! \brief Bool storing whether or not we will store the factorization as float. Only used for mixed precision + bool m_storeFactorizationAsFloat; //! \brief variables storing the threadblocksizes to use if using the tuned sizes and AMD cards //! 
The default value of -1 indicates that we have not calibrated and selected a value yet int m_upperSolveThreadBlockSize = -1; diff --git a/tests/gpuistl/test_GpuDILU.cpp b/tests/gpuistl/test_GpuDILU.cpp index 5c796651fa8..ac9044f429e 100644 --- a/tests/gpuistl/test_GpuDILU.cpp +++ b/tests/gpuistl/test_GpuDILU.cpp @@ -211,7 +211,7 @@ BOOST_AUTO_TEST_CASE(TestDiluApply) // Initialize preconditioner objects Dune::MultithreadDILU cpudilu(matA); - auto gpudilu = GpuDilu1x1(matA, true, true); + auto gpudilu = GpuDilu1x1(matA, true, true, false); // Use the apply gpudilu.apply(d_output, d_input); @@ -235,7 +235,7 @@ BOOST_AUTO_TEST_CASE(TestDiluApplyBlocked) // init matrix with 2x2 blocks Sp2x2BlockMatrix matA = get2x2BlockTestMatrix(); - auto gpudilu = GpuDilu2x2(matA, true, true); + auto gpudilu = GpuDilu2x2(matA, true, true, false); Dune::MultithreadDILU cpudilu(matA); // create input/output buffers for the apply @@ -275,7 +275,7 @@ BOOST_AUTO_TEST_CASE(TestDiluInitAndUpdateLarge) { // create gpu dilu preconditioner Sp1x1BlockMatrix matA = get1x1BlockTestMatrix(); - auto gpudilu = GpuDilu1x1(matA, true, true); + auto gpudilu = GpuDilu1x1(matA, true, true, false); matA[0][0][0][0] = 11.0; matA[0][1][0][0] = 12.0; From fee3f4f09229480bc08712fb6b8d4b8a79fc8311 Mon Sep 17 00:00:00 2001 From: Tobias Meyer Andersen Date: Tue, 15 Oct 2024 13:22:55 +0200 Subject: [PATCH 2/6] working MP for gpu DILU --- opm/simulators/linalg/gpuistl/GpuDILU.cpp | 126 +++++++++---- opm/simulators/linalg/gpuistl/GpuDILU.hpp | 1 + .../gpuistl/detail/deviceBlockOperations.hpp | 15 +- .../preconditionerKernels/DILUKernels.cu | 166 ++++++++++++------ .../preconditionerKernels/DILUKernels.hpp | 31 ++-- 5 files changed, 231 insertions(+), 108 deletions(-) diff --git a/opm/simulators/linalg/gpuistl/GpuDILU.cpp b/opm/simulators/linalg/gpuistl/GpuDILU.cpp index 53e13dfba5d..d1bcdf1bf77 100644 --- a/opm/simulators/linalg/gpuistl/GpuDILU.cpp +++ b/opm/simulators/linalg/gpuistl/GpuDILU.cpp @@ -83,10 +83,13 @@ GpuDILU::GpuDILU(const M& A, bool splitMatrix, bool tuneKernels, boo } if (m_storeFactorizationAsFloat) { - OPM_THROW(std::runtime_error, "Matrix must be split when storing as float."); + if (!m_splitMatrix){ + OPM_THROW(std::runtime_error, "Matrix must be split when storing as float."); + } m_gpuMatrixReorderedLowerFloat = std::make_unique(m_gpuMatrixReorderedLower->getRowIndices(), m_gpuMatrixReorderedLower->getColumnIndices(), blocksize_); m_gpuMatrixReorderedUpperFloat = std::make_unique(m_gpuMatrixReorderedUpper->getRowIndices(), m_gpuMatrixReorderedUpper->getColumnIndices(), blocksize_); m_gpuMatrixReorderedDiagFloat = std::make_unique(m_gpuMatrix.N() * m_gpuMatrix.blockSize() * m_gpuMatrix.blockSize()); + m_gpuDInvFloat = std::make_unique(m_gpuMatrix.N() * m_gpuMatrix.blockSize() * m_gpuMatrix.blockSize()); } computeDiagAndMoveReorderedData(m_moveThreadBlockSize, m_DILUFactorizationThreadBlockSize); @@ -120,17 +123,31 @@ GpuDILU::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int for (int level = 0; level < m_levelSets.size(); ++level) { const int numOfRowsInLevel = m_levelSets[level].size(); if (m_splitMatrix) { - detail::DILU::solveLowerLevelSetSplit( - m_gpuMatrixReorderedLower->getNonZeroValues().data(), - m_gpuMatrixReorderedLower->getRowIndices().data(), - m_gpuMatrixReorderedLower->getColumnIndices().data(), - m_gpuReorderToNatural.data(), - levelStartIdx, - numOfRowsInLevel, - m_gpuDInv.data(), - d.data(), - v.data(), - lowerSolveThreadBlockSize); + if (m_storeFactorizationAsFloat) { + 
detail::DILU::solveLowerLevelSetSplit( + m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(), + m_gpuMatrixReorderedLowerFloat->getRowIndices().data(), + m_gpuMatrixReorderedLowerFloat->getColumnIndices().data(), + m_gpuReorderToNatural.data(), + levelStartIdx, + numOfRowsInLevel, + m_gpuDInvFloat->data(), + d.data(), + v.data(), + lowerSolveThreadBlockSize); + } else { + detail::DILU::solveLowerLevelSetSplit( + m_gpuMatrixReorderedLower->getNonZeroValues().data(), + m_gpuMatrixReorderedLower->getRowIndices().data(), + m_gpuMatrixReorderedLower->getColumnIndices().data(), + m_gpuReorderToNatural.data(), + levelStartIdx, + numOfRowsInLevel, + m_gpuDInv.data(), + d.data(), + v.data(), + lowerSolveThreadBlockSize); + } } else { detail::DILU::solveLowerLevelSet( m_gpuMatrixReordered->getNonZeroValues().data(), @@ -153,16 +170,29 @@ GpuDILU::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int const int numOfRowsInLevel = m_levelSets[level].size(); levelStartIdx -= numOfRowsInLevel; if (m_splitMatrix) { - detail::DILU::solveUpperLevelSetSplit( - m_gpuMatrixReorderedUpper->getNonZeroValues().data(), - m_gpuMatrixReorderedUpper->getRowIndices().data(), - m_gpuMatrixReorderedUpper->getColumnIndices().data(), - m_gpuReorderToNatural.data(), - levelStartIdx, - numOfRowsInLevel, - m_gpuDInv.data(), - v.data(), - upperSolveThreadBlockSize); + if (m_storeFactorizationAsFloat){ + detail::DILU::solveUpperLevelSetSplit( + m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(), + m_gpuMatrixReorderedUpperFloat->getRowIndices().data(), + m_gpuMatrixReorderedUpperFloat->getColumnIndices().data(), + m_gpuReorderToNatural.data(), + levelStartIdx, + numOfRowsInLevel, + m_gpuDInvFloat->data(), + v.data(), + upperSolveThreadBlockSize); + } else { + detail::DILU::solveUpperLevelSetSplit( + m_gpuMatrixReorderedUpper->getNonZeroValues().data(), + m_gpuMatrixReorderedUpper->getRowIndices().data(), + m_gpuMatrixReorderedUpper->getColumnIndices().data(), + m_gpuReorderToNatural.data(), + levelStartIdx, + numOfRowsInLevel, + m_gpuDInv.data(), + v.data(), + upperSolveThreadBlockSize); + } } else { detail::DILU::solveUpperLevelSet( m_gpuMatrixReordered->getNonZeroValues().data(), @@ -241,20 +271,44 @@ GpuDILU::computeDiagAndMoveReorderedData(int moveThreadBlockSize, in for (int level = 0; level < m_levelSets.size(); ++level) { const int numOfRowsInLevel = m_levelSets[level].size(); if (m_splitMatrix) { - detail::DILU::computeDiluDiagonalSplit( - m_gpuMatrixReorderedLower->getNonZeroValues().data(), - m_gpuMatrixReorderedLower->getRowIndices().data(), - m_gpuMatrixReorderedLower->getColumnIndices().data(), - m_gpuMatrixReorderedUpper->getNonZeroValues().data(), - m_gpuMatrixReorderedUpper->getRowIndices().data(), - m_gpuMatrixReorderedUpper->getColumnIndices().data(), - m_gpuMatrixReorderedDiag->data(), - m_gpuReorderToNatural.data(), - m_gpuNaturalToReorder.data(), - levelStartIdx, - numOfRowsInLevel, - m_gpuDInv.data(), - factorizationBlockSize); + if (m_storeFactorizationAsFloat) { + detail::DILU::computeDiluDiagonalSplit( + m_gpuMatrixReorderedLower->getNonZeroValues().data(), + m_gpuMatrixReorderedLower->getRowIndices().data(), + m_gpuMatrixReorderedLower->getColumnIndices().data(), + m_gpuMatrixReorderedUpper->getNonZeroValues().data(), + m_gpuMatrixReorderedUpper->getRowIndices().data(), + m_gpuMatrixReorderedUpper->getColumnIndices().data(), + m_gpuMatrixReorderedDiag->data(), + m_gpuReorderToNatural.data(), + m_gpuNaturalToReorder.data(), + levelStartIdx, + numOfRowsInLevel, + m_gpuDInv.data(), + 
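+                    // (clarifying comment) The pointers that follow are the float
+                    // destinations; the factorization itself runs in field_type and
+                    // each block is cast to float only once it is fully computed.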
m_gpuDInvFloat->data(), + m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(), + m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(), + factorizationBlockSize); + } else { + // TODO: should this be field type twice or field type then float in the template? + detail::DILU::computeDiluDiagonalSplit( + m_gpuMatrixReorderedLower->getNonZeroValues().data(), + m_gpuMatrixReorderedLower->getRowIndices().data(), + m_gpuMatrixReorderedLower->getColumnIndices().data(), + m_gpuMatrixReorderedUpper->getNonZeroValues().data(), + m_gpuMatrixReorderedUpper->getRowIndices().data(), + m_gpuMatrixReorderedUpper->getColumnIndices().data(), + m_gpuMatrixReorderedDiag->data(), + m_gpuReorderToNatural.data(), + m_gpuNaturalToReorder.data(), + levelStartIdx, + numOfRowsInLevel, + m_gpuDInv.data(), + nullptr, + nullptr, + nullptr, + factorizationBlockSize); + } } else { detail::DILU::computeDiluDiagonal( m_gpuMatrixReordered->getNonZeroValues().data(), diff --git a/opm/simulators/linalg/gpuistl/GpuDILU.hpp b/opm/simulators/linalg/gpuistl/GpuDILU.hpp index edea8c70792..623e151322e 100644 --- a/opm/simulators/linalg/gpuistl/GpuDILU.hpp +++ b/opm/simulators/linalg/gpuistl/GpuDILU.hpp @@ -133,6 +133,7 @@ class GpuDILU : public Dune::PreconditionerWithUpdate std::unique_ptr m_gpuMatrixReorderedLowerFloat; std::unique_ptr m_gpuMatrixReorderedUpperFloat; std::unique_ptr m_gpuMatrixReorderedDiagFloat; + std::unique_ptr m_gpuDInvFloat; //! row conversion from natural to reordered matrix indices stored on the GPU GpuVector m_gpuNaturalToReorder; //! row conversion from reordered to natural matrix indices stored on the GPU diff --git a/opm/simulators/linalg/gpuistl/detail/deviceBlockOperations.hpp b/opm/simulators/linalg/gpuistl/detail/deviceBlockOperations.hpp index 5ad30abb27e..ee22db3ae72 100644 --- a/opm/simulators/linalg/gpuistl/detail/deviceBlockOperations.hpp +++ b/opm/simulators/linalg/gpuistl/detail/deviceBlockOperations.hpp @@ -159,7 +159,7 @@ mmv(const T* a, const T* b, T* c) // dst -= A*B*C template __device__ __forceinline__ void -mmx2Subtraction(T* A, T* B, T* C, T* dst) +mmx2Subtraction(const T* A, const T* B, const T* C, T* dst) { T tmp[blocksize * blocksize] = {0}; @@ -256,6 +256,19 @@ mvMixedGeneral(const MatrixScalar* A, const VectorScalar* b, ResultScalar* c) } } +// TODO: consider merging with existing block operations +// mixed precision general version of c += Ab +template +__device__ __forceinline__ void +umvMixedGeneral(const MatrixScalar* A, const VectorScalar* b, ResultScalar* c) +{ + for (int i = 0; i < blocksize; ++i) { + for (int j = 0; j < blocksize; ++j) { + c[i] += ResultScalar(ComputeScalar(A[i * blocksize + j]) * ComputeScalar(b[j])); + } + } +} + // TODO: consider merging with existing block operations // Mixed precision general version of c -= Ab template diff --git a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.cu b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.cu index f0379d2dccc..050de298938 100644 --- a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.cu +++ b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.cu @@ -59,16 +59,16 @@ namespace } } - template - __global__ void cuSolveLowerLevelSetSplit(T* mat, + template + __global__ void cuSolveLowerLevelSetSplit(MatrixScalar* mat, int* rowIndices, int* colIndices, int* indexConversion, int startIdx, int rowsInLevelSet, - const T* dInv, - const T* d, - T* v) + const MatrixScalar* dInv, + const LinearSolverScalar* d, + 
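+                                              // (clarifying comment) d and v keep the
+                                              // linear solver's scalar type; only mat
+                                              // and dInv use MatrixScalar.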
LinearSolverScalar* v) { const auto reorderedRowIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x); if (reorderedRowIdx < rowsInLevelSet + startIdx) { @@ -77,7 +77,7 @@ namespace const size_t nnzIdxLim = rowIndices[reorderedRowIdx + 1]; const int naturalRowIdx = indexConversion[reorderedRowIdx]; - T rhs[blocksize]; + LinearSolverScalar rhs[blocksize]; for (int i = 0; i < blocksize; i++) { rhs[i] = d[naturalRowIdx * blocksize + i]; } @@ -85,10 +85,10 @@ namespace // TODO: removce the first condition in the for loop for (int block = nnzIdx; block < nnzIdxLim; ++block) { const int col = colIndices[block]; - mmv(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs); + mmvMixedGeneral(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs); } - mv(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]); + mvMixedGeneral(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]); } } @@ -118,15 +118,15 @@ namespace } } - template - __global__ void cuSolveUpperLevelSetSplit(T* mat, + template + __global__ void cuSolveUpperLevelSetSplit(MatrixScalar* mat, int* rowIndices, int* colIndices, int* indexConversion, int startIdx, int rowsInLevelSet, - const T* dInv, - T* v) + const MatrixScalar* dInv, + LinearSolverScalar* v) { const auto reorderedRowIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x); if (reorderedRowIdx < rowsInLevelSet + startIdx) { @@ -134,13 +134,13 @@ namespace const size_t nnzIdxLim = rowIndices[reorderedRowIdx + 1]; const int naturalRowIdx = indexConversion[reorderedRowIdx]; - T rhs[blocksize] = {0}; + LinearSolverScalar rhs[blocksize] = {0}; for (int block = nnzIdx; block < nnzIdxLim; ++block) { const int col = colIndices[block]; - umv(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs); + umvMixedGeneral(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs); } - mmv(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]); + mmvMixedGeneral(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]); } } @@ -211,19 +211,24 @@ namespace } } - template - __global__ void cuComputeDiluDiagonalSplit(T* reorderedLowerMat, + // TODO: rewrite such that during the factorization there is a dInv of InputScalar type that stores intermediate results + // TOOD: The important part is to only cast after that is fully computed + template + __global__ void cuComputeDiluDiagonalSplit(const InputScalar* srcReorderedLowerMat, int* lowerRowIndices, int* lowerColIndices, - T* reorderedUpperMat, + const InputScalar* srcReorderedUpperMat, int* upperRowIndices, int* upperColIndices, - T* diagonal, + const InputScalar* srcDiagonal, int* reorderedToNatural, int* naturalToReordered, const int startIdx, int rowsInLevelSet, - T* dInv) + InputScalar* dInv, + OutputScalar* dstDiag, // TODO: should this be diag or dInv? 
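+                                               // (clarifying comment, cf. the kernel
+                                               // body: dstDiag receives the factorized
+                                               // inverse diagonal, i.e. the same blocks
+                                               // written to dInv, cast to OutputScalar)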
+ OutputScalar* dstLowerMat, + OutputScalar* dstUpperMat) { const auto reorderedRowIdx = startIdx + blockDim.x * blockIdx.x + threadIdx.x; if (reorderedRowIdx < rowsInLevelSet + startIdx) { @@ -231,10 +236,10 @@ namespace const size_t lowerRowStart = lowerRowIndices[reorderedRowIdx]; const size_t lowerRowEnd = lowerRowIndices[reorderedRowIdx + 1]; - T dInvTmp[blocksize * blocksize]; + InputScalar dInvTmp[blocksize * blocksize]; for (int i = 0; i < blocksize; ++i) { for (int j = 0; j < blocksize; ++j) { - dInvTmp[i * blocksize + j] = diagonal[reorderedRowIdx * blocksize * blocksize + i * blocksize + j]; + dInvTmp[i * blocksize + j] = srcDiagonal[reorderedRowIdx * blocksize * blocksize + i * blocksize + j]; } } @@ -250,18 +255,28 @@ namespace const int symOppositeBlock = symOppositeIdx; - mmx2Subtraction(&reorderedLowerMat[block * blocksize * blocksize], + if constexpr (copyResultToOtherMatrix) { + // TODO: think long and hard about whether this performs only the wanted memory transfers + moveBlock(&srcReorderedLowerMat[block * blocksize * blocksize], &dstLowerMat[block * blocksize * blocksize]); + moveBlock(&srcReorderedUpperMat[symOppositeBlock * blocksize * blocksize], &dstUpperMat[symOppositeBlock * blocksize * blocksize]); + } + + mmx2Subtraction(&srcReorderedLowerMat[block * blocksize * blocksize], &dInv[col * blocksize * blocksize], - &reorderedUpperMat[symOppositeBlock * blocksize * blocksize], + &srcReorderedUpperMat[symOppositeBlock * blocksize * blocksize], dInvTmp); } - invBlockInPlace(dInvTmp); + invBlockInPlace(dInvTmp); - for (int i = 0; i < blocksize; ++i) { - for (int j = 0; j < blocksize; ++j) { - dInv[reorderedRowIdx * blocksize * blocksize + i * blocksize + j] = dInvTmp[i * blocksize + j]; - } + // for (int i = 0; i < blocksize; ++i) { + // for (int j = 0; j < blocksize; ++j) { + // dInv[reorderedRowIdx * blocksize * blocksize + i * blocksize + j] = dInvTmp[i * blocksize + j]; + // } + // } + moveBlock(dInvTmp, &dInv[reorderedRowIdx * blocksize * blocksize]); + if constexpr (copyResultToOtherMatrix) { + moveBlock(dInvTmp, &dstDiag[reorderedRowIdx * blocksize * blocksize]); // important! 
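+                // (clarifying comment) dInvTmp is cast only here, after the full
+                // elimination, so every intermediate update happens in InputScalar
+                // precision; the OutputScalar copies exist purely to make the solve
+                // phase cheaper.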
} } } @@ -289,23 +304,23 @@ solveLowerLevelSet(T* reorderedMat, } -template +template void -solveLowerLevelSetSplit(T* reorderedMat, +solveLowerLevelSetSplit(MatrixScalar* reorderedMat, int* rowIndices, int* colIndices, int* indexConversion, int startIdx, int rowsInLevelSet, - const T* dInv, - const T* d, - T* v, + const MatrixScalar* dInv, + const LinearSolverScalar* d, + LinearSolverScalar* v, int thrBlockSize) { int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize( - cuSolveLowerLevelSetSplit, thrBlockSize); + cuSolveLowerLevelSetSplit, thrBlockSize); int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize); - cuSolveLowerLevelSetSplit<<>>( + cuSolveLowerLevelSetSplit<<>>( reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, d, v); } // perform the upper solve for all rows in the same level set @@ -328,22 +343,22 @@ solveUpperLevelSet(T* reorderedMat, reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v); } -template +template void -solveUpperLevelSetSplit(T* reorderedMat, +solveUpperLevelSetSplit(MatrixScalar* reorderedMat, int* rowIndices, int* colIndices, int* indexConversion, int startIdx, int rowsInLevelSet, - const T* dInv, - T* v, + const MatrixScalar* dInv, + LinearSolverScalar* v, int thrBlockSize) { int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize( - cuSolveUpperLevelSetSplit, thrBlockSize); + cuSolveUpperLevelSetSplit, thrBlockSize); int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize); - cuSolveUpperLevelSetSplit<<>>( + cuSolveUpperLevelSetSplit<<>>( reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v); } @@ -376,51 +391,62 @@ computeDiluDiagonal(T* reorderedMat, } } -template +template void -computeDiluDiagonalSplit(T* reorderedLowerMat, +computeDiluDiagonalSplit(const InputScalar* srcReorderedLowerMat, int* lowerRowIndices, int* lowerColIndices, - T* reorderedUpperMat, + const InputScalar* srcReorderedUpperMat, int* upperRowIndices, int* upperColIndices, - T* diagonal, + const InputScalar* srcDiagonal, int* reorderedToNatural, int* naturalToReordered, const int startIdx, int rowsInLevelSet, - T* dInv, + InputScalar* dInv, + OutputScalar* dstDiag, + OutputScalar* dstLowerMat, + OutputScalar* dstUpperMat, int thrBlockSize) { if (blocksize <= 3) { int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize( - cuComputeDiluDiagonalSplit, thrBlockSize); + cuComputeDiluDiagonalSplit, thrBlockSize); int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize); - cuComputeDiluDiagonalSplit<<>>(reorderedLowerMat, + cuComputeDiluDiagonalSplit<<>>(srcReorderedLowerMat, lowerRowIndices, lowerColIndices, - reorderedUpperMat, + srcReorderedUpperMat, upperRowIndices, upperColIndices, - diagonal, + srcDiagonal, reorderedToNatural, naturalToReordered, startIdx, rowsInLevelSet, - dInv); + dInv, + dstDiag, + dstLowerMat, + dstUpperMat); } else { OPM_THROW(std::invalid_argument, "Inverting diagonal is not implemented for blocksizes > 3"); } } +// TODO: format #define INSTANTIATE_KERNEL_WRAPPERS(T, blocksize) \ template void computeDiluDiagonal(T*, int*, int*, int*, int*, const int, int, T*, int); \ - template void computeDiluDiagonalSplit( \ - T*, int*, int*, T*, int*, int*, T*, int*, int*, const int, int, T*, int); \ + template void computeDiluDiagonalSplit( \ + const T*, int*, int*, const T*, int*, 
int*, const T*, int*, int*, const int, int, T*, double*, double*, double*, int); \ + template void computeDiluDiagonalSplit( \ + const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, float*, float*, float*, int); \ + template void computeDiluDiagonalSplit( \ + const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, float*, float*, float*, int); \ + template void computeDiluDiagonalSplit( \ + const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, double*, double*, double*, int); \ template void solveUpperLevelSet(T*, int*, int*, int*, int, int, const T*, T*, int); \ - template void solveLowerLevelSet(T*, int*, int*, int*, int, int, const T*, const T*, T*, int); \ - template void solveUpperLevelSetSplit(T*, int*, int*, int*, int, int, const T*, T*, int); \ - template void solveLowerLevelSetSplit(T*, int*, int*, int*, int, int, const T*, const T*, T*, int); + template void solveLowerLevelSet(T*, int*, int*, int*, int, int, const T*, const T*, T*, int); INSTANTIATE_KERNEL_WRAPPERS(float, 1); INSTANTIATE_KERNEL_WRAPPERS(float, 2); @@ -434,4 +460,30 @@ INSTANTIATE_KERNEL_WRAPPERS(double, 3); INSTANTIATE_KERNEL_WRAPPERS(double, 4); INSTANTIATE_KERNEL_WRAPPERS(double, 5); INSTANTIATE_KERNEL_WRAPPERS(double, 6); + +#define INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, LinearSolverScalar, MatrixScalar) \ + template void solveUpperLevelSetSplit( \ + MatrixScalar*, int*, int*, int*, int, int, const MatrixScalar*, LinearSolverScalar*, int); \ + template void solveLowerLevelSetSplit( \ + MatrixScalar*, int*, int*, int*, int, int, const MatrixScalar*, const LinearSolverScalar*, LinearSolverScalar*, int); + +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(1, float, float); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(2, float, float); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(3, float, float); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(4, float, float); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(5, float, float); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(6, float, float); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(1, double, double); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(2, double, double); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(3, double, double); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(4, double, double); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(5, double, double); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(6, double, double); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(1, double, float); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(2, double, float); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(3, double, float); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(4, double, float); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(5, double, float); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(6, double, float); + } // namespace Opm::gpuistl::detail::DILU diff --git a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.hpp b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.hpp index 74a7e78fe95..44b94c2f178 100644 --- a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.hpp +++ b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.hpp @@ -71,16 +71,16 @@ void solveLowerLevelSet(T* reorderedMat, * @param d Stores the defect * @param [out] v Will store the results of the lower solve */ -template -void solveLowerLevelSetSplit(T* reorderedUpperMat, +template +void solveLowerLevelSetSplit(MatrixScalar* reorderedUpperMat, int* rowIndices, int* colIndices, int* indexConversion, int startIdx, int rowsInLevelSet, - const T* dInv, - const T* d, - T* v, + const MatrixScalar* dInv, + const 
LinearSolverScalar* d, + LinearSolverScalar* v, int threadBlockSize); /** @@ -124,15 +124,15 @@ void solveUpperLevelSet(T* reorderedMat, * @param [out] v Will store the results of the lower solve. To begin with it should store the output from the lower * solve */ -template -void solveUpperLevelSetSplit(T* reorderedUpperMat, +template +void solveUpperLevelSetSplit(MatrixScalar* reorderedUpperMat, int* rowIndices, int* colIndices, int* indexConversion, int startIdx, int rowsInLevelSet, - const T* dInv, - T* v, + const MatrixScalar* dInv, + LinearSolverScalar* v, int threadBlockSize); /** @@ -161,7 +161,6 @@ void computeDiluDiagonal(T* reorderedMat, int rowsInLevelSet, T* dInv, int threadBlockSize); -template /** * @brief Computes the ILU0 of the diagonal elements of the split reordered matrix and stores it in a reordered vector @@ -184,18 +183,22 @@ template * function * @param [out] dInv The diagonal matrix used by the Diagonal ILU preconditioner */ -void computeDiluDiagonalSplit(T* reorderedLowerMat, +template +void computeDiluDiagonalSplit(const InputScalar* srcReorderedLowerMat, int* lowerRowIndices, int* lowerColIndices, - T* reorderedUpperMat, + const InputScalar* srcReorderedUpperMat, int* upperRowIndices, int* upperColIndices, - T* diagonal, + const InputScalar* srcDiagonal, int* reorderedToNatural, int* naturalToReordered, int startIdx, int rowsInLevelSet, - T* dInv, + InputScalar* dInv, + OutputScalar* dstDiagonal, + OutputScalar* dstLowerMat, + OutputScalar* dstUpperMat, int threadBlockSize); } // namespace Opm::gpuistl::detail::DILU From 76d68e7f471ea666a25e995f6a5fabaca145600d Mon Sep 17 00:00:00 2001 From: Tobias Meyer Andersen Date: Wed, 16 Oct 2024 15:37:16 +0200 Subject: [PATCH 3/6] Add MP version of DILU with double on diagonal --- CMakeLists_files.cmake | 1 + .../linalg/PreconditionerFactory_impl.hpp | 12 +- opm/simulators/linalg/gpuistl/GpuDILU.cpp | 71 +++++++++-- opm/simulators/linalg/gpuistl/GpuDILU.hpp | 5 +- .../linalg/gpuistl/detail/kernelEnums.hpp | 46 +++++++ .../preconditionerKernels/DILUKernels.cu | 120 +++++++++++------- .../preconditionerKernels/DILUKernels.hpp | 11 +- 7 files changed, 194 insertions(+), 72 deletions(-) create mode 100644 opm/simulators/linalg/gpuistl/detail/kernelEnums.hpp diff --git a/CMakeLists_files.cmake b/CMakeLists_files.cmake index 94c5bb4e4ab..81ba61f4aa2 100644 --- a/CMakeLists_files.cmake +++ b/CMakeLists_files.cmake @@ -253,6 +253,7 @@ if (HAVE_CUDA) ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/preconditionerKernels/DILUKernels.cu) ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/preconditionerKernels/ILU0Kernels.cu) ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/preconditionerKernels/JacKernels.cu) + ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/kernelEnums.hpp) ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg GpuVector.cpp) ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg GpuView.cpp) ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/vector_operations.cu) diff --git a/opm/simulators/linalg/PreconditionerFactory_impl.hpp b/opm/simulators/linalg/PreconditionerFactory_impl.hpp index 0c905e9b2fb..e5aba601726 100644 --- a/opm/simulators/linalg/PreconditionerFactory_impl.hpp +++ b/opm/simulators/linalg/PreconditionerFactory_impl.hpp @@ -349,10 +349,10 @@ struct StandardPreconditioners { F::addCreator("GPUDILU", [](const O& op, [[maybe_unused]] const P& prm, const std::function&, std::size_t, const C& comm) { 
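+                // (clarifying comment) mixed_precision_scheme maps onto the
+                // MixedPrecisionScheme enum from kernelEnums.hpp: 0 = DEFAULT,
+                // 1 = STORE_ENTIRE_FACTORIZATION_AS_FLOAT,
+                // 2 = STORE_ONLY_FACTORIZED_DIAGONAL_AS_DOUBLE.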
const bool split_matrix = prm.get("split_matrix", true); const bool tune_gpu_kernels = prm.get("tune_gpu_kernels", true); - const bool store_factorization_as_float = prm.get("store_factorization_as_float", false); + const int mixed_precision_scheme = prm.get("mixed_precision_scheme", 0); using field_type = typename V::field_type; using GpuDILU = typename gpuistl::GpuDILU, gpuistl::GpuVector>; - auto gpuDILU = std::make_shared(op.getmat(), split_matrix, tune_gpu_kernels, store_factorization_as_float); + auto gpuDILU = std::make_shared(op.getmat(), split_matrix, tune_gpu_kernels, mixed_precision_scheme); auto adapted = std::make_shared>(gpuDILU); auto wrapped = std::make_shared>(adapted, comm); @@ -632,16 +632,16 @@ struct StandardPreconditioners { F::addCreator("GPUDILU", [](const O& op, [[maybe_unused]] const P& prm, const std::function&, std::size_t) { const bool split_matrix = prm.get("split_matrix", true); const bool tune_gpu_kernels = prm.get("tune_gpu_kernels", true); - const bool store_factorization_as_float = prm.get("store_factorization_as_float", false); + const int mixed_precision_scheme = prm.get("mixed_precision_scheme", 0); using field_type = typename V::field_type; using GPUDILU = typename gpuistl::GpuDILU, gpuistl::GpuVector>; - return std::make_shared>(std::make_shared(op.getmat(), split_matrix, tune_gpu_kernels, store_factorization_as_float)); + return std::make_shared>(std::make_shared(op.getmat(), split_matrix, tune_gpu_kernels, mixed_precision_scheme)); }); F::addCreator("GPUDILUFloat", [](const O& op, [[maybe_unused]] const P& prm, const std::function&, std::size_t) { const bool split_matrix = prm.get("split_matrix", true); const bool tune_gpu_kernels = prm.get("tune_gpu_kernels", true); - const bool store_factorization_as_float = prm.get("store_factorization_as_float", false); + const int mixed_precision_scheme = prm.get("mixed_precision_scheme", 0); using block_type = typename V::block_type; using VTo = Dune::BlockVector>; @@ -650,7 +650,7 @@ struct StandardPreconditioners { using Adapter = typename gpuistl::PreconditionerAdapter; using Converter = typename gpuistl::PreconditionerConvertFieldTypeAdapter; auto converted = std::make_shared(op.getmat()); - auto adapted = std::make_shared(std::make_shared(converted->getConvertedMatrix(), split_matrix, tune_gpu_kernels, store_factorization_as_float)); + auto adapted = std::make_shared(std::make_shared(converted->getConvertedMatrix(), split_matrix, tune_gpu_kernels, mixed_precision_scheme)); converted->setUnderlyingPreconditioner(adapted); return converted; }); diff --git a/opm/simulators/linalg/gpuistl/GpuDILU.cpp b/opm/simulators/linalg/gpuistl/GpuDILU.cpp index d1bcdf1bf77..2fc96cc819c 100644 --- a/opm/simulators/linalg/gpuistl/GpuDILU.cpp +++ b/opm/simulators/linalg/gpuistl/GpuDILU.cpp @@ -41,7 +41,7 @@ namespace Opm::gpuistl { template -GpuDILU::GpuDILU(const M& A, bool splitMatrix, bool tuneKernels, bool storeFactorizationAsFloat) +GpuDILU::GpuDILU(const M& A, bool splitMatrix, bool tuneKernels, int mixedPrecisionScheme) : m_cpuMatrix(A) , m_levelSets(Opm::getMatrixRowColoring(m_cpuMatrix, Opm::ColoringType::LOWER)) , m_reorderedToNatural(detail::createReorderedToNatural(m_levelSets)) @@ -52,9 +52,12 @@ GpuDILU::GpuDILU(const M& A, bool splitMatrix, bool tuneKernels, boo , m_gpuDInv(m_gpuMatrix.N() * m_gpuMatrix.blockSize() * m_gpuMatrix.blockSize()) , m_splitMatrix(splitMatrix) , m_tuneThreadBlockSizes(tuneKernels) - , m_storeFactorizationAsFloat(storeFactorizationAsFloat) - { + + 
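+    // (clarifying comment) Validate the integer read from the property tree
+    // before casting it to the MixedPrecisionScheme enum.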
OPM_ERROR_IF(!isValidMixedPrecisionScheme(mixedPrecisionScheme), + fmt::format("Invalid mixed precision scheme chosen: {}", mixedPrecisionScheme)); + m_mixedPrecisionScheme = static_cast(mixedPrecisionScheme); + // TODO: Should in some way verify that this matrix is symmetric, only do it debug mode? // Some sanity check OPM_ERROR_IF(A.N() != m_gpuMatrix.N(), @@ -82,14 +85,16 @@ GpuDILU::GpuDILU(const M& A, bool splitMatrix, bool tuneKernels, boo m_cpuMatrix, m_reorderedToNatural); } - if (m_storeFactorizationAsFloat) { + if (!m_mixedPrecisionScheme == MixedPrecisionScheme::DEFAULT) { if (!m_splitMatrix){ OPM_THROW(std::runtime_error, "Matrix must be split when storing as float."); } m_gpuMatrixReorderedLowerFloat = std::make_unique(m_gpuMatrixReorderedLower->getRowIndices(), m_gpuMatrixReorderedLower->getColumnIndices(), blocksize_); m_gpuMatrixReorderedUpperFloat = std::make_unique(m_gpuMatrixReorderedUpper->getRowIndices(), m_gpuMatrixReorderedUpper->getColumnIndices(), blocksize_); - m_gpuMatrixReorderedDiagFloat = std::make_unique(m_gpuMatrix.N() * m_gpuMatrix.blockSize() * m_gpuMatrix.blockSize()); - m_gpuDInvFloat = std::make_unique(m_gpuMatrix.N() * m_gpuMatrix.blockSize() * m_gpuMatrix.blockSize()); + // The MixedPrecisionScheme::STORE_ONLY_FACTORIZED_DIAGONAL_AS_DOUBLE does not need to allocate this float vector + if (m_mixedPrecisionScheme == MixedPrecisionScheme::STORE_ENTIRE_FACTORIZATION_AS_FLOAT) { + m_gpuDInvFloat = std::make_unique(m_gpuMatrix.N() * m_gpuMatrix.blockSize() * m_gpuMatrix.blockSize()); + } } computeDiagAndMoveReorderedData(m_moveThreadBlockSize, m_DILUFactorizationThreadBlockSize); @@ -123,7 +128,7 @@ GpuDILU::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int for (int level = 0; level < m_levelSets.size(); ++level) { const int numOfRowsInLevel = m_levelSets[level].size(); if (m_splitMatrix) { - if (m_storeFactorizationAsFloat) { + if (m_mixedPrecisionScheme == MixedPrecisionScheme::STORE_ENTIRE_FACTORIZATION_AS_FLOAT) { detail::DILU::solveLowerLevelSetSplit( m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(), m_gpuMatrixReorderedLowerFloat->getRowIndices().data(), @@ -135,6 +140,18 @@ GpuDILU::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int d.data(), v.data(), lowerSolveThreadBlockSize); + }else if (m_mixedPrecisionScheme == MixedPrecisionScheme::STORE_ONLY_FACTORIZED_DIAGONAL_AS_DOUBLE) { + detail::DILU::solveLowerLevelSetSplit( + m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(), + m_gpuMatrixReorderedLowerFloat->getRowIndices().data(), + m_gpuMatrixReorderedLowerFloat->getColumnIndices().data(), + m_gpuReorderToNatural.data(), + levelStartIdx, + numOfRowsInLevel, + m_gpuDInv.data(), + d.data(), + v.data(), + lowerSolveThreadBlockSize); } else { detail::DILU::solveLowerLevelSetSplit( m_gpuMatrixReorderedLower->getNonZeroValues().data(), @@ -170,7 +187,7 @@ GpuDILU::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int const int numOfRowsInLevel = m_levelSets[level].size(); levelStartIdx -= numOfRowsInLevel; if (m_splitMatrix) { - if (m_storeFactorizationAsFloat){ + if (m_mixedPrecisionScheme == MixedPrecisionScheme::STORE_ENTIRE_FACTORIZATION_AS_FLOAT){ detail::DILU::solveUpperLevelSetSplit( m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(), m_gpuMatrixReorderedUpperFloat->getRowIndices().data(), @@ -181,6 +198,17 @@ GpuDILU::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int m_gpuDInvFloat->data(), v.data(), upperSolveThreadBlockSize); + } else if (m_mixedPrecisionScheme == 
MixedPrecisionScheme::STORE_ONLY_FACTORIZED_DIAGONAL_AS_DOUBLE){ + detail::DILU::solveUpperLevelSetSplit( + m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(), + m_gpuMatrixReorderedUpperFloat->getRowIndices().data(), + m_gpuMatrixReorderedUpperFloat->getColumnIndices().data(), + m_gpuReorderToNatural.data(), + levelStartIdx, + numOfRowsInLevel, + m_gpuDInv.data(), + v.data(), + upperSolveThreadBlockSize); } else { detail::DILU::solveUpperLevelSetSplit( m_gpuMatrixReorderedUpper->getNonZeroValues().data(), @@ -271,8 +299,9 @@ GpuDILU::computeDiagAndMoveReorderedData(int moveThreadBlockSize, in for (int level = 0; level < m_levelSets.size(); ++level) { const int numOfRowsInLevel = m_levelSets[level].size(); if (m_splitMatrix) { - if (m_storeFactorizationAsFloat) { - detail::DILU::computeDiluDiagonalSplit( + if (m_mixedPrecisionScheme == MixedPrecisionScheme::STORE_ENTIRE_FACTORIZATION_AS_FLOAT) { + // printf("update: MixedPrecisionScheme::STORE_ENTIRE_FACTORIZATION_AS_FLOAT\n"); + detail::DILU::computeDiluDiagonalSplit( m_gpuMatrixReorderedLower->getNonZeroValues().data(), m_gpuMatrixReorderedLower->getRowIndices().data(), m_gpuMatrixReorderedLower->getColumnIndices().data(), @@ -289,9 +318,29 @@ GpuDILU::computeDiagAndMoveReorderedData(int moveThreadBlockSize, in m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(), m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(), factorizationBlockSize); + } else if (m_mixedPrecisionScheme == MixedPrecisionScheme::STORE_ONLY_FACTORIZED_DIAGONAL_AS_DOUBLE) { + // printf("update: MixedPrecisionScheme::STORE_ONLY_FACTORIZED_DIAGONAL_AS_DOUBLE\n"); + detail::DILU::computeDiluDiagonalSplit( + m_gpuMatrixReorderedLower->getNonZeroValues().data(), + m_gpuMatrixReorderedLower->getRowIndices().data(), + m_gpuMatrixReorderedLower->getColumnIndices().data(), + m_gpuMatrixReorderedUpper->getNonZeroValues().data(), + m_gpuMatrixReorderedUpper->getRowIndices().data(), + m_gpuMatrixReorderedUpper->getColumnIndices().data(), + m_gpuMatrixReorderedDiag->data(), + m_gpuReorderToNatural.data(), + m_gpuNaturalToReorder.data(), + levelStartIdx, + numOfRowsInLevel, + m_gpuDInv.data(), + nullptr, + m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(), + m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(), + factorizationBlockSize); } else { + // printf("update: MixedPrecisionScheme::DEFAULT\n"); // TODO: should this be field type twice or field type then float in the template? - detail::DILU::computeDiluDiagonalSplit( + detail::DILU::computeDiluDiagonalSplit( m_gpuMatrixReorderedLower->getNonZeroValues().data(), m_gpuMatrixReorderedLower->getRowIndices().data(), m_gpuMatrixReorderedLower->getColumnIndices().data(), diff --git a/opm/simulators/linalg/gpuistl/GpuDILU.hpp b/opm/simulators/linalg/gpuistl/GpuDILU.hpp index 623e151322e..40209f17b4b 100644 --- a/opm/simulators/linalg/gpuistl/GpuDILU.hpp +++ b/opm/simulators/linalg/gpuistl/GpuDILU.hpp @@ -23,6 +23,7 @@ #include #include #include +#include #include @@ -62,7 +63,7 @@ class GpuDILU : public Dune::PreconditionerWithUpdate //! \param A The matrix to operate on. //! \param w The relaxation factor. //! - explicit GpuDILU(const M& A, bool splitMatrix, bool tuneKernels, bool storeFactorizationAsFloat); + explicit GpuDILU(const M& A, bool splitMatrix, bool tuneKernels, int storeFactorizationAsFloat); //! \brief Prepare the preconditioner. //! \note Does nothing at the time being. @@ -145,7 +146,7 @@ class GpuDILU : public Dune::PreconditionerWithUpdate //! 
\brief Bool storing whether or not we will tune the threadblock sizes. Only used for AMD cards bool m_tuneThreadBlockSizes; //! \brief Bool storing whether or not we will store the factorization as float. Only used for mixed precision - bool m_storeFactorizationAsFloat; + MixedPrecisionScheme m_mixedPrecisionScheme; //! \brief variables storing the threadblocksizes to use if using the tuned sizes and AMD cards //! The default value of -1 indicates that we have not calibrated and selected a value yet int m_upperSolveThreadBlockSize = -1; diff --git a/opm/simulators/linalg/gpuistl/detail/kernelEnums.hpp b/opm/simulators/linalg/gpuistl/detail/kernelEnums.hpp new file mode 100644 index 00000000000..fea8220f65e --- /dev/null +++ b/opm/simulators/linalg/gpuistl/detail/kernelEnums.hpp @@ -0,0 +1,46 @@ +/* + Copyright 2024 SINTEF AS + + This file is part of the Open Porous Media project (OPM). + + OPM is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OPM is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with OPM. If not, see . +*/ + +#ifndef OPM_GPUISTL_PRECONDITIONER_STORAGE_OPTION_HPP +#define OPM_GPUISTL_PRECONDITIONER_STORAGE_OPTION_HPP + +/* + This file is here to organize a growing amount of different mixed precision options for the preconditioners. +*/ + +namespace Opm::gpuistl { + enum MixedPrecisionScheme { + DEFAULT, + STORE_ENTIRE_FACTORIZATION_AS_FLOAT, + STORE_ONLY_FACTORIZED_DIAGONAL_AS_DOUBLE + }; + + inline bool isValidMixedPrecisionScheme(int scheme) { + switch (scheme) { + case DEFAULT: + case STORE_ENTIRE_FACTORIZATION_AS_FLOAT: + case STORE_ONLY_FACTORIZED_DIAGONAL_AS_DOUBLE: + return true; + default: + return false; + } + } +} + +#endif // OPM_GPUISTL_PRECONDITIONER_STORAGE_OPTION_HPP diff --git a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.cu b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.cu index 050de298938..31231bfce47 100644 --- a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.cu +++ b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.cu @@ -59,14 +59,14 @@ namespace } } - template + template __global__ void cuSolveLowerLevelSetSplit(MatrixScalar* mat, int* rowIndices, int* colIndices, int* indexConversion, int startIdx, int rowsInLevelSet, - const MatrixScalar* dInv, + const DiagonalScalar* dInv, const LinearSolverScalar* d, LinearSolverScalar* v) { @@ -88,7 +88,7 @@ namespace mmvMixedGeneral(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs); } - mvMixedGeneral(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]); + mvMixedGeneral(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]); } } @@ -118,14 +118,14 @@ namespace } } - template + template __global__ void cuSolveUpperLevelSetSplit(MatrixScalar* mat, int* rowIndices, int* colIndices, int* indexConversion, int startIdx, int rowsInLevelSet, - const MatrixScalar* dInv, + const DiagonalScalar* dInv, LinearSolverScalar* v) { const auto reorderedRowIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x); @@ -140,7 
+140,7 @@ namespace umvMixedGeneral(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs); } - mmvMixedGeneral(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]); + mmvMixedGeneral(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]); } } @@ -213,7 +213,7 @@ namespace // TODO: rewrite such that during the factorization there is a dInv of InputScalar type that stores intermediate results // TOOD: The important part is to only cast after that is fully computed - template + template __global__ void cuComputeDiluDiagonalSplit(const InputScalar* srcReorderedLowerMat, int* lowerRowIndices, int* lowerColIndices, @@ -255,7 +255,7 @@ namespace const int symOppositeBlock = symOppositeIdx; - if constexpr (copyResultToOtherMatrix) { + if constexpr (mixedPrecisionScheme != MixedPrecisionScheme::DEFAULT) { // TODO: think long and hard about whether this performs only the wanted memory transfers moveBlock(&srcReorderedLowerMat[block * blocksize * blocksize], &dstLowerMat[block * blocksize * blocksize]); moveBlock(&srcReorderedUpperMat[symOppositeBlock * blocksize * blocksize], &dstUpperMat[symOppositeBlock * blocksize * blocksize]); @@ -275,7 +275,8 @@ namespace // } // } moveBlock(dInvTmp, &dInv[reorderedRowIdx * blocksize * blocksize]); - if constexpr (copyResultToOtherMatrix) { + // only if we store the entire matrix as a float should we be moving the block to the float diagonal + if constexpr (mixedPrecisionScheme == MixedPrecisionScheme::STORE_ENTIRE_FACTORIZATION_AS_FLOAT) { moveBlock(dInvTmp, &dstDiag[reorderedRowIdx * blocksize * blocksize]); // important! } } @@ -304,7 +305,7 @@ solveLowerLevelSet(T* reorderedMat, } -template +template void solveLowerLevelSetSplit(MatrixScalar* reorderedMat, int* rowIndices, @@ -312,15 +313,15 @@ solveLowerLevelSetSplit(MatrixScalar* reorderedMat, int* indexConversion, int startIdx, int rowsInLevelSet, - const MatrixScalar* dInv, + const DiagonalScalar* dInv, const LinearSolverScalar* d, LinearSolverScalar* v, int thrBlockSize) { int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize( - cuSolveLowerLevelSetSplit, thrBlockSize); + cuSolveLowerLevelSetSplit, thrBlockSize); int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize); - cuSolveLowerLevelSetSplit<<>>( + cuSolveLowerLevelSetSplit<<>>( reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, d, v); } // perform the upper solve for all rows in the same level set @@ -343,7 +344,7 @@ solveUpperLevelSet(T* reorderedMat, reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v); } -template +template void solveUpperLevelSetSplit(MatrixScalar* reorderedMat, int* rowIndices, @@ -351,14 +352,14 @@ solveUpperLevelSetSplit(MatrixScalar* reorderedMat, int* indexConversion, int startIdx, int rowsInLevelSet, - const MatrixScalar* dInv, + const DiagonalScalar* dInv, LinearSolverScalar* v, int thrBlockSize) { int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize( - cuSolveUpperLevelSetSplit, thrBlockSize); + cuSolveUpperLevelSetSplit, thrBlockSize); int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize); - cuSolveUpperLevelSetSplit<<>>( + cuSolveUpperLevelSetSplit<<>>( reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v); } @@ -391,7 +392,7 @@ computeDiluDiagonal(T* reorderedMat, } } -template +template void 
computeDiluDiagonalSplit(const InputScalar* srcReorderedLowerMat, int* lowerRowIndices, @@ -412,9 +413,9 @@ computeDiluDiagonalSplit(const InputScalar* srcReorderedLowerMat, { if (blocksize <= 3) { int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize( - cuComputeDiluDiagonalSplit, thrBlockSize); + cuComputeDiluDiagonalSplit, thrBlockSize); int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize); - cuComputeDiluDiagonalSplit<<>>(srcReorderedLowerMat, + cuComputeDiluDiagonalSplit<<>>(srcReorderedLowerMat, lowerRowIndices, lowerColIndices, srcReorderedUpperMat, @@ -437,13 +438,17 @@ computeDiluDiagonalSplit(const InputScalar* srcReorderedLowerMat, // TODO: format #define INSTANTIATE_KERNEL_WRAPPERS(T, blocksize) \ template void computeDiluDiagonal(T*, int*, int*, int*, int*, const int, int, T*, int); \ - template void computeDiluDiagonalSplit( \ + template void computeDiluDiagonalSplit( \ const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, double*, double*, double*, int); \ - template void computeDiluDiagonalSplit( \ + template void computeDiluDiagonalSplit( \ const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, float*, float*, float*, int); \ - template void computeDiluDiagonalSplit( \ + template void computeDiluDiagonalSplit( \ const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, float*, float*, float*, int); \ - template void computeDiluDiagonalSplit( \ + template void computeDiluDiagonalSplit( \ + const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, double*, double*, double*, int); \ + template void computeDiluDiagonalSplit( \ + const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, float*, float*, float*, int); \ + template void computeDiluDiagonalSplit( \ const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, double*, double*, double*, int); \ template void solveUpperLevelSet(T*, int*, int*, int*, int, int, const T*, T*, int); \ template void solveLowerLevelSet(T*, int*, int*, int*, int, int, const T*, const T*, T*, int); @@ -461,29 +466,48 @@ INSTANTIATE_KERNEL_WRAPPERS(double, 4); INSTANTIATE_KERNEL_WRAPPERS(double, 5); INSTANTIATE_KERNEL_WRAPPERS(double, 6); -#define INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, LinearSolverScalar, MatrixScalar) \ - template void solveUpperLevelSetSplit( \ - MatrixScalar*, int*, int*, int*, int, int, const MatrixScalar*, LinearSolverScalar*, int); \ - template void solveLowerLevelSetSplit( \ - MatrixScalar*, int*, int*, int*, int, int, const MatrixScalar*, const LinearSolverScalar*, LinearSolverScalar*, int); - -INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(1, float, float); -INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(2, float, float); -INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(3, float, float); -INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(4, float, float); -INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(5, float, float); -INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(6, float, float); -INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(1, double, double); -INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(2, double, double); -INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(3, double, double); -INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(4, double, double); -INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(5, double, double); -INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(6, double, double); -INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(1, double, float); -INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(2, double, float); -INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(3, 
double, float); -INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(4, double, float); -INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(5, double, float); -INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(6, double, float); +#define INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar) \ + template void solveUpperLevelSetSplit( \ + MatrixScalar*, int*, int*, int*, int, int, const DiagonalScalar*, LinearSolverScalar*, int); \ + template void solveLowerLevelSetSplit( \ + MatrixScalar*, int*, int*, int*, int, int, const DiagonalScalar*, const LinearSolverScalar*, LinearSolverScalar*, int); + +// TODO: be smarter about this... Surely this instantiates many more combinations that are actually needed +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(1, float, float, float); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(2, float, float, float); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(3, float, float, float); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(4, float, float, float); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(5, float, float, float); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(6, float, float, float); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(1, double, double, float); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(2, double, double, float); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(3, double, double, float); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(4, double, double, float); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(5, double, double, float); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(6, double, double, float); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(1, double, float, float); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(2, double, float, float); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(3, double, float, float); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(4, double, float, float); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(5, double, float, float); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(6, double, float, float); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(1, float, float, double); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(2, float, float, double); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(3, float, float, double); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(4, float, float, double); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(5, float, float, double); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(6, float, float, double); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(1, double, double, double); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(2, double, double, double); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(3, double, double, double); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(4, double, double, double); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(5, double, double, double); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(6, double, double, double); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(1, double, float, double); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(2, double, float, double); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(3, double, float, double); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(4, double, float, double); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(5, double, float, double); +INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(6, double, float, double); } // namespace Opm::gpuistl::detail::DILU diff --git a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.hpp b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.hpp index 44b94c2f178..6dc8f61eb93 100644 --- a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.hpp +++ b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.hpp @@ -22,6 +22,7 @@ #include #include #include +#include #include namespace Opm::gpuistl::detail::DILU @@ -71,14 +72,14 @@ void solveLowerLevelSet(T* reorderedMat, * @param d Stores the defect * @param [out] v Will store the results of 
the lower solve
 */
-template <int blocksize, class LinearSolverScalar, class MatrixScalar>
+template <int blocksize, class LinearSolverScalar, class MatrixScalar, class DiagonalScalar>
 void solveLowerLevelSetSplit(MatrixScalar* reorderedLowerMat,
 int* rowIndices,
 int* colIndices,
 int* indexConversion,
 int startIdx,
 int rowsInLevelSet,
- const MatrixScalar* dInv,
+ const DiagonalScalar* dInv,
 const LinearSolverScalar* d,
 LinearSolverScalar* v,
 int threadBlockSize);
@@ -124,14 +125,14 @@ void solveUpperLevelSet(T* reorderedMat,
 * @param [out] v Will store the results of the upper solve. To begin with it should store the output from the lower
 * solve
 */
-template <int blocksize, class LinearSolverScalar, class MatrixScalar>
+template <int blocksize, class LinearSolverScalar, class MatrixScalar, class DiagonalScalar>
 void solveUpperLevelSetSplit(MatrixScalar* reorderedUpperMat,
 int* rowIndices,
 int* colIndices,
 int* indexConversion,
 int startIdx,
 int rowsInLevelSet,
- const MatrixScalar* dInv,
+ const DiagonalScalar* dInv,
 LinearSolverScalar* v,
 int threadBlockSize);
@@ -183,7 +184,7 @@ void computeDiluDiagonal(T* reorderedMat,
 * function
 * @param [out] dInv The diagonal matrix used by the Diagonal ILU preconditioner
 */
-template
+template
 void computeDiluDiagonalSplit(const InputScalar* srcReorderedLowerMat,
 int* lowerRowIndices,
 int* lowerColIndices,

From f7da4ffe4e4a5a4f66b3b33e2c43731db4dd6fd5 Mon Sep 17 00:00:00 2001
From: Tobias Meyer Andersen
Date: Tue, 22 Oct 2024 15:19:16 +0200
Subject: [PATCH 4/6] WIP, think double diagonal is working, broken float MP

---
 .../linalg/PreconditionerFactory_impl.hpp |  8 +--
 opm/simulators/linalg/gpuistl/GpuDILU.cpp |  6 +-
 opm/simulators/linalg/gpuistl/GpuDILU.hpp |  2 +-
 opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp | 59 ++++++++++++---
 opm/simulators/linalg/gpuistl/OpmGpuILU0.hpp |  8 +--
 .../preconditionerKernels/DILUKernels.cu | 50 ++++---------
 .../preconditionerKernels/ILU0Kernels.cu | 71 ++++++++++++-------
 .../preconditionerKernels/ILU0Kernels.hpp |  7 +-
 8 files changed, 122 insertions(+), 89 deletions(-)

diff --git a/opm/simulators/linalg/PreconditionerFactory_impl.hpp b/opm/simulators/linalg/PreconditionerFactory_impl.hpp
index e5aba601726..46d8712094b 100644
--- a/opm/simulators/linalg/PreconditionerFactory_impl.hpp
+++ b/opm/simulators/linalg/PreconditionerFactory_impl.hpp
@@ -362,10 +362,10 @@ struct StandardPreconditioners {
 F::addCreator("OPMGPUILU0", [](const O& op, [[maybe_unused]] const P& prm, const std::function&, std::size_t, const C& comm) {
 const bool split_matrix = prm.get("split_matrix", true);
 const bool tune_gpu_kernels = prm.get("tune_gpu_kernels", true);
- const bool store_factorization_as_float = prm.get("store_factorization_as_float", false);
+ const int mixed_precision_scheme = prm.get("mixed_precision_scheme", 0);
 using field_type = typename V::field_type;
 using OpmGpuILU0 = typename gpuistl::OpmGpuILU0, gpuistl::GpuVector>;
- auto gpuilu0 = std::make_shared(op.getmat(), split_matrix, tune_gpu_kernels, store_factorization_as_float);
+ auto gpuilu0 = std::make_shared(op.getmat(), split_matrix, tune_gpu_kernels, mixed_precision_scheme);
 auto adapted = std::make_shared>(gpuilu0);
 auto wrapped = std::make_shared>(adapted, comm);
@@ -621,12 +621,12 @@ struct StandardPreconditioners {
 F::addCreator("OPMGPUILU0", [](const O& op, [[maybe_unused]] const P& prm, const std::function&, std::size_t) {
 const bool split_matrix = prm.get("split_matrix", true);
 const bool tune_gpu_kernels = prm.get("tune_gpu_kernels", true);
- const bool store_factorization_as_float = prm.get("store_factorization_as_float", false);
+ const int mixed_precision_scheme = prm.get("mixed_precision_scheme", 0);
 using field_type = typename V::field_type;
 using GPUILU0 = typename gpuistl::OpmGpuILU0, gpuistl::GpuVector>;
- return
std::make_shared>(std::make_shared(op.getmat(), split_matrix, tune_gpu_kernels, store_factorization_as_float));
+ return std::make_shared>(std::make_shared(op.getmat(), split_matrix, tune_gpu_kernels, mixed_precision_scheme));
 });

 F::addCreator("GPUDILU", [](const O& op, [[maybe_unused]] const P& prm, const std::function&, std::size_t) {
diff --git a/opm/simulators/linalg/gpuistl/GpuDILU.cpp b/opm/simulators/linalg/gpuistl/GpuDILU.cpp
index 2fc96cc819c..0e0f969fbcb 100644
--- a/opm/simulators/linalg/gpuistl/GpuDILU.cpp
+++ b/opm/simulators/linalg/gpuistl/GpuDILU.cpp
@@ -129,7 +129,7 @@ GpuDILU::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
 const int numOfRowsInLevel = m_levelSets[level].size();
 if (m_splitMatrix) {
 if (m_mixedPrecisionScheme == MixedPrecisionScheme::STORE_ENTIRE_FACTORIZATION_AS_FLOAT) {
- detail::DILU::solveLowerLevelSetSplit(
+ detail::DILU::solveLowerLevelSetSplit(
 m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(),
 m_gpuMatrixReorderedLowerFloat->getRowIndices().data(),
 m_gpuMatrixReorderedLowerFloat->getColumnIndices().data(),
@@ -141,7 +141,7 @@ GpuDILU::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
 v.data(),
 lowerSolveThreadBlockSize);
 }else if (m_mixedPrecisionScheme == MixedPrecisionScheme::STORE_ONLY_FACTORIZED_DIAGONAL_AS_DOUBLE) {
- detail::DILU::solveLowerLevelSetSplit(
+ detail::DILU::solveLowerLevelSetSplit(
 m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(),
 m_gpuMatrixReorderedLowerFloat->getRowIndices().data(),
 m_gpuMatrixReorderedLowerFloat->getColumnIndices().data(),
@@ -153,7 +153,7 @@ GpuDILU::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
 v.data(),
 lowerSolveThreadBlockSize);
 } else {
- detail::DILU::solveLowerLevelSetSplit(
+ detail::DILU::solveLowerLevelSetSplit(
 m_gpuMatrixReorderedLower->getNonZeroValues().data(),
 m_gpuMatrixReorderedLower->getRowIndices().data(),
 m_gpuMatrixReorderedLower->getColumnIndices().data(),
diff --git a/opm/simulators/linalg/gpuistl/GpuDILU.hpp b/opm/simulators/linalg/gpuistl/GpuDILU.hpp
index 40209f17b4b..86ad4aa4dc4 100644
--- a/opm/simulators/linalg/gpuistl/GpuDILU.hpp
+++ b/opm/simulators/linalg/gpuistl/GpuDILU.hpp
@@ -63,7 +63,7 @@ class GpuDILU : public Dune::PreconditionerWithUpdate
 //! \param A The matrix to operate on.
 //! \param splitMatrix Whether to store the lower, diagonal and upper parts of the matrix separately.
 //! \param tuneKernels Whether to tune the GPU kernel thread block sizes.
 //! \param mixedPrecisionScheme Which mixed precision scheme to use when storing the factorization.
 //!
- explicit GpuDILU(const M& A, bool splitMatrix, bool tuneKernels, int storeFactorizationAsFloat);
+ explicit GpuDILU(const M& A, bool splitMatrix, bool tuneKernels, int mixedPrecisionScheme);
 //! \brief Prepare the preconditioner.
 //! \note Does nothing at the time being.
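A note on the MixedPrecisionScheme values used throughout these hunks: the enum and the isValidMixedPrecisionScheme() helper live in a header that is not part of this series (it is pulled in by the #include lines added to the hpp files). The following is a minimal sketch of what that header must provide, inferred purely from the call sites in GpuDILU.cpp above and OpmGpuILU0.cpp below; the exact header path, namespace, and integer values are assumptions:

    // Sketch only -- inferred from usage such as
    //   m_mixedPrecisionScheme = static_cast<MixedPrecisionScheme>(mixedPrecisionScheme);
    // and the OPM_ERROR_IF validation in the OpmGpuILU0 constructor below.
    enum class MixedPrecisionScheme : int {
        DEFAULT = 0,                                  // factorization kept entirely in the matrix scalar type
        STORE_ENTIRE_FACTORIZATION_AS_FLOAT = 1,      // L, U and the inverted diagonal all stored as float
        STORE_ONLY_FACTORIZED_DIAGONAL_AS_DOUBLE = 2  // L and U as float, inverted diagonal kept in double
    };

    // Rejects integers that do not correspond to one of the three schemes above.
    inline bool isValidMixedPrecisionScheme(int scheme)
    {
        return scheme >= 0 && scheme <= 2;
    }

Since the factory reads mixed_precision_scheme with a default of 0, DEFAULT preserves the previous full-precision behaviour, while 1 and 2 select the two mixed precision variants exercised in apply() and in the factorization kernels.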
diff --git a/opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp b/opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp index 24ba06ecb06..99022b2f6db 100644 --- a/opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp +++ b/opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp @@ -41,7 +41,7 @@ namespace Opm::gpuistl { template -OpmGpuILU0::OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernels, bool storeFactorizationAsFloat) +OpmGpuILU0::OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernels, int mixedPrecisionScheme) : m_cpuMatrix(A) , m_levelSets(Opm::getMatrixRowColoring(m_cpuMatrix, Opm::ColoringType::LOWER)) , m_reorderedToNatural(detail::createReorderedToNatural(m_levelSets)) @@ -56,8 +56,12 @@ OpmGpuILU0::OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernel , m_gpuDInv(m_gpuMatrix.N() * m_gpuMatrix.blockSize() * m_gpuMatrix.blockSize()) , m_splitMatrix(splitMatrix) , m_tuneThreadBlockSizes(tuneKernels) - , m_storeFactorizationAsFloat(storeFactorizationAsFloat) { + + OPM_ERROR_IF(!isValidMixedPrecisionScheme(mixedPrecisionScheme), + fmt::format("Invalid mixed precision scheme chosen: {}", mixedPrecisionScheme)); + m_mixedPrecisionScheme = static_cast(mixedPrecisionScheme); + // TODO: Should in some way verify that this matrix is symmetric, only do it debug mode? // Some sanity check OPM_ERROR_IF(A.N() != m_gpuMatrix.N(), @@ -85,13 +89,16 @@ OpmGpuILU0::OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernel m_cpuMatrix, m_reorderedToNatural); } - if (m_storeFactorizationAsFloat){ + if (!m_mixedPrecisionScheme == MixedPrecisionScheme::DEFAULT){ OPM_ERROR_IF(!m_splitMatrix, "Mixed precision GpuILU0 is currently only supported when using split_matrix=true"); // initialize mixed precision datastructures m_gpuMatrixReorderedLowerFloat = std::make_unique(m_gpuMatrixReorderedLower->getRowIndices(), m_gpuMatrixReorderedLower->getColumnIndices(), blocksize_); m_gpuMatrixReorderedUpperFloat = std::make_unique(m_gpuMatrixReorderedUpper->getRowIndices(), m_gpuMatrixReorderedUpper->getColumnIndices(), blocksize_); - m_gpuMatrixReorderedDiagFloat.emplace(GpuVector(m_gpuMatrix.N() * m_gpuMatrix.blockSize() * m_gpuMatrix.blockSize())); + // The MixedPrecisionScheme::STORE_ONLY_FACTORIZED_DIAGONAL_AS_DOUBLE does not need to allocate this float vector + if (m_mixedPrecisionScheme == MixedPrecisionScheme::STORE_ENTIRE_FACTORIZATION_AS_FLOAT) { + m_gpuMatrixReorderedDiagFloat.emplace(GpuVector(m_gpuMatrix.N() * m_gpuMatrix.blockSize() * m_gpuMatrix.blockSize())); + } } LUFactorizeAndMoveData(m_moveThreadBlockSize, m_ILU0FactorizationThreadBlockSize); @@ -128,7 +135,7 @@ OpmGpuILU0::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i for (int level = 0; level < m_levelSets.size(); ++level) { const int numOfRowsInLevel = m_levelSets[level].size(); if (m_splitMatrix) { - if (m_storeFactorizationAsFloat){ + if (m_mixedPrecisionScheme != MixedPrecisionScheme::DEFAULT) { detail::ILU0::solveLowerLevelSetSplit( m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(), m_gpuMatrixReorderedLowerFloat->getRowIndices().data(), @@ -171,8 +178,8 @@ OpmGpuILU0::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i const int numOfRowsInLevel = m_levelSets[level].size(); levelStartIdx -= numOfRowsInLevel; if (m_splitMatrix) { - if (m_storeFactorizationAsFloat) { - detail::ILU0::solveUpperLevelSetSplit( + if (m_mixedPrecisionScheme == MixedPrecisionScheme::STORE_ENTIRE_FACTORIZATION_AS_FLOAT) { + detail::ILU0::solveUpperLevelSetSplit( m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(), 
m_gpuMatrixReorderedUpperFloat->getRowIndices().data(),
 m_gpuMatrixReorderedUpperFloat->getColumnIndices().data(),
@@ -183,8 +190,20 @@ OpmGpuILU0::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i
 v.data(),
 upperSolveThreadBlockSize);
 }
+ if (m_mixedPrecisionScheme == MixedPrecisionScheme::STORE_ENTIRE_FACTORIZATION_AS_FLOAT) {
+ detail::ILU0::solveUpperLevelSetSplit(
+ m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(),
+ m_gpuMatrixReorderedUpperFloat->getRowIndices().data(),
+ m_gpuMatrixReorderedUpperFloat->getColumnIndices().data(),
+ m_gpuReorderToNatural.data(),
+ levelStartIdx,
+ numOfRowsInLevel,
+ m_gpuMatrixReorderedDiag.value().data(),
+ v.data(),
+ upperSolveThreadBlockSize);
+ }
 else{
- detail::ILU0::solveUpperLevelSetSplit(
+ detail::ILU0::solveUpperLevelSetSplit(
 m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
 m_gpuMatrixReorderedUpper->getRowIndices().data(),
 m_gpuMatrixReorderedUpper->getColumnIndices().data(),
@@ -272,8 +291,8 @@ OpmGpuILU0::LUFactorizeAndMoveData(int moveThreadBlockSize, int fact
 const int numOfRowsInLevel = m_levelSets[level].size();
 if (m_splitMatrix) {
- if (m_storeFactorizationAsFloat){
- detail::ILU0::LUFactorizationSplit(
+ if (m_mixedPrecisionScheme == MixedPrecisionScheme::STORE_ENTIRE_FACTORIZATION_AS_FLOAT){
+ detail::ILU0::LUFactorizationSplit(
 m_gpuMatrixReorderedLower->getNonZeroValues().data(),
 m_gpuMatrixReorderedLower->getRowIndices().data(),
 m_gpuMatrixReorderedLower->getColumnIndices().data(),
@@ -290,8 +309,26 @@ OpmGpuILU0::LUFactorizeAndMoveData(int moveThreadBlockSize, int fact
 numOfRowsInLevel,
 factorizationThreadBlockSize);
 }
+ else if (m_mixedPrecisionScheme == MixedPrecisionScheme::STORE_ONLY_FACTORIZED_DIAGONAL_AS_DOUBLE){
+ detail::ILU0::LUFactorizationSplit(
+ m_gpuMatrixReorderedLower->getNonZeroValues().data(),
+ m_gpuMatrixReorderedLower->getRowIndices().data(),
+ m_gpuMatrixReorderedLower->getColumnIndices().data(),
+ m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
+ m_gpuMatrixReorderedUpper->getRowIndices().data(),
+ m_gpuMatrixReorderedUpper->getColumnIndices().data(),
+ m_gpuMatrixReorderedDiag.value().data(),
+ m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(),
+ m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(),
+ nullptr,
+ m_gpuReorderToNatural.data(),
+ m_gpuNaturalToReorder.data(),
+ levelStartIdx,
+ numOfRowsInLevel,
+ factorizationThreadBlockSize);
+ }
 else{
- detail::ILU0::LUFactorizationSplit(
+ detail::ILU0::LUFactorizationSplit(
 m_gpuMatrixReorderedLower->getNonZeroValues().data(),
 m_gpuMatrixReorderedLower->getRowIndices().data(),
 m_gpuMatrixReorderedLower->getColumnIndices().data(),
diff --git a/opm/simulators/linalg/gpuistl/OpmGpuILU0.hpp b/opm/simulators/linalg/gpuistl/OpmGpuILU0.hpp
index 025be1f5bf5..c9453a25a9c 100644
--- a/opm/simulators/linalg/gpuistl/OpmGpuILU0.hpp
+++ b/opm/simulators/linalg/gpuistl/OpmGpuILU0.hpp
@@ -24,6 +24,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -64,7 +65,7 @@ class OpmGpuILU0 : public Dune::PreconditionerWithUpdate
 //! \param A The matrix to operate on.
 //! \param splitMatrix Whether to store the lower, diagonal and upper parts of the matrix separately.
 //! \param tuneKernels Whether to tune the GPU kernel thread block sizes.
 //! \param mixedPrecisionScheme Which mixed precision scheme to use when storing the factorization.
 //!
- explicit OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernels, bool storeFactorizationAsFloat);
+ explicit OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernels, int mixedPrecisionScheme);
 //! \brief Prepare the preconditioner.
 //! \note Does nothing at the time being.
@@ -143,9 +144,8 @@ class OpmGpuILU0 : public Dune::PreconditionerWithUpdate
 bool m_splitMatrix;
 //!
\brief Bool storing whether or not we will tune the threadblock sizes. Only used for AMD cards
 bool m_tuneThreadBlockSizes;
- //! \brief Bool storing whether or not we should store the ILU factorization in a float datastructure.
- //! This uses a mixed precision preconditioner to trade numerical accuracy for memory transfer speed.
- bool m_storeFactorizationAsFloat;
+ //! \brief The mixed precision scheme used when storing the factorization (DEFAULT means no mixed precision)
+ MixedPrecisionScheme m_mixedPrecisionScheme;
 //! \brief variables storing the threadblocksizes to use if using the tuned sizes and AMD cards
 //! The default value of -1 indicates that we have not calibrated and selected a value yet
 int m_upperSolveThreadBlockSize = -1;
diff --git a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.cu b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.cu
index 31231bfce47..eb997a7e987 100644
--- a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.cu
+++ b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.cu
@@ -473,41 +473,19 @@ INSTANTIATE_KERNEL_WRAPPERS(double, 6);
 MatrixScalar*, int*, int*, int*, int, int, const DiagonalScalar*, const LinearSolverScalar*, LinearSolverScalar*, int);
 // TODO: be smarter about this... Surely this instantiates many more combinations than are actually needed
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(1, float, float, float);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(2, float, float, float);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(3, float, float, float);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(4, float, float, float);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(5, float, float, float);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(6, float, float, float);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(1, double, double, float);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(2, double, double, float);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(3, double, double, float);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(4, double, double, float);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(5, double, double, float);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(6, double, double, float);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(1, double, float, float);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(2, double, float, float);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(3, double, float, float);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(4, double, float, float);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(5, double, float, float);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(6, double, float, float);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(1, float, float, double);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(2, float, float, double);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(3, float, float, double);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(4, float, float, double);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(5, float, float, double);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(6, float, float, double);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(1, double, double, double);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(2, double, double, double);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(3, double, double, double);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(4, double, double, double);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(5, double, double, double);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(6, double, double, double);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(1, double, float, double);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(2, double, float, double);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(3, double, float, double);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(4, double, float, double);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(5, double, float, double);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(6, double, float, double);
+#define INSTANTIATE_SOLVE_LEVEL_SET_SPLIT_ALL(blocksize) \
+ INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, float, float, float); \
+ INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, double, float); \
+ INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, float, float); \
+ INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, float, float, double); \
+ INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, double, double); \
+ INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, float, double);
+
+INSTANTIATE_SOLVE_LEVEL_SET_SPLIT_ALL(1);
+INSTANTIATE_SOLVE_LEVEL_SET_SPLIT_ALL(2);
+INSTANTIATE_SOLVE_LEVEL_SET_SPLIT_ALL(3);
+INSTANTIATE_SOLVE_LEVEL_SET_SPLIT_ALL(4);
+INSTANTIATE_SOLVE_LEVEL_SET_SPLIT_ALL(5);
+INSTANTIATE_SOLVE_LEVEL_SET_SPLIT_ALL(6);
 } // namespace Opm::gpuistl::detail::DILU
diff --git a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.cu b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.cu
index 5d304e3660f..9d028c92351 100644
--- a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.cu
+++ b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.cu
@@ -120,7 +120,7 @@
 }
 }
- template
+ template
 __global__ void cuLUFactorizationSplit(InputScalar* srcReorderedLowerMat,
 int* lowerRowIndices,
 int* lowerColIndices,
@@ -161,7 +161,7 @@
 mmOverlap(&srcReorderedLowerMat[ij * scalarsInBlock],
 &srcDiagonal[j * scalarsInBlock],
 &srcReorderedLowerMat[ij * scalarsInBlock]);
- if (copyResultToOtherMatrix) {
+ if constexpr (mixedPrecisionScheme != MixedPrecisionScheme::DEFAULT) {
 moveBlock(&srcReorderedLowerMat[ij * scalarsInBlock],
 &dstReorderedLowerMat[ij * scalarsInBlock]);
 }
@@ -184,17 +184,17 @@
 int ikColumn;
 if (ikState == POSITION_TYPE::UNDER_DIAG) {
 ikBlockPtr = &srcReorderedLowerMat[ik * scalarsInBlock];
- if (copyResultToOtherMatrix)
+ if (mixedPrecisionScheme != MixedPrecisionScheme::DEFAULT)
 dstIkBlockPtr = &dstReorderedLowerMat[ik * scalarsInBlock];
 ikColumn = lowerColIndices[ik];
 } else if (ikState == POSITION_TYPE::ON_DIAG) {
 ikBlockPtr = &srcDiagonal[reorderedIdx * scalarsInBlock];
- if (copyResultToOtherMatrix)
+ if (mixedPrecisionScheme != MixedPrecisionScheme::DEFAULT)
 dstIkBlockPtr = &dstDiagonal[reorderedIdx * scalarsInBlock];
 ikColumn = naturalIdx;
 } else { // ikState == POSITION_TYPE::ABOVE_DIAG
 ikBlockPtr = &srcReorderedUpperMat[ik * scalarsInBlock];
- if (copyResultToOtherMatrix)
+ if (mixedPrecisionScheme != MixedPrecisionScheme::DEFAULT)
 dstIkBlockPtr = &dstReorderedUpperMat[ik * scalarsInBlock];
 ikColumn = upperColIndices[ik];
 }
@@ -219,10 +219,13 @@
 }
 }
 invBlockInPlace(&srcDiagonal[reorderedIdx * scalarsInBlock]);
- if (copyResultToOtherMatrix) {
- moveBlock(&srcDiagonal[reorderedIdx * scalarsInBlock],
- &dstDiagonal[reorderedIdx * scalarsInBlock]);
-
+ if constexpr (mixedPrecisionScheme != MixedPrecisionScheme::DEFAULT) {
+ // If we want to store the entire factorization as float, we must also move the diagonal block from double to float;
+ // otherwise we keep using the double diagonal that is already stored in srcDiagonal.
+ if constexpr (mixedPrecisionScheme == MixedPrecisionScheme::STORE_ENTIRE_FACTORIZATION_AS_FLOAT){
+ moveBlock(&srcDiagonal[reorderedIdx * scalarsInBlock],
+ &dstDiagonal[reorderedIdx * scalarsInBlock]);
+ }
 // also move all values above the diagonal on this row
 for (int block = startOfRowIUpper; block < endOfRowIUpper; ++block) {
moveBlock(&srcReorderedUpperMat[block * scalarsInBlock],
@@ -321,14 +324,14 @@
 }
 }
-    template <int blocksize, class LinearSolverScalar, class MatrixScalar>
+    template <int blocksize, class LinearSolverScalar, class MatrixScalar, class DiagonalScalar>
 __global__ void cuSolveUpperLevelSetSplit(MatrixScalar* mat,
 int* rowIndices,
 int* colIndices,
 int* indexConversion,
 int startIdx,
 int rowsInLevelSet,
- const MatrixScalar* dInv,
+ const DiagonalScalar* dInv,
 LinearSolverScalar* v)
 {
 auto reorderedIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x);
@@ -348,7 +351,7 @@
 &mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
 }
- mvMixedGeneral(
+ mvMixedGeneral(
 &dInv[reorderedIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
 }
 }
@@ -410,7 +413,7 @@ solveLowerLevelSetSplit(MatrixScalar* reorderedMat,
 reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, d, v);
 }
 // perform the upper solve for all rows in the same level set
-template <int blocksize, class LinearSolverScalar, class MatrixScalar>
+template <int blocksize, class LinearSolverScalar, class MatrixScalar, class DiagonalScalar>
 void
 solveUpperLevelSetSplit(MatrixScalar* reorderedMat,
 int* rowIndices,
@@ -418,14 +421,14 @@ solveUpperLevelSetSplit(MatrixScalar* reorderedMat,
 int* indexConversion,
 int startIdx,
 int rowsInLevelSet,
- const MatrixScalar* dInv,
+ const DiagonalScalar* dInv,
 LinearSolverScalar* v,
 int thrBlockSize)
 {
 int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(
- cuSolveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar>, thrBlockSize);
+ cuSolveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar>, thrBlockSize);
 int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
- cuSolveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar><<<nThreadBlocks, threadBlockSize>>>(
+ cuSolveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar><<<nThreadBlocks, threadBlockSize>>>(
 reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
 }
@@ -447,7 +450,7 @@ LUFactorization(T* srcMatrix,
 srcMatrix, srcRowIndices, srcColumnIndices, naturalToReordered, reorderedToNatual, rowsInLevelSet, startIdx);
 }
-template
+template
 void
 LUFactorizationSplit(InputScalar* srcReorderedLowerMat,
 int* lowerRowIndices,
 int* lowerColIndices,
@@ -466,9 +469,9 @@ LUFactorizationSplit(InputScalar* srcReorderedLowerMat,
 int thrBlockSize)
 {
 int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(
- cuLUFactorizationSplit, thrBlockSize);
+ cuLUFactorizationSplit, thrBlockSize);
 int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
- cuLUFactorizationSplit
+ cuLUFactorizationSplit
 <<<nThreadBlocks, threadBlockSize>>>(srcReorderedLowerMat,
 lowerRowIndices,
 lowerColIndices,
@@ -489,14 +492,18 @@ LUFactorizationSplit(InputScalar* srcReorderedLowerMat,
 template void solveUpperLevelSet(T*, int*, int*, int*, int, int, T*, int); \
 template void solveLowerLevelSet(T*, int*, int*, int*, int, int, const T*, T*, int); \
 template void LUFactorization(T*, int*, int*, int*, int*, size_t, int, int); \
- template void LUFactorizationSplit( \
+ template void LUFactorizationSplit( \
 T*, int*, int*, T*, int*, int*, T*, float*, float*, float*, int*, int*, const int, int, int); \
- template void LUFactorizationSplit( \
+ template void LUFactorizationSplit( \
 T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int); \
- template void LUFactorizationSplit( \
+ template void LUFactorizationSplit( \
+ T*, int*, int*, T*, int*, int*, T*, float*, float*, float*, int*, int*, const int, int, int); \
+ template void LUFactorizationSplit( \
+ T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int); \
+ template void LUFactorizationSplit( \
 T*, int*, int*, T*, int*, int*, T*, float*, float*, float*, int*, int*, const int, int, int); \
- template void LUFactorizationSplit( \
- T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int);
+ template void LUFactorizationSplit( \
+ T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int);

 INSTANTIATE_KERNEL_WRAPPERS(float, 1);
 INSTANTIATE_KERNEL_WRAPPERS(float, 2);
 INSTANTIATE_KERNEL_WRAPPERS(float, 3);
 INSTANTIATE_KERNEL_WRAPPERS(float, 4);
 INSTANTIATE_KERNEL_WRAPPERS(float, 5);
 INSTANTIATE_KERNEL_WRAPPERS(float, 6);
 INSTANTIATE_KERNEL_WRAPPERS(double, 1);
 INSTANTIATE_KERNEL_WRAPPERS(double, 2);
 INSTANTIATE_KERNEL_WRAPPERS(double, 3);
 INSTANTIATE_KERNEL_WRAPPERS(double, 4);
 INSTANTIATE_KERNEL_WRAPPERS(double, 5);
 INSTANTIATE_KERNEL_WRAPPERS(double, 6);
@@ -523,15 +530,25 @@ INSTANTIATE_KERNEL_WRAPPERS(double, 6);
 float*, int*, int*, int*, int, int, const float*, float*, int); \
 \
 /* double preconditioner */ \
- template void solveUpperLevelSetSplit<blocksize, double, double>( \
+ template void solveUpperLevelSetSplit<blocksize, double, double, double>( \
 double*, int*, int*, int*, int, int, const double*, double*, int); \
 /* float matrix, double compute preconditioner */ \
- template void solveUpperLevelSetSplit<blocksize, double, float>( \
+ template void solveUpperLevelSetSplit<blocksize, double, float, double>( \
+ float*, int*, int*, int*, int, int, const double*, double*, int); \
+ /* float preconditioner */ \
+ template void solveUpperLevelSetSplit<blocksize, float, float, double>( \
+ float*, int*, int*, int*, int, int, const double*, float*, int); \
+ /* double preconditioner */ \
+ template void solveUpperLevelSetSplit<blocksize, double, double, float>( \
+ double*, int*, int*, int*, int, int, const float*, double*, int); \
+ /* float matrix, double compute preconditioner */ \
+ template void solveUpperLevelSetSplit<blocksize, double, float, float>( \
 float*, int*, int*, int*, int, int, const float*, double*, int); \
 /* float preconditioner */ \
- template void solveUpperLevelSetSplit<blocksize, float, float>( \
+ template void solveUpperLevelSetSplit<blocksize, float, float, float>( \
 float*, int*, int*, int*, int, int, const float*, float*, int);
+
 INSTANTIATE_MIXED_PRECISION_KERNEL_WRAPPERS(1);
 INSTANTIATE_MIXED_PRECISION_KERNEL_WRAPPERS(2);
 INSTANTIATE_MIXED_PRECISION_KERNEL_WRAPPERS(3);
diff --git a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.hpp b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.hpp
index 03dbbe0c9ec..f88a47fc495 100644
--- a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.hpp
+++ b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.hpp
@@ -20,6 +20,7 @@
 #define OPM_ILU0_KERNELS_HPP
 #include
 #include
+#include
 namespace Opm::gpuistl::detail::ILU0
 {
@@ -89,14 +90,14 @@
 * solve
 * @param threadBlockSize The number of threads per threadblock. Leave as -1 if no blocksize is already chosen
 */
-template <int blocksize, class LinearSolverScalar, class MatrixScalar>
+template <int blocksize, class LinearSolverScalar, class MatrixScalar, class DiagonalScalar>
 void solveUpperLevelSetSplit(MatrixScalar* reorderedMat,
 int* rowIndices,
 int* colIndices,
 int* indexConversion,
 int startIdx,
 int rowsInLevelSet,
- const MatrixScalar* dInv,
+ const DiagonalScalar* dInv,
 LinearSolverScalar* v,
 int threadBlockSize);
@@ -176,7 +177,7 @@ void LUFactorization(T* reorderedMat,
 * function
 * @param threadBlockSize The number of threads per threadblock.
Leave as -1 if no blocksize is already chosen */ -template +template void LUFactorizationSplit(InputScalar* srcReorderedLowerMat, int* lowerRowIndices, int* lowerColIndices, From 5f62f48ae3caac48d846968a4d2a356dd0f28049 Mon Sep 17 00:00:00 2001 From: Tobias Meyer Andersen Date: Tue, 22 Oct 2024 16:12:45 +0200 Subject: [PATCH 5/6] found one bug, store_factorization_as_float still not converging --- opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp | 2 +- .../preconditionerKernels/ILU0Kernels.cu | 22 +++++++++---------- 2 files changed, 11 insertions(+), 13 deletions(-) diff --git a/opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp b/opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp index 99022b2f6db..3958ac75fe4 100644 --- a/opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp +++ b/opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp @@ -190,7 +190,7 @@ OpmGpuILU0::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i v.data(), upperSolveThreadBlockSize); } - if (m_mixedPrecisionScheme == MixedPrecisionScheme::STORE_ENTIRE_FACTORIZATION_AS_FLOAT) { + if (m_mixedPrecisionScheme == MixedPrecisionScheme::STORE_ONLY_FACTORIZED_DIAGONAL_AS_DOUBLE) { detail::ILU0::solveUpperLevelSetSplit( m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(), m_gpuMatrixReorderedUpperFloat->getRowIndices().data(), diff --git a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.cu b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.cu index 9d028c92351..b3617a66b15 100644 --- a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.cu +++ b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.cu @@ -505,18 +505,16 @@ LUFactorizationSplit(InputScalar* srcReorderedLowerMat, template void LUFactorizationSplit( \ T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int); -INSTANTIATE_KERNEL_WRAPPERS(float, 1); -INSTANTIATE_KERNEL_WRAPPERS(float, 2); -INSTANTIATE_KERNEL_WRAPPERS(float, 3); -INSTANTIATE_KERNEL_WRAPPERS(float, 4); -INSTANTIATE_KERNEL_WRAPPERS(float, 5); -INSTANTIATE_KERNEL_WRAPPERS(float, 6); -INSTANTIATE_KERNEL_WRAPPERS(double, 1); -INSTANTIATE_KERNEL_WRAPPERS(double, 2); -INSTANTIATE_KERNEL_WRAPPERS(double, 3); -INSTANTIATE_KERNEL_WRAPPERS(double, 4); -INSTANTIATE_KERNEL_WRAPPERS(double, 5); -INSTANTIATE_KERNEL_WRAPPERS(double, 6); +#define INSTANTIATE_BLOCK_SIZED_KERNEL_WRAPPERS(T) \ + INSTANTIATE_KERNEL_WRAPPERS(T, 1); \ + INSTANTIATE_KERNEL_WRAPPERS(T, 2); \ + INSTANTIATE_KERNEL_WRAPPERS(T, 3); \ + INSTANTIATE_KERNEL_WRAPPERS(T, 4); \ + INSTANTIATE_KERNEL_WRAPPERS(T, 5); \ + INSTANTIATE_KERNEL_WRAPPERS(T, 6); + +INSTANTIATE_BLOCK_SIZED_KERNEL_WRAPPERS(float) +INSTANTIATE_BLOCK_SIZED_KERNEL_WRAPPERS(double) #define INSTANTIATE_MIXED_PRECISION_KERNEL_WRAPPERS(blocksize) \ /* double preconditioner */ \ From b88fcedca04ce7ddd1f758397eb9311aac13c51a Mon Sep 17 00:00:00 2001 From: Tobias Meyer Andersen Date: Wed, 23 Oct 2024 09:20:27 +0200 Subject: [PATCH 6/6] fix crucial if-else bug --- opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp b/opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp index 3958ac75fe4..b2dd32f12f5 100644 --- a/opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp +++ b/opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp @@ -89,7 +89,7 @@ OpmGpuILU0::OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernel m_cpuMatrix, m_reorderedToNatural); } - if (!m_mixedPrecisionScheme == MixedPrecisionScheme::DEFAULT){ + 
if (m_mixedPrecisionScheme != MixedPrecisionScheme::DEFAULT){ OPM_ERROR_IF(!m_splitMatrix, "Mixed precision GpuILU0 is currently only supported when using split_matrix=true"); // initialize mixed precision datastructures @@ -97,6 +97,7 @@ OpmGpuILU0::OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernel m_gpuMatrixReorderedUpperFloat = std::make_unique(m_gpuMatrixReorderedUpper->getRowIndices(), m_gpuMatrixReorderedUpper->getColumnIndices(), blocksize_); // The MixedPrecisionScheme::STORE_ONLY_FACTORIZED_DIAGONAL_AS_DOUBLE does not need to allocate this float vector if (m_mixedPrecisionScheme == MixedPrecisionScheme::STORE_ENTIRE_FACTORIZATION_AS_FLOAT) { + printf("now using m_gpuMatrixReorderedDiagFloat\n"); m_gpuMatrixReorderedDiagFloat.emplace(GpuVector(m_gpuMatrix.N() * m_gpuMatrix.blockSize() * m_gpuMatrix.blockSize())); } } @@ -190,7 +191,7 @@ OpmGpuILU0::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i v.data(), upperSolveThreadBlockSize); } - if (m_mixedPrecisionScheme == MixedPrecisionScheme::STORE_ONLY_FACTORIZED_DIAGONAL_AS_DOUBLE) { + else if (m_mixedPrecisionScheme == MixedPrecisionScheme::STORE_ONLY_FACTORIZED_DIAGONAL_AS_DOUBLE) { detail::ILU0::solveUpperLevelSetSplit( m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(), m_gpuMatrixReorderedUpperFloat->getRowIndices().data(),