Skip to content

Commit

Permalink
WIP, think double diagonal is working, broken float MP
Browse files Browse the repository at this point in the history
  • Loading branch information
multitalentloes committed Oct 22, 2024
1 parent 276f6ca commit b6d2394
Show file tree
Hide file tree
Showing 8 changed files with 122 additions and 89 deletions.
8 changes: 4 additions & 4 deletions opm/simulators/linalg/PreconditionerFactory_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -362,10 +362,10 @@ struct StandardPreconditioners {
F::addCreator("OPMGPUILU0", [](const O& op, [[maybe_unused]] const P& prm, const std::function<V()>&, std::size_t, const C& comm) {
const bool split_matrix = prm.get<bool>("split_matrix", true);
const bool tune_gpu_kernels = prm.get<bool>("tune_gpu_kernels", true);
const bool store_factorization_as_float = prm.get<bool>("store_factorization_as_float", false);
const int mixed_precision_scheme = prm.get<int>("mixed_precision_scheme", 0);
using field_type = typename V::field_type;
using OpmGpuILU0 = typename gpuistl::OpmGpuILU0<M, gpuistl::GpuVector<field_type>, gpuistl::GpuVector<field_type>>;
auto gpuilu0 = std::make_shared<OpmGpuILU0>(op.getmat(), split_matrix, tune_gpu_kernels, store_factorization_as_float);
auto gpuilu0 = std::make_shared<OpmGpuILU0>(op.getmat(), split_matrix, tune_gpu_kernels, mixed_precision_scheme);

auto adapted = std::make_shared<gpuistl::PreconditionerAdapter<V, V, OpmGpuILU0>>(gpuilu0);
auto wrapped = std::make_shared<gpuistl::GpuBlockPreconditioner<V, V, Comm>>(adapted, comm);
Expand Down Expand Up @@ -621,12 +621,12 @@ struct StandardPreconditioners<Operator, Dune::Amg::SequentialInformation> {
F::addCreator("OPMGPUILU0", [](const O& op, [[maybe_unused]] const P& prm, const std::function<V()>&, std::size_t) {
const bool split_matrix = prm.get<bool>("split_matrix", true);
const bool tune_gpu_kernels = prm.get<bool>("tune_gpu_kernels", true);
const bool store_factorization_as_float = prm.get<bool>("store_factorization_as_float", false);
const int mixed_precision_scheme = prm.get<int>("mixed_precision_scheme", 0);

using field_type = typename V::field_type;
using GPUILU0 = typename gpuistl::OpmGpuILU0<M, gpuistl::GpuVector<field_type>, gpuistl::GpuVector<field_type>>;

return std::make_shared<gpuistl::PreconditionerAdapter<V, V, GPUILU0>>(std::make_shared<GPUILU0>(op.getmat(), split_matrix, tune_gpu_kernels, store_factorization_as_float));
return std::make_shared<gpuistl::PreconditionerAdapter<V, V, GPUILU0>>(std::make_shared<GPUILU0>(op.getmat(), split_matrix, tune_gpu_kernels, mixed_precision_scheme));
});

F::addCreator("GPUDILU", [](const O& op, [[maybe_unused]] const P& prm, const std::function<V()>&, std::size_t) {
Expand Down
6 changes: 3 additions & 3 deletions opm/simulators/linalg/gpuistl/GpuDILU.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
const int numOfRowsInLevel = m_levelSets[level].size();
if (m_splitMatrix) {
if (m_mixedPrecisionScheme == MixedPrecisionScheme::STORE_ENTIRE_FACTORIZATION_AS_FLOAT) {
detail::DILU::solveLowerLevelSetSplit<blocksize_, field_type, float>(
detail::DILU::solveLowerLevelSetSplit<blocksize_, field_type, float, float>(
m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(),
m_gpuMatrixReorderedLowerFloat->getRowIndices().data(),
m_gpuMatrixReorderedLowerFloat->getColumnIndices().data(),
Expand All @@ -141,7 +141,7 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
v.data(),
lowerSolveThreadBlockSize);
}else if (m_mixedPrecisionScheme == MixedPrecisionScheme::STORE_ONLY_FACTORIZED_DIAGONAL_AS_DOUBLE) {
detail::DILU::solveLowerLevelSetSplit<blocksize_, field_type, float>(
detail::DILU::solveLowerLevelSetSplit<blocksize_, field_type, float, field_type>(
m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(),
m_gpuMatrixReorderedLowerFloat->getRowIndices().data(),
m_gpuMatrixReorderedLowerFloat->getColumnIndices().data(),
Expand All @@ -153,7 +153,7 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
v.data(),
lowerSolveThreadBlockSize);
} else {
detail::DILU::solveLowerLevelSetSplit<blocksize_, field_type, field_type>(
detail::DILU::solveLowerLevelSetSplit<blocksize_, field_type, field_type, field_type>(
m_gpuMatrixReorderedLower->getNonZeroValues().data(),
m_gpuMatrixReorderedLower->getRowIndices().data(),
m_gpuMatrixReorderedLower->getColumnIndices().data(),
Expand Down
2 changes: 1 addition & 1 deletion opm/simulators/linalg/gpuistl/GpuDILU.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ class GpuDILU : public Dune::PreconditionerWithUpdate<X, Y>
//! \param A The matrix to operate on.
//! \param w The relaxation factor.
//!
explicit GpuDILU(const M& A, bool splitMatrix, bool tuneKernels, int storeFactorizationAsFloat);
explicit GpuDILU(const M& A, bool splitMatrix, bool tuneKernels, int mixedPrecisionScheme);

//! \brief Prepare the preconditioner.
//! \note Does nothing at the time being.
Expand Down
59 changes: 48 additions & 11 deletions opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ namespace Opm::gpuistl
{

template <class M, class X, class Y, int l>
OpmGpuILU0<M, X, Y, l>::OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernels, bool storeFactorizationAsFloat)
OpmGpuILU0<M, X, Y, l>::OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernels, int mixedPrecisionScheme)
: m_cpuMatrix(A)
, m_levelSets(Opm::getMatrixRowColoring(m_cpuMatrix, Opm::ColoringType::LOWER))
, m_reorderedToNatural(detail::createReorderedToNatural(m_levelSets))
Expand All @@ -56,8 +56,12 @@ OpmGpuILU0<M, X, Y, l>::OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernel
, m_gpuDInv(m_gpuMatrix.N() * m_gpuMatrix.blockSize() * m_gpuMatrix.blockSize())
, m_splitMatrix(splitMatrix)
, m_tuneThreadBlockSizes(tuneKernels)
, m_storeFactorizationAsFloat(storeFactorizationAsFloat)
{

OPM_ERROR_IF(!isValidMixedPrecisionScheme(mixedPrecisionScheme),
fmt::format("Invalid mixed precision scheme chosen: {}", mixedPrecisionScheme));
m_mixedPrecisionScheme = static_cast<MixedPrecisionScheme>(mixedPrecisionScheme);

// TODO: Should in some way verify that this matrix is symmetric, only do it debug mode?
// Some sanity check
OPM_ERROR_IF(A.N() != m_gpuMatrix.N(),
Expand Down Expand Up @@ -85,13 +89,16 @@ OpmGpuILU0<M, X, Y, l>::OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernel
m_cpuMatrix, m_reorderedToNatural);
}

if (m_storeFactorizationAsFloat){
if (!m_mixedPrecisionScheme == MixedPrecisionScheme::DEFAULT){
OPM_ERROR_IF(!m_splitMatrix, "Mixed precision GpuILU0 is currently only supported when using split_matrix=true");

// initialize mixed precision datastructures
m_gpuMatrixReorderedLowerFloat = std::make_unique<FloatMat>(m_gpuMatrixReorderedLower->getRowIndices(), m_gpuMatrixReorderedLower->getColumnIndices(), blocksize_);
m_gpuMatrixReorderedUpperFloat = std::make_unique<FloatMat>(m_gpuMatrixReorderedUpper->getRowIndices(), m_gpuMatrixReorderedUpper->getColumnIndices(), blocksize_);
m_gpuMatrixReorderedDiagFloat.emplace(GpuVector<float>(m_gpuMatrix.N() * m_gpuMatrix.blockSize() * m_gpuMatrix.blockSize()));
// The MixedPrecisionScheme::STORE_ONLY_FACTORIZED_DIAGONAL_AS_DOUBLE does not need to allocate this float vector
if (m_mixedPrecisionScheme == MixedPrecisionScheme::STORE_ENTIRE_FACTORIZATION_AS_FLOAT) {
m_gpuMatrixReorderedDiagFloat.emplace(GpuVector<float>(m_gpuMatrix.N() * m_gpuMatrix.blockSize() * m_gpuMatrix.blockSize()));
}
}

LUFactorizeAndMoveData(m_moveThreadBlockSize, m_ILU0FactorizationThreadBlockSize);
Expand Down Expand Up @@ -128,7 +135,7 @@ OpmGpuILU0<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i
for (int level = 0; level < m_levelSets.size(); ++level) {
const int numOfRowsInLevel = m_levelSets[level].size();
if (m_splitMatrix) {
if (m_storeFactorizationAsFloat){
if (m_mixedPrecisionScheme != MixedPrecisionScheme::DEFAULT) {
detail::ILU0::solveLowerLevelSetSplit<blocksize_, field_type, float>(
m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(),
m_gpuMatrixReorderedLowerFloat->getRowIndices().data(),
Expand Down Expand Up @@ -171,8 +178,8 @@ OpmGpuILU0<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i
const int numOfRowsInLevel = m_levelSets[level].size();
levelStartIdx -= numOfRowsInLevel;
if (m_splitMatrix) {
if (m_storeFactorizationAsFloat) {
detail::ILU0::solveUpperLevelSetSplit<blocksize_, field_type, float>(
if (m_mixedPrecisionScheme == MixedPrecisionScheme::STORE_ENTIRE_FACTORIZATION_AS_FLOAT) {
detail::ILU0::solveUpperLevelSetSplit<blocksize_, field_type, float, float>(
m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(),
m_gpuMatrixReorderedUpperFloat->getRowIndices().data(),
m_gpuMatrixReorderedUpperFloat->getColumnIndices().data(),
Expand All @@ -183,8 +190,20 @@ OpmGpuILU0<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i
v.data(),
upperSolveThreadBlockSize);
}
if (m_mixedPrecisionScheme == MixedPrecisionScheme::STORE_ENTIRE_FACTORIZATION_AS_FLOAT) {
detail::ILU0::solveUpperLevelSetSplit<blocksize_, field_type, float, field_type>(
m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(),
m_gpuMatrixReorderedUpperFloat->getRowIndices().data(),
m_gpuMatrixReorderedUpperFloat->getColumnIndices().data(),
m_gpuReorderToNatural.data(),
levelStartIdx,
numOfRowsInLevel,
m_gpuMatrixReorderedDiag.value().data(),
v.data(),
upperSolveThreadBlockSize);
}
else{
detail::ILU0::solveUpperLevelSetSplit<blocksize_, field_type, field_type>(
detail::ILU0::solveUpperLevelSetSplit<blocksize_, field_type, field_type, field_type>(
m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
m_gpuMatrixReorderedUpper->getRowIndices().data(),
m_gpuMatrixReorderedUpper->getColumnIndices().data(),
Expand Down Expand Up @@ -272,8 +291,8 @@ OpmGpuILU0<M, X, Y, l>::LUFactorizeAndMoveData(int moveThreadBlockSize, int fact
const int numOfRowsInLevel = m_levelSets[level].size();

if (m_splitMatrix) {
if (m_storeFactorizationAsFloat){
detail::ILU0::LUFactorizationSplit<blocksize_, field_type, float, true>(
if (m_mixedPrecisionScheme == MixedPrecisionScheme::STORE_ENTIRE_FACTORIZATION_AS_FLOAT){
detail::ILU0::LUFactorizationSplit<blocksize_, field_type, float, MixedPrecisionScheme::STORE_ENTIRE_FACTORIZATION_AS_FLOAT>(
m_gpuMatrixReorderedLower->getNonZeroValues().data(),
m_gpuMatrixReorderedLower->getRowIndices().data(),
m_gpuMatrixReorderedLower->getColumnIndices().data(),
Expand All @@ -290,8 +309,26 @@ OpmGpuILU0<M, X, Y, l>::LUFactorizeAndMoveData(int moveThreadBlockSize, int fact
numOfRowsInLevel,
factorizationThreadBlockSize);
}
else if (m_mixedPrecisionScheme == MixedPrecisionScheme::STORE_ONLY_FACTORIZED_DIAGONAL_AS_DOUBLE){
detail::ILU0::LUFactorizationSplit<blocksize_, field_type, float, MixedPrecisionScheme::STORE_ONLY_FACTORIZED_DIAGONAL_AS_DOUBLE>(
m_gpuMatrixReorderedLower->getNonZeroValues().data(),
m_gpuMatrixReorderedLower->getRowIndices().data(),
m_gpuMatrixReorderedLower->getColumnIndices().data(),
m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
m_gpuMatrixReorderedUpper->getRowIndices().data(),
m_gpuMatrixReorderedUpper->getColumnIndices().data(),
m_gpuMatrixReorderedDiag.value().data(),
m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(),
m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(),
nullptr,
m_gpuReorderToNatural.data(),
m_gpuNaturalToReorder.data(),
levelStartIdx,
numOfRowsInLevel,
factorizationThreadBlockSize);
}
else{
detail::ILU0::LUFactorizationSplit<blocksize_, field_type, float, false>(
detail::ILU0::LUFactorizationSplit<blocksize_, field_type, float, MixedPrecisionScheme::DEFAULT>(
m_gpuMatrixReorderedLower->getNonZeroValues().data(),
m_gpuMatrixReorderedLower->getRowIndices().data(),
m_gpuMatrixReorderedLower->getColumnIndices().data(),
Expand Down
8 changes: 4 additions & 4 deletions opm/simulators/linalg/gpuistl/OpmGpuILU0.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include <opm/simulators/linalg/PreconditionerWithUpdate.hpp>
#include <opm/simulators/linalg/gpuistl/GpuSparseMatrix.hpp>
#include <opm/simulators/linalg/gpuistl/GpuVector.hpp>
#include <opm/simulators/linalg/gpuistl/detail/kernelEnums.hpp>
#include <optional>
#include <type_traits>
#include <vector>
Expand Down Expand Up @@ -64,7 +65,7 @@ class OpmGpuILU0 : public Dune::PreconditionerWithUpdate<X, Y>
//! \param A The matrix to operate on.
//! \param w The relaxation factor.
//!
explicit OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernels, bool storeFactorizationAsFloat);
explicit OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernels, int mixedPrecisionScheme);

//! \brief Prepare the preconditioner.
//! \note Does nothing at the time being.
Expand Down Expand Up @@ -143,9 +144,8 @@ class OpmGpuILU0 : public Dune::PreconditionerWithUpdate<X, Y>
bool m_splitMatrix;
//! \brief Bool storing whether or not we will tune the threadblock sizes. Only used for AMD cards
bool m_tuneThreadBlockSizes;
//! \brief Bool storing whether or not we should store the ILU factorization in a float datastructure.
//! This uses a mixed precision preconditioner to trade numerical accuracy for memory transfer speed.
bool m_storeFactorizationAsFloat;
//! \brief Bool storing whether or not we will store the factorization as float. Only used for mixed precision
MixedPrecisionScheme m_mixedPrecisionScheme;
//! \brief variables storing the threadblocksizes to use if using the tuned sizes and AMD cards
//! The default value of -1 indicates that we have not calibrated and selected a value yet
int m_upperSolveThreadBlockSize = -1;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -473,41 +473,19 @@ INSTANTIATE_KERNEL_WRAPPERS(double, 6);
MatrixScalar*, int*, int*, int*, int, int, const DiagonalScalar*, const LinearSolverScalar*, LinearSolverScalar*, int);

// TODO: be smarter about this... Surely this instantiates many more combinations that are actually needed
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(1, float, float, float);
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(2, float, float, float);
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(3, float, float, float);
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(4, float, float, float);
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(5, float, float, float);
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(6, float, float, float);
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(1, double, double, float);
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(2, double, double, float);
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(3, double, double, float);
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(4, double, double, float);
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(5, double, double, float);
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(6, double, double, float);
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(1, double, float, float);
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(2, double, float, float);
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(3, double, float, float);
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(4, double, float, float);
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(5, double, float, float);
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(6, double, float, float);
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(1, float, float, double);
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(2, float, float, double);
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(3, float, float, double);
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(4, float, float, double);
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(5, float, float, double);
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(6, float, float, double);
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(1, double, double, double);
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(2, double, double, double);
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(3, double, double, double);
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(4, double, double, double);
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(5, double, double, double);
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(6, double, double, double);
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(1, double, float, double);
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(2, double, float, double);
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(3, double, float, double);
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(4, double, float, double);
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(5, double, float, double);
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(6, double, float, double);
#define INSTANTIATE_SOLVE_LEVEL_SET_SPLIT_ALL(blocksize) \
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, float, float, float); \
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, double, float); \
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, float, float); \
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, float, float, double); \
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, double, double); \
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, float, double);

INSTANTIATE_SOLVE_LEVEL_SET_SPLIT_ALL(1);
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT_ALL(2);
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT_ALL(3);
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT_ALL(4);
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT_ALL(5);
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT_ALL(6);

} // namespace Opm::gpuistl::detail::DILU
Loading

0 comments on commit b6d2394

Please sign in to comment.