From bec3d9d2e5b033cd7293f5a1d474003de381732a Mon Sep 17 00:00:00 2001 From: Ricardo Fabbri Date: Fri, 3 Nov 2023 00:18:33 -0300 Subject: [PATCH] [Trifocal+P2Pt] inlining Eigen perm product --- minus/Eigen/src/Core/ProductEvaluators.h | 4 ++-- minus/lsolve.hxx | 6 ++++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/minus/Eigen/src/Core/ProductEvaluators.h b/minus/Eigen/src/Core/ProductEvaluators.h index 246bca3..f41dc76 100644 --- a/minus/Eigen/src/Core/ProductEvaluators.h +++ b/minus/Eigen/src/Core/ProductEvaluators.h @@ -972,7 +972,7 @@ struct permutation_matrix_product typedef typename remove_all::type MatrixTypeCleaned; template - static inline void run(Dest& dst, const PermutationType& perm, const ExpressionType& xpr) + static inline void __attribute__((always_inline)) run(Dest& dst, const PermutationType& perm, const ExpressionType& xpr) { MatrixType mat(xpr); const Index n = Side==OnTheLeft ? mat.rows() : mat.cols(); @@ -1026,7 +1026,7 @@ template struct generic_product_impl { template - static void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs) + static void __attribute__((always_inline)) evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs) { permutation_matrix_product::run(dst, lhs, rhs); } diff --git a/minus/lsolve.hxx b/minus/lsolve.hxx index 613a8e1..074d994 100644 --- a/minus/lsolve.hxx +++ b/minus/lsolve.hxx @@ -6,6 +6,7 @@ lsolve( Map, minus_core::f::nve, 1>, Aligned > & __restrict b, Map, minus_core::f::nve, 1>,Aligned> & __restrict x) { + asm("#------ Lsolve itself"); // it is not inlining it, and also there are too many vmovsd moving data. 
It is sub-vectorized, using only xmm, no ymm or zmm typedef minus_core M; typedef PermutationMatrix PermutationType; typedef Transpositions TranspositionType; @@ -31,10 +32,13 @@ lsolve( if (k < rows-1) m.bottomRightCorner(rrows,rrows).noalias() -= m.col(k).tail(rrows) * m.row(k).tail(rrows); } + asm("#------ assign"); m_p = m_rowsTranspositions; + asm("#------ permute"); // it is not inlining it, and also there are too many vmovsd moving data. It is sub-vectorized, using only xmm, no ymm or zmm x = m_p * b; // TODO: use block indexing and std::vector-std::vector multiplication + asm("#------ fwdsubst"); // it is not inlining it, and also there are too many vmovsd moving data. It is sub-vectorized, using only xmm, no ymm or zmm x(1) -= m(1,0)*x(0); x(2) -= m(2,0)*x(0)+m(2,1)*x(1); x(3) -= m(3,0)*x(0)+m(3,1)*x(1)+m(3,2)*x(2); @@ -52,6 +56,7 @@ lsolve( // Step 2 //m.template triangularView().solveInPlace(x); + asm("#------ backsubst"); // it is not inlining it, and also there are too many vmovsd moving data. It is sub-vectorized, using only xmm, no ymm or zmm x(13) /= m(13,13); x(12) -= m(12,13)*x(13); x(12) /= m(12,12); x(11) -= (m(11,12)*x(12)+m(11,13)*x(13)); x(11) /= m(11,11); @@ -66,4 +71,5 @@ lsolve( x(2) -= (m(2,3)*x(3)+m(2,4)*x(4)+m(2,5)*x(5)+m(2,6)*x(6)+m(2,7)*x(7)+m(2,8)*x(8)+m(2,9)*x(9)+m(2,10)*x(10)+m(2,11)*x(11)+m(2,12)*x(12)+m(2,13)*x(13)); x(2) /= m(2,2); x(1) -= (m(1,2)*x(2)+m(1,3)*x(3)+m(1,4)*x(4)+m(1,5)*x(5)+m(1,6)*x(6)+m(1,7)*x(7)+m(1,8)*x(8)+m(1,9)*x(9)+m(1,10)*x(10)+m(1,11)*x(11)+m(1,12)*x(12)+m(1,13)*x(13)); x(1) /= m(1,1); x(0) -= (m(0,1)*x(1)+m(0,2)*x(2)+m(0,3)*x(3)+m(0,4)*x(4)+m(0,5)*x(5)+m(0,6)*x(6)+m(0,7)*x(7)+m(0,8)*x(8)+m(0,9)*x(9)+m(0,10)*x(10)+m(0,11)*x(11)+m(0,12)*x(12)+m(0,13)*x(13)); x(0) /= m(0,0); + asm("#------ END Lsolve itself"); // it is not inlining it, and also there are too many vmovsd moving data. It is sub-vectorized, using only xmm, no ymm or zmm }