From bec3d9d2e5b033cd7293f5a1d474003de381732a Mon Sep 17 00:00:00 2001
From: Ricardo Fabbri <rfabbri@gmail.com>
Date: Fri, 3 Nov 2023 00:18:33 -0300
Subject: [PATCH] [Trifocal+P2Pt] inlining Eigen perm product

---
 minus/Eigen/src/Core/ProductEvaluators.h | 4 ++--
 minus/lsolve.hxx                         | 6 ++++++
 2 files changed, 8 insertions(+), 2 deletions(-)
diff --git a/minus/Eigen/src/Core/ProductEvaluators.h b/minus/Eigen/src/Core/ProductEvaluators.h
index 246bca3..f41dc76 100644
--- a/minus/Eigen/src/Core/ProductEvaluators.h
+++ b/minus/Eigen/src/Core/ProductEvaluators.h
@@ -972,7 +972,7 @@ struct permutation_matrix_product<ExpressionType, Side, Transposed, DenseShape>
     typedef typename remove_all<MatrixType>::type MatrixTypeCleaned;
 
     template<typename Dest, typename PermutationType>
-    static inline void run(Dest& dst, const PermutationType& perm, const ExpressionType& xpr)
+    static inline void __attribute__((always_inline)) run(Dest& dst, const PermutationType& perm, const ExpressionType& xpr)
     {
       MatrixType mat(xpr);
       const Index n = Side==OnTheLeft ? mat.rows() : mat.cols();
@@ -1026,7 +1026,7 @@ template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
 struct generic_product_impl<Lhs, Rhs, PermutationShape, MatrixShape, ProductTag>
 {
   template<typename Dest>
-  static void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs)
+  static void __attribute__((always_inline)) evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs)
   {
     permutation_matrix_product<Rhs, OnTheLeft, false, MatrixShape>::run(dst, lhs, rhs);
   }
diff --git a/minus/lsolve.hxx b/minus/lsolve.hxx
index 613a8e1..074d994 100644
--- a/minus/lsolve.hxx
+++ b/minus/lsolve.hxx
@@ -6,6 +6,7 @@ lsolve(
     Map<const Matrix<C<F>, minus_core<P,F>::f::nve, 1>, Aligned > & __restrict b,
     Map<Matrix<C<F>, minus_core<P,F>::f::nve, 1>,Aligned> & __restrict x)
 {
+  asm("#------ Lsolve itself"); // the compiler is not inlining it, and there are too many vmovsd instructions moving data. The code is under-vectorized: it uses only xmm registers, never ymm or zmm
   typedef minus_core<P, F> M;
   typedef PermutationMatrix<M::f::nve, M::f::nve> PermutationType;
   typedef Transpositions<M::f::nve, M::f::nve> TranspositionType;
@@ -31,10 +32,13 @@ lsolve(
     if (k < rows-1)
       m.bottomRightCorner(rrows,rrows).noalias() -= m.col(k).tail(rrows) * m.row(k).tail(rrows);
   }
+  asm("#------ assign"); 
   m_p = m_rowsTranspositions;
+  asm("#------ permute"); // the compiler is not inlining it, and there are too many vmovsd instructions moving data. The code is under-vectorized: it uses only xmm registers, never ymm or zmm
   x = m_p * b;
 
   // TODO: use block indexing and std::vector-std::vector multiplication
+  asm("#------ fwdsubst"); // the compiler is not inlining it, and there are too many vmovsd instructions moving data. The code is under-vectorized: it uses only xmm registers, never ymm or zmm
   x(1)  -= m(1,0)*x(0);
   x(2)  -= m(2,0)*x(0)+m(2,1)*x(1);
   x(3)  -= m(3,0)*x(0)+m(3,1)*x(1)+m(3,2)*x(2);
@@ -52,6 +56,7 @@ lsolve(
   // Step 2
   //m.template triangularView<UnitLower>().solveInPlace(x);
 
+  asm("#------ backsubst"); // the compiler is not inlining it, and there are too many vmovsd instructions moving data. The code is under-vectorized: it uses only xmm registers, never ymm or zmm
   x(13) /= m(13,13);
   x(12) -= m(12,13)*x(13); x(12) /= m(12,12);
   x(11) -= (m(11,12)*x(12)+m(11,13)*x(13)); x(11) /= m(11,11);
@@ -66,4 +71,5 @@ lsolve(
   x(2)  -= (m(2,3)*x(3)+m(2,4)*x(4)+m(2,5)*x(5)+m(2,6)*x(6)+m(2,7)*x(7)+m(2,8)*x(8)+m(2,9)*x(9)+m(2,10)*x(10)+m(2,11)*x(11)+m(2,12)*x(12)+m(2,13)*x(13)); x(2) /= m(2,2);
   x(1)  -= (m(1,2)*x(2)+m(1,3)*x(3)+m(1,4)*x(4)+m(1,5)*x(5)+m(1,6)*x(6)+m(1,7)*x(7)+m(1,8)*x(8)+m(1,9)*x(9)+m(1,10)*x(10)+m(1,11)*x(11)+m(1,12)*x(12)+m(1,13)*x(13)); x(1) /= m(1,1);
   x(0)  -= (m(0,1)*x(1)+m(0,2)*x(2)+m(0,3)*x(3)+m(0,4)*x(4)+m(0,5)*x(5)+m(0,6)*x(6)+m(0,7)*x(7)+m(0,8)*x(8)+m(0,9)*x(9)+m(0,10)*x(10)+m(0,11)*x(11)+m(0,12)*x(12)+m(0,13)*x(13)); x(0) /= m(0,0);
+  asm("#------ END Lsolve itself"); // the compiler is not inlining it, and there are too many vmovsd instructions moving data. The code is under-vectorized: it uses only xmm registers, never ymm or zmm
 }