Skip to content

Commit

Permalink
[Trifocal+P2Pt] inlining Eigen perm product
Browse files Browse the repository at this point in the history
  • Loading branch information
rfabbri committed Nov 3, 2023
1 parent 803b28e commit bec3d9d
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 2 deletions.
4 changes: 2 additions & 2 deletions minus/Eigen/src/Core/ProductEvaluators.h
Original file line number Diff line number Diff line change
Expand Up @@ -972,7 +972,7 @@ struct permutation_matrix_product<ExpressionType, Side, Transposed, DenseShape>
typedef typename remove_all<MatrixType>::type MatrixTypeCleaned;

template<typename Dest, typename PermutationType>
static inline void run(Dest& dst, const PermutationType& perm, const ExpressionType& xpr)
static inline void __attribute__((always_inline)) run(Dest& dst, const PermutationType& perm, const ExpressionType& xpr)
{
MatrixType mat(xpr);
const Index n = Side==OnTheLeft ? mat.rows() : mat.cols();
Expand Down Expand Up @@ -1026,7 +1026,7 @@ template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
struct generic_product_impl<Lhs, Rhs, PermutationShape, MatrixShape, ProductTag>
{
template<typename Dest>
static void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs)
static void __attribute__((always_inline)) evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs)
{
permutation_matrix_product<Rhs, OnTheLeft, false, MatrixShape>::run(dst, lhs, rhs);
}
Expand Down
6 changes: 6 additions & 0 deletions minus/lsolve.hxx
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ lsolve(
Map<const Matrix<C<F>, minus_core<P,F>::f::nve, 1>, Aligned > & __restrict b,
Map<Matrix<C<F>, minus_core<P,F>::f::nve, 1>,Aligned> & __restrict x)
{
asm("#------ Lsolve itself"); // it is not inlining it, and there are also too many vmovsd instructions moving data. It is sub-vectorized, using only xmm registers, no ymm or zmm
typedef minus_core<P, F> M;
typedef PermutationMatrix<M::f::nve, M::f::nve> PermutationType;
typedef Transpositions<M::f::nve, M::f::nve> TranspositionType;
Expand All @@ -31,10 +32,13 @@ lsolve(
if (k < rows-1)
m.bottomRightCorner(rrows,rrows).noalias() -= m.col(k).tail(rrows) * m.row(k).tail(rrows);
}
asm("#------ assign");
m_p = m_rowsTranspositions;
asm("#------ permute"); // it is not inlining it, and there are also too many vmovsd instructions moving data. It is sub-vectorized, using only xmm registers, no ymm or zmm
x = m_p * b;

// TODO: use block indexing and std::vector-std::vector multiplication
asm("#------ fwdsubst"); // it is not inlining it, and there are also too many vmovsd instructions moving data. It is sub-vectorized, using only xmm registers, no ymm or zmm
x(1) -= m(1,0)*x(0);
x(2) -= m(2,0)*x(0)+m(2,1)*x(1);
x(3) -= m(3,0)*x(0)+m(3,1)*x(1)+m(3,2)*x(2);
Expand All @@ -52,6 +56,7 @@ lsolve(
// Step 2
//m.template triangularView<UnitLower>().solveInPlace(x);

asm("#------ backsubst"); // it is not inlining it, and there are also too many vmovsd instructions moving data. It is sub-vectorized, using only xmm registers, no ymm or zmm
x(13) /= m(13,13);
x(12) -= m(12,13)*x(13); x(12) /= m(12,12);
x(11) -= (m(11,12)*x(12)+m(11,13)*x(13)); x(11) /= m(11,11);
Expand All @@ -66,4 +71,5 @@ lsolve(
x(2) -= (m(2,3)*x(3)+m(2,4)*x(4)+m(2,5)*x(5)+m(2,6)*x(6)+m(2,7)*x(7)+m(2,8)*x(8)+m(2,9)*x(9)+m(2,10)*x(10)+m(2,11)*x(11)+m(2,12)*x(12)+m(2,13)*x(13)); x(2) /= m(2,2);
x(1) -= (m(1,2)*x(2)+m(1,3)*x(3)+m(1,4)*x(4)+m(1,5)*x(5)+m(1,6)*x(6)+m(1,7)*x(7)+m(1,8)*x(8)+m(1,9)*x(9)+m(1,10)*x(10)+m(1,11)*x(11)+m(1,12)*x(12)+m(1,13)*x(13)); x(1) /= m(1,1);
x(0) -= (m(0,1)*x(1)+m(0,2)*x(2)+m(0,3)*x(3)+m(0,4)*x(4)+m(0,5)*x(5)+m(0,6)*x(6)+m(0,7)*x(7)+m(0,8)*x(8)+m(0,9)*x(9)+m(0,10)*x(10)+m(0,11)*x(11)+m(0,12)*x(12)+m(0,13)*x(13)); x(0) /= m(0,0);
asm("#------ END Lsolve itself"); // it is not inlining it, and there are also too many vmovsd instructions moving data. It is sub-vectorized, using only xmm registers, no ymm or zmm
}

0 comments on commit bec3d9d

Please sign in to comment.