From 0024f667a7d804e862268f36b7c9b1dadf243c8a Mon Sep 17 00:00:00 2001 From: Ricardo Fabbri Date: Sat, 4 Nov 2023 17:53:48 -0300 Subject: [PATCH] [Trifocal+P2Pt] Specializing ::track to Chicago --- minus/chicago14a-HxH.hxx | 28 ------------------ minus/chicago14a-Hxt.hxx | 28 ------------------ minus/chicago14a-lsolve.hxx | 58 +++++++++++++++++++++++++++++++++++++ minus/lsolve.hxx | 58 ------------------------------------- minus/minus.hxx | 18 +++++++++++- 5 files changed, 75 insertions(+), 115 deletions(-) create mode 100644 minus/chicago14a-lsolve.hxx diff --git a/minus/chicago14a-HxH.hxx b/minus/chicago14a-HxH.hxx index 6d49374..cdc491c 100644 --- a/minus/chicago14a-HxH.hxx +++ b/minus/chicago14a-HxH.hxx @@ -2995,9 +2995,7 @@ HxH(const C* __restrict ux /*x and t*/, const C * __restrict uparams, C y[8] = G468; y[9] = G487; y[10] = G517; - y[11] = C3; y[12] = G520; - y[13] = C3; y[14] = G551; y[15] = G563; y[16] = G587; @@ -3009,9 +3007,7 @@ HxH(const C* __restrict ux /*x and t*/, const C * __restrict uparams, C y[22] = G692; y[23] = G703; y[24] = G733; - y[25] = C3; y[26] = G736; - y[27] = C3; y[28] = G764; y[29] = G775; y[30] = G798; @@ -3023,9 +3019,7 @@ HxH(const C* __restrict ux /*x and t*/, const C * __restrict uparams, C y[36] = G900; y[37] = G911; y[38] = G941; - y[39] = C3; y[40] = G944; - y[41] = C3; y[42] = G970; y[43] = G982; y[44] = G1003; @@ -3037,9 +3031,7 @@ HxH(const C* __restrict ux /*x and t*/, const C * __restrict uparams, C y[50] = G1102; y[51] = G1114; y[52] = G1144; - y[53] = C3; y[54] = G1147; - y[55] = C3; y[56] = G1226; y[57] = G1254; y[58] = G1290; @@ -3051,8 +3043,6 @@ HxH(const C* __restrict ux /*x and t*/, const C * __restrict uparams, C y[64] = G1446; y[65] = G1462; y[66] = G1492; - y[67] = C3; - y[68] = C3; y[69] = G1495; y[70] = G1526; y[71] = G1538; @@ -3065,8 +3055,6 @@ HxH(const C* __restrict ux /*x and t*/, const C * __restrict uparams, C y[78] = G1667; y[79] = G1678; y[80] = G1708; - y[81] = C3; - y[82] = C3; y[83] = G1711; y[84] = G1739; y[85] = G1750; @@ -3079,8 +3067,6 @@ HxH(const C* __restrict ux /*x and t*/, const C * __restrict uparams, C y[92] = G1875; y[93] = G1886; y[94] = G1916; - y[95] = C3; - y[96] = C3; y[97] = G1919; y[98] = G1945; y[99] = G1957; @@ -3093,8 +3079,6 @@ HxH(const C* __restrict ux /*x and t*/, const C * __restrict uparams, C y[106] = G2077; y[107] = G2089; y[108] = G2119; - y[109] = C3; - y[110] = C3; y[111] = G2122; y[112] = G2131; y[113] = G2137; @@ -3108,8 +3092,6 @@ HxH(const C* __restrict ux /*x and t*/, const C * __restrict uparams, C y[121] = G2197; y[122] = G2221; y[123] = G2224; - y[124] = C3; - y[125] = C3; y[126] = G2233; y[127] = G2239; y[128] = G2248; @@ -3122,8 +3104,6 @@ HxH(const C* __restrict ux /*x and t*/, const C * __restrict uparams, C y[135] = G2299; y[136] = G2323; y[137] = G2326; - y[138] = C3; - y[139] = C3; y[140] = G2335; y[141] = G2341; y[142] = G2350; @@ -3136,8 +3116,6 @@ HxH(const C* __restrict ux /*x and t*/, const C * __restrict uparams, C y[149] = G2401; y[150] = G2425; y[151] = G2428; - y[152] = C3; - y[153] = C3; y[154] = G2435; y[155] = G2440; y[156] = G2447; @@ -3150,8 +3128,6 @@ HxH(const C* __restrict ux /*x and t*/, const C * __restrict uparams, C y[163] = G2488; y[164] = G2509; y[165] = G2512; - y[166] = C3; - y[167] = C3; y[168] = G2519; y[169] = G2524; y[170] = G2531; @@ -3164,8 +3140,6 @@ HxH(const C* __restrict ux /*x and t*/, const C * __restrict uparams, C y[177] = G2572; y[178] = G2593; y[179] = G2596; - y[180] = C3; - y[181] = C3; y[182] = G2603; y[183] = G2608; y[184] = G2615; @@ -3178,8 +3152,6 @@ HxH(const C* __restrict ux /*x and t*/, const C * __restrict uparams, C y[191] = G2656; y[192] = G2677; y[193] = G2680; - y[194] = C3; - y[195] = C3; y[196] = -G2692; y[197] = -G2700; diff --git a/minus/chicago14a-Hxt.hxx b/minus/chicago14a-Hxt.hxx index 9d9bf54..e7cdbb3 100644 --- a/minus/chicago14a-Hxt.hxx +++ b/minus/chicago14a-Hxt.hxx @@ -3609,9 +3609,7 @@ Hxt(const C * __restrict ux, const C * __restrict uparams, C * __restri y[8] = G468; y[9] = G487; y[10] = G517; - y[11] = C3; y[12] = G520; - y[13] = C3; y[14] = G551; y[15] = G563; y[16] = G587; @@ -3623,9 +3621,7 @@ Hxt(const C * __restrict ux, const C * __restrict uparams, C * __restri y[22] = G692; y[23] = G703; y[24] = G733; - y[25] = C3; y[26] = G736; - y[27] = C3; y[28] = G764; y[29] = G775; y[30] = G798; @@ -3637,9 +3633,7 @@ Hxt(const C * __restrict ux, const C * __restrict uparams, C * __restri y[36] = G900; y[37] = G911; y[38] = G941; - y[39] = C3; y[40] = G944; - y[41] = C3; y[42] = G970; y[43] = G982; y[44] = G1003; @@ -3651,9 +3645,7 @@ Hxt(const C * __restrict ux, const C * __restrict uparams, C * __restri y[50] = G1102; y[51] = G1114; y[52] = G1144; - y[53] = C3; y[54] = G1147; - y[55] = C3; y[56] = G1226; y[57] = G1254; y[58] = G1290; @@ -3665,8 +3657,6 @@ Hxt(const C * __restrict ux, const C * __restrict uparams, C * __restri y[64] = G1446; y[65] = G1462; y[66] = G1492; - y[67] = C3; - y[68] = C3; y[69] = G1495; y[70] = G1526; y[71] = G1538; @@ -3679,8 +3669,6 @@ Hxt(const C * __restrict ux, const C * __restrict uparams, C * __restri y[78] = G1667; y[79] = G1678; y[80] = G1708; - y[81] = C3; - y[82] = C3; y[83] = G1711; y[84] = G1739; y[85] = G1750; @@ -3693,8 +3681,6 @@ Hxt(const C * __restrict ux, const C * __restrict uparams, C * __restri y[92] = G1875; y[93] = G1886; y[94] = G1916; - y[95] = C3; - y[96] = C3; y[97] = G1919; y[98] = G1945; y[99] = G1957; @@ -3707,8 +3693,6 @@ Hxt(const C * __restrict ux, const C * __restrict uparams, C * __restri y[106] = G2077; y[107] = G2089; y[108] = G2119; - y[109] = C3; - y[110] = C3; y[111] = G2122; y[112] = G2131; y[113] = G2137; @@ -3722,8 +3706,6 @@ Hxt(const C * __restrict ux, const C * __restrict uparams, C * __restri y[121] = G2197; y[122] = G2221; y[123] = G2224; - y[124] = C3; - y[125] = C3; y[126] = G2233; y[127] = G2239; y[128] = G2248; @@ -3736,8 +3718,6 @@ Hxt(const C * __restrict ux, const C * __restrict uparams, C * __restri y[135] = G2299; y[136] = G2323; y[137] = G2326; - y[138] = C3; - y[139] = C3; y[140] = G2335; y[141] = G2341; y[142] = G2350; @@ -3750,8 +3730,6 @@ Hxt(const C * __restrict ux, const C * __restrict uparams, C * __restri y[149] = G2401; y[150] = G2425; y[151] = G2428; - y[152] = C3; - y[153] = C3; y[154] = G2435; y[155] = G2440; y[156] = G2447; @@ -3764,8 +3742,6 @@ Hxt(const C * __restrict ux, const C * __restrict uparams, C * __restri y[163] = G2488; y[164] = G2509; y[165] = G2512; - y[166] = C3; - y[167] = C3; y[168] = G2519; y[169] = G2524; y[170] = G2531; @@ -3778,8 +3754,6 @@ Hxt(const C * __restrict ux, const C * __restrict uparams, C * __restri y[177] = G2572; y[178] = G2593; y[179] = G2596; - y[180] = C3; - y[181] = C3; y[182] = G2603; y[183] = G2608; y[184] = G2615; @@ -3792,8 +3766,6 @@ Hxt(const C * __restrict ux, const C * __restrict uparams, C * __restri y[191] = G2656; y[192] = G2677; y[193] = G2680; - y[194] = C3; - y[195] = C3; y[196] = -G2752; y[197] = -G2782; diff --git a/minus/chicago14a-lsolve.hxx b/minus/chicago14a-lsolve.hxx new file mode 100644 index 0000000..a9f07fb --- /dev/null +++ b/minus/chicago14a-lsolve.hxx @@ -0,0 +1,58 @@ +// Specific to Chicago +template +__attribute__((always_inline)) inline void +lsolve( + Map, minus_core::f::nve, minus_core::f::nve +1>,Aligned> & __restrict m, + C __restrict *ux) +{ + C * const x= reinterpret_cast *> (__builtin_assume_aligned(ux,64)); + //asm("#------ Lsolve begin"); // there is too many vmovsd moving data. It is sub-vectorized, using only xmm no y or zmm + typedef minus_core M; + static constexpr unsigned char rows = M::f::nve; + for(unsigned char k = 0; k < M::f::nve; ++k) { // unroll this loop make it + // specific to m structure of + // having many zeros in last 3 + // rowsj + const unsigned char rrows = rows-k-1; unsigned char row_of_biggest_in_col = k; + F biggest_in_corner = std::norm(m(k,k)); + for (unsigned j=rows-1; j != k; --j) { // todo: no need to go beyond row 10, Hxt rows 11 12 and 13 are fixed + F tmp; + if ((tmp = std::norm(m(j,k))) > biggest_in_corner*1e3) { + biggest_in_corner = tmp; row_of_biggest_in_col = j; + break; + } + } + if (k != row_of_biggest_in_col) m.row(k).swap(m.row(row_of_biggest_in_col)); + m.col(k).tail(rrows) /= m(k,k); + if (k < rows-1) + m.block(M::f::nve-rrows,M::f::nve-rrows,rrows,rrows).noalias() -= m.col(k).tail(rrows) * m.row(k).segment(M::f::nve-rrows,rrows); + } + x[0] = m(0,14); + x[1] = m(1,14)-m(1,0)*x[0]; + x[2] = m(2,14)-(m(2,0)*x[0]+m(2,1)*x[1]); + x[3] = m(3,14)-(m(3,0)*x[0]+m(3,1)*x[1]+m(3,2)*x[2]); + x[4] = m(4,14)-(m(4,0)*x[0]+m(4,1)*x[1]+m(4,2)*x[2]+m(4,3)*x[3]); + x[5] = m(5,14)-(m(5,0)*x[0]+m(5,1)*x[1]+m(5,2)*x[2]+m(5,3)*x[3]+m(5,4)*x[4]); + x[6] = m(6,14)-(m(6,0)*x[0]+m(6,1)*x[1]+m(6,2)*x[2]+m(6,3)*x[3]+m(6,4)*x[4]+m(6,5)*x[5]); + x[7] = m(7,14)-(m(7,0)*x[0]+m(7,1)*x[1]+m(7,2)*x[2]+m(7,3)*x[3]+m(7,4)*x[4]+m(7,5)*x[5]+m(7,6)*x[6]); + x[8] = m(8,14)-(m(8,0)*x[0]+m(8,1)*x[1]+m(8,2)*x[2]+m(8,3)*x[3]+m(8,4)*x[4]+m(8,5)*x[5]+m(8,6)*x[6]+m(8,7)*x[7]); + x[9] = m(9,14)-(m(9,0)*x[0]+m(9,1)*x[1]+m(9,2)*x[2]+m(9,3)*x[3]+m(9,4)*x[4]+m(9,5)*x[5]+m(9,6)*x[6]+m(9,7)*x[7]+m(9,8)*x[8]); + x[10] = m(10,14)-(m(10,0)*x[0]+m(10,1)*x[1]+m(10,2)*x[2]+m(10,3)*x[3]+m(10,4)*x[4]+m(10,5)*x[5]+m(10,6)*x[6]+m(10,7)*x[7]+m(10,8)*x[8]+m(10,9)*x[9]); + x[11] = m(11,14)-(m(11,0)*x[0]+m(11,1)*x[1]+m(11,2)*x[2]+m(11,3)*x[3]+m(11,4)*x[4]+m(11,5)*x[5]+m(11,6)*x[6]+m(11,7)*x[7]+m(11,8)*x[8]+m(11,9)*x[9]+m(11,10)*x[10]); + x[12] = m(12,14)-(m(12,0)*x[0]+m(12,1)*x[1]+m(12,2)*x[2]+m(12,3)*x[3]+m(12,4)*x[4]+m(12,5)*x[5]+m(12,6)*x[6]+m(12,7)*x[7]+m(12,8)*x[8]+m(12,9)*x[9]+m(12,10)*x[10]+m(12,11)*x[11]); + x[13] = m(13,14)-(m(13,0)*x[0]+m(13,1)*x[1]+m(13,2)*x[2]+m(13,3)*x[3]+m(13,4)*x[4]+m(13,5)*x[5]+m(13,6)*x[6]+m(13,7)*x[7]+m(13,8)*x[8]+m(13,9)*x[9]+m(13,10)*x[10]+m(13,11)*x[11]+m(13,12)*x[12]); + x[13] /= m(13,13); + x[12] -= m(12,13)*x[13]; x[12] /= m(12,12); + x[11] -= (m(11,12)*x[12]+m(11,13)*x[13]); x[11] /= m(11,11); + x[10] -= (m(10,11)*x[11]+m(10,12)*x[12]+m(10,13)*x[13]); x[10] /= m(10,10); + x[9] -= (m(9,10)*x[10]+m(9,11)*x[11]+m(9,12)*x[12]+m(9,13)*x[13]); x[9] /= m(9,9); + x[8] -= (m(8,9)*x[9]+m(8,10)*x[10]+m(8,11)*x[11]+m(8,12)*x[12]+m(8,13)*x[13]); x[8] /= m(8,8); + x[7] -= (m(7,8)*x[8]+m(7,9)*x[9]+m(7,10)*x[10]+m(7,11)*x[11]+m(7,12)*x[12]+m(7,13)*x[13]); x[7] /= m(7,7); + x[6] -= (m(6,7)*x[7]+m(6,8)*x[8]+m(6,9)*x[9]+m(6,10)*x[10]+m(6,11)*x[11]+m(6,12)*x[12]+m(6,13)*x[13]); x[6] /= m(6,6); + x[5] -= (m(5,6)*x[6]+m(5,7)*x[7]+m(5,8)*x[8]+m(5,9)*x[9]+m(5,10)*x[10]+m(5,11)*x[11]+m(5,12)*x[12]+m(5,13)*x[13]); x[5] /= m(5,5); + x[4] -= (m(4,5)*x[5]+m(4,6)*x[6]+m(4,7)*x[7]+m(4,8)*x[8]+m(4,9)*x[9]+m(4,10)*x[10]+m(4,11)*x[11]+m(4,12)*x[12]+m(4,13)*x[13]); x[4] /= m(4,4); + x[3] -= (m(3,4)*x[4]+m(3,5)*x[5]+m(3,6)*x[6]+m(3,7)*x[7]+m(3,8)*x[8]+m(3,9)*x[9]+m(3,10)*x[10]+m(3,11)*x[11]+m(3,12)*x[12]+m(3,13)*x[13]); x[3] /= m(3,3); + x[2] -= (m(2,3)*x[3]+m(2,4)*x[4]+m(2,5)*x[5]+m(2,6)*x[6]+m(2,7)*x[7]+m(2,8)*x[8]+m(2,9)*x[9]+m(2,10)*x[10]+m(2,11)*x[11]+m(2,12)*x[12]+m(2,13)*x[13]); x[2] /= m(2,2); + x[1] -= (m(1,2)*x[2]+m(1,3)*x[3]+m(1,4)*x[4]+m(1,5)*x[5]+m(1,6)*x[6]+m(1,7)*x[7]+m(1,8)*x[8]+m(1,9)*x[9]+m(1,10)*x[10]+m(1,11)*x[11]+m(1,12)*x[12]+m(1,13)*x[13]); x[1] /= m(1,1); + x[0] -= (m(0,1)*x[1]+m(0,2)*x[2]+m(0,3)*x[3]+m(0,4)*x[4]+m(0,5)*x[5]+m(0,6)*x[6]+m(0,7)*x[7]+m(0,8)*x[8]+m(0,9)*x[9]+m(0,10)*x[10]+m(0,11)*x[11]+m(0,12)*x[12]+m(0,13)*x[13]); x[0] /= m(0,0); +} diff --git a/minus/lsolve.hxx b/minus/lsolve.hxx index c01afdf..e69de29 100644 --- a/minus/lsolve.hxx +++ b/minus/lsolve.hxx @@ -1,58 +0,0 @@ -// Specific to Chicago -template -__attribute__((always_inline)) inline void -lsolve_14x14_chicago( - Map, minus_core::f::nve, minus_core::f::nve +1>,Aligned> & __restrict m, - C __restrict *ux) -{ - C * const x= reinterpret_cast *> (__builtin_assume_aligned(ux,64)); - //asm("#------ Lsolve begin"); // there is too many vmovsd moving data. It is sub-vectorized, using only xmm no y or zmm - typedef minus_core M; - static constexpr unsigned char rows = M::f::nve; - for(unsigned char k = 0; k < M::f::nve; ++k) { // unroll this loop make it - // specific to m structure of - // having many zeros in last 3 - // rowsj - const unsigned char rrows = rows-k-1; unsigned char row_of_biggest_in_col = k; - F biggest_in_corner = std::norm(m(k,k)); - for (unsigned j=rows-1; j != k; --j) { // todo: no need to go beyond row 10, Hxt rows 11 12 and 13 are fixed - F tmp; - if ((tmp = std::norm(m(j,k))) > biggest_in_corner*1e3) { - biggest_in_corner = tmp; row_of_biggest_in_col = j; - break; - } - } - if (k != row_of_biggest_in_col) m.row(k).swap(m.row(row_of_biggest_in_col)); - m.col(k).tail(rrows) /= m(k,k); - if (k < rows-1) - m.block(M::f::nve-rrows,M::f::nve-rrows,rrows,rrows).noalias() -= m.col(k).tail(rrows) * m.row(k).segment(M::f::nve-rrows,rrows); - } - x[0] = m(0,14); - x[1] = m(1,14)-m(1,0)*x[0]; - x[2] = m(2,14)-(m(2,0)*x[0]+m(2,1)*x[1]); - x[3] = m(3,14)-(m(3,0)*x[0]+m(3,1)*x[1]+m(3,2)*x[2]); - x[4] = m(4,14)-(m(4,0)*x[0]+m(4,1)*x[1]+m(4,2)*x[2]+m(4,3)*x[3]); - x[5] = m(5,14)-(m(5,0)*x[0]+m(5,1)*x[1]+m(5,2)*x[2]+m(5,3)*x[3]+m(5,4)*x[4]); - x[6] = m(6,14)-(m(6,0)*x[0]+m(6,1)*x[1]+m(6,2)*x[2]+m(6,3)*x[3]+m(6,4)*x[4]+m(6,5)*x[5]); - x[7] = m(7,14)-(m(7,0)*x[0]+m(7,1)*x[1]+m(7,2)*x[2]+m(7,3)*x[3]+m(7,4)*x[4]+m(7,5)*x[5]+m(7,6)*x[6]); - x[8] = m(8,14)-(m(8,0)*x[0]+m(8,1)*x[1]+m(8,2)*x[2]+m(8,3)*x[3]+m(8,4)*x[4]+m(8,5)*x[5]+m(8,6)*x[6]+m(8,7)*x[7]); - x[9] = m(9,14)-(m(9,0)*x[0]+m(9,1)*x[1]+m(9,2)*x[2]+m(9,3)*x[3]+m(9,4)*x[4]+m(9,5)*x[5]+m(9,6)*x[6]+m(9,7)*x[7]+m(9,8)*x[8]); - x[10] = m(10,14)-(m(10,0)*x[0]+m(10,1)*x[1]+m(10,2)*x[2]+m(10,3)*x[3]+m(10,4)*x[4]+m(10,5)*x[5]+m(10,6)*x[6]+m(10,7)*x[7]+m(10,8)*x[8]+m(10,9)*x[9]); - x[11] = m(11,14)-(m(11,0)*x[0]+m(11,1)*x[1]+m(11,2)*x[2]+m(11,3)*x[3]+m(11,4)*x[4]+m(11,5)*x[5]+m(11,6)*x[6]+m(11,7)*x[7]+m(11,8)*x[8]+m(11,9)*x[9]+m(11,10)*x[10]); - x[12] = m(12,14)-(m(12,0)*x[0]+m(12,1)*x[1]+m(12,2)*x[2]+m(12,3)*x[3]+m(12,4)*x[4]+m(12,5)*x[5]+m(12,6)*x[6]+m(12,7)*x[7]+m(12,8)*x[8]+m(12,9)*x[9]+m(12,10)*x[10]+m(12,11)*x[11]); - x[13] = m(13,14)-(m(13,0)*x[0]+m(13,1)*x[1]+m(13,2)*x[2]+m(13,3)*x[3]+m(13,4)*x[4]+m(13,5)*x[5]+m(13,6)*x[6]+m(13,7)*x[7]+m(13,8)*x[8]+m(13,9)*x[9]+m(13,10)*x[10]+m(13,11)*x[11]+m(13,12)*x[12]); - x[13] /= m(13,13); - x[12] -= m(12,13)*x[13]; x[12] /= m(12,12); - x[11] -= (m(11,12)*x[12]+m(11,13)*x[13]); x[11] /= m(11,11); - x[10] -= (m(10,11)*x[11]+m(10,12)*x[12]+m(10,13)*x[13]); x[10] /= m(10,10); - x[9] -= (m(9,10)*x[10]+m(9,11)*x[11]+m(9,12)*x[12]+m(9,13)*x[13]); x[9] /= m(9,9); - x[8] -= (m(8,9)*x[9]+m(8,10)*x[10]+m(8,11)*x[11]+m(8,12)*x[12]+m(8,13)*x[13]); x[8] /= m(8,8); - x[7] -= (m(7,8)*x[8]+m(7,9)*x[9]+m(7,10)*x[10]+m(7,11)*x[11]+m(7,12)*x[12]+m(7,13)*x[13]); x[7] /= m(7,7); - x[6] -= (m(6,7)*x[7]+m(6,8)*x[8]+m(6,9)*x[9]+m(6,10)*x[10]+m(6,11)*x[11]+m(6,12)*x[12]+m(6,13)*x[13]); x[6] /= m(6,6); - x[5] -= (m(5,6)*x[6]+m(5,7)*x[7]+m(5,8)*x[8]+m(5,9)*x[9]+m(5,10)*x[10]+m(5,11)*x[11]+m(5,12)*x[12]+m(5,13)*x[13]); x[5] /= m(5,5); - x[4] -= (m(4,5)*x[5]+m(4,6)*x[6]+m(4,7)*x[7]+m(4,8)*x[8]+m(4,9)*x[9]+m(4,10)*x[10]+m(4,11)*x[11]+m(4,12)*x[12]+m(4,13)*x[13]); x[4] /= m(4,4); - x[3] -= (m(3,4)*x[4]+m(3,5)*x[5]+m(3,6)*x[6]+m(3,7)*x[7]+m(3,8)*x[8]+m(3,9)*x[9]+m(3,10)*x[10]+m(3,11)*x[11]+m(3,12)*x[12]+m(3,13)*x[13]); x[3] /= m(3,3); - x[2] -= (m(2,3)*x[3]+m(2,4)*x[4]+m(2,5)*x[5]+m(2,6)*x[6]+m(2,7)*x[7]+m(2,8)*x[8]+m(2,9)*x[9]+m(2,10)*x[10]+m(2,11)*x[11]+m(2,12)*x[12]+m(2,13)*x[13]); x[2] /= m(2,2); - x[1] -= (m(1,2)*x[2]+m(1,3)*x[3]+m(1,4)*x[4]+m(1,5)*x[5]+m(1,6)*x[6]+m(1,7)*x[7]+m(1,8)*x[8]+m(1,9)*x[9]+m(1,10)*x[10]+m(1,11)*x[11]+m(1,12)*x[12]+m(1,13)*x[13]); x[1] /= m(1,1); - x[0] -= (m(0,1)*x[1]+m(0,2)*x[2]+m(0,3)*x[3]+m(0,4)*x[4]+m(0,5)*x[5]+m(0,6)*x[6]+m(0,7)*x[7]+m(0,8)*x[8]+m(0,9)*x[9]+m(0,10)*x[10]+m(0,11)*x[11]+m(0,12)*x[12]+m(0,13)*x[13]); x[0] /= m(0,0); -} diff --git a/minus/minus.hxx b/minus/minus.hxx index b8c7e8f..da0959c 100644 --- a/minus/minus.hxx +++ b/minus/minus.hxx @@ -19,7 +19,17 @@ namespace MiNuS { using namespace Eigen; // only used for linear solve -#include "lsolve.hxx" +#include "chicago14a-lsolve.hxx" + +//template +//__attribute__((always_inline)) inline void +//memoize(C __restrict *Hxxblock) +//{ +// C *const h = reinterpret_cast *> (__builtin_assume_aligned(Hxxblock,64)); + +//} + +// memoize_HxH make it pure function. // THE MEAT ////////////////////////////////////////////////////////////////////// // t: tracker settings @@ -52,6 +62,12 @@ track(const track_settings &s, const C s_sols_u[f::nve*f::nsols], const C static constexpr F the_smallest_number = 1e-13; // XXX BENCHMARK THIS typedef minus_array v; typedef minus_array vp; + Hxt[11]=Hxt[13]=Hxt[25]=Hxt[27]=Hxt[39]=Hxt[41]=Hxt[53]=Hxt[55]=Hxt[67]= + Hxt[68]=Hxt[81]=Hxt[82]=Hxt[95]=Hxt[96]=Hxt[109]=Hxt[110]=Hxt[124]= + Hxt[125]=Hxt[138]=Hxt[139]=Hxt[152]=Hxt[153]=Hxt[166]=Hxt[167]= + Hxt[180]=Hxt[181]=Hxt[194]=Hxt[195]=0; + + solution *t_s = raw_solutions + sol_min; // current target solution const C* __restrict s_s = s_sols + sol_min*f::nve; // current start solution for (unsigned sol_n = sol_min; sol_n < sol_max; ++sol_n) { // solution loop