From b5fcf8ef5c29df53cfff60e180b4992a3b2332a6 Mon Sep 17 00:00:00 2001 From: Hong Bo PENG Date: Sun, 16 Jun 2024 16:53:11 +0800 Subject: [PATCH] ggml : fix and optimize ppc64le (ggml/849) * fix compile issues introduced by loongarch_asx * restore quant changes to merge * fix compile issues introduced by loongarch_asx * further optimize by using vec_msum & vec_sum4s on ppc64le --- ggml-quants.c | 643 ++++++++++++++++++++++---------------------------- 1 file changed, 281 insertions(+), 362 deletions(-) diff --git a/ggml-quants.c b/ggml-quants.c index 678adae1684c8..0b346c11e6b2a 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -1076,6 +1076,7 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k) } vec_xst(vec_pack(vec_pack(vi[0], vi[1]), vec_pack(vi[2], vi[3])), 0, &y[i].qs[0]); vec_xst(vec_pack(vec_pack(vi[4], vi[5]), vec_pack(vi[6], vi[7])), 16, &y[i].qs[0]); + } #elif defined(__loongarch_asx) for (int i = 0; i < nb; i++) { @@ -1435,6 +1436,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k) accv = vec_add(accv, vec_sld(accv, accv, 4)); accv = vec_add(accv, vec_sld(accv, accv, 8)); y[i].s = GGML_FP32_TO_FP16(d * vec_extract(accv, 0)); + } #elif defined(__loongarch_asx) for (int i = 0; i < nb; i++) { @@ -4111,12 +4113,13 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r #elif defined(__POWER9_VECTOR__) const vector signed char lowMask = vec_splats((signed char)0xF); + const vector signed int v0 = vec_splats((int32_t)0); const vector unsigned char v4 = vec_splats((unsigned char)0x4); const vector signed char v8 = vec_splats((signed char)0x8); vector float vsumf0 = vec_splats(0.0f); -#pragma GCC unroll 4 +#pragma GCC unroll 8 for (int i = 0; i < nb; i++) { __builtin_prefetch(x[i].qs, 0, 1); __builtin_prefetch(y[i].qs, 0, 1); @@ -4138,9 +4141,10 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r vector signed short qv0 = vec_add(vec_mule(q4x0, q8y0), vec_mulo(q4x0, q8y0)); vector signed short qv1 = vec_add(vec_mule(q4x1, q8y1), vec_mulo(q4x1, q8y1)); - qv0 = vec_add(qv0, qv1); + vector signed int vsumi0 = v0; - vector signed int vsumi0 = vec_add(vec_unpackh(qv0), vec_unpackl(qv0)); + vsumi0 = vec_sum4s(qv0, vsumi0); + vsumi0 = vec_sum4s(qv1, vsumi0); vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); } @@ -4514,6 +4518,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r #elif defined(__POWER9_VECTOR__) const vector signed char lowMask = vec_splats((signed char)0xF); + const vector signed int v0 = vec_splats((int32_t)0); const vector unsigned char v4 = vec_splats((unsigned char)0x4); vector float vsumf0 = vec_splats(0.0f); @@ -4535,15 +4540,13 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r vector signed char q8y0 = vec_xl( 0, y[i].qs); vector signed char q8y1 = vec_xl(16, y[i].qs); - vector signed char q4x0 = vec_and(qxs, lowMask); - vector signed char q4x1 = vec_sr(qxs, v4); - - vector signed short qv0 = vec_add(vec_mule(q4x0, q8y0), vec_mulo(q4x0, q8y0)); - vector signed short qv1 = vec_add(vec_mule(q4x1, q8y1), vec_mulo(q4x1, q8y1)); + vector unsigned char q4x0 = (vector unsigned char)vec_and(qxs, lowMask); + vector unsigned char q4x1 = (vector unsigned char)vec_sr(qxs, v4); - qv0 = vec_add(qv0, qv1); + vector signed int vsumi0 = v0; - vector signed int vsumi0 = vec_add(vec_unpackh(qv0), vec_unpackl(qv0)); + vsumi0 = vec_msum(q8y0, q4x0, vsumi0); + vsumi0 = vec_msum(q8y1, q4x1, vsumi0); vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); } @@ -5245,6 +5248,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r #elif defined(__POWER9_VECTOR__) const vector signed char lowMask = vec_splats((signed char)0xF); + const vector signed int v0 = vec_splats((int32_t)0); const vector unsigned char v4 = vec_splats((unsigned char)0x4); vector float vsumf0 = vec_splats(0.0f); @@ -5270,18 +5274,16 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r vector signed char qxs = (vector signed char)vec_xl( 0, x[i].qs); - vector signed char q5x0 = vec_or(vec_and(qxs, lowMask), qh0); - vector signed char q5x1 = vec_or(vec_sr(qxs, v4), qh1); + vector unsigned char q5x0 = (vector unsigned char)vec_or(vec_and(qxs, lowMask), qh0); + vector unsigned char q5x1 = (vector unsigned char)vec_or(vec_sr(qxs, v4), qh1); vector signed char q8y0 = vec_xl( 0, y[i].qs); vector signed char q8y1 = vec_xl( 16, y[i].qs); - vector signed short qv0 = vec_add(vec_mule(q5x0, q8y0), vec_mulo(q5x0, q8y0)); - vector signed short qv1 = vec_add(vec_mule(q5x1, q8y1), vec_mulo(q5x1, q8y1)); - - qv0 = vec_add(qv0, qv1); + vector signed int vsumi0 = v0; - vector signed int vsumi0 = vec_add(vec_unpackh(qv0), vec_unpackl(qv0)); + vsumi0 = vec_msum(q8y0, q5x0, vsumi0); + vsumi0 = vec_msum(q8y1, q5x1, vsumi0); vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); } @@ -5521,9 +5523,10 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r *s = sumf; #elif defined(__POWER9_VECTOR__) + const vector signed int v0 = vec_splats((int32_t)0); vector float vsumf0 = vec_splats(0.0f); -#pragma GCC unroll 4 +#pragma GCC unroll 8 for (int i = 0; i < nb; i++) { __builtin_prefetch(x[i].qs, 0, 1); __builtin_prefetch(y[i].qs, 0, 1); @@ -5542,13 +5545,13 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r vector signed short qv2 = vec_mule(q8x1, q8y1); vector signed short qv3 = vec_mulo(q8x1, q8y1); - vector signed int vsumi0 = vec_add(vec_unpackh(qv0), vec_unpackh(qv1)); - vector signed int vsumi1 = vec_add(vec_unpackl(qv0), vec_unpackl(qv1)); - vector signed int vsumi2 = vec_add(vec_unpackh(qv2), vec_unpackh(qv3)); - vector signed int vsumi3 = vec_add(vec_unpackl(qv2), vec_unpackl(qv3)); + vector signed int vsumi0 = v0; + vector signed int vsumi1 = v0; - vsumi0 = vec_add(vsumi0, vsumi2); - vsumi1 = vec_add(vsumi1, vsumi3); + vsumi0 = vec_sum4s(qv0, vsumi0); + vsumi1 = vec_sum4s(qv1, vsumi1); + vsumi0 = vec_sum4s(qv2, vsumi0); + vsumi1 = vec_sum4s(qv3, vsumi1); vsumi0 = vec_add(vsumi0, vsumi1); @@ -5936,6 +5939,7 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r #elif defined(__POWER9_VECTOR__) const vector signed char lowMask = vec_splats((signed char)0x3); const vector signed char lowScaleMask = vec_splats((signed char)0xF); + const vector int v0 = vec_splats((int32_t)0); const vector unsigned char v2 = vec_splats((unsigned char)0x2); const vector unsigned char v6 = vec_splats((unsigned char)0x6); const vector unsigned char v4 = vec_splats((unsigned char)0x4); @@ -5973,15 +5977,17 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r vsumf2 = vec_nmsub(vec_ctf(prod2, 0), vdmin, vsumf2); vsumf3 = vec_nmsub(vec_ctf(prod3, 0), vdmin, vsumf3); - vector signed int vsumi0 = vec_splats((int32_t)0); - vector signed int vsumi1 = vec_splats((int32_t)0); - vector signed int vsumi2 = vec_splats((int32_t)0); - vector signed int vsumi3 = vec_splats((int32_t)0); - vector signed int vsumi4 = vec_splats((int32_t)0); - vector signed int vsumi5 = vec_splats((int32_t)0); - vector signed int vsumi6 = vec_splats((int32_t)0); - vector signed int vsumi7 = vec_splats((int32_t)0); + vector signed int vsumi0 = v0; + vector signed int vsumi1 = v0; + vector signed int vsumi2 = v0; + vector signed int vsumi3 = v0; + vector signed int vsumi4 = v0; + vector signed int vsumi5 = v0; + vector signed int vsumi6 = v0; + vector signed int vsumi7 = v0; + const uint8_t * restrict q2 = x[i].qs; + const int8_t * restrict q8 = y[i].qs; for (int j = 0; j < QK_K/128; ++j) { __builtin_prefetch(q2, 0, 1); @@ -5991,14 +5997,14 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r vector signed char qxs1 = (vector signed char)vec_xl(16, q2); q2 += 32; - vector signed char q2x00 = vec_and(qxs0, lowMask); - vector signed char q2x01 = vec_and(vec_sr(qxs0, v2), lowMask); - vector signed char q2x02 = vec_and(vec_sr(qxs0, v4), lowMask); - vector signed char q2x03 = vec_and(vec_sr(qxs0, v6), lowMask); - vector signed char q2x10 = vec_and(qxs1, lowMask); - vector signed char q2x11 = vec_and(vec_sr(qxs1, v2), lowMask); - vector signed char q2x12 = vec_and(vec_sr(qxs1, v4), lowMask); - vector signed char q2x13 = vec_and(vec_sr(qxs1, v6), lowMask); + vector unsigned char q2x00 = (vector unsigned char)vec_and(qxs0, lowMask); + vector unsigned char q2x01 = (vector unsigned char)vec_and(vec_sr(qxs0, v2), lowMask); + vector unsigned char q2x02 = (vector unsigned char)vec_and(vec_sr(qxs0, v4), lowMask); + vector unsigned char q2x03 = (vector unsigned char)vec_and(vec_sr(qxs0, v6), lowMask); + vector unsigned char q2x10 = (vector unsigned char)vec_and(qxs1, lowMask); + vector unsigned char q2x11 = (vector unsigned char)vec_and(vec_sr(qxs1, v2), lowMask); + vector unsigned char q2x12 = (vector unsigned char)vec_and(vec_sr(qxs1, v4), lowMask); + vector unsigned char q2x13 = (vector unsigned char)vec_and(vec_sr(qxs1, v6), lowMask); vector signed char q8y00 = vec_xl( 0, q8); vector signed char q8y10 = vec_xl( 16, q8); @@ -6010,45 +6016,36 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r vector signed char q8y13 = vec_xl(112, q8); q8 += 128; - vector signed short qv0 = vec_add(vec_mule(q2x00, q8y00), vec_mulo(q2x00, q8y00)); - vector signed short qv1 = vec_add(vec_mule(q2x01, q8y01), vec_mulo(q2x01, q8y01)); - vector signed short qv2 = vec_add(vec_mule(q2x02, q8y02), vec_mulo(q2x02, q8y02)); - vector signed short qv3 = vec_add(vec_mule(q2x03, q8y03), vec_mulo(q2x03, q8y03)); - vector signed short qv4 = vec_add(vec_mule(q2x10, q8y10), vec_mulo(q2x10, q8y10)); - vector signed short qv5 = vec_add(vec_mule(q2x11, q8y11), vec_mulo(q2x11, q8y11)); - vector signed short qv6 = vec_add(vec_mule(q2x12, q8y12), vec_mulo(q2x12, q8y12)); - vector signed short qv7 = vec_add(vec_mule(q2x13, q8y13), vec_mulo(q2x13, q8y13)); - - vector signed short vscales_h = vec_unpackh(vscales); - vector signed short vs0 = vec_splat(vscales_h, 0); - vector signed short vs1 = vec_splat(vscales_h, 1); - vector signed short vs2 = vec_splat(vscales_h, 2); - vector signed short vs3 = vec_splat(vscales_h, 3); - vector signed short vs4 = vec_splat(vscales_h, 4); - vector signed short vs5 = vec_splat(vscales_h, 5); - vector signed short vs6 = vec_splat(vscales_h, 6); - vector signed short vs7 = vec_splat(vscales_h, 7); + vector signed int qv0 = vec_msum(q8y00, q2x00, v0); + vector signed int qv1 = vec_msum(q8y01, q2x01, v0); + vector signed int qv2 = vec_msum(q8y02, q2x02, v0); + vector signed int qv3 = vec_msum(q8y03, q2x03, v0); + vector signed int qv4 = vec_msum(q8y10, q2x10, v0); + vector signed int qv5 = vec_msum(q8y11, q2x11, v0); + vector signed int qv6 = vec_msum(q8y12, q2x12, v0); + vector signed int qv7 = vec_msum(q8y13, q2x13, v0); + + vector signed short vscales_07 = vec_unpackh(vscales); + vector signed int vscales_03 = vec_unpackh(vscales_07); + vector signed int vscales_47 = vec_unpackl(vscales_07); + vector signed int vs0 = vec_splat(vscales_03, 0); + vector signed int vs1 = vec_splat(vscales_03, 1); + vector signed int vs2 = vec_splat(vscales_03, 2); + vector signed int vs3 = vec_splat(vscales_03, 3); + vector signed int vs4 = vec_splat(vscales_47, 0); + vector signed int vs5 = vec_splat(vscales_47, 1); + vector signed int vs6 = vec_splat(vscales_47, 2); + vector signed int vs7 = vec_splat(vscales_47, 3); vscales = vec_sld(vscales, vscales, 8); - qv0 = vec_mul(qv0, vs0); - qv1 = vec_mul(qv1, vs2); - qv2 = vec_mul(qv2, vs4); - qv3 = vec_mul(qv3, vs6); - - qv0 = vec_madd(qv4, vs1, qv0); - qv1 = vec_madd(qv5, vs3, qv1); - qv2 = vec_madd(qv6, vs5, qv2); - qv3 = vec_madd(qv7, vs7, qv3); - - vsumi0 = vec_add(vec_unpackh(qv0), vsumi0); - vsumi1 = vec_add(vec_unpackh(qv1), vsumi1); - vsumi2 = vec_add(vec_unpackh(qv2), vsumi2); - vsumi3 = vec_add(vec_unpackh(qv3), vsumi3); - - vsumi4 = vec_add(vec_unpackl(qv0), vsumi4); - vsumi5 = vec_add(vec_unpackl(qv1), vsumi5); - vsumi6 = vec_add(vec_unpackl(qv2), vsumi6); - vsumi7 = vec_add(vec_unpackl(qv3), vsumi7); + vsumi0 = vec_add(vec_mul(qv0, vs0), vsumi0); + vsumi1 = vec_add(vec_mul(qv1, vs2), vsumi1); + vsumi2 = vec_add(vec_mul(qv2, vs4), vsumi2); + vsumi3 = vec_add(vec_mul(qv3, vs6), vsumi3); + vsumi4 = vec_add(vec_mul(qv4, vs1), vsumi4); + vsumi5 = vec_add(vec_mul(qv5, vs3), vsumi5); + vsumi6 = vec_add(vec_mul(qv6, vs5), vsumi6); + vsumi7 = vec_add(vec_mul(qv7, vs7), vsumi7); } vsumi0 = vec_add(vsumi0, vsumi4); @@ -6639,6 +6636,9 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r #elif defined(__POWER9_VECTOR__) const vector signed char lowMask = vec_splats((signed char)0x3); + const vector signed char lowMask1 = vec_splats((int8_t)0xf); + const vector signed char lowMask2 = vec_splats((int8_t)0x30); + const vector int v0 = vec_splats((int32_t)0); const vector signed char v1 = vec_splats((signed char)0x1); const vector unsigned char v2 = vec_splats((unsigned char)0x2); const vector unsigned char v3 = vec_splats((unsigned char)0x3); @@ -6656,30 +6656,33 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r vector float vyd = vec_splats(y[i].d); vector float vd = vec_mul(vxd, vyd); - uint32_t aux[3]; - uint32_t utmp[4]; + UNUSED(kmask1); + UNUSED(kmask2); - memcpy(aux, x[i].scales, 12); - utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4); - utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4); - utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4); - utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4); + vector signed char u0 = (vector signed char)vec_xl_len(x[i].scales, 8); + vector signed char u1 = vec_and(u0, lowMask1); + vector signed char u2 = (vector signed char)vec_xl_len(x[i].scales + 8, 4); + vector signed char u3 = (vector signed char)vec_mergeh((vector signed int)u2, (vector signed int)vec_sr(u2, v2)); + vector signed char u30 = vec_sl(vec_and(u3, lowMask), v4); + vector signed char u31 = vec_and(u3, lowMask2); - vector signed char vscales = (vector signed char)vec_xl( 0, utmp); + u1 = vec_or(u1, u30); + u2 = vec_or(vec_sr(u0, v4), u31); + + vector signed char vscales = (vector signed char)vec_mergeh((vector signed long long)u1, (vector signed long long)u2); vector signed char qxhs0 = (vector signed char)vec_xl( 0, x[i].hmask); vector signed char qxhs1 = (vector signed char)vec_xl(16, x[i].hmask); vscales = vec_sub(vscales, off); - vector signed int vsumi0 = vec_splats((int32_t)0); - vector signed int vsumi1 = vec_splats((int32_t)0); - vector signed int vsumi2 = vec_splats((int32_t)0); - vector signed int vsumi3 = vec_splats((int32_t)0); - vector signed int vsumi4 = vec_splats((int32_t)0); - vector signed int vsumi5 = vec_splats((int32_t)0); - vector signed int vsumi6 = vec_splats((int32_t)0); - vector signed int vsumi7 = vec_splats((int32_t)0); - + vector signed int vsumi0 = v0; + vector signed int vsumi1 = v0; + vector signed int vsumi2 = v0; + vector signed int vsumi3 = v0; + vector signed int vsumi4 = v0; + vector signed int vsumi5 = v0; + vector signed int vsumi6 = v0; + vector signed int vsumi7 = v0; const uint8_t * restrict q3 = x[i].qs; const int8_t * restrict q8 = y[i].qs; @@ -6753,23 +6756,14 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r vector signed short qv12 = vec_add(vec_mule(q3x12, q8y12), vec_mulo(q3x12, q8y12)); vector signed short qv13 = vec_add(vec_mule(q3x13, q8y13), vec_mulo(q3x13, q8y13)); - vector signed int vsum0 = vec_add(vec_mule(qv00, vs0), vec_mulo(qv00, vs0)); - vector signed int vsum1 = vec_add(vec_mule(qv01, vs2), vec_mulo(qv01, vs2)); - vector signed int vsum2 = vec_add(vec_mule(qv02, vs4), vec_mulo(qv02, vs4)); - vector signed int vsum3 = vec_add(vec_mule(qv03, vs6), vec_mulo(qv03, vs6)); - vector signed int vsum4 = vec_add(vec_mule(qv10, vs1), vec_mulo(qv10, vs1)); - vector signed int vsum5 = vec_add(vec_mule(qv11, vs3), vec_mulo(qv11, vs3)); - vector signed int vsum6 = vec_add(vec_mule(qv12, vs5), vec_mulo(qv12, vs5)); - vector signed int vsum7 = vec_add(vec_mule(qv13, vs7), vec_mulo(qv13, vs7)); - - vsumi0 = vec_add(vsum0, vsumi0); - vsumi1 = vec_add(vsum1, vsumi1); - vsumi2 = vec_add(vsum2, vsumi2); - vsumi3 = vec_add(vsum3, vsumi3); - vsumi4 = vec_add(vsum4, vsumi4); - vsumi5 = vec_add(vsum5, vsumi5); - vsumi6 = vec_add(vsum6, vsumi6); - vsumi7 = vec_add(vsum7, vsumi7); + vsumi0 = vec_msum(qv00, vs0, vsumi0); + vsumi1 = vec_msum(qv01, vs2, vsumi1); + vsumi2 = vec_msum(qv02, vs4, vsumi2); + vsumi3 = vec_msum(qv03, vs6, vsumi3); + vsumi4 = vec_msum(qv10, vs1, vsumi4); + vsumi5 = vec_msum(qv11, vs3, vsumi5); + vsumi6 = vec_msum(qv12, vs5, vsumi6); + vsumi7 = vec_msum(qv13, vs7, vsumi7); } vsumi0 = vec_add(vsumi0, vsumi4); @@ -7268,6 +7262,10 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r #elif defined(__POWER9_VECTOR__) const vector signed char lowMask = vec_splats((signed char)0xF); + const vector signed char lowMask1 = vec_splats((int8_t)0x3f); + const vector signed char lowMask2 = vec_splats((int8_t)0x30); + const vector int v0 = vec_splats((int32_t)0); + const vector unsigned char v2 = vec_splats((uint8_t)2); const vector unsigned char v4 = vec_splats((unsigned char)0x4); vector float vsumf0 = vec_splats(0.0f); @@ -7286,15 +7284,24 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r vector signed short q8ysums0 = vec_xl( 0, y[i].bsums); vector signed short q8ysums1 = vec_xl(16, y[i].bsums); - memcpy(utmp, x[i].scales, 12); + UNUSED(kmask1); + UNUSED(kmask2); + UNUSED(kmask3); + UNUSED(utmp); - utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); - const uint32_t uaux = utmp[1] & kmask1; - utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); - utmp[2] = uaux; - utmp[0] &= kmask1; + vector signed char u0 = (vector signed char)vec_xl_len(x[i].scales, 8); + vector signed char u1 = vec_and(vec_sr(u0, v2), lowMask2); + vector signed char u2 = (vector signed char)vec_xl_len(x[i].scales + 8, 4); + vector signed char u3 = vec_sr(u2, v4); + + vector signed char u30 = u1; + vector signed char u31 = (vector signed char)vec_mergeh((vector signed int)vec_and(u2, lowMask), (vector signed int)u3); + + u1 = vec_and(u0, lowMask1); + u2 = vec_or(u30, u31); + + vector signed char utmps = (vector signed char)vec_mergeh((vector signed int)u1, (vector signed int)u2); - vector signed char utmps = (vector signed char)vec_xl( 0, utmp); vector signed short vscales = vec_unpackh(utmps); vector signed short q4xmins = vec_unpackl(utmps); vector signed short q4xmins0 = vec_mergeh(q4xmins, q4xmins); @@ -7310,14 +7317,10 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r vsumf2 = vec_nmsub(vec_ctf(prod2, 0), vdmin, vsumf2); vsumf3 = vec_nmsub(vec_ctf(prod3, 0), vdmin, vsumf3); - vector signed int vsumi0 = vec_splats((int32_t)0); - vector signed int vsumi1 = vec_splats((int32_t)0); - vector signed int vsumi2 = vec_splats((int32_t)0); - vector signed int vsumi3 = vec_splats((int32_t)0); - vector signed int vsumi4 = vec_splats((int32_t)0); - vector signed int vsumi5 = vec_splats((int32_t)0); - vector signed int vsumi6 = vec_splats((int32_t)0); - vector signed int vsumi7 = vec_splats((int32_t)0); + vector signed int vsumi0 = v0; + vector signed int vsumi1 = v0; + vector signed int vsumi2 = v0; + vector signed int vsumi3 = v0; const uint8_t * restrict q4 = x[i].qs; const int8_t * restrict q8 = y[i].qs; @@ -7332,14 +7335,14 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r vector signed char qxs3 = (vector signed char)vec_xl(48, q4); q4 += 64; - vector signed char q4x00 = vec_and(qxs0, lowMask); - vector signed char q4x01 = vec_sr(qxs0, v4); - vector signed char q4x10 = vec_and(qxs1, lowMask); - vector signed char q4x11 = vec_sr(qxs1, v4); - vector signed char q4x20 = vec_and(qxs2, lowMask); - vector signed char q4x21 = vec_sr(qxs2, v4); - vector signed char q4x30 = vec_and(qxs3, lowMask); - vector signed char q4x31 = vec_sr(qxs3, v4); + vector unsigned char q4x00 = (vector unsigned char)vec_and(qxs0, lowMask); + vector unsigned char q4x01 = (vector unsigned char)vec_sr(qxs0, v4); + vector unsigned char q4x10 = (vector unsigned char)vec_and(qxs1, lowMask); + vector unsigned char q4x11 = (vector unsigned char)vec_sr(qxs1, v4); + vector unsigned char q4x20 = (vector unsigned char)vec_and(qxs2, lowMask); + vector unsigned char q4x21 = (vector unsigned char)vec_sr(qxs2, v4); + vector unsigned char q4x30 = (vector unsigned char)vec_and(qxs3, lowMask); + vector unsigned char q4x31 = (vector unsigned char)vec_sr(qxs3, v4); vector signed char q8y00 = vec_xl( 0, q8); vector signed char q8y10 = vec_xl( 16, q8); @@ -7351,41 +7354,33 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r vector signed char q8y31 = vec_xl(112, q8); q8 += 128; - vector signed short qv00 = vec_add(vec_mule(q4x00, q8y00), vec_mulo(q4x00, q8y00)); - vector signed short qv01 = vec_add(vec_mule(q4x01, q8y01), vec_mulo(q4x01, q8y01)); - vector signed short qv10 = vec_add(vec_mule(q4x10, q8y10), vec_mulo(q4x10, q8y10)); - vector signed short qv11 = vec_add(vec_mule(q4x11, q8y11), vec_mulo(q4x11, q8y11)); - vector signed short qv20 = vec_add(vec_mule(q4x20, q8y20), vec_mulo(q4x20, q8y20)); - vector signed short qv21 = vec_add(vec_mule(q4x21, q8y21), vec_mulo(q4x21, q8y21)); - vector signed short qv30 = vec_add(vec_mule(q4x30, q8y30), vec_mulo(q4x30, q8y30)); - vector signed short qv31 = vec_add(vec_mule(q4x31, q8y31), vec_mulo(q4x31, q8y31)); - - vector signed short vs0 = vec_splat(vscales, 0); - vector signed short vs1 = vec_splat(vscales, 1); - vector signed short vs2 = vec_splat(vscales, 2); - vector signed short vs3 = vec_splat(vscales, 3); + vector signed int qv00 = vec_msum(q8y00, q4x00, v0); + vector signed int qv01 = vec_msum(q8y01, q4x01, v0); + vector signed int qv10 = vec_msum(q8y10, q4x10, v0); + vector signed int qv11 = vec_msum(q8y11, q4x11, v0); + vector signed int qv20 = vec_msum(q8y20, q4x20, v0); + vector signed int qv21 = vec_msum(q8y21, q4x21, v0); + vector signed int qv30 = vec_msum(q8y30, q4x30, v0); + vector signed int qv31 = vec_msum(q8y31, q4x31, v0); + + vector signed int vscales_h = vec_unpackh(vscales); + vector signed int vs0 = vec_splat(vscales_h, 0); + vector signed int vs1 = vec_splat(vscales_h, 1); + vector signed int vs2 = vec_splat(vscales_h, 2); + vector signed int vs3 = vec_splat(vscales_h, 3); vscales = vec_sld(vscales, vscales, 8); - qv00 = vec_add(qv00, qv10); - qv10 = vec_add(qv01, qv11); - qv20 = vec_add(qv20, qv30); - qv30 = vec_add(qv21, qv31); + vsumi0 = vec_add(vec_mul(qv00, vs0), vsumi0); + vsumi1 = vec_add(vec_mul(qv01, vs1), vsumi1); + vsumi2 = vec_add(vec_mul(qv20, vs2), vsumi2); + vsumi3 = vec_add(vec_mul(qv21, vs3), vsumi3); - vsumi0 = vec_add(vec_mule(qv00, vs0), vsumi0); - vsumi1 = vec_add(vec_mulo(qv00, vs0), vsumi1); - vsumi2 = vec_add(vec_mule(qv10, vs1), vsumi2); - vsumi3 = vec_add(vec_mulo(qv10, vs1), vsumi3); - vsumi4 = vec_add(vec_mule(qv20, vs2), vsumi4); - vsumi5 = vec_add(vec_mulo(qv20, vs2), vsumi5); - vsumi6 = vec_add(vec_mule(qv30, vs3), vsumi6); - vsumi7 = vec_add(vec_mulo(qv30, vs3), vsumi7); + vsumi0 = vec_add(vec_mul(qv10, vs0), vsumi0); + vsumi1 = vec_add(vec_mul(qv11, vs1), vsumi1); + vsumi2 = vec_add(vec_mul(qv30, vs2), vsumi2); + vsumi3 = vec_add(vec_mul(qv31, vs3), vsumi3); } - vsumi0 = vec_add(vsumi0, vsumi4); - vsumi1 = vec_add(vsumi1, vsumi5); - vsumi2 = vec_add(vsumi2, vsumi6); - vsumi3 = vec_add(vsumi3, vsumi7); - vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1); vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2); @@ -7887,6 +7882,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r #elif defined(__POWER9_VECTOR__) const vector signed char lowMask = vec_splats((signed char)0xF); + const vector signed char lowMask1 = vec_splats((int8_t)0x3f); + const vector signed char lowMask2 = vec_splats((int8_t)0x30); + const vector int v0 = vec_splats((int32_t)0); const vector unsigned char v1 = vec_splats((unsigned char)0x1); const vector unsigned char v2 = vec_splats((unsigned char)0x2); const vector unsigned char v3 = vec_splats((unsigned char)0x3); @@ -7905,18 +7903,27 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[i].dmin)); vector float vdmin = vec_mul(vxmin, vyd); - memcpy(utmp, x[i].scales, 12); + UNUSED(kmask1); + UNUSED(kmask2); + UNUSED(kmask3); + UNUSED(utmp); - utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); - const uint32_t uaux = utmp[1] & kmask1; - utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); - utmp[2] = uaux; - utmp[0] &= kmask1; + vector signed char u0 = (vector signed char)vec_xl_len(x[i].scales, 8); + vector signed char u1 = vec_and(vec_sr(u0, v2), lowMask2); + vector signed char u2 = (vector signed char)vec_xl_len(x[i].scales + 8, 4); + vector signed char u3 = vec_sr(u2, v4); + + vector signed char u30 = u1; + vector signed char u31 = (vector signed char)vec_mergeh((vector signed int)vec_and(u2, lowMask), (vector signed int)u3); + + u1 = vec_and(u0, lowMask1); + u2 = vec_or(u30, u31); + + vector signed char utmps = (vector signed char)vec_mergeh((vector signed int)u1, (vector signed int)u2); vector signed short q8ysums0 = vec_xl( 0, y[i].bsums); vector signed short q8ysums1 = vec_xl(16, y[i].bsums); - vector signed char utmps = (vector signed char)vec_xl( 0, utmp); vector signed short vscales = vec_unpackh(utmps); vector signed short q5xmins = vec_unpackl(utmps); @@ -7936,10 +7943,10 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r vector signed char qxhs0 = (vector signed char)vec_xl( 0, x[i].qh); vector signed char qxhs1 = (vector signed char)vec_xl(16, x[i].qh); - vector signed int vsumi0 = vec_splats((int32_t)0); - vector signed int vsumi1 = vec_splats((int32_t)0); - vector signed int vsumi2 = vec_splats((int32_t)0); - vector signed int vsumi3 = vec_splats((int32_t)0); + vector signed int vsumi0 = v0; + vector signed int vsumi1 = v0; + vector signed int vsumi2 = v0; + vector signed int vsumi3 = v0; const uint8_t * restrict q5 = x[i].qs; const int8_t * restrict q8 = y[i].qs; @@ -7964,10 +7971,10 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r qxhs0 = vec_sr(qxhs0, v2); qxhs1 = vec_sr(qxhs1, v2); - vector signed char q5x00 = vec_or(q5h00, qxs00); - vector signed char q5x01 = vec_or(q5h01, qxs01); - vector signed char q5x10 = vec_or(q5h10, qxs10); - vector signed char q5x11 = vec_or(q5h11, qxs11); + vector unsigned char q5x00 = (vector unsigned char)vec_or(q5h00, qxs00); + vector unsigned char q5x01 = (vector unsigned char)vec_or(q5h01, qxs01); + vector unsigned char q5x10 = (vector unsigned char)vec_or(q5h10, qxs10); + vector unsigned char q5x11 = (vector unsigned char)vec_or(q5h11, qxs11); vector signed char q8y00 = vec_xl( 0, q8); vector signed char q8y10 = vec_xl(16, q8); @@ -7975,22 +7982,20 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r vector signed char q8y11 = vec_xl(48, q8); q8 += 64; - vector signed short qv00 = vec_add(vec_mule(q5x00, q8y00), vec_mulo(q5x00, q8y00)); - vector signed short qv01 = vec_add(vec_mule(q5x01, q8y01), vec_mulo(q5x01, q8y01)); - vector signed short qv10 = vec_add(vec_mule(q5x10, q8y10), vec_mulo(q5x10, q8y10)); - vector signed short qv11 = vec_add(vec_mule(q5x11, q8y11), vec_mulo(q5x11, q8y11)); + vector signed int qv00 = vec_msum(q8y00, q5x00, v0); + vector signed int qv01 = vec_msum(q8y01, q5x01, v0); + vector signed int qv10 = vec_msum(q8y10, q5x10, v0); + vector signed int qv11 = vec_msum(q8y11, q5x11, v0); - vector signed short vs0 = vec_splat(vscales, 0); - vector signed short vs1 = vec_splat(vscales, 1); + vector signed int vscales_h = vec_unpackh(vscales); + vector signed int vs0 = vec_splat(vscales_h, 0); + vector signed int vs1 = vec_splat(vscales_h, 1); vscales = vec_sld(vscales, vscales, 12); - qv00 = vec_add(qv00, qv10); - qv01 = vec_add(qv01, qv11); - - vsumi0 = vec_add(vec_mule(qv00, vs0), vsumi0); - vsumi1 = vec_add(vec_mulo(qv00, vs0), vsumi1); - vsumi2 = vec_add(vec_mule(qv01, vs1), vsumi2); - vsumi3 = vec_add(vec_mulo(qv01, vs1), vsumi3); + vsumi0 = vec_add(vec_mul(qv00, vs0), vsumi0); + vsumi1 = vec_add(vec_mul(qv10, vs0), vsumi1); + vsumi2 = vec_add(vec_mul(qv01, vs1), vsumi2); + vsumi3 = vec_add(vec_mul(qv11, vs1), vsumi3); } vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); @@ -8551,6 +8556,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r #elif defined(__POWER9_VECTOR__) const vector signed char lowMask = vec_splats((signed char)0xF); + const vector int v0 = vec_splats((int32_t)0); const vector unsigned char v2 = vec_splats((unsigned char)0x2); const vector unsigned char v3 = vec_splats((unsigned char)0x3); const vector unsigned char v4 = vec_splats((unsigned char)0x4); @@ -8567,14 +8573,14 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r vector float vyd = vec_splats(y[i].d); vector float vd = vec_mul(vxd, vyd); - vector signed int vsumi0 = vec_splats((int32_t)0); - vector signed int vsumi1 = vec_splats((int32_t)0); - vector signed int vsumi2 = vec_splats((int32_t)0); - vector signed int vsumi3 = vec_splats((int32_t)0); - vector signed int vsumi4 = vec_splats((int32_t)0); - vector signed int vsumi5 = vec_splats((int32_t)0); - vector signed int vsumi6 = vec_splats((int32_t)0); - vector signed int vsumi7 = vec_splats((int32_t)0); + vector signed int vsumi0 = v0; + vector signed int vsumi1 = v0; + vector signed int vsumi2 = v0; + vector signed int vsumi3 = v0; + vector signed int vsumi4 = v0; + vector signed int vsumi5 = v0; + vector signed int vsumi6 = v0; + vector signed int vsumi7 = v0; const uint8_t * restrict q6 = x[i].ql; const uint8_t * restrict qh = x[i].qh; @@ -8654,23 +8660,14 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r vector signed short vs6 = vec_splat(vscales, 6); vector signed short vs7 = vec_splat(vscales, 7); - vsumi0 = vec_add(vec_mule(qv00, vs0), vsumi0); - vsumi1 = vec_add(vec_mulo(qv00, vs0), vsumi1); - vsumi2 = vec_add(vec_mule(qv01, vs4), vsumi2); - vsumi3 = vec_add(vec_mulo(qv01, vs4), vsumi3); - vsumi4 = vec_add(vec_mule(qv10, vs1), vsumi4); - vsumi5 = vec_add(vec_mulo(qv10, vs1), vsumi5); - vsumi6 = vec_add(vec_mule(qv11, vs5), vsumi6); - vsumi7 = vec_add(vec_mulo(qv11, vs5), vsumi7); - - vsumi0 = vec_add(vec_mule(qv20, vs2), vsumi0); - vsumi1 = vec_add(vec_mulo(qv20, vs2), vsumi1); - vsumi2 = vec_add(vec_mule(qv21, vs6), vsumi2); - vsumi3 = vec_add(vec_mulo(qv21, vs6), vsumi3); - vsumi4 = vec_add(vec_mule(qv30, vs3), vsumi4); - vsumi5 = vec_add(vec_mulo(qv30, vs3), vsumi5); - vsumi6 = vec_add(vec_mule(qv31, vs7), vsumi6); - vsumi7 = vec_add(vec_mulo(qv31, vs7), vsumi7); + vsumi0 = vec_msum(qv00, vs0, vsumi0); + vsumi1 = vec_msum(qv01, vs4, vsumi1); + vsumi2 = vec_msum(qv10, vs1, vsumi2); + vsumi3 = vec_msum(qv11, vs5, vsumi3); + vsumi4 = vec_msum(qv20, vs2, vsumi4); + vsumi5 = vec_msum(qv21, vs6, vsumi5); + vsumi6 = vec_msum(qv30, vs3, vsumi6); + vsumi7 = vec_msum(qv31, vs7, vsumi7); } vsumi0 = vec_add(vsumi0, vsumi4); @@ -8951,6 +8948,7 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void *s = 0.125f * hsum_float_8(accumf); #elif defined(__POWER9_VECTOR__) + const vector int v0 = vec_splats((int32_t)0); vector float vsumf0 = vec_splats(0.0f); vector float vsumf1 = vec_splats(0.0f); vector float vsumf2 = vec_splats(0.0f); @@ -8963,14 +8961,10 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void vector float vyd = vec_splats(y[i].d); vector float vd = vec_mul(vxd, vyd); - vector signed int vsumi0 = vec_splats((int32_t)0); - vector signed int vsumi1 = vec_splats((int32_t)0); - vector signed int vsumi2 = vec_splats((int32_t)0); - vector signed int vsumi3 = vec_splats((int32_t)0); - vector signed int vsumi4 = vec_splats((int32_t)0); - vector signed int vsumi5 = vec_splats((int32_t)0); - vector signed int vsumi6 = vec_splats((int32_t)0); - vector signed int vsumi7 = vec_splats((int32_t)0); + vector signed int vsumi0 = v0; + vector signed int vsumi1 = v0; + vector signed int vsumi2 = v0; + vector signed int vsumi3 = v0; const uint16_t * restrict q2 = x[i].qs; const int8_t * restrict q8 = y[i].qs; @@ -9017,21 +9011,12 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void vector signed short vscales01 = vec_splats((int16_t)(2*ls0+1)); vector signed short vscales23 = vec_splats((int16_t)(2*ls1+1)); - vsumi0 = vec_add(vec_mule(qv0, vscales01), vsumi0); - vsumi1 = vec_add(vec_mule(qv1, vscales01), vsumi1); - vsumi2 = vec_add(vec_mule(qv2, vscales23), vsumi2); - vsumi3 = vec_add(vec_mule(qv3, vscales23), vsumi3); - vsumi4 = vec_add(vec_mulo(qv0, vscales01), vsumi4); - vsumi5 = vec_add(vec_mulo(qv1, vscales01), vsumi5); - vsumi6 = vec_add(vec_mulo(qv2, vscales23), vsumi6); - vsumi7 = vec_add(vec_mulo(qv3, vscales23), vsumi7); + vsumi0 = vec_msum(qv0, vscales01, vsumi0); + vsumi1 = vec_msum(qv1, vscales01, vsumi1); + vsumi2 = vec_msum(qv2, vscales23, vsumi2); + vsumi3 = vec_msum(qv3, vscales23, vsumi3); } - vsumi0 = vec_add(vsumi0, vsumi4); - vsumi1 = vec_add(vsumi1, vsumi5); - vsumi2 = vec_add(vsumi2, vsumi6); - vsumi3 = vec_add(vsumi3, vsumi7); - vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1); vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2); @@ -9423,6 +9408,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void * *s = 0.125f * hsum_float_8(accumf); #elif defined(__POWER9_VECTOR__) + const vector int v0 = vec_splats((int32_t)0); vector float vsumf0 = vec_splats(0.0f); vector float vsumf1 = vec_splats(0.0f); vector float vsumf2 = vec_splats(0.0f); @@ -9435,14 +9421,10 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void * vector float vyd = vec_splats(y[i].d); vector float vd = vec_mul(vxd, vyd); - vector signed int vsumi0 = vec_splats((int32_t)0); - vector signed int vsumi1 = vec_splats((int32_t)0); - vector signed int vsumi2 = vec_splats((int32_t)0); - vector signed int vsumi3 = vec_splats((int32_t)0); - vector signed int vsumi4 = vec_splats((int32_t)0); - vector signed int vsumi5 = vec_splats((int32_t)0); - vector signed int vsumi6 = vec_splats((int32_t)0); - vector signed int vsumi7 = vec_splats((int32_t)0); + vector signed int vsumi0 = v0; + vector signed int vsumi1 = v0; + vector signed int vsumi2 = v0; + vector signed int vsumi3 = v0; const uint16_t * restrict q2 = x[i].qs; const uint8_t * restrict sc = x[i].scales; @@ -9490,21 +9472,12 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void * vector signed short vscales2 = vec_splats((int16_t)(2*ls2+1)); vector signed short vscales3 = vec_splats((int16_t)(2*ls3+1)); - vsumi0 = vec_add(vec_mule(qv0, vscales0), vsumi0); - vsumi1 = vec_add(vec_mule(qv1, vscales1), vsumi1); - vsumi2 = vec_add(vec_mule(qv2, vscales2), vsumi2); - vsumi3 = vec_add(vec_mule(qv3, vscales3), vsumi3); - vsumi4 = vec_add(vec_mulo(qv0, vscales0), vsumi4); - vsumi5 = vec_add(vec_mulo(qv1, vscales1), vsumi5); - vsumi6 = vec_add(vec_mulo(qv2, vscales2), vsumi6); - vsumi7 = vec_add(vec_mulo(qv3, vscales3), vsumi7); + vsumi0 = vec_msum(qv0, vscales0, vsumi0); + vsumi1 = vec_msum(qv1, vscales1, vsumi1); + vsumi2 = vec_msum(qv2, vscales2, vsumi2); + vsumi3 = vec_msum(qv3, vscales3, vsumi3); } - vsumi0 = vec_add(vsumi0, vsumi4); - vsumi1 = vec_add(vsumi1, vsumi5); - vsumi2 = vec_add(vsumi2, vsumi6); - vsumi3 = vec_add(vsumi3, vsumi7); - vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1); vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2); @@ -9727,6 +9700,8 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void * static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,}; + const vector int v0 = vec_splats((int32_t)0); + vector float vsumf0 = vec_splats(0.0f); vector float vsumf1 = vec_splats(0.0f); vector float vsumf2 = vec_splats(0.0f); @@ -9741,14 +9716,10 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void * vector float vyd = vec_splats(y[i].d); vector float vd = vec_mul(vxd, vyd); - vector signed int vsumi0 = vec_splats((int32_t)0); - vector signed int vsumi1 = vec_splats((int32_t)0); - vector signed int vsumi2 = vec_splats((int32_t)0); - vector signed int vsumi3 = vec_splats((int32_t)0); - vector signed int vsumi4 = vec_splats((int32_t)0); - vector signed int vsumi5 = vec_splats((int32_t)0); - vector signed int vsumi6 = vec_splats((int32_t)0); - vector signed int vsumi7 = vec_splats((int32_t)0); + vector signed int vsumi0 = v0; + vector signed int vsumi1 = v0; + vector signed int vsumi2 = v0; + vector signed int vsumi3 = v0; const uint8_t * restrict q2 = x[i].qs; const uint8_t * restrict qh = x[i].qh; @@ -9808,21 +9779,12 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void * vector signed short vscales2 = vec_splats((int16_t)(2*ls2+1)); vector signed short vscales3 = vec_splats((int16_t)(2*ls3+1)); - vsumi0 = vec_add(vec_mule(qv0, vscales0), vsumi0); - vsumi1 = vec_add(vec_mule(qv1, vscales1), vsumi1); - vsumi2 = vec_add(vec_mule(qv2, vscales2), vsumi2); - vsumi3 = vec_add(vec_mule(qv3, vscales3), vsumi3); - vsumi4 = vec_add(vec_mulo(qv0, vscales0), vsumi4); - vsumi5 = vec_add(vec_mulo(qv1, vscales1), vsumi5); - vsumi6 = vec_add(vec_mulo(qv2, vscales2), vsumi6); - vsumi7 = vec_add(vec_mulo(qv3, vscales3), vsumi7); + vsumi0 = vec_msum(qv0, vscales0, vsumi0); + vsumi1 = vec_msum(qv1, vscales1, vsumi1); + vsumi2 = vec_msum(qv2, vscales2, vsumi2); + vsumi3 = vec_msum(qv3, vscales3, vsumi3); } - vsumi0 = vec_add(vsumi0, vsumi4); - vsumi1 = vec_add(vsumi1, vsumi5); - vsumi2 = vec_add(vsumi2, vsumi6); - vsumi3 = vec_add(vsumi3, vsumi7); - vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1); vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2); @@ -10060,6 +10022,8 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void #elif defined(__POWER9_VECTOR__) const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; + const vector int v0 = vec_splats((int32_t)0); + vector float vsumf0 = vec_splats(0.0f); vector float vsumf1 = vec_splats(0.0f); vector float vsumf2 = vec_splats(0.0f); @@ -10070,14 +10034,10 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void vector float vyd = vec_splats(y[i].d); vector float vd = vec_mul(vxd, vyd); - vector signed int vsumi0 = vec_splats((int32_t)0); - vector signed int vsumi1 = vec_splats((int32_t)0); - vector signed int vsumi2 = vec_splats((int32_t)0); - vector signed int vsumi3 = vec_splats((int32_t)0); - vector signed int vsumi4 = vec_splats((int32_t)0); - vector signed int vsumi5 = vec_splats((int32_t)0); - vector signed int vsumi6 = vec_splats((int32_t)0); - vector signed int vsumi7 = vec_splats((int32_t)0); + vector signed int vsumi0 = v0; + vector signed int vsumi1 = v0; + vector signed int vsumi2 = v0; + vector signed int vsumi3 = v0; const uint8_t * restrict q3 = x[i].qs; const uint32_t * restrict signs = (const uint32_t *)(x[i].qs + QK_K/4); @@ -10122,21 +10082,12 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void vector signed short vscales01 = (vector signed short)vec_splats((uint16_t)(2*ls0+1)); vector signed short vscales23 = (vector signed short)vec_splats((uint16_t)(2*ls1+1)); - vsumi0 = vec_add(vec_mule(qv0, vscales01), vsumi0); - vsumi1 = vec_add(vec_mule(qv1, vscales01), vsumi1); - vsumi2 = vec_add(vec_mule(qv2, vscales23), vsumi2); - vsumi3 = vec_add(vec_mule(qv3, vscales23), vsumi3); - vsumi4 = vec_add(vec_mulo(qv0, vscales01), vsumi4); - vsumi5 = vec_add(vec_mulo(qv1, vscales01), vsumi5); - vsumi6 = vec_add(vec_mulo(qv2, vscales23), vsumi6); - vsumi7 = vec_add(vec_mulo(qv3, vscales23), vsumi7); + vsumi0 = vec_msum(qv0, vscales01, vsumi0); + vsumi1 = vec_msum(qv1, vscales01, vsumi1); + vsumi2 = vec_msum(qv2, vscales23, vsumi2); + vsumi3 = vec_msum(qv3, vscales23, vsumi3); } - vsumi0 = vec_add(vsumi0, vsumi4); - vsumi1 = vec_add(vsumi1, vsumi5); - vsumi2 = vec_add(vsumi2, vsumi6); - vsumi3 = vec_add(vsumi3, vsumi7); - vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1); vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2); @@ -10426,6 +10377,8 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void * static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,}; + const vector int v0 = vec_splats((int32_t)0); + vector float vsumf0 = vec_splats(0.0f); vector float vsumf1 = vec_splats(0.0f); vector float vsumf2 = vec_splats(0.0f); @@ -10446,14 +10399,10 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void * const uint8_t * restrict sc = x[i].scales; const int8_t * restrict q8 = y[i].qs; - vector signed int vsumi0 = vec_splats((int32_t)0); - vector signed int vsumi1 = vec_splats((int32_t)0); - vector signed int vsumi2 = vec_splats((int32_t)0); - vector signed int vsumi3 = vec_splats((int32_t)0); - vector signed int vsumi4 = vec_splats((int32_t)0); - vector signed int vsumi5 = vec_splats((int32_t)0); - vector signed int vsumi6 = vec_splats((int32_t)0); - vector signed int vsumi7 = vec_splats((int32_t)0); + vector signed int vsumi0 = v0; + vector signed int vsumi1 = v0; + vector signed int vsumi2 = v0; + vector signed int vsumi3 = v0; for (int j = 0; j < QK_K/32; j += 2) { __builtin_prefetch(q3, 0, 1); @@ -10507,21 +10456,12 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void * vector signed short vscales01 = (vector signed short)vec_splats((uint16_t)(2*ls0+1)); vector signed short vscales23 = (vector signed short)vec_splats((uint16_t)(2*ls1+1)); - vsumi0 = vec_add(vec_mule(qv0, vscales01), vsumi0); - vsumi1 = vec_add(vec_mule(qv1, vscales01), vsumi1); - vsumi2 = vec_add(vec_mule(qv2, vscales23), vsumi2); - vsumi3 = vec_add(vec_mule(qv3, vscales23), vsumi3); - vsumi4 = vec_add(vec_mulo(qv0, vscales01), vsumi4); - vsumi5 = vec_add(vec_mulo(qv1, vscales01), vsumi5); - vsumi6 = vec_add(vec_mulo(qv2, vscales23), vsumi6); - vsumi7 = vec_add(vec_mulo(qv3, vscales23), vsumi7); + vsumi0 = vec_msum(qv0, vscales01, vsumi0); + vsumi1 = vec_msum(qv1, vscales01, vsumi1); + vsumi2 = vec_msum(qv2, vscales23, vsumi2); + vsumi3 = vec_msum(qv3, vscales23, vsumi3); } - vsumi0 = vec_add(vsumi0, vsumi4); - vsumi1 = vec_add(vsumi1, vsumi5); - vsumi2 = vec_add(vsumi2, vsumi6); - vsumi3 = vec_add(vsumi3, vsumi7); - vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1); vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2); @@ -10802,10 +10742,6 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void vector signed int vsumi1 = vec_splats((int32_t)0); vector signed int vsumi2 = vec_splats((int32_t)0); vector signed int vsumi3 = vec_splats((int32_t)0); - vector signed int vsumi4 = vec_splats((int32_t)0); - vector signed int vsumi5 = vec_splats((int32_t)0); - vector signed int vsumi6 = vec_splats((int32_t)0); - vector signed int vsumi7 = vec_splats((int32_t)0); vector signed int vsumi8 = vec_splats((int32_t)0); const uint8_t * restrict q1 = x[i].qs; @@ -10847,14 +10783,10 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void vector signed short vscales23 = (vector signed short)vec_splats((uint16_t)(2*ls1+1)); vector signed short vscales = vec_sld(vscales23, vscales01, 8); - vsumi0 = vec_add(vec_mule(qv0, vscales01), vsumi0); - vsumi1 = vec_add(vec_mule(qv1, vscales01), vsumi1); - vsumi2 = vec_add(vec_mule(qv2, vscales23), vsumi2); - vsumi3 = vec_add(vec_mule(qv3, vscales23), vsumi3); - vsumi4 = vec_add(vec_mulo(qv0, vscales01), vsumi4); - vsumi5 = vec_add(vec_mulo(qv1, vscales01), vsumi5); - vsumi6 = vec_add(vec_mulo(qv2, vscales23), vsumi6); - vsumi7 = vec_add(vec_mulo(qv3, vscales23), vsumi7); + vsumi0 = vec_msum(qv0, vscales01, vsumi0); + vsumi1 = vec_msum(qv1, vscales01, vsumi1); + vsumi2 = vec_msum(qv2, vscales23, vsumi2); + vsumi3 = vec_msum(qv3, vscales23, vsumi3); vector signed short q8ysums = vec_xl_len(qs, 8); qs += 4; @@ -10869,11 +10801,6 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void vsumi8 = vec_add(vec_mule(q8ysum, vscales), vsumi8); } - vsumi0 = vec_add(vsumi0, vsumi4); - vsumi1 = vec_add(vsumi1, vsumi5); - vsumi2 = vec_add(vsumi2, vsumi6); - vsumi3 = vec_add(vsumi3, vsumi7); - vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1); vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2); @@ -11267,6 +11194,7 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void * #elif defined(__POWER9_VECTOR__) const vector signed char lowMask = vec_splats((signed char)0xF); + const vector signed int v0 = vec_splats((int32_t)0); const vector unsigned char v4 = vec_splats((unsigned char)0x4); vector float vsumf0 = vec_splats(0.0f); @@ -11297,8 +11225,11 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void * vector signed short qv0 = vec_add(vec_mule(q4x0, q8y0), vec_mulo(q4x0, q8y0)); vector signed short qv1 = vec_add(vec_mule(q4x1, q8y1), vec_mulo(q4x1, q8y1)); - vector signed int vsumi0 = vec_add(vec_unpackh(qv0), vec_unpackl(qv0)); - vector signed int vsumi1 = vec_add(vec_unpackh(qv1), vec_unpackl(qv1)); + vector signed int vsumi0 = v0; + vector signed int vsumi1 = v0; + + vsumi0 = vec_sum4s(qv0, vsumi0); + vsumi1 = vec_sum4s(qv1, vsumi1); vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1); @@ -11453,6 +11384,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void * #elif defined(__POWER9_VECTOR__) const vector signed char lowMask = vec_splats((signed char)0xF); + const vector int v0 = vec_splats((int32_t)0); const vector unsigned char v4 = vec_splats((unsigned char)0x4); vector float vsumf0 = vec_splats(0.0f); @@ -11468,14 +11400,10 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void * vector float vyd = vec_splats(y[ibl].d); vector float vd = vec_mul(vxd, vyd); - vector signed int vsumi0 = vec_splats((int32_t)0); - vector signed int vsumi1 = vec_splats((int32_t)0); - vector signed int vsumi2 = vec_splats((int32_t)0); - vector signed int vsumi3 = vec_splats((int32_t)0); - vector signed int vsumi4 = vec_splats((int32_t)0); - vector signed int vsumi5 = vec_splats((int32_t)0); - vector signed int vsumi6 = vec_splats((int32_t)0); - vector signed int vsumi7 = vec_splats((int32_t)0); + vector signed int vsumi0 = v0; + vector signed int vsumi1 = v0; + vector signed int vsumi2 = v0; + vector signed int vsumi3 = v0; uint16_t h = x[ibl].scales_h; @@ -11520,21 +11448,12 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void * vector signed short vscales01 = vec_splats((int16_t)ls0); vector signed short vscales23 = vec_splats((int16_t)ls1); - vsumi0 = vec_add(vec_mule(qv0, vscales01), vsumi0); - vsumi1 = vec_add(vec_mule(qv1, vscales01), vsumi1); - vsumi2 = vec_add(vec_mule(qv2, vscales23), vsumi2); - vsumi3 = vec_add(vec_mule(qv3, vscales23), vsumi3); - vsumi4 = vec_add(vec_mulo(qv0, vscales01), vsumi4); - vsumi5 = vec_add(vec_mulo(qv1, vscales01), vsumi5); - vsumi6 = vec_add(vec_mulo(qv2, vscales23), vsumi6); - vsumi7 = vec_add(vec_mulo(qv3, vscales23), vsumi7); + vsumi0 = vec_msum(qv0, vscales01, vsumi0); + vsumi1 = vec_msum(qv1, vscales01, vsumi1); + vsumi2 = vec_msum(qv2, vscales23, vsumi2); + vsumi3 = vec_msum(qv3, vscales23, vsumi3); } - vsumi0 = vec_add(vsumi0, vsumi4); - vsumi1 = vec_add(vsumi1, vsumi5); - vsumi2 = vec_add(vsumi2, vsumi6); - vsumi3 = vec_add(vsumi3, vsumi7); - vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1); vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);