From 9fc9359b5ea28f0a98cddfbaabdcc5aaa89d540a Mon Sep 17 00:00:00 2001
From: leekillough <15950023+leekillough@users.noreply.github.com>
Date: Thu, 6 Jul 2023 04:31:31 -0500
Subject: [PATCH] Use the new tuple intrinsics to fix build errors in the X280 BLIS code.

With this change the code builds, but it still produces incorrect results for the complex BLIS routines that use segment loads (or that call routines which do). The intrinsic types check out and make sense, yet the answers are wrong; the cause is probably something simple.

For historical reference, see:

https://github.com/riscv-non-isa/riscv-c-api-doc/issues/43
https://github.com/flame/blis/pull/737#issuecomment-1612219910
https://reviews.llvm.org/D152134
https://github.com/riscv-non-isa/rvv-intrinsic-doc/issues/139
https://github.com/riscv-non-isa/rvv-intrinsic-doc/pull/198
https://github.com/riscv-non-isa/rvv-intrinsic-doc/blob/master/rvv-intrinsic-rfc.md
https://github.com/riscv-non-isa/rvv-intrinsic-doc/blob/master/auto-generated/intrinsic_funcs/02_vector_unit-stride_segment_load_store_instructions_zvlsseg.md
https://github.com/riscv-non-isa/rvv-intrinsic-doc/blob/master/auto-generated/intrinsic_funcs/03_vector_stride_segment_load_store_instructions_zvlsseg.md
https://github.com/riscv-non-isa/rvv-intrinsic-doc/blob/master/auto-generated/intrinsic_funcs/04_vector_indexed_segment_load_store_instructions_zvlsseg.md
---
 .../bli_addv_sifive_x280_intr_complex.c | 28 +++++++++-----
 .../bli_axpbyv_sifive_x280_intr_complex.c | 36 +++++++++++-------
 .../bli_axpyv_sifive_x280_intr_complex.c | 31 +++++++++------
 .../bli_dotv_sifive_x280_intr_complex.c | 22 ++++++-----
 .../bli_dotxv_sifive_x280_intr_complex.c | 22 ++++++-----
 .../bli_scal2v_sifive_x280_intr_complex.c | 25 ++++++++----
 .../bli_scalv_sifive_x280_intr_complex.c | 25 +++++++-----
 .../bli_subv_sifive_x280_intr_complex.c | 30 +++++++++------
 .../bli_xpbyv_sifive_x280_intr_complex.c | 30 +++++++++------
 .../bli_axpy2v_sifive_x280_intr_complex.c | 36 +++++++++++-------
 .../bli_dotaxpyv_sifive_x280_intr_complex.c | 38 ++++++++++++-------
 .../sifive_x280/riscv_overloaded_intrinsics.h | 26 +++++++++++--
 12 files changed, 226 insertions(+), 123 deletions(-)

diff --git a/kernels/sifive_x280/1/bli_addv_sifive_x280_intr/bli_addv_sifive_x280_intr_complex.c b/kernels/sifive_x280/1/bli_addv_sifive_x280_intr/bli_addv_sifive_x280_intr_complex.c index 26f32d8269..76081fc760 100644 --- a/kernels/sifive_x280/1/bli_addv_sifive_x280_intr/bli_addv_sifive_x280_intr_complex.c +++ b/kernels/sifive_x280/1/bli_addv_sifive_x280_intr/bli_addv_sifive_x280_intr_complex.c @@ -47,29 +47,37 @@ ADDV(PRECISION_CHAR, void) size_t avl = n; while (avl) { size_t vl = VSETVL(PREC, LMUL)(avl); - RVV_TYPE_F(PREC, LMUL) xvec_real, xvec_imag, yvec_real, yvec_imag; + RVV_TYPE_F_X2(PREC, LMUL) xvec, yvec; if (incx == 1) - VLSEG2_V_F(PREC, LMUL)( &xvec_real, &xvec_imag, (BASE_DT*) x, vl); + xvec = VLSEG2_V_F(PREC, LMUL)( (BASE_DT*) x, vl); else - VLSSEG2_V_F(PREC, LMUL)(&xvec_real, &xvec_imag, (BASE_DT*) x, 2*FLT_SIZE*incx, vl); - + xvec = VLSSEG2_V_F(PREC, LMUL)( (BASE_DT*) x, 2*FLT_SIZE*incx, vl); + if (incy == 1) - VLSEG2_V_F(PREC, LMUL)( &yvec_real, &yvec_imag, (BASE_DT*) y, vl); + yvec = VLSEG2_V_F(PREC, LMUL)( (BASE_DT*) y, vl); else - VLSSEG2_V_F(PREC, LMUL)(&yvec_real, &yvec_imag, (BASE_DT*) y, 2*FLT_SIZE*incy, vl); - + yvec = VLSSEG2_V_F(PREC, LMUL)( (BASE_DT*) y, 2*FLT_SIZE*incy, vl); + + RVV_TYPE_F(PREC, LMUL) xvec_real = RVV_GET_REAL(PREC, LMUL, xvec); + RVV_TYPE_F(PREC, LMUL) xvec_imag = RVV_GET_IMAG(PREC, LMUL, xvec); + RVV_TYPE_F(PREC, LMUL) 
yvec_real = RVV_GET_REAL(PREC, LMUL, yvec); + RVV_TYPE_F(PREC, LMUL) yvec_imag = RVV_GET_IMAG(PREC, LMUL, yvec); + yvec_real = VFADD_VV(PREC, LMUL)(yvec_real, xvec_real, vl); if (conjx == BLIS_NO_CONJUGATE) yvec_imag = VFADD_VV(PREC, LMUL)(yvec_imag, xvec_imag, vl); else yvec_imag = VFSUB_VV(PREC, LMUL)(yvec_imag, xvec_imag, vl); + RVV_SET_REAL(PREC, LMUL, yvec, yvec_real); + RVV_SET_IMAG(PREC, LMUL, yvec, yvec_imag); + if (incy == 1) - VSSEG2_V_F(PREC, LMUL)( (BASE_DT*) y, yvec_real, yvec_imag, vl); + VSSEG2_V_F(PREC, LMUL)( (BASE_DT*) y, yvec, vl); else - VSSSEG2_V_F(PREC, LMUL)((BASE_DT*) y, 2*FLT_SIZE*incy, yvec_real, yvec_imag, vl); - + VSSSEG2_V_F(PREC, LMUL)((BASE_DT*) y, 2*FLT_SIZE*incy, yvec, vl); + x += vl*incx; y += vl*incy; avl -= vl; diff --git a/kernels/sifive_x280/1/bli_axpbyv_sifive_x280_intr/bli_axpbyv_sifive_x280_intr_complex.c b/kernels/sifive_x280/1/bli_axpbyv_sifive_x280_intr/bli_axpbyv_sifive_x280_intr_complex.c index b0cfe6739a..9b62fa3a17 100644 --- a/kernels/sifive_x280/1/bli_axpbyv_sifive_x280_intr/bli_axpbyv_sifive_x280_intr_complex.c +++ b/kernels/sifive_x280/1/bli_axpbyv_sifive_x280_intr/bli_axpbyv_sifive_x280_intr_complex.c @@ -38,7 +38,7 @@ AXPBYV(PRECISION_CHAR, void) { // Computes y := beta * y + alpha * conjx(x) - + if (n <= 0) return; const DATATYPE* restrict alpha = alpha_; @@ -59,7 +59,7 @@ AXPBYV(PRECISION_CHAR, void) return; } - // Note: in the cases alpha = 0 && beta = 1, or alpha = 1 && beta = 0, we + // Note: in the cases alpha = 0 && beta = 1, or alpha = 1 && beta = 0, we // will canonicalize NaNs whereas the reference code will propagate NaN payloads. // TO DO (optimization): special cases for alpha = +-1, +-i, beta = +-1, +-i @@ -68,23 +68,28 @@ AXPBYV(PRECISION_CHAR, void) size_t avl = n; while (avl) { size_t vl = VSETVL(PREC, LMUL)(avl); - RVV_TYPE_F(PREC, LMUL) xvec_real, xvec_imag, yvec_real, yvec_imag, temp_real, temp_imag; + RVV_TYPE_F_X2(PREC, LMUL) xvec, yvec; if (incx == 1) - VLSEG2_V_F(PREC, LMUL)( &xvec_real, &xvec_imag, (BASE_DT*) x, vl); + xvec = VLSEG2_V_F(PREC, LMUL)( (BASE_DT*) x, vl); else - VLSSEG2_V_F(PREC, LMUL)(&xvec_real, &xvec_imag, (BASE_DT*) x, 2*FLT_SIZE*incx, vl); - + xvec = VLSSEG2_V_F(PREC, LMUL)( (BASE_DT*) x, 2*FLT_SIZE*incx, vl); + if (incy == 1) - VLSEG2_V_F(PREC, LMUL)( &yvec_real, &yvec_imag, (BASE_DT*) y, vl); + yvec = VLSEG2_V_F(PREC, LMUL)( (BASE_DT*) y, vl); else - VLSSEG2_V_F(PREC, LMUL)(&yvec_real, &yvec_imag, (BASE_DT*) y, 2*FLT_SIZE*incy, vl); - + yvec = VLSSEG2_V_F(PREC, LMUL)( (BASE_DT*) y, 2*FLT_SIZE*incy, vl); + + RVV_TYPE_F(PREC, LMUL) xvec_real = RVV_GET_REAL(PREC, LMUL, xvec); + RVV_TYPE_F(PREC, LMUL) xvec_imag = RVV_GET_IMAG(PREC, LMUL, xvec); + RVV_TYPE_F(PREC, LMUL) yvec_real = RVV_GET_REAL(PREC, LMUL, yvec); + RVV_TYPE_F(PREC, LMUL) yvec_imag = RVV_GET_IMAG(PREC, LMUL, yvec); + // Computed as: // y.real = beta.real * y.real - beta.imag * y.imag + alpha.real * x.real - alpha.imag * conj(x.imag) // y.imag = beta.real * y.imag + beta.imag * y.real + alpha.imag * x.real + alpha.real * conj(x.imag) - temp_real = VFMUL_VF(PREC, LMUL) (yvec_real, beta->real, vl); - temp_imag = VFMUL_VF(PREC, LMUL) (yvec_imag, beta->real, vl); + RVV_TYPE_F(PREC, LMUL) temp_real = VFMUL_VF(PREC, LMUL) (yvec_real, beta->real, vl); + RVV_TYPE_F(PREC, LMUL) temp_imag = VFMUL_VF(PREC, LMUL) (yvec_imag, beta->real, vl); temp_real = VFNMSAC_VF(PREC, LMUL)(temp_real, beta->imag, yvec_imag, vl); temp_imag = VFMACC_VF(PREC, LMUL) (temp_imag, beta->imag, yvec_real, vl); yvec_real = VFMACC_VF(PREC, LMUL) (temp_real, alpha->real, 
xvec_real, vl); @@ -97,11 +102,14 @@ AXPBYV(PRECISION_CHAR, void) yvec_imag = VFNMSAC_VF(PREC, LMUL)(yvec_imag, alpha->real, xvec_imag, vl); } + RVV_SET_REAL(PREC, LMUL, yvec, yvec_real); + RVV_SET_IMAG(PREC, LMUL, yvec, yvec_imag); + if (incy == 1) - VSSEG2_V_F(PREC, LMUL)( (BASE_DT*) y, yvec_real, yvec_imag, vl); + VSSEG2_V_F(PREC, LMUL)( (BASE_DT*) y, yvec, vl); else - VSSSEG2_V_F(PREC, LMUL)((BASE_DT*) y, 2*FLT_SIZE*incy, yvec_real, yvec_imag, vl); - + VSSSEG2_V_F(PREC, LMUL)((BASE_DT*) y, 2*FLT_SIZE*incy, yvec, vl); + x += vl*incx; y += vl*incy; avl -= vl; diff --git a/kernels/sifive_x280/1/bli_axpyv_sifive_x280_intr/bli_axpyv_sifive_x280_intr_complex.c b/kernels/sifive_x280/1/bli_axpyv_sifive_x280_intr/bli_axpyv_sifive_x280_intr_complex.c index 8e71117a5f..d28d960a1d 100644 --- a/kernels/sifive_x280/1/bli_axpyv_sifive_x280_intr/bli_axpyv_sifive_x280_intr_complex.c +++ b/kernels/sifive_x280/1/bli_axpyv_sifive_x280_intr/bli_axpyv_sifive_x280_intr_complex.c @@ -41,25 +41,30 @@ AXPYV(PRECISION_CHAR, void) const DATATYPE* restrict alpha = alpha_; const DATATYPE* restrict x = x_; DATATYPE* restrict y = y_; - + if (n <= 0) return; if (alpha->real == 0 && alpha->imag == 0) return; size_t avl = n; while (avl) { size_t vl = VSETVL(PREC, LMUL)(avl); - RVV_TYPE_F(PREC, LMUL) xvec_real, xvec_imag, yvec_real, yvec_imag; + RVV_TYPE_F_X2(PREC, LMUL) xvec, yvec; if (incx == 1) - VLSEG2_V_F(PREC, LMUL)( &xvec_real, &xvec_imag, (BASE_DT*) x, vl); + xvec = VLSEG2_V_F(PREC, LMUL)( (BASE_DT*) x, vl); else - VLSSEG2_V_F(PREC, LMUL)(&xvec_real, &xvec_imag, (BASE_DT*) x, 2*FLT_SIZE*incx, vl); - + xvec = VLSSEG2_V_F(PREC, LMUL)( (BASE_DT*) x, 2*FLT_SIZE*incx, vl); + if (incy == 1) - VLSEG2_V_F(PREC, LMUL)( &yvec_real, &yvec_imag, (BASE_DT*) y, vl); + yvec = VLSEG2_V_F(PREC, LMUL)( (BASE_DT*) y, vl); else - VLSSEG2_V_F(PREC, LMUL)(&yvec_real, &yvec_imag, (BASE_DT*) y, 2*FLT_SIZE*incy, vl); - + yvec = VLSSEG2_V_F(PREC, LMUL)( (BASE_DT*) y, 2*FLT_SIZE*incy, vl); + + RVV_TYPE_F(PREC, LMUL) xvec_real = RVV_GET_REAL(PREC, LMUL, xvec); + RVV_TYPE_F(PREC, LMUL) xvec_imag = RVV_GET_IMAG(PREC, LMUL, xvec); + RVV_TYPE_F(PREC, LMUL) yvec_real = RVV_GET_REAL(PREC, LMUL, yvec); + RVV_TYPE_F(PREC, LMUL) yvec_imag = RVV_GET_IMAG(PREC, LMUL, yvec); + yvec_real = VFMACC_VF(PREC, LMUL)( yvec_real, alpha->real, xvec_real, vl); yvec_imag = VFMACC_VF(PREC, LMUL)( yvec_imag, alpha->imag, xvec_real, vl); if (conjx == BLIS_NO_CONJUGATE){ @@ -70,11 +75,15 @@ AXPYV(PRECISION_CHAR, void) yvec_imag = VFNMSAC_VF(PREC, LMUL)(yvec_imag, alpha->real, xvec_imag, vl); } + + RVV_SET_REAL(PREC, LMUL, yvec, yvec_real); + RVV_SET_IMAG(PREC, LMUL, yvec, yvec_imag); + if (incy == 1) - VSSEG2_V_F(PREC, LMUL)( (BASE_DT*) y, yvec_real, yvec_imag, vl); + VSSEG2_V_F(PREC, LMUL)( (BASE_DT*) y, yvec, vl); else - VSSSEG2_V_F(PREC, LMUL)((BASE_DT*) y, 2*FLT_SIZE*incy, yvec_real, yvec_imag, vl); - + VSSSEG2_V_F(PREC, LMUL)((BASE_DT*) y, 2*FLT_SIZE*incy, yvec, vl); + x += vl*incx; y += vl*incy; avl -= vl; diff --git a/kernels/sifive_x280/1/bli_dotv_sifive_x280_intr/bli_dotv_sifive_x280_intr_complex.c b/kernels/sifive_x280/1/bli_dotv_sifive_x280_intr/bli_dotv_sifive_x280_intr_complex.c index 96c783d6b6..3a0680fe08 100644 --- a/kernels/sifive_x280/1/bli_dotv_sifive_x280_intr/bli_dotv_sifive_x280_intr_complex.c +++ b/kernels/sifive_x280/1/bli_dotv_sifive_x280_intr/bli_dotv_sifive_x280_intr_complex.c @@ -42,7 +42,7 @@ DOTV(PRECISION_CHAR, void) DATATYPE* restrict rho = rho_; const DATATYPE* restrict x = x_; const DATATYPE* restrict y = y_; - + if (n <= 0) { 
rho->real = 0; rho->imag = 0; @@ -60,18 +60,23 @@ DOTV(PRECISION_CHAR, void) bool first = true; while (avl) { size_t vl = VSETVL(PREC, LMUL)(avl); - RVV_TYPE_F(PREC, LMUL) xvec_real, xvec_imag, yvec_real, yvec_imag; + RVV_TYPE_F_X2(PREC, LMUL) xvec, yvec; if (incx == 1) - VLSEG2_V_F(PREC, LMUL)( &xvec_real, &xvec_imag, (BASE_DT*) x, vl); + xvec = VLSEG2_V_F(PREC, LMUL)( (BASE_DT*) x, vl); else - VLSSEG2_V_F(PREC, LMUL)(&xvec_real, &xvec_imag, (BASE_DT*) x, 2*FLT_SIZE*incx, vl); - + xvec = VLSSEG2_V_F(PREC, LMUL)( (BASE_DT*) x, 2*FLT_SIZE*incx, vl); + if (incy == 1) - VLSEG2_V_F(PREC, LMUL)( &yvec_real, &yvec_imag, (BASE_DT*) y, vl); + yvec = VLSEG2_V_F(PREC, LMUL)( (BASE_DT*) y, vl); else - VLSSEG2_V_F(PREC, LMUL)(&yvec_real, &yvec_imag, (BASE_DT*) y, 2*FLT_SIZE*incy, vl); - + yvec = VLSSEG2_V_F(PREC, LMUL)( (BASE_DT*) y, 2*FLT_SIZE*incy, vl); + + RVV_TYPE_F(PREC, LMUL) xvec_real = RVV_GET_REAL(PREC, LMUL, xvec); + RVV_TYPE_F(PREC, LMUL) xvec_imag = RVV_GET_IMAG(PREC, LMUL, xvec); + RVV_TYPE_F(PREC, LMUL) yvec_real = RVV_GET_REAL(PREC, LMUL, yvec); + RVV_TYPE_F(PREC, LMUL) yvec_imag = RVV_GET_IMAG(PREC, LMUL, yvec); + if (first) { acc_real = VFMUL_VV(PREC, LMUL)(xvec_real, yvec_real, vl); acc_imag = VFMUL_VV(PREC, LMUL)(xvec_imag, yvec_real, vl); @@ -93,7 +98,6 @@ DOTV(PRECISION_CHAR, void) avl -= vl; } - RVV_TYPE_F(PREC, m1) sum_real = VFMV_S_F(PREC, m1)(0.f, 1); RVV_TYPE_F(PREC, m1) sum_imag = VFMV_S_F(PREC, m1)(0.f, 1); sum_real = VF_REDUSUM_VS(PREC, LMUL)(acc_real, sum_real, n); diff --git a/kernels/sifive_x280/1/bli_dotxv_sifive_x280_intr/bli_dotxv_sifive_x280_intr_complex.c b/kernels/sifive_x280/1/bli_dotxv_sifive_x280_intr/bli_dotxv_sifive_x280_intr_complex.c index a0fdb94e70..753cde912e 100644 --- a/kernels/sifive_x280/1/bli_dotxv_sifive_x280_intr/bli_dotxv_sifive_x280_intr_complex.c +++ b/kernels/sifive_x280/1/bli_dotxv_sifive_x280_intr/bli_dotxv_sifive_x280_intr_complex.c @@ -44,7 +44,7 @@ DOTXV(PRECISION_CHAR, void) DATATYPE* restrict rho = rho_; const DATATYPE* restrict x = x_; const DATATYPE* restrict y = y_; - + if (beta->real == 0 && beta->imag == 0){ rho->real = 0; rho->imag = 0; @@ -69,18 +69,23 @@ DOTXV(PRECISION_CHAR, void) bool first = true; while (avl) { size_t vl = VSETVL(PREC, LMUL)(avl); - RVV_TYPE_F(PREC, LMUL) xvec_real, xvec_imag, yvec_real, yvec_imag; + RVV_TYPE_F_X2(PREC, LMUL) xvec, yvec; if (incx == 1) - VLSEG2_V_F(PREC, LMUL)( &xvec_real, &xvec_imag, (BASE_DT*) x, vl); + xvec = VLSEG2_V_F(PREC, LMUL)( (BASE_DT*) x, vl); else - VLSSEG2_V_F(PREC, LMUL)(&xvec_real, &xvec_imag, (BASE_DT*) x, 2*FLT_SIZE*incx, vl); - + xvec = VLSSEG2_V_F(PREC, LMUL)( (BASE_DT*) x, 2*FLT_SIZE*incx, vl); + if (incy == 1) - VLSEG2_V_F(PREC, LMUL)( &yvec_real, &yvec_imag, (BASE_DT*) y, vl); + yvec = VLSEG2_V_F(PREC, LMUL)( (BASE_DT*) y, vl); else - VLSSEG2_V_F(PREC, LMUL)(&yvec_real, &yvec_imag, (BASE_DT*) y, 2*FLT_SIZE*incy, vl); - + yvec = VLSSEG2_V_F(PREC, LMUL)( (BASE_DT*) y, 2*FLT_SIZE*incy, vl); + + RVV_TYPE_F(PREC, LMUL) xvec_real = RVV_GET_REAL(PREC, LMUL, xvec); + RVV_TYPE_F(PREC, LMUL) xvec_imag = RVV_GET_IMAG(PREC, LMUL, xvec); + RVV_TYPE_F(PREC, LMUL) yvec_real = RVV_GET_REAL(PREC, LMUL, yvec); + RVV_TYPE_F(PREC, LMUL) yvec_imag = RVV_GET_IMAG(PREC, LMUL, yvec); + if (first) { acc_real = VFMUL_VV(PREC, LMUL)(xvec_real, yvec_real, vl); acc_imag = VFMUL_VV(PREC, LMUL)(xvec_imag, yvec_real, vl); @@ -102,7 +107,6 @@ DOTXV(PRECISION_CHAR, void) avl -= vl; } - RVV_TYPE_F(PREC, m1) sum_real = VFMV_S_F(PREC, m1)(0.f, 1); RVV_TYPE_F(PREC, m1) sum_imag = VFMV_S_F(PREC, m1)(0.f, 1); 
sum_real = VF_REDUSUM_VS(PREC, LMUL)(acc_real, sum_real, n); diff --git a/kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr_complex.c b/kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr_complex.c index 935a935005..15996ae92d 100644 --- a/kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr_complex.c +++ b/kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr_complex.c @@ -41,7 +41,7 @@ SCAL2V(PRECISION_CHAR, void) const DATATYPE* restrict alpha = alpha_; const DATATYPE* restrict x = x_; DATATYPE* restrict y = y_; - + if (n <= 0) return; if (alpha->real == 0 && alpha->imag == 0) { SETV(PRECISION_CHAR)(BLIS_NO_CONJUGATE, n, alpha, y, incy, cntx); @@ -56,13 +56,18 @@ SCAL2V(PRECISION_CHAR, void) size_t avl = n; while (avl) { size_t vl = VSETVL(PREC, LMUL)(avl); - RVV_TYPE_F(PREC, LMUL) xvec_real, xvec_imag, yvec_real, yvec_imag; + RVV_TYPE_F_X2(PREC, LMUL) xvec, yvec; if (incx == 1) - VLSEG2_V_F(PREC, LMUL)( &xvec_real, &xvec_imag, (BASE_DT*) x, vl); + xvec = VLSEG2_V_F(PREC, LMUL)( (BASE_DT*) x, vl); else - VLSSEG2_V_F(PREC, LMUL)(&xvec_real, &xvec_imag, (BASE_DT*) x, 2*FLT_SIZE*incx, vl); - + xvec = VLSSEG2_V_F(PREC, LMUL)( (BASE_DT*) x, 2*FLT_SIZE*incx, vl); + + RVV_TYPE_F(PREC, LMUL) xvec_real = RVV_GET_REAL(PREC, LMUL, xvec); + RVV_TYPE_F(PREC, LMUL) xvec_imag = RVV_GET_IMAG(PREC, LMUL, xvec); + RVV_TYPE_F(PREC, LMUL) yvec_real = RVV_GET_REAL(PREC, LMUL, yvec); + RVV_TYPE_F(PREC, LMUL) yvec_imag = RVV_GET_IMAG(PREC, LMUL, yvec); + yvec_real = VFMUL_VF(PREC, LMUL)(xvec_real, alpha->real, vl); yvec_imag = VFMUL_VF(PREC, LMUL)(xvec_real, alpha->imag, vl); if (conjx == BLIS_NO_CONJUGATE) { @@ -73,11 +78,15 @@ SCAL2V(PRECISION_CHAR, void) yvec_imag = VFNMSAC_VF(PREC, LMUL)(yvec_imag, alpha->real, xvec_imag, vl); } + RVV_SET_REAL(PREC, LMUL, yvec, yvec_real); + RVV_SET_IMAG(PREC, LMUL, yvec, yvec_imag); +#pragma GCC diagnostic ignored "-Wuninitialized" + if (incy == 1) - VSSEG2_V_F(PREC, LMUL)( (BASE_DT*) y, yvec_real, yvec_imag, vl); + VSSEG2_V_F(PREC, LMUL)( (BASE_DT*) y, yvec, vl); else - VSSSEG2_V_F(PREC, LMUL)((BASE_DT*) y, 2*FLT_SIZE*incy, yvec_real, yvec_imag, vl); - + VSSSEG2_V_F(PREC, LMUL)((BASE_DT*) y, 2*FLT_SIZE*incy, yvec, vl); + x += vl*incx; y += vl*incy; avl -= vl; diff --git a/kernels/sifive_x280/1/bli_scalv_sifive_x280_intr/bli_scalv_sifive_x280_intr_complex.c b/kernels/sifive_x280/1/bli_scalv_sifive_x280_intr/bli_scalv_sifive_x280_intr_complex.c index 322dcfad77..1dfa1d37cf 100644 --- a/kernels/sifive_x280/1/bli_scalv_sifive_x280_intr/bli_scalv_sifive_x280_intr_complex.c +++ b/kernels/sifive_x280/1/bli_scalv_sifive_x280_intr/bli_scalv_sifive_x280_intr_complex.c @@ -40,7 +40,7 @@ SCALV(PRECISION_CHAR, void) // Computes x = conjalpha(alpha) * x const DATATYPE* restrict alpha = alpha_; DATATYPE* restrict x = x_; - + if (n <= 0 || (alpha->real == 1 && alpha->imag == 0)) return; if (alpha->real == 0 && alpha->imag==0){ @@ -51,13 +51,16 @@ SCALV(PRECISION_CHAR, void) size_t avl = n; while (avl) { size_t vl = VSETVL(PREC, LMUL)(avl); - RVV_TYPE_F(PREC, LMUL) xvec_real, xvec_imag; + RVV_TYPE_F_X2(PREC, LMUL) xvec; if (incx == 1) - VLSEG2_V_F(PREC, LMUL)( &xvec_real, &xvec_imag, (BASE_DT*) x, vl); + xvec = VLSEG2_V_F(PREC, LMUL)( (BASE_DT*) x, vl); else - VLSSEG2_V_F(PREC, LMUL)(&xvec_real, &xvec_imag, (BASE_DT*) x, 2*FLT_SIZE*incx, vl); - + xvec = VLSSEG2_V_F(PREC, LMUL)( (BASE_DT*) x, 2*FLT_SIZE*incx, vl); + + RVV_TYPE_F(PREC, LMUL) xvec_real = RVV_GET_REAL(PREC, LMUL, xvec); + 
RVV_TYPE_F(PREC, LMUL) xvec_imag = RVV_GET_IMAG(PREC, LMUL, xvec); + RVV_TYPE_F(PREC, LMUL) temp_real = VFMUL_VF(PREC, LMUL)(xvec_real, alpha->real, vl); RVV_TYPE_F(PREC, LMUL) temp_imag = VFMUL_VF(PREC, LMUL)(xvec_imag, alpha->real, vl); if (conjalpha == BLIS_NO_CONJUGATE) { @@ -67,13 +70,17 @@ SCALV(PRECISION_CHAR, void) temp_real = VFMACC_VF(PREC, LMUL) (temp_real, alpha->imag, xvec_imag, vl); temp_imag = VFNMSAC_VF(PREC, LMUL)(temp_imag, alpha->imag, xvec_real, vl); } - + + RVV_TYPE_F_X2(PREC, LMUL) temp; + RVV_SET_REAL(PREC, LMUL, temp, temp_real); + RVV_SET_IMAG(PREC, LMUL, temp, temp_imag); +#pragma GCC diagnostic ignored "-Wuninitialized" if (incx == 1) - VSSEG2_V_F(PREC, LMUL)( (BASE_DT*) x, temp_real, temp_imag, vl); + VSSEG2_V_F(PREC, LMUL)( (BASE_DT*) x, temp, vl); else - VSSSEG2_V_F(PREC, LMUL)((BASE_DT*) x, 2*FLT_SIZE*incx, temp_real, temp_imag, vl); - + VSSSEG2_V_F(PREC, LMUL)((BASE_DT*) x, 2*FLT_SIZE*incx, temp, vl); + x += vl*incx; avl -= vl; } diff --git a/kernels/sifive_x280/1/bli_subv_sifive_x280_intr/bli_subv_sifive_x280_intr_complex.c b/kernels/sifive_x280/1/bli_subv_sifive_x280_intr/bli_subv_sifive_x280_intr_complex.c index b5d040a7e8..3aa2914518 100644 --- a/kernels/sifive_x280/1/bli_subv_sifive_x280_intr/bli_subv_sifive_x280_intr_complex.c +++ b/kernels/sifive_x280/1/bli_subv_sifive_x280_intr/bli_subv_sifive_x280_intr_complex.c @@ -41,35 +41,43 @@ SUBV(PRECISION_CHAR, void) (void) cntx; const DATATYPE* restrict x = x_; DATATYPE* restrict y = y_; - + if (n <= 0) return; size_t avl = n; while (avl) { size_t vl = VSETVL(PREC, LMUL)(avl); - RVV_TYPE_F(PREC, LMUL) xvec_real, xvec_imag, yvec_real, yvec_imag; + RVV_TYPE_F_X2(PREC, LMUL) xvec, yvec; if (incx == 1) - VLSEG2_V_F(PREC, LMUL)( &xvec_real, &xvec_imag, (BASE_DT*) x, vl); + xvec = VLSEG2_V_F(PREC, LMUL)( (BASE_DT*) x, vl); else - VLSSEG2_V_F(PREC, LMUL)(&xvec_real, &xvec_imag, (BASE_DT*) x, 2*FLT_SIZE*incx, vl); - + xvec = VLSSEG2_V_F(PREC, LMUL)( (BASE_DT*) x, 2*FLT_SIZE*incx, vl); + if (incy == 1) - VLSEG2_V_F(PREC, LMUL)( &yvec_real, &yvec_imag, (BASE_DT*) y, vl); + yvec = VLSEG2_V_F(PREC, LMUL)( (BASE_DT*) y, vl); else - VLSSEG2_V_F(PREC, LMUL)(&yvec_real, &yvec_imag, (BASE_DT*) y, 2*FLT_SIZE*incy, vl); - + yvec = VLSSEG2_V_F(PREC, LMUL)( (BASE_DT*) y, 2*FLT_SIZE*incy, vl); + + RVV_TYPE_F(PREC, LMUL) xvec_real = RVV_GET_REAL(PREC, LMUL, xvec); + RVV_TYPE_F(PREC, LMUL) xvec_imag = RVV_GET_IMAG(PREC, LMUL, xvec); + RVV_TYPE_F(PREC, LMUL) yvec_real = RVV_GET_REAL(PREC, LMUL, yvec); + RVV_TYPE_F(PREC, LMUL) yvec_imag = RVV_GET_IMAG(PREC, LMUL, yvec); + yvec_real = VFSUB_VV(PREC, LMUL)(yvec_real, xvec_real, vl); if (conjx == BLIS_NO_CONJUGATE) yvec_imag = VFSUB_VV(PREC, LMUL)(yvec_imag, xvec_imag, vl); else yvec_imag = VFADD_VV(PREC, LMUL)(yvec_imag, xvec_imag, vl); + RVV_SET_REAL(PREC, LMUL, yvec, yvec_real); + RVV_SET_IMAG(PREC, LMUL, yvec, yvec_imag); + if (incy == 1) - VSSEG2_V_F(PREC, LMUL)( (BASE_DT*) y, yvec_real, yvec_imag, vl); + VSSEG2_V_F(PREC, LMUL)( (BASE_DT*) y, yvec, vl); else - VSSSEG2_V_F(PREC, LMUL)((BASE_DT*) y, 2*FLT_SIZE*incy, yvec_real, yvec_imag, vl); - + VSSSEG2_V_F(PREC, LMUL)((BASE_DT*) y, 2*FLT_SIZE*incy, yvec, vl); + x += vl*incx; y += vl*incy; avl -= vl; diff --git a/kernels/sifive_x280/1/bli_xpbyv_sifive_x280_intr/bli_xpbyv_sifive_x280_intr_complex.c b/kernels/sifive_x280/1/bli_xpbyv_sifive_x280_intr/bli_xpbyv_sifive_x280_intr_complex.c index 83c4dd3a3f..05323cc63c 100644 --- a/kernels/sifive_x280/1/bli_xpbyv_sifive_x280_intr/bli_xpbyv_sifive_x280_intr_complex.c +++ 
b/kernels/sifive_x280/1/bli_xpbyv_sifive_x280_intr/bli_xpbyv_sifive_x280_intr_complex.c @@ -41,7 +41,7 @@ XPBYV(PRECISION_CHAR, void) const DATATYPE* restrict beta = beta_; const DATATYPE* restrict x = x_; DATATYPE* restrict y = y_; - + if (n <= 0) return; if (beta->real == 0 && beta->imag == 0){ @@ -54,18 +54,23 @@ XPBYV(PRECISION_CHAR, void) size_t avl = n; while (avl) { size_t vl = VSETVL(PREC, LMUL)(avl); - RVV_TYPE_F(PREC, LMUL) xvec_real, xvec_imag, yvec_real, yvec_imag; + RVV_TYPE_F_X2(PREC, LMUL) xvec, yvec; if (incx == 1) - VLSEG2_V_F(PREC, LMUL)( &xvec_real, &xvec_imag, (BASE_DT*) x, vl); + xvec = VLSEG2_V_F(PREC, LMUL)( (BASE_DT*) x, vl ); else - VLSSEG2_V_F(PREC, LMUL)(&xvec_real, &xvec_imag, (BASE_DT*) x, 2*FLT_SIZE*incx, vl); - + xvec = VLSSEG2_V_F(PREC, LMUL)( (BASE_DT*) x, 2*FLT_SIZE*incx, vl); + if (incy == 1) - VLSEG2_V_F(PREC, LMUL)( &yvec_real, &yvec_imag, (BASE_DT*) y, vl); + yvec = VLSEG2_V_F(PREC, LMUL)( (BASE_DT*) y, vl ); else - VLSSEG2_V_F(PREC, LMUL)(&yvec_real, &yvec_imag, (BASE_DT*) y, 2*FLT_SIZE*incy, vl); - + yvec = VLSSEG2_V_F(PREC, LMUL)( (BASE_DT*) y, 2*FLT_SIZE*incy, vl); + + RVV_TYPE_F(PREC, LMUL) xvec_real = RVV_GET_REAL(PREC, LMUL, xvec); + RVV_TYPE_F(PREC, LMUL) xvec_imag = RVV_GET_IMAG(PREC, LMUL, xvec); + RVV_TYPE_F(PREC, LMUL) yvec_real = RVV_GET_REAL(PREC, LMUL, yvec); + RVV_TYPE_F(PREC, LMUL) yvec_imag = RVV_GET_IMAG(PREC, LMUL, yvec); + // xpbyv is computed with FMAs as follows: // y[i].real = ( x[i].real + beta.real * y[i].real) - beta.imag * y[i].imag // y[i].imag = (conjx(x[i].imag + beta.imag * y[i].real) + beta.real * y[i].imag @@ -78,11 +83,14 @@ XPBYV(PRECISION_CHAR, void) xvec_imag = VFMSAC_VF(PREC, LMUL)(xvec_imag, beta->imag, yvec_real, vl); xvec_imag = VFMACC_VF(PREC, LMUL)(xvec_imag, beta->real, yvec_imag, vl); + RVV_SET_REAL(PREC, LMUL, xvec, xvec_real); + RVV_SET_IMAG(PREC, LMUL, xvec, xvec_imag); + if (incy == 1) - VSSEG2_V_F(PREC, LMUL)( (BASE_DT*) y, xvec_real, xvec_imag, vl); + VSSEG2_V_F(PREC, LMUL)( (BASE_DT*) y, xvec, vl); else - VSSSEG2_V_F(PREC, LMUL)((BASE_DT*) y, 2*FLT_SIZE*incy, xvec_real, xvec_imag, vl); - + VSSSEG2_V_F(PREC, LMUL)( (BASE_DT*) y, 2*FLT_SIZE*incy, xvec, vl); + x += vl*incx; y += vl*incy; avl -= vl; diff --git a/kernels/sifive_x280/1f/bli_axpy2v_sifive_x280_intr/bli_axpy2v_sifive_x280_intr_complex.c b/kernels/sifive_x280/1f/bli_axpy2v_sifive_x280_intr/bli_axpy2v_sifive_x280_intr_complex.c index b5779f2808..0528bf9cd1 100644 --- a/kernels/sifive_x280/1f/bli_axpy2v_sifive_x280_intr/bli_axpy2v_sifive_x280_intr_complex.c +++ b/kernels/sifive_x280/1f/bli_axpy2v_sifive_x280_intr/bli_axpy2v_sifive_x280_intr_complex.c @@ -43,7 +43,7 @@ AXPY2V(PRECISION_CHAR, void) const DATATYPE* restrict x = x_; const DATATYPE* restrict y = y_; DATATYPE* restrict z = z_; - + if (n <= 0) return; @@ -51,22 +51,29 @@ AXPY2V(PRECISION_CHAR, void) while (avl) { size_t vl = VSETVL(PREC, LMUL)(avl); - RVV_TYPE_F(PREC, LMUL) xvec_real, xvec_imag, yvec_real, yvec_imag, zvec_real, zvec_imag; + RVV_TYPE_F_X2(PREC, LMUL) xvec, yvec, zvec; if (incx == 1) - VLSEG2_V_F(PREC, LMUL)( &xvec_real, &xvec_imag, (BASE_DT*) x, vl); + xvec = VLSEG2_V_F(PREC, LMUL)( (BASE_DT*) x, vl); else - VLSSEG2_V_F(PREC, LMUL)(&xvec_real, &xvec_imag, (BASE_DT*) x, 2*FLT_SIZE*incx, vl); - + xvec = VLSSEG2_V_F(PREC, LMUL)( (BASE_DT*) x, 2*FLT_SIZE*incx, vl); + if (incy == 1) - VLSEG2_V_F(PREC, LMUL)( &yvec_real, &yvec_imag, (BASE_DT*) y, vl); + yvec = VLSEG2_V_F(PREC, LMUL)( (BASE_DT*) y, vl); else - VLSSEG2_V_F(PREC, LMUL)(&yvec_real, &yvec_imag, (BASE_DT*) y, 
2*FLT_SIZE*incy, vl); - + yvec = VLSSEG2_V_F(PREC, LMUL)( (BASE_DT*) y, 2*FLT_SIZE*incy, vl); + if (incz == 1) - VLSEG2_V_F(PREC, LMUL)( &zvec_real, &zvec_imag, (BASE_DT*) z, vl); + zvec = VLSEG2_V_F(PREC, LMUL)( (BASE_DT*) z, vl); else - VLSSEG2_V_F(PREC, LMUL)(&zvec_real, &zvec_imag, (BASE_DT*) z, 2*FLT_SIZE*incz, vl); + zvec = VLSSEG2_V_F(PREC, LMUL)( (BASE_DT*) z, 2*FLT_SIZE*incz, vl); + + RVV_TYPE_F(PREC, LMUL) xvec_real = RVV_GET_REAL(PREC, LMUL, xvec); + RVV_TYPE_F(PREC, LMUL) xvec_imag = RVV_GET_IMAG(PREC, LMUL, xvec); + RVV_TYPE_F(PREC, LMUL) yvec_real = RVV_GET_REAL(PREC, LMUL, yvec); + RVV_TYPE_F(PREC, LMUL) yvec_imag = RVV_GET_IMAG(PREC, LMUL, yvec); + RVV_TYPE_F(PREC, LMUL) zvec_real = RVV_GET_REAL(PREC, LMUL, zvec); + RVV_TYPE_F(PREC, LMUL) zvec_imag = RVV_GET_IMAG(PREC, LMUL, zvec); // + alphax * conjx(x) zvec_real = VFMACC_VF(PREC, LMUL)( zvec_real, alphax->real, xvec_real, vl); @@ -90,11 +97,14 @@ AXPY2V(PRECISION_CHAR, void) zvec_imag = VFNMSAC_VF(PREC, LMUL)(zvec_imag, alphay->real, yvec_imag, vl); } + RVV_SET_REAL(PREC, LMUL, zvec, zvec_real); + RVV_SET_IMAG(PREC, LMUL, zvec, zvec_imag); + if (incz == 1) - VSSEG2_V_F(PREC, LMUL)( (BASE_DT*) z, zvec_real, zvec_imag, vl); + VSSEG2_V_F(PREC, LMUL)( (BASE_DT*) z, zvec, vl); else - VSSSEG2_V_F(PREC, LMUL)((BASE_DT*) z, 2*FLT_SIZE*incz, zvec_real, zvec_imag, vl); - + VSSSEG2_V_F(PREC, LMUL)((BASE_DT*) z, 2*FLT_SIZE*incz, zvec, vl); + x += vl*incx; y += vl*incy; z += vl*incz; diff --git a/kernels/sifive_x280/1f/bli_dotaxpyv_sifive_x280_intr/bli_dotaxpyv_sifive_x280_intr_complex.c b/kernels/sifive_x280/1f/bli_dotaxpyv_sifive_x280_intr/bli_dotaxpyv_sifive_x280_intr_complex.c index 6b25add2d1..b228c3ddd4 100644 --- a/kernels/sifive_x280/1f/bli_dotaxpyv_sifive_x280_intr/bli_dotaxpyv_sifive_x280_intr_complex.c +++ b/kernels/sifive_x280/1f/bli_dotaxpyv_sifive_x280_intr/bli_dotaxpyv_sifive_x280_intr_complex.c @@ -54,23 +54,30 @@ DOTAXPYV(PRECISION_CHAR, void) while (avl) { size_t vl = VSETVL(PREC, LMUL)(avl); - RVV_TYPE_F(PREC, LMUL) xvec_real, xvec_imag, yvec_real, yvec_imag, zvec_real, zvec_imag; + RVV_TYPE_F_X2(PREC, LMUL) xvec, yvec, zvec; // Loads if (incx == 1) - VLSEG2_V_F(PREC, LMUL)( &xvec_real, &xvec_imag, (BASE_DT*) x, vl); + xvec = VLSEG2_V_F(PREC, LMUL)( (BASE_DT*) x, vl); else - VLSSEG2_V_F(PREC, LMUL)(&xvec_real, &xvec_imag, (BASE_DT*) x, 2*FLT_SIZE*incx, vl); - + xvec = VLSSEG2_V_F(PREC, LMUL)( (BASE_DT*) x, 2*FLT_SIZE*incx, vl); + if (incy == 1) - VLSEG2_V_F(PREC, LMUL)( &yvec_real, &yvec_imag, (BASE_DT*) y, vl); + yvec = VLSEG2_V_F(PREC, LMUL)( (BASE_DT*) y, vl); else - VLSSEG2_V_F(PREC, LMUL)(&yvec_real, &yvec_imag, (BASE_DT*) y, 2*FLT_SIZE*incy, vl); - + yvec = VLSSEG2_V_F(PREC, LMUL)( (BASE_DT*) y, 2*FLT_SIZE*incy, vl); + if (incz == 1) - VLSEG2_V_F(PREC, LMUL)( &zvec_real, &zvec_imag, (BASE_DT*) z, vl); + zvec = VLSEG2_V_F(PREC, LMUL)( (BASE_DT*) z, vl); else - VLSSEG2_V_F(PREC, LMUL)(&zvec_real, &zvec_imag, (BASE_DT*) z, 2*FLT_SIZE*incz, vl); + zvec = VLSSEG2_V_F(PREC, LMUL)( (BASE_DT*) z, 2*FLT_SIZE*incz, vl); + + RVV_TYPE_F(PREC, LMUL) xvec_real = RVV_GET_REAL(PREC, LMUL, xvec); + RVV_TYPE_F(PREC, LMUL) xvec_imag = RVV_GET_IMAG(PREC, LMUL, xvec); + RVV_TYPE_F(PREC, LMUL) yvec_real = RVV_GET_REAL(PREC, LMUL, yvec); + RVV_TYPE_F(PREC, LMUL) yvec_imag = RVV_GET_IMAG(PREC, LMUL, yvec); + RVV_TYPE_F(PREC, LMUL) zvec_real = RVV_GET_REAL(PREC, LMUL, zvec); + RVV_TYPE_F(PREC, LMUL) zvec_imag = RVV_GET_IMAG(PREC, LMUL, zvec); // z := z + alpha * conjx(x) zvec_real = VFMACC_VF(PREC, LMUL)( zvec_real, alpha->real, 
xvec_real, vl); @@ -82,7 +89,7 @@ DOTAXPYV(PRECISION_CHAR, void) zvec_real = VFMACC_VF(PREC, LMUL)( zvec_real, alpha->imag, xvec_imag, vl); zvec_imag = VFNMSAC_VF(PREC, LMUL)(zvec_imag, alpha->real, xvec_imag, vl); } - + // rho := conjxt(x)^T * conjy(y) // We accumulate the current term of the dot product as (a*c-b*d) + (a*d+b*c)*i, // conjugating when necessary @@ -114,13 +121,16 @@ DOTAXPYV(PRECISION_CHAR, void) acc_imag = VFMACC_VV_TU(PREC, LMUL)( acc_imag, xvec_imag, yvec_real, vl); else acc_imag = VFNMSAC_VV_TU(PREC, LMUL)( acc_imag, xvec_imag, yvec_real, vl); - + + RVV_SET_REAL(PREC, LMUL, zvec, zvec_real); + RVV_SET_IMAG(PREC, LMUL, zvec, zvec_imag); + // Stores if (incz == 1) - VSSEG2_V_F(PREC, LMUL)( (BASE_DT*) z, zvec_real, zvec_imag, vl); + VSSEG2_V_F(PREC, LMUL)( (BASE_DT*) z, zvec, vl); else - VSSSEG2_V_F(PREC, LMUL)((BASE_DT*) z, 2*FLT_SIZE*incz, zvec_real, zvec_imag, vl); - + VSSSEG2_V_F(PREC, LMUL)((BASE_DT*) z, 2*FLT_SIZE*incz, zvec, vl); + x += vl*incx; y += vl*incy; z += vl*incz; diff --git a/kernels/sifive_x280/riscv_overloaded_intrinsics.h b/kernels/sifive_x280/riscv_overloaded_intrinsics.h index 68b4b3224d..2ce7f92414 100644 --- a/kernels/sifive_x280/riscv_overloaded_intrinsics.h +++ b/kernels/sifive_x280/riscv_overloaded_intrinsics.h @@ -35,6 +35,8 @@ // 6. Configuration-Setting and Utility Functions #define RVV_TYPE_F_(PRECISION, LMUL) vfloat##PRECISION##LMUL##_t #define RVV_TYPE_F(PRECISION, LMUL) RVV_TYPE_F_(PRECISION, LMUL) +#define RVV_TYPE_F_X2_(PRECISION, LMUL) vfloat##PRECISION##LMUL##x2_t +#define RVV_TYPE_F_X2(PRECISION, LMUL) RVV_TYPE_F_X2_(PRECISION, LMUL) #define VSETVL_(PRECISION, LMUL) __riscv_vsetvl_e##PRECISION##LMUL #define VSETVL(PRECISION, LMUL) VSETVL_(PRECISION, LMUL) @@ -44,18 +46,18 @@ #define VLE_V_F(PRECISION, LMUL) VLE_V_F_(PRECISION, LMUL) #define VLSE_V_F_(PRECISION, LMUL) __riscv_vlse##PRECISION##_v_f##PRECISION##LMUL #define VLSE_V_F(PRECISION, LMUL) VLSE_V_F_(PRECISION, LMUL) -#define VLSEG2_V_F_(PRECISION, LMUL) __riscv_vlseg2e##PRECISION##_v_f##PRECISION##LMUL +#define VLSEG2_V_F_(PRECISION, LMUL) __riscv_vlseg2e##PRECISION##_v_f##PRECISION##LMUL##x2 #define VLSEG2_V_F(PRECISION, LMUL) VLSEG2_V_F_(PRECISION, LMUL) -#define VLSSEG2_V_F_(PRECISION, LMUL) __riscv_vlsseg2e##PRECISION##_v_f##PRECISION##LMUL +#define VLSSEG2_V_F_(PRECISION, LMUL) __riscv_vlsseg2e##PRECISION##_v_tuple_f##PRECISION##LMUL##x2 #define VLSSEG2_V_F(PRECISION, LMUL) VLSSEG2_V_F_(PRECISION, LMUL) // Stores #define VSE_V_F_(PRECISION, LMUL) __riscv_vse##PRECISION##_v_f##PRECISION##LMUL #define VSE_V_F(PRECISION, LMUL) VSE_V_F_(PRECISION, LMUL) #define VSSE_V_F_(PRECISION, LMUL) __riscv_vsse##PRECISION##_v_f##PRECISION##LMUL #define VSSE_V_F(PRECISION, LMUL) VSSE_V_F_(PRECISION, LMUL) -#define VSSEG2_V_F_(PRECISION, LMUL) __riscv_vsseg2e##PRECISION##_v_f##PRECISION##LMUL +#define VSSEG2_V_F_(PRECISION, LMUL) __riscv_vsseg2e##PRECISION##_v_tuple_f##PRECISION##LMUL##x2 #define VSSEG2_V_F(PRECISION, LMUL) VSSEG2_V_F_(PRECISION, LMUL) -#define VSSSEG2_V_F_(PRECISION, LMUL) __riscv_vssseg2e##PRECISION##_v_f##PRECISION##LMUL +#define VSSSEG2_V_F_(PRECISION, LMUL) __riscv_vssseg2e##PRECISION##_v_tuple_f##PRECISION##LMUL##x2 #define VSSSEG2_V_F(PRECISION, LMUL) VSSSEG2_V_F_(PRECISION, LMUL) // 13. 
Vector Floating-Point Operations @@ -106,6 +108,22 @@ #define VREINTERPRET_V_F_I_(PRECISION, LMUL) __riscv_vreinterpret_v_f##PRECISION##LMUL##_i##PRECISION##LMUL #define VREINTERPRET_V_F_I(PRECISION, LMUL) VREINTERPRET_V_F_I_(PRECISION, LMUL) +// Vector tuple field getters/setters +#define RVV_GET_SET_PAIR(PRECISION, LMUL1, LMUL2) f##PRECISION##LMUL1##_f##PRECISION##LMUL2 + +#define RVV_GET_2FIELDS__(PAIR) __riscv_vget_v_##PAIR +#define RVV_GET_2FIELDS_(PAIR) RVV_GET_2FIELDS__(PAIR) +#define RVV_GET_2FIELDS(PRECISION, LMUL) RVV_GET_2FIELDS_(RVV_GET_SET_PAIR(PRECISION, LMUL##x2, LMUL)) + +#define RVV_SET_2FIELDS__(PAIR) __riscv_vset_v_##PAIR +#define RVV_SET_2FIELDS_(PAIR) RVV_SET_2FIELDS__(PAIR) +#define RVV_SET_2FIELDS(PRECISION, LMUL) RVV_SET_2FIELDS_(RVV_GET_SET_PAIR(PRECISION, LMUL, LMUL##x2)) + +// Complex vector tuple getters/setters +#define RVV_GET_REAL(PRECISION, LMUL, SRC) RVV_GET_2FIELDS(PRECISION, LMUL)(SRC, 0) +#define RVV_SET_REAL(PRECISION, LMUL, DEST, SRC) RVV_SET_2FIELDS(PRECISION, LMUL)(DEST, 0, SRC) +#define RVV_GET_IMAG(PRECISION, LMUL, SRC) RVV_GET_2FIELDS(PRECISION, LMUL)(SRC, 1) +#define RVV_SET_IMAG(PRECISION, LMUL, DEST, SRC) RVV_SET_2FIELDS(PRECISION, LMUL)(DEST, 1, SRC) // Non-vector functions #define CURRY_1ARG(arg1, ...) (arg1), __VA_ARGS__))
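Note (illustration only, not part of the patch): the sketch below shows the usage pattern of the tuple-based segment intrinsics that the new RVV_TYPE_F_X2 / RVV_GET_* / RVV_SET_* macros wrap, written out for a unit-stride complex vector add. It assumes the spellings from the rvv-intrinsic-doc tuple-type scheme linked above (__riscv_vlseg2e32_v_f32m1x2, __riscv_vget_v_f32m1x2_f32m1, __riscv_vset_v_f32m1_f32m1x2, __riscv_vsseg2e32_v_f32m1x2); some toolchain revisions expose the "_v_tuple_" spellings used by VLSSEG2_V_F/VSSEG2_V_F/VSSSEG2_V_F instead. One detail that may be relevant to the wrong results described in the commit message: __riscv_vset_* returns the updated tuple rather than modifying its first argument, so the result has to be assigned back, whereas RVV_SET_REAL/RVV_SET_IMAG as defined above expand to a bare vset call whose return value the kernels discard.

/*
 * Minimal sketch of y := y + x for interleaved complex single-precision
 * arrays { re0, im0, re1, im1, ... }, each holding n complex elements,
 * using the tuple-based RVV segment load/store intrinsics directly.
 */
#include <riscv_vector.h>
#include <stddef.h>

static void caddv_sketch(size_t n, const float *x, float *y)
{
    for (size_t avl = n; avl > 0; ) {
        size_t vl = __riscv_vsetvl_e32m1(avl);

        /* Segment loads deinterleave into a 2-field tuple:
           field 0 = real parts, field 1 = imaginary parts. */
        vfloat32m1x2_t xv = __riscv_vlseg2e32_v_f32m1x2(x, vl);
        vfloat32m1x2_t yv = __riscv_vlseg2e32_v_f32m1x2(y, vl);

        vfloat32m1_t xr = __riscv_vget_v_f32m1x2_f32m1(xv, 0);
        vfloat32m1_t xi = __riscv_vget_v_f32m1x2_f32m1(xv, 1);
        vfloat32m1_t yr = __riscv_vget_v_f32m1x2_f32m1(yv, 0);
        vfloat32m1_t yi = __riscv_vget_v_f32m1x2_f32m1(yv, 1);

        yr = __riscv_vfadd_vv_f32m1(yr, xr, vl);
        yi = __riscv_vfadd_vv_f32m1(yi, xi, vl);

        /* vset returns the updated tuple; it must be assigned back.
           Dropping the return value leaves yv holding the old fields,
           and the segment store below would then write stale data. */
        yv = __riscv_vset_v_f32m1_f32m1x2(yv, 0, yr);
        yv = __riscv_vset_v_f32m1_f32m1x2(yv, 1, yi);

        /* Segment store re-interleaves the two fields back into memory. */
        __riscv_vsseg2e32_v_f32m1x2(y, yv, vl);

        x += 2 * vl;  /* advance by vl complex elements (2 floats each) */
        y += 2 * vl;
        avl -= vl;
    }
}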