From 00552709c2884fbd00d3178ed0e720b450118f5d Mon Sep 17 00:00:00 2001
From: Matteo Perotti
Date: Thu, 23 Mar 2023 14:46:29 +0100
Subject: [PATCH] [apps] Introduce new ad-hoc warming for fconv, fmatmul, jacobi

---
 apps/benchmarks/benchmark/fconv3d.bmark  |   9 +
 apps/benchmarks/benchmark/fmatmul.bmark  |   2 +
 apps/benchmarks/benchmark/jacobi2d.bmark |   9 +-
 apps/fconv3d/fconv3d.h                   |   6 +
 apps/fconv3d/fconv3d_3x7x7.c             | 369 +++++++++++++++++++++++
 apps/jacobi2d/kernel/jacobi2d.c          | 139 +++++++++
 6 files changed, 533 insertions(+), 1 deletion(-)

diff --git a/apps/benchmarks/benchmark/fconv3d.bmark b/apps/benchmarks/benchmark/fconv3d.bmark
index 8bbf43ec2..b2af37e3e 100644
--- a/apps/benchmarks/benchmark/fconv3d.bmark
+++ b/apps/benchmarks/benchmark/fconv3d.bmark
@@ -40,8 +40,17 @@ extern int64_t CH;
 extern int64_t F;
 
 void warm_caches(uint64_t heat) {
+  volatile double buf;
+
   for (uint64_t k = 0; k < heat; ++k)
     fconv3d_CHx7x7(o, i, f, M, N, CH, F);
+
+// With a larger cache, the following artificial warming ensures that no
+// cache misses are experienced during the measured run
+#ifdef AD_HOC_WARMING
+  for (uint64_t k = 0; k < F * F * CH; ++k)
+    buf = (volatile double) *(&(f[k]));
+  fconv3d_CHx7x7_warm(o, i, f, M, N, CH, F);
+#endif
 }
 
 int main() {
diff --git a/apps/benchmarks/benchmark/fmatmul.bmark b/apps/benchmarks/benchmark/fmatmul.bmark
index a8d90b95e..d394aaadc 100644
--- a/apps/benchmarks/benchmark/fmatmul.bmark
+++ b/apps/benchmarks/benchmark/fmatmul.bmark
@@ -42,12 +42,14 @@ void warm_caches(uint64_t heat) {
   for (uint64_t k = 0; k < heat; ++k)
     fmatmul(c, a, b, M, N, P);
 
+#ifdef AD_HOC_WARMING
   // Vector stores have invalidated the A mtx cache lines!
   // Fetch them again
   for (int m = 0; m < M; ++m) {
     buf = (volatile double) *a_;
     a_ += N;
   }
+#endif
 }
 
 int main() {
diff --git a/apps/benchmarks/benchmark/jacobi2d.bmark b/apps/benchmarks/benchmark/jacobi2d.bmark
index c3b271c63..48ba4ddc0 100644
--- a/apps/benchmarks/benchmark/jacobi2d.bmark
+++ b/apps/benchmarks/benchmark/jacobi2d.bmark
@@ -102,8 +102,15 @@ extern DATA_TYPE A_s[] __attribute__((aligned(4 * NR_LANES), section(".l2")));
 extern DATA_TYPE B_s[] __attribute__((aligned(4 * NR_LANES), section(".l2")));
 
 void warm_caches(uint64_t heat, DATA_TYPE* A_fixed_v, DATA_TYPE* B_fixed_v) {
+
+  volatile double buf;
+
   for (uint64_t k = 0; k < heat; ++k)
     j2d_v(R, C, A_fixed_v, B_fixed_v, TSTEPS);
+#ifdef AD_HOC_WARMING
+  for (uint64_t k = 0; k < R * C; ++k)
+    buf = (volatile double) *(&(A_fixed_v[k]));
+#endif
 }
 
 int main() {
@@ -117,7 +124,7 @@ int main() {
 
 #ifndef SPIKE
   // Warm-up caches
-  warm_caches(WARM_CACHES_ITER, A_fixed_s, B_fixed_s);
+  warm_caches(WARM_CACHES_ITER, A_fixed_v, B_fixed_s);
 #endif
 
   // Measure vector kernel execution
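All three AD_HOC_WARMING blocks rely on the same idea: after the warm-up runs of the kernel, every operand element is touched again through a volatile scalar read, so lines evicted or invalidated by the vector stores are back in the cache before the measured run. A minimal stand-alone sketch of the idea (function names, array names and sizes here are illustrative, not taken from the benchmarks):

#include <stdint.h>

/* Touch every element of an operand through a volatile read so that cache
   lines evicted or invalidated by earlier vector stores are pulled back in
   before the measured run. */
static void warm_array(const double *p, uint64_t n) {
  volatile double buf;
  for (uint64_t k = 0; k < n; ++k)
    buf = p[k]; /* assigning to a volatile forces a real load per element */
  (void)buf;
}

void warm_caches_sketch(const double *filt, uint64_t f_len, uint64_t heat) {
  for (uint64_t k = 0; k < heat; ++k) {
    /* run the kernel under test once per heat iteration (omitted here) */
  }
#ifdef AD_HOC_WARMING
  warm_array(filt, f_len);
#endif
}

The volatile qualifier on buf is what keeps the compiler from optimizing the touch loop away.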
diff --git a/apps/fconv3d/fconv3d.h b/apps/fconv3d/fconv3d.h
index 386e55069..77c76a680 100644
--- a/apps/fconv3d/fconv3d.h
+++ b/apps/fconv3d/fconv3d.h
@@ -28,6 +28,12 @@ void fconv3d_CHx7x7(double *o, double *i, double *f, int64_t M, int64_t N,
 void fconv3d_CHx7x7_block(double *o, double *i, double *f, int64_t M, int64_t N,
                           int64_t n_, int64_t C, int64_t F);
 
+void fconv3d_CHx7x7_warm(double *o, double *i, double *f, int64_t M, int64_t N,
+                         int64_t C, int64_t F);
+
+void fconv3d_warm(double *o, double *i, double *f, int64_t M, int64_t N,
+                  int64_t n_, int64_t C, int64_t F);
+
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 
 // Threshold for FP numbers comparison during the final check
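fconv3d_CHx7x7_warm, added in the next diff, walks the output columns with the usual RVV strip-mining pattern: vsetvli returns how many e64 elements fit in a vector register group, and the loop advances by that amount. A minimal sketch of the pattern, assuming an RVV-capable toolchain (the function and buffer names are illustrative; the real warm kernel calls fconv3d_warm() on each column block instead of a plain load):

#include <stdint.h>

/* Strip-mine over n doubles with e64 elements and LMUL=2, touching each
   block with a unit-stride vector load. Illustrative only. */
void stripmine_touch(const double *v, uint64_t n) {
  uint64_t vl;
  for (uint64_t done = 0; done < n; done += vl) {
    asm volatile("vsetvli %0, %1, e64, m2, ta, ma"
                 : "=r"(vl)
                 : "r"(n - done));
    asm volatile("vle64.v v0, (%0)" ::"r"(v + done) : "memory");
  }
}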
diff --git a/apps/fconv3d/fconv3d_3x7x7.c b/apps/fconv3d/fconv3d_3x7x7.c
index a2c01cd72..57b20efd9 100644
--- a/apps/fconv3d/fconv3d_3x7x7.c
+++ b/apps/fconv3d/fconv3d_3x7x7.c
@@ -75,6 +75,29 @@ void fconv3d_CHx7x7(double *o, double *i, double *f, int64_t M, int64_t N,
   }
 }
 
+void fconv3d_CHx7x7_warm(double *o, double *i, double *f, int64_t M, int64_t N,
+                         int64_t C, int64_t F) {
+
+  unsigned long int block_size_n;
+
+  // Set the vector configuration
+  asm volatile("vsetvli %0, %1, e64, m2, ta, ma" : "=r"(block_size_n) : "r"(N));
+
+  // Slice the matrix into a manageable number of columns n_
+  for (unsigned long int n = 0; n < N; n += block_size_n) {
+    // Set the vector length
+    const unsigned long int n_ = MIN(N - n, block_size_n);
+
+    // Find pointers to the submatrices
+    const double *i_ = i + n;
+    double *o_ = o + n;
+
+    asm volatile("vsetvli zero, %0, e64, m2, ta, ma" ::"r"(n_));
+
+    fconv3d_warm(o_, i_, f, M, N, n_, C, F);
+  }
+}
+
 void fconv3d_CHx7x7_block(double *o, double *i, double *f, int64_t M, int64_t N,
                           int64_t n_, int64_t C, int64_t F) {
 
@@ -883,6 +906,352 @@ void fconv3d_CHx7x7_block(double *o, double *i, double *f, int64_t M, int64_t N,
 
   asm volatile("vse64.v v28, (%0); add %0, %0, %1" : "+&r"(o) : "r"(ldo));
 }
+
+void fconv3d_warm(double *o, double *i, double *f, int64_t M, int64_t N,
+                  int64_t n_, int64_t C, int64_t F) {
+
+  // Helper variables
+  int64_t ldo = N << 3;
+  int64_t ldi_pad = (N + F - 1) << 3;
+
+  // Number of elements that separates two adjacent channels
+  int64_t ich_len = (M + F - 1) * (N + F - 1);
+  int64_t fch_len = F * F;
+
+  double *i_ = i;
+  double *i__ = i;
+
+  // Very last column of coefficients
+  double fl0, fl1, fl2, fl3, fl4, fl5, fl6;
+  // Buffers for coefficients preloading (solve 16-lane starvation problem)
+  double f0_buf, f1_buf, f2_buf, f3_buf, f4_buf, f5_buf, f6_buf;
+
+  double *i_slide_ptr_0;
+  double *i_slide_ptr_1;
+  double *i_slide_ptr_2;
+  double *i_slide_ptr_3;
+
+  ////////////////
+  // Row 0 -> 3 //
+  ////////////////
+
+  // Loop on the channels
+  for (int ch = 0; ch < C; ++ch) {
+
+    // Point to the first element of the channel ch
+    i__ = i_ + ch * ich_len;
+
+    // Point to the scalar elements to insert during a slide
+    i_slide_ptr_0 = i__ + n_ + 0 * (N + F - 1);
+    i_slide_ptr_1 = i__ + n_ + 1 * (N + F - 1);
+    i_slide_ptr_2 = i__ + n_ + 2 * (N + F - 1);
+    i_slide_ptr_3 = i__ + n_ + 3 * (N + F - 1);
+
+    // Main kernel, unrolled by 2
+    // Unrolled because of double buffering
+    // With HW renaming, this unroll is not needed
+    for (int64_t k = 0; k < F / 2; ++k) {
+      // Two base indexes because of the unrolling
+      // Point to the first element of the current column (k) of the current
+      // channel (ch) of the filter (f)
+      int64_t base_idx_0 = (2 * k) + (ch * fch_len);
+      // Point to the first element of the current column (k+1) of the current
+      // channel (ch) of the filter (f)
+      int64_t base_idx_1 = (2 * k + 1) + (ch * fch_len);
+
+      if ((k | ch) == 0)
+        asm volatile("vfmul.vf v16, v0, %0" ::"f"(f[0 + base_idx_0]));
+      else
+        asm volatile("vfmacc.vf v16, %0, v0" ::"f"(f[0 + base_idx_0]));
+      if ((k | ch) == 0)
+        asm volatile("vfmul.vf v18, v4, %0" ::"f"(f[0 + base_idx_0]));
+      else
+        asm volatile("vfmacc.vf v18, %0, v4" ::"f"(f[0 + base_idx_0]));
+      asm volatile("vfslide1down.vf v2, v0, %0" ::"f"(*i_slide_ptr_0++));
+      asm volatile("vfmacc.vf v16, %0, v4" ::"f"(f[7 + base_idx_0]));
+      if ((k | ch) == 0)
+        asm volatile("vfmul.vf v22, v12, %0" ::"f"(f[0 + base_idx_0]));
+      else
+        asm volatile("vfmacc.vf v22, %0, v12" ::"f"(f[0 + base_idx_0]));
+      asm volatile("vfslide1down.vf v6, v4, %0" ::"f"(*i_slide_ptr_1++));
+      asm volatile("vfslide1down.vf v10, v8, %0" ::"f"(*i_slide_ptr_2++));
+      asm volatile("vfslide1down.vf v14, v12, %0" ::"f"(*i_slide_ptr_3++));
+      asm volatile("vfslide1down.vf v0, v2, %0" ::"f"(*i_slide_ptr_0++));
+      asm volatile("vfslide1down.vf v4, v6, %0" ::"f"(*i_slide_ptr_1++));
+      asm volatile("vfslide1down.vf v8, v10, %0" ::"f"(*i_slide_ptr_2++));
+      asm volatile("vfslide1down.vf v12, v14, %0" ::"f"(*i_slide_ptr_3++));
+      asm volatile("vfmacc.vf v20, %0, v14" ::"f"(f[7 + base_idx_1]));
+    }
+
+    int64_t base_idx_0 = (F - 1) + (ch * fch_len);
+
+    // Don't slide during the last iteration
+  }
+
+  // Bump the input ptr
+  i_ += 4 * (N + F - 1);
+
+  ////////////////
+  // Row 4 -> 6 //
+  ////////////////
+
+  // Loop on the channels
+  for (int ch = 0; ch < C; ++ch) {
+
+    // Point to the first element of the channel ch
+    i__ = i_ + ch * ich_len;
+
+    // Start calculating the next pointers to the elements to be slided in
+    i_slide_ptr_0 = i__ + n_ + 0 * (N + F - 1);
+    i_slide_ptr_1 = i__ + n_ + 1 * (N + F - 1);
+    i_slide_ptr_2 = i__ + n_ + 2 * (N + F - 1);
+
+    // Main kernel, unrolled by 2
+    for (int k = 0; k < F / 2; ++k) {
+      // Two base indexes because of the unrolling
+      // Point to the first element of the current column (k) of the current
+      // channel (ch) of the filter (f)
+      int64_t base_idx_0 = (2 * k) + (ch * fch_len);
+      // Point to the first element of the current column (k+1) of the current
+      // channel (ch) of the filter (f)
+      int64_t base_idx_1 = (2 * k + 1) + (ch * fch_len);
+
+      // Unroll 0
+      asm volatile("vfslide1down.vf v0, v2, %0" ::"f"(*i_slide_ptr_0++));
+
+      asm volatile("vfmacc.vf v18, %0, v10" ::"f"(f[35 + base_idx_0]));
+      asm volatile("vfslide1down.vf v4, v6, %0" ::"f"(*i_slide_ptr_1++));
+
+      asm volatile("vfslide1down.vf v8, v10, %0" ::"f"(*i_slide_ptr_2++));
+
+      if ((k | ch) == 0)
+        asm volatile("vfmul.vf v26, v6, %0" ::"f"(f[0 + base_idx_0]));
+      else
+        asm volatile("vfmacc.vf v26, %0, v6" ::"f"(f[0 + base_idx_0]));
+      asm volatile("vfmacc.vf v26, %0, v10" ::"f"(f[7 + base_idx_0]));
+
+      if ((k | ch) == 0)
+        asm volatile("vfmul.vf v28, v10, %0" ::"f"(f[0 + base_idx_0]));
+      else
+        asm volatile("vfmacc.vf v28, %0, v10" ::"f"(f[0 + base_idx_0]));
+
+      // Unroll 1
+      asm volatile("vfslide1down.vf v2, v0, %0" ::"f"(*i_slide_ptr_0++));
+
+      asm volatile("vfslide1down.vf v6, v4, %0" ::"f"(*i_slide_ptr_1++));
+
+      asm volatile("vfslide1down.vf v10, v8, %0" ::"f"(*i_slide_ptr_2++));
+    }
+
+    // The very last iterations require mixing the instructions with the store
+    // and the moves
+    if (ch != C - 1) {
+      // Point to the first element of the current column (k) of the current
+      // channel (ch) of the filter (f)
+      int64_t base_idx_0 = (F - 1) + (ch * fch_len);
+
+      // Don't slide the elements here
+    }
+  }
+
+  // Reuse preloaded coefficients
+  // Buffer the next coefficients for faster use
+
+  // Bump the input ptr
+  i_ += 3 * (N + F - 1);
+
+  ////////////
+  // REGIME //
+  ////////////
+
+  // The following loop is unrolled by 2
+  // The input matrix has M + F - 1 rows
+  // We have computed F input rows already
+  // Compute now until only F input rows are left
+  // (The last F-1 rows do not contribute to F output rows each, so keep them
+  // outside of this loop) (We keep F rows outside because of the unrolling by
+  // 2, just for ease)
+  for (int j = 0; j < ((M + F - 1) - 2 * F) / 2; ++j) {
+
+    // Work on F output rows
+
+    // Loop on the channels
+    for (int ch = 0; ch < C; ++ch) {
+      // Point to the first element of the channel ch
+      i__ = i_ + ch * ich_len;
+
+      // Start calculating the next pointers to the elements to be slided in
+      i_slide_ptr_0 = i__ + n_;
+
+      for (int k = 0; k < F / 2; ++k) {
+        // Two base indexes because of the unrolling
+        // Look ahead to the first element of the current column (k+2) of the
+        // current channel (ch) of the filter (f)
+        int64_t base_idx_0 = (2 * k + 2) + (ch * fch_len);
+        // Point to the first element of the current column (k+1) of the
+        // current channel (ch) of the filter (f)
+        int64_t base_idx_1 = (2 * k + 1) + (ch * fch_len);
+
+        asm volatile("vfslide1down.vf v2, v0, %0" ::"f"(*i_slide_ptr_0++));
+        f1_buf = f[7 + base_idx_1];
+        if ((k | ch) == 0)
+          asm volatile("vfmul.vf v28, v0, %0" ::"f"(f0_buf));
+        else
+          asm volatile("vfmacc.vf v28, %0, v0" ::"f"(f0_buf));
+        f0_buf = f[0 + base_idx_1];
+
+        // Calculate F contributions of the input rows, on F different output
+        // rows
+        asm volatile("vfmacc.vf v16, %0, v2" ::"f"(f6_buf));
+        asm volatile("vfmacc.vf v18, %0, v2" ::"f"(f5_buf));
+        f6_buf = f[42 + base_idx_0];
+        asm volatile("vfmacc.vf v20, %0, v2" ::"f"(f4_buf));
+        f5_buf = f[35 + base_idx_0];
+        asm volatile("vfslide1down.vf v0, v2, %0" ::"f"(*i_slide_ptr_0++));
+        f4_buf = f[28 + base_idx_0];
+        asm volatile("vfmacc.vf v22, %0, v2" ::"f"(f3_buf));
+        f3_buf = f[21 + base_idx_0];
+        asm volatile("vfmacc.vf v24, %0, v2" ::"f"(f2_buf));
+        f2_buf = f[14 + base_idx_0];
+        asm volatile("vfmacc.vf v26, %0, v2" ::"f"(f1_buf));
+        f1_buf = f[7 + base_idx_0];
+        asm volatile("vfmacc.vf v28, %0, v2" ::"f"(f0_buf));
+        f0_buf = f[0 + base_idx_0];
+      }
+
+      if (ch != C - 1) {
+        int64_t base_idx_0 = (ch + 1) * fch_len;
+      }
+    }
+  }
+
+  // Bump the input ptr
+  i_ += N + F - 1;
+
+#ifdef VCD_DUMP
+  // Stop dumping VCD
+  event_trigger = -1;
+#endif
+
+  //////////////
+  // UNROLL 1 //
+  //////////////
+
+  // Loop on the channels
+  for (int ch = 0; ch < C; ++ch) {
+
+    // Point to the first element of the channel ch
+    i__ = i_ + ch * ich_len;
+
+    // Start calculating the next pointers to the elements to be slided in
+    i_slide_ptr_1 = i__ + n_;
+
+    for (int k = 0; k < F / 2; ++k) {
+      // Two base indexes because of the unrolling
+      // Point to the first element of the current column (k) of the current
+      // channel (ch) of the filter (f)
+      int64_t base_idx_0 = (2 * k + 2) + (ch * fch_len);
+      // Point to the first element of the current column (k+1) of the current
+      // channel (ch) of the filter (f)
+      int64_t base_idx_1 = (2 * k + 1) + (ch * fch_len);
+    }
+
+    // Bump the input ptr
+    i_ += N + F - 1;
+  }
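/*
 * The f0_buf..f6_buf scalars above implement a simple look-ahead: the
 * coefficient needed by the next vfmacc.vf is fetched while the current one
 * executes, so the scalar core stays ahead of the vector unit (the "16-lane
 * starvation" mentioned in the declarations). A minimal sketch of the same
 * pattern, with illustrative register choice and loop shape only:
 *
 *   double coeff_buf = f[0];                    // preload first coefficient
 *   for (int k = 0; k + 1 < F; ++k) {
 *     asm volatile("vfmacc.vf v16, %0, v0" ::"f"(coeff_buf));
 *     coeff_buf = f[k + 1];                     // fetch next while MAC runs
 *   }
 *   asm volatile("vfmacc.vf v16, %0, v0" ::"f"(coeff_buf));
 */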
+
+  ////////////////////////
+  // Row I-F -> (I-1)-3 //
+  ////////////////////////
+
+  for (int64_t ch = 0; ch < C; ++ch) {
+
+    // Point to the first element of the channel ch
+    i__ = i_ + ch * ich_len;
+
+    // Point to the scalar elements to insert during a slide
+    // i_slide_ptr_0 has already been computed
+    i_slide_ptr_0 = i__ + n_ + 0 * (N + F - 1);
+    i_slide_ptr_1 = i__ + n_ + 1 * (N + F - 1);
+    i_slide_ptr_2 = i__ + n_ + 2 * (N + F - 1);
+    i_slide_ptr_3 = i__ + n_ + 3 * (N + F - 1);
+
+    // Main kernel, unrolled by 2
+    // Process 4 input rows
+    for (int k = 0; k < F / 2; ++k) {
+      // Two base indexes because of the unrolling
+      // Point to the first element of the current column (k) of the current
+      // channel (ch) of the filter (f)
+      int64_t base_idx_0 = (2 * k) + (ch * fch_len);
+      // Point to the first element of the current column (k+1) of the current
+      // channel (ch) of the filter (f)
+      int64_t base_idx_1 = (2 * k + 1) + (ch * fch_len);
+
+      asm volatile("vfslide1down.vf v2, v0, %0" ::"f"(*i_slide_ptr_0++));
+      if ((k | ch) == 0)
+        asm volatile("vfmul.vf v28, v0, %0" ::"f"(f[0 + base_idx_0]));
+      else
+        asm volatile("vfmacc.vf v28, %0, v0" ::"f"(f[0 + base_idx_0]));
+      asm volatile("vfslide1down.vf v6, v4, %0" ::"f"(*i_slide_ptr_1++));
+      asm volatile("vfslide1down.vf v10, v8, %0" ::"f"(*i_slide_ptr_2++));
+      asm volatile("vfslide1down.vf v14, v12, %0" ::"f"(*i_slide_ptr_3++));
+
+      asm volatile("vfslide1down.vf v0, v2, %0" ::"f"(*i_slide_ptr_0++));
+      asm volatile("vfslide1down.vf v4, v6, %0" ::"f"(*i_slide_ptr_1++));
+      asm volatile("vfslide1down.vf v8, v10, %0" ::"f"(*i_slide_ptr_2++));
+      asm volatile("vfslide1down.vf v12, v14, %0" ::"f"(*i_slide_ptr_3++));
+    }
+  }
+
+  // Bump the input ptr
+  i_ += 4 * (N + F - 1);
+
+  //////////////////////////
+  // Row (I-1)-3 -> (I-1) //
+  //////////////////////////
+
+  for (int64_t ch = 0; ch < C; ++ch) {
+
+    // Point to the first element of the channel ch
+    i__ = i_ + ch * ich_len;
+
+    // Start calculating the next pointers to the elements to be slided in
+    i_slide_ptr_0 = i__ + n_ + 0 * (N + F - 1);
+    i_slide_ptr_1 = i__ + n_ + 1 * (N + F - 1);
+    i_slide_ptr_2 = i__ + n_ + 2 * (N + F - 1);
+
+    // Main kernel, unrolled by 2
+    for (int k = 0; k < F / 2; ++k) {
+      // Two base indexes because of the unrolling
+      // Point to the first element of the current column (k) of the current
+      // channel (ch) of the filter (f)
+      int64_t base_idx_0 = (2 * k) + (ch * fch_len);
+      // Point to the first element of the current column (k+1) of the current
+      // channel (ch) of the filter (f)
+      int64_t base_idx_1 = (2 * k + 1) + (ch * fch_len);
+
+      asm volatile("vfslide1down.vf v0, v2, %0" ::"f"(*i_slide_ptr_0++));
+      asm volatile("vfslide1down.vf v4, v6, %0" ::"f"(*i_slide_ptr_1++));
+      asm volatile("vfslide1down.vf v8, v10, %0" ::"f"(*i_slide_ptr_2++));
+
+      asm volatile("vfslide1down.vf v2, v0, %0" ::"f"(*i_slide_ptr_0++));
+      asm volatile("vfslide1down.vf v6, v4, %0" ::"f"(*i_slide_ptr_1++));
+      asm volatile("vfslide1down.vf v10, v8, %0" ::"f"(*i_slide_ptr_2++));
+    }
+
+    if (ch != C - 1) {
+      int64_t base_idx_0 = (F - 1) + (ch * fch_len);
+    }
+  }
+}
+
 /*
 ////////////////////
 // MAIN ALGORITHM //
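The jacobi2d.c diff below vectorizes the five-point Jacobi update. For reference, the scalar computation that j2d_kernel_adhoc_warm performs on the interior points is sketched here (DATA_TYPE is assumed to be double, matching the e64 vector code; the function name is illustrative):

#include <stdint.h>

/* Scalar reference for one Jacobi-2D sweep over the interior points:
   B[i][j] = 0.2 * (A[i][j] + A[i][j-1] + A[i][j+1] + A[i-1][j] + A[i+1][j]).
   Row-major layout, r rows and c columns; the borders are left untouched. */
void j2d_scalar_reference(uint64_t r, uint64_t c, const double *A, double *B) {
  for (uint64_t i = 1; i < r - 1; ++i)
    for (uint64_t j = 1; j < c - 1; ++j)
      B[i * c + j] = 0.2 * (A[i * c + j] + A[i * c + (j - 1)] +
                            A[i * c + (j + 1)] + A[(i - 1) * c + j] +
                            A[(i + 1) * c + j]);
}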
diff --git a/apps/jacobi2d/kernel/jacobi2d.c b/apps/jacobi2d/kernel/jacobi2d.c
index d04644648..b72ff7759 100644
--- a/apps/jacobi2d/kernel/jacobi2d.c
+++ b/apps/jacobi2d/kernel/jacobi2d.c
@@ -143,6 +143,145 @@ void j2d_kernel_v(uint64_t r, uint64_t c, DATA_TYPE *A, DATA_TYPE *B) {
   }
 }
 
+// Optimized version of the jacobi2d kernel
+void j2d_kernel_adhoc_warm(uint64_t r, uint64_t c, DATA_TYPE *A, DATA_TYPE *B) {
+  DATA_TYPE izq_0, izq_1, izq_2;
+  DATA_TYPE der_0, der_1, der_2;
+  uint32_t size_x = c - 2;
+  uint32_t size_y = r - 2;
+  // Simplify pointer calc
+  uint32_t sc_ptr_0, sc_ptr_1;
+  uint32_t mtx_ptr_0, mtx_ptr_1;
+
+  // Avoid division. 1/5 == 0.2
+  double five_ = 0.2;
+
+  size_t gvl;
+
+  asm volatile("vsetvli %0, %1, e64, m4, ta, ma" : "=r"(gvl) : "r"(size_x));
+
+  for (uint32_t j = 1; j <= size_x; j = j + gvl) {
+    asm volatile("vsetvli %0, %1, e64, m4, ta, ma"
+                 : "=r"(gvl)
+                 : "r"(size_x - j + 1));
+    mtx_ptr_0 = j; // 0 * c + j
+    asm volatile("vle64.v v0, (%0)" ::"r"(&A[mtx_ptr_0])); // v0 top
+    mtx_ptr_1 = j + c; // 1 * c + j
+    asm volatile("vle64.v v4, (%0)" ::"r"(&A[mtx_ptr_1])); // v4 middle
+    mtx_ptr_0 = mtx_ptr_1 + c; // 2 * c + j
+    asm volatile("vle64.v v8, (%0)" ::"r"(&A[mtx_ptr_0])); // v8 bottom
+
+    // Look ahead and load the next coefficients
+    // Do it before vector stores
+    sc_ptr_0 = mtx_ptr_1 - 1; // 1 * c + j - 1
+    izq_0 = A[sc_ptr_0];
+    sc_ptr_1 = mtx_ptr_1 + gvl; // 1 * c + j + gvl
+    der_0 = A[sc_ptr_1];
+
+    // mtx_ptr_0 = 2 * c + j
+    // mtx_ptr_1 = 1 * c + j
+    // sc_ptr_0  = 1 * c + j - 1
+    // sc_ptr_1  = 1 * c + j + gvl
+
+    for (uint32_t i = 1; i <= size_y; i += 3) {
+#ifdef VCD_DUMP
+      // Start dumping VCD
+      if (i == 7)
+        event_trigger = +1;
+      // Stop dumping VCD
+      if (i == 13)
+        event_trigger = -1;
+#endif
+      // mtx_ptr_0 = (i + 1) * c + j
+      // mtx_ptr_1 = i * c + j
+      // sc_ptr_0  = i * c + j - 1
+      // sc_ptr_1  = i * c + j + gvl
+
+      asm volatile("vfslide1up.vf v24, v4, %0" ::"f"(izq_0));
+      asm volatile("vfslide1down.vf v28, v4, %0" ::"f"(der_0));
+      asm volatile("vfadd.vv v12, v4, v0"); // middle + top
+      mtx_ptr_0 += c; // (i + 2) * c + j
+      asm volatile("vfadd.vv v12, v12, v8"); // bottom
+      sc_ptr_0 += c; // (i + 1) * c + j - 1
+      asm volatile("vfadd.vv v12, v12, v24"); // left
+      if ((i + 1) <= size_y) {
+        asm volatile("vle64.v v0, (%0)" ::"r"(&A[mtx_ptr_0])); // v0 top
+      }
+      asm volatile("vfadd.vv v12, v12, v28"); // right
+      sc_ptr_1 += c; // (i + 1) * c + j + gvl
+      asm volatile("vfmul.vf v12, v12, %0" ::"f"(five_));
+      if ((i + 1) <= size_y) {
+        izq_1 = A[sc_ptr_0];
+        der_1 = A[sc_ptr_1];
+      }
+      asm volatile("vse64.v v12, (%0)" ::"r"(&B[mtx_ptr_1]));
+      mtx_ptr_1 += c; // (i + 1) * c + j
+
+      // mtx_ptr_0 = (i + 2) * c + j
+      // mtx_ptr_1 = (i + 1) * c + j
+      // sc_ptr_0  = (i + 1) * c + j - 1
+      // sc_ptr_1  = (i + 1) * c + j + gvl
+
+      if ((i + 1) <= size_y) {
+        asm volatile("vfslide1up.vf v24, v8, %0" ::"f"(izq_1));
+        asm volatile("vfslide1down.vf v28, v8, %0" ::"f"(der_1));
+        asm volatile("vfadd.vv v16, v4, v8"); // middle + top
+        mtx_ptr_0 += c; // (i + 3) * c + j
+        asm volatile("vfadd.vv v16, v16, v0"); // bottom
+        sc_ptr_0 += c; // (i + 2) * c + j - 1
+        asm volatile("vfadd.vv v16, v16, v24"); // left
+        if ((i + 2) <= size_y) {
+          asm volatile("vle64.v v4, (%0)" ::"r"(&A[mtx_ptr_0])); // v4 middle
+        }
+        asm volatile("vfadd.vv v16, v16, v28"); // right
+        sc_ptr_1 += c; // (i + 2) * c + j + gvl
+        asm volatile("vfmul.vf v16, v16, %0" ::"f"(five_));
+        if ((i + 2) <= size_y) {
+          izq_2 = A[sc_ptr_0];
+          der_2 = A[sc_ptr_1];
+        }
+        asm volatile("vse64.v v16, (%0)" ::"r"(&B[mtx_ptr_1]));
+        mtx_ptr_1 += c; // (i + 2) * c + j
+
+        // mtx_ptr_0 = (i + 3) * c + j
+        // mtx_ptr_1 = (i + 2) * c + j
+        // sc_ptr_0  = (i + 2) * c + j - 1
+        // sc_ptr_1  = (i + 2) * c + j + gvl
+
+        if ((i + 2) <= size_y) {
+          asm volatile("vfslide1up.vf v24, v0, %0" ::"f"(izq_2));
+          asm volatile("vfslide1down.vf v28, v0, %0" ::"f"(der_2));
+          asm volatile("vfadd.vv v20, v0, v8"); // middle + top
+          mtx_ptr_0 += c; // (i + 4) * c + j
+          asm volatile("vfadd.vv v20, v20, v4"); // bottom
+          sc_ptr_0 += c; // (i + 3) * c + j - 1
+          asm volatile("vfadd.vv v20, v20, v24"); // left
+          if ((i + 3) <= size_y) {
+            asm volatile("vle64.v v8, (%0)" ::"r"(&A[mtx_ptr_0])); // v8 bottom
+          }
+          asm volatile("vfadd.vv v20, v20, v28"); // right
+          sc_ptr_1 += c; // (i + 3) * c + j + gvl
+          asm volatile("vfmul.vf v20, v20, %0" ::"f"(five_));
+          if ((i + 3) <= size_y) {
+            izq_0 = A[sc_ptr_0];
+            der_0 = A[sc_ptr_1];
+          }
+          asm volatile("vse64.v v20, (%0)" ::"r"(&B[mtx_ptr_1]));
+          mtx_ptr_1 += c; // (i + 3) * c + j
+
+          // mtx_ptr_0 = (i + 4) * c + j
+          // mtx_ptr_1 = (i + 3) * c + j
+          // sc_ptr_0  = (i + 3) * c + j - 1
+          // sc_ptr_1  = (i + 3) * c + j + gvl
+        }
+      }
+    }
+  }
+}
+
 // Optimized version of the jacobi2d kernel
 // 1) Preload the coefficients, before each vstore
 // 2) Eliminate WAW and WAR hazards
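j2d_kernel_adhoc_warm keeps three row registers alive (v0/v4/v8 as top, middle and bottom) and rotates their roles every output row, so each input row is loaded only once per column block. A scalar sketch of the same rotation (illustrative names, no vector code):

#include <stdint.h>

/* Sketch of the 3-row rotation used by the vector kernel: keep three row
   pointers, compute one output row, then shift the window down by one row
   and reload only the new bottom row (here just a pointer swap). */
void jacobi_row_rotation_sketch(uint64_t r, uint64_t c, const double *A,
                                double *B) {
  const double *top = A;         /* row i-1 */
  const double *mid = A + c;     /* row i   */
  const double *bot = A + 2 * c; /* row i+1 */
  for (uint64_t i = 1; i < r - 1; ++i) {
    for (uint64_t j = 1; j < c - 1; ++j)
      B[i * c + j] =
          0.2 * (mid[j] + mid[j - 1] + mid[j + 1] + top[j] + bot[j]);
    /* Rotate the window: the old middle becomes the new top, and so on. */
    top = mid;
    mid = bot;
    bot += c;
  }
}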