From 00552709c2884fbd00d3178ed0e720b450118f5d Mon Sep 17 00:00:00 2001
From: Matteo Perotti
Date: Thu, 23 Mar 2023 14:46:29 +0100
Subject: [PATCH] [apps] Introduce new ad-hoc warming for fconv, fmatmul, jacobi

---
 apps/benchmarks/benchmark/fconv3d.bmark  |   9 +
 apps/benchmarks/benchmark/fmatmul.bmark  |   2 +
 apps/benchmarks/benchmark/jacobi2d.bmark |   9 +-
 apps/fconv3d/fconv3d.h                   |   6 +
 apps/fconv3d/fconv3d_3x7x7.c             | 369 +++++++++++++++++++++++
 apps/jacobi2d/kernel/jacobi2d.c          | 139 +++++++++
 6 files changed, 533 insertions(+), 1 deletion(-)

diff --git a/apps/benchmarks/benchmark/fconv3d.bmark b/apps/benchmarks/benchmark/fconv3d.bmark
index 8bbf43ec2..b2af37e3e 100644
--- a/apps/benchmarks/benchmark/fconv3d.bmark
+++ b/apps/benchmarks/benchmark/fconv3d.bmark
@@ -40,8 +40,17 @@ extern int64_t CH;
 extern int64_t F;
 
 void warm_caches(uint64_t heat) {
+  volatile double buf;
+
   for (uint64_t k = 0; k < heat; ++k)
     fconv3d_CHx7x7(o, i, f, M, N, CH, F);
+
+// With a larger cache, the following artificial warming ensures that no
+// cache misses are experienced during the measured run
+#ifdef AD_HOC_WARMING
+  for (uint64_t k = 0; k < F * F * CH; ++k)
+    buf = (volatile double) *(&(f[k]));
+  fconv3d_CHx7x7_warm(o, i, f, M, N, CH, F);
+#endif
 }
 
 int main() {
diff --git a/apps/benchmarks/benchmark/fmatmul.bmark b/apps/benchmarks/benchmark/fmatmul.bmark
index a8d90b95e..d394aaadc 100644
--- a/apps/benchmarks/benchmark/fmatmul.bmark
+++ b/apps/benchmarks/benchmark/fmatmul.bmark
@@ -42,12 +42,14 @@ void warm_caches(uint64_t heat) {
   for (uint64_t k = 0; k < heat; ++k)
     fmatmul(c, a, b, M, N, P);
 
+#ifdef AD_HOC_WARMING
   // Vector stores have invalidated the A mtx cache lines!
   // Fetch them again
   for (int m = 0; m < M; ++m) {
     buf = (volatile double) *a_;
     a_ += N;
   }
+#endif
 }
 
 int main() {
diff --git a/apps/benchmarks/benchmark/jacobi2d.bmark b/apps/benchmarks/benchmark/jacobi2d.bmark
index c3b271c63..48ba4ddc0 100644
--- a/apps/benchmarks/benchmark/jacobi2d.bmark
+++ b/apps/benchmarks/benchmark/jacobi2d.bmark
@@ -102,8 +102,15 @@ extern DATA_TYPE A_s[] __attribute__((aligned(4 * NR_LANES), section(".l2")));
 extern DATA_TYPE B_s[] __attribute__((aligned(4 * NR_LANES), section(".l2")));
 
 void warm_caches(uint64_t heat, DATA_TYPE* A_fixed_v, DATA_TYPE* B_fixed_v) {
+
+  volatile double buf;
+
   for (uint64_t k = 0; k < heat; ++k)
     j2d_v(R, C, A_fixed_v, B_fixed_v, TSTEPS);
+#ifdef AD_HOC_WARMING
+  for (uint64_t k = 0; k < R * C; ++k)
+    buf = (volatile double) *(&(A_fixed_v[k]));
+#endif
 }
 
 int main() {
@@ -117,7 +124,7 @@ int main() {
 
 #ifndef SPIKE
   // Warm-up caches
-  warm_caches(WARM_CACHES_ITER, A_fixed_s, B_fixed_s);
+  warm_caches(WARM_CACHES_ITER, A_fixed_v, B_fixed_s);
 #endif
 
   // Measure vector kernel execution
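All three AD_HOC_WARMING blocks rely on the same idea: after the warm-up runs of the kernel, every operand element is touched again through a volatile scalar read, so lines evicted or invalidated by the vector stores are back in the cache before the measured run. A minimal stand-alone sketch of the idea (function names, array names and sizes here are illustrative, not taken from the benchmarks):

#include <stdint.h>

/* Touch every element of an operand through a volatile read so that cache
   lines evicted or invalidated by earlier vector stores are pulled back in
   before the measured run. */
static void warm_array(const double *p, uint64_t n) {
  volatile double buf;
  for (uint64_t k = 0; k < n; ++k)
    buf = p[k]; /* assigning to a volatile forces a real load per element */
  (void)buf;
}

void warm_caches_sketch(const double *filt, uint64_t f_len, uint64_t heat) {
  for (uint64_t k = 0; k < heat; ++k) {
    /* run the kernel under test once per heat iteration (omitted here) */
  }
#ifdef AD_HOC_WARMING
  warm_array(filt, f_len);
#endif
}

The volatile qualifier on buf is what keeps the compiler from optimizing the touch loop away.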
diff --git a/apps/fconv3d/fconv3d.h b/apps/fconv3d/fconv3d.h
index 386e55069..77c76a680 100644
--- a/apps/fconv3d/fconv3d.h
+++ b/apps/fconv3d/fconv3d.h
@@ -28,6 +28,12 @@ void fconv3d_CHx7x7(double *o, double *i, double *f, int64_t M, int64_t N,
 void fconv3d_CHx7x7_block(double *o, double *i, double *f, int64_t M, int64_t N,
                           int64_t n_, int64_t C, int64_t F);
 
+void fconv3d_CHx7x7_warm(double *o, double *i, double *f, int64_t M, int64_t N,
+                         int64_t C, int64_t F);
+
+void fconv3d_warm(double *o, double *i, double *f, int64_t M, int64_t N,
+                  int64_t n_, int64_t C, int64_t F);
+
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 
 // Threshold for FP numbers comparison during the final check
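fconv3d_CHx7x7_warm, added in the next diff, walks the output columns with the usual RVV strip-mining pattern: vsetvli returns how many e64 elements fit in a vector register group, and the loop advances by that amount. A minimal sketch of the pattern, assuming an RVV-capable toolchain (the function and buffer names are illustrative; the real warm kernel calls fconv3d_warm() on each column block instead of a plain load):

#include <stdint.h>

/* Strip-mine over n doubles with e64 elements and LMUL=2, touching each
   block with a unit-stride vector load. Illustrative only. */
void stripmine_touch(const double *v, uint64_t n) {
  uint64_t vl;
  for (uint64_t done = 0; done < n; done += vl) {
    asm volatile("vsetvli %0, %1, e64, m2, ta, ma"
                 : "=r"(vl)
                 : "r"(n - done));
    asm volatile("vle64.v v0, (%0)" ::"r"(v + done) : "memory");
  }
}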
diff --git a/apps/fconv3d/fconv3d_3x7x7.c b/apps/fconv3d/fconv3d_3x7x7.c
index a2c01cd72..57b20efd9 100644
--- a/apps/fconv3d/fconv3d_3x7x7.c
+++ b/apps/fconv3d/fconv3d_3x7x7.c
@@ -75,6 +75,29 @@ void fconv3d_CHx7x7(double *o, double *i, double *f, int64_t M, int64_t N,
   }
 }
 
+void fconv3d_CHx7x7_warm(double *o, double *i, double *f, int64_t M, int64_t N,
+                         int64_t C, int64_t F) {
+
+  unsigned long int block_size_n;
+
+  // Set the vector configuration
+  asm volatile("vsetvli %0, %1, e64, m2, ta, ma" : "=r"(block_size_n) : "r"(N));
+
+  // Slice the matrix into a manageable number of columns n_
+  for (unsigned long int n = 0; n < N; n += block_size_n) {
+    // Set the vector length
+    const unsigned long int n_ = MIN(N - n, block_size_n);
+
+    // Find pointers to the submatrices
+    const double *i_ = i + n;
+    double *o_ = o + n;
+
+    asm volatile("vsetvli zero, %0, e64, m2, ta, ma" ::"r"(n_));
+
+    fconv3d_warm(o_, i_, f, M, N, n_, C, F);
+  }
+}
+
 void fconv3d_CHx7x7_block(double *o, double *i, double *f, int64_t M, int64_t N,
                           int64_t n_, int64_t C, int64_t F) {
 
@@ -883,6 +906,352 @@ void fconv3d_CHx7x7_block(double *o, double *i, double *f, int64_t M, int64_t N,
 
   asm volatile("vse64.v v28, (%0); add %0, %0, %1" : "+&r"(o) : "r"(ldo));
 }
+
+void fconv3d_warm(double *o, double *i, double *f, int64_t M, int64_t N,
+                  int64_t n_, int64_t C, int64_t F) {
+
+  // Helper variables
+  int64_t ldo = N << 3;
+  int64_t ldi_pad = (N + F - 1) << 3;
+
+  // Number of elements that separates two adjacent channels
+  int64_t ich_len = (M + F - 1) * (N + F - 1);
+  int64_t fch_len = F * F;
+
+  double *i_ = i;
+  double *i__ = i;
+
+  // Very last column of coefficients
+  double fl0, fl1, fl2, fl3, fl4, fl5, fl6;
+  // Buffers for coefficients preloading (solve 16-lane starvation problem)
+  double f0_buf, f1_buf, f2_buf, f3_buf, f4_buf, f5_buf, f6_buf;
+
+  double *i_slide_ptr_0;
+  double *i_slide_ptr_1;
+  double *i_slide_ptr_2;
+  double *i_slide_ptr_3;
+
+  ////////////////
+  // Row 0 -> 3 //
+  ////////////////
+
+  // Loop on the channels
+  for (int ch = 0; ch < C; ++ch) {
+
+    // Point to the first element of the channel ch
+    i__ = i_ + ch * ich_len;
+
+    // Point to the scalar elements to insert during a slide
+    i_slide_ptr_0 = i__ + n_ + 0 * (N + F - 1);
+    i_slide_ptr_1 = i__ + n_ + 1 * (N + F - 1);
+    i_slide_ptr_2 = i__ + n_ + 2 * (N + F - 1);
+    i_slide_ptr_3 = i__ + n_ + 3 * (N + F - 1);
+
+    // Main kernel, unrolled by 2
+    // Unrolled because of double buffering
+    // With HW renaming, this unroll is not needed
+    for (int64_t k = 0; k < F / 2; ++k) {
+      // Two base indexes because of the unrolling
+      // Point to the first element of the current column (k) of the current
+      // channel (ch) of the filter (f)
+      int64_t base_idx_0 = (2 * k) + (ch * fch_len);
+      // Point to the first element of the current column (k+1) of the current
+      // channel (ch) of the filter (f)
+      int64_t base_idx_1 = (2 * k + 1) + (ch * fch_len);
+
+      if ((k | ch) == 0)
+        asm volatile("vfmul.vf v16, v0, %0" ::"f"(f[0 + base_idx_0]));
+      else
+        asm volatile("vfmacc.vf v16, %0, v0" ::"f"(f[0 + base_idx_0]));
+      if ((k | ch) == 0)
+        asm volatile("vfmul.vf v18, v4, %0" ::"f"(f[0 + base_idx_0]));
+      else
+        asm volatile("vfmacc.vf v18, %0, v4" ::"f"(f[0 + base_idx_0]));
+      asm volatile("vfslide1down.vf v2, v0, %0" ::"f"(*i_slide_ptr_0++));
+      asm volatile("vfmacc.vf v16, %0, v4" ::"f"(f[7 + base_idx_0]));
+      if ((k | ch) == 0)
+        asm volatile("vfmul.vf v22, v12, %0" ::"f"(f[0 + base_idx_0]));
+      else
+        asm volatile("vfmacc.vf v22, %0, v12" ::"f"(f[0 + base_idx_0]));
+      asm volatile("vfslide1down.vf v6, v4, %0" ::"f"(*i_slide_ptr_1++));
+      asm volatile("vfslide1down.vf v10, v8, %0" ::"f"(*i_slide_ptr_2++));
+      asm volatile("vfslide1down.vf v14, v12, %0" ::"f"(*i_slide_ptr_3++));
+      asm volatile("vfslide1down.vf v0, v2, %0" ::"f"(*i_slide_ptr_0++));
+      asm volatile("vfslide1down.vf v4, v6, %0" ::"f"(*i_slide_ptr_1++));
+      asm volatile("vfslide1down.vf v8, v10, %0" ::"f"(*i_slide_ptr_2++));
+      asm volatile("vfslide1down.vf v12, v14, %0" ::"f"(*i_slide_ptr_3++));
+      asm volatile("vfmacc.vf v20, %0, v14" ::"f"(f[7 + base_idx_1]));
+    }
+
+    int64_t base_idx_0 = (F - 1) + (ch * fch_len);
+
+    // Don't slide during the last iteration
+  }
+
+  // Bump the input ptr
+  i_ += 4 * (N + F - 1);
+
+  ////////////////
+  // Row 4 -> 6 //
+  ////////////////
+
+  // Loop on the channels
+  for (int ch = 0; ch < C; ++ch) {
+
+    // Point to the first element of the channel ch
+    i__ = i_ + ch * ich_len;
+
+    // Start calculating the next pointers to the elements to be slided in
+    i_slide_ptr_0 = i__ + n_ + 0 * (N + F - 1);
+    i_slide_ptr_1 = i__ + n_ + 1 * (N + F - 1);
+    i_slide_ptr_2 = i__ + n_ + 2 * (N + F - 1);
+
+    // Main kernel, unrolled by 2
+    for (int k = 0; k < F / 2; ++k) {
+      // Two base indexes because of the unrolling
+      // Point to the first element of the current column (k) of the current
+      // channel (ch) of the filter (f)
+      int64_t base_idx_0 = (2 * k) + (ch * fch_len);
+      // Point to the first element of the current column (k+1) of the current
+      // channel (ch) of the filter (f)
+      int64_t base_idx_1 = (2 * k + 1) + (ch * fch_len);
+
+      // Unroll 0
+      asm volatile("vfslide1down.vf v0, v2, %0" ::"f"(*i_slide_ptr_0++));
+
+      asm volatile("vfmacc.vf v18, %0, v10" ::"f"(f[35 + base_idx_0]));
+      asm volatile("vfslide1down.vf v4, v6, %0" ::"f"(*i_slide_ptr_1++));
+
+      asm volatile("vfslide1down.vf v8, v10, %0" ::"f"(*i_slide_ptr_2++));
+
+      if ((k | ch) == 0)
+        asm volatile("vfmul.vf v26, v6, %0" ::"f"(f[0 + base_idx_0]));
+      else
+        asm volatile("vfmacc.vf v26, %0, v6" ::"f"(f[0 + base_idx_0]));
+      asm volatile("vfmacc.vf v26, %0, v10" ::"f"(f[7 + base_idx_0]));
+
+      if ((k | ch) == 0)
+        asm volatile("vfmul.vf v28, v10, %0" ::"f"(f[0 + base_idx_0]));
+      else
+        asm volatile("vfmacc.vf v28, %0, v10" ::"f"(f[0 + base_idx_0]));
+
+      // Unroll 1
+      asm volatile("vfslide1down.vf v2, v0, %0" ::"f"(*i_slide_ptr_0++));
+
+      asm volatile("vfslide1down.vf v6, v4, %0" ::"f"(*i_slide_ptr_1++));
+
+      asm volatile("vfslide1down.vf v10, v8, %0" ::"f"(*i_slide_ptr_2++));
+    }
+
+    // The very last iterations require mixing the instructions with the store
+    // and the moves
+    if (ch != C - 1) {
+      // Point to the first element of the current column (k) of the current
+      // channel (ch) of the filter (f)
+      int64_t base_idx_0 = (F - 1) + (ch * fch_len);
+
+      // Don't slide the elements here
+    }
+  }
+
+  // Reuse preloaded coefficients
+  // Buffer the next coefficients for faster use
+
+  // Bump the input ptr
+  i_ += 3 * (N + F - 1);
+
+  ////////////
+  // REGIME //
+  ////////////
+
+  // The following loop is unrolled by 2
+  // The input matrix has M + F - 1 rows
+  // We have computed F input rows already
+  // Compute now until only F input rows are left
+  // (The last F-1 rows do not contribute to F output rows each, so keep them
+  // outside of this loop) (We keep F rows outside because of the unrolling by
+  // 2, just for ease)
+  for (int j = 0; j < ((M + F - 1) - 2 * F) / 2; ++j) {
+
+    // Work on F output rows
+
+    // Loop on the channels
+    for (int ch = 0; ch < C; ++ch) {
+      // Point to the first element of the channel ch
+      i__ = i_ + ch * ich_len;
+
+      // Start calculating the next pointers to the elements to be slided in
+      i_slide_ptr_0 = i__ + n_;
+
+      for (int k = 0; k < F / 2; ++k) {
+        // Two base indexes because of the unrolling
+        // Look ahead to the first element of the current column (k+2) of the
+        // current channel (ch) of the filter (f)
+        int64_t base_idx_0 = (2 * k + 2) + (ch * fch_len);
+        // Point to the first element of the current column (k+1) of the
+        // current channel (ch) of the filter (f)
+        int64_t base_idx_1 = (2 * k + 1) + (ch * fch_len);
+
+        asm volatile("vfslide1down.vf v2, v0, %0" ::"f"(*i_slide_ptr_0++));
+        f1_buf = f[7 + base_idx_1];
+        if ((k | ch) == 0)
+          asm volatile("vfmul.vf v28, v0, %0" ::"f"(f0_buf));
+        else
+          asm volatile("vfmacc.vf v28, %0, v0" ::"f"(f0_buf));
+        f0_buf = f[0 + base_idx_1];
+
+        // Calculate F contributions of the input rows, on F different output
+        // rows
+        asm volatile("vfmacc.vf v16, %0, v2" ::"f"(f6_buf));
+        asm volatile("vfmacc.vf v18, %0, v2" ::"f"(f5_buf));
+        f6_buf = f[42 + base_idx_0];
+        asm volatile("vfmacc.vf v20, %0, v2" ::"f"(f4_buf));
+        f5_buf = f[35 + base_idx_0];
+        asm volatile("vfslide1down.vf v0, v2, %0" ::"f"(*i_slide_ptr_0++));
+        f4_buf = f[28 + base_idx_0];
+        asm volatile("vfmacc.vf v22, %0, v2" ::"f"(f3_buf));
+        f3_buf = f[21 + base_idx_0];
+        asm volatile("vfmacc.vf v24, %0, v2" ::"f"(f2_buf));
+        f2_buf = f[14 + base_idx_0];
+        asm volatile("vfmacc.vf v26, %0, v2" ::"f"(f1_buf));
+        f1_buf = f[7 + base_idx_0];
+        asm volatile("vfmacc.vf v28, %0, v2" ::"f"(f0_buf));
+        f0_buf = f[0 + base_idx_0];
+      }
+
+      if (ch != C - 1) {
+        int64_t base_idx_0 = (ch + 1) * fch_len;
+      }
+    }
+  }
+
+  // Bump the input ptr
+  i_ += N + F - 1;
+
+#ifdef VCD_DUMP
+  // Stop dumping VCD
+  event_trigger = -1;
+#endif
+
+  //////////////
+  // UNROLL 1 //
+  //////////////
+
+  // Loop on the channels
+  for (int ch = 0; ch < C; ++ch) {
+
+    // Point to the first element of the channel ch
+    i__ = i_ + ch * ich_len;
+
+    // Start calculating the next pointers to the elements to be slided in
+    i_slide_ptr_1 = i__ + n_;
+
+    for (int k = 0; k < F / 2; ++k) {
+      // Two base indexes because of the unrolling
+      // Point to the first element of the current column (k) of the current
+      // channel (ch) of the filter (f)
+      int64_t base_idx_0 = (2 * k + 2) + (ch * fch_len);
+      // Point to the first element of the current column (k+1) of the current
+      // channel (ch) of the filter (f)
+      int64_t base_idx_1 = (2 * k + 1) + (ch * fch_len);
+    }
+
+    // Bump the input ptr
+    i_ += N + F - 1;
+  }
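/*
 * The f0_buf..f6_buf scalars above implement a simple look-ahead: the
 * coefficient needed by the next vfmacc.vf is fetched while the current one
 * executes, so the scalar core stays ahead of the vector unit (the "16-lane
 * starvation" mentioned in the declarations). A minimal sketch of the same
 * pattern, with illustrative register choice and loop shape only:
 *
 *   double coeff_buf = f[0];                    // preload first coefficient
 *   for (int k = 0; k + 1 < F; ++k) {
 *     asm volatile("vfmacc.vf v16, %0, v0" ::"f"(coeff_buf));
 *     coeff_buf = f[k + 1];                     // fetch next while MAC runs
 *   }
 *   asm volatile("vfmacc.vf v16, %0, v0" ::"f"(coeff_buf));
 */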
+
+  ////////////////////////
+  // Row I-F -> (I-1)-3 //
+  ////////////////////////
+
+  for (int64_t ch = 0; ch < C; ++ch) {
+
+    // Point to the first element of the channel ch
+    i__ = i_ + ch * ich_len;
+
+    // Point to the scalar elements to insert during a slide
+    // i_slide_ptr_0 has already been computed
+    i_slide_ptr_0 = i__ + n_ + 0 * (N + F - 1);
+    i_slide_ptr_1 = i__ + n_ + 1 * (N + F - 1);
+    i_slide_ptr_2 = i__ + n_ + 2 * (N + F - 1);
+    i_slide_ptr_3 = i__ + n_ + 3 * (N + F - 1);
+
+    // Main kernel, unrolled by 2
+    // Process 4 input rows
+    for (int k = 0; k < F / 2; ++k) {
+      // Two base indexes because of the unrolling
+      // Point to the first element of the current column (k) of the current
+      // channel (ch) of the filter (f)
+      int64_t base_idx_0 = (2 * k) + (ch * fch_len);
+      // Point to the first element of the current column (k+1) of the current
+      // channel (ch) of the filter (f)
+      int64_t base_idx_1 = (2 * k + 1) + (ch * fch_len);
+
+      asm volatile("vfslide1down.vf v2, v0, %0" ::"f"(*i_slide_ptr_0++));
+      if ((k | ch) == 0)
+        asm volatile("vfmul.vf v28, v0, %0" ::"f"(f[0 + base_idx_0]));
+      else
+        asm volatile("vfmacc.vf v28, %0, v0" ::"f"(f[0 + base_idx_0]));
+      asm volatile("vfslide1down.vf v6, v4, %0" ::"f"(*i_slide_ptr_1++));
+      asm volatile("vfslide1down.vf v10, v8, %0" ::"f"(*i_slide_ptr_2++));
+      asm volatile("vfslide1down.vf v14, v12, %0" ::"f"(*i_slide_ptr_3++));
+
+      asm volatile("vfslide1down.vf v0, v2, %0" ::"f"(*i_slide_ptr_0++));
+      asm volatile("vfslide1down.vf v4, v6, %0" ::"f"(*i_slide_ptr_1++));
+      asm volatile("vfslide1down.vf v8, v10, %0" ::"f"(*i_slide_ptr_2++));
+      asm volatile("vfslide1down.vf v12, v14, %0" ::"f"(*i_slide_ptr_3++));
+    }
+  }
+
+  // Bump the input ptr
+  i_ += 4 * (N + F - 1);
+
+  //////////////////////////
+  // Row (I-1)-3 -> (I-1) //
+  //////////////////////////
+
+  for (int64_t ch = 0; ch < C; ++ch) {
+
+    // Point to the first element of the channel ch
+    i__ = i_ + ch * ich_len;
+
+    // Start calculating the next pointers to the elements to be slided in
+    i_slide_ptr_0 = i__ + n_ + 0 * (N + F - 1);
+    i_slide_ptr_1 = i__ + n_ + 1 * (N + F - 1);
+    i_slide_ptr_2 = i__ + n_ + 2 * (N + F - 1);
+
+    // Main kernel, unrolled by 2
+    for (int k = 0; k < F / 2; ++k) {
+      // Two base indexes because of the unrolling
+      // Point to the first element of the current column (k) of the current
+      // channel (ch) of the filter (f)
+      int64_t base_idx_0 = (2 * k) + (ch * fch_len);
+      // Point to the first element of the current column (k+1) of the current
+      // channel (ch) of the filter (f)
+      int64_t base_idx_1 = (2 * k + 1) + (ch * fch_len);
+
+      asm volatile("vfslide1down.vf v0, v2, %0" ::"f"(*i_slide_ptr_0++));
+      asm volatile("vfslide1down.vf v4, v6, %0" ::"f"(*i_slide_ptr_1++));
+      asm volatile("vfslide1down.vf v8, v10, %0" ::"f"(*i_slide_ptr_2++));
+
+      asm volatile("vfslide1down.vf v2, v0, %0" ::"f"(*i_slide_ptr_0++));
+      asm volatile("vfslide1down.vf v6, v4, %0" ::"f"(*i_slide_ptr_1++));
+      asm volatile("vfslide1down.vf v10, v8, %0" ::"f"(*i_slide_ptr_2++));
+    }
+
+    if (ch != C - 1) {
+      int64_t base_idx_0 = (F - 1) + (ch * fch_len);
+    }
+  }
+}
+
 /*
 ////////////////////
 // MAIN ALGORITHM //
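The jacobi2d.c diff below vectorizes the five-point Jacobi update. For reference, the scalar computation that j2d_kernel_adhoc_warm performs on the interior points is sketched here (DATA_TYPE is assumed to be double, matching the e64 vector code; the function name is illustrative):

#include <stdint.h>

/* Scalar reference for one Jacobi-2D sweep over the interior points:
   B[i][j] = 0.2 * (A[i][j] + A[i][j-1] + A[i][j+1] + A[i-1][j] + A[i+1][j]).
   Row-major layout, r rows and c columns; the borders are left untouched. */
void j2d_scalar_reference(uint64_t r, uint64_t c, const double *A, double *B) {
  for (uint64_t i = 1; i < r - 1; ++i)
    for (uint64_t j = 1; j < c - 1; ++j)
      B[i * c + j] = 0.2 * (A[i * c + j] + A[i * c + (j - 1)] +
                            A[i * c + (j + 1)] + A[(i - 1) * c + j] +
                            A[(i + 1) * c + j]);
}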
diff --git a/apps/jacobi2d/kernel/jacobi2d.c b/apps/jacobi2d/kernel/jacobi2d.c
index d04644648..b72ff7759 100644
--- a/apps/jacobi2d/kernel/jacobi2d.c
+++ b/apps/jacobi2d/kernel/jacobi2d.c
@@ -143,6 +143,145 @@ void j2d_kernel_v(uint64_t r, uint64_t c, DATA_TYPE *A, DATA_TYPE *B) {
   }
 }
 
+// Optimized version of the jacobi2d kernel
+void j2d_kernel_adhoc_warm(uint64_t r, uint64_t c, DATA_TYPE *A, DATA_TYPE *B) {
+  DATA_TYPE izq_0, izq_1, izq_2;
+  DATA_TYPE der_0, der_1, der_2;
+  uint32_t size_x = c - 2;
+  uint32_t size_y = r - 2;
+  // Simplify pointer calc
+  uint32_t sc_ptr_0, sc_ptr_1;
+  uint32_t mtx_ptr_0, mtx_ptr_1;
+
+  // Avoid division. 1/5 == 0.2
+  double five_ = 0.2;
+
+  size_t gvl;
+
+  asm volatile("vsetvli %0, %1, e64, m4, ta, ma" : "=r"(gvl) : "r"(size_x));
+
+  for (uint32_t j = 1; j <= size_x; j = j + gvl) {
+    asm volatile("vsetvli %0, %1, e64, m4, ta, ma"
+                 : "=r"(gvl)
+                 : "r"(size_x - j + 1));
+    mtx_ptr_0 = j; // 0 * c + j
+    asm volatile("vle64.v v0, (%0)" ::"r"(&A[mtx_ptr_0])); // v0 top
+    mtx_ptr_1 = j + c; // 1 * c + j
+    asm volatile("vle64.v v4, (%0)" ::"r"(&A[mtx_ptr_1])); // v4 middle
+    mtx_ptr_0 = mtx_ptr_1 + c; // 2 * c + j
+    asm volatile("vle64.v v8, (%0)" ::"r"(&A[mtx_ptr_0])); // v8 bottom
+
+    // Look ahead and load the next coefficients
+    // Do it before vector stores
+    sc_ptr_0 = mtx_ptr_1 - 1; // 1 * c + j - 1
+    izq_0 = A[sc_ptr_0];
+    sc_ptr_1 = mtx_ptr_1 + gvl; // 1 * c + j + gvl
+    der_0 = A[sc_ptr_1];
+
+    // mtx_ptr_0 = 2 * c + j
+    // mtx_ptr_1 = 1 * c + j
+    // sc_ptr_0  = 1 * c + j - 1
+    // sc_ptr_1  = 1 * c + j + gvl
+
+    for (uint32_t i = 1; i <= size_y; i += 3) {
+#ifdef VCD_DUMP
+      // Start dumping VCD
+      if (i == 7)
+        event_trigger = +1;
+      // Stop dumping VCD
+      if (i == 13)
+        event_trigger = -1;
+#endif
+      // mtx_ptr_0 = (i + 1) * c + j
+      // mtx_ptr_1 = i * c + j
+      // sc_ptr_0  = i * c + j - 1
+      // sc_ptr_1  = i * c + j + gvl
+
+      asm volatile("vfslide1up.vf v24, v4, %0" ::"f"(izq_0));
+      asm volatile("vfslide1down.vf v28, v4, %0" ::"f"(der_0));
+      asm volatile("vfadd.vv v12, v4, v0"); // middle + top
+      mtx_ptr_0 += c; // (i + 2) * c + j
+      asm volatile("vfadd.vv v12, v12, v8"); // bottom
+      sc_ptr_0 += c; // (i + 1) * c + j - 1
+      asm volatile("vfadd.vv v12, v12, v24"); // left
+      if ((i + 1) <= size_y) {
+        asm volatile("vle64.v v0, (%0)" ::"r"(&A[mtx_ptr_0])); // v0 top
+      }
+      asm volatile("vfadd.vv v12, v12, v28"); // right
+      sc_ptr_1 += c; // (i + 1) * c + j + gvl
+      asm volatile("vfmul.vf v12, v12, %0" ::"f"(five_));
+      if ((i + 1) <= size_y) {
+        izq_1 = A[sc_ptr_0];
+        der_1 = A[sc_ptr_1];
+      }
+      asm volatile("vse64.v v12, (%0)" ::"r"(&B[mtx_ptr_1]));
+      mtx_ptr_1 += c; // (i + 1) * c + j
+
+      // mtx_ptr_0 = (i + 2) * c + j
+      // mtx_ptr_1 = (i + 1) * c + j
+      // sc_ptr_0  = (i + 1) * c + j - 1
+      // sc_ptr_1  = (i + 1) * c + j + gvl
+
+      if ((i + 1) <= size_y) {
+        asm volatile("vfslide1up.vf v24, v8, %0" ::"f"(izq_1));
+        asm volatile("vfslide1down.vf v28, v8, %0" ::"f"(der_1));
+        asm volatile("vfadd.vv v16, v4, v8"); // middle + top
+        mtx_ptr_0 += c; // (i + 3) * c + j
+        asm volatile("vfadd.vv v16, v16, v0"); // bottom
+        sc_ptr_0 += c; // (i + 2) * c + j - 1
+        asm volatile("vfadd.vv v16, v16, v24"); // left
+        if ((i + 2) <= size_y) {
+          asm volatile("vle64.v v4, (%0)" ::"r"(&A[mtx_ptr_0])); // v4 middle
+        }
+        asm volatile("vfadd.vv v16, v16, v28"); // right
+        sc_ptr_1 += c; // (i + 2) * c + j + gvl
+        asm volatile("vfmul.vf v16, v16, %0" ::"f"(five_));
+        if ((i + 2) <= size_y) {
+          izq_2 = A[sc_ptr_0];
+          der_2 = A[sc_ptr_1];
+        }
+        asm volatile("vse64.v v16, (%0)" ::"r"(&B[mtx_ptr_1]));
+        mtx_ptr_1 += c; // (i + 2) * c + j
+
+        // mtx_ptr_0 = (i + 3) * c + j
+        // mtx_ptr_1 = (i + 2) * c + j
+        // sc_ptr_0  = (i + 2) * c + j - 1
+        // sc_ptr_1  = (i + 2) * c + j + gvl
+
+        if ((i + 2) <= size_y) {
+          asm volatile("vfslide1up.vf v24, v0, %0" ::"f"(izq_2));
+          asm volatile("vfslide1down.vf v28, v0, %0" ::"f"(der_2));
+          asm volatile("vfadd.vv v20, v0, v8"); // middle + top
+          mtx_ptr_0 += c; // (i + 4) * c + j
+          asm volatile("vfadd.vv v20, v20, v4"); // bottom
+          sc_ptr_0 += c; // (i + 3) * c + j - 1
+          asm volatile("vfadd.vv v20, v20, v24"); // left
+          if ((i + 3) <= size_y) {
+            asm volatile("vle64.v v8, (%0)" ::"r"(&A[mtx_ptr_0])); // v8 bottom
+          }
+          asm volatile("vfadd.vv v20, v20, v28"); // right
+          sc_ptr_1 += c; // (i + 3) * c + j + gvl
+          asm volatile("vfmul.vf v20, v20, %0" ::"f"(five_));
+          if ((i + 3) <= size_y) {
+            izq_0 = A[sc_ptr_0];
+            der_0 = A[sc_ptr_1];
+          }
+          asm volatile("vse64.v v20, (%0)" ::"r"(&B[mtx_ptr_1]));
+          mtx_ptr_1 += c; // (i + 3) * c + j
+
+          // mtx_ptr_0 = (i + 4) * c + j
+          // mtx_ptr_1 = (i + 3) * c + j
+          // sc_ptr_0  = (i + 3) * c + j - 1
+          // sc_ptr_1  = (i + 3) * c + j + gvl
+        }
+      }
+    }
+  }
+}
+
 // Optimized version of the jacobi2d kernel
 // 1) Preload the coefficients, before each vstore
 // 2) Eliminate WAW and WAR hazards
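j2d_kernel_adhoc_warm keeps three row registers alive (v0/v4/v8 as top, middle and bottom) and rotates their roles every output row, so each input row is loaded only once per column block. A scalar sketch of the same rotation (illustrative names, no vector code):

#include <stdint.h>

/* Sketch of the 3-row rotation used by the vector kernel: keep three row
   pointers, compute one output row, then shift the window down by one row
   and reload only the new bottom row (here just a pointer swap). */
void jacobi_row_rotation_sketch(uint64_t r, uint64_t c, const double *A,
                                double *B) {
  const double *top = A;         /* row i-1 */
  const double *mid = A + c;     /* row i   */
  const double *bot = A + 2 * c; /* row i+1 */
  for (uint64_t i = 1; i < r - 1; ++i) {
    for (uint64_t j = 1; j < c - 1; ++j)
      B[i * c + j] =
          0.2 * (mid[j] + mid[j - 1] + mid[j + 1] + top[j] + bot[j]);
    /* Rotate the window: the old middle becomes the new top, and so on. */
    top = mid;
    mid = bot;
    bot += c;
  }
}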