From 26dacc31a5ad9ffc5615d748a79e9b811598cc80 Mon Sep 17 00:00:00 2001
From: mbertuletti <mbertuletti@iis.ee.ethz.ch>
Date: Mon, 2 Sep 2024 13:09:31 +0200
Subject: [PATCH] [software] Add f32 and f16 axpy app

---
 software/apps/baremetal/Makefile              |  14 +-
 software/apps/baremetal/axpy_f16/main.c       |  69 ++++++++
 software/apps/baremetal/axpy_f32/main.c       |  62 +++++++
 software/apps/baremetal/axpy_i32/main.c       |   2 +-
 software/data/data_axpy_f16.h.tpl             |  26 +++
 software/data/data_axpy_f32.h.tpl             |  26 +++
 software/data/generate_dotp.py                |  44 ++++-
 software/kernels/baremetal/mempool_axpy_f16.h | 124 +++++++++++++
 software/kernels/baremetal/mempool_axpy_f32.h | 165 ++++++++++++++++++
 ...mempool_axpy_i32p.h => mempool_axpy_i32.h} |   0
 10 files changed, 528 insertions(+), 4 deletions(-)
 create mode 100644 software/apps/baremetal/axpy_f16/main.c
 create mode 100644 software/apps/baremetal/axpy_f32/main.c
 create mode 100644 software/data/data_axpy_f16.h.tpl
 create mode 100644 software/data/data_axpy_f32.h.tpl
 create mode 100644 software/kernels/baremetal/mempool_axpy_f16.h
 create mode 100644 software/kernels/baremetal/mempool_axpy_f32.h
 rename software/kernels/baremetal/{mempool_axpy_i32p.h => mempool_axpy_i32.h} (100%)

diff --git a/software/apps/baremetal/Makefile b/software/apps/baremetal/Makefile
index b4b2ee496..ffecfabba 100644
--- a/software/apps/baremetal/Makefile
+++ b/software/apps/baremetal/Makefile
@@ -22,8 +22,18 @@ ALLPYS := $(patsubst %.py,%.h,$(wildcard $(DATA_DIR)/*.py))
 BINARIES := $(addprefix $(BIN_DIR)/,$(APPS))
 ALL := $(APPS)
 
-ALL_GCC := $(filter-out cfft_radix4_f16 chest_f16 cholesky_f16 cmatmul_f16 matmul_f16 matmul_f32 mimo_mmse_f32 mimo_mmse_f16 ofdm, $(ALL))
-ALL_LLVM := $(filter-out synth_i32 cfft_radix2_q16 cfft_radix4_q16 chest_q16 cholesky_q16 cholesky_q32 cmatmul_q16 mimo_mmse_q16, $(ALL))
+FP_APPS := axpy_f16 axpy_f32
+FP_APPS += cfft_radix4_f16 chest_f16 cholesky_f16
+FP_APPS += cmatmul_f16 matmul_f16 matmul_f32
+FP_APPS += dotp_f16 dotp_f32
+FP_APPS += mimo_mmse_f32 mimo_mmse_f16 ofdm
+# FP_APPS (above) are filtered out of ALL_GCC, I_APPS (below) out of ALL_LLVM.
+I_APPS := synth_i32
+I_APPS += cfft_radix2_q16 cfft_radix4_q16 chest_q16 cholesky_q16 cholesky_q32
+I_APPS += cmatmul_q16 mimo_mmse_q16
+
+ALL_GCC := $(filter-out $(FP_APPS), $(ALL))
+ALL_LLVM := $(filter-out $(I_APPS), $(ALL))
 
 # Make all applications
 all: $(ALL_GCC)
diff --git a/software/apps/baremetal/axpy_f16/main.c b/software/apps/baremetal/axpy_f16/main.c
new file mode 100644
index 000000000..9fe49d299
--- /dev/null
+++ b/software/apps/baremetal/axpy_f16/main.c
@@ -0,0 +1,69 @@
+// Copyright 2021 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Author: Marco Bertuletti, ETH Zurich
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "dma.h"
+#include "encoding.h"
+#include "printf.h"
+#include "runtime.h"
+#include "synchronization.h"
+
+#include "data_axpy_f16.h"
+#define NUM_BANKS (NUM_CORES * BANKING_FACTOR)
+
+// L1 copies of the vectors; the kernels update l1_C in place (C += A * B)
+__fp16 l1_A[LEN] __attribute__((aligned(LEN), section(".l1_prio")));
+__fp16 l1_B[LEN] __attribute__((aligned(LEN), section(".l1_prio")));
+__fp16 l1_C[LEN] __attribute__((aligned(LEN), section(".l1_prio")));
+
+#include "baremetal/mempool_axpy_f16.h"
+#include "baremetal/mempool_checks.h"
+
+int main() {
+  uint32_t const core_id = mempool_get_core_id();
+  uint32_t const num_cores = mempool_get_core_count();
+  uint32_t time_init = 0, time_end = 0;
+  mempool_barrier_init(core_id);
+
+  // Core 0 copies the operands from L2 into L1; all cores then meet at the
+  // barrier before any kernel is started.
+  if (core_id == 0) {
+    size_t const vec_bytes = LEN * sizeof(int16_t);
+    dma_memcpy_blocking(l1_A, l2_A, vec_bytes);
+    dma_memcpy_blocking(l1_B, l2_B, vec_bytes);
+    dma_memcpy_blocking(l1_C, l2_C, vec_bytes);
+  }
+  mempool_barrier(num_cores);
+
+  // Exactly one kernel variant is enabled; the others stay commented out.
+  //  // SINGLE
+  //  time_init = mempool_get_timer();
+  //  axpy_f16s(l1_A, l1_B, l1_C, LEN);
+  //  time_end = mempool_get_timer();
+  //  // PARALLEL
+  //  time_init = mempool_get_timer();
+  //  axpy_f16vecp_unrolled4(l1_A, l1_B, l1_C, LEN, num_cores);
+  //  time_end = mempool_get_timer();
+
+  // PARALLEL, LOCAL ACCESSES
+  time_init = mempool_get_timer();
+  axpy_f16vecp_local_unrolled4(l1_A, l1_B, l1_C, LEN);
+  time_end = mempool_get_timer();
+  mempool_barrier(num_cores);
+
+  // Core 0 prints its own cycle count; every core calls the checker.
+  if (core_id == 0) {
+    uint32_t const cycles = time_end - time_init;
+    printf("\nKernel execution takes %d clock cycles\n", cycles);
+  }
+  mempool_check_f16(l1_C, l2_out, 100, 0.1f, 0);
+  mempool_barrier(num_cores);
+
+  return 0;
+}
diff --git a/software/apps/baremetal/axpy_f32/main.c b/software/apps/baremetal/axpy_f32/main.c
new file mode 100644
index 000000000..262342fb2
--- /dev/null
+++ b/software/apps/baremetal/axpy_f32/main.c
@@ -0,0 +1,62 @@
+// Copyright 2021 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Author: Marco Bertuletti, ETH Zurich
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "dma.h"
+#include "encoding.h"
+#include "printf.h"
+#include "runtime.h"
+#include "synchronization.h"
+
+#include "data_axpy_f32.h"
+#define NUM_BANKS (NUM_CORES * BANKING_FACTOR)
+#define SINGLE_CORE_REDUCTION
+// #define BINARY_REDUCTION
+
+// L1 copies of the vectors; the kernels update l1_C in place (C += A * B)
+float l1_A[LEN] __attribute__((aligned(LEN), section(".l1_prio")));
+float l1_B[LEN] __attribute__((aligned(LEN), section(".l1_prio")));
+float l1_C[LEN] __attribute__((aligned(LEN), section(".l1_prio")));
+
+#include "baremetal/mempool_axpy_f32.h"
+#include "baremetal/mempool_checks.h"
+
+int main() {
+  uint32_t const core_id = mempool_get_core_id();
+  uint32_t const num_cores = mempool_get_core_count();
+  uint32_t time_init = 0, time_end = 0;
+  mempool_barrier_init(core_id);
+
+  // Core 0 copies the operands from L2 into L1 before any core computes.
+  if (core_id == 0) {
+    size_t const vec_bytes = LEN * sizeof(int32_t);
+    dma_memcpy_blocking(l1_A, l2_A, vec_bytes);
+    dma_memcpy_blocking(l1_B, l2_B, vec_bytes);
+    dma_memcpy_blocking(l1_C, l2_C, vec_bytes);
+  }
+  mempool_barrier(num_cores);
+
+  // PARALLEL: one variant is enabled, the alternatives stay commented out.
+  time_init = mempool_get_timer();
+  // axpy_f32p(l1_A, l1_B, l1_C, LEN, num_cores);
+  // axpy_f32p_unrolled4(l1_A, l1_B, l1_C, LEN, num_cores);
+  axpy_f32p_local_unrolled4(l1_A, l1_B, l1_C, LEN);
+  time_end = mempool_get_timer();
+
+  // Core 0 prints its own cycle count; every core then calls
+  // mempool_check_f32 on l1_C and the expected l2_out.
+  if (core_id == 0) {
+    uint32_t const cycles = time_end - time_init;
+    printf("\nKernel execution takes %d clock cycles\n", cycles);
+  }
+  mempool_check_f32(l1_C, l2_out, 100, 0.1f, 0);
+  mempool_barrier(num_cores);
+
+  return 0;
+}
diff --git a/software/apps/baremetal/axpy_i32/main.c b/software/apps/baremetal/axpy_i32/main.c
index a9354796e..b63e31499 100644
--- a/software/apps/baremetal/axpy_i32/main.c
+++ b/software/apps/baremetal/axpy_i32/main.c
@@ -7,7 +7,7 @@
 #include <stdint.h>
 #include <string.h>
 
-#include "baremetal/mempool_axpy_i32p.h"
+#include "baremetal/mempool_axpy_i32.h"
 #include "encoding.h"
 #include "printf.h"
 #include "runtime.h"
diff --git a/software/data/data_axpy_f16.h.tpl b/software/data/data_axpy_f16.h.tpl
new file mode 100644
index 000000000..09ea72cbf
--- /dev/null
+++ b/software/data/data_axpy_f16.h.tpl
@@ -0,0 +1,26 @@
+// Copyright 2022 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+\
+<% def array_to_cstr(array):
+    # Render the array as a brace-enclosed C initializer list of __fp16
+    # casts with four decimals, eight values per output line.
+    out = '{\n'
+    for i, a in enumerate(array, start=1):
+        out += '(__fp16){:.4f}, '.format(a)
+        if i % 8 == 0:
+            out += '\n'
+    # Drop the last two characters (comma-space, or space-newline after a row).
+    out = out[:-2] + '}'
+    return out
+%> \
+
+#define LEN (${Len})
+// Inputs A, B, C and the expected result out, all placed in section ".l2".
+__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_A[${Len}] = ${array_to_cstr(A)};
+
+__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_B[${Len}] = ${array_to_cstr(B)};
+
+__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_C[${Len}] = ${array_to_cstr(C)};
+
+__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_out[${Len}] = ${array_to_cstr(out)};
diff --git a/software/data/data_axpy_f32.h.tpl b/software/data/data_axpy_f32.h.tpl
new file mode 100644
index 000000000..2efe34b45
--- /dev/null
+++ b/software/data/data_axpy_f32.h.tpl
@@ -0,0 +1,26 @@
+// Copyright 2022 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+\
+<% def array_to_cstr(array):
+    # Render the array as a brace-enclosed C initializer list of float
+    # literals (value followed by f), eight values per output line.
+    out = '{\n'
+    for i, a in enumerate(array, start=1):
+        out += '{}f, '.format(a)
+        if i % 8 == 0:
+            out += '\n'
+    # Drop the last two characters (comma-space, or space-newline after a row).
+    out = out[:-2] + '}'
+    return out
+%> \
+
+#define LEN (${Len})
+// Inputs A, B, C and the expected result out, all placed in section ".l2".
+float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_A[${Len}] = ${array_to_cstr(A)};
+
+float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_B[${Len}] = ${array_to_cstr(B)};
+
+float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_C[${Len}] = ${array_to_cstr(C)};
+
+float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_out[${Len}] = ${array_to_cstr(out)};
diff --git a/software/data/generate_dotp.py b/software/data/generate_dotp.py
index 66fc95bcc..64170f573 100644
--- a/software/data/generate_dotp.py
+++ b/software/data/generate_dotp.py
@@ -40,6 +40,26 @@ def generate_dotp_f16(Len):
     C = (np.dot(A, B)).astype(np.float16)
     return A, B, C
 
+
+def generate_axpy_f32(Len):
+    """Return random float32 vectors A, B, C of length Len and C + A * B."""
+
+    # Tuple unpacking draws the vectors in the original order: A, B, C.
+    dtype = np.float32
+    A, B, C = (np.random.rand(Len).astype(dtype) for _ in range(3))
+    out = C + A * B
+    return A, B, C, out
+
+
+def generate_axpy_f16(Len):
+    """Return random float16 vectors A, B, C of length Len and C + A * B."""
+
+    # Tuple unpacking draws the vectors in the original order: A, B, C.
+    dtype = np.float16
+    A, B, C = (np.random.rand(Len).astype(dtype) for _ in range(3))
+    out = C + A * B
+    return A, B, C, out
+
 ##################
 # compute_result #
 ##################
@@ -73,7 +93,7 @@ def main():
         "--length",
         type=int,
         required=False,
-        default=4096,
+        default=1024,
         help='First dimension.'
     )
 
@@ -110,6 +130,28 @@ def main():
         'Len': Len}
     gen_data_header_file(args.outdir, tpl, **kwargs)
 
+    A, B, C, out = generate_axpy_f32(Len)
+    tpl = pathlib.Path(__file__).parent.absolute() / "data_axpy_f32.h.tpl"
+    kwargs = {
+        'name': 'data_axpy_f32',
+        'A': A,
+        'B': B,
+        'C': C,
+        'out': out,
+        'Len': Len}
+    gen_data_header_file(args.outdir, tpl, **kwargs)
+
+    A, B, C, out = generate_axpy_f16(Len)
+    tpl = pathlib.Path(__file__).parent.absolute() / "data_axpy_f16.h.tpl"
+    kwargs = {
+        'name': 'data_axpy_f16',
+        'A': A,
+        'B': B,
+        'C': C,
+        'out': out,
+        'Len': Len}
+    gen_data_header_file(args.outdir, tpl, **kwargs)
+
 
 if __name__ == "__main__":
     main()
diff --git a/software/kernels/baremetal/mempool_axpy_f16.h b/software/kernels/baremetal/mempool_axpy_f16.h
new file mode 100644
index 000000000..e54331d2d
--- /dev/null
+++ b/software/kernels/baremetal/mempool_axpy_f16.h
@@ -0,0 +1,124 @@
+// Copyright 2021 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Author: Marco Bertuletti, ETH Zurich
+
+#pragma once
+#include "builtins_v2.h"
+// Unrolled step on in_{a,b,c}[i..i+3]: two packed vfmac.h (c += a * b).
+#define AXPYF16VEC_UNROLLED4_LOOP                                              \
+  {                                                                            \
+    a01 = (*(v2h *)&in_a[i]);                                                  \
+    a23 = (*(v2h *)&in_a[i + 2]);                                              \
+    b01 = (*(v2h *)&in_b[i]);                                                  \
+    b23 = (*(v2h *)&in_b[i + 2]);                                              \
+    c01 = (*(v2h *)&in_c[i]);                                                  \
+    c23 = (*(v2h *)&in_c[i + 2]);                                              \
+    asm volatile(                                                              \
+        "vfmac.h %[c01], %[a01], %[b01];"                                      \
+        "vfmac.h %[c23], %[a23], %[b23];"                                      \
+        : [c01] "+&r"(c01), [c23] "+&r"(c23)                                   \
+        : [a01] "r"(a01), [a23] "r"(a23), [b01] "r"(b01), [b23] "r"(b23));     \
+    (*(v2h *)&in_c[i]) = c01;                                                  \
+    (*(v2h *)&in_c[i + 2]) = c23;                                              \
+  }
+
+/* Single-core AXPY: in_c[i] += in_a[i] * in_b[i] for i in [0, Len). */
+/* Only core 0 computes; Len must be >= 1 (the do-while runs at least once). */
+void axpy_f16s(__fp16 *in_a, __fp16 *in_b, __fp16 *in_c, uint32_t Len) {
+  uint32_t core_id = mempool_get_core_id();
+  if (core_id == 0) {
+    mempool_start_benchmark();
+    // in_a is walked one __fp16 element at a time, so the end bound is Len
+    // elements away (Len / 2 stopped after half of the vector).
+    __fp16 *end = in_a + Len;
+    do {
+      asm volatile("fmadd.h %0, %1, %2, %0;"
+                   : "+&r"(*in_c)
+                   : "r"(*in_a), "r"(*in_b));
+      in_a++;
+      in_b++;
+      in_c++;
+    } while (in_a < end);
+    mempool_stop_benchmark();
+  }
+  return;
+}
+
+/* Single-core AXPY, four f16 elements per iteration (two v2h pairs). */
+/* Runs on core 0 only; the loop assumes Len is a multiple of 4. */
+void axpy_f16s_unrolled4(__fp16 *in_a, __fp16 *in_b, __fp16 *in_c,
+                         uint32_t Len) {
+
+  if (mempool_get_core_id() != 0) {
+    return;
+  }
+
+  // Temporaries are named exactly as AXPYF16VEC_UNROLLED4_LOOP expects.
+  v2h a01, a23, b01, b23, c01, c23;
+  mempool_start_benchmark();
+  for (uint32_t i = 0; i < Len; i += 4) {
+    AXPYF16VEC_UNROLLED4_LOOP;
+  }
+  mempool_stop_benchmark();
+
+  return;
+}
+
+/* Parallel AXPY: each core updates one contiguous chunk of Len / nPE */
+/* elements, then all cores meet at a barrier. */
+void axpy_f16p(__fp16 *in_a, __fp16 *in_b, __fp16 *in_c, uint32_t Len,
+               uint32_t nPE) {
+  uint32_t const core_id = mempool_get_core_id();
+  uint32_t const chunk = Len / nPE;
+  uint32_t const first = core_id * chunk;
+  uint32_t const last = first + chunk;
+
+  for (uint32_t idx = first; idx < last; idx++) {
+    __fp16 a = in_a[idx];
+    __fp16 b = in_b[idx];
+    __fp16 c = in_c[idx];
+    asm volatile("fmadd.h %0, %1, %2, %0;" : "+&r"(c) : "r"(a), "r"(b));
+    in_c[idx] = c;
+  }
+  mempool_barrier(mempool_get_core_count());
+  return;
+}
+
+/* Parallel AXPY on packed f16: every core handles a contiguous chunk of */
+/* Len / nPE elements, four at a time, then waits at the barrier. */
+void axpy_f16vecp_unrolled4(__fp16 *in_a, __fp16 *in_b, __fp16 *in_c,
+                            uint32_t Len, uint32_t nPE) {
+
+  uint32_t const chunk = Len / nPE;
+  uint32_t const first = mempool_get_core_id() * chunk;
+  uint32_t const last = first + chunk;
+  // Temporaries are named exactly as AXPYF16VEC_UNROLLED4_LOOP expects.
+  v2h a01, a23, b01, b23, c01, c23;
+
+  for (uint32_t i = first; i < last; i += 4) {
+    AXPYF16VEC_UNROLLED4_LOOP;
+  }
+  mempool_barrier(mempool_get_core_count());
+
+  return;
+}
+
+/* Parallel AXPY on packed f16, strided so accesses are meant to stay local. */
+/* NOTE(review): i counts f16 elements, NUM_BANKS counts banks - confirm. */
+void axpy_f16vecp_local_unrolled4(__fp16 *in_a, __fp16 *in_b, __fp16 *in_c,
+                                  uint32_t Len) {
+
+  // Core k starts at element k * BANKING_FACTOR and jumps NUM_BANKS ahead.
+  uint32_t const first = mempool_get_core_id() * BANKING_FACTOR;
+  // Temporaries are named exactly as AXPYF16VEC_UNROLLED4_LOOP expects.
+  v2h a01, a23, b01, b23, c01, c23;
+
+  for (uint32_t i = first; i < Len; i += NUM_BANKS) {
+    AXPYF16VEC_UNROLLED4_LOOP;
+  }
+  mempool_barrier(mempool_get_core_count());
+
+  return;
+}
diff --git a/software/kernels/baremetal/mempool_axpy_f32.h b/software/kernels/baremetal/mempool_axpy_f32.h
new file mode 100644
index 000000000..ff069524c
--- /dev/null
+++ b/software/kernels/baremetal/mempool_axpy_f32.h
@@ -0,0 +1,165 @@
+// Copyright 2021 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Author: Marco Bertuletti, ETH Zurich
+// Unrolled step: loads a/b/c at i..i+3, four fmadd.s (c += a * b), stores c.
+#define AXPYF32_UNROLLED4_LOOP                                                 \
+  {                                                                            \
+    a0 = in_a[i];                                                              \
+    b0 = in_b[i];                                                              \
+    c0 = in_c[i];                                                              \
+    a1 = in_a[i + 1];                                                          \
+    b1 = in_b[i + 1];                                                          \
+    c1 = in_c[i + 1];                                                          \
+    a2 = in_a[i + 2];                                                          \
+    b2 = in_b[i + 2];                                                          \
+    c2 = in_c[i + 2];                                                          \
+    a3 = in_a[i + 3];                                                          \
+    b3 = in_b[i + 3];                                                          \
+    c3 = in_c[i + 3];                                                          \
+    asm volatile(                                                              \
+        "fmadd.s %[c0], %[a0], %[b0], %[c0];"                                  \
+        "fmadd.s %[c1], %[a1], %[b1], %[c1];"                                  \
+        "fmadd.s %[c2], %[a2], %[b2], %[c2];"                                  \
+        "fmadd.s %[c3], %[a3], %[b3], %[c3];"                                  \
+        : [c0] "+&r"(c0), [c1] "+&r"(c1), [c2] "+&r"(c2), [c3] "+&r"(c3)       \
+        : [a0] "r"(a0), [a1] "r"(a1), [a2] "r"(a2), [a3] "r"(a3),              \
+          [b0] "r"(b0), [b1] "r"(b1), [b2] "r"(b2), [b3] "r"(b3));             \
+    in_c[i] = c0;                                                              \
+    in_c[i + 1] = c1;                                                          \
+    in_c[i + 2] = c2;                                                          \
+    in_c[i + 3] = c3;                                                          \
+  }
+
+/* Single-core AXPY: in_c[i] += in_a[i] * in_b[i] for i in [0, Len). */
+/* Runs on core 0 only; the do-while body executes at least once. */
+void axpy_f32s(float *in_a, float *in_b, float *in_c, uint32_t Len) {
+  if (mempool_get_core_id() != 0) {
+    return;
+  }
+  mempool_start_benchmark();
+  float const *const stop = in_a + Len;
+  do {
+    // Multiply-add directly on the pointed-to operands, then advance.
+    asm volatile("fmadd.s %0, %1, %2, %0;"
+                 : "+&r"(*in_c)
+                 : "r"(*in_a), "r"(*in_b));
+    in_a++;
+    in_b++;
+    in_c++;
+  } while (in_a < stop);
+  mempool_stop_benchmark();
+  return;
+}
+
+/* Single-core AXPY, four elements per iteration plus a scalar tail. */
+/* Runs on core 0 only. */
+void axpy_f32s_unrolled4(float *in_a, float *in_b, float *in_c, uint32_t Len) {
+  if (mempool_get_core_id() != 0) {
+    return;
+  }
+  mempool_start_benchmark();
+  // Temporaries are named exactly as AXPYF32_UNROLLED4_LOOP expects.
+  register float a0 = 0.0f, a1 = 0.0f, a2 = 0.0f, a3 = 0.0f;
+  register float b0 = 0.0f, b1 = 0.0f, b2 = 0.0f, b3 = 0.0f;
+  register float c0 = 0.0f, c1 = 0.0f, c2 = 0.0f, c3 = 0.0f;
+  uint32_t const unrolled_end = Len - (Len % 4);
+  uint32_t i;
+
+  for (i = 0; i < unrolled_end; i += 4) {
+    AXPYF32_UNROLLED4_LOOP;
+  }
+  // Leftover Len % 4 elements, one at a time.
+  for (; i < Len; i++) {
+    a0 = in_a[i];
+    b0 = in_b[i];
+    c0 = in_c[i];
+    asm volatile("fmadd.s %0, %1, %2, %0;" : "+&r"(c0) : "r"(a0), "r"(b0));
+    in_c[i] = c0;
+  }
+  mempool_stop_benchmark();
+  return;
+}
+
+/* Parallel AXPY: each core updates one contiguous chunk of Len / nPE */
+/* elements, then all cores meet at a barrier. */
+void axpy_f32p(float *in_a, float *in_b, float *in_c, uint32_t Len,
+               uint32_t nPE) {
+
+  uint32_t const chunk = Len / nPE;
+  uint32_t const first = mempool_get_core_id() * chunk;
+  uint32_t const last = first + chunk;
+
+  for (uint32_t idx = first; idx < last; idx++) {
+    register float a = in_a[idx];
+    register float b = in_b[idx];
+    register float c = in_c[idx];
+    asm volatile("fmadd.s %0, %1, %2, %0;" : "+&r"(c) : "r"(a), "r"(b));
+    in_c[idx] = c;
+  }
+  mempool_barrier(mempool_get_core_count());
+
+  return;
+}
+
+/* Parallel AXPY, unrolled by 4: each core updates a contiguous chunk of */
+/* Len / nPE elements; the chunk's last (chunk % 4) elements are scalar. */
+void axpy_f32p_unrolled4(float *in_a, float *in_b, float *in_c, uint32_t Len,
+                         uint32_t nPE) {
+  uint32_t num_cores = mempool_get_core_count();
+  uint32_t core_id = mempool_get_core_id();
+  uint32_t step = Len / nPE;
+  uint32_t remainder = step % 4;
+  uint32_t first = core_id * step;
+  uint32_t last = first + step;
+  uint32_t i;
+  register float a0 = 0.0f, a1 = 0.0f, a2 = 0.0f, a3 = 0.0f;
+  register float b2 = 0.0f, b1 = 0.0f, b0 = 0.0f, b3 = 0.0f;
+  register float c2 = 0.0f, c1 = 0.0f, c0 = 0.0f, c3 = 0.0f;
+
+  for (i = first; i < last - remainder; i += 4) {
+    AXPYF32_UNROLLED4_LOOP;
+  }
+  // The tail is bounded by this core's chunk end, not by `step`: comparing
+  // against `step` dropped the tail on every core except core 0.
+  for (; i < last; i++) {
+    a0 = in_a[i];
+    b0 = in_b[i];
+    c0 = in_c[i];
+    asm volatile("fmadd.s %0, %1, %2, %0;" : "+&r"(c0) : "r"(a0), "r"(b0));
+    in_c[i] = c0;
+  }
+  mempool_barrier(num_cores);
+  return;
+}
+
+/* Parallel AXPY, unrolled by 4, strided so that each core starts at element */
+/* core_id * BANKING_FACTOR and advances by NUM_BANKS ("local" accesses). */
+void axpy_f32p_local_unrolled4(float *in_a, float *in_b, float *in_c,
+                               uint32_t Len) {
+  uint32_t num_cores = mempool_get_core_count();
+  uint32_t core_id = mempool_get_core_id();
+  uint32_t const remainder = Len % BANKING_FACTOR;
+  uint32_t const idx_stop = Len - remainder;
+  register float a0 = 0.0f, a1 = 0.0f, a2 = 0.0f, a3 = 0.0f;
+  register float b2 = 0.0f, b1 = 0.0f, b0 = 0.0f, b3 = 0.0f;
+  register float c2 = 0.0f, c1 = 0.0f, c0 = 0.0f, c3 = 0.0f;
+
+  for (uint32_t i = core_id * BANKING_FACTOR; i < idx_stop; i += NUM_BANKS) {
+    AXPYF32_UNROLLED4_LOOP;
+  }
+  // Tail: the last Len % BANKING_FACTOR elements, handled by a single core.
+  // c0 is reloaded for every element so the accumulate starts from in_c[i].
+  if (core_id == ((Len % NUM_BANKS) / 4)) {
+    for (uint32_t i = idx_stop; i < Len; i++) {
+      a0 = in_a[i];
+      b0 = in_b[i];
+      c0 = in_c[i];
+      asm volatile("fmadd.s %0, %1, %2, %0;" : "+&r"(c0) : "r"(a0), "r"(b0));
+      in_c[i] = c0;
+    }
+  }
+  mempool_barrier(num_cores);
+  return;
+}
diff --git a/software/kernels/baremetal/mempool_axpy_i32p.h b/software/kernels/baremetal/mempool_axpy_i32.h
similarity index 100%
rename from software/kernels/baremetal/mempool_axpy_i32p.h
rename to software/kernels/baremetal/mempool_axpy_i32.h