diff --git a/software/apps/baremetal/dotp_f16/main.c b/software/apps/baremetal/dotp_f16/main.c
new file mode 100644
index 000000000..36a7f8f99
--- /dev/null
+++ b/software/apps/baremetal/dotp_f16/main.c
@@ -0,0 +1,78 @@
+// Copyright 2021 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Author: Marco Bertuletti, ETH Zurich
+
+#include <stdint.h>
+#include <string.h>
+
+#include "dma.h"
+#include "encoding.h"
+#include "printf.h"
+#include "runtime.h"
+#include "synchronization.h"
+
+#include "data_dotp_f16.h"
+#define NUM_BANKS (NUM_CORES * BANKING_FACTOR)
+// #define SINGLE_CORE_REDUCTION
+#define BINARY_REDUCTION
+
+// L1 buffers: input vectors, reduction flags, and fp16 partial-sum slots
+__fp16 l1_A[LEN] __attribute__((aligned(LEN), section(".l1_prio")));
+__fp16 l1_B[LEN] __attribute__((aligned(LEN), section(".l1_prio")));
+uint32_t red_barrier[NUM_BANKS]
+    __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
+__fp16 sum[2 * NUM_BANKS]
+    __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
+
+#include "baremetal/mempool_dotp_f16.h"
+
+int main() {
+  // Stage inputs in L1, time the parallel kernel, print result and reference
+  uint32_t core_id = mempool_get_core_id();
+  uint32_t num_cores = mempool_get_core_count();
+  uint32_t time_init, time_end;
+  mempool_barrier_init(core_id);
+
+  time_init = 0;
+  time_end = 0;
+  if (core_id == 0) {
+    dma_memcpy_blocking(l1_A, l2_A, LEN * sizeof(__fp16));
+    dma_memcpy_blocking(l1_B, l2_B, LEN * sizeof(__fp16));
+  }
+  for (uint32_t k = core_id; k < NUM_BANKS; k += num_cores) {
+    sum[k] = 0;
+    red_barrier[k] = 0;
+  }
+  mempool_barrier(num_cores);
+
+  //  // SINGLE-CORE
+  //  time_init = mempool_get_timer();
+  //  dotp_f16s(l1_A, l1_B, sum, LEN);
+  //  // dotp_f16s_unrolled4(l1_A, l1_B, sum, LEN);
+  //  time_end = mempool_get_timer();
+
+  //  // PARALLEL
+  //  time_init = mempool_get_timer();
+  //  dotp_f16vecp_unrolled4(l1_A, l1_B, sum, LEN, num_cores);
+  //  // dotp_f16p(l1_A, l1_B, sum, LEN, num_cores);
+  //  time_end = mempool_get_timer();
+
+  // PARALLEL, LOCAL ACCESSES
+  time_init = mempool_get_timer();
+  dotp_f16vecp_local_unrolled4(l1_A, l1_B, sum, LEN);
+  time_end = mempool_get_timer();
+  // Check results. Print only the 16-bit pattern of each value: l2_C is one
+  // __fp16, so a uint32_t read of it would run past the end of the object.
+  mempool_barrier(num_cores);
+  if (core_id == 0) {
+    uint32_t clock_cycles = (time_end - time_init);
+    printf("\nKernel execution takes %d clock cycles\n", clock_cycles);
+    printf("Result ==> %x\n", (uint32_t)(*(uint16_t *)&sum[0]));
+    printf("Check  ==> %x\n\n", (uint32_t)(*(uint16_t *)&l2_C));
+  }
+  mempool_barrier(num_cores);
+
+  return 0;
+}
diff --git a/software/apps/baremetal/dotp_f32/main.c b/software/apps/baremetal/dotp_f32/main.c
new file mode 100644
index 000000000..8c3c7e8cd
--- /dev/null
+++ b/software/apps/baremetal/dotp_f32/main.c
@@ -0,0 +1,76 @@
+// Copyright 2021 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Author: Marco Bertuletti, ETH Zurich
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "dma.h"
+#include "encoding.h"
+#include "printf.h"
+#include "runtime.h"
+#include "synchronization.h"
+
+#include "data_dotp_f32.h"
+#define NUM_BANKS (NUM_CORES * BANKING_FACTOR)
+// #define SINGLE_CORE_REDUCTION
+#define BINARY_REDUCTION
+
+// L1 buffers: input vectors, reduction flags, and float partial-sum slots
+float l1_A[LEN] __attribute__((aligned(LEN), section(".l1_prio")));
+float l1_B[LEN] __attribute__((aligned(LEN), section(".l1_prio")));
+uint32_t red_barrier[NUM_BANKS]
+    __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
+float sum[NUM_BANKS] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
+
+#include "baremetal/mempool_dotp_f32.h"
+
+int main() {
+  // Stage inputs in L1, time the parallel kernel, print result and reference
+  uint32_t core_id = mempool_get_core_id();
+  uint32_t num_cores = mempool_get_core_count();
+  uint32_t time_init, time_end;
+  mempool_barrier_init(core_id);
+
+  time_init = 0;
+  time_end = 0;
+  if (core_id == 0) {
+    dma_memcpy_blocking(l1_A, l2_A, LEN * sizeof(float));
+    dma_memcpy_blocking(l1_B, l2_B, LEN * sizeof(float));
+  }
+  for (uint32_t k = core_id; k < NUM_BANKS; k += num_cores) {
+    sum[k] = 0;
+    red_barrier[k] = 0;
+  }
+  mempool_barrier(num_cores);
+
+  //    // SINGLE-CORE
+  //    time_init = mempool_get_timer();
+  //    dotp_f32s_unrolled4(l1_A, l1_B, sum, LEN);
+  //    time_end = mempool_get_timer();
+
+  //   // PARALLEL
+  //   time_init = mempool_get_timer();
+  //   dotp_f32p(l1_A, l1_B, sum, LEN, num_cores);
+  //   time_end = mempool_get_timer();
+
+  // PARALLEL, LOCAL ACCESSES
+  time_init = mempool_get_timer();
+  dotp_f32p_local_unrolled4(l1_A, l1_B, sum, LEN);
+  time_end = mempool_get_timer();
+  // Check results. A float passed to %d is a format/argument mismatch, so
+  // print the raw 32-bit patterns in hex, as the f16 application does.
+  mempool_barrier(num_cores);
+  if (core_id == 0) {
+    uint32_t clock_cycles = (time_end - time_init);
+    printf("\nKernel execution takes %d clock cycles\n", clock_cycles);
+    printf("Result ==> %x\n", *(uint32_t *)&sum[0]);
+    printf("Check  ==> %x\n\n", *(uint32_t *)&l2_C);
+  }
+  mempool_barrier(num_cores);
+
+  return 0;
+}
diff --git a/software/apps/baremetal/dotp_i32/main.c b/software/apps/baremetal/dotp_i32/main.c
index da00a937e..441c98355 100644
--- a/software/apps/baremetal/dotp_i32/main.c
+++ b/software/apps/baremetal/dotp_i32/main.c
@@ -28,8 +28,7 @@ uint32_t red_barrier[NUM_BANKS]
     __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
 int32_t sum[NUM_BANKS] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
 
-#include "baremetal/mempool_dotp_i32p.h"
-#include "baremetal/mempool_dotp_i32s.h"
+#include "baremetal/mempool_dotp_i32.h"
 
 int main() {
 
diff --git a/software/data/data_dotp_f16.h.tpl b/software/data/data_dotp_f16.h.tpl
new file mode 100644
index 000000000..f7cacaed3
--- /dev/null
+++ b/software/data/data_dotp_f16.h.tpl
@@ -0,0 +1,24 @@
+// Copyright 2022 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+\
+<% def array_to_cstr(array):  # format the values as a C initializer list
+    out = '{'
+    i = 0
+    out += '\n'
+    for a in array:
+        out += '(__fp16){:.4f}, '.format(a)  # four decimals, cast to __fp16
+        i += 1
+        if i % 8 == 0:  # eight values per output line
+            out += '\n'
+    out = out[:-2] + '}'  # drop the last two characters, then close the list
+    return out
+%> \
+
+#define LEN (${Len})
+
+__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_A[${Len}] = ${array_to_cstr(A)};
+
+__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_B[${Len}] = ${array_to_cstr(B)};
+
+__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_C = (__fp16)${C}f;
diff --git a/software/data/data_dotp_f32.h.tpl b/software/data/data_dotp_f32.h.tpl
new file mode 100644
index 000000000..3af0fbe66
--- /dev/null
+++ b/software/data/data_dotp_f32.h.tpl
@@ -0,0 +1,24 @@
+// Copyright 2022 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+\
+<% def array_to_cstr(array):  # format the values as a C initializer list
+    out = '{'
+    i = 0
+    out += '\n'
+    for a in array:
+        out += '{}f, '.format(a)  # default formatting plus a float suffix
+        i += 1
+        if i % 8 == 0:  # eight values per output line
+            out += '\n'
+    out = out[:-2] + '}'  # drop the last two characters, then close the list
+    return out
+%> \
+
+#define LEN (${Len})
+
+float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_A[${Len}] = ${array_to_cstr(A)};
+
+float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_B[${Len}] = ${array_to_cstr(B)};
+
+float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_C = ${C}f;
diff --git a/software/data/generate_dotp.py b/software/data/generate_dotp.py
index 6bacf2488..66fc95bcc 100644
--- a/software/data/generate_dotp.py
+++ b/software/data/generate_dotp.py
@@ -22,6 +22,24 @@ def generate_dotp_i32(Len):
     C = np.dot(A, B)
     return A, B, C
 
+
+def generate_dotp_f32(Len):
+    """Return random float32 vectors A, B of length Len and C = dot(A, B)."""
+    # Draw both operand vectors with np.random.rand, then cast to float32
+    A = np.random.rand(Len).astype(np.float32)
+    B = np.random.rand(Len).astype(np.float32)
+    C = (np.dot(A, B)).astype(np.float32)
+    return A, B, C
+
+
+def generate_dotp_f16(Len):
+    """Return random float16 vectors A, B of length Len and C = dot(A, B)."""
+    # Draw both operand vectors with np.random.rand, then cast to float16
+    A = np.random.rand(Len).astype(np.float16)
+    B = np.random.rand(Len).astype(np.float16)
+    C = (np.dot(A, B)).astype(np.float16)
+    return A, B, C
+
 ##################
 # compute_result #
 ##################
@@ -72,6 +90,26 @@ def main():
         'Len': Len}
     gen_data_header_file(args.outdir, tpl, **kwargs)
 
+    A, B, C = generate_dotp_f32(Len)
+    tpl = pathlib.Path(__file__).parent.absolute() / "data_dotp_f32.h.tpl"
+    kwargs = {
+        'name': 'data_dotp_f32',
+        'A': A,
+        'B': B,
+        'C': C,
+        'Len': Len}
+    gen_data_header_file(args.outdir, tpl, **kwargs)
+
+    A, B, C = generate_dotp_f16(Len)
+    tpl = pathlib.Path(__file__).parent.absolute() / "data_dotp_f16.h.tpl"
+    kwargs = {
+        'name': 'data_dotp_f16',
+        'A': A,
+        'B': B,
+        'C': C,
+        'Len': Len}
+    gen_data_header_file(args.outdir, tpl, **kwargs)
+
 
 if __name__ == "__main__":
     main()
diff --git a/software/kernels/baremetal/mempool_dotp_f16.h b/software/kernels/baremetal/mempool_dotp_f16.h
new file mode 100644
index 000000000..17d13df24
--- /dev/null
+++ b/software/kernels/baremetal/mempool_dotp_f16.h
@@ -0,0 +1,222 @@
+// Copyright 2021 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Author: Marco Bertuletti, ETH Zurich
+
+#pragma once
+#include "builtins_v2.h"
+
+#define DOTPF16VEC_UNROLLED4_LOOP                                              \
+  { /* Load in_a/in_b[i..i+3] as two v2h pairs, then 2x vfdotpex.s.h */        \
+    a01 = (*(v2h *)&in_a[i]);                                                  \
+    a23 = (*(v2h *)&in_a[i + 2]);                                              \
+    b01 = (*(v2h *)&in_b[i]);                                                  \
+    b23 = (*(v2h *)&in_b[i + 2]);                                              \
+    asm volatile(                                                              \
+        "vfdotpex.s.h %[local_sum0], %[a01], %[b01];"                          \
+        "vfdotpex.s.h %[local_sum1], %[a23], %[b23];"                          \
+        : [local_sum0] "+&r"(local_sum0), [local_sum1] "+&r"(local_sum1)       \
+        : [a01] "r"(a01), [a23] "r"(a23), [b01] "r"(b01), [b23] "r"(b23));     \
+  }
+
+/* Single core reduction */
+void mempool_reduction_f16(__fp16 *sum, uint32_t num_cores) {
+  // sum holds one fp16 partial per core; the total is written to sum[0]
+  // The last core to the reduction barrier sums the partial reductions
+  if ((num_cores - 1) ==
+      __atomic_fetch_add(&red_barrier[0], 1, __ATOMIC_RELAXED)) {
+    // Callers store core c's partial at sum[2 * c * BANKING_FACTOR], so the
+    // scan has to cover all 2 * NUM_BANKS fp16 slots, not just NUM_BANKS
+    uint32_t idx_red = 0;
+    __fp16 local_sum = (__fp16)0.0f;
+    while (idx_red < 2 * NUM_BANKS) {
+      asm volatile("fadd.h %0, %0, %1;" : "+&r"(local_sum) : "r"(sum[idx_red]));
+      idx_red += 2 * BANKING_FACTOR;
+    }
+    sum[0] = local_sum;
+    // Re-arm the arrival counter before waking the waiting cores
+    __atomic_store_n(&red_barrier[0], 0, __ATOMIC_RELAXED);
+    __sync_synchronize();
+    wake_up_all();
+  }
+  mempool_wfi();
+
+  return;
+}
+
+/* Binary tree reduction */
+void mempool_binary_reduction_f16(__fp16 *sum, uint32_t core_id,
+                                  uint32_t num_cores) {
+  // Merges the per-core fp16 partials in sum pairwise, one tree level per pass
+  uint32_t idx, step = 2, previous_step = 1;
+  while (num_cores > 1) {
+    idx = (step * (core_id / step)) * BANKING_FACTOR;
+    // idx: bank offset of the first core in this core's group of `step` cores
+    if (__atomic_fetch_add(&red_barrier[idx + previous_step - 1], 1,
+                           __ATOMIC_RELAXED)) {
+      // Old count was non-zero: the other half of the group arrived first
+      // Reduction: fold the upper half's partial into the lower half's slot
+      __fp16 add = sum[2 * (idx + previous_step * BANKING_FACTOR)];
+      asm volatile("fadd.h %0, %0, %1;" : "+&r"(sum[2 * idx]) : "r"(add));
+
+      // Next level of binary tree: clear the flag, halve the remaining groups
+      __atomic_store_n(&red_barrier[idx + previous_step - 1], 0,
+                       __ATOMIC_RELAXED);
+      num_cores = num_cores / 2;
+      previous_step = step;
+      step = step * 2;
+
+    } else {
+      // First arrival at this level: stop climbing and go to sleep
+      break;
+    }
+  }
+
+  // Only the core that finished every level sees num_cores == 1; it wakes all
+  if (num_cores == 1) {
+    __sync_synchronize();
+    wake_up_all();
+  }
+  mempool_wfi();
+
+  return;
+}
+
+/* Single-core dot-product */
+void dotp_f16s(__fp16 *in_a, __fp16 *in_b, __fp16 *s, uint32_t Len) {
+  // Core 0 accumulates in fp16 into s[0]; every core joins the final barrier
+  uint32_t core_id = mempool_get_core_id();
+  uint32_t num_cores = mempool_get_core_count();
+  if (core_id == 0) {
+    mempool_start_benchmark();
+    // Kernel execution: the do-while reads one pair even when Len is 0
+    __fp16 local_sum = (__fp16)0.0f;
+    __fp16 *end = in_a + Len;
+    do {
+      asm volatile("fmadd.h %0, %1, %2, %0;"
+                   : "+&r"(local_sum)
+                   : "r"(*in_a), "r"(*in_b));
+      in_a++;
+      in_b++;
+    } while (in_a < end);
+    s[0] = local_sum;
+    mempool_stop_benchmark();
+  }
+  mempool_barrier(num_cores);
+
+  return;
+}
+
+/* Single-core dot-product unrolled4 */
+void dotp_f16s_unrolled4(__fp16 *in_a, __fp16 *in_b, __fp16 *s, uint32_t Len) {
+  // Core 0 only; Len is consumed 4 elements per step with no tail loop
+  uint32_t core_id = mempool_get_core_id();
+  uint32_t num_cores = mempool_get_core_count();
+  if (core_id == 0) {
+    mempool_start_benchmark();
+    uint32_t i = 0;
+    // Each v2h operand carries two fp16 elements; the two sums are floats
+    v2h a01, a23;
+    v2h b01, b23;
+    float local_sum0 = 0.0f;
+    float local_sum1 = 0.0f;
+
+    for (i = 0; i < Len; i += 4) {
+      DOTPF16VEC_UNROLLED4_LOOP;
+    }
+    // Reduction: fadd.s merges both sums, fcvt.h.s then converts in place
+    asm volatile(
+        "fadd.s   %[local_sum0], %[local_sum0], %[local_sum1];"
+        "fcvt.h.s %[local_sum0], %[local_sum0];"
+        : [local_sum0] "+&r"(local_sum0), [local_sum1] "+&r"(local_sum1)
+        :);
+    s[0] = *(__fp16 *)&local_sum0;
+    mempool_stop_benchmark();
+  }
+  mempool_barrier(num_cores);
+
+  return;
+}
+
+/* Parallel dot-product */
+void dotp_f16p(__fp16 *in_a, __fp16 *in_b, __fp16 *s, uint32_t Len,
+               uint32_t nPE) {
+  // Each core takes Len / nPE contiguous elements; a remainder is skipped
+  uint32_t core_id = mempool_get_core_id();
+  uint32_t step = Len / nPE;
+  __fp16 local_sum = (__fp16)0.0f;
+  __fp16 a, b;
+  for (uint32_t i = core_id * step; i < core_id * step + step; i++) {
+    a = in_a[i];
+    b = in_b[i];
+    asm volatile("fmadd.h %0, %1, %2, %0;" : "+&r"(local_sum) : "r"(a), "r"(b));
+  }
+  s[2 * core_id * BANKING_FACTOR] = local_sum;
+  // Partial stored above; the last core to arrive adds them up into s[0]
+  uint32_t num_cores = mempool_get_core_count();
+  mempool_reduction_f16(s, num_cores);
+
+  return;
+}
+
+/* Parallel dot-product with loop unrolling */
+void dotp_f16vecp_unrolled4(__fp16 *in_a, __fp16 *in_b, __fp16 *s, uint32_t Len,
+                            uint32_t nPE) {
+  // Each core takes Len / nPE contiguous elements, 4 per step, no tail loop
+  uint32_t core_id = mempool_get_core_id();
+  uint32_t step = Len / nPE;
+  uint32_t i;
+  // Each v2h operand carries two fp16 elements; the two sums are floats
+  v2h a01, a23;
+  v2h b01, b23;
+  float local_sum0 = 0.0f;
+  float local_sum1 = 0.0f;
+
+  for (i = core_id * step; i < core_id * step + step; i += 4) {
+    DOTPF16VEC_UNROLLED4_LOOP;
+  }
+  asm volatile("fadd.s   %[local_sum0], %[local_sum0], %[local_sum1];"
+               "fcvt.h.s %[local_sum0], %[local_sum0];"
+               : [local_sum0] "+&r"(local_sum0), [local_sum1] "+&r"(local_sum1)
+               :);
+  s[2 * core_id * BANKING_FACTOR] = *(__fp16 *)&local_sum0;
+  uint32_t num_cores = mempool_get_core_count();
+  mempool_reduction_f16(s, num_cores);
+
+  return;
+}
+
+/* Parallel dot-product with loop unrolling */
+/* Loads and stores only in local memory (as stated by the author) */
+void dotp_f16vecp_local_unrolled4(__fp16 *in_a, __fp16 *in_b, __fp16 *s,
+                                  uint32_t Len) {
+  // Core c handles the 4 elements at c * BANKING_FACTOR + k * NUM_BANKS
+  uint32_t core_id = mempool_get_core_id();
+  // NOTE(review): confirm this index map keeps 2-byte fp16 loads bank-local
+  v2h a01, a23;
+  v2h b01, b23;
+  float local_sum0 = 0.0f;
+  float local_sum1 = 0.0f;
+  for (uint32_t i = core_id * BANKING_FACTOR; i < Len; i += NUM_BANKS) {
+    DOTPF16VEC_UNROLLED4_LOOP;
+  }
+  asm volatile("fadd.s   %[local_sum0], %[local_sum0], %[local_sum1];"
+               "fcvt.h.s %[local_sum0], %[local_sum0];"
+               : [local_sum0] "+&r"(local_sum0), [local_sum1] "+&r"(local_sum1)
+               :);
+  s[2 * core_id * BANKING_FACTOR] = *(__fp16 *)&local_sum0;
+  // If neither reduction macro is defined, the partials are left unreduced
+// The last core to the reduction barrier sums the partial reductions
+#if defined(SINGLE_CORE_REDUCTION)
+  uint32_t num_cores = mempool_get_core_count();
+  mempool_reduction_f16(s, num_cores);
+// A) Cores store locally in sum array
+// B) Partial sums are reduced logarithmically
+#elif defined(BINARY_REDUCTION)
+  uint32_t num_cores = mempool_get_core_count();
+  mempool_binary_reduction_f16(s, core_id, num_cores);
+#endif
+
+  return;
+}
diff --git a/software/kernels/baremetal/mempool_dotp_f32.h b/software/kernels/baremetal/mempool_dotp_f32.h
new file mode 100644
index 000000000..58fa0e9d5
--- /dev/null
+++ b/software/kernels/baremetal/mempool_dotp_f32.h
@@ -0,0 +1,267 @@
+// Copyright 2021 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Author: Marco Bertuletti, ETH Zurich
+
+#define DOTPF32_UNROLLED4_LOOP                                                 \
+  { /* 4 loads per operand, then 4 independent fmadd.s updates */              \
+    a0 = in_a[i];                                                              \
+    b0 = in_b[i];                                                              \
+    a1 = in_a[i + 1];                                                          \
+    b1 = in_b[i + 1];                                                          \
+    a2 = in_a[i + 2];                                                          \
+    b2 = in_b[i + 2];                                                          \
+    a3 = in_a[i + 3];                                                          \
+    b3 = in_b[i + 3];                                                          \
+    asm volatile(                                                              \
+        "fmadd.s %[local_sum0], %[a0], %[b0], %[local_sum0];"                  \
+        "fmadd.s %[local_sum1], %[a1], %[b1], %[local_sum1];"                  \
+        "fmadd.s %[local_sum2], %[a2], %[b2], %[local_sum2];"                  \
+        "fmadd.s %[local_sum3], %[a3], %[b3], %[local_sum3];"                  \
+        : [local_sum0] "+&r"(local_sum0), [local_sum1] "+&r"(local_sum1),      \
+          [local_sum2] "+&r"(local_sum2), [local_sum3] "+&r"(local_sum3)       \
+        : [a0] "r"(a0), [a1] "r"(a1), [a2] "r"(a2), [a3] "r"(a3),              \
+          [b0] "r"(b0), [b1] "r"(b1), [b2] "r"(b2), [b3] "r"(b3));             \
+  }
+
+/* Single core reduction */
+void mempool_reduction_f32(float *sum, uint32_t num_cores) {
+  // sum holds partials at stride BANKING_FACTOR; the total goes to sum[0]
+  // The last core to the reduction barrier sums the partial reductions
+  if ((num_cores - 1) ==
+      __atomic_fetch_add(&red_barrier[0], 1, __ATOMIC_RELAXED)) {
+    // Old count of num_cores - 1 means every other core already arrived
+    // Reduction: walk all NUM_BANKS slots, one partial per BANKING_FACTOR
+    uint32_t idx_red = 0;
+    float local_sum = 0.0f;
+    while (idx_red < NUM_BANKS) {
+      asm volatile("fadd.s %0, %0, %1;" : "+&r"(local_sum) : "r"(sum[idx_red]));
+      idx_red += BANKING_FACTOR;
+    }
+    sum[0] = local_sum;
+    // Re-arm the arrival counter before waking the waiting cores
+    __atomic_store_n(&red_barrier[0], 0, __ATOMIC_RELAXED);
+    __sync_synchronize();
+    wake_up_all();
+  }
+  mempool_wfi();
+
+  return;
+}
+
+/* Binary tree reduction */
+void mempool_binary_reduction_f32(float *sum, uint32_t core_id,
+                                  uint32_t num_cores) {
+  // Merges the per-core partials in sum pairwise, one tree level per pass
+  uint32_t idx, step = 2, previous_step = 1;
+  while (num_cores > 1) {
+    idx = (step * (core_id / step)) * BANKING_FACTOR;
+    // idx: bank offset of the first core in this core's group of `step` cores
+    if (__atomic_fetch_add(&red_barrier[idx + previous_step - 1], 1,
+                           __ATOMIC_RELAXED)) {
+      // Old count was non-zero: the other half of the group arrived first
+      // Reduction: fold the upper half's partial into the lower half's slot
+      float add = sum[idx + previous_step * BANKING_FACTOR];
+      asm volatile("fadd.s %0, %0, %1;" : "+&r"(sum[idx]) : "r"(add));
+
+      // Next level of binary tree: clear the flag, halve the remaining groups
+      __atomic_store_n(&red_barrier[idx + previous_step - 1], 0,
+                       __ATOMIC_RELAXED);
+      num_cores = num_cores / 2;
+      previous_step = step;
+      step = step * 2;
+
+    } else {
+      // First arrival at this level: stop climbing and go to sleep
+      break;
+    }
+  }
+
+  // Only the core that finished every level sees num_cores == 1; it wakes all
+  if (num_cores == 1) {
+    __sync_synchronize();
+    wake_up_all();
+  }
+  mempool_wfi();
+
+  return;
+}
+
+/* Single-core dot-product */
+void dotp_f32s(float *in_a, float *in_b, float *s, uint32_t Len) {
+  // Core 0 writes the result to *s; other cores return at once (no barrier)
+  uint32_t core_id = mempool_get_core_id();
+  if (core_id == 0) {
+    mempool_start_benchmark();
+    // Kernel execution: the do-while reads one pair even when Len is 0
+    register float local_sum = 0;
+    float *end = in_a + Len;
+    do {
+      asm volatile("fmadd.s %0, %1, %2, %0;"
+                   : "+&r"(local_sum)
+                   : "r"(*in_a), "r"(*in_b));
+      in_a++;
+      in_b++;
+    } while (in_a < end);
+    *s = local_sum;
+    mempool_stop_benchmark();
+  }
+
+  return;
+}
+
+/* Single-core dot-product unrolled4 */
+void dotp_f32s_unrolled4(float *in_a, float *in_b, float *s, uint32_t Len) {
+  // Core 0 writes the result to *s; other cores return at once (no barrier)
+  uint32_t core_id = mempool_get_core_id();
+  if (core_id == 0) {
+    mempool_start_benchmark();
+    uint32_t reminder = Len % 4;
+    uint32_t i = 0;
+    // Four independent accumulators, one per unrolled lane
+    register float a0 = 0.0f, a1 = 0.0f, a2 = 0.0f, a3 = 0.0f;
+    register float b2 = 0.0f, b1 = 0.0f, b0 = 0.0f, b3 = 0.0f;
+    register float local_sum0 = 0.0f;
+    register float local_sum1 = 0.0f;
+    register float local_sum2 = 0.0f;
+    register float local_sum3 = 0.0f;
+    // 4-wide body over Len - Len % 4 elements, then a scalar tail into sum0
+    for (i = 0; i < (Len - reminder); i += 4) {
+      DOTPF32_UNROLLED4_LOOP;
+    }
+    while (i < Len) {
+      a0 = in_a[i];
+      b0 = in_b[i];
+      asm volatile("fmadd.s %0, %1, %2, %0;"
+                   : "+&r"(local_sum0)
+                   : "r"(a0), "r"(b0));
+      i++;
+    }
+    // Reduction: (sum0 + sum1) + (sum2 + sum3), result left in local_sum0
+    asm volatile(
+        "fadd.s %[local_sum0], %[local_sum0], %[local_sum1];"
+        "fadd.s %[local_sum2], %[local_sum2], %[local_sum3];"
+        "fadd.s %[local_sum0], %[local_sum0], %[local_sum2];"
+        : [local_sum0] "+&r"(local_sum0), [local_sum1] "+&r"(local_sum1),
+          [local_sum2] "+&r"(local_sum2), [local_sum3] "+&r"(local_sum3)
+        :);
+    *s = local_sum0;
+    mempool_stop_benchmark();
+  }
+
+  return;
+}
+
+/* Parallel dot-product */
+void dotp_f32p(float *in_a, float *in_b, float *s, uint32_t Len, uint32_t nPE) {
+  // Each core takes Len / nPE contiguous elements; a remainder is skipped
+  uint32_t core_id = mempool_get_core_id();
+  uint32_t step = Len / nPE;
+  register float local_sum = 0;
+  register float a, b;
+  for (uint32_t i = core_id * step; i < core_id * step + step; i++) {
+    a = in_a[i];
+    b = in_b[i];
+    asm volatile("fmadd.s %0, %1, %2, %0;" : "+&r"(local_sum) : "r"(a), "r"(b));
+  }
+  s[core_id * BANKING_FACTOR] = local_sum;
+  // Partial stored above; the last core to arrive adds them up into s[0]
+  uint32_t num_cores = mempool_get_core_count();
+  mempool_reduction_f32(s, num_cores);
+
+  return;
+}
+
+/* Parallel dot-product with loop unrolling */
+void dotp_f32p_unrolled4(float *in_a, float *in_b, float *s, uint32_t Len,
+                         uint32_t nPE) {
+  // Core c covers [c * step, (c + 1) * step): 4-wide body plus scalar tail
+  uint32_t core_id = mempool_get_core_id();
+  uint32_t step = Len / nPE;
+  uint32_t reminder = step % 4;
+  uint32_t i;
+
+  register float a0 = 0.0f, a1 = 0.0f, a2 = 0.0f, a3 = 0.0f;
+  register float b2 = 0.0f, b1 = 0.0f, b0 = 0.0f, b3 = 0.0f;
+  register float local_sum0 = 0.0f;
+  register float local_sum1 = 0.0f;
+  register float local_sum2 = 0.0f;
+  register float local_sum3 = 0.0f;
+  // 4-wide body first; the scalar tail then runs up to this chunk's end
+  for (i = core_id * step; i < (core_id * step + step) - reminder; i += 4) {
+    DOTPF32_UNROLLED4_LOOP;
+  }
+  i = core_id * step + step - reminder;
+  while (i < core_id * step + step) {
+    a0 = in_a[i];
+    b0 = in_b[i];
+    asm volatile("fmadd.s %0, %1, %2, %0;"
+                 : "+&r"(local_sum0)
+                 : "r"(a0), "r"(b0));
+    i++;
+  }
+  asm volatile("fadd.s %[local_sum0], %[local_sum0], %[local_sum1];"
+               "fadd.s %[local_sum2], %[local_sum2], %[local_sum3];"
+               "fadd.s %[local_sum0], %[local_sum0], %[local_sum2];"
+               : [local_sum0] "+&r"(local_sum0), [local_sum1] "+&r"(local_sum1),
+                 [local_sum2] "+&r"(local_sum2), [local_sum3] "+&r"(local_sum3)
+               :);
+  s[core_id * BANKING_FACTOR] = local_sum0;
+  uint32_t num_cores = mempool_get_core_count();
+  mempool_reduction_f32(s, num_cores);
+
+  return;
+}
+
+/* Parallel dot-product with loop unrolling */
+/* Loads and stores only in local memory */
+void dotp_f32p_local_unrolled4(float *in_a, float *in_b, float *s,
+                               uint32_t Len) {
+  // Core c handles the 4-element groups at c * BANKING_FACTOR + k * NUM_BANKS
+  uint32_t core_id = mempool_get_core_id();
+  uint32_t const remainder = Len % BANKING_FACTOR;
+  uint32_t const idx_stop = Len - remainder;
+  // The core whose group index matches the tail picks up the remainder
+  register float a0 = 0.0f, a1 = 0.0f, a2 = 0.0f, a3 = 0.0f;
+  register float b2 = 0.0f, b1 = 0.0f, b0 = 0.0f, b3 = 0.0f;
+  register float local_sum0 = 0.0f;
+  register float local_sum1 = 0.0f;
+  register float local_sum2 = 0.0f;
+  register float local_sum3 = 0.0f;
+  // NOTE(review): body consumes 4 elements, i.e. assumes BANKING_FACTOR == 4
+  for (uint32_t i = core_id * BANKING_FACTOR; i < idx_stop; i += NUM_BANKS) {
+    DOTPF32_UNROLLED4_LOOP;
+  }
+  if (core_id == ((Len % NUM_BANKS) / BANKING_FACTOR)) {
+    for (uint32_t i = Len - remainder; i < Len; i++) {
+      a0 = in_a[i];
+      b0 = in_b[i];
+      asm volatile("fmadd.s %0, %1, %2, %0;"
+                   : "+&r"(local_sum0)
+                   : "r"(a0), "r"(b0));
+    }
+  }
+  asm volatile("fadd.s %[local_sum0], %[local_sum0], %[local_sum1];"
+               "fadd.s %[local_sum2], %[local_sum2], %[local_sum3];"
+               "fadd.s %[local_sum0], %[local_sum0], %[local_sum2];"
+               : [local_sum0] "+&r"(local_sum0), [local_sum1] "+&r"(local_sum1),
+                 [local_sum2] "+&r"(local_sum2), [local_sum3] "+&r"(local_sum3)
+               :);
+  s[core_id * BANKING_FACTOR] = local_sum0;
+  // If neither reduction macro is defined, the partials are left unreduced
+// The last core to the reduction barrier sums the partial reductions
+#if defined(SINGLE_CORE_REDUCTION)
+  uint32_t num_cores = mempool_get_core_count();
+  mempool_reduction_f32(s, num_cores);
+
+// A) Cores store locally in sum array
+// B) Partial sums are reduced logarithmically
+#elif defined(BINARY_REDUCTION)
+  uint32_t num_cores = mempool_get_core_count();
+  mempool_binary_reduction_f32(s, core_id, num_cores);
+
+#endif
+
+  return;
+}
diff --git a/software/kernels/baremetal/mempool_dotp_i32p.h b/software/kernels/baremetal/mempool_dotp_i32.h
similarity index 63%
rename from software/kernels/baremetal/mempool_dotp_i32p.h
rename to software/kernels/baremetal/mempool_dotp_i32.h
index 26fbe03e9..4b80e92ed 100644
--- a/software/kernels/baremetal/mempool_dotp_i32p.h
+++ b/software/kernels/baremetal/mempool_dotp_i32.h
@@ -4,6 +4,115 @@
 
 // Author: Marco Bertuletti, ETH Zurich
 
+#define DOTPI32_UNROLLED4_LOOP                                                 \
+  { /* 4 loads per operand, 4 independent multiply-accumulates */              \
+    a0 = in_a[i];                                                              \
+    b0 = in_b[i];                                                              \
+    a1 = in_a[i + 1];                                                          \
+    b1 = in_b[i + 1];                                                          \
+    a2 = in_a[i + 2];                                                          \
+    b2 = in_b[i + 2];                                                          \
+    a3 = in_a[i + 3];                                                          \
+    b3 = in_b[i + 3];                                                          \
+    local_sum0 += a0 * b0;                                                     \
+    local_sum1 += a1 * b1;                                                     \
+    local_sum2 += a2 * b2;                                                     \
+    local_sum3 += a3 * b3;                                                     \
+  }
+
+/* Binary tree reduction */
+void mempool_binary_reduction_i32(int32_t *sum, uint32_t core_id,
+                                  uint32_t num_cores) {
+  // Merges the per-core partials in sum pairwise, one tree level per pass
+  uint32_t idx, step = 2, previous_step = 1;
+  while (num_cores > 1) {
+    idx = (step * (core_id / step)) * BANKING_FACTOR;
+    // idx: bank offset of the first core in this core's group of `step` cores
+    if (__atomic_fetch_add(&red_barrier[idx + previous_step - 1], 1,
+                           __ATOMIC_RELAXED)) {
+      // Old count was non-zero: the other half of the group arrived first
+      // Reduction: fold the upper half's partial into the lower half's slot
+      sum[idx] += sum[idx + previous_step * BANKING_FACTOR];
+
+      // Next level of binary tree: clear the flag, halve the remaining groups
+      __atomic_store_n(&red_barrier[idx + previous_step - 1], 0,
+                       __ATOMIC_RELAXED);
+      num_cores = num_cores / 2;
+      previous_step = step;
+      step = step * 2;
+
+    } else {
+      // First arrival at this level: stop climbing and go to sleep
+      break;
+    }
+  }
+
+  // Only the core that finished every level sees num_cores == 1; it wakes all
+  if (num_cores == 1) {
+    __sync_synchronize();
+    wake_up_all();
+  }
+  mempool_wfi();
+
+  return;
+}
+
+/* Single-core dot-product */
+void dotp_i32s(int32_t *in_a, int32_t *in_b, int32_t *s, uint32_t Len) {
+  // Core 0 writes the result to *s; other cores return at once (no barrier)
+  uint32_t core_id = mempool_get_core_id();
+  if (core_id == 0) {
+    mempool_start_benchmark();
+    // Kernel execution: the do-while reads one pair even when Len is 0
+    register int32_t local_sum = 0;
+    int32_t *end = in_a + Len;
+    do {
+      local_sum += ((*in_a++) * (*in_b++));
+    } while (in_a < end);
+    *s = local_sum;
+    mempool_stop_benchmark();
+  }
+
+  return;
+}
+
+/* Single-core dot-product unrolled4 */
+void dotp_i32s_unrolled4(int32_t *in_a, int32_t *in_b, int32_t *s,
+                         uint32_t Len) {
+  // Core 0 writes the result to *s; other cores return at once (no barrier)
+  uint32_t core_id = mempool_get_core_id();
+  if (core_id == 0) {
+    mempool_start_benchmark();
+    uint32_t reminder = Len % 4;
+    uint32_t i = 0;
+    // Four independent accumulators, one per unrolled lane
+    register int32_t a0 = 0, a1 = 0, a2 = 0, a3 = 0;
+    register int32_t b2 = 0, b1 = 0, b0 = 0, b3 = 0;
+    register int32_t local_sum0 = 0;
+    register int32_t local_sum1 = 0;
+    register int32_t local_sum2 = 0;
+    register int32_t local_sum3 = 0;
+    // 4-wide body over Len - Len % 4 elements, then a scalar tail into sum0
+    for (i = 0; i < (Len - reminder); i += 4) {
+      DOTPI32_UNROLLED4_LOOP;
+    }
+    while (i < Len) {
+      a0 = in_a[i];
+      b0 = in_b[i];
+      local_sum0 += a0 * b0;
+      i++;
+    }
+    // Reduction: (sum0 + sum1) + (sum2 + sum3), result left in local_sum0
+    local_sum0 += local_sum1;
+    local_sum2 += local_sum3;
+    local_sum0 += local_sum2;
+    *s = local_sum0;
+    mempool_stop_benchmark();
+  }
+
+  return;
+}
+
 /* Parallel dot-product */
 void dotp_i32p(int32_t *in_a, int32_t *in_b, int32_t *s, uint32_t Len,
                uint32_t nPE) {
@@ -18,12 +127,14 @@ void dotp_i32p(int32_t *in_a, int32_t *in_b, int32_t *s, uint32_t Len,
     local_sum += a * b;
   }
   __atomic_fetch_add(&s[0], local_sum, __ATOMIC_RELAXED);
+
 #ifdef LOG_BARRIERS
   mempool_log_barrier(2, core_id);
 #else
   uint32_t num_cores = mempool_get_core_count();
   mempool_barrier(num_cores);
 #endif
+
   return;
 }
 
@@ -42,19 +153,9 @@ void dotp_i32p_unrolled4(int32_t *in_a, int32_t *in_b, int32_t *s, uint32_t Len,
   register int32_t local_sum1 = 0;
   register int32_t local_sum2 = 0;
   register int32_t local_sum3 = 0;
+
   for (i = core_id * step; i < (core_id * step + step) - reminder; i += 4) {
-    a0 = in_a[i];
-    b0 = in_b[i];
-    a1 = in_a[i + 1];
-    b1 = in_b[i + 1];
-    a2 = in_a[i + 2];
-    b2 = in_b[i + 2];
-    a3 = in_a[i + 3];
-    b3 = in_b[i + 3];
-    local_sum0 += a0 * b0;
-    local_sum1 += a1 * b1;
-    local_sum2 += a2 * b2;
-    local_sum3 += a3 * b3;
+    DOTPI32_UNROLLED4_LOOP;
   }
   i = core_id * step + step - reminder;
   while (i < step) {
@@ -67,48 +168,13 @@ void dotp_i32p_unrolled4(int32_t *in_a, int32_t *in_b, int32_t *s, uint32_t Len,
   local_sum2 += local_sum3;
   local_sum0 += local_sum2;
   __atomic_fetch_add(&s[0], local_sum0, __ATOMIC_RELAXED);
+
 #ifdef LOG_BARRIERS
   mempool_log_barrier(2, core_id);
 #else
   uint32_t num_cores = mempool_get_core_count();
   mempool_barrier(num_cores);
 #endif
-  return;
-}
-
-/* Bynary tree reduction */
-void mempool_binary_reduction(int32_t *sum, uint32_t core_id,
-                              uint32_t num_cores) {
-
-  uint32_t idx, step = 2, previous_step = 1;
-  while (num_cores > 1) {
-    idx = (step * (core_id / step)) * BANKING_FACTOR;
-    // dump_prova(idx);
-    if (__atomic_fetch_add(&red_barrier[idx + previous_step - 1], 1,
-                           __ATOMIC_RELAXED)) {
-
-      // Reduction
-      sum[idx] += sum[idx + previous_step * BANKING_FACTOR];
-
-      // Next level of binary tree
-      __atomic_store_n(&red_barrier[idx + previous_step - 1], 0,
-                       __ATOMIC_RELAXED);
-      num_cores = num_cores / 2;
-      previous_step = step;
-      step = step * 2;
-
-    } else {
-      // Goes to sleep
-      break;
-    }
-  }
-
-  // Last core wakes everyone
-  if (num_cores == 1) {
-    __sync_synchronize();
-    wake_up_all();
-  }
-  mempool_wfi();
 
   return;
 }
@@ -120,7 +186,7 @@ void dotp_i32p_local_unrolled4(int32_t *in_a, int32_t *in_b, int32_t *s,
                                uint32_t Len) {
 
   uint32_t core_id = mempool_get_core_id();
-  uint32_t const remainder = Len % 4;
+  uint32_t const remainder = Len % BANKING_FACTOR;
   uint32_t const idx_stop = Len - remainder;
 
   register int32_t a0 = 0, a1 = 0, a2 = 0, a3 = 0;
@@ -130,7 +196,7 @@ void dotp_i32p_local_unrolled4(int32_t *in_a, int32_t *in_b, int32_t *s,
   register int32_t local_sum2 = 0;
   register int32_t local_sum3 = 0;
 
-  for (uint32_t i = core_id * 4; i < idx_stop; i += NUM_BANKS) {
+  for (uint32_t i = core_id * BANKING_FACTOR; i < idx_stop; i += NUM_BANKS) {
     a0 = in_a[i];
     b0 = in_b[i];
     a1 = in_a[i + 1];
@@ -187,8 +253,8 @@ void dotp_i32p_local_unrolled4(int32_t *in_a, int32_t *in_b, int32_t *s,
 // B) Partial sums are reduced logarithmically
 #elif defined(BINARY_REDUCTION)
   uint32_t num_cores = mempool_get_core_count();
-  s[core_id * 4] = local_sum0;
-  mempool_binary_reduction(s, core_id, num_cores);
+  s[core_id * BANKING_FACTOR] = local_sum0;
+  mempool_binary_reduction_i32(s, core_id, num_cores);
 
 #endif
 
diff --git a/software/kernels/baremetal/mempool_dotp_i32s.h b/software/kernels/baremetal/mempool_dotp_i32s.h
deleted file mode 100644
index dd562debb..000000000
--- a/software/kernels/baremetal/mempool_dotp_i32s.h
+++ /dev/null
@@ -1,69 +0,0 @@
-// Copyright 2021 ETH Zurich and University of Bologna.
-// Licensed under the Apache License, Version 2.0, see LICENSE for details.
-// SPDX-License-Identifier: Apache-2.0
-
-// Author: Marco Bertuletti, ETH Zurich
-
-/* Single-core dot-product */
-void dotp_i32s(int32_t *in_a, int32_t *in_b, int32_t *s, uint32_t Len) {
-
-  uint32_t core_id = mempool_get_core_id();
-  uint32_t num_cores = mempool_get_core_count();
-  if (core_id == 0) {
-    mempool_start_benchmark();
-    // Kernel execution
-    register int32_t local_sum = 0;
-    int32_t *end = in_a + Len;
-    do {
-      local_sum += ((*in_a++) * (*in_b++));
-    } while (in_a < end);
-    *s = local_sum;
-    mempool_stop_benchmark();
-  }
-  mempool_barrier(num_cores);
-}
-
-/* Single-core dot-product unrolled4 */
-void dotp_i32s_unrolled4(int32_t *in_a, int32_t *in_b, int32_t *s,
-                         uint32_t Len) {
-
-  uint32_t core_id = mempool_get_core_id();
-  uint32_t num_cores = mempool_get_core_count();
-  if (core_id == 0) {
-    mempool_start_benchmark();
-    uint32_t reminder = Len % 4;
-    uint32_t i = 0;
-    int32_t a0 = 0, b0 = 0, a1 = 0, b1 = 0, a2 = 0, b2 = 0, a3 = 0, b3 = 0;
-    register int32_t local_sum_1 = 0;
-    register int32_t local_sum_2 = 0;
-    register int32_t local_sum_3 = 0;
-    register int32_t local_sum_4 = 0;
-    for (i = 0; i < (Len - reminder); i += 4) {
-      a0 = in_a[i];
-      b0 = in_b[i];
-      a1 = in_a[i + 1];
-      b1 = in_b[i + 1];
-      a2 = in_a[i + 2];
-      b2 = in_b[i + 2];
-      a3 = in_a[i + 3];
-      b3 = in_b[i + 3];
-      local_sum_1 += a0 * b0;
-      local_sum_2 += a1 * b1;
-      local_sum_3 += a2 * b2;
-      local_sum_4 += a3 * b3;
-    }
-    while (i < Len) {
-      a0 = in_a[i];
-      b0 = in_b[i];
-      local_sum_1 += a0 * b0;
-      i++;
-    }
-    // Reduction
-    local_sum_1 += local_sum_2;
-    local_sum_3 += local_sum_4;
-    local_sum_1 += local_sum_3;
-    *s = local_sum_1;
-    mempool_stop_benchmark();
-  }
-  mempool_barrier(num_cores);
-}