From bf0e68d2347f6772c70aa0f0a00056139f46d309 Mon Sep 17 00:00:00 2001 From: mbertuletti Date: Mon, 2 Sep 2024 13:09:31 +0200 Subject: [PATCH] [software] Add f32 and f16 axpy app --- software/apps/baremetal/Makefile | 14 +- software/apps/baremetal/axpy_f16/main.c | 69 ++++++++ software/apps/baremetal/axpy_f32/main.c | 62 +++++++ software/apps/baremetal/axpy_i32/main.c | 2 +- software/data/data_axpy_f16.h.tpl | 26 +++ software/data/data_axpy_f32.h.tpl | 26 +++ software/data/generate_dotp.py | 44 ++++- software/kernels/baremetal/mempool_axpy_f16.h | 124 +++++++++++++ software/kernels/baremetal/mempool_axpy_f32.h | 165 ++++++++++++++++++ ...mempool_axpy_i32p.h => mempool_axpy_i32.h} | 0 10 files changed, 528 insertions(+), 4 deletions(-) create mode 100644 software/apps/baremetal/axpy_f16/main.c create mode 100644 software/apps/baremetal/axpy_f32/main.c create mode 100644 software/data/data_axpy_f16.h.tpl create mode 100644 software/data/data_axpy_f32.h.tpl create mode 100644 software/kernels/baremetal/mempool_axpy_f16.h create mode 100644 software/kernels/baremetal/mempool_axpy_f32.h rename software/kernels/baremetal/{mempool_axpy_i32p.h => mempool_axpy_i32.h} (100%) diff --git a/software/apps/baremetal/Makefile b/software/apps/baremetal/Makefile index b4b2ee496..ffecfabba 100644 --- a/software/apps/baremetal/Makefile +++ b/software/apps/baremetal/Makefile @@ -22,8 +22,18 @@ ALLPYS := $(patsubst %.py,%.h,$(wildcard $(DATA_DIR)/*.py)) BINARIES := $(addprefix $(BIN_DIR)/,$(APPS)) ALL := $(APPS) -ALL_GCC := $(filter-out cfft_radix4_f16 chest_f16 cholesky_f16 cmatmul_f16 matmul_f16 matmul_f32 mimo_mmse_f32 mimo_mmse_f16 ofdm, $(ALL)) -ALL_LLVM := $(filter-out synth_i32 cfft_radix2_q16 cfft_radix4_q16 chest_q16 cholesky_q16 cholesky_q32 cmatmul_q16 mimo_mmse_q16, $(ALL)) +FP_APPS := axpy_f16 axpy_f32 +FP_APPS += cfft_radix4_f16 chest_f16 cholesky_f16 +FP_APPS += cmatmul_f16 matmul_f16 matmul_f32 +FP_APPS += dotp_f16 dotp_f32 +FP_APPS += mimo_mmse_f32 mimo_mmse_f16 ofdm + 
+I_APPS := synth_i32
+I_APPS += cfft_radix2_q16 cfft_radix4_q16 chest_q16 cholesky_q16 cholesky_q32
+I_APPS += cmatmul_q16 mimo_mmse_q16
+
+ALL_GCC := $(filter-out $(FP_APPS), $(ALL))
+ALL_LLVM := $(filter-out $(I_APPS), $(ALL))
 
 # Make all applications
 all: $(ALL_GCC)
diff --git a/software/apps/baremetal/axpy_f16/main.c b/software/apps/baremetal/axpy_f16/main.c
new file mode 100644
index 000000000..9fe49d299
--- /dev/null
+++ b/software/apps/baremetal/axpy_f16/main.c
@@ -0,0 +1,69 @@
+// Copyright 2021 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Author: Marco Bertuletti, ETH Zurich
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "dma.h"
+#include "encoding.h"
+#include "printf.h"
+#include "runtime.h"
+#include "synchronization.h"
+
+#include "data_axpy_f16.h"
+#define NUM_BANKS (NUM_CORES * BANKING_FACTOR)
+
+// L1 copies of the operands A, B and of the in/out vector C
+__fp16 l1_A[LEN] __attribute__((aligned(LEN), section(".l1_prio")));
+__fp16 l1_B[LEN] __attribute__((aligned(LEN), section(".l1_prio")));
+__fp16 l1_C[LEN] __attribute__((aligned(LEN), section(".l1_prio")));
+
+#include "baremetal/mempool_axpy_f16.h"
+#include "baremetal/mempool_checks.h"
+
+int main() {
+
+  uint32_t core_id = mempool_get_core_id();
+  uint32_t num_cores = mempool_get_core_count();
+  uint32_t time_init, time_end;
+  mempool_barrier_init(core_id);
+
+  time_init = 0;
+  time_end = 0;
+  if (core_id == 0) {
+    dma_memcpy_blocking(l1_A, l2_A, LEN * sizeof(int16_t));
+    dma_memcpy_blocking(l1_B, l2_B, LEN * sizeof(int16_t));
+    dma_memcpy_blocking(l1_C, l2_C, LEN * sizeof(int16_t));
+  }
+  mempool_barrier(num_cores);
+
+  // // SINGLE
+  // time_init = mempool_get_timer();
+  // axpy_f16s(l1_A, l1_B, l1_C, LEN);
+  // time_end = mempool_get_timer();
+
+  // // PARALLEL
+  // time_init = mempool_get_timer();
+  // axpy_f16vecp_unrolled4(l1_A, l1_B, l1_C, LEN, num_cores);
+  // time_end = mempool_get_timer();
+
+  // PARALLEL, LOCAL ACCESSES
+  time_init = mempool_get_timer();
+  axpy_f16vecp_local_unrolled4(l1_A, l1_B, l1_C, LEN);
+  time_end = mempool_get_timer();
+
+  mempool_barrier(num_cores);
+  // Report the cycle count (core 0), then check l1_C against l2_out
+  if (core_id == 0) {
+    uint32_t clock_cycles = (time_end - time_init);
+    printf("\nKernel execution takes %d clock cycles\n", clock_cycles);
+  }
+  mempool_check_f16(l1_C, l2_out, 100, 0.1f, 0);
+  mempool_barrier(num_cores);
+
+  return 0;
+}
diff --git a/software/apps/baremetal/axpy_f32/main.c b/software/apps/baremetal/axpy_f32/main.c
new file mode 100644
index 000000000..262342fb2
--- /dev/null
+++ b/software/apps/baremetal/axpy_f32/main.c
@@ -0,0 +1,62 @@
+// Copyright 2021 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Author: Marco Bertuletti, ETH Zurich
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "dma.h"
+#include "encoding.h"
+#include "printf.h"
+#include "runtime.h"
+#include "synchronization.h"
+
+#include "data_axpy_f32.h"
+#define NUM_BANKS (NUM_CORES * BANKING_FACTOR)
+#define SINGLE_CORE_REDUCTION
+// #define BINARY_REDUCTION
+
+// L1 copies of the operands A, B and of the in/out vector C
+float l1_A[LEN] __attribute__((aligned(LEN), section(".l1_prio")));
+float l1_B[LEN] __attribute__((aligned(LEN), section(".l1_prio")));
+float l1_C[LEN] __attribute__((aligned(LEN), section(".l1_prio")));
+
+#include "baremetal/mempool_axpy_f32.h"
+#include "baremetal/mempool_checks.h"
+
+int main() {
+
+  uint32_t core_id = mempool_get_core_id();
+  uint32_t num_cores = mempool_get_core_count();
+  uint32_t time_init, time_end;
+  mempool_barrier_init(core_id);
+
+  time_init = 0;
+  time_end = 0;
+  if (core_id == 0) {
+    dma_memcpy_blocking(l1_A, l2_A, LEN * sizeof(int32_t));
+    dma_memcpy_blocking(l1_B, l2_B, LEN * sizeof(int32_t));
+    dma_memcpy_blocking(l1_C, l2_C, LEN * sizeof(int32_t));
+  }
+  mempool_barrier(num_cores);
+
+  // PARALLEL
+  time_init = mempool_get_timer();
+  // axpy_f32p(l1_A, l1_B, l1_C, LEN, num_cores);
+  // axpy_f32p_unrolled4(l1_A, l1_B, l1_C, LEN, num_cores);
+  axpy_f32p_local_unrolled4(l1_A, l1_B, l1_C, LEN);
+  time_end = mempool_get_timer();
+
+  // Report the cycle count (core 0), then check l1_C against l2_out
+  if (core_id == 0) {
+    uint32_t clock_cycles = (time_end - time_init);
+    printf("\nKernel execution takes %d clock cycles\n", clock_cycles);
+  }
+  mempool_check_f32(l1_C, l2_out, 100, 0.1f, 0);
+  mempool_barrier(num_cores);
+
+  return 0;
+}
diff --git a/software/apps/baremetal/axpy_i32/main.c b/software/apps/baremetal/axpy_i32/main.c
index a9354796e..b63e31499 100644
--- a/software/apps/baremetal/axpy_i32/main.c
+++ b/software/apps/baremetal/axpy_i32/main.c
@@ -7,7 +7,7 @@
 #include <stdint.h>
 #include <string.h>
 
-#include "baremetal/mempool_axpy_i32p.h"
+#include "baremetal/mempool_axpy_i32.h"
 #include "encoding.h"
 #include "printf.h"
 #include "runtime.h"
diff --git a/software/data/data_axpy_f16.h.tpl b/software/data/data_axpy_f16.h.tpl
new file mode 100644
index 000000000..09ea72cbf
--- /dev/null
+++ b/software/data/data_axpy_f16.h.tpl
@@ -0,0 +1,26 @@
+// Copyright 2022 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+\
+<% def array_to_cstr(array):
+    out = '{'
+    i = 0
+    out += '\n'
+    for a in array:
+        out += '(__fp16){:.4f}, '.format(a)
+        i += 1
+        if i % 8 == 0:
+            out += '\n'
+    out = out[:-2] + '}'
+    return out
+%> \
+
+#define LEN (${Len})
+
+__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_A[${Len}] = ${array_to_cstr(A)};
+
+__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_B[${Len}] = ${array_to_cstr(B)};
+
+__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_C[${Len}] = ${array_to_cstr(C)};
+
+__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_out[${Len}] = ${array_to_cstr(out)};
diff --git a/software/data/data_axpy_f32.h.tpl b/software/data/data_axpy_f32.h.tpl
new file mode 100644
index 000000000..2efe34b45
--- /dev/null
+++ b/software/data/data_axpy_f32.h.tpl
@@ -0,0 +1,26 @@
+// Copyright 2022 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+\
+<% def array_to_cstr(array):
+    out = '{'
+    i = 0
+    out += '\n'
+    for a in array:
+        out += '{}f, '.format(a)
+        i += 1
+        if i % 8 == 0:
+            out += '\n'
+    out = out[:-2] + '}'
+    return out
+%> \
+
+#define LEN (${Len})
+
+float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_A[${Len}] = ${array_to_cstr(A)};
+
+float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_B[${Len}] = ${array_to_cstr(B)};
+
+float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_C[${Len}] = ${array_to_cstr(C)};
+
+float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_out[${Len}] = ${array_to_cstr(out)};
diff --git a/software/data/generate_dotp.py b/software/data/generate_dotp.py
index 66fc95bcc..64170f573 100644
--- a/software/data/generate_dotp.py
+++ b/software/data/generate_dotp.py
@@ -40,6 +40,26 @@ def generate_dotp_f16(Len):
     C = (np.dot(A, B)).astype(np.float16)
     return A, B, C
 
+
+def generate_axpy_f32(Len):
+
+    # Random float32 input vectors and the expected output out = C + A * B
+    A = np.random.rand(Len).astype(np.float32)
+    B = np.random.rand(Len).astype(np.float32)
+    C = np.random.rand(Len).astype(np.float32)
+    out = C + A * B
+    return A, B, C, out
+
+
+def generate_axpy_f16(Len):
+
+    # Random float16 input vectors and the expected output out = C + A * B
+    A = np.random.rand(Len).astype(np.float16)
+    B = np.random.rand(Len).astype(np.float16)
+    C = np.random.rand(Len).astype(np.float16)
+    out = C + A * B
+    return A, B, C, out
+
 ##################
 # compute_result #
 ##################
@@ -73,7 +93,7 @@ def main():
         "--length",
         type=int,
         required=False,
-        default=4096,
+        default=1024,
         help='First dimension.'
) @@ -110,6 +130,28 @@ def main(): 'Len': Len} gen_data_header_file(args.outdir, tpl, **kwargs) + A, B, C, out = generate_axpy_f32(Len) + tpl = pathlib.Path(__file__).parent.absolute() / "data_axpy_f32.h.tpl" + kwargs = { + 'name': 'data_axpy_f32', + 'A': A, + 'B': B, + 'C': C, + 'out': out, + 'Len': Len} + gen_data_header_file(args.outdir, tpl, **kwargs) + + A, B, C, out = generate_axpy_f16(Len) + tpl = pathlib.Path(__file__).parent.absolute() / "data_axpy_f16.h.tpl" + kwargs = { + 'name': 'data_axpy_f16', + 'A': A, + 'B': B, + 'C': C, + 'out': out, + 'Len': Len} + gen_data_header_file(args.outdir, tpl, **kwargs) + if __name__ == "__main__": main() diff --git a/software/kernels/baremetal/mempool_axpy_f16.h b/software/kernels/baremetal/mempool_axpy_f16.h new file mode 100644 index 000000000..e54331d2d --- /dev/null +++ b/software/kernels/baremetal/mempool_axpy_f16.h @@ -0,0 +1,124 @@ +// Copyright 2021 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0
+
+// Author: Marco Bertuletti, ETH Zurich
+
+#pragma once
+#include "builtins_v2.h"
+
+/* Loop body: in_c[i..i+3] += in_a[i..i+3] * in_b[i..i+3] as two vfmac.h */
+/* Needs in_a, in_b, in_c, i and v2h a01, a23, b01, b23, c01, c23 in scope */
+#define AXPYF16VEC_UNROLLED4_LOOP \
+  { \
+    a01 = (*(v2h *)&in_a[i]); \
+    a23 = (*(v2h *)&in_a[i + 2]); \
+    b01 = (*(v2h *)&in_b[i]); \
+    b23 = (*(v2h *)&in_b[i + 2]); \
+    c01 = (*(v2h *)&in_c[i]); \
+    c23 = (*(v2h *)&in_c[i + 2]); \
+    asm volatile( \
+        "vfmac.h %[c01], %[a01], %[b01];" \
+        "vfmac.h %[c23], %[a23], %[b23];" \
+        : [c01] "+&r"(c01), [c23] "+&r"(c23) \
+        : [a01] "r"(a01), [a23] "r"(a23), [b01] "r"(b01), [b23] "r"(b23)); \
+    (*(v2h *)&in_c[i]) = c01; \
+    (*(v2h *)&in_c[i + 2]) = c23; \
+  }
+
+/* Single-core AXPY (core 0 only): in_c[i] += in_a[i] * in_b[i], i < Len */
+void axpy_f16s(__fp16 *in_a, __fp16 *in_b, __fp16 *in_c, uint32_t Len) {
+
+  uint32_t core_id = mempool_get_core_id();
+  if (core_id == 0) {
+    mempool_start_benchmark();
+    // Kernel execution: one scalar fmadd.h per element, Len elements in total
+    __fp16 *end = in_a + Len;
+    for (; in_a < end; in_a++, in_b++, in_c++) {
+      asm volatile("fmadd.h %0, %1, %2, %0;"
+                   : "+&r"(*in_c)
+                   : "r"(*in_a), "r"(*in_b));
+    }
+    mempool_stop_benchmark();
+  }
+
+  return;
+}
+
+/* Single-core AXPY, 4 elements per iteration; Len must be a multiple of 4 */
+void axpy_f16s_unrolled4(__fp16 *in_a, __fp16 *in_b, __fp16 *in_c,
+                         uint32_t Len) {
+
+  uint32_t core_id = mempool_get_core_id();
+  if (core_id == 0) {
+    mempool_start_benchmark();
+    uint32_t i = 0;
+    v2h a01, a23;
+    v2h b01, b23;
+    v2h c01, c23;
+    for (i = 0; i < Len; i += 4) {
+      AXPYF16VEC_UNROLLED4_LOOP;
+    }
+    mempool_stop_benchmark();
+  }
+
+  return;
+}
+
+/* Parallel AXPY: each core updates a contiguous chunk of Len / nPE elements */
+void axpy_f16p(__fp16 *in_a, __fp16 *in_b, __fp16 *in_c, uint32_t Len,
+               uint32_t nPE) {
+
+  uint32_t num_cores = mempool_get_core_count();
+  uint32_t core_id = mempool_get_core_id();
+  uint32_t step = Len / nPE;
+  __fp16 a, b, c;
+  for (uint32_t i = core_id * step; i < core_id * step + step; i++) {
+    a = in_a[i];
+    b = in_b[i];
+    c = in_c[i];
+    asm volatile("fmadd.h %0, %1, %2, %0;" : "+&r"(c) : "r"(a), "r"(b));
+    in_c[i] = c;
+  }
+  mempool_barrier(num_cores);
+
+  return;
+}
+
+/* Parallel AXPY on contiguous chunks, 4 elements per iteration (no tail) */
+void axpy_f16vecp_unrolled4(__fp16 *in_a, __fp16 *in_b, __fp16 *in_c,
+                            uint32_t Len, uint32_t nPE) {
+
+  uint32_t num_cores = mempool_get_core_count();
+  uint32_t core_id = mempool_get_core_id();
+  uint32_t step = Len / nPE;
+  uint32_t i;
+  v2h a01, a23;
+  v2h b01, b23;
+  v2h c01, c23;
+  for (i = core_id * step; i < core_id * step + step; i += 4) {
+    AXPYF16VEC_UNROLLED4_LOOP;
+  }
+  mempool_barrier(num_cores);
+
+  return;
+}
+
+/* Parallel AXPY: core k starts at k * BANKING_FACTOR and strides NUM_BANKS */
+/* Loads and stores are meant to stay in the local memory of each core */
+/* NOTE(review): i counts 16-bit elements - confirm locality for f16 data */
+void axpy_f16vecp_local_unrolled4(__fp16 *in_a, __fp16 *in_b, __fp16 *in_c,
+                                  uint32_t Len) {
+
+  uint32_t num_cores = mempool_get_core_count();
+  uint32_t core_id = mempool_get_core_id();
+  v2h a01, a23;
+  v2h b01, b23;
+  v2h c01, c23;
+  for (uint32_t i = core_id * BANKING_FACTOR; i < Len; i += NUM_BANKS) {
+    AXPYF16VEC_UNROLLED4_LOOP;
+  }
+  mempool_barrier(num_cores);
+
+  return;
+}
diff --git a/software/kernels/baremetal/mempool_axpy_f32.h b/software/kernels/baremetal/mempool_axpy_f32.h
new file mode 100644
index 000000000..ff069524c
--- /dev/null
+++ b/software/kernels/baremetal/mempool_axpy_f32.h
@@ -0,0 +1,165 @@
+// Copyright 2021 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Author: Marco Bertuletti, ETH Zurich
+
+/* Loop body: in_c[i..i+3] += in_a[i..i+3] * in_b[i..i+3] as four fmadd.s */
+#define AXPYF32_UNROLLED4_LOOP \
+  { \
+    a0 = in_a[i]; \
+    b0 = in_b[i]; \
+    c0 = in_c[i]; \
+    a1 = in_a[i + 1]; \
+    b1 = in_b[i + 1]; \
+    c1 = in_c[i + 1]; \
+    a2 = in_a[i + 2]; \
+    b2 = in_b[i + 2]; \
+    c2 = in_c[i + 2]; \
+    a3 = in_a[i + 3]; \
+    b3 = in_b[i + 3]; \
+    c3 = in_c[i + 3]; \
+    asm volatile( \
+        "fmadd.s %[c0], %[a0], %[b0], %[c0];" \
+        "fmadd.s %[c1], %[a1], %[b1], %[c1];" \
+        "fmadd.s %[c2], %[a2], %[b2], %[c2];" \
+        "fmadd.s %[c3], %[a3], %[b3], %[c3];" \
+        : [c0] "+&r"(c0), [c1] "+&r"(c1), [c2] "+&r"(c2), [c3] "+&r"(c3) \
+        : [a0] "r"(a0), [a1] "r"(a1), [a2] "r"(a2), [a3] "r"(a3), \
+          [b0] "r"(b0), [b1] "r"(b1), [b2] "r"(b2), [b3] "r"(b3)); \
+    in_c[i] = c0; \
+    in_c[i + 1] = c1; \
+    in_c[i + 2] = c2; \
+    in_c[i + 3] = c3; \
+  }
+
+/* Single-core AXPY (core 0 only): in_c[i] += in_a[i] * in_b[i], i < Len */
+void axpy_f32s(float *in_a, float *in_b, float *in_c, uint32_t Len) {
+
+  uint32_t core_id = mempool_get_core_id();
+  if (core_id == 0) {
+    mempool_start_benchmark();
+    // Kernel execution
+    float *end = in_a + Len;
+    for (; in_a < end; in_a++, in_b++, in_c++) {
+      asm volatile("fmadd.s %0, %1, %2, %0;"
+                   : "+&r"(*in_c)
+                   : "r"(*in_a), "r"(*in_b));
+    }
+    mempool_stop_benchmark();
+  }
+  return;
+}
+
+/* Single-core AXPY (core 0 only), unrolled by 4 with a scalar tail loop */
+void axpy_f32s_unrolled4(float *in_a, float *in_b, float *in_c, uint32_t Len) {
+
+  uint32_t core_id = mempool_get_core_id();
+  if (core_id == 0) {
+    mempool_start_benchmark();
+    uint32_t reminder = Len % 4;
+    uint32_t i = 0;
+
+    register float a0 = 0.0f, a1 = 0.0f, a2 = 0.0f, a3 = 0.0f;
+    register float b2 = 0.0f, b1 = 0.0f, b0 = 0.0f, b3 = 0.0f;
+    register float c2 = 0.0f, c1 = 0.0f, c0 = 0.0f, c3 = 0.0f;
+
+    for (i = 0; i < (Len - reminder); i += 4) {
+      AXPYF32_UNROLLED4_LOOP;
+    }
+    while (i < Len) {
+      a0 = in_a[i];
+      b0 = in_b[i];
+      c0 = in_c[i];
+      asm volatile("fmadd.s %0, %1, %2, %0;" : "+&r"(c0) : "r"(a0), "r"(b0));
+      in_c[i] = c0;
+      i++;
+    }
+    mempool_stop_benchmark();
+  }
+  return;
+}
+
+/* Parallel AXPY: each core updates a contiguous chunk of Len / nPE elements */
+void axpy_f32p(float *in_a, float *in_b, float *in_c, uint32_t Len,
+               uint32_t nPE) {
+
+  uint32_t num_cores = mempool_get_core_count();
+  uint32_t core_id = mempool_get_core_id();
+  uint32_t step = Len / nPE;
+
+  register float a, b, c;
+  for (uint32_t i = core_id * step; i < core_id * step + step; i++) {
+    a = in_a[i];
+    b = in_b[i];
+    c = in_c[i];
+    asm volatile("fmadd.s %0, %1, %2, %0;" : "+&r"(c) : "r"(a), "r"(b));
+    in_c[i] = c;
+  }
+  mempool_barrier(num_cores);
+
+  return;
+}
+
+/* Parallel AXPY on contiguous chunks, unrolled by 4 with a scalar tail */
+void axpy_f32p_unrolled4(float *in_a, float *in_b, float *in_c, uint32_t Len,
+                         uint32_t nPE) {
+
+  uint32_t num_cores = mempool_get_core_count();
+  uint32_t core_id = mempool_get_core_id();
+  uint32_t step = Len / nPE;
+  uint32_t reminder = step % 4;
+  uint32_t i;
+
+  register float a0 = 0.0f, a1 = 0.0f, a2 = 0.0f, a3 = 0.0f;
+  register float b2 = 0.0f, b1 = 0.0f, b0 = 0.0f, b3 = 0.0f;
+  register float c2 = 0.0f, c1 = 0.0f, c0 = 0.0f, c3 = 0.0f;
+
+  for (i = core_id * step; i < (core_id * step + step) - reminder; i += 4) {
+    AXPYF32_UNROLLED4_LOOP;
+  }
+  i = core_id * step + step - reminder;
+  while (i < core_id * step + step) {
+    a0 = in_a[i];
+    b0 = in_b[i];
+    c0 = in_c[i];
+    asm volatile("fmadd.s %0, %1, %2, %0;" : "+&r"(c0) : "r"(a0), "r"(b0));
+    in_c[i] = c0;
+    i++;
+  }
+  mempool_barrier(num_cores);
+
+  return;
+}
+
+/* Parallel AXPY: core k starts at k * BANKING_FACTOR and strides NUM_BANKS */
+/* Loads and stores are meant to stay in the local memory of each core */
+void axpy_f32p_local_unrolled4(float *in_a, float *in_b, float *in_c,
+                               uint32_t Len) {
+
+  uint32_t num_cores = mempool_get_core_count();
+  uint32_t core_id = mempool_get_core_id();
+  uint32_t const remainder = Len % BANKING_FACTOR;
+  uint32_t const idx_stop = Len - remainder;
+
+  register float a0 = 0.0f, a1 = 0.0f, a2 = 0.0f, a3 = 0.0f;
+  register float b2 = 0.0f, b1 = 0.0f, b0 = 0.0f, b3 = 0.0f;
+  register float c2 = 0.0f, c1 = 0.0f, c0 = 0.0f, c3 = 0.0f;
+
+  for (uint32_t i = core_id * BANKING_FACTOR; i < idx_stop; i += NUM_BANKS) {
+    AXPYF32_UNROLLED4_LOOP;
+  }
+  // Tail: a single core updates the last Len % BANKING_FACTOR elements
+  if (core_id == ((Len % NUM_BANKS) / 4)) {
+    for (uint32_t i = Len - remainder; i < Len; i++) {
+      a0 = in_a[i];
+      b0 = in_b[i];
+      c0 = in_c[i];
+      asm volatile("fmadd.s %0, %1, %2, %0;" : "+&r"(c0) : "r"(a0), "r"(b0));
+      in_c[i] = c0;
+    }
+  }
+  mempool_barrier(num_cores);
+
+  return;
+}
diff --git a/software/kernels/baremetal/mempool_axpy_i32p.h b/software/kernels/baremetal/mempool_axpy_i32.h
similarity index 100%
rename from software/kernels/baremetal/mempool_axpy_i32p.h
rename to software/kernels/baremetal/mempool_axpy_i32.h