From 11366d5fe9baed757730fa727c1aa696a229f075 Mon Sep 17 00:00:00 2001
From: Luca Colagrande <luca.colagrande3@gmail.com>
Date: Wed, 8 Nov 2023 10:43:40 +0100
Subject: [PATCH] sw/math: Extend and refactor to proper library (#60)

* sw/math: Refactor to proper library

The previous header-only library style led to conflicts on certain
defines (for instance `N`) defined in both math library sources and
application sources.

* sw/math: Add `exp`, `sqrt`, `log` and `ceil` functions

* sw/math: Add safe FP <--> INT conversions

* ci: Properly initialize Bender dependencies

* sw/math: Implement safe `tanh` function
---
 .github/workflows/ci.yml                      |   2 +
 .gitlab-ci.yml                                |   2 +-
 Bender.yml                                    |  29 ++-
 Makefile                                      |   3 +-
 hw/mem_interface/util/compile.sh              |   2 +-
 hw/reqrsp_interface/util/compile.sh           |   2 +-
 hw/snitch_cluster/util/compile.sh             |   2 +-
 hw/snitch_icache/util/compile.sh              |   2 +-
 hw/snitch_ssr/util/compile.sh                 |   2 +-
 hw/tcdm_interface/util/compile.sh             |   2 +-
 ...2-sw-math-Refactor-to-proper-library.patch | 125 +++++++++++
 ...-sw-math-Add-safe-FP-INT-conversions.patch |  81 +++++++
 ...sw-math-Implement-safe-tanh-function.patch | 149 +++++++++++++
 sw/math/Makefile                              |  77 ++++++-
 sw/math/include/float.h                       |  52 +++++
 sw/math/include/math.h                        |   3 -
 sw/math/src/internal/libm.h                   |  82 ++++++-
 sw/math/src/math/__math_divzero.c             |   6 +
 sw/math/src/math/__math_invalid.c             |   6 +
 sw/math/src/math/__math_invalidf.c            |   6 +
 sw/math/src/math/__math_invalidl.c            |   9 +
 sw/math/src/math/__math_oflow.c               |   6 +
 sw/math/src/math/__math_oflowf.c              |   6 +
 sw/math/src/math/__math_uflow.c               |   6 +
 sw/math/src/math/__math_uflowf.c              |   6 +
 sw/math/src/math/__math_xflow.c               |   6 +
 sw/math/src/math/__math_xflowf.c              |   6 +
 sw/math/src/math/ceil.c                       |  31 +++
 sw/math/src/math/ceilf.c                      |  27 +++
 sw/math/src/math/ceill.c                      |  34 +++
 sw/math/src/math/exp2f_data.c                 |  35 +++
 sw/math/src/math/exp2f_data.h                 |  23 ++
 sw/math/src/math/expf.c                       |  80 +++++++
 sw/math/src/math/expm1.c                      |  34 ++-
 sw/math/src/math/log2.c                       | 122 +++++++++++
 sw/math/src/math/log2_data.c                  | 201 ++++++++++++++++++
 sw/math/src/math/log2_data.h                  |  28 +++
 sw/math/src/math/log2f.c                      |  72 +++++++
 sw/math/src/math/log2f_data.c                 |  33 +++
 sw/math/src/math/log2f_data.h                 |  19 ++
 sw/math/src/math/sqrt.c                       | 158 ++++++++++++++
 sw/math/src/math/sqrt_data.c                  |  19 ++
 sw/math/src/math/sqrt_data.h                  |  13 ++
 sw/math/src/math/sqrtf.c                      |  83 ++++++++
 sw/math/src/math/tanh.c                       |  17 +-
 target/snitch_cluster/sw/Makefile             |  10 +-
 target/snitch_cluster/sw/apps/common.mk       |  10 +-
 target/snitch_cluster/sw/math/Makefile        |   8 +
 target/snitch_cluster/sw/toolchain.mk         |   1 +
 49 files changed, 1692 insertions(+), 46 deletions(-)
 create mode 100644 sw/deps/patches/musl/0002-sw-math-Refactor-to-proper-library.patch
 create mode 100644 sw/deps/patches/musl/0003-sw-math-Add-safe-FP-INT-conversions.patch
 create mode 100644 sw/deps/patches/musl/0004-sw-math-Implement-safe-tanh-function.patch
 create mode 100644 sw/math/include/float.h
 create mode 100644 sw/math/src/math/__math_divzero.c
 create mode 100644 sw/math/src/math/__math_invalid.c
 create mode 100644 sw/math/src/math/__math_invalidf.c
 create mode 100644 sw/math/src/math/__math_invalidl.c
 create mode 100644 sw/math/src/math/__math_oflow.c
 create mode 100644 sw/math/src/math/__math_oflowf.c
 create mode 100644 sw/math/src/math/__math_uflow.c
 create mode 100644 sw/math/src/math/__math_uflowf.c
 create mode 100644 sw/math/src/math/__math_xflow.c
 create mode 100644 sw/math/src/math/__math_xflowf.c
 create mode 100644 sw/math/src/math/ceil.c
 create mode 100644 sw/math/src/math/ceilf.c
 create mode 100644 sw/math/src/math/ceill.c
 create mode 100644 sw/math/src/math/exp2f_data.c
 create mode 100644 sw/math/src/math/exp2f_data.h
 create mode 100644 sw/math/src/math/expf.c
 create mode 100644 sw/math/src/math/log2.c
 create mode 100644 sw/math/src/math/log2_data.c
 create mode 100644 sw/math/src/math/log2_data.h
 create mode 100644 sw/math/src/math/log2f.c
 create mode 100644 sw/math/src/math/log2f_data.c
 create mode 100644 sw/math/src/math/log2f_data.h
 create mode 100644 sw/math/src/math/sqrt.c
 create mode 100644 sw/math/src/math/sqrt_data.c
 create mode 100644 sw/math/src/math/sqrt_data.h
 create mode 100644 sw/math/src/math/sqrtf.c
 create mode 100644 target/snitch_cluster/sw/math/Makefile

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index f2c3e692a..8df08c866 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -36,6 +36,7 @@ jobs:
           submodules: 'recursive'
       - name: Build Software
         run: |
+          bender vendor init
           make -C target/snitch_cluster sw
       - name: Build Hardware
         run: |
@@ -61,6 +62,7 @@ jobs:
           submodules: 'recursive'
       - name: Build Software
         run: |
+          bender vendor init
           make -C target/snitch_cluster SELECT_RUNTIME=banshee sw
       - name: Run Tests
         env:
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 610c271ea..624c2a003 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -13,7 +13,6 @@ variables:
   VERILATOR: verilator-4.110
   QUESTA: questa-2022.3
   LLVM_BINROOT: /usr/pack/riscv-1.0-kgf/pulp-llvm-0.12.0/bin
-  CLANG: /usr/pack/riscv-1.0-kgf/pulp-llvm-0.12.0/bin/clang
   CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_LINKER: /usr/pack/gcc-9.2.0-af/linux-x64/bin/gcc
   LLVM_SYS_120_PREFIX: /usr/pack/llvm-12.0.1-af
   CMAKE: cmake-3.18.1
@@ -22,6 +21,7 @@ before_script:
   - $PYTHON -m venv .venv
   - source .venv/bin/activate
   - pip install -r python-requirements.txt
+  - $BENDER vendor init
 
 ##############
 # Build docs #
diff --git a/Bender.yml b/Bender.yml
index 732788d0d..4ae9d28e9 100644
--- a/Bender.yml
+++ b/Bender.yml
@@ -37,13 +37,40 @@ vendor_package:
       - "Makefile"
       - ".gitignore"
       - "README"
-      - "src/math/tanh.c"
+      - "src/math/ceil.c"
+      - "src/math/ceilf.c"
+      - "src/math/ceill.c"
       - "src/math/expm1.c"
+      - "src/math/expf.c"
+      - "src/math/exp2f_data.c"
+      - "src/math/exp2f_data.h"
+      - "src/math/log2.c"
+      - "src/math/log2_data.c"
+      - "src/math/log2_data.h"
+      - "src/math/log2f.c"
+      - "src/math/log2f_data.c"
+      - "src/math/log2f_data.h"
+      - "src/math/__math_divzero.c"
+      - "src/math/__math_invalid.c"
+      - "src/math/__math_invalidf.c"
+      - "src/math/__math_invalidl.c"
+      - "src/math/__math_oflow.c"
+      - "src/math/__math_oflowf.c"
+      - "src/math/__math_uflow.c"
+      - "src/math/__math_uflowf.c"
+      - "src/math/__math_xflow.c"
+      - "src/math/__math_xflowf.c"
+      - "src/math/sqrt.c"
+      - "src/math/sqrtf.c"
+      - "src/math/sqrt_data.c"
+      - "src/math/sqrt_data.h"
+      - "src/math/tanh.c"
       - "src/internal/libm.h"
       - "src/include/features.h"
       - "include/endian.h"
       - "include/math.h"
       - "include/features.h"
+      - "include/float.h"
       - "include/alltypes.h.in"
       - "arch/riscv64/bits/alltypes.h.in"
       - "arch/riscv64/bits/float.h"
diff --git a/Makefile b/Makefile
index 06a1c662f..7aba09569 100644
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,8 @@
 # Licensed under the Apache License, Version 2.0, see LICENSE for details.
 # SPDX-License-Identifier: Apache-2.0
 
-REGGEN = $(shell bender path register_interface)/vendor/lowrisc_opentitan/util/regtool.py
+BENDER ?= bender
+REGGEN  = $(shell $(BENDER) path register_interface)/vendor/lowrisc_opentitan/util/regtool.py
 
 GENERATED_DOCS_DIR = docs/generated
 GENERATED_DOC_SRCS = $(GENERATED_DOCS_DIR)/peripherals.md
diff --git a/hw/mem_interface/util/compile.sh b/hw/mem_interface/util/compile.sh
index 73ccc7fca..51814285a 100755
--- a/hw/mem_interface/util/compile.sh
+++ b/hw/mem_interface/util/compile.sh
@@ -10,7 +10,7 @@ set -e
 
 [ ! -z "$VSIM" ] || VSIM=vsim
 
-bender script vsim -t test \
+$BENDER script vsim -t test \
     --vlog-arg="-svinputport=compat" \
     --vlog-arg="-override_timescale 1ns/1ps" \
     --vlog-arg="-suppress 2583" \
diff --git a/hw/reqrsp_interface/util/compile.sh b/hw/reqrsp_interface/util/compile.sh
index 73ccc7fca..84956acea 100755
--- a/hw/reqrsp_interface/util/compile.sh
+++ b/hw/reqrsp_interface/util/compile.sh
@@ -10,7 +10,7 @@ set -e
 
 [ ! -z "$VSIM" ] || VSIM=vsim
 
-bender script vsim -t test \
+$(BENDER) script vsim -t test \
     --vlog-arg="-svinputport=compat" \
     --vlog-arg="-override_timescale 1ns/1ps" \
     --vlog-arg="-suppress 2583" \
diff --git a/hw/snitch_cluster/util/compile.sh b/hw/snitch_cluster/util/compile.sh
index 73ccc7fca..51814285a 100755
--- a/hw/snitch_cluster/util/compile.sh
+++ b/hw/snitch_cluster/util/compile.sh
@@ -10,7 +10,7 @@ set -e
 
 [ ! -z "$VSIM" ] || VSIM=vsim
 
-bender script vsim -t test \
+$BENDER script vsim -t test \
     --vlog-arg="-svinputport=compat" \
     --vlog-arg="-override_timescale 1ns/1ps" \
     --vlog-arg="-suppress 2583" \
diff --git a/hw/snitch_icache/util/compile.sh b/hw/snitch_icache/util/compile.sh
index 73ccc7fca..51814285a 100755
--- a/hw/snitch_icache/util/compile.sh
+++ b/hw/snitch_icache/util/compile.sh
@@ -10,7 +10,7 @@ set -e
 
 [ ! -z "$VSIM" ] || VSIM=vsim
 
-bender script vsim -t test \
+$BENDER script vsim -t test \
     --vlog-arg="-svinputport=compat" \
     --vlog-arg="-override_timescale 1ns/1ps" \
     --vlog-arg="-suppress 2583" \
diff --git a/hw/snitch_ssr/util/compile.sh b/hw/snitch_ssr/util/compile.sh
index 73ccc7fca..84956acea 100755
--- a/hw/snitch_ssr/util/compile.sh
+++ b/hw/snitch_ssr/util/compile.sh
@@ -10,7 +10,7 @@ set -e
 
 [ ! -z "$VSIM" ] || VSIM=vsim
 
-bender script vsim -t test \
+$(BENDER) script vsim -t test \
     --vlog-arg="-svinputport=compat" \
     --vlog-arg="-override_timescale 1ns/1ps" \
     --vlog-arg="-suppress 2583" \
diff --git a/hw/tcdm_interface/util/compile.sh b/hw/tcdm_interface/util/compile.sh
index 73ccc7fca..51814285a 100755
--- a/hw/tcdm_interface/util/compile.sh
+++ b/hw/tcdm_interface/util/compile.sh
@@ -10,7 +10,7 @@ set -e
 
 [ ! -z "$VSIM" ] || VSIM=vsim
 
-bender script vsim -t test \
+$BENDER script vsim -t test \
     --vlog-arg="-svinputport=compat" \
     --vlog-arg="-override_timescale 1ns/1ps" \
     --vlog-arg="-suppress 2583" \
diff --git a/sw/deps/patches/musl/0002-sw-math-Refactor-to-proper-library.patch b/sw/deps/patches/musl/0002-sw-math-Refactor-to-proper-library.patch
new file mode 100644
index 000000000..068851f3c
--- /dev/null
+++ b/sw/deps/patches/musl/0002-sw-math-Refactor-to-proper-library.patch
@@ -0,0 +1,125 @@
+From 91c1b48e44629a80bdc1832111707c051ab0b3b2 Mon Sep 17 00:00:00 2001
+From: Luca Colagrande <luca.colagrande3@gmail.com>
+Date: Mon, 23 Oct 2023 14:30:18 +0200
+Subject: [PATCH] sw/math: Refactor to proper library
+
+The previous header-only library style led to conflicts on certain
+defines (for instance `N`) defined in both math library sources and
+application sources.
+---
+ Makefile       | 77 +++++++++++++++++++++++++++++++++++++++++++++++---
+ include/math.h |  3 --
+ 2 files changed, 73 insertions(+), 7 deletions(-)
+
+diff --git a/Makefile b/Makefile
+index 1327953..a6f7a1a 100644
+--- a/Makefile
++++ b/Makefile
+@@ -1,17 +1,86 @@
+-BITS_DIR = include/bits
++# Copyright 2023 ETH Zurich and University of Bologna.
++# Licensed under the Apache License, Version 2.0, see LICENSE for details.
++# SPDX-License-Identifier: Apache-2.0
++#
++# Luca Colagrande <colluca@iis.ee.ethz.ch>
++# Viviane Potocnik, ETH Zurich <vivianep@iis.ee.ethz.ch>
++
++# Usage of absolute paths is required to externally include
++# this Makefile from multiple different locations
++MK_DIR := $(dir $(realpath $(lastword $(MAKEFILE_LIST))))
++
++###############
++# Directories #
++###############
++
++BUILDDIR ?= $(abspath build)
++SRC_DIR   = $(MK_DIR)/src/math
++BITS_DIR  = $(MK_DIR)/include/bits
++
++###################
++# Build variables #
++###################
++
++INCDIRS += $(MK_DIR)/arch/riscv64/
++INCDIRS += $(MK_DIR)/arch/generic
++INCDIRS += $(MK_DIR)/src/include
++INCDIRS += $(MK_DIR)/src/internal
++INCDIRS += $(MK_DIR)/include/bits
++INCDIRS += $(MK_DIR)/include
++
++SRCS = $(abspath $(wildcard $(SRC_DIR)/*.c))
++
++###########
++# Outputs #
++###########
++
+ ALLTYPES_H = $(BITS_DIR)/alltypes.h
+ 
++OBJS        = $(addprefix $(BUILDDIR)/,$(addsuffix .o,$(basename $(notdir $(SRCS)))))
++DEPS        = $(addprefix $(BUILDDIR)/,$(addsuffix .d,$(basename $(notdir $(SRCS)))))
++LIB         = $(BUILDDIR)/libmath.a
++DUMP        = $(BUILDDIR)/libmath.dump
++ALL_OUTPUTS = $(LIB) $(DUMP)
+ 
+-.PHONY: all clean
++#########
++# Rules #
++#########
+ 
+-all: $(ALLTYPES_H)
++.PHONY: all
++all: $(ALL_OUTPUTS)
+ 
++.PHONY: clean
+ clean:
+ 	rm -rf $(BITS_DIR)
+ 	rm -f $(ALLTYPES_H)
++	rm -rf $(BUILDDIR)
+ 
+ $(BITS_DIR):
+ 	mkdir -p $@
+ 
+ $(ALLTYPES_H): | $(BITS_DIR)
+-	sed -f tools/mkalltypes.sed arch/riscv64/bits/alltypes.h.in include/alltypes.h.in > $@
++	sed -f $(MK_DIR)/tools/mkalltypes.sed $(MK_DIR)/arch/riscv64/bits/alltypes.h.in $(MK_DIR)/include/alltypes.h.in > $@
++
++$(DEPS): $(ALLTYPES_H)
++
++$(BUILDDIR):
++	mkdir -p $@
++
++$(BUILDDIR)/%.o: $(SRC_DIR)/%.S | $(BUILDDIR)
++	$(RISCV_CC) $(RISCV_CFLAGS) -c $< -o $@
++
++$(BUILDDIR)/%.o: $(SRC_DIR)/%.c | $(BUILDDIR)
++	$(RISCV_CC) $(RISCV_CFLAGS) -c $< -o $@
++
++$(BUILDDIR)/%.d: $(SRC_DIR)/%.c | $(BUILDDIR)
++	$(RISCV_CC) $(RISCV_CFLAGS) -MM -MT '$(@:.d=.o)' $< > $@
++
++$(LIB): $(OBJS) | $(BUILDDIR)
++	$(RISCV_AR) $(RISCV_ARFLAGS) $@ $^
++
++$(DUMP): $(LIB) | $(BUILDDIR)
++	$(RISCV_OBJDUMP) -D $< > $@
++
++ifneq ($(MAKECMDGOALS),clean)
++-include $(DEPS)
++endif
+diff --git a/include/math.h b/include/math.h
+index 6dad71c..14f28ec 100644
+--- a/include/math.h
++++ b/include/math.h
+@@ -435,9 +435,6 @@ float       pow10f(float);
+ long double pow10l(long double);
+ #endif
+ 
+-#include "../src/math/expm1.c"
+-#include "../src/math/tanh.c"
+-
+ #ifdef __cplusplus
+ }
+ #endif
+-- 
+2.28.0
+
diff --git a/sw/deps/patches/musl/0003-sw-math-Add-safe-FP-INT-conversions.patch b/sw/deps/patches/musl/0003-sw-math-Add-safe-FP-INT-conversions.patch
new file mode 100644
index 000000000..050af9d33
--- /dev/null
+++ b/sw/deps/patches/musl/0003-sw-math-Add-safe-FP-INT-conversions.patch
@@ -0,0 +1,81 @@
+From eb96f4d7454a07498f571eb1ed18aa1db2413551 Mon Sep 17 00:00:00 2001
+From: Luca Colagrande <luca.colagrande3@gmail.com>
+Date: Mon, 23 Oct 2023 16:45:17 +0200
+Subject: [PATCH] `sw/math`: Add safe FP <--> INT conversions
+
+---
+ src/internal/libm.h | 51 +++++++++++++++++++++++++++++++++++++++++----
+ 1 file changed, 47 insertions(+), 4 deletions(-)
+
+diff --git a/src/internal/libm.h b/src/internal/libm.h
+index 72ad17d..60b9866 100644
+--- a/src/internal/libm.h
++++ b/src/internal/libm.h
+@@ -96,6 +96,47 @@ static int32_t converttoint(double_t);
+ #define predict_false(x) (x)
+ #endif
+ 
++/* FPU fence to synchronize the FPU and integer core in Snitch. */
++inline void snrt_fpu_fence() {
++    unsigned tmp;
++    __asm__ volatile(
++        "fmv.x.w %0, fa0\n"
++        "mv      %0, %0\n"
++        : "+r"(tmp)::"memory");
++}
++
++/* Synch-secure double to uint64 conversion functions. */
++static inline uint64_t asuint64(double f) {
++    uint64_t result;
++    snrt_fpu_fence();
++    result = *(uint64_t *)&f;
++    return result;
++}
++
++/* Synch-secure float to uint conversion functions. */
++static inline uint64_t asuint(float f) {
++    uint32_t result;
++    snrt_fpu_fence();
++    result = *(uint32_t *)&f;
++    return result;
++}
++
++/* Synch-secure uint64 to double conversion functions. */
++static inline double asdouble(uint64_t i) {
++    double result;
++    snrt_fpu_fence();
++    result = *(double *)&i;
++    return result;
++}
++
++/* Synch-secure uint to float conversion functions. */
++static inline float asfloat(uint32_t i) {
++	float result;
++	snrt_fpu_fence();
++	result = *(float *)&i;
++	return result;
++}
++
+ /* Evaluate an expression as the specified type. With standard excess
+    precision handling a type cast or assignment is enough (with
+    -ffloat-store an assignment is required, in old compilers argument
+@@ -187,10 +228,12 @@ static inline void fp_force_evall(long double x)
+ 	}                                         \
+ } while(0)
+ 
+-#define asuint(f) ((union{float _f; uint32_t _i;}){f})._i
+-#define asfloat(i) ((union{uint32_t _i; float _f;}){i})._f
+-#define asuint64(f) ((union{double _f; uint64_t _i;}){f})._i
+-#define asdouble(i) ((union{uint64_t _i; double _f;}){i})._f
++// Unsafe in Snitch due to the decoupled FPU and integer
++// arithmetic units. Use at your own risk.
++#define asuint_unsafe(f) ((union{float _f; uint32_t _i;}){f})._i
++#define asfloat_unsafe(i) ((union{uint32_t _i; float _f;}){i})._f
++#define asuint64_unsafe(f) ((union{double _f; uint64_t _i;}){f})._i
++#define asdouble_unsafe(i) ((union{uint64_t _i; double _f;}){i})._f
+ 
+ #define EXTRACT_WORDS(hi,lo,d)                    \
+ do {                                              \
+-- 
+2.28.0
+
diff --git a/sw/deps/patches/musl/0004-sw-math-Implement-safe-tanh-function.patch b/sw/deps/patches/musl/0004-sw-math-Implement-safe-tanh-function.patch
new file mode 100644
index 000000000..cffc3c407
--- /dev/null
+++ b/sw/deps/patches/musl/0004-sw-math-Implement-safe-tanh-function.patch
@@ -0,0 +1,149 @@
+From b419b07facc9591ba0d8683f53c9adefb8a9b0c6 Mon Sep 17 00:00:00 2001
+From: Luca Colagrande <luca.colagrande3@gmail.com>
+Date: Wed, 8 Nov 2023 09:35:17 +0100
+Subject: [PATCH] `sw/math`: Implement safe `tanh` function
+
+---
+ src/internal/libm.h | 31 +++++++++++++++++++++++++++++++
+ src/math/expm1.c    | 34 ++++++++++++++++++++++++++--------
+ src/math/tanh.c     | 17 ++++++++++++-----
+ 3 files changed, 69 insertions(+), 13 deletions(-)
+
+diff --git a/src/internal/libm.h b/src/internal/libm.h
+index 60b9866..c96c0ec 100644
+--- a/src/internal/libm.h
++++ b/src/internal/libm.h
+@@ -96,6 +96,37 @@ static int32_t converttoint(double_t);
+ #define predict_false(x) (x)
+ #endif
+ 
++/* Memory-consistent functions to manipulate the upper word of a
++   double-precision floating-point number in the integer core.
++   Since there is no dedicated instruction to move the upper 32-bits
++   of a double-precision floating point register to an integer register
++   the compiler resorts to moving the value through the memory. However in
++   Snitch neither the program ordering between floating-point and integer
++   instructions is guaranteed, nor is memory consistency between the integer
++   and floating-point threads.  */
++
++static inline uint32_t safe_extract_upper_32b_from_double(double x) {
++	double f;
++	uint32_t result;
++	asm volatile("fsd %[x], 0(%[ptr]) \n"
++	             "fld ft3, 0(%[ptr]) \n"
++				 "fmv.x.w t0, ft3 \n"
++				 "mv      t0, t0 \n"
++				 "lw %[result], 4(%[ptr]) \n"
++	 : [result]"=r"(result) : [x]"f"(x), [ptr]"r"(&f): "ft3", "t0", "memory");
++	return result;
++}
++
++static inline void safe_inject_into_upper_32b_double(uint32_t x, double *f) {
++	asm volatile("sw %[x], 4(%[ptr]) \n"
++                 "lw %[x], 4(%[ptr]) \n"
++                 "fmv.w.x ft3, %[x] \n"
++	 : : [x]"r"(x), [ptr]"r"(f): "ft3", "memory");
++}
++
++/* TODO: the following functions are not really safe, compare previous two
++   functions */
++
+ /* FPU fence to synchronize the FPU and integer core in Snitch. */
+ inline void snrt_fpu_fence() {
+     unsigned tmp;
+diff --git a/src/math/expm1.c b/src/math/expm1.c
+index ac1e61e..d94f57f 100644
+--- a/src/math/expm1.c
++++ b/src/math/expm1.c
+@@ -121,9 +121,14 @@ Q5 = -2.01099218183624371326e-07; /* BE8AFDB7 6E09C32D */
+ double expm1(double x)
+ {
+ 	double_t y,hi,lo,c,t,e,hxs,hfx,r1,twopk;
+-	union {double f; uint64_t i;} u = {x};
+-	uint32_t hx = u.i>>32 & 0x7fffffff;
+-	int k, sign = u.i>>63;
++	/// Original implementation
++	// union {double f; uint64_t i;} u = {x};
++	// uint32_t hx = u.i>>32 & 0x7fffffff;
++	// int k, sign = u.i>>63;
++	/// Safe implementation in Snitch
++	uint32_t upper_32b_x = safe_extract_upper_32b_from_double(x);
++	uint32_t hx = upper_32b_x & 0x7fffffff;
++	int k, sign = upper_32b_x>>31;
+ 
+ 	/* filter out huge and non-finite argument */
+ 	if (hx >= 0x4043687A) {  /* if |x|>=56*ln2 */
+@@ -182,8 +187,12 @@ double expm1(double x)
+ 			return -2.0*(e-(x+0.5));
+ 		return 1.0+2.0*(x-e);
+ 	}
+-	u.i = (uint64_t)(0x3ff + k)<<52;  /* 2^k */
+-	twopk = u.f;
++	/// Original implementation
++	// u.i = (uint64_t)(0x3ff + k)<<52;  /* 2^k */
++	// twopk = u.f;
++	/// Safe implementation in Snitch
++	uint32_t u_i = (uint32_t)(0x3ff + k)<<20;
++	safe_inject_into_upper_32b_double(u_i, &twopk);
+ 	if (k < 0 || k > 56) {  /* suffice to return exp(x)-1 */
+ 		y = x - e + 1.0;
+ 		if (k == 1024)
+@@ -192,10 +201,19 @@ double expm1(double x)
+ 			y = y*twopk;
+ 		return y - 1.0;
+ 	}
+-	u.i = (uint64_t)(0x3ff - k)<<52;  /* 2^-k */
++	/// Original implementation
++	// u.i = (uint64_t)(0x3ff - k)<<52;  /* 2^-k */
++	// if (k < 20)
++	// 	y = (x-e+(1-u.f))*twopk;
++	// else
++	// 	y = (x-(e+u.f)+1)*twopk;
++	/// Safe implementation in Snitch
++	u_i = (uint32_t)(0x3ff - k)<<20;
++	double u_f = 0;
++	safe_inject_into_upper_32b_double(u_i, &u_f);
+ 	if (k < 20)
+-		y = (x-e+(1-u.f))*twopk;
++		y = (x-e+(1-u_f))*twopk;
+ 	else
+-		y = (x-(e+u.f)+1)*twopk;
++		y = (x-(e+u_f)+1)*twopk;
+ 	return y;
+ }
+diff --git a/src/math/tanh.c b/src/math/tanh.c
+index 20d6dbc..2481db1 100644
+--- a/src/math/tanh.c
++++ b/src/math/tanh.c
+@@ -6,16 +6,23 @@
+  */
+ double tanh(double x)
+ {
+-	union {double f; uint64_t i;} u = {.f = x};
+ 	uint32_t w;
+ 	int sign;
+ 	double_t t;
+ 
+ 	/* x = |x| */
+-	sign = u.i >> 63;
+-	u.i &= (uint64_t)-1/2;
+-	x = u.f;
+-	w = u.i >> 32;
++	/// Original implementation
++	// union {double f; uint64_t i;} u = {.f = x};
++	// sign = u.i >> 63;
++	// u.i &= (uint64_t)-1/2;
++	// x = u.f;
++	// w = u.i >> 32;
++	/// Safe implementation in Snitch
++	uint32_t upper_32b_x = safe_extract_upper_32b_from_double(x);
++	sign = upper_32b_x >> 31;
++	uint32_t sign_mask = (~(1 << 31));
++	w = upper_32b_x & sign_mask;
++	safe_inject_into_upper_32b_double(w, &x);
+ 
+ 	if (w > 0x3fe193ea) {
+ 		/* |x| > log(3)/2 ~= 0.5493 or nan */
+-- 
+2.28.0
+
diff --git a/sw/math/Makefile b/sw/math/Makefile
index 132795388..afb3192d1 100644
--- a/sw/math/Makefile
+++ b/sw/math/Makefile
@@ -1,17 +1,86 @@
-BITS_DIR = include/bits
+# Copyright 2023 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Luca Colagrande <colluca@iis.ee.ethz.ch>
+# Viviane Potocnik, ETH Zurich <vivianep@iis.ee.ethz.ch>
+
+# Usage of absolute paths is required to externally include
+# this Makefile from multiple different locations
+MK_DIR := $(dir $(realpath $(lastword $(MAKEFILE_LIST))))
+
+###############
+# Directories #
+###############
+
+BUILDDIR ?= $(abspath build)
+SRC_DIR   = $(MK_DIR)/src/math
+BITS_DIR  = $(MK_DIR)/include/bits
+
+###################
+# Build variables #
+###################
+
+INCDIRS += $(MK_DIR)/arch/riscv64/
+INCDIRS += $(MK_DIR)/arch/generic
+INCDIRS += $(MK_DIR)/src/include
+INCDIRS += $(MK_DIR)/src/internal
+INCDIRS += $(MK_DIR)/include/bits
+INCDIRS += $(MK_DIR)/include
+
+SRCS = $(abspath $(wildcard $(SRC_DIR)/*.c))
+
+###########
+# Outputs #
+###########
+
 ALLTYPES_H = $(BITS_DIR)/alltypes.h
 
+OBJS        = $(addprefix $(BUILDDIR)/,$(addsuffix .o,$(basename $(notdir $(SRCS)))))
+DEPS        = $(addprefix $(BUILDDIR)/,$(addsuffix .d,$(basename $(notdir $(SRCS)))))
+LIB         = $(BUILDDIR)/libmath.a
+DUMP        = $(BUILDDIR)/libmath.dump
+ALL_OUTPUTS = $(LIB) $(DUMP)
 
-.PHONY: all clean
+#########
+# Rules #
+#########
 
-all: $(ALLTYPES_H)
+.PHONY: all
+all: $(ALL_OUTPUTS)
 
+.PHONY: clean
 clean:
 	rm -rf $(BITS_DIR)
 	rm -f $(ALLTYPES_H)
+	rm -rf $(BUILDDIR)
 
 $(BITS_DIR):
 	mkdir -p $@
 
 $(ALLTYPES_H): | $(BITS_DIR)
-	sed -f tools/mkalltypes.sed arch/riscv64/bits/alltypes.h.in include/alltypes.h.in > $@
+	sed -f $(MK_DIR)/tools/mkalltypes.sed $(MK_DIR)/arch/riscv64/bits/alltypes.h.in $(MK_DIR)/include/alltypes.h.in > $@
+
+$(DEPS): $(ALLTYPES_H)
+
+$(BUILDDIR):
+	mkdir -p $@
+
+$(BUILDDIR)/%.o: $(SRC_DIR)/%.S | $(BUILDDIR)
+	$(RISCV_CC) $(RISCV_CFLAGS) -c $< -o $@
+
+$(BUILDDIR)/%.o: $(SRC_DIR)/%.c | $(BUILDDIR)
+	$(RISCV_CC) $(RISCV_CFLAGS) -c $< -o $@
+
+$(BUILDDIR)/%.d: $(SRC_DIR)/%.c | $(BUILDDIR)
+	$(RISCV_CC) $(RISCV_CFLAGS) -MM -MT '$(@:.d=.o)' $< > $@
+
+$(LIB): $(OBJS) | $(BUILDDIR)
+	$(RISCV_AR) $(RISCV_ARFLAGS) $@ $^
+
+$(DUMP): $(LIB) | $(BUILDDIR)
+	$(RISCV_OBJDUMP) -D $< > $@
+
+ifneq ($(MAKECMDGOALS),clean)
+-include $(DEPS)
+endif
diff --git a/sw/math/include/float.h b/sw/math/include/float.h
new file mode 100644
index 000000000..713aadb90
--- /dev/null
+++ b/sw/math/include/float.h
@@ -0,0 +1,52 @@
+#ifndef _FLOAT_H
+#define _FLOAT_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int __flt_rounds(void);
+#define FLT_ROUNDS (__flt_rounds())
+
+#define FLT_RADIX 2
+
+#define FLT_TRUE_MIN 1.40129846432481707092e-45F
+#define FLT_MIN 1.17549435082228750797e-38F
+#define FLT_MAX 3.40282346638528859812e+38F
+#define FLT_EPSILON 1.1920928955078125e-07F
+
+#define FLT_MANT_DIG 24
+#define FLT_MIN_EXP (-125)
+#define FLT_MAX_EXP 128
+#define FLT_HAS_SUBNORM 1
+
+#define FLT_DIG 6
+#define FLT_DECIMAL_DIG 9
+#define FLT_MIN_10_EXP (-37)
+#define FLT_MAX_10_EXP 38
+
+#define DBL_TRUE_MIN 4.94065645841246544177e-324
+#define DBL_MIN 2.22507385850720138309e-308
+#define DBL_MAX 1.79769313486231570815e+308
+#define DBL_EPSILON 2.22044604925031308085e-16
+
+#define DBL_MANT_DIG 53
+#define DBL_MIN_EXP (-1021)
+#define DBL_MAX_EXP 1024
+#define DBL_HAS_SUBNORM 1
+
+#define DBL_DIG 15
+#define DBL_DECIMAL_DIG 17
+#define DBL_MIN_10_EXP (-307)
+#define DBL_MAX_10_EXP 308
+
+#define LDBL_HAS_SUBNORM 1
+#define LDBL_DECIMAL_DIG DECIMAL_DIG
+
+#include <bits/float.h>
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/sw/math/include/math.h b/sw/math/include/math.h
index 6dad71c1e..14f28ec8c 100644
--- a/sw/math/include/math.h
+++ b/sw/math/include/math.h
@@ -435,9 +435,6 @@ float       pow10f(float);
 long double pow10l(long double);
 #endif
 
-#include "../src/math/expm1.c"
-#include "../src/math/tanh.c"
-
 #ifdef __cplusplus
 }
 #endif
diff --git a/sw/math/src/internal/libm.h b/sw/math/src/internal/libm.h
index 72ad17d8e..c96c0eced 100644
--- a/sw/math/src/internal/libm.h
+++ b/sw/math/src/internal/libm.h
@@ -96,6 +96,78 @@ static int32_t converttoint(double_t);
 #define predict_false(x) (x)
 #endif
 
+/* Memory-consistent functions to manipulate the upper word of a
+   double-precision floating-point number in the integer core.
+   Since there is no dedicated instruction to move the upper 32-bits
+   of a double-precision floating point register to an integer register
+   the compiler resorts to moving the value through the memory. However in
+   Snitch neither the program ordering between floating-point and integer
+   instructions is guaranteed, nor is memory consistency between the integer
+   and floating-point threads.  */
+
+static inline uint32_t safe_extract_upper_32b_from_double(double x) {
+	double f;
+	uint32_t result;
+	asm volatile("fsd %[x], 0(%[ptr]) \n"
+	             "fld ft3, 0(%[ptr]) \n"
+				 "fmv.x.w t0, ft3 \n"
+				 "mv      t0, t0 \n"
+				 "lw %[result], 4(%[ptr]) \n"
+	 : [result]"=r"(result) : [x]"f"(x), [ptr]"r"(&f): "ft3", "t0", "memory");
+	return result;
+}
+
+static inline void safe_inject_into_upper_32b_double(uint32_t x, double *f) {
+	asm volatile("sw %[x], 4(%[ptr]) \n"
+                 "lw %[x], 4(%[ptr]) \n"
+                 "fmv.w.x ft3, %[x] \n"
+	 : : [x]"r"(x), [ptr]"r"(f): "ft3", "memory");
+}
+
+/* TODO: the following functions are not really safe, compare previous two
+   functions */
+
+/* FPU fence to synchronize the FPU and integer core in Snitch. */
+inline void snrt_fpu_fence() {
+    unsigned tmp;
+    __asm__ volatile(
+        "fmv.x.w %0, fa0\n"
+        "mv      %0, %0\n"
+        : "+r"(tmp)::"memory");
+}
+
+/* Synch-secure double to uint64 conversion functions. */
+static inline uint64_t asuint64(double f) {
+    uint64_t result;
+    snrt_fpu_fence();
+    result = *(uint64_t *)&f;
+    return result;
+}
+
+/* Synch-secure float to uint conversion functions. */
+static inline uint64_t asuint(float f) {
+    uint32_t result;
+    snrt_fpu_fence();
+    result = *(uint32_t *)&f;
+    return result;
+}
+
+/* Synch-secure uint64 to double conversion functions. */
+static inline double asdouble(uint64_t i) {
+    double result;
+    snrt_fpu_fence();
+    result = *(double *)&i;
+    return result;
+}
+
+/* Synch-secure uint to float conversion functions. */
+static inline float asfloat(uint32_t i) {
+	float result;
+	snrt_fpu_fence();
+	result = *(float *)&i;
+	return result;
+}
+
 /* Evaluate an expression as the specified type. With standard excess
    precision handling a type cast or assignment is enough (with
    -ffloat-store an assignment is required, in old compilers argument
@@ -187,10 +259,12 @@ static inline void fp_force_evall(long double x)
 	}                                         \
 } while(0)
 
-#define asuint(f) ((union{float _f; uint32_t _i;}){f})._i
-#define asfloat(i) ((union{uint32_t _i; float _f;}){i})._f
-#define asuint64(f) ((union{double _f; uint64_t _i;}){f})._i
-#define asdouble(i) ((union{uint64_t _i; double _f;}){i})._f
+// Unsafe in Snitch due to the decoupled FPU and integer
+// arithmetic units. Use at your own risk.
+#define asuint_unsafe(f) ((union{float _f; uint32_t _i;}){f})._i
+#define asfloat_unsafe(i) ((union{uint32_t _i; float _f;}){i})._f
+#define asuint64_unsafe(f) ((union{double _f; uint64_t _i;}){f})._i
+#define asdouble_unsafe(i) ((union{uint64_t _i; double _f;}){i})._f
 
 #define EXTRACT_WORDS(hi,lo,d)                    \
 do {                                              \
diff --git a/sw/math/src/math/__math_divzero.c b/sw/math/src/math/__math_divzero.c
new file mode 100644
index 000000000..59d213500
--- /dev/null
+++ b/sw/math/src/math/__math_divzero.c
@@ -0,0 +1,6 @@
+#include "libm.h"
+
+double __math_divzero(uint32_t sign)
+{
+	return fp_barrier(sign ? -1.0 : 1.0) / 0.0;
+}
diff --git a/sw/math/src/math/__math_invalid.c b/sw/math/src/math/__math_invalid.c
new file mode 100644
index 000000000..177404900
--- /dev/null
+++ b/sw/math/src/math/__math_invalid.c
@@ -0,0 +1,6 @@
+#include "libm.h"
+
+double __math_invalid(double x)
+{
+	return (x - x) / (x - x);
+}
diff --git a/sw/math/src/math/__math_invalidf.c b/sw/math/src/math/__math_invalidf.c
new file mode 100644
index 000000000..357d4b121
--- /dev/null
+++ b/sw/math/src/math/__math_invalidf.c
@@ -0,0 +1,6 @@
+#include "libm.h"
+
+float __math_invalidf(float x)
+{
+	return (x - x) / (x - x);
+}
diff --git a/sw/math/src/math/__math_invalidl.c b/sw/math/src/math/__math_invalidl.c
new file mode 100644
index 000000000..1fca99de4
--- /dev/null
+++ b/sw/math/src/math/__math_invalidl.c
@@ -0,0 +1,9 @@
+#include <float.h>
+#include "libm.h"
+
+#if LDBL_MANT_DIG != DBL_MANT_DIG
+long double __math_invalidl(long double x)
+{
+	return (x - x) / (x - x);
+}
+#endif
diff --git a/sw/math/src/math/__math_oflow.c b/sw/math/src/math/__math_oflow.c
new file mode 100644
index 000000000..c85dbf982
--- /dev/null
+++ b/sw/math/src/math/__math_oflow.c
@@ -0,0 +1,6 @@
+#include "libm.h"
+
+double __math_oflow(uint32_t sign)
+{
+	return __math_xflow(sign, 0x1p769);
+}
diff --git a/sw/math/src/math/__math_oflowf.c b/sw/math/src/math/__math_oflowf.c
new file mode 100644
index 000000000..fa7d06208
--- /dev/null
+++ b/sw/math/src/math/__math_oflowf.c
@@ -0,0 +1,6 @@
+#include "libm.h"
+
+float __math_oflowf(uint32_t sign)
+{
+	return __math_xflowf(sign, 0x1p97f);
+}
diff --git a/sw/math/src/math/__math_uflow.c b/sw/math/src/math/__math_uflow.c
new file mode 100644
index 000000000..b90594aee
--- /dev/null
+++ b/sw/math/src/math/__math_uflow.c
@@ -0,0 +1,6 @@
+#include "libm.h"
+
+double __math_uflow(uint32_t sign)
+{
+	return __math_xflow(sign, 0x1p-767);
+}
diff --git a/sw/math/src/math/__math_uflowf.c b/sw/math/src/math/__math_uflowf.c
new file mode 100644
index 000000000..94d50f2bf
--- /dev/null
+++ b/sw/math/src/math/__math_uflowf.c
@@ -0,0 +1,6 @@
+#include "libm.h"
+
+float __math_uflowf(uint32_t sign)
+{
+	return __math_xflowf(sign, 0x1p-95f);
+}
diff --git a/sw/math/src/math/__math_xflow.c b/sw/math/src/math/__math_xflow.c
new file mode 100644
index 000000000..744203c4c
--- /dev/null
+++ b/sw/math/src/math/__math_xflow.c
@@ -0,0 +1,6 @@
+#include "libm.h"
+
+double __math_xflow(uint32_t sign, double y)
+{
+	return eval_as_double(fp_barrier(sign ? -y : y) * y);
+}
diff --git a/sw/math/src/math/__math_xflowf.c b/sw/math/src/math/__math_xflowf.c
new file mode 100644
index 000000000..f2c84784f
--- /dev/null
+++ b/sw/math/src/math/__math_xflowf.c
@@ -0,0 +1,6 @@
+#include "libm.h"
+
+float __math_xflowf(uint32_t sign, float y)
+{
+	return eval_as_float(fp_barrierf(sign ? -y : y) * y);
+}
diff --git a/sw/math/src/math/ceil.c b/sw/math/src/math/ceil.c
new file mode 100644
index 000000000..b13e6f2d6
--- /dev/null
+++ b/sw/math/src/math/ceil.c
@@ -0,0 +1,31 @@
+#include "libm.h"
+
+#if FLT_EVAL_METHOD==0 || FLT_EVAL_METHOD==1
+#define EPS DBL_EPSILON
+#elif FLT_EVAL_METHOD==2
+#define EPS LDBL_EPSILON
+#endif
+static const double_t toint = 1/EPS;
+
+double ceil(double x)
+{
+	union {double f; uint64_t i;} u = {x};
+	int e = u.i >> 52 & 0x7ff;
+	double_t y;
+
+	if (e >= 0x3ff+52 || x == 0)
+		return x;
+	/* y = int(x) - x, where int(x) is an integer neighbor of x */
+	if (u.i >> 63)
+		y = x - toint + toint - x;
+	else
+		y = x + toint - toint - x;
+	/* special case because of non-nearest rounding modes */
+	if (e <= 0x3ff-1) {
+		FORCE_EVAL(y);
+		return u.i >> 63 ? -0.0 : 1;
+	}
+	if (y < 0)
+		return x + y + 1;
+	return x + y;
+}
diff --git a/sw/math/src/math/ceilf.c b/sw/math/src/math/ceilf.c
new file mode 100644
index 000000000..869835f39
--- /dev/null
+++ b/sw/math/src/math/ceilf.c
@@ -0,0 +1,27 @@
+#include "libm.h"
+
+float ceilf(float x)
+{
+	union {float f; uint32_t i;} u = {x};
+	int e = (int)(u.i >> 23 & 0xff) - 0x7f;
+	uint32_t m;
+
+	if (e >= 23)
+		return x;
+	if (e >= 0) {
+		m = 0x007fffff >> e;
+		if ((u.i & m) == 0)
+			return x;
+		FORCE_EVAL(x + 0x1p120f);
+		if (u.i >> 31 == 0)
+			u.i += m;
+		u.i &= ~m;
+	} else {
+		FORCE_EVAL(x + 0x1p120f);
+		if (u.i >> 31)
+			u.f = -0.0;
+		else if (u.i << 1)
+			u.f = 1.0;
+	}
+	return u.f;
+}
diff --git a/sw/math/src/math/ceill.c b/sw/math/src/math/ceill.c
new file mode 100644
index 000000000..60a83020d
--- /dev/null
+++ b/sw/math/src/math/ceill.c
@@ -0,0 +1,34 @@
+#include "libm.h"
+
+#if LDBL_MANT_DIG == 53 && LDBL_MAX_EXP == 1024
+long double ceill(long double x)
+{
+	return ceil(x);
+}
+#elif (LDBL_MANT_DIG == 64 || LDBL_MANT_DIG == 113) && LDBL_MAX_EXP == 16384
+
+static const long double toint = 1/LDBL_EPSILON;
+
+long double ceill(long double x)
+{
+	union ldshape u = {x};
+	int e = u.i.se & 0x7fff;
+	long double y;
+
+	if (e >= 0x3fff+LDBL_MANT_DIG-1 || x == 0)
+		return x;
+	/* y = int(x) - x, where int(x) is an integer neighbor of x */
+	if (u.i.se >> 15)
+		y = x - toint + toint - x;
+	else
+		y = x + toint - toint - x;
+	/* special case because of non-nearest rounding modes */
+	if (e <= 0x3fff-1) {
+		FORCE_EVAL(y);
+		return u.i.se >> 15 ? -0.0 : 1;
+	}
+	if (y < 0)
+		return x + y + 1;
+	return x + y;
+}
+#endif
diff --git a/sw/math/src/math/exp2f_data.c b/sw/math/src/math/exp2f_data.c
new file mode 100644
index 000000000..be324727f
--- /dev/null
+++ b/sw/math/src/math/exp2f_data.c
@@ -0,0 +1,35 @@
+/*
+ * Shared data between expf, exp2f and powf.
+ *
+ * Copyright (c) 2017-2018, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "exp2f_data.h"
+
+#define N (1 << EXP2F_TABLE_BITS)
+
+const struct exp2f_data __exp2f_data = {
+  /* tab[i] = uint(2^(i/N)) - (i << 52-BITS)
+     used for computing 2^(k/N) for an int |k| < 150 N as
+     double(tab[k%N] + (k << 52-BITS)) */
+  .tab = {
+0x3ff0000000000000, 0x3fefd9b0d3158574, 0x3fefb5586cf9890f, 0x3fef9301d0125b51,
+0x3fef72b83c7d517b, 0x3fef54873168b9aa, 0x3fef387a6e756238, 0x3fef1e9df51fdee1,
+0x3fef06fe0a31b715, 0x3feef1a7373aa9cb, 0x3feedea64c123422, 0x3feece086061892d,
+0x3feebfdad5362a27, 0x3feeb42b569d4f82, 0x3feeab07dd485429, 0x3feea47eb03a5585,
+0x3feea09e667f3bcd, 0x3fee9f75e8ec5f74, 0x3feea11473eb0187, 0x3feea589994cce13,
+0x3feeace5422aa0db, 0x3feeb737b0cdc5e5, 0x3feec49182a3f090, 0x3feed503b23e255d,
+0x3feee89f995ad3ad, 0x3feeff76f2fb5e47, 0x3fef199bdd85529c, 0x3fef3720dcef9069,
+0x3fef5818dcfba487, 0x3fef7c97337b9b5f, 0x3fefa4afa2a490da, 0x3fefd0765b6e4540,
+  },
+  .shift_scaled = 0x1.8p+52 / N,
+  .poly = {
+  0x1.c6af84b912394p-5, 0x1.ebfce50fac4f3p-3, 0x1.62e42ff0c52d6p-1,
+  },
+  .shift = 0x1.8p+52,
+  .invln2_scaled = 0x1.71547652b82fep+0 * N,
+  .poly_scaled = {
+  0x1.c6af84b912394p-5/N/N/N, 0x1.ebfce50fac4f3p-3/N/N, 0x1.62e42ff0c52d6p-1/N,
+  },
+};
diff --git a/sw/math/src/math/exp2f_data.h b/sw/math/src/math/exp2f_data.h
new file mode 100644
index 000000000..fe744f15b
--- /dev/null
+++ b/sw/math/src/math/exp2f_data.h
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2017-2018, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+#ifndef _EXP2F_DATA_H
+#define _EXP2F_DATA_H
+
+#include <features.h>
+#include <stdint.h>
+
+/* Shared between expf, exp2f and powf.  */
+#define EXP2F_TABLE_BITS 5
+#define EXP2F_POLY_ORDER 3
+extern hidden const struct exp2f_data {
+	uint64_t tab[1 << EXP2F_TABLE_BITS];
+	double shift_scaled;
+	double poly[EXP2F_POLY_ORDER];
+	double shift;
+	double invln2_scaled;
+	double poly_scaled[EXP2F_POLY_ORDER];
+} __exp2f_data;
+
+#endif
diff --git a/sw/math/src/math/expf.c b/sw/math/src/math/expf.c
new file mode 100644
index 000000000..f9fbf8e72
--- /dev/null
+++ b/sw/math/src/math/expf.c
@@ -0,0 +1,80 @@
+/*
+ * Single-precision e^x function.
+ *
+ * Copyright (c) 2017-2018, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <math.h>
+#include <stdint.h>
+#include "libm.h"
+#include "exp2f_data.h"
+
+/*
+EXP2F_TABLE_BITS = 5
+EXP2F_POLY_ORDER = 3
+
+ULP error: 0.502 (nearest rounding.)
+Relative error: 1.69 * 2^-34 in [-ln2/64, ln2/64] (before rounding.)
+Wrong count: 170635 (all nearest rounding wrong results with fma.)
+Non-nearest ULP error: 1 (rounded ULP error)
+*/
+
+#define N (1 << EXP2F_TABLE_BITS)
+#define InvLn2N __exp2f_data.invln2_scaled
+#define T __exp2f_data.tab
+#define C __exp2f_data.poly_scaled
+
+static inline uint32_t top12(float x)
+{
+	return asuint(x) >> 20;
+}
+
+float expf(float x)
+{
+	uint32_t abstop;
+	uint64_t ki, t;
+	double_t kd, xd, z, r, r2, y, s;
+
+	xd = (double_t)x;
+	abstop = top12(x) & 0x7ff;
+	if (predict_false(abstop >= top12(88.0f))) {
+		/* |x| >= 88 or x is nan.  */
+		if (asuint(x) == asuint(-INFINITY))
+			return 0.0f;
+		if (abstop >= top12(INFINITY))
+			return x + x;
+		if (x > 0x1.62e42ep6f) /* x > log(0x1p128) ~= 88.72 */
+			return __math_oflowf(0);
+		if (x < -0x1.9fe368p6f) /* x < log(0x1p-150) ~= -103.97 */
+			return __math_uflowf(0);
+	}
+
+	/* x*N/Ln2 = k + r with r in [-1/2, 1/2] and int k.  */
+	z = InvLn2N * xd;
+
+	/* Round and convert z to int, the result is in [-150*N, 128*N] and
+	   ideally ties-to-even rule is used, otherwise the magnitude of r
+	   can be bigger which gives larger approximation error.  */
+#if TOINT_INTRINSICS
+	kd = roundtoint(z);
+	ki = converttoint(z);
+#else
+# define SHIFT __exp2f_data.shift
+	kd = eval_as_double(z + SHIFT);
+	ki = asuint64(kd);
+	kd -= SHIFT;
+#endif
+	r = z - kd;
+
+	/* exp(x) = 2^(k/N) * 2^(r/N) ~= s * (C0*r^3 + C1*r^2 + C2*r + 1) */
+	t = T[ki % N];
+	t += ki << (52 - EXP2F_TABLE_BITS);
+	s = asdouble(t);
+	z = C[0] * r + C[1];
+	r2 = r * r;
+	y = C[2] * r + 1;
+	y = z * r2 + y;
+	y = y * s;
+	return eval_as_float(y);
+}
diff --git a/sw/math/src/math/expm1.c b/sw/math/src/math/expm1.c
index ac1e61e4f..d94f57fe5 100644
--- a/sw/math/src/math/expm1.c
+++ b/sw/math/src/math/expm1.c
@@ -121,9 +121,14 @@ Q5 = -2.01099218183624371326e-07; /* BE8AFDB7 6E09C32D */
 double expm1(double x)
 {
 	double_t y,hi,lo,c,t,e,hxs,hfx,r1,twopk;
-	union {double f; uint64_t i;} u = {x};
-	uint32_t hx = u.i>>32 & 0x7fffffff;
-	int k, sign = u.i>>63;
+	/// Original implementation
+	// union {double f; uint64_t i;} u = {x};
+	// uint32_t hx = u.i>>32 & 0x7fffffff;
+	// int k, sign = u.i>>63;
+	/// Safe implementation in Snitch
+	uint32_t upper_32b_x = safe_extract_upper_32b_from_double(x);
+	uint32_t hx = upper_32b_x & 0x7fffffff;
+	int k, sign = upper_32b_x>>31;
 
 	/* filter out huge and non-finite argument */
 	if (hx >= 0x4043687A) {  /* if |x|>=56*ln2 */
@@ -182,8 +187,12 @@ double expm1(double x)
 			return -2.0*(e-(x+0.5));
 		return 1.0+2.0*(x-e);
 	}
-	u.i = (uint64_t)(0x3ff + k)<<52;  /* 2^k */
-	twopk = u.f;
+	/// Original implementation
+	// u.i = (uint64_t)(0x3ff + k)<<52;  /* 2^k */
+	// twopk = u.f;
+	/// Safe implementation in Snitch
+	uint32_t u_i = (uint32_t)(0x3ff + k)<<20;
+	safe_inject_into_upper_32b_double(u_i, &twopk);
 	if (k < 0 || k > 56) {  /* suffice to return exp(x)-1 */
 		y = x - e + 1.0;
 		if (k == 1024)
@@ -192,10 +201,19 @@ double expm1(double x)
 			y = y*twopk;
 		return y - 1.0;
 	}
-	u.i = (uint64_t)(0x3ff - k)<<52;  /* 2^-k */
+	/// Original implementation
+	// u.i = (uint64_t)(0x3ff - k)<<52;  /* 2^-k */
+	// if (k < 20)
+	// 	y = (x-e+(1-u.f))*twopk;
+	// else
+	// 	y = (x-(e+u.f)+1)*twopk;
+	/// Safe implementation in Snitch
+	u_i = (uint32_t)(0x3ff - k)<<20;
+	double u_f = 0;
+	safe_inject_into_upper_32b_double(u_i, &u_f);
 	if (k < 20)
-		y = (x-e+(1-u.f))*twopk;
+		y = (x-e+(1-u_f))*twopk;
 	else
-		y = (x-(e+u.f)+1)*twopk;
+		y = (x-(e+u_f)+1)*twopk;
 	return y;
 }
diff --git a/sw/math/src/math/log2.c b/sw/math/src/math/log2.c
new file mode 100644
index 000000000..1276ed4e3
--- /dev/null
+++ b/sw/math/src/math/log2.c
@@ -0,0 +1,122 @@
+/*
+ * Double-precision log2(x) function.
+ *
+ * Copyright (c) 2018, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <math.h>
+#include <stdint.h>
+#include "libm.h"
+#include "log2_data.h"
+
+#define T __log2_data.tab
+#define T2 __log2_data.tab2
+#define B __log2_data.poly1
+#define A __log2_data.poly
+#define InvLn2hi __log2_data.invln2hi
+#define InvLn2lo __log2_data.invln2lo
+#define N (1 << LOG2_TABLE_BITS)
+#define OFF 0x3fe6000000000000
+
+/* Top 16 bits of a double.  */
+static inline uint32_t top16(double x)
+{
+	return asuint64(x) >> 48;
+}
+
+double log2(double x)
+{
+	double_t z, r, r2, r4, y, invc, logc, kd, hi, lo, t1, t2, t3, p;
+	uint64_t ix, iz, tmp;
+	uint32_t top;
+	int k, i;
+
+	ix = asuint64(x);
+	top = top16(x);
+#define LO asuint64(1.0 - 0x1.5b51p-5)
+#define HI asuint64(1.0 + 0x1.6ab2p-5)
+	if (predict_false(ix - LO < HI - LO)) {
+		/* Handle close to 1.0 inputs separately.  */
+		/* Fix sign of zero with downward rounding when x==1.  */
+		if (WANT_ROUNDING && predict_false(ix == asuint64(1.0)))
+			return 0;
+		r = x - 1.0;
+#if __FP_FAST_FMA
+		hi = r * InvLn2hi;
+		lo = r * InvLn2lo + __builtin_fma(r, InvLn2hi, -hi);
+#else
+		double_t rhi, rlo;
+		rhi = asdouble(asuint64(r) & -1ULL << 32);
+		rlo = r - rhi;
+		hi = rhi * InvLn2hi;
+		lo = rlo * InvLn2hi + r * InvLn2lo;
+#endif
+		r2 = r * r; /* rounding error: 0x1p-62.  */
+		r4 = r2 * r2;
+		/* Worst-case error is less than 0.54 ULP (0.55 ULP without fma).  */
+		p = r2 * (B[0] + r * B[1]);
+		y = hi + p;
+		lo += hi - y + p;
+		lo += r4 * (B[2] + r * B[3] + r2 * (B[4] + r * B[5]) +
+			    r4 * (B[6] + r * B[7] + r2 * (B[8] + r * B[9])));
+		y += lo;
+		return eval_as_double(y);
+	}
+	if (predict_false(top - 0x0010 >= 0x7ff0 - 0x0010)) {
+		/* x < 0x1p-1022 or inf or nan.  */
+		if (ix * 2 == 0)
+			return __math_divzero(1);
+		if (ix == asuint64(INFINITY)) /* log(inf) == inf.  */
+			return x;
+		if ((top & 0x8000) || (top & 0x7ff0) == 0x7ff0)
+			return __math_invalid(x);
+		/* x is subnormal, normalize it.  */
+		ix = asuint64(x * 0x1p52);
+		ix -= 52ULL << 52;
+	}
+
+	/* x = 2^k z; where z is in range [OFF,2*OFF) and exact.
+	   The range is split into N subintervals.
+	   The ith subinterval contains z and c is near its center.  */
+	tmp = ix - OFF;
+	i = (tmp >> (52 - LOG2_TABLE_BITS)) % N;
+	k = (int64_t)tmp >> 52; /* arithmetic shift */
+	iz = ix - (tmp & 0xfffULL << 52);
+	invc = T[i].invc;
+	logc = T[i].logc;
+	z = asdouble(iz);
+	kd = (double_t)k;
+
+	/* log2(x) = log2(z/c) + log2(c) + k.  */
+	/* r ~= z/c - 1, |r| < 1/(2*N).  */
+#if __FP_FAST_FMA
+	/* rounding error: 0x1p-55/N.  */
+	r = __builtin_fma(z, invc, -1.0);
+	t1 = r * InvLn2hi;
+	t2 = r * InvLn2lo + __builtin_fma(r, InvLn2hi, -t1);
+#else
+	double_t rhi, rlo;
+	/* rounding error: 0x1p-55/N + 0x1p-65.  */
+	r = (z - T2[i].chi - T2[i].clo) * invc;
+	rhi = asdouble(asuint64(r) & -1ULL << 32);
+	rlo = r - rhi;
+	t1 = rhi * InvLn2hi;
+	t2 = rlo * InvLn2hi + r * InvLn2lo;
+#endif
+
+	/* hi + lo = r/ln2 + log2(c) + k.  */
+	t3 = kd + logc;
+	hi = t3 + t1;
+	lo = t3 - hi + t1 + t2;
+
+	/* log2(r+1) = r/ln2 + r^2*poly(r).  */
+	/* Evaluation is optimized assuming superscalar pipelined execution.  */
+	r2 = r * r; /* rounding error: 0x1p-54/N^2.  */
+	r4 = r2 * r2;
+	/* Worst-case error if |y| > 0x1p-4: 0.547 ULP (0.550 ULP without fma).
+	   ~ 0.5 + 2/N/ln2 + abs-poly-error*0x1p56 ULP (+ 0.003 ULP without fma).  */
+	p = A[0] + r * A[1] + r2 * (A[2] + r * A[3]) + r4 * (A[4] + r * A[5]);
+	y = lo + r2 * p + hi;
+	return eval_as_double(y);
+}
diff --git a/sw/math/src/math/log2_data.c b/sw/math/src/math/log2_data.c
new file mode 100644
index 000000000..3dd1ca514
--- /dev/null
+++ b/sw/math/src/math/log2_data.c
@@ -0,0 +1,201 @@
+/*
+ * Data for log2.
+ *
+ * Copyright (c) 2018, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "log2_data.h"
+
+#define N (1 << LOG2_TABLE_BITS)
+
+const struct log2_data __log2_data = {
+// First coefficient: 0x1.71547652b82fe1777d0ffda0d24p0
+.invln2hi = 0x1.7154765200000p+0,
+.invln2lo = 0x1.705fc2eefa200p-33,
+.poly1 = {
+// relative error: 0x1.2fad8188p-63
+// in -0x1.5b51p-5 0x1.6ab2p-5
+-0x1.71547652b82fep-1,
+0x1.ec709dc3a03f7p-2,
+-0x1.71547652b7c3fp-2,
+0x1.2776c50f05be4p-2,
+-0x1.ec709dd768fe5p-3,
+0x1.a61761ec4e736p-3,
+-0x1.7153fbc64a79bp-3,
+0x1.484d154f01b4ap-3,
+-0x1.289e4a72c383cp-3,
+0x1.0b32f285aee66p-3,
+},
+.poly = {
+// relative error: 0x1.a72c2bf8p-58
+// abs error: 0x1.67a552c8p-66
+// in -0x1.f45p-8 0x1.f45p-8
+-0x1.71547652b8339p-1,
+0x1.ec709dc3a04bep-2,
+-0x1.7154764702ffbp-2,
+0x1.2776c50034c48p-2,
+-0x1.ec7b328ea92bcp-3,
+0x1.a6225e117f92ep-3,
+},
+/* Algorithm:
+
+	x = 2^k z
+	log2(x) = k + log2(c) + log2(z/c)
+	log2(z/c) = poly(z/c - 1)
+
+where z is in [1.6p-1; 1.6p0] which is split into N subintervals and z falls
+into the ith one, then table entries are computed as
+
+	tab[i].invc = 1/c
+	tab[i].logc = (double)log2(c)
+	tab2[i].chi = (double)c
+	tab2[i].clo = (double)(c - (double)c)
+
+where c is near the center of the subinterval and is chosen by trying +-2^29
+floating point invc candidates around 1/center and selecting one for which
+
+	1) the rounding error in 0x1.8p10 + logc is 0,
+	2) the rounding error in z - chi - clo is < 0x1p-64 and
+	3) the rounding error in (double)log2(c) is minimized (< 0x1p-68).
+
+Note: 1) ensures that k + logc can be computed without rounding error, 2)
+ensures that z/c - 1 can be computed as (z - chi - clo)*invc with close to a
+single rounding error when there is no fast fma for z*invc - 1, 3) ensures
+that logc + poly(z/c - 1) has small error, however near x == 1 when
+|log2(x)| < 0x1p-4, this is not enough so that is special cased.  */
+.tab = {
+{0x1.724286bb1acf8p+0, -0x1.1095feecdb000p-1},
+{0x1.6e1f766d2cca1p+0, -0x1.08494bd76d000p-1},
+{0x1.6a13d0e30d48ap+0, -0x1.00143aee8f800p-1},
+{0x1.661ec32d06c85p+0, -0x1.efec5360b4000p-2},
+{0x1.623fa951198f8p+0, -0x1.dfdd91ab7e000p-2},
+{0x1.5e75ba4cf026cp+0, -0x1.cffae0cc79000p-2},
+{0x1.5ac055a214fb8p+0, -0x1.c043811fda000p-2},
+{0x1.571ed0f166e1ep+0, -0x1.b0b67323ae000p-2},
+{0x1.53909590bf835p+0, -0x1.a152f5a2db000p-2},
+{0x1.5014fed61adddp+0, -0x1.9217f5af86000p-2},
+{0x1.4cab88e487bd0p+0, -0x1.8304db0719000p-2},
+{0x1.49539b4334feep+0, -0x1.74189f9a9e000p-2},
+{0x1.460cbdfafd569p+0, -0x1.6552bb5199000p-2},
+{0x1.42d664ee4b953p+0, -0x1.56b23a29b1000p-2},
+{0x1.3fb01111dd8a6p+0, -0x1.483650f5fa000p-2},
+{0x1.3c995b70c5836p+0, -0x1.39de937f6a000p-2},
+{0x1.3991c4ab6fd4ap+0, -0x1.2baa1538d6000p-2},
+{0x1.3698e0ce099b5p+0, -0x1.1d98340ca4000p-2},
+{0x1.33ae48213e7b2p+0, -0x1.0fa853a40e000p-2},
+{0x1.30d191985bdb1p+0, -0x1.01d9c32e73000p-2},
+{0x1.2e025cab271d7p+0, -0x1.e857da2fa6000p-3},
+{0x1.2b404cf13cd82p+0, -0x1.cd3c8633d8000p-3},
+{0x1.288b02c7ccb50p+0, -0x1.b26034c14a000p-3},
+{0x1.25e2263944de5p+0, -0x1.97c1c2f4fe000p-3},
+{0x1.234563d8615b1p+0, -0x1.7d6023f800000p-3},
+{0x1.20b46e33eaf38p+0, -0x1.633a71a05e000p-3},
+{0x1.1e2eefdcda3ddp+0, -0x1.494f5e9570000p-3},
+{0x1.1bb4a580b3930p+0, -0x1.2f9e424e0a000p-3},
+{0x1.19453847f2200p+0, -0x1.162595afdc000p-3},
+{0x1.16e06c0d5d73cp+0, -0x1.f9c9a75bd8000p-4},
+{0x1.1485f47b7e4c2p+0, -0x1.c7b575bf9c000p-4},
+{0x1.12358ad0085d1p+0, -0x1.960c60ff48000p-4},
+{0x1.0fef00f532227p+0, -0x1.64ce247b60000p-4},
+{0x1.0db2077d03a8fp+0, -0x1.33f78b2014000p-4},
+{0x1.0b7e6d65980d9p+0, -0x1.0387d1a42c000p-4},
+{0x1.0953efe7b408dp+0, -0x1.a6f9208b50000p-5},
+{0x1.07325cac53b83p+0, -0x1.47a954f770000p-5},
+{0x1.05197e40d1b5cp+0, -0x1.d23a8c50c0000p-6},
+{0x1.03091c1208ea2p+0, -0x1.16a2629780000p-6},
+{0x1.0101025b37e21p+0, -0x1.720f8d8e80000p-8},
+{0x1.fc07ef9caa76bp-1, 0x1.6fe53b1500000p-7},
+{0x1.f4465d3f6f184p-1, 0x1.11ccce10f8000p-5},
+{0x1.ecc079f84107fp-1, 0x1.c4dfc8c8b8000p-5},
+{0x1.e573a99975ae8p-1, 0x1.3aa321e574000p-4},
+{0x1.de5d6f0bd3de6p-1, 0x1.918a0d08b8000p-4},
+{0x1.d77b681ff38b3p-1, 0x1.e72e9da044000p-4},
+{0x1.d0cb5724de943p-1, 0x1.1dcd2507f6000p-3},
+{0x1.ca4b2dc0e7563p-1, 0x1.476ab03dea000p-3},
+{0x1.c3f8ee8d6cb51p-1, 0x1.7074377e22000p-3},
+{0x1.bdd2b4f020c4cp-1, 0x1.98ede8ba94000p-3},
+{0x1.b7d6c006015cap-1, 0x1.c0db86ad2e000p-3},
+{0x1.b20366e2e338fp-1, 0x1.e840aafcee000p-3},
+{0x1.ac57026295039p-1, 0x1.0790ab4678000p-2},
+{0x1.a6d01bc2731ddp-1, 0x1.1ac056801c000p-2},
+{0x1.a16d3bc3ff18bp-1, 0x1.2db11d4fee000p-2},
+{0x1.9c2d14967feadp-1, 0x1.406464ec58000p-2},
+{0x1.970e4f47c9902p-1, 0x1.52dbe093af000p-2},
+{0x1.920fb3982bcf2p-1, 0x1.651902050d000p-2},
+{0x1.8d30187f759f1p-1, 0x1.771d2cdeaf000p-2},
+{0x1.886e5ebb9f66dp-1, 0x1.88e9c857d9000p-2},
+{0x1.83c97b658b994p-1, 0x1.9a80155e16000p-2},
+{0x1.7f405ffc61022p-1, 0x1.abe186ed3d000p-2},
+{0x1.7ad22181415cap-1, 0x1.bd0f2aea0e000p-2},
+{0x1.767dcf99eff8cp-1, 0x1.ce0a43dbf4000p-2},
+},
+#if !__FP_FAST_FMA
+.tab2 = {
+{0x1.6200012b90a8ep-1, 0x1.904ab0644b605p-55},
+{0x1.66000045734a6p-1, 0x1.1ff9bea62f7a9p-57},
+{0x1.69fffc325f2c5p-1, 0x1.27ecfcb3c90bap-55},
+{0x1.6e00038b95a04p-1, 0x1.8ff8856739326p-55},
+{0x1.71fffe09994e3p-1, 0x1.afd40275f82b1p-55},
+{0x1.7600015590e1p-1, -0x1.2fd75b4238341p-56},
+{0x1.7a00012655bd5p-1, 0x1.808e67c242b76p-56},
+{0x1.7e0003259e9a6p-1, -0x1.208e426f622b7p-57},
+{0x1.81fffedb4b2d2p-1, -0x1.402461ea5c92fp-55},
+{0x1.860002dfafcc3p-1, 0x1.df7f4a2f29a1fp-57},
+{0x1.89ffff78c6b5p-1, -0x1.e0453094995fdp-55},
+{0x1.8e00039671566p-1, -0x1.a04f3bec77b45p-55},
+{0x1.91fffe2bf1745p-1, -0x1.7fa34400e203cp-56},
+{0x1.95fffcc5c9fd1p-1, -0x1.6ff8005a0695dp-56},
+{0x1.9a0003bba4767p-1, 0x1.0f8c4c4ec7e03p-56},
+{0x1.9dfffe7b92da5p-1, 0x1.e7fd9478c4602p-55},
+{0x1.a1fffd72efdafp-1, -0x1.a0c554dcdae7ep-57},
+{0x1.a5fffde04ff95p-1, 0x1.67da98ce9b26bp-55},
+{0x1.a9fffca5e8d2bp-1, -0x1.284c9b54c13dep-55},
+{0x1.adfffddad03eap-1, 0x1.812c8ea602e3cp-58},
+{0x1.b1ffff10d3d4dp-1, -0x1.efaddad27789cp-55},
+{0x1.b5fffce21165ap-1, 0x1.3cb1719c61237p-58},
+{0x1.b9fffd950e674p-1, 0x1.3f7d94194cep-56},
+{0x1.be000139ca8afp-1, 0x1.50ac4215d9bcp-56},
+{0x1.c20005b46df99p-1, 0x1.beea653e9c1c9p-57},
+{0x1.c600040b9f7aep-1, -0x1.c079f274a70d6p-56},
+{0x1.ca0006255fd8ap-1, -0x1.a0b4076e84c1fp-56},
+{0x1.cdfffd94c095dp-1, 0x1.8f933f99ab5d7p-55},
+{0x1.d1ffff975d6cfp-1, -0x1.82c08665fe1bep-58},
+{0x1.d5fffa2561c93p-1, -0x1.b04289bd295f3p-56},
+{0x1.d9fff9d228b0cp-1, 0x1.70251340fa236p-55},
+{0x1.de00065bc7e16p-1, -0x1.5011e16a4d80cp-56},
+{0x1.e200002f64791p-1, 0x1.9802f09ef62ep-55},
+{0x1.e600057d7a6d8p-1, -0x1.e0b75580cf7fap-56},
+{0x1.ea00027edc00cp-1, -0x1.c848309459811p-55},
+{0x1.ee0006cf5cb7cp-1, -0x1.f8027951576f4p-55},
+{0x1.f2000782b7dccp-1, -0x1.f81d97274538fp-55},
+{0x1.f6000260c450ap-1, -0x1.071002727ffdcp-59},
+{0x1.f9fffe88cd533p-1, -0x1.81bdce1fda8bp-58},
+{0x1.fdfffd50f8689p-1, 0x1.7f91acb918e6ep-55},
+{0x1.0200004292367p+0, 0x1.b7ff365324681p-54},
+{0x1.05fffe3e3d668p+0, 0x1.6fa08ddae957bp-55},
+{0x1.0a0000a85a757p+0, -0x1.7e2de80d3fb91p-58},
+{0x1.0e0001a5f3fccp+0, -0x1.1823305c5f014p-54},
+{0x1.11ffff8afbaf5p+0, -0x1.bfabb6680bac2p-55},
+{0x1.15fffe54d91adp+0, -0x1.d7f121737e7efp-54},
+{0x1.1a00011ac36e1p+0, 0x1.c000a0516f5ffp-54},
+{0x1.1e00019c84248p+0, -0x1.082fbe4da5dap-54},
+{0x1.220000ffe5e6ep+0, -0x1.8fdd04c9cfb43p-55},
+{0x1.26000269fd891p+0, 0x1.cfe2a7994d182p-55},
+{0x1.2a00029a6e6dap+0, -0x1.00273715e8bc5p-56},
+{0x1.2dfffe0293e39p+0, 0x1.b7c39dab2a6f9p-54},
+{0x1.31ffff7dcf082p+0, 0x1.df1336edc5254p-56},
+{0x1.35ffff05a8b6p+0, -0x1.e03564ccd31ebp-54},
+{0x1.3a0002e0eaeccp+0, 0x1.5f0e74bd3a477p-56},
+{0x1.3e000043bb236p+0, 0x1.c7dcb149d8833p-54},
+{0x1.4200002d187ffp+0, 0x1.e08afcf2d3d28p-56},
+{0x1.460000d387cb1p+0, 0x1.20837856599a6p-55},
+{0x1.4a00004569f89p+0, -0x1.9fa5c904fbcd2p-55},
+{0x1.4e000043543f3p+0, -0x1.81125ed175329p-56},
+{0x1.51fffcc027f0fp+0, 0x1.883d8847754dcp-54},
+{0x1.55ffffd87b36fp+0, -0x1.709e731d02807p-55},
+{0x1.59ffff21df7bap+0, 0x1.7f79f68727b02p-55},
+{0x1.5dfffebfc3481p+0, -0x1.180902e30e93ep-54},
+},
+#endif
+};
diff --git a/sw/math/src/math/log2_data.h b/sw/math/src/math/log2_data.h
new file mode 100644
index 000000000..276a786d1
--- /dev/null
+++ b/sw/math/src/math/log2_data.h
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2018, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+#ifndef _LOG2_DATA_H
+#define _LOG2_DATA_H
+
+#include <features.h>
+
+#define LOG2_TABLE_BITS 6
+#define LOG2_POLY_ORDER 7
+#define LOG2_POLY1_ORDER 11
+extern hidden const struct log2_data {
+	double invln2hi;
+	double invln2lo;
+	double poly[LOG2_POLY_ORDER - 1];
+	double poly1[LOG2_POLY1_ORDER - 1];
+	struct {
+		double invc, logc;
+	} tab[1 << LOG2_TABLE_BITS];
+#if !__FP_FAST_FMA
+	struct {
+		double chi, clo;
+	} tab2[1 << LOG2_TABLE_BITS];
+#endif
+} __log2_data;
+
+#endif
diff --git a/sw/math/src/math/log2f.c b/sw/math/src/math/log2f.c
new file mode 100644
index 000000000..c368f88f3
--- /dev/null
+++ b/sw/math/src/math/log2f.c
@@ -0,0 +1,72 @@
+/*
+ * Single-precision log2 function.
+ *
+ * Copyright (c) 2017-2018, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <math.h>
+#include <stdint.h>
+#include "libm.h"
+#include "log2f_data.h"
+
+/*
+LOG2F_TABLE_BITS = 4
+LOG2F_POLY_ORDER = 4
+
+ULP error: 0.752 (nearest rounding.)
+Relative error: 1.9 * 2^-26 (before rounding.)
+*/
+
+#define N (1 << LOG2F_TABLE_BITS)
+#define T __log2f_data.tab
+#define A __log2f_data.poly
+#define OFF 0x3f330000
+
+float log2f(float x)
+{
+	double_t z, r, r2, p, y, y0, invc, logc;
+	uint32_t ix, iz, top, tmp;
+	int k, i;
+
+	ix = asuint(x);
+	/* Fix sign of zero with downward rounding when x==1.  */
+	if (WANT_ROUNDING && predict_false(ix == 0x3f800000))
+		return 0;
+	if (predict_false(ix - 0x00800000 >= 0x7f800000 - 0x00800000)) {
+		/* x < 0x1p-126 or inf or nan.  */
+		if (ix * 2 == 0)
+			return __math_divzerof(1);
+		if (ix == 0x7f800000) /* log2(inf) == inf.  */
+			return x;
+		if ((ix & 0x80000000) || ix * 2 >= 0xff000000)
+			return __math_invalidf(x);
+		/* x is subnormal, normalize it.  */
+		ix = asuint(x * 0x1p23f);
+		ix -= 23 << 23;
+	}
+
+	/* x = 2^k z; where z is in range [OFF,2*OFF] and exact.
+	   The range is split into N subintervals.
+	   The ith subinterval contains z and c is near its center.  */
+	tmp = ix - OFF;
+	i = (tmp >> (23 - LOG2F_TABLE_BITS)) % N;
+	top = tmp & 0xff800000;
+	iz = ix - top;
+	k = (int32_t)tmp >> 23; /* arithmetic shift */
+	invc = T[i].invc;
+	logc = T[i].logc;
+	z = (double_t)asfloat(iz);
+
+	/* log2(x) = log1p(z/c-1)/ln2 + log2(c) + k */
+	r = z * invc - 1;
+	y0 = logc + (double_t)k;
+
+	/* Pipelined polynomial evaluation to approximate log1p(r)/ln2.  */
+	r2 = r * r;
+	y = A[1] * r + A[2];
+	y = A[0] * r2 + y;
+	p = A[3] * r + y0;
+	y = y * r2 + p;
+	return eval_as_float(y);
+}
diff --git a/sw/math/src/math/log2f_data.c b/sw/math/src/math/log2f_data.c
new file mode 100644
index 000000000..24e450f1e
--- /dev/null
+++ b/sw/math/src/math/log2f_data.c
@@ -0,0 +1,33 @@
+/*
+ * Data definition for log2f.
+ *
+ * Copyright (c) 2017-2018, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "log2f_data.h"
+
+const struct log2f_data __log2f_data = {
+  .tab = {
+  { 0x1.661ec79f8f3bep+0, -0x1.efec65b963019p-2 },
+  { 0x1.571ed4aaf883dp+0, -0x1.b0b6832d4fca4p-2 },
+  { 0x1.49539f0f010bp+0, -0x1.7418b0a1fb77bp-2 },
+  { 0x1.3c995b0b80385p+0, -0x1.39de91a6dcf7bp-2 },
+  { 0x1.30d190c8864a5p+0, -0x1.01d9bf3f2b631p-2 },
+  { 0x1.25e227b0b8eap+0, -0x1.97c1d1b3b7afp-3 },
+  { 0x1.1bb4a4a1a343fp+0, -0x1.2f9e393af3c9fp-3 },
+  { 0x1.12358f08ae5bap+0, -0x1.960cbbf788d5cp-4 },
+  { 0x1.0953f419900a7p+0, -0x1.a6f9db6475fcep-5 },
+  { 0x1p+0, 0x0p+0 },
+  { 0x1.e608cfd9a47acp-1, 0x1.338ca9f24f53dp-4 },
+  { 0x1.ca4b31f026aap-1, 0x1.476a9543891bap-3 },
+  { 0x1.b2036576afce6p-1, 0x1.e840b4ac4e4d2p-3 },
+  { 0x1.9c2d163a1aa2dp-1, 0x1.40645f0c6651cp-2 },
+  { 0x1.886e6037841edp-1, 0x1.88e9c2c1b9ff8p-2 },
+  { 0x1.767dcf5534862p-1, 0x1.ce0a44eb17bccp-2 },
+  },
+  .poly = {
+  -0x1.712b6f70a7e4dp-2, 0x1.ecabf496832ep-2, -0x1.715479ffae3dep-1,
+  0x1.715475f35c8b8p0,
+  }
+};
diff --git a/sw/math/src/math/log2f_data.h b/sw/math/src/math/log2f_data.h
new file mode 100644
index 000000000..4fa489560
--- /dev/null
+++ b/sw/math/src/math/log2f_data.h
@@ -0,0 +1,19 @@
+/*
+ * Copyright (c) 2017-2018, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+#ifndef _LOG2F_DATA_H
+#define _LOG2F_DATA_H
+
+#include <features.h>
+
+#define LOG2F_TABLE_BITS 4
+#define LOG2F_POLY_ORDER 4
+extern hidden const struct log2f_data {
+	struct {
+		double invc, logc;
+	} tab[1 << LOG2F_TABLE_BITS];
+	double poly[LOG2F_POLY_ORDER];
+} __log2f_data;
+
+#endif
diff --git a/sw/math/src/math/sqrt.c b/sw/math/src/math/sqrt.c
new file mode 100644
index 000000000..5ba265596
--- /dev/null
+++ b/sw/math/src/math/sqrt.c
@@ -0,0 +1,158 @@
+#include <stdint.h>
+#include <math.h>
+#include "libm.h"
+#include "sqrt_data.h"
+
+#define FENV_SUPPORT 1
+
+/* returns a*b*2^-32 - e, with error 0 <= e < 1.  */
+static inline uint32_t mul32(uint32_t a, uint32_t b)
+{
+	return (uint64_t)a*b >> 32;
+}
+
+/* returns a*b*2^-64 - e, with error 0 <= e < 3.  */
+static inline uint64_t mul64(uint64_t a, uint64_t b)
+{
+	uint64_t ahi = a>>32;
+	uint64_t alo = a&0xffffffff;
+	uint64_t bhi = b>>32;
+	uint64_t blo = b&0xffffffff;
+	return ahi*bhi + (ahi*blo >> 32) + (alo*bhi >> 32);
+}
+
+double sqrt(double x)
+{
+	uint64_t ix, top, m;
+
+	/* special case handling.  */
+	ix = asuint64(x);
+	top = ix >> 52;
+	if (predict_false(top - 0x001 >= 0x7ff - 0x001)) {
+		/* x < 0x1p-1022 or inf or nan.  */
+		if (ix * 2 == 0)
+			return x;
+		if (ix == 0x7ff0000000000000)
+			return x;
+		if (ix > 0x7ff0000000000000)
+			return __math_invalid(x);
+		/* x is subnormal, normalize it.  */
+		ix = asuint64(x * 0x1p52);
+		top = ix >> 52;
+		top -= 52;
+	}
+
+	/* argument reduction:
+	   x = 4^e m; with integer e, and m in [1, 4)
+	   m: fixed point representation [2.62]
+	   2^e is the exponent part of the result.  */
+	int even = top & 1;
+	m = (ix << 11) | 0x8000000000000000;
+	if (even) m >>= 1;
+	top = (top + 0x3ff) >> 1;
+
+	/* approximate r ~ 1/sqrt(m) and s ~ sqrt(m) when m in [1,4)
+
+	   initial estimate:
+	   7bit table lookup (1bit exponent and 6bit significand).
+
+	   iterative approximation:
+	   using 2 goldschmidt iterations with 32bit int arithmetics
+	   and a final iteration with 64bit int arithmetics.
+
+	   details:
+
+	   the relative error (e = r0 sqrt(m)-1) of a linear estimate
+	   (r0 = a m + b) is |e| < 0.085955 ~ 0x1.6p-4 at best,
+	   a table lookup is faster and needs one less iteration
+	   6 bit lookup table (128b) gives |e| < 0x1.f9p-8
+	   7 bit lookup table (256b) gives |e| < 0x1.fdp-9
+	   for single and double prec 6bit is enough but for quad
+	   prec 7bit is needed (or modified iterations). to avoid
+	   one more iteration >=13bit table would be needed (16k).
+
+	   a newton-raphson iteration for r is
+	     w = r*r
+	     u = 3 - m*w
+	     r = r*u/2
+	   can use a goldschmidt iteration for s at the end or
+	     s = m*r
+
+	   first goldschmidt iteration is
+	     s = m*r
+	     u = 3 - s*r
+	     r = r*u/2
+	     s = s*u/2
+	   next goldschmidt iteration is
+	     u = 3 - s*r
+	     r = r*u/2
+	     s = s*u/2
+	   and at the end r is not computed only s.
+
+	   they use the same amount of operations and converge at the
+	   same quadratic rate, i.e. if
+	     r1 sqrt(m) - 1 = e, then
+	     r2 sqrt(m) - 1 = -3/2 e^2 - 1/2 e^3
+	   the advantage of goldschmidt is that the mul for s and r
+	   are independent (computed in parallel), however it is not
+	   "self synchronizing": it only uses the input m in the
+	   first iteration so rounding errors accumulate. at the end
+	   or when switching to larger precision arithmetics rounding
+	   errors dominate so the first iteration should be used.
+
+	   the fixed point representations are
+	     m: 2.30 r: 0.32, s: 2.30, d: 2.30, u: 2.30, three: 2.30
+	   and after switching to 64 bit
+	     m: 2.62 r: 0.64, s: 2.62, d: 2.62, u: 2.62, three: 2.62  */
+
+	static const uint64_t three = 0xc0000000;
+	uint64_t r, s, d, u, i;
+
+	i = (ix >> 46) % 128;
+	r = (uint32_t)__rsqrt_tab[i] << 16;
+	/* |r sqrt(m) - 1| < 0x1.fdp-9 */
+	s = mul32(m>>32, r);
+	/* |s/sqrt(m) - 1| < 0x1.fdp-9 */
+	d = mul32(s, r);
+	u = three - d;
+	r = mul32(r, u) << 1;
+	/* |r sqrt(m) - 1| < 0x1.7bp-16 */
+	s = mul32(s, u) << 1;
+	/* |s/sqrt(m) - 1| < 0x1.7bp-16 */
+	d = mul32(s, r);
+	u = three - d;
+	r = mul32(r, u) << 1;
+	/* |r sqrt(m) - 1| < 0x1.3704p-29 (measured worst-case) */
+	r = r << 32;
+	s = mul64(m, r);
+	d = mul64(s, r);
+	u = (three<<32) - d;
+	s = mul64(s, u);  /* repr: 3.61 */
+	/* -0x1p-57 < s - sqrt(m) < 0x1.8001p-61 */
+	s = (s - 2) >> 9; /* repr: 12.52 */
+	/* -0x1.09p-52 < s - sqrt(m) < -0x1.fffcp-63 */
+
+	/* s < sqrt(m) < s + 0x1.09p-52,
+	   compute nearest rounded result:
+	   the nearest result to 52 bits is either s or s+0x1p-52,
+	   we can decide by comparing (2^52 s + 0.5)^2 to 2^104 m.  */
+	uint64_t d0, d1, d2;
+	double y, t;
+	d0 = (m << 42) - s*s;
+	d1 = s - d0;
+	d2 = d1 + s + 1;
+	s += d1 >> 63;
+	s &= 0x000fffffffffffff;
+	s |= top << 52;
+	y = asdouble(s);
+	if (FENV_SUPPORT) {
+		/* handle rounding modes and inexact exception:
+		   only (s+1)^2 == 2^42 m case is exact otherwise
+		   add a tiny value to cause the fenv effects.  */
+		uint64_t tiny = predict_false(d2==0) ? 0 : 0x0010000000000000;
+		tiny |= (d1^d2) & 0x8000000000000000;
+		t = asdouble(tiny);
+		y = eval_as_double(y + t);
+	}
+	return y;
+}
diff --git a/sw/math/src/math/sqrt_data.c b/sw/math/src/math/sqrt_data.c
new file mode 100644
index 000000000..61bc22f43
--- /dev/null
+++ b/sw/math/src/math/sqrt_data.c
@@ -0,0 +1,19 @@
+#include "sqrt_data.h"
+const uint16_t __rsqrt_tab[128] = {
+0xb451,0xb2f0,0xb196,0xb044,0xaef9,0xadb6,0xac79,0xab43,
+0xaa14,0xa8eb,0xa7c8,0xa6aa,0xa592,0xa480,0xa373,0xa26b,
+0xa168,0xa06a,0x9f70,0x9e7b,0x9d8a,0x9c9d,0x9bb5,0x9ad1,
+0x99f0,0x9913,0x983a,0x9765,0x9693,0x95c4,0x94f8,0x9430,
+0x936b,0x92a9,0x91ea,0x912e,0x9075,0x8fbe,0x8f0a,0x8e59,
+0x8daa,0x8cfe,0x8c54,0x8bac,0x8b07,0x8a64,0x89c4,0x8925,
+0x8889,0x87ee,0x8756,0x86c0,0x862b,0x8599,0x8508,0x8479,
+0x83ec,0x8361,0x82d8,0x8250,0x81c9,0x8145,0x80c2,0x8040,
+0xff02,0xfd0e,0xfb25,0xf947,0xf773,0xf5aa,0xf3ea,0xf234,
+0xf087,0xeee3,0xed47,0xebb3,0xea27,0xe8a3,0xe727,0xe5b2,
+0xe443,0xe2dc,0xe17a,0xe020,0xdecb,0xdd7d,0xdc34,0xdaf1,
+0xd9b3,0xd87b,0xd748,0xd61a,0xd4f1,0xd3cd,0xd2ad,0xd192,
+0xd07b,0xcf69,0xce5b,0xcd51,0xcc4a,0xcb48,0xca4a,0xc94f,
+0xc858,0xc764,0xc674,0xc587,0xc49d,0xc3b7,0xc2d4,0xc1f4,
+0xc116,0xc03c,0xbf65,0xbe90,0xbdbe,0xbcef,0xbc23,0xbb59,
+0xba91,0xb9cc,0xb90a,0xb84a,0xb78c,0xb6d0,0xb617,0xb560,
+};
diff --git a/sw/math/src/math/sqrt_data.h b/sw/math/src/math/sqrt_data.h
new file mode 100644
index 000000000..260c7f9c2
--- /dev/null
+++ b/sw/math/src/math/sqrt_data.h
@@ -0,0 +1,13 @@
+#ifndef _SQRT_DATA_H
+#define _SQRT_DATA_H
+
+#include <features.h>
+#include <stdint.h>
+
+/* if x in [1,2): i = (int)(64*x);
+   if x in [2,4): i = (int)(32*x-64);
+   __rsqrt_tab[i]*2^-16 is estimating 1/sqrt(x) with small relative error:
+   |__rsqrt_tab[i]*0x1p-16*sqrt(x) - 1| < -0x1.fdp-9 < 2^-8 */
+extern hidden const uint16_t __rsqrt_tab[128];
+
+#endif
diff --git a/sw/math/src/math/sqrtf.c b/sw/math/src/math/sqrtf.c
new file mode 100644
index 000000000..740d81cba
--- /dev/null
+++ b/sw/math/src/math/sqrtf.c
@@ -0,0 +1,83 @@
+#include <stdint.h>
+#include <math.h>
+#include "libm.h"
+#include "sqrt_data.h"
+
+#define FENV_SUPPORT 1
+
+static inline uint32_t mul32(uint32_t a, uint32_t b)
+{
+	return (uint64_t)a*b >> 32;
+}
+
+/* see sqrt.c for more detailed comments.  */
+
+float sqrtf(float x)
+{
+	uint32_t ix, m, m1, m0, even, ey;
+
+	ix = asuint(x);
+	if (predict_false(ix - 0x00800000 >= 0x7f800000 - 0x00800000)) {
+		/* x < 0x1p-126 or inf or nan.  */
+		if (ix * 2 == 0)
+			return x;
+		if (ix == 0x7f800000)
+			return x;
+		if (ix > 0x7f800000)
+			return __math_invalidf(x);
+		/* x is subnormal, normalize it.  */
+		ix = asuint(x * 0x1p23f);
+		ix -= 23 << 23;
+	}
+
+	/* x = 4^e m; with int e and m in [1, 4).  */
+	even = ix & 0x00800000;
+	m1 = (ix << 8) | 0x80000000;
+	m0 = (ix << 7) & 0x7fffffff;
+	m = even ? m0 : m1;
+
+	/* 2^e is the exponent part of the return value.  */
+	ey = ix >> 1;
+	ey += 0x3f800000 >> 1;
+	ey &= 0x7f800000;
+
+	/* compute r ~ 1/sqrt(m), s ~ sqrt(m) with 2 goldschmidt iterations.  */
+	static const uint32_t three = 0xc0000000;
+	uint32_t r, s, d, u, i;
+	i = (ix >> 17) % 128;
+	r = (uint32_t)__rsqrt_tab[i] << 16;
+	/* |r*sqrt(m) - 1| < 0x1p-8 */
+	s = mul32(m, r);
+	/* |s/sqrt(m) - 1| < 0x1p-8 */
+	d = mul32(s, r);
+	u = three - d;
+	r = mul32(r, u) << 1;
+	/* |r*sqrt(m) - 1| < 0x1.7bp-16 */
+	s = mul32(s, u) << 1;
+	/* |s/sqrt(m) - 1| < 0x1.7bp-16 */
+	d = mul32(s, r);
+	u = three - d;
+	s = mul32(s, u);
+	/* -0x1.03p-28 < s/sqrt(m) - 1 < 0x1.fp-31 */
+	s = (s - 1)>>6;
+	/* s < sqrt(m) < s + 0x1.08p-23 */
+
+	/* compute nearest rounded result.  */
+	uint32_t d0, d1, d2;
+	float y, t;
+	d0 = (m << 16) - s*s;
+	d1 = s - d0;
+	d2 = d1 + s + 1;
+	s += d1 >> 31;
+	s &= 0x007fffff;
+	s |= ey;
+	y = asfloat(s);
+	if (FENV_SUPPORT) {
+		/* handle rounding and inexact exception. */
+		uint32_t tiny = predict_false(d2==0) ? 0 : 0x01000000;
+		tiny |= (d1^d2) & 0x80000000;
+		t = asfloat(tiny);
+		y = eval_as_float(y + t);
+	}
+	return y;
+}
diff --git a/sw/math/src/math/tanh.c b/sw/math/src/math/tanh.c
index 20d6dbcf4..2481db1dc 100644
--- a/sw/math/src/math/tanh.c
+++ b/sw/math/src/math/tanh.c
@@ -6,16 +6,23 @@
  */
 double tanh(double x)
 {
-	union {double f; uint64_t i;} u = {.f = x};
 	uint32_t w;
 	int sign;
 	double_t t;
 
 	/* x = |x| */
-	sign = u.i >> 63;
-	u.i &= (uint64_t)-1/2;
-	x = u.f;
-	w = u.i >> 32;
+	/// Original implementation
+	// union {double f; uint64_t i;} u = {.f = x};
+	// sign = u.i >> 63;
+	// u.i &= (uint64_t)-1/2;
+	// x = u.f;
+	// w = u.i >> 32;
+	/// Safe implementation in Snitch
+	uint32_t upper_32b_x = safe_extract_upper_32b_from_double(x);
+	sign = upper_32b_x >> 31;
+	uint32_t sign_mask = (~(1 << 31));
+	w = upper_32b_x & sign_mask;
+	safe_inject_into_upper_32b_double(w, &x);
 
 	if (w > 0x3fe193ea) {
 		/* |x| > log(3)/2 ~= 0.5493 or nan */
diff --git a/target/snitch_cluster/sw/Makefile b/target/snitch_cluster/sw/Makefile
index 9badf70ea..a0115d00a 100644
--- a/target/snitch_cluster/sw/Makefile
+++ b/target/snitch_cluster/sw/Makefile
@@ -13,21 +13,19 @@ else
 RUNTIME = runtime/rtl
 endif
 
-MATH = ../../../sw/math
-
-SUBDIRS = runtime/banshee runtime/rtl $(MATH) apps tests
+SUBDIRS = runtime/banshee runtime/rtl math apps tests
 
 .PHONY: all $(SUBDIRS)
 
 all: $(SUBDIRS)
 
 # Explicit dependency of apps on runtime
-apps: $(RUNTIME) $(MATH)
+apps: $(RUNTIME) math
 	$(MAKE) -C $@ TARGET=$(TARGET)
 
 # Explicit dependency of tests on runtime
-tests: $(RUNTIME) $(MATH)
+tests: $(RUNTIME) math
 	$(MAKE) -C $@ $(TARGET)
 
-runtime/rtl runtime/banshee $(MATH):
+runtime/rtl runtime/banshee math:
 	$(MAKE) -C $@ $(TARGET)
diff --git a/target/snitch_cluster/sw/apps/common.mk b/target/snitch_cluster/sw/apps/common.mk
index d8b0659a4..8e1950860 100644
--- a/target/snitch_cluster/sw/apps/common.mk
+++ b/target/snitch_cluster/sw/apps/common.mk
@@ -22,6 +22,7 @@ RISCV_CFLAGS += -DBIST
 else
 RUNTIME_DIR := $(ROOT)/target/snitch_cluster/sw/runtime/rtl
 endif
+MATH_DIR := $(ROOT)/target/snitch_cluster/sw/math
 
 # Paths relative to the app including this Makefile
 BUILDDIR = $(abspath build)
@@ -37,18 +38,13 @@ INCDIRS += $(SNRT_DIR)/api/omp
 INCDIRS += $(SNRT_DIR)/src
 INCDIRS += $(SNRT_DIR)/src/omp
 INCDIRS += $(ROOT)/sw/deps/riscv-opcodes
-
-# Math library override
-INCDIRS += $(ROOT)/sw/math/arch/riscv64/bits/
-INCDIRS += $(ROOT)/sw/math/arch/generic
-INCDIRS += $(ROOT)/sw/math/src/include
-INCDIRS += $(ROOT)/sw/math/src/internal
-INCDIRS += $(ROOT)/sw/math/include/bits
 INCDIRS += $(ROOT)/sw/math/include
 
 RISCV_LDFLAGS += -L$(abspath $(RUNTIME_DIR))
 RISCV_LDFLAGS += -T$(abspath $(SNRT_DIR)/base.ld)
 RISCV_LDFLAGS += -L$(abspath $(RUNTIME_DIR)/build/)
+RISCV_LDFLAGS += -L$(abspath $(MATH_DIR)/build/)
+RISCV_LDFLAGS += -lmath
 RISCV_LDFLAGS += -lsnRuntime
 
 ###########
diff --git a/target/snitch_cluster/sw/math/Makefile b/target/snitch_cluster/sw/math/Makefile
new file mode 100644
index 000000000..d0a83e86a
--- /dev/null
+++ b/target/snitch_cluster/sw/math/Makefile
@@ -0,0 +1,8 @@
+# Copyright 2023 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Luca Colagrande <colluca@iis.ee.ethz.ch>
+
+include ../toolchain.mk
+include ../../../../sw/math/Makefile
diff --git a/target/snitch_cluster/sw/toolchain.mk b/target/snitch_cluster/sw/toolchain.mk
index 4fa0fc5af..ca5f6a482 100644
--- a/target/snitch_cluster/sw/toolchain.mk
+++ b/target/snitch_cluster/sw/toolchain.mk
@@ -34,6 +34,7 @@ RISCV_CFLAGS += -mcmodel=medany
 # RISCV_CFLAGS += -mno-fdiv # Not supported by Clang
 RISCV_CFLAGS += -ffast-math
 RISCV_CFLAGS += -fno-builtin-printf
+RISCV_CFLAGS += -fno-builtin-sqrtf
 RISCV_CFLAGS += -fno-common
 RISCV_CFLAGS += -fopenmp
 RISCV_CFLAGS += -ftls-model=local-exec