From 11366d5fe9baed757730fa727c1aa696a229f075 Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Wed, 8 Nov 2023 10:43:40 +0100 Subject: [PATCH] sw/math: Extend and refactor to proper library (#60) * sw/math: Refactor to proper library The previous header-only library style led to conflicts on certain defines (for instance `N`) defined in both math library sources and application sources. * sw/math: Add `exp`, `sqrt`, `log` and `ceil` functions * sw/math: Add safe FP <--> INT conversions * ci: Properly initialize Bender dependencies * sw/math: Implement safe `tanh` function --- .github/workflows/ci.yml | 2 + .gitlab-ci.yml | 2 +- Bender.yml | 29 ++- Makefile | 3 +- hw/mem_interface/util/compile.sh | 2 +- hw/reqrsp_interface/util/compile.sh | 2 +- hw/snitch_cluster/util/compile.sh | 2 +- hw/snitch_icache/util/compile.sh | 2 +- hw/snitch_ssr/util/compile.sh | 2 +- hw/tcdm_interface/util/compile.sh | 2 +- ...2-sw-math-Refactor-to-proper-library.patch | 125 +++++++++++ ...-sw-math-Add-safe-FP-INT-conversions.patch | 81 +++++++ ...sw-math-Implement-safe-tanh-function.patch | 149 +++++++++++++ sw/math/Makefile | 77 ++++++- sw/math/include/float.h | 52 +++++ sw/math/include/math.h | 3 - sw/math/src/internal/libm.h | 82 ++++++- sw/math/src/math/__math_divzero.c | 6 + sw/math/src/math/__math_invalid.c | 6 + sw/math/src/math/__math_invalidf.c | 6 + sw/math/src/math/__math_invalidl.c | 9 + sw/math/src/math/__math_oflow.c | 6 + sw/math/src/math/__math_oflowf.c | 6 + sw/math/src/math/__math_uflow.c | 6 + sw/math/src/math/__math_uflowf.c | 6 + sw/math/src/math/__math_xflow.c | 6 + sw/math/src/math/__math_xflowf.c | 6 + sw/math/src/math/ceil.c | 31 +++ sw/math/src/math/ceilf.c | 27 +++ sw/math/src/math/ceill.c | 34 +++ sw/math/src/math/exp2f_data.c | 35 +++ sw/math/src/math/exp2f_data.h | 23 ++ sw/math/src/math/expf.c | 80 +++++++ sw/math/src/math/expm1.c | 34 ++- sw/math/src/math/log2.c | 122 +++++++++++ sw/math/src/math/log2_data.c | 201 ++++++++++++++++++ sw/math/src/math/log2_data.h | 28 +++ sw/math/src/math/log2f.c | 72 +++++++ sw/math/src/math/log2f_data.c | 33 +++ sw/math/src/math/log2f_data.h | 19 ++ sw/math/src/math/sqrt.c | 158 ++++++++++++++ sw/math/src/math/sqrt_data.c | 19 ++ sw/math/src/math/sqrt_data.h | 13 ++ sw/math/src/math/sqrtf.c | 83 ++++++++ sw/math/src/math/tanh.c | 17 +- target/snitch_cluster/sw/Makefile | 10 +- target/snitch_cluster/sw/apps/common.mk | 10 +- target/snitch_cluster/sw/math/Makefile | 8 + target/snitch_cluster/sw/toolchain.mk | 1 + 49 files changed, 1692 insertions(+), 46 deletions(-) create mode 100644 sw/deps/patches/musl/0002-sw-math-Refactor-to-proper-library.patch create mode 100644 sw/deps/patches/musl/0003-sw-math-Add-safe-FP-INT-conversions.patch create mode 100644 sw/deps/patches/musl/0004-sw-math-Implement-safe-tanh-function.patch create mode 100644 sw/math/include/float.h create mode 100644 sw/math/src/math/__math_divzero.c create mode 100644 sw/math/src/math/__math_invalid.c create mode 100644 sw/math/src/math/__math_invalidf.c create mode 100644 sw/math/src/math/__math_invalidl.c create mode 100644 sw/math/src/math/__math_oflow.c create mode 100644 sw/math/src/math/__math_oflowf.c create mode 100644 sw/math/src/math/__math_uflow.c create mode 100644 sw/math/src/math/__math_uflowf.c create mode 100644 sw/math/src/math/__math_xflow.c create mode 100644 sw/math/src/math/__math_xflowf.c create mode 100644 sw/math/src/math/ceil.c create mode 100644 sw/math/src/math/ceilf.c create mode 100644 sw/math/src/math/ceill.c create mode 100644 sw/math/src/math/exp2f_data.c create mode 100644 sw/math/src/math/exp2f_data.h create mode 100644 sw/math/src/math/expf.c create mode 100644 sw/math/src/math/log2.c create mode 100644 sw/math/src/math/log2_data.c create mode 100644 sw/math/src/math/log2_data.h create mode 100644 sw/math/src/math/log2f.c create mode 100644 sw/math/src/math/log2f_data.c create mode 100644 sw/math/src/math/log2f_data.h create mode 100644 sw/math/src/math/sqrt.c create mode 100644 sw/math/src/math/sqrt_data.c create mode 100644 sw/math/src/math/sqrt_data.h create mode 100644 sw/math/src/math/sqrtf.c create mode 100644 target/snitch_cluster/sw/math/Makefile diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f2c3e692a..8df08c866 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -36,6 +36,7 @@ jobs: submodules: 'recursive' - name: Build Software run: | + bender vendor init make -C target/snitch_cluster sw - name: Build Hardware run: | @@ -61,6 +62,7 @@ jobs: submodules: 'recursive' - name: Build Software run: | + bender vendor init make -C target/snitch_cluster SELECT_RUNTIME=banshee sw - name: Run Tests env: diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 610c271ea..624c2a003 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -13,7 +13,6 @@ variables: VERILATOR: verilator-4.110 QUESTA: questa-2022.3 LLVM_BINROOT: /usr/pack/riscv-1.0-kgf/pulp-llvm-0.12.0/bin - CLANG: /usr/pack/riscv-1.0-kgf/pulp-llvm-0.12.0/bin/clang CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_LINKER: /usr/pack/gcc-9.2.0-af/linux-x64/bin/gcc LLVM_SYS_120_PREFIX: /usr/pack/llvm-12.0.1-af CMAKE: cmake-3.18.1 @@ -22,6 +21,7 @@ before_script: - $PYTHON -m venv .venv - source .venv/bin/activate - pip install -r python-requirements.txt + - $BENDER vendor init ############## # Build docs # diff --git a/Bender.yml b/Bender.yml index 732788d0d..4ae9d28e9 100644 --- a/Bender.yml +++ b/Bender.yml @@ -37,13 +37,40 @@ vendor_package: - "Makefile" - ".gitignore" - "README" - - "src/math/tanh.c" + - "src/math/ceil.c" + - "src/math/ceilf.c" + - "src/math/ceill.c" - "src/math/expm1.c" + - "src/math/expf.c" + - "src/math/exp2f_data.c" + - "src/math/exp2f_data.h" + - "src/math/log2.c" + - "src/math/log2_data.c" + - "src/math/log2_data.h" + - "src/math/log2f.c" + - "src/math/log2f_data.c" + - "src/math/log2f_data.h" + - "src/math/__math_divzero.c" + - "src/math/__math_invalid.c" + - "src/math/__math_invalidf.c" + - "src/math/__math_invalidl.c" + - "src/math/__math_oflow.c" + - "src/math/__math_oflowf.c" + - "src/math/__math_uflow.c" + - "src/math/__math_uflowf.c" + - "src/math/__math_xflow.c" + - "src/math/__math_xflowf.c" + - "src/math/sqrt.c" + - "src/math/sqrtf.c" + - "src/math/sqrt_data.c" + - "src/math/sqrt_data.h" + - "src/math/tanh.c" - "src/internal/libm.h" - "src/include/features.h" - "include/endian.h" - "include/math.h" - "include/features.h" + - "include/float.h" - "include/alltypes.h.in" - "arch/riscv64/bits/alltypes.h.in" - "arch/riscv64/bits/float.h" diff --git a/Makefile b/Makefile index 06a1c662f..7aba09569 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,8 @@ # Licensed under the Apache License, Version 2.0, see LICENSE for details. # SPDX-License-Identifier: Apache-2.0 -REGGEN = $(shell bender path register_interface)/vendor/lowrisc_opentitan/util/regtool.py +BENDER ?= bender +REGGEN = $(shell $(BENDER) path register_interface)/vendor/lowrisc_opentitan/util/regtool.py GENERATED_DOCS_DIR = docs/generated GENERATED_DOC_SRCS = $(GENERATED_DOCS_DIR)/peripherals.md diff --git a/hw/mem_interface/util/compile.sh b/hw/mem_interface/util/compile.sh index 73ccc7fca..51814285a 100755 --- a/hw/mem_interface/util/compile.sh +++ b/hw/mem_interface/util/compile.sh @@ -10,7 +10,7 @@ set -e [ ! -z "$VSIM" ] || VSIM=vsim -bender script vsim -t test \ +$BENDER script vsim -t test \ --vlog-arg="-svinputport=compat" \ --vlog-arg="-override_timescale 1ns/1ps" \ --vlog-arg="-suppress 2583" \ diff --git a/hw/reqrsp_interface/util/compile.sh b/hw/reqrsp_interface/util/compile.sh index 73ccc7fca..84956acea 100755 --- a/hw/reqrsp_interface/util/compile.sh +++ b/hw/reqrsp_interface/util/compile.sh @@ -10,7 +10,7 @@ set -e [ ! -z "$VSIM" ] || VSIM=vsim -bender script vsim -t test \ +$(BENDER) script vsim -t test \ --vlog-arg="-svinputport=compat" \ --vlog-arg="-override_timescale 1ns/1ps" \ --vlog-arg="-suppress 2583" \ diff --git a/hw/snitch_cluster/util/compile.sh b/hw/snitch_cluster/util/compile.sh index 73ccc7fca..51814285a 100755 --- a/hw/snitch_cluster/util/compile.sh +++ b/hw/snitch_cluster/util/compile.sh @@ -10,7 +10,7 @@ set -e [ ! -z "$VSIM" ] || VSIM=vsim -bender script vsim -t test \ +$BENDER script vsim -t test \ --vlog-arg="-svinputport=compat" \ --vlog-arg="-override_timescale 1ns/1ps" \ --vlog-arg="-suppress 2583" \ diff --git a/hw/snitch_icache/util/compile.sh b/hw/snitch_icache/util/compile.sh index 73ccc7fca..51814285a 100755 --- a/hw/snitch_icache/util/compile.sh +++ b/hw/snitch_icache/util/compile.sh @@ -10,7 +10,7 @@ set -e [ ! -z "$VSIM" ] || VSIM=vsim -bender script vsim -t test \ +$BENDER script vsim -t test \ --vlog-arg="-svinputport=compat" \ --vlog-arg="-override_timescale 1ns/1ps" \ --vlog-arg="-suppress 2583" \ diff --git a/hw/snitch_ssr/util/compile.sh b/hw/snitch_ssr/util/compile.sh index 73ccc7fca..84956acea 100755 --- a/hw/snitch_ssr/util/compile.sh +++ b/hw/snitch_ssr/util/compile.sh @@ -10,7 +10,7 @@ set -e [ ! -z "$VSIM" ] || VSIM=vsim -bender script vsim -t test \ +$(BENDER) script vsim -t test \ --vlog-arg="-svinputport=compat" \ --vlog-arg="-override_timescale 1ns/1ps" \ --vlog-arg="-suppress 2583" \ diff --git a/hw/tcdm_interface/util/compile.sh b/hw/tcdm_interface/util/compile.sh index 73ccc7fca..51814285a 100755 --- a/hw/tcdm_interface/util/compile.sh +++ b/hw/tcdm_interface/util/compile.sh @@ -10,7 +10,7 @@ set -e [ ! -z "$VSIM" ] || VSIM=vsim -bender script vsim -t test \ +$BENDER script vsim -t test \ --vlog-arg="-svinputport=compat" \ --vlog-arg="-override_timescale 1ns/1ps" \ --vlog-arg="-suppress 2583" \ diff --git a/sw/deps/patches/musl/0002-sw-math-Refactor-to-proper-library.patch b/sw/deps/patches/musl/0002-sw-math-Refactor-to-proper-library.patch new file mode 100644 index 000000000..068851f3c --- /dev/null +++ b/sw/deps/patches/musl/0002-sw-math-Refactor-to-proper-library.patch @@ -0,0 +1,125 @@ +From 91c1b48e44629a80bdc1832111707c051ab0b3b2 Mon Sep 17 00:00:00 2001 +From: Luca Colagrande +Date: Mon, 23 Oct 2023 14:30:18 +0200 +Subject: [PATCH] sw/math: Refactor to proper library + +The previous header-only library style led to conflicts on certain +defines (for instance `N`) defined in both math library sources and +application sources. +--- + Makefile | 77 +++++++++++++++++++++++++++++++++++++++++++++++--- + include/math.h | 3 -- + 2 files changed, 73 insertions(+), 7 deletions(-) + +diff --git a/Makefile b/Makefile +index 1327953..a6f7a1a 100644 +--- a/Makefile ++++ b/Makefile +@@ -1,17 +1,86 @@ +-BITS_DIR = include/bits ++# Copyright 2023 ETH Zurich and University of Bologna. ++# Licensed under the Apache License, Version 2.0, see LICENSE for details. ++# SPDX-License-Identifier: Apache-2.0 ++# ++# Luca Colagrande ++# Viviane Potocnik, ETH Zurich ++ ++# Usage of absolute paths is required to externally include ++# this Makefile from multiple different locations ++MK_DIR := $(dir $(realpath $(lastword $(MAKEFILE_LIST)))) ++ ++############### ++# Directories # ++############### ++ ++BUILDDIR ?= $(abspath build) ++SRC_DIR = $(MK_DIR)/src/math ++BITS_DIR = $(MK_DIR)/include/bits ++ ++################### ++# Build variables # ++################### ++ ++INCDIRS += $(MK_DIR)/arch/riscv64/ ++INCDIRS += $(MK_DIR)/arch/generic ++INCDIRS += $(MK_DIR)/src/include ++INCDIRS += $(MK_DIR)/src/internal ++INCDIRS += $(MK_DIR)/include/bits ++INCDIRS += $(MK_DIR)/include ++ ++SRCS = $(abspath $(wildcard $(SRC_DIR)/*.c)) ++ ++########### ++# Outputs # ++########### ++ + ALLTYPES_H = $(BITS_DIR)/alltypes.h + ++OBJS = $(addprefix $(BUILDDIR)/,$(addsuffix .o,$(basename $(notdir $(SRCS))))) ++DEPS = $(addprefix $(BUILDDIR)/,$(addsuffix .d,$(basename $(notdir $(SRCS))))) ++LIB = $(BUILDDIR)/libmath.a ++DUMP = $(BUILDDIR)/libmath.dump ++ALL_OUTPUTS = $(LIB) $(DUMP) + +-.PHONY: all clean ++######### ++# Rules # ++######### + +-all: $(ALLTYPES_H) ++.PHONY: all ++all: $(ALL_OUTPUTS) + ++.PHONY: clean + clean: + rm -rf $(BITS_DIR) + rm -f $(ALLTYPES_H) ++ rm -rf $(BUILDDIR) + + $(BITS_DIR): + mkdir -p $@ + + $(ALLTYPES_H): | $(BITS_DIR) +- sed -f tools/mkalltypes.sed arch/riscv64/bits/alltypes.h.in include/alltypes.h.in > $@ ++ sed -f $(MK_DIR)/tools/mkalltypes.sed $(MK_DIR)/arch/riscv64/bits/alltypes.h.in $(MK_DIR)/include/alltypes.h.in > $@ ++ ++$(DEPS): $(ALLTYPES_H) ++ ++$(BUILDDIR): ++ mkdir -p $@ ++ ++$(BUILDDIR)/%.o: $(SRC_DIR)/%.S | $(BUILDDIR) ++ $(RISCV_CC) $(RISCV_CFLAGS) -c $< -o $@ ++ ++$(BUILDDIR)/%.o: $(SRC_DIR)/%.c | $(BUILDDIR) ++ $(RISCV_CC) $(RISCV_CFLAGS) -c $< -o $@ ++ ++$(BUILDDIR)/%.d: $(SRC_DIR)/%.c | $(BUILDDIR) ++ $(RISCV_CC) $(RISCV_CFLAGS) -MM -MT '$(@:.d=.o)' $< > $@ ++ ++$(LIB): $(OBJS) | $(BUILDDIR) ++ $(RISCV_AR) $(RISCV_ARFLAGS) $@ $^ ++ ++$(DUMP): $(LIB) | $(BUILDDIR) ++ $(RISCV_OBJDUMP) -D $< > $@ ++ ++ifneq ($(MAKECMDGOALS),clean) ++-include $(DEPS) ++endif +diff --git a/include/math.h b/include/math.h +index 6dad71c..14f28ec 100644 +--- a/include/math.h ++++ b/include/math.h +@@ -435,9 +435,6 @@ float pow10f(float); + long double pow10l(long double); + #endif + +-#include "../src/math/expm1.c" +-#include "../src/math/tanh.c" +- + #ifdef __cplusplus + } + #endif +-- +2.28.0 + diff --git a/sw/deps/patches/musl/0003-sw-math-Add-safe-FP-INT-conversions.patch b/sw/deps/patches/musl/0003-sw-math-Add-safe-FP-INT-conversions.patch new file mode 100644 index 000000000..050af9d33 --- /dev/null +++ b/sw/deps/patches/musl/0003-sw-math-Add-safe-FP-INT-conversions.patch @@ -0,0 +1,81 @@ +From eb96f4d7454a07498f571eb1ed18aa1db2413551 Mon Sep 17 00:00:00 2001 +From: Luca Colagrande +Date: Mon, 23 Oct 2023 16:45:17 +0200 +Subject: [PATCH] `sw/math`: Add safe FP <--> INT conversions + +--- + src/internal/libm.h | 51 +++++++++++++++++++++++++++++++++++++++++---- + 1 file changed, 47 insertions(+), 4 deletions(-) + +diff --git a/src/internal/libm.h b/src/internal/libm.h +index 72ad17d..60b9866 100644 +--- a/src/internal/libm.h ++++ b/src/internal/libm.h +@@ -96,6 +96,47 @@ static int32_t converttoint(double_t); + #define predict_false(x) (x) + #endif + ++/* FPU fence to synchronize the FPU and integer core in Snitch. */ ++inline void snrt_fpu_fence() { ++ unsigned tmp; ++ __asm__ volatile( ++ "fmv.x.w %0, fa0\n" ++ "mv %0, %0\n" ++ : "+r"(tmp)::"memory"); ++} ++ ++/* Synch-secure double to uint64 conversion functions. */ ++static inline uint64_t asuint64(double f) { ++ uint64_t result; ++ snrt_fpu_fence(); ++ result = *(uint64_t *)&f; ++ return result; ++} ++ ++/* Synch-secure float to uint conversion functions. */ ++static inline uint64_t asuint(float f) { ++ uint32_t result; ++ snrt_fpu_fence(); ++ result = *(uint32_t *)&f; ++ return result; ++} ++ ++/* Synch-secure uint64 to double conversion functions. */ ++static inline double asdouble(uint64_t i) { ++ double result; ++ snrt_fpu_fence(); ++ result = *(double *)&i; ++ return result; ++} ++ ++/* Synch-secure uint to float conversion functions. */ ++static inline float asfloat(uint32_t i) { ++ float result; ++ snrt_fpu_fence(); ++ result = *(float *)&i; ++ return result; ++} ++ + /* Evaluate an expression as the specified type. With standard excess + precision handling a type cast or assignment is enough (with + -ffloat-store an assignment is required, in old compilers argument +@@ -187,10 +228,12 @@ static inline void fp_force_evall(long double x) + } \ + } while(0) + +-#define asuint(f) ((union{float _f; uint32_t _i;}){f})._i +-#define asfloat(i) ((union{uint32_t _i; float _f;}){i})._f +-#define asuint64(f) ((union{double _f; uint64_t _i;}){f})._i +-#define asdouble(i) ((union{uint64_t _i; double _f;}){i})._f ++// Unsafe in Snitch due to the decoupled FPU and integer ++// arithmetic units. Use at your own risk. ++#define asuint_unsafe(f) ((union{float _f; uint32_t _i;}){f})._i ++#define asfloat_unsafe(i) ((union{uint32_t _i; float _f;}){i})._f ++#define asuint64_unsafe(f) ((union{double _f; uint64_t _i;}){f})._i ++#define asdouble_unsafe(i) ((union{uint64_t _i; double _f;}){i})._f + + #define EXTRACT_WORDS(hi,lo,d) \ + do { \ +-- +2.28.0 + diff --git a/sw/deps/patches/musl/0004-sw-math-Implement-safe-tanh-function.patch b/sw/deps/patches/musl/0004-sw-math-Implement-safe-tanh-function.patch new file mode 100644 index 000000000..cffc3c407 --- /dev/null +++ b/sw/deps/patches/musl/0004-sw-math-Implement-safe-tanh-function.patch @@ -0,0 +1,149 @@ +From b419b07facc9591ba0d8683f53c9adefb8a9b0c6 Mon Sep 17 00:00:00 2001 +From: Luca Colagrande +Date: Wed, 8 Nov 2023 09:35:17 +0100 +Subject: [PATCH] `sw/math`: Implement safe `tanh` function + +--- + src/internal/libm.h | 31 +++++++++++++++++++++++++++++++ + src/math/expm1.c | 34 ++++++++++++++++++++++++++-------- + src/math/tanh.c | 17 ++++++++++++----- + 3 files changed, 69 insertions(+), 13 deletions(-) + +diff --git a/src/internal/libm.h b/src/internal/libm.h +index 60b9866..c96c0ec 100644 +--- a/src/internal/libm.h ++++ b/src/internal/libm.h +@@ -96,6 +96,37 @@ static int32_t converttoint(double_t); + #define predict_false(x) (x) + #endif + ++/* Memory-consistent functions to manipulate the upper word of a ++ double-precision floating-point number in the integer core. ++ Since there is no dedicated instruction to move the upper 32-bits ++ of a double-precision floating point register to an integer register ++ the compiler resorts to moving the value through the memory. However in ++ Snitch neither the program ordering between floating-point and integer ++ instructions is guaranteed, nor is memory consistency between the integer ++ and floating-point threads. */ ++ ++static inline uint32_t safe_extract_upper_32b_from_double(double x) { ++ double f; ++ uint32_t result; ++ asm volatile("fsd %[x], 0(%[ptr]) \n" ++ "fld ft3, 0(%[ptr]) \n" ++ "fmv.x.w t0, ft3 \n" ++ "mv t0, t0 \n" ++ "lw %[result], 4(%[ptr]) \n" ++ : [result]"=r"(result) : [x]"f"(x), [ptr]"r"(&f): "ft3", "t0", "memory"); ++ return result; ++} ++ ++static inline void safe_inject_into_upper_32b_double(uint32_t x, double *f) { ++ asm volatile("sw %[x], 4(%[ptr]) \n" ++ "lw %[x], 4(%[ptr]) \n" ++ "fmv.w.x ft3, %[x] \n" ++ : : [x]"r"(x), [ptr]"r"(f): "ft3", "memory"); ++} ++ ++/* TODO: the following functions are not really safe, compare previous two ++ functions */ ++ + /* FPU fence to synchronize the FPU and integer core in Snitch. */ + inline void snrt_fpu_fence() { + unsigned tmp; +diff --git a/src/math/expm1.c b/src/math/expm1.c +index ac1e61e..d94f57f 100644 +--- a/src/math/expm1.c ++++ b/src/math/expm1.c +@@ -121,9 +121,14 @@ Q5 = -2.01099218183624371326e-07; /* BE8AFDB7 6E09C32D */ + double expm1(double x) + { + double_t y,hi,lo,c,t,e,hxs,hfx,r1,twopk; +- union {double f; uint64_t i;} u = {x}; +- uint32_t hx = u.i>>32 & 0x7fffffff; +- int k, sign = u.i>>63; ++ /// Original implementation ++ // union {double f; uint64_t i;} u = {x}; ++ // uint32_t hx = u.i>>32 & 0x7fffffff; ++ // int k, sign = u.i>>63; ++ /// Safe implementation in Snitch ++ uint32_t upper_32b_x = safe_extract_upper_32b_from_double(x); ++ uint32_t hx = upper_32b_x & 0x7fffffff; ++ int k, sign = upper_32b_x>>31; + + /* filter out huge and non-finite argument */ + if (hx >= 0x4043687A) { /* if |x|>=56*ln2 */ +@@ -182,8 +187,12 @@ double expm1(double x) + return -2.0*(e-(x+0.5)); + return 1.0+2.0*(x-e); + } +- u.i = (uint64_t)(0x3ff + k)<<52; /* 2^k */ +- twopk = u.f; ++ /// Original implementation ++ // u.i = (uint64_t)(0x3ff + k)<<52; /* 2^k */ ++ // twopk = u.f; ++ /// Safe implementation in Snitch ++ uint32_t u_i = (uint32_t)(0x3ff + k)<<20; ++ safe_inject_into_upper_32b_double(u_i, &twopk); + if (k < 0 || k > 56) { /* suffice to return exp(x)-1 */ + y = x - e + 1.0; + if (k == 1024) +@@ -192,10 +201,19 @@ double expm1(double x) + y = y*twopk; + return y - 1.0; + } +- u.i = (uint64_t)(0x3ff - k)<<52; /* 2^-k */ ++ /// Original implementation ++ // u.i = (uint64_t)(0x3ff - k)<<52; /* 2^-k */ ++ // if (k < 20) ++ // y = (x-e+(1-u.f))*twopk; ++ // else ++ // y = (x-(e+u.f)+1)*twopk; ++ /// Safe implementation in Snitch ++ u_i = (uint32_t)(0x3ff - k)<<20; ++ double u_f = 0; ++ safe_inject_into_upper_32b_double(u_i, &u_f); + if (k < 20) +- y = (x-e+(1-u.f))*twopk; ++ y = (x-e+(1-u_f))*twopk; + else +- y = (x-(e+u.f)+1)*twopk; ++ y = (x-(e+u_f)+1)*twopk; + return y; + } +diff --git a/src/math/tanh.c b/src/math/tanh.c +index 20d6dbc..2481db1 100644 +--- a/src/math/tanh.c ++++ b/src/math/tanh.c +@@ -6,16 +6,23 @@ + */ + double tanh(double x) + { +- union {double f; uint64_t i;} u = {.f = x}; + uint32_t w; + int sign; + double_t t; + + /* x = |x| */ +- sign = u.i >> 63; +- u.i &= (uint64_t)-1/2; +- x = u.f; +- w = u.i >> 32; ++ /// Original implementation ++ // union {double f; uint64_t i;} u = {.f = x}; ++ // sign = u.i >> 63; ++ // u.i &= (uint64_t)-1/2; ++ // x = u.f; ++ // w = u.i >> 32; ++ /// Safe implementation in Snitch ++ uint32_t upper_32b_x = safe_extract_upper_32b_from_double(x); ++ sign = upper_32b_x >> 31; ++ uint32_t sign_mask = (~(1 << 31)); ++ w = upper_32b_x & sign_mask; ++ safe_inject_into_upper_32b_double(w, &x); + + if (w > 0x3fe193ea) { + /* |x| > log(3)/2 ~= 0.5493 or nan */ +-- +2.28.0 + diff --git a/sw/math/Makefile b/sw/math/Makefile index 132795388..afb3192d1 100644 --- a/sw/math/Makefile +++ b/sw/math/Makefile @@ -1,17 +1,86 @@ -BITS_DIR = include/bits +# Copyright 2023 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande +# Viviane Potocnik, ETH Zurich + +# Usage of absolute paths is required to externally include +# this Makefile from multiple different locations +MK_DIR := $(dir $(realpath $(lastword $(MAKEFILE_LIST)))) + +############### +# Directories # +############### + +BUILDDIR ?= $(abspath build) +SRC_DIR = $(MK_DIR)/src/math +BITS_DIR = $(MK_DIR)/include/bits + +################### +# Build variables # +################### + +INCDIRS += $(MK_DIR)/arch/riscv64/ +INCDIRS += $(MK_DIR)/arch/generic +INCDIRS += $(MK_DIR)/src/include +INCDIRS += $(MK_DIR)/src/internal +INCDIRS += $(MK_DIR)/include/bits +INCDIRS += $(MK_DIR)/include + +SRCS = $(abspath $(wildcard $(SRC_DIR)/*.c)) + +########### +# Outputs # +########### + ALLTYPES_H = $(BITS_DIR)/alltypes.h +OBJS = $(addprefix $(BUILDDIR)/,$(addsuffix .o,$(basename $(notdir $(SRCS))))) +DEPS = $(addprefix $(BUILDDIR)/,$(addsuffix .d,$(basename $(notdir $(SRCS))))) +LIB = $(BUILDDIR)/libmath.a +DUMP = $(BUILDDIR)/libmath.dump +ALL_OUTPUTS = $(LIB) $(DUMP) -.PHONY: all clean +######### +# Rules # +######### -all: $(ALLTYPES_H) +.PHONY: all +all: $(ALL_OUTPUTS) +.PHONY: clean clean: rm -rf $(BITS_DIR) rm -f $(ALLTYPES_H) + rm -rf $(BUILDDIR) $(BITS_DIR): mkdir -p $@ $(ALLTYPES_H): | $(BITS_DIR) - sed -f tools/mkalltypes.sed arch/riscv64/bits/alltypes.h.in include/alltypes.h.in > $@ + sed -f $(MK_DIR)/tools/mkalltypes.sed $(MK_DIR)/arch/riscv64/bits/alltypes.h.in $(MK_DIR)/include/alltypes.h.in > $@ + +$(DEPS): $(ALLTYPES_H) + +$(BUILDDIR): + mkdir -p $@ + +$(BUILDDIR)/%.o: $(SRC_DIR)/%.S | $(BUILDDIR) + $(RISCV_CC) $(RISCV_CFLAGS) -c $< -o $@ + +$(BUILDDIR)/%.o: $(SRC_DIR)/%.c | $(BUILDDIR) + $(RISCV_CC) $(RISCV_CFLAGS) -c $< -o $@ + +$(BUILDDIR)/%.d: $(SRC_DIR)/%.c | $(BUILDDIR) + $(RISCV_CC) $(RISCV_CFLAGS) -MM -MT '$(@:.d=.o)' $< > $@ + +$(LIB): $(OBJS) | $(BUILDDIR) + $(RISCV_AR) $(RISCV_ARFLAGS) $@ $^ + +$(DUMP): $(LIB) | $(BUILDDIR) + $(RISCV_OBJDUMP) -D $< > $@ + +ifneq ($(MAKECMDGOALS),clean) +-include $(DEPS) +endif diff --git a/sw/math/include/float.h b/sw/math/include/float.h new file mode 100644 index 000000000..713aadb90 --- /dev/null +++ b/sw/math/include/float.h @@ -0,0 +1,52 @@ +#ifndef _FLOAT_H +#define _FLOAT_H + +#ifdef __cplusplus +extern "C" { +#endif + +int __flt_rounds(void); +#define FLT_ROUNDS (__flt_rounds()) + +#define FLT_RADIX 2 + +#define FLT_TRUE_MIN 1.40129846432481707092e-45F +#define FLT_MIN 1.17549435082228750797e-38F +#define FLT_MAX 3.40282346638528859812e+38F +#define FLT_EPSILON 1.1920928955078125e-07F + +#define FLT_MANT_DIG 24 +#define FLT_MIN_EXP (-125) +#define FLT_MAX_EXP 128 +#define FLT_HAS_SUBNORM 1 + +#define FLT_DIG 6 +#define FLT_DECIMAL_DIG 9 +#define FLT_MIN_10_EXP (-37) +#define FLT_MAX_10_EXP 38 + +#define DBL_TRUE_MIN 4.94065645841246544177e-324 +#define DBL_MIN 2.22507385850720138309e-308 +#define DBL_MAX 1.79769313486231570815e+308 +#define DBL_EPSILON 2.22044604925031308085e-16 + +#define DBL_MANT_DIG 53 +#define DBL_MIN_EXP (-1021) +#define DBL_MAX_EXP 1024 +#define DBL_HAS_SUBNORM 1 + +#define DBL_DIG 15 +#define DBL_DECIMAL_DIG 17 +#define DBL_MIN_10_EXP (-307) +#define DBL_MAX_10_EXP 308 + +#define LDBL_HAS_SUBNORM 1 +#define LDBL_DECIMAL_DIG DECIMAL_DIG + +#include + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/sw/math/include/math.h b/sw/math/include/math.h index 6dad71c1e..14f28ec8c 100644 --- a/sw/math/include/math.h +++ b/sw/math/include/math.h @@ -435,9 +435,6 @@ float pow10f(float); long double pow10l(long double); #endif -#include "../src/math/expm1.c" -#include "../src/math/tanh.c" - #ifdef __cplusplus } #endif diff --git a/sw/math/src/internal/libm.h b/sw/math/src/internal/libm.h index 72ad17d8e..c96c0eced 100644 --- a/sw/math/src/internal/libm.h +++ b/sw/math/src/internal/libm.h @@ -96,6 +96,78 @@ static int32_t converttoint(double_t); #define predict_false(x) (x) #endif +/* Memory-consistent functions to manipulate the upper word of a + double-precision floating-point number in the integer core. + Since there is no dedicated instruction to move the upper 32-bits + of a double-precision floating point register to an integer register + the compiler resorts to moving the value through the memory. However in + Snitch neither the program ordering between floating-point and integer + instructions is guaranteed, nor is memory consistency between the integer + and floating-point threads. */ + +static inline uint32_t safe_extract_upper_32b_from_double(double x) { + double f; + uint32_t result; + asm volatile("fsd %[x], 0(%[ptr]) \n" + "fld ft3, 0(%[ptr]) \n" + "fmv.x.w t0, ft3 \n" + "mv t0, t0 \n" + "lw %[result], 4(%[ptr]) \n" + : [result]"=r"(result) : [x]"f"(x), [ptr]"r"(&f): "ft3", "t0", "memory"); + return result; +} + +static inline void safe_inject_into_upper_32b_double(uint32_t x, double *f) { + asm volatile("sw %[x], 4(%[ptr]) \n" + "lw %[x], 4(%[ptr]) \n" + "fmv.w.x ft3, %[x] \n" + : : [x]"r"(x), [ptr]"r"(f): "ft3", "memory"); +} + +/* TODO: the following functions are not really safe, compare previous two + functions */ + +/* FPU fence to synchronize the FPU and integer core in Snitch. */ +inline void snrt_fpu_fence() { + unsigned tmp; + __asm__ volatile( + "fmv.x.w %0, fa0\n" + "mv %0, %0\n" + : "+r"(tmp)::"memory"); +} + +/* Synch-secure double to uint64 conversion functions. */ +static inline uint64_t asuint64(double f) { + uint64_t result; + snrt_fpu_fence(); + result = *(uint64_t *)&f; + return result; +} + +/* Synch-secure float to uint conversion functions. */ +static inline uint64_t asuint(float f) { + uint32_t result; + snrt_fpu_fence(); + result = *(uint32_t *)&f; + return result; +} + +/* Synch-secure uint64 to double conversion functions. */ +static inline double asdouble(uint64_t i) { + double result; + snrt_fpu_fence(); + result = *(double *)&i; + return result; +} + +/* Synch-secure uint to float conversion functions. */ +static inline float asfloat(uint32_t i) { + float result; + snrt_fpu_fence(); + result = *(float *)&i; + return result; +} + /* Evaluate an expression as the specified type. With standard excess precision handling a type cast or assignment is enough (with -ffloat-store an assignment is required, in old compilers argument @@ -187,10 +259,12 @@ static inline void fp_force_evall(long double x) } \ } while(0) -#define asuint(f) ((union{float _f; uint32_t _i;}){f})._i -#define asfloat(i) ((union{uint32_t _i; float _f;}){i})._f -#define asuint64(f) ((union{double _f; uint64_t _i;}){f})._i -#define asdouble(i) ((union{uint64_t _i; double _f;}){i})._f +// Unsafe in Snitch due to the decoupled FPU and integer +// arithmetic units. Use at your own risk. +#define asuint_unsafe(f) ((union{float _f; uint32_t _i;}){f})._i +#define asfloat_unsafe(i) ((union{uint32_t _i; float _f;}){i})._f +#define asuint64_unsafe(f) ((union{double _f; uint64_t _i;}){f})._i +#define asdouble_unsafe(i) ((union{uint64_t _i; double _f;}){i})._f #define EXTRACT_WORDS(hi,lo,d) \ do { \ diff --git a/sw/math/src/math/__math_divzero.c b/sw/math/src/math/__math_divzero.c new file mode 100644 index 000000000..59d213500 --- /dev/null +++ b/sw/math/src/math/__math_divzero.c @@ -0,0 +1,6 @@ +#include "libm.h" + +double __math_divzero(uint32_t sign) +{ + return fp_barrier(sign ? -1.0 : 1.0) / 0.0; +} diff --git a/sw/math/src/math/__math_invalid.c b/sw/math/src/math/__math_invalid.c new file mode 100644 index 000000000..177404900 --- /dev/null +++ b/sw/math/src/math/__math_invalid.c @@ -0,0 +1,6 @@ +#include "libm.h" + +double __math_invalid(double x) +{ + return (x - x) / (x - x); +} diff --git a/sw/math/src/math/__math_invalidf.c b/sw/math/src/math/__math_invalidf.c new file mode 100644 index 000000000..357d4b121 --- /dev/null +++ b/sw/math/src/math/__math_invalidf.c @@ -0,0 +1,6 @@ +#include "libm.h" + +float __math_invalidf(float x) +{ + return (x - x) / (x - x); +} diff --git a/sw/math/src/math/__math_invalidl.c b/sw/math/src/math/__math_invalidl.c new file mode 100644 index 000000000..1fca99de4 --- /dev/null +++ b/sw/math/src/math/__math_invalidl.c @@ -0,0 +1,9 @@ +#include +#include "libm.h" + +#if LDBL_MANT_DIG != DBL_MANT_DIG +long double __math_invalidl(long double x) +{ + return (x - x) / (x - x); +} +#endif diff --git a/sw/math/src/math/__math_oflow.c b/sw/math/src/math/__math_oflow.c new file mode 100644 index 000000000..c85dbf982 --- /dev/null +++ b/sw/math/src/math/__math_oflow.c @@ -0,0 +1,6 @@ +#include "libm.h" + +double __math_oflow(uint32_t sign) +{ + return __math_xflow(sign, 0x1p769); +} diff --git a/sw/math/src/math/__math_oflowf.c b/sw/math/src/math/__math_oflowf.c new file mode 100644 index 000000000..fa7d06208 --- /dev/null +++ b/sw/math/src/math/__math_oflowf.c @@ -0,0 +1,6 @@ +#include "libm.h" + +float __math_oflowf(uint32_t sign) +{ + return __math_xflowf(sign, 0x1p97f); +} diff --git a/sw/math/src/math/__math_uflow.c b/sw/math/src/math/__math_uflow.c new file mode 100644 index 000000000..b90594aee --- /dev/null +++ b/sw/math/src/math/__math_uflow.c @@ -0,0 +1,6 @@ +#include "libm.h" + +double __math_uflow(uint32_t sign) +{ + return __math_xflow(sign, 0x1p-767); +} diff --git a/sw/math/src/math/__math_uflowf.c b/sw/math/src/math/__math_uflowf.c new file mode 100644 index 000000000..94d50f2bf --- /dev/null +++ b/sw/math/src/math/__math_uflowf.c @@ -0,0 +1,6 @@ +#include "libm.h" + +float __math_uflowf(uint32_t sign) +{ + return __math_xflowf(sign, 0x1p-95f); +} diff --git a/sw/math/src/math/__math_xflow.c b/sw/math/src/math/__math_xflow.c new file mode 100644 index 000000000..744203c4c --- /dev/null +++ b/sw/math/src/math/__math_xflow.c @@ -0,0 +1,6 @@ +#include "libm.h" + +double __math_xflow(uint32_t sign, double y) +{ + return eval_as_double(fp_barrier(sign ? -y : y) * y); +} diff --git a/sw/math/src/math/__math_xflowf.c b/sw/math/src/math/__math_xflowf.c new file mode 100644 index 000000000..f2c84784f --- /dev/null +++ b/sw/math/src/math/__math_xflowf.c @@ -0,0 +1,6 @@ +#include "libm.h" + +float __math_xflowf(uint32_t sign, float y) +{ + return eval_as_float(fp_barrierf(sign ? -y : y) * y); +} diff --git a/sw/math/src/math/ceil.c b/sw/math/src/math/ceil.c new file mode 100644 index 000000000..b13e6f2d6 --- /dev/null +++ b/sw/math/src/math/ceil.c @@ -0,0 +1,31 @@ +#include "libm.h" + +#if FLT_EVAL_METHOD==0 || FLT_EVAL_METHOD==1 +#define EPS DBL_EPSILON +#elif FLT_EVAL_METHOD==2 +#define EPS LDBL_EPSILON +#endif +static const double_t toint = 1/EPS; + +double ceil(double x) +{ + union {double f; uint64_t i;} u = {x}; + int e = u.i >> 52 & 0x7ff; + double_t y; + + if (e >= 0x3ff+52 || x == 0) + return x; + /* y = int(x) - x, where int(x) is an integer neighbor of x */ + if (u.i >> 63) + y = x - toint + toint - x; + else + y = x + toint - toint - x; + /* special case because of non-nearest rounding modes */ + if (e <= 0x3ff-1) { + FORCE_EVAL(y); + return u.i >> 63 ? -0.0 : 1; + } + if (y < 0) + return x + y + 1; + return x + y; +} diff --git a/sw/math/src/math/ceilf.c b/sw/math/src/math/ceilf.c new file mode 100644 index 000000000..869835f39 --- /dev/null +++ b/sw/math/src/math/ceilf.c @@ -0,0 +1,27 @@ +#include "libm.h" + +float ceilf(float x) +{ + union {float f; uint32_t i;} u = {x}; + int e = (int)(u.i >> 23 & 0xff) - 0x7f; + uint32_t m; + + if (e >= 23) + return x; + if (e >= 0) { + m = 0x007fffff >> e; + if ((u.i & m) == 0) + return x; + FORCE_EVAL(x + 0x1p120f); + if (u.i >> 31 == 0) + u.i += m; + u.i &= ~m; + } else { + FORCE_EVAL(x + 0x1p120f); + if (u.i >> 31) + u.f = -0.0; + else if (u.i << 1) + u.f = 1.0; + } + return u.f; +} diff --git a/sw/math/src/math/ceill.c b/sw/math/src/math/ceill.c new file mode 100644 index 000000000..60a83020d --- /dev/null +++ b/sw/math/src/math/ceill.c @@ -0,0 +1,34 @@ +#include "libm.h" + +#if LDBL_MANT_DIG == 53 && LDBL_MAX_EXP == 1024 +long double ceill(long double x) +{ + return ceil(x); +} +#elif (LDBL_MANT_DIG == 64 || LDBL_MANT_DIG == 113) && LDBL_MAX_EXP == 16384 + +static const long double toint = 1/LDBL_EPSILON; + +long double ceill(long double x) +{ + union ldshape u = {x}; + int e = u.i.se & 0x7fff; + long double y; + + if (e >= 0x3fff+LDBL_MANT_DIG-1 || x == 0) + return x; + /* y = int(x) - x, where int(x) is an integer neighbor of x */ + if (u.i.se >> 15) + y = x - toint + toint - x; + else + y = x + toint - toint - x; + /* special case because of non-nearest rounding modes */ + if (e <= 0x3fff-1) { + FORCE_EVAL(y); + return u.i.se >> 15 ? -0.0 : 1; + } + if (y < 0) + return x + y + 1; + return x + y; +} +#endif diff --git a/sw/math/src/math/exp2f_data.c b/sw/math/src/math/exp2f_data.c new file mode 100644 index 000000000..be324727f --- /dev/null +++ b/sw/math/src/math/exp2f_data.c @@ -0,0 +1,35 @@ +/* + * Shared data between expf, exp2f and powf. + * + * Copyright (c) 2017-2018, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include "exp2f_data.h" + +#define N (1 << EXP2F_TABLE_BITS) + +const struct exp2f_data __exp2f_data = { + /* tab[i] = uint(2^(i/N)) - (i << 52-BITS) + used for computing 2^(k/N) for an int |k| < 150 N as + double(tab[k%N] + (k << 52-BITS)) */ + .tab = { +0x3ff0000000000000, 0x3fefd9b0d3158574, 0x3fefb5586cf9890f, 0x3fef9301d0125b51, +0x3fef72b83c7d517b, 0x3fef54873168b9aa, 0x3fef387a6e756238, 0x3fef1e9df51fdee1, +0x3fef06fe0a31b715, 0x3feef1a7373aa9cb, 0x3feedea64c123422, 0x3feece086061892d, +0x3feebfdad5362a27, 0x3feeb42b569d4f82, 0x3feeab07dd485429, 0x3feea47eb03a5585, +0x3feea09e667f3bcd, 0x3fee9f75e8ec5f74, 0x3feea11473eb0187, 0x3feea589994cce13, +0x3feeace5422aa0db, 0x3feeb737b0cdc5e5, 0x3feec49182a3f090, 0x3feed503b23e255d, +0x3feee89f995ad3ad, 0x3feeff76f2fb5e47, 0x3fef199bdd85529c, 0x3fef3720dcef9069, +0x3fef5818dcfba487, 0x3fef7c97337b9b5f, 0x3fefa4afa2a490da, 0x3fefd0765b6e4540, + }, + .shift_scaled = 0x1.8p+52 / N, + .poly = { + 0x1.c6af84b912394p-5, 0x1.ebfce50fac4f3p-3, 0x1.62e42ff0c52d6p-1, + }, + .shift = 0x1.8p+52, + .invln2_scaled = 0x1.71547652b82fep+0 * N, + .poly_scaled = { + 0x1.c6af84b912394p-5/N/N/N, 0x1.ebfce50fac4f3p-3/N/N, 0x1.62e42ff0c52d6p-1/N, + }, +}; diff --git a/sw/math/src/math/exp2f_data.h b/sw/math/src/math/exp2f_data.h new file mode 100644 index 000000000..fe744f15b --- /dev/null +++ b/sw/math/src/math/exp2f_data.h @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2017-2018, Arm Limited. + * SPDX-License-Identifier: MIT + */ +#ifndef _EXP2F_DATA_H +#define _EXP2F_DATA_H + +#include +#include + +/* Shared between expf, exp2f and powf. */ +#define EXP2F_TABLE_BITS 5 +#define EXP2F_POLY_ORDER 3 +extern hidden const struct exp2f_data { + uint64_t tab[1 << EXP2F_TABLE_BITS]; + double shift_scaled; + double poly[EXP2F_POLY_ORDER]; + double shift; + double invln2_scaled; + double poly_scaled[EXP2F_POLY_ORDER]; +} __exp2f_data; + +#endif diff --git a/sw/math/src/math/expf.c b/sw/math/src/math/expf.c new file mode 100644 index 000000000..f9fbf8e72 --- /dev/null +++ b/sw/math/src/math/expf.c @@ -0,0 +1,80 @@ +/* + * Single-precision e^x function. + * + * Copyright (c) 2017-2018, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include +#include +#include "libm.h" +#include "exp2f_data.h" + +/* +EXP2F_TABLE_BITS = 5 +EXP2F_POLY_ORDER = 3 + +ULP error: 0.502 (nearest rounding.) +Relative error: 1.69 * 2^-34 in [-ln2/64, ln2/64] (before rounding.) +Wrong count: 170635 (all nearest rounding wrong results with fma.) +Non-nearest ULP error: 1 (rounded ULP error) +*/ + +#define N (1 << EXP2F_TABLE_BITS) +#define InvLn2N __exp2f_data.invln2_scaled +#define T __exp2f_data.tab +#define C __exp2f_data.poly_scaled + +static inline uint32_t top12(float x) +{ + return asuint(x) >> 20; +} + +float expf(float x) +{ + uint32_t abstop; + uint64_t ki, t; + double_t kd, xd, z, r, r2, y, s; + + xd = (double_t)x; + abstop = top12(x) & 0x7ff; + if (predict_false(abstop >= top12(88.0f))) { + /* |x| >= 88 or x is nan. */ + if (asuint(x) == asuint(-INFINITY)) + return 0.0f; + if (abstop >= top12(INFINITY)) + return x + x; + if (x > 0x1.62e42ep6f) /* x > log(0x1p128) ~= 88.72 */ + return __math_oflowf(0); + if (x < -0x1.9fe368p6f) /* x < log(0x1p-150) ~= -103.97 */ + return __math_uflowf(0); + } + + /* x*N/Ln2 = k + r with r in [-1/2, 1/2] and int k. */ + z = InvLn2N * xd; + + /* Round and convert z to int, the result is in [-150*N, 128*N] and + ideally ties-to-even rule is used, otherwise the magnitude of r + can be bigger which gives larger approximation error. */ +#if TOINT_INTRINSICS + kd = roundtoint(z); + ki = converttoint(z); +#else +# define SHIFT __exp2f_data.shift + kd = eval_as_double(z + SHIFT); + ki = asuint64(kd); + kd -= SHIFT; +#endif + r = z - kd; + + /* exp(x) = 2^(k/N) * 2^(r/N) ~= s * (C0*r^3 + C1*r^2 + C2*r + 1) */ + t = T[ki % N]; + t += ki << (52 - EXP2F_TABLE_BITS); + s = asdouble(t); + z = C[0] * r + C[1]; + r2 = r * r; + y = C[2] * r + 1; + y = z * r2 + y; + y = y * s; + return eval_as_float(y); +} diff --git a/sw/math/src/math/expm1.c b/sw/math/src/math/expm1.c index ac1e61e4f..d94f57fe5 100644 --- a/sw/math/src/math/expm1.c +++ b/sw/math/src/math/expm1.c @@ -121,9 +121,14 @@ Q5 = -2.01099218183624371326e-07; /* BE8AFDB7 6E09C32D */ double expm1(double x) { double_t y,hi,lo,c,t,e,hxs,hfx,r1,twopk; - union {double f; uint64_t i;} u = {x}; - uint32_t hx = u.i>>32 & 0x7fffffff; - int k, sign = u.i>>63; + /// Original implementation + // union {double f; uint64_t i;} u = {x}; + // uint32_t hx = u.i>>32 & 0x7fffffff; + // int k, sign = u.i>>63; + /// Safe implementation in Snitch + uint32_t upper_32b_x = safe_extract_upper_32b_from_double(x); + uint32_t hx = upper_32b_x & 0x7fffffff; + int k, sign = upper_32b_x>>31; /* filter out huge and non-finite argument */ if (hx >= 0x4043687A) { /* if |x|>=56*ln2 */ @@ -182,8 +187,12 @@ double expm1(double x) return -2.0*(e-(x+0.5)); return 1.0+2.0*(x-e); } - u.i = (uint64_t)(0x3ff + k)<<52; /* 2^k */ - twopk = u.f; + /// Original implementation + // u.i = (uint64_t)(0x3ff + k)<<52; /* 2^k */ + // twopk = u.f; + /// Safe implementation in Snitch + uint32_t u_i = (uint32_t)(0x3ff + k)<<20; + safe_inject_into_upper_32b_double(u_i, &twopk); if (k < 0 || k > 56) { /* suffice to return exp(x)-1 */ y = x - e + 1.0; if (k == 1024) @@ -192,10 +201,19 @@ double expm1(double x) y = y*twopk; return y - 1.0; } - u.i = (uint64_t)(0x3ff - k)<<52; /* 2^-k */ + /// Original implementation + // u.i = (uint64_t)(0x3ff - k)<<52; /* 2^-k */ + // if (k < 20) + // y = (x-e+(1-u.f))*twopk; + // else + // y = (x-(e+u.f)+1)*twopk; + /// Safe implementation in Snitch + u_i = (uint32_t)(0x3ff - k)<<20; + double u_f = 0; + safe_inject_into_upper_32b_double(u_i, &u_f); if (k < 20) - y = (x-e+(1-u.f))*twopk; + y = (x-e+(1-u_f))*twopk; else - y = (x-(e+u.f)+1)*twopk; + y = (x-(e+u_f)+1)*twopk; return y; } diff --git a/sw/math/src/math/log2.c b/sw/math/src/math/log2.c new file mode 100644 index 000000000..1276ed4e3 --- /dev/null +++ b/sw/math/src/math/log2.c @@ -0,0 +1,122 @@ +/* + * Double-precision log2(x) function. + * + * Copyright (c) 2018, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include +#include +#include "libm.h" +#include "log2_data.h" + +#define T __log2_data.tab +#define T2 __log2_data.tab2 +#define B __log2_data.poly1 +#define A __log2_data.poly +#define InvLn2hi __log2_data.invln2hi +#define InvLn2lo __log2_data.invln2lo +#define N (1 << LOG2_TABLE_BITS) +#define OFF 0x3fe6000000000000 + +/* Top 16 bits of a double. */ +static inline uint32_t top16(double x) +{ + return asuint64(x) >> 48; +} + +double log2(double x) +{ + double_t z, r, r2, r4, y, invc, logc, kd, hi, lo, t1, t2, t3, p; + uint64_t ix, iz, tmp; + uint32_t top; + int k, i; + + ix = asuint64(x); + top = top16(x); +#define LO asuint64(1.0 - 0x1.5b51p-5) +#define HI asuint64(1.0 + 0x1.6ab2p-5) + if (predict_false(ix - LO < HI - LO)) { + /* Handle close to 1.0 inputs separately. */ + /* Fix sign of zero with downward rounding when x==1. */ + if (WANT_ROUNDING && predict_false(ix == asuint64(1.0))) + return 0; + r = x - 1.0; +#if __FP_FAST_FMA + hi = r * InvLn2hi; + lo = r * InvLn2lo + __builtin_fma(r, InvLn2hi, -hi); +#else + double_t rhi, rlo; + rhi = asdouble(asuint64(r) & -1ULL << 32); + rlo = r - rhi; + hi = rhi * InvLn2hi; + lo = rlo * InvLn2hi + r * InvLn2lo; +#endif + r2 = r * r; /* rounding error: 0x1p-62. */ + r4 = r2 * r2; + /* Worst-case error is less than 0.54 ULP (0.55 ULP without fma). */ + p = r2 * (B[0] + r * B[1]); + y = hi + p; + lo += hi - y + p; + lo += r4 * (B[2] + r * B[3] + r2 * (B[4] + r * B[5]) + + r4 * (B[6] + r * B[7] + r2 * (B[8] + r * B[9]))); + y += lo; + return eval_as_double(y); + } + if (predict_false(top - 0x0010 >= 0x7ff0 - 0x0010)) { + /* x < 0x1p-1022 or inf or nan. */ + if (ix * 2 == 0) + return __math_divzero(1); + if (ix == asuint64(INFINITY)) /* log(inf) == inf. */ + return x; + if ((top & 0x8000) || (top & 0x7ff0) == 0x7ff0) + return __math_invalid(x); + /* x is subnormal, normalize it. */ + ix = asuint64(x * 0x1p52); + ix -= 52ULL << 52; + } + + /* x = 2^k z; where z is in range [OFF,2*OFF) and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. */ + tmp = ix - OFF; + i = (tmp >> (52 - LOG2_TABLE_BITS)) % N; + k = (int64_t)tmp >> 52; /* arithmetic shift */ + iz = ix - (tmp & 0xfffULL << 52); + invc = T[i].invc; + logc = T[i].logc; + z = asdouble(iz); + kd = (double_t)k; + + /* log2(x) = log2(z/c) + log2(c) + k. */ + /* r ~= z/c - 1, |r| < 1/(2*N). */ +#if __FP_FAST_FMA + /* rounding error: 0x1p-55/N. */ + r = __builtin_fma(z, invc, -1.0); + t1 = r * InvLn2hi; + t2 = r * InvLn2lo + __builtin_fma(r, InvLn2hi, -t1); +#else + double_t rhi, rlo; + /* rounding error: 0x1p-55/N + 0x1p-65. */ + r = (z - T2[i].chi - T2[i].clo) * invc; + rhi = asdouble(asuint64(r) & -1ULL << 32); + rlo = r - rhi; + t1 = rhi * InvLn2hi; + t2 = rlo * InvLn2hi + r * InvLn2lo; +#endif + + /* hi + lo = r/ln2 + log2(c) + k. */ + t3 = kd + logc; + hi = t3 + t1; + lo = t3 - hi + t1 + t2; + + /* log2(r+1) = r/ln2 + r^2*poly(r). */ + /* Evaluation is optimized assuming superscalar pipelined execution. */ + r2 = r * r; /* rounding error: 0x1p-54/N^2. */ + r4 = r2 * r2; + /* Worst-case error if |y| > 0x1p-4: 0.547 ULP (0.550 ULP without fma). + ~ 0.5 + 2/N/ln2 + abs-poly-error*0x1p56 ULP (+ 0.003 ULP without fma). */ + p = A[0] + r * A[1] + r2 * (A[2] + r * A[3]) + r4 * (A[4] + r * A[5]); + y = lo + r2 * p + hi; + return eval_as_double(y); +} diff --git a/sw/math/src/math/log2_data.c b/sw/math/src/math/log2_data.c new file mode 100644 index 000000000..3dd1ca514 --- /dev/null +++ b/sw/math/src/math/log2_data.c @@ -0,0 +1,201 @@ +/* + * Data for log2. + * + * Copyright (c) 2018, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include "log2_data.h" + +#define N (1 << LOG2_TABLE_BITS) + +const struct log2_data __log2_data = { +// First coefficient: 0x1.71547652b82fe1777d0ffda0d24p0 +.invln2hi = 0x1.7154765200000p+0, +.invln2lo = 0x1.705fc2eefa200p-33, +.poly1 = { +// relative error: 0x1.2fad8188p-63 +// in -0x1.5b51p-5 0x1.6ab2p-5 +-0x1.71547652b82fep-1, +0x1.ec709dc3a03f7p-2, +-0x1.71547652b7c3fp-2, +0x1.2776c50f05be4p-2, +-0x1.ec709dd768fe5p-3, +0x1.a61761ec4e736p-3, +-0x1.7153fbc64a79bp-3, +0x1.484d154f01b4ap-3, +-0x1.289e4a72c383cp-3, +0x1.0b32f285aee66p-3, +}, +.poly = { +// relative error: 0x1.a72c2bf8p-58 +// abs error: 0x1.67a552c8p-66 +// in -0x1.f45p-8 0x1.f45p-8 +-0x1.71547652b8339p-1, +0x1.ec709dc3a04bep-2, +-0x1.7154764702ffbp-2, +0x1.2776c50034c48p-2, +-0x1.ec7b328ea92bcp-3, +0x1.a6225e117f92ep-3, +}, +/* Algorithm: + + x = 2^k z + log2(x) = k + log2(c) + log2(z/c) + log2(z/c) = poly(z/c - 1) + +where z is in [1.6p-1; 1.6p0] which is split into N subintervals and z falls +into the ith one, then table entries are computed as + + tab[i].invc = 1/c + tab[i].logc = (double)log2(c) + tab2[i].chi = (double)c + tab2[i].clo = (double)(c - (double)c) + +where c is near the center of the subinterval and is chosen by trying +-2^29 +floating point invc candidates around 1/center and selecting one for which + + 1) the rounding error in 0x1.8p10 + logc is 0, + 2) the rounding error in z - chi - clo is < 0x1p-64 and + 3) the rounding error in (double)log2(c) is minimized (< 0x1p-68). + +Note: 1) ensures that k + logc can be computed without rounding error, 2) +ensures that z/c - 1 can be computed as (z - chi - clo)*invc with close to a +single rounding error when there is no fast fma for z*invc - 1, 3) ensures +that logc + poly(z/c - 1) has small error, however near x == 1 when +|log2(x)| < 0x1p-4, this is not enough so that is special cased. */ +.tab = { +{0x1.724286bb1acf8p+0, -0x1.1095feecdb000p-1}, +{0x1.6e1f766d2cca1p+0, -0x1.08494bd76d000p-1}, +{0x1.6a13d0e30d48ap+0, -0x1.00143aee8f800p-1}, +{0x1.661ec32d06c85p+0, -0x1.efec5360b4000p-2}, +{0x1.623fa951198f8p+0, -0x1.dfdd91ab7e000p-2}, +{0x1.5e75ba4cf026cp+0, -0x1.cffae0cc79000p-2}, +{0x1.5ac055a214fb8p+0, -0x1.c043811fda000p-2}, +{0x1.571ed0f166e1ep+0, -0x1.b0b67323ae000p-2}, +{0x1.53909590bf835p+0, -0x1.a152f5a2db000p-2}, +{0x1.5014fed61adddp+0, -0x1.9217f5af86000p-2}, +{0x1.4cab88e487bd0p+0, -0x1.8304db0719000p-2}, +{0x1.49539b4334feep+0, -0x1.74189f9a9e000p-2}, +{0x1.460cbdfafd569p+0, -0x1.6552bb5199000p-2}, +{0x1.42d664ee4b953p+0, -0x1.56b23a29b1000p-2}, +{0x1.3fb01111dd8a6p+0, -0x1.483650f5fa000p-2}, +{0x1.3c995b70c5836p+0, -0x1.39de937f6a000p-2}, +{0x1.3991c4ab6fd4ap+0, -0x1.2baa1538d6000p-2}, +{0x1.3698e0ce099b5p+0, -0x1.1d98340ca4000p-2}, +{0x1.33ae48213e7b2p+0, -0x1.0fa853a40e000p-2}, +{0x1.30d191985bdb1p+0, -0x1.01d9c32e73000p-2}, +{0x1.2e025cab271d7p+0, -0x1.e857da2fa6000p-3}, +{0x1.2b404cf13cd82p+0, -0x1.cd3c8633d8000p-3}, +{0x1.288b02c7ccb50p+0, -0x1.b26034c14a000p-3}, +{0x1.25e2263944de5p+0, -0x1.97c1c2f4fe000p-3}, +{0x1.234563d8615b1p+0, -0x1.7d6023f800000p-3}, +{0x1.20b46e33eaf38p+0, -0x1.633a71a05e000p-3}, +{0x1.1e2eefdcda3ddp+0, -0x1.494f5e9570000p-3}, +{0x1.1bb4a580b3930p+0, -0x1.2f9e424e0a000p-3}, +{0x1.19453847f2200p+0, -0x1.162595afdc000p-3}, +{0x1.16e06c0d5d73cp+0, -0x1.f9c9a75bd8000p-4}, +{0x1.1485f47b7e4c2p+0, -0x1.c7b575bf9c000p-4}, +{0x1.12358ad0085d1p+0, -0x1.960c60ff48000p-4}, +{0x1.0fef00f532227p+0, -0x1.64ce247b60000p-4}, +{0x1.0db2077d03a8fp+0, -0x1.33f78b2014000p-4}, +{0x1.0b7e6d65980d9p+0, -0x1.0387d1a42c000p-4}, +{0x1.0953efe7b408dp+0, -0x1.a6f9208b50000p-5}, +{0x1.07325cac53b83p+0, -0x1.47a954f770000p-5}, +{0x1.05197e40d1b5cp+0, -0x1.d23a8c50c0000p-6}, +{0x1.03091c1208ea2p+0, -0x1.16a2629780000p-6}, +{0x1.0101025b37e21p+0, -0x1.720f8d8e80000p-8}, +{0x1.fc07ef9caa76bp-1, 0x1.6fe53b1500000p-7}, +{0x1.f4465d3f6f184p-1, 0x1.11ccce10f8000p-5}, +{0x1.ecc079f84107fp-1, 0x1.c4dfc8c8b8000p-5}, +{0x1.e573a99975ae8p-1, 0x1.3aa321e574000p-4}, +{0x1.de5d6f0bd3de6p-1, 0x1.918a0d08b8000p-4}, +{0x1.d77b681ff38b3p-1, 0x1.e72e9da044000p-4}, +{0x1.d0cb5724de943p-1, 0x1.1dcd2507f6000p-3}, +{0x1.ca4b2dc0e7563p-1, 0x1.476ab03dea000p-3}, +{0x1.c3f8ee8d6cb51p-1, 0x1.7074377e22000p-3}, +{0x1.bdd2b4f020c4cp-1, 0x1.98ede8ba94000p-3}, +{0x1.b7d6c006015cap-1, 0x1.c0db86ad2e000p-3}, +{0x1.b20366e2e338fp-1, 0x1.e840aafcee000p-3}, +{0x1.ac57026295039p-1, 0x1.0790ab4678000p-2}, +{0x1.a6d01bc2731ddp-1, 0x1.1ac056801c000p-2}, +{0x1.a16d3bc3ff18bp-1, 0x1.2db11d4fee000p-2}, +{0x1.9c2d14967feadp-1, 0x1.406464ec58000p-2}, +{0x1.970e4f47c9902p-1, 0x1.52dbe093af000p-2}, +{0x1.920fb3982bcf2p-1, 0x1.651902050d000p-2}, +{0x1.8d30187f759f1p-1, 0x1.771d2cdeaf000p-2}, +{0x1.886e5ebb9f66dp-1, 0x1.88e9c857d9000p-2}, +{0x1.83c97b658b994p-1, 0x1.9a80155e16000p-2}, +{0x1.7f405ffc61022p-1, 0x1.abe186ed3d000p-2}, +{0x1.7ad22181415cap-1, 0x1.bd0f2aea0e000p-2}, +{0x1.767dcf99eff8cp-1, 0x1.ce0a43dbf4000p-2}, +}, +#if !__FP_FAST_FMA +.tab2 = { +{0x1.6200012b90a8ep-1, 0x1.904ab0644b605p-55}, +{0x1.66000045734a6p-1, 0x1.1ff9bea62f7a9p-57}, +{0x1.69fffc325f2c5p-1, 0x1.27ecfcb3c90bap-55}, +{0x1.6e00038b95a04p-1, 0x1.8ff8856739326p-55}, +{0x1.71fffe09994e3p-1, 0x1.afd40275f82b1p-55}, +{0x1.7600015590e1p-1, -0x1.2fd75b4238341p-56}, +{0x1.7a00012655bd5p-1, 0x1.808e67c242b76p-56}, +{0x1.7e0003259e9a6p-1, -0x1.208e426f622b7p-57}, +{0x1.81fffedb4b2d2p-1, -0x1.402461ea5c92fp-55}, +{0x1.860002dfafcc3p-1, 0x1.df7f4a2f29a1fp-57}, +{0x1.89ffff78c6b5p-1, -0x1.e0453094995fdp-55}, +{0x1.8e00039671566p-1, -0x1.a04f3bec77b45p-55}, +{0x1.91fffe2bf1745p-1, -0x1.7fa34400e203cp-56}, +{0x1.95fffcc5c9fd1p-1, -0x1.6ff8005a0695dp-56}, +{0x1.9a0003bba4767p-1, 0x1.0f8c4c4ec7e03p-56}, +{0x1.9dfffe7b92da5p-1, 0x1.e7fd9478c4602p-55}, +{0x1.a1fffd72efdafp-1, -0x1.a0c554dcdae7ep-57}, +{0x1.a5fffde04ff95p-1, 0x1.67da98ce9b26bp-55}, +{0x1.a9fffca5e8d2bp-1, -0x1.284c9b54c13dep-55}, +{0x1.adfffddad03eap-1, 0x1.812c8ea602e3cp-58}, +{0x1.b1ffff10d3d4dp-1, -0x1.efaddad27789cp-55}, +{0x1.b5fffce21165ap-1, 0x1.3cb1719c61237p-58}, +{0x1.b9fffd950e674p-1, 0x1.3f7d94194cep-56}, +{0x1.be000139ca8afp-1, 0x1.50ac4215d9bcp-56}, +{0x1.c20005b46df99p-1, 0x1.beea653e9c1c9p-57}, +{0x1.c600040b9f7aep-1, -0x1.c079f274a70d6p-56}, +{0x1.ca0006255fd8ap-1, -0x1.a0b4076e84c1fp-56}, +{0x1.cdfffd94c095dp-1, 0x1.8f933f99ab5d7p-55}, +{0x1.d1ffff975d6cfp-1, -0x1.82c08665fe1bep-58}, +{0x1.d5fffa2561c93p-1, -0x1.b04289bd295f3p-56}, +{0x1.d9fff9d228b0cp-1, 0x1.70251340fa236p-55}, +{0x1.de00065bc7e16p-1, -0x1.5011e16a4d80cp-56}, +{0x1.e200002f64791p-1, 0x1.9802f09ef62ep-55}, +{0x1.e600057d7a6d8p-1, -0x1.e0b75580cf7fap-56}, +{0x1.ea00027edc00cp-1, -0x1.c848309459811p-55}, +{0x1.ee0006cf5cb7cp-1, -0x1.f8027951576f4p-55}, +{0x1.f2000782b7dccp-1, -0x1.f81d97274538fp-55}, +{0x1.f6000260c450ap-1, -0x1.071002727ffdcp-59}, +{0x1.f9fffe88cd533p-1, -0x1.81bdce1fda8bp-58}, +{0x1.fdfffd50f8689p-1, 0x1.7f91acb918e6ep-55}, +{0x1.0200004292367p+0, 0x1.b7ff365324681p-54}, +{0x1.05fffe3e3d668p+0, 0x1.6fa08ddae957bp-55}, +{0x1.0a0000a85a757p+0, -0x1.7e2de80d3fb91p-58}, +{0x1.0e0001a5f3fccp+0, -0x1.1823305c5f014p-54}, +{0x1.11ffff8afbaf5p+0, -0x1.bfabb6680bac2p-55}, +{0x1.15fffe54d91adp+0, -0x1.d7f121737e7efp-54}, +{0x1.1a00011ac36e1p+0, 0x1.c000a0516f5ffp-54}, +{0x1.1e00019c84248p+0, -0x1.082fbe4da5dap-54}, +{0x1.220000ffe5e6ep+0, -0x1.8fdd04c9cfb43p-55}, +{0x1.26000269fd891p+0, 0x1.cfe2a7994d182p-55}, +{0x1.2a00029a6e6dap+0, -0x1.00273715e8bc5p-56}, +{0x1.2dfffe0293e39p+0, 0x1.b7c39dab2a6f9p-54}, +{0x1.31ffff7dcf082p+0, 0x1.df1336edc5254p-56}, +{0x1.35ffff05a8b6p+0, -0x1.e03564ccd31ebp-54}, +{0x1.3a0002e0eaeccp+0, 0x1.5f0e74bd3a477p-56}, +{0x1.3e000043bb236p+0, 0x1.c7dcb149d8833p-54}, +{0x1.4200002d187ffp+0, 0x1.e08afcf2d3d28p-56}, +{0x1.460000d387cb1p+0, 0x1.20837856599a6p-55}, +{0x1.4a00004569f89p+0, -0x1.9fa5c904fbcd2p-55}, +{0x1.4e000043543f3p+0, -0x1.81125ed175329p-56}, +{0x1.51fffcc027f0fp+0, 0x1.883d8847754dcp-54}, +{0x1.55ffffd87b36fp+0, -0x1.709e731d02807p-55}, +{0x1.59ffff21df7bap+0, 0x1.7f79f68727b02p-55}, +{0x1.5dfffebfc3481p+0, -0x1.180902e30e93ep-54}, +}, +#endif +}; diff --git a/sw/math/src/math/log2_data.h b/sw/math/src/math/log2_data.h new file mode 100644 index 000000000..276a786d1 --- /dev/null +++ b/sw/math/src/math/log2_data.h @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2018, Arm Limited. + * SPDX-License-Identifier: MIT + */ +#ifndef _LOG2_DATA_H +#define _LOG2_DATA_H + +#include + +#define LOG2_TABLE_BITS 6 +#define LOG2_POLY_ORDER 7 +#define LOG2_POLY1_ORDER 11 +extern hidden const struct log2_data { + double invln2hi; + double invln2lo; + double poly[LOG2_POLY_ORDER - 1]; + double poly1[LOG2_POLY1_ORDER - 1]; + struct { + double invc, logc; + } tab[1 << LOG2_TABLE_BITS]; +#if !__FP_FAST_FMA + struct { + double chi, clo; + } tab2[1 << LOG2_TABLE_BITS]; +#endif +} __log2_data; + +#endif diff --git a/sw/math/src/math/log2f.c b/sw/math/src/math/log2f.c new file mode 100644 index 000000000..c368f88f3 --- /dev/null +++ b/sw/math/src/math/log2f.c @@ -0,0 +1,72 @@ +/* + * Single-precision log2 function. + * + * Copyright (c) 2017-2018, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include +#include +#include "libm.h" +#include "log2f_data.h" + +/* +LOG2F_TABLE_BITS = 4 +LOG2F_POLY_ORDER = 4 + +ULP error: 0.752 (nearest rounding.) +Relative error: 1.9 * 2^-26 (before rounding.) +*/ + +#define N (1 << LOG2F_TABLE_BITS) +#define T __log2f_data.tab +#define A __log2f_data.poly +#define OFF 0x3f330000 + +float log2f(float x) +{ + double_t z, r, r2, p, y, y0, invc, logc; + uint32_t ix, iz, top, tmp; + int k, i; + + ix = asuint(x); + /* Fix sign of zero with downward rounding when x==1. */ + if (WANT_ROUNDING && predict_false(ix == 0x3f800000)) + return 0; + if (predict_false(ix - 0x00800000 >= 0x7f800000 - 0x00800000)) { + /* x < 0x1p-126 or inf or nan. */ + if (ix * 2 == 0) + return __math_divzerof(1); + if (ix == 0x7f800000) /* log2(inf) == inf. */ + return x; + if ((ix & 0x80000000) || ix * 2 >= 0xff000000) + return __math_invalidf(x); + /* x is subnormal, normalize it. */ + ix = asuint(x * 0x1p23f); + ix -= 23 << 23; + } + + /* x = 2^k z; where z is in range [OFF,2*OFF] and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. */ + tmp = ix - OFF; + i = (tmp >> (23 - LOG2F_TABLE_BITS)) % N; + top = tmp & 0xff800000; + iz = ix - top; + k = (int32_t)tmp >> 23; /* arithmetic shift */ + invc = T[i].invc; + logc = T[i].logc; + z = (double_t)asfloat(iz); + + /* log2(x) = log1p(z/c-1)/ln2 + log2(c) + k */ + r = z * invc - 1; + y0 = logc + (double_t)k; + + /* Pipelined polynomial evaluation to approximate log1p(r)/ln2. */ + r2 = r * r; + y = A[1] * r + A[2]; + y = A[0] * r2 + y; + p = A[3] * r + y0; + y = y * r2 + p; + return eval_as_float(y); +} diff --git a/sw/math/src/math/log2f_data.c b/sw/math/src/math/log2f_data.c new file mode 100644 index 000000000..24e450f1e --- /dev/null +++ b/sw/math/src/math/log2f_data.c @@ -0,0 +1,33 @@ +/* + * Data definition for log2f. + * + * Copyright (c) 2017-2018, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include "log2f_data.h" + +const struct log2f_data __log2f_data = { + .tab = { + { 0x1.661ec79f8f3bep+0, -0x1.efec65b963019p-2 }, + { 0x1.571ed4aaf883dp+0, -0x1.b0b6832d4fca4p-2 }, + { 0x1.49539f0f010bp+0, -0x1.7418b0a1fb77bp-2 }, + { 0x1.3c995b0b80385p+0, -0x1.39de91a6dcf7bp-2 }, + { 0x1.30d190c8864a5p+0, -0x1.01d9bf3f2b631p-2 }, + { 0x1.25e227b0b8eap+0, -0x1.97c1d1b3b7afp-3 }, + { 0x1.1bb4a4a1a343fp+0, -0x1.2f9e393af3c9fp-3 }, + { 0x1.12358f08ae5bap+0, -0x1.960cbbf788d5cp-4 }, + { 0x1.0953f419900a7p+0, -0x1.a6f9db6475fcep-5 }, + { 0x1p+0, 0x0p+0 }, + { 0x1.e608cfd9a47acp-1, 0x1.338ca9f24f53dp-4 }, + { 0x1.ca4b31f026aap-1, 0x1.476a9543891bap-3 }, + { 0x1.b2036576afce6p-1, 0x1.e840b4ac4e4d2p-3 }, + { 0x1.9c2d163a1aa2dp-1, 0x1.40645f0c6651cp-2 }, + { 0x1.886e6037841edp-1, 0x1.88e9c2c1b9ff8p-2 }, + { 0x1.767dcf5534862p-1, 0x1.ce0a44eb17bccp-2 }, + }, + .poly = { + -0x1.712b6f70a7e4dp-2, 0x1.ecabf496832ep-2, -0x1.715479ffae3dep-1, + 0x1.715475f35c8b8p0, + } +}; diff --git a/sw/math/src/math/log2f_data.h b/sw/math/src/math/log2f_data.h new file mode 100644 index 000000000..4fa489560 --- /dev/null +++ b/sw/math/src/math/log2f_data.h @@ -0,0 +1,19 @@ +/* + * Copyright (c) 2017-2018, Arm Limited. + * SPDX-License-Identifier: MIT + */ +#ifndef _LOG2F_DATA_H +#define _LOG2F_DATA_H + +#include + +#define LOG2F_TABLE_BITS 4 +#define LOG2F_POLY_ORDER 4 +extern hidden const struct log2f_data { + struct { + double invc, logc; + } tab[1 << LOG2F_TABLE_BITS]; + double poly[LOG2F_POLY_ORDER]; +} __log2f_data; + +#endif diff --git a/sw/math/src/math/sqrt.c b/sw/math/src/math/sqrt.c new file mode 100644 index 000000000..5ba265596 --- /dev/null +++ b/sw/math/src/math/sqrt.c @@ -0,0 +1,158 @@ +#include +#include +#include "libm.h" +#include "sqrt_data.h" + +#define FENV_SUPPORT 1 + +/* returns a*b*2^-32 - e, with error 0 <= e < 1. */ +static inline uint32_t mul32(uint32_t a, uint32_t b) +{ + return (uint64_t)a*b >> 32; +} + +/* returns a*b*2^-64 - e, with error 0 <= e < 3. */ +static inline uint64_t mul64(uint64_t a, uint64_t b) +{ + uint64_t ahi = a>>32; + uint64_t alo = a&0xffffffff; + uint64_t bhi = b>>32; + uint64_t blo = b&0xffffffff; + return ahi*bhi + (ahi*blo >> 32) + (alo*bhi >> 32); +} + +double sqrt(double x) +{ + uint64_t ix, top, m; + + /* special case handling. */ + ix = asuint64(x); + top = ix >> 52; + if (predict_false(top - 0x001 >= 0x7ff - 0x001)) { + /* x < 0x1p-1022 or inf or nan. */ + if (ix * 2 == 0) + return x; + if (ix == 0x7ff0000000000000) + return x; + if (ix > 0x7ff0000000000000) + return __math_invalid(x); + /* x is subnormal, normalize it. */ + ix = asuint64(x * 0x1p52); + top = ix >> 52; + top -= 52; + } + + /* argument reduction: + x = 4^e m; with integer e, and m in [1, 4) + m: fixed point representation [2.62] + 2^e is the exponent part of the result. */ + int even = top & 1; + m = (ix << 11) | 0x8000000000000000; + if (even) m >>= 1; + top = (top + 0x3ff) >> 1; + + /* approximate r ~ 1/sqrt(m) and s ~ sqrt(m) when m in [1,4) + + initial estimate: + 7bit table lookup (1bit exponent and 6bit significand). + + iterative approximation: + using 2 goldschmidt iterations with 32bit int arithmetics + and a final iteration with 64bit int arithmetics. + + details: + + the relative error (e = r0 sqrt(m)-1) of a linear estimate + (r0 = a m + b) is |e| < 0.085955 ~ 0x1.6p-4 at best, + a table lookup is faster and needs one less iteration + 6 bit lookup table (128b) gives |e| < 0x1.f9p-8 + 7 bit lookup table (256b) gives |e| < 0x1.fdp-9 + for single and double prec 6bit is enough but for quad + prec 7bit is needed (or modified iterations). to avoid + one more iteration >=13bit table would be needed (16k). + + a newton-raphson iteration for r is + w = r*r + u = 3 - m*w + r = r*u/2 + can use a goldschmidt iteration for s at the end or + s = m*r + + first goldschmidt iteration is + s = m*r + u = 3 - s*r + r = r*u/2 + s = s*u/2 + next goldschmidt iteration is + u = 3 - s*r + r = r*u/2 + s = s*u/2 + and at the end r is not computed only s. + + they use the same amount of operations and converge at the + same quadratic rate, i.e. if + r1 sqrt(m) - 1 = e, then + r2 sqrt(m) - 1 = -3/2 e^2 - 1/2 e^3 + the advantage of goldschmidt is that the mul for s and r + are independent (computed in parallel), however it is not + "self synchronizing": it only uses the input m in the + first iteration so rounding errors accumulate. at the end + or when switching to larger precision arithmetics rounding + errors dominate so the first iteration should be used. + + the fixed point representations are + m: 2.30 r: 0.32, s: 2.30, d: 2.30, u: 2.30, three: 2.30 + and after switching to 64 bit + m: 2.62 r: 0.64, s: 2.62, d: 2.62, u: 2.62, three: 2.62 */ + + static const uint64_t three = 0xc0000000; + uint64_t r, s, d, u, i; + + i = (ix >> 46) % 128; + r = (uint32_t)__rsqrt_tab[i] << 16; + /* |r sqrt(m) - 1| < 0x1.fdp-9 */ + s = mul32(m>>32, r); + /* |s/sqrt(m) - 1| < 0x1.fdp-9 */ + d = mul32(s, r); + u = three - d; + r = mul32(r, u) << 1; + /* |r sqrt(m) - 1| < 0x1.7bp-16 */ + s = mul32(s, u) << 1; + /* |s/sqrt(m) - 1| < 0x1.7bp-16 */ + d = mul32(s, r); + u = three - d; + r = mul32(r, u) << 1; + /* |r sqrt(m) - 1| < 0x1.3704p-29 (measured worst-case) */ + r = r << 32; + s = mul64(m, r); + d = mul64(s, r); + u = (three<<32) - d; + s = mul64(s, u); /* repr: 3.61 */ + /* -0x1p-57 < s - sqrt(m) < 0x1.8001p-61 */ + s = (s - 2) >> 9; /* repr: 12.52 */ + /* -0x1.09p-52 < s - sqrt(m) < -0x1.fffcp-63 */ + + /* s < sqrt(m) < s + 0x1.09p-52, + compute nearest rounded result: + the nearest result to 52 bits is either s or s+0x1p-52, + we can decide by comparing (2^52 s + 0.5)^2 to 2^104 m. */ + uint64_t d0, d1, d2; + double y, t; + d0 = (m << 42) - s*s; + d1 = s - d0; + d2 = d1 + s + 1; + s += d1 >> 63; + s &= 0x000fffffffffffff; + s |= top << 52; + y = asdouble(s); + if (FENV_SUPPORT) { + /* handle rounding modes and inexact exception: + only (s+1)^2 == 2^42 m case is exact otherwise + add a tiny value to cause the fenv effects. */ + uint64_t tiny = predict_false(d2==0) ? 0 : 0x0010000000000000; + tiny |= (d1^d2) & 0x8000000000000000; + t = asdouble(tiny); + y = eval_as_double(y + t); + } + return y; +} diff --git a/sw/math/src/math/sqrt_data.c b/sw/math/src/math/sqrt_data.c new file mode 100644 index 000000000..61bc22f43 --- /dev/null +++ b/sw/math/src/math/sqrt_data.c @@ -0,0 +1,19 @@ +#include "sqrt_data.h" +const uint16_t __rsqrt_tab[128] = { +0xb451,0xb2f0,0xb196,0xb044,0xaef9,0xadb6,0xac79,0xab43, +0xaa14,0xa8eb,0xa7c8,0xa6aa,0xa592,0xa480,0xa373,0xa26b, +0xa168,0xa06a,0x9f70,0x9e7b,0x9d8a,0x9c9d,0x9bb5,0x9ad1, +0x99f0,0x9913,0x983a,0x9765,0x9693,0x95c4,0x94f8,0x9430, +0x936b,0x92a9,0x91ea,0x912e,0x9075,0x8fbe,0x8f0a,0x8e59, +0x8daa,0x8cfe,0x8c54,0x8bac,0x8b07,0x8a64,0x89c4,0x8925, +0x8889,0x87ee,0x8756,0x86c0,0x862b,0x8599,0x8508,0x8479, +0x83ec,0x8361,0x82d8,0x8250,0x81c9,0x8145,0x80c2,0x8040, +0xff02,0xfd0e,0xfb25,0xf947,0xf773,0xf5aa,0xf3ea,0xf234, +0xf087,0xeee3,0xed47,0xebb3,0xea27,0xe8a3,0xe727,0xe5b2, +0xe443,0xe2dc,0xe17a,0xe020,0xdecb,0xdd7d,0xdc34,0xdaf1, +0xd9b3,0xd87b,0xd748,0xd61a,0xd4f1,0xd3cd,0xd2ad,0xd192, +0xd07b,0xcf69,0xce5b,0xcd51,0xcc4a,0xcb48,0xca4a,0xc94f, +0xc858,0xc764,0xc674,0xc587,0xc49d,0xc3b7,0xc2d4,0xc1f4, +0xc116,0xc03c,0xbf65,0xbe90,0xbdbe,0xbcef,0xbc23,0xbb59, +0xba91,0xb9cc,0xb90a,0xb84a,0xb78c,0xb6d0,0xb617,0xb560, +}; diff --git a/sw/math/src/math/sqrt_data.h b/sw/math/src/math/sqrt_data.h new file mode 100644 index 000000000..260c7f9c2 --- /dev/null +++ b/sw/math/src/math/sqrt_data.h @@ -0,0 +1,13 @@ +#ifndef _SQRT_DATA_H +#define _SQRT_DATA_H + +#include +#include + +/* if x in [1,2): i = (int)(64*x); + if x in [2,4): i = (int)(32*x-64); + __rsqrt_tab[i]*2^-16 is estimating 1/sqrt(x) with small relative error: + |__rsqrt_tab[i]*0x1p-16*sqrt(x) - 1| < -0x1.fdp-9 < 2^-8 */ +extern hidden const uint16_t __rsqrt_tab[128]; + +#endif diff --git a/sw/math/src/math/sqrtf.c b/sw/math/src/math/sqrtf.c new file mode 100644 index 000000000..740d81cba --- /dev/null +++ b/sw/math/src/math/sqrtf.c @@ -0,0 +1,83 @@ +#include +#include +#include "libm.h" +#include "sqrt_data.h" + +#define FENV_SUPPORT 1 + +static inline uint32_t mul32(uint32_t a, uint32_t b) +{ + return (uint64_t)a*b >> 32; +} + +/* see sqrt.c for more detailed comments. */ + +float sqrtf(float x) +{ + uint32_t ix, m, m1, m0, even, ey; + + ix = asuint(x); + if (predict_false(ix - 0x00800000 >= 0x7f800000 - 0x00800000)) { + /* x < 0x1p-126 or inf or nan. */ + if (ix * 2 == 0) + return x; + if (ix == 0x7f800000) + return x; + if (ix > 0x7f800000) + return __math_invalidf(x); + /* x is subnormal, normalize it. */ + ix = asuint(x * 0x1p23f); + ix -= 23 << 23; + } + + /* x = 4^e m; with int e and m in [1, 4). */ + even = ix & 0x00800000; + m1 = (ix << 8) | 0x80000000; + m0 = (ix << 7) & 0x7fffffff; + m = even ? m0 : m1; + + /* 2^e is the exponent part of the return value. */ + ey = ix >> 1; + ey += 0x3f800000 >> 1; + ey &= 0x7f800000; + + /* compute r ~ 1/sqrt(m), s ~ sqrt(m) with 2 goldschmidt iterations. */ + static const uint32_t three = 0xc0000000; + uint32_t r, s, d, u, i; + i = (ix >> 17) % 128; + r = (uint32_t)__rsqrt_tab[i] << 16; + /* |r*sqrt(m) - 1| < 0x1p-8 */ + s = mul32(m, r); + /* |s/sqrt(m) - 1| < 0x1p-8 */ + d = mul32(s, r); + u = three - d; + r = mul32(r, u) << 1; + /* |r*sqrt(m) - 1| < 0x1.7bp-16 */ + s = mul32(s, u) << 1; + /* |s/sqrt(m) - 1| < 0x1.7bp-16 */ + d = mul32(s, r); + u = three - d; + s = mul32(s, u); + /* -0x1.03p-28 < s/sqrt(m) - 1 < 0x1.fp-31 */ + s = (s - 1)>>6; + /* s < sqrt(m) < s + 0x1.08p-23 */ + + /* compute nearest rounded result. */ + uint32_t d0, d1, d2; + float y, t; + d0 = (m << 16) - s*s; + d1 = s - d0; + d2 = d1 + s + 1; + s += d1 >> 31; + s &= 0x007fffff; + s |= ey; + y = asfloat(s); + if (FENV_SUPPORT) { + /* handle rounding and inexact exception. */ + uint32_t tiny = predict_false(d2==0) ? 0 : 0x01000000; + tiny |= (d1^d2) & 0x80000000; + t = asfloat(tiny); + y = eval_as_float(y + t); + } + return y; +} diff --git a/sw/math/src/math/tanh.c b/sw/math/src/math/tanh.c index 20d6dbcf4..2481db1dc 100644 --- a/sw/math/src/math/tanh.c +++ b/sw/math/src/math/tanh.c @@ -6,16 +6,23 @@ */ double tanh(double x) { - union {double f; uint64_t i;} u = {.f = x}; uint32_t w; int sign; double_t t; /* x = |x| */ - sign = u.i >> 63; - u.i &= (uint64_t)-1/2; - x = u.f; - w = u.i >> 32; + /// Original implementation + // union {double f; uint64_t i;} u = {.f = x}; + // sign = u.i >> 63; + // u.i &= (uint64_t)-1/2; + // x = u.f; + // w = u.i >> 32; + /// Safe implementation in Snitch + uint32_t upper_32b_x = safe_extract_upper_32b_from_double(x); + sign = upper_32b_x >> 31; + uint32_t sign_mask = (~(1 << 31)); + w = upper_32b_x & sign_mask; + safe_inject_into_upper_32b_double(w, &x); if (w > 0x3fe193ea) { /* |x| > log(3)/2 ~= 0.5493 or nan */ diff --git a/target/snitch_cluster/sw/Makefile b/target/snitch_cluster/sw/Makefile index 9badf70ea..a0115d00a 100644 --- a/target/snitch_cluster/sw/Makefile +++ b/target/snitch_cluster/sw/Makefile @@ -13,21 +13,19 @@ else RUNTIME = runtime/rtl endif -MATH = ../../../sw/math - -SUBDIRS = runtime/banshee runtime/rtl $(MATH) apps tests +SUBDIRS = runtime/banshee runtime/rtl math apps tests .PHONY: all $(SUBDIRS) all: $(SUBDIRS) # Explicit dependency of apps on runtime -apps: $(RUNTIME) $(MATH) +apps: $(RUNTIME) math $(MAKE) -C $@ TARGET=$(TARGET) # Explicit dependency of tests on runtime -tests: $(RUNTIME) $(MATH) +tests: $(RUNTIME) math $(MAKE) -C $@ $(TARGET) -runtime/rtl runtime/banshee $(MATH): +runtime/rtl runtime/banshee math: $(MAKE) -C $@ $(TARGET) diff --git a/target/snitch_cluster/sw/apps/common.mk b/target/snitch_cluster/sw/apps/common.mk index d8b0659a4..8e1950860 100644 --- a/target/snitch_cluster/sw/apps/common.mk +++ b/target/snitch_cluster/sw/apps/common.mk @@ -22,6 +22,7 @@ RISCV_CFLAGS += -DBIST else RUNTIME_DIR := $(ROOT)/target/snitch_cluster/sw/runtime/rtl endif +MATH_DIR := $(ROOT)/target/snitch_cluster/sw/math # Paths relative to the app including this Makefile BUILDDIR = $(abspath build) @@ -37,18 +38,13 @@ INCDIRS += $(SNRT_DIR)/api/omp INCDIRS += $(SNRT_DIR)/src INCDIRS += $(SNRT_DIR)/src/omp INCDIRS += $(ROOT)/sw/deps/riscv-opcodes - -# Math library override -INCDIRS += $(ROOT)/sw/math/arch/riscv64/bits/ -INCDIRS += $(ROOT)/sw/math/arch/generic -INCDIRS += $(ROOT)/sw/math/src/include -INCDIRS += $(ROOT)/sw/math/src/internal -INCDIRS += $(ROOT)/sw/math/include/bits INCDIRS += $(ROOT)/sw/math/include RISCV_LDFLAGS += -L$(abspath $(RUNTIME_DIR)) RISCV_LDFLAGS += -T$(abspath $(SNRT_DIR)/base.ld) RISCV_LDFLAGS += -L$(abspath $(RUNTIME_DIR)/build/) +RISCV_LDFLAGS += -L$(abspath $(MATH_DIR)/build/) +RISCV_LDFLAGS += -lmath RISCV_LDFLAGS += -lsnRuntime ########### diff --git a/target/snitch_cluster/sw/math/Makefile b/target/snitch_cluster/sw/math/Makefile new file mode 100644 index 000000000..d0a83e86a --- /dev/null +++ b/target/snitch_cluster/sw/math/Makefile @@ -0,0 +1,8 @@ +# Copyright 2023 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +include ../toolchain.mk +include ../../../../sw/math/Makefile diff --git a/target/snitch_cluster/sw/toolchain.mk b/target/snitch_cluster/sw/toolchain.mk index 4fa0fc5af..ca5f6a482 100644 --- a/target/snitch_cluster/sw/toolchain.mk +++ b/target/snitch_cluster/sw/toolchain.mk @@ -34,6 +34,7 @@ RISCV_CFLAGS += -mcmodel=medany # RISCV_CFLAGS += -mno-fdiv # Not supported by Clang RISCV_CFLAGS += -ffast-math RISCV_CFLAGS += -fno-builtin-printf +RISCV_CFLAGS += -fno-builtin-sqrtf RISCV_CFLAGS += -fno-common RISCV_CFLAGS += -fopenmp RISCV_CFLAGS += -ftls-model=local-exec