From 44e901cc7e3d21bcd410e6a2960e16bcf741fc3d Mon Sep 17 00:00:00 2001
From: Amin Abdulrahman <amin@abdulrahman.de>
Date: Fri, 15 Mar 2024 12:18:15 +0100
Subject: [PATCH 01/32] Init dilithium3 stack optimized variant

---
 crypto_sign/dilithium3/m4fstack/api.h         |  26 +
 crypto_sign/dilithium3/m4fstack/config.h      |   7 +
 crypto_sign/dilithium3/m4fstack/macros.i      | 191 ++++
 crypto_sign/dilithium3/m4fstack/macros_fnt.i  | 158 ++++
 crypto_sign/dilithium3/m4fstack/ntt.S         | 402 +++++++++
 crypto_sign/dilithium3/m4fstack/ntt.h         |  13 +
 crypto_sign/dilithium3/m4fstack/packing.c     | 286 ++++++
 crypto_sign/dilithium3/m4fstack/packing.h     |  55 ++
 crypto_sign/dilithium3/m4fstack/params.h      |  83 ++
 .../dilithium3/m4fstack/pointwise_mont.h      |  13 +
 .../dilithium3/m4fstack/pointwise_mont.s      | 128 +++
 crypto_sign/dilithium3/m4fstack/poly.c        | 851 ++++++++++++++++++
 crypto_sign/dilithium3/m4fstack/poly.h        |  82 ++
 crypto_sign/dilithium3/m4fstack/polyvec.c     | 429 +++++++++
 crypto_sign/dilithium3/m4fstack/polyvec.h     |  99 ++
 crypto_sign/dilithium3/m4fstack/reduce.h      |  29 +
 crypto_sign/dilithium3/m4fstack/rounding.c    | 102 +++
 crypto_sign/dilithium3/m4fstack/rounding.h    |  19 +
 crypto_sign/dilithium3/m4fstack/sign.c        | 352 ++++++++
 crypto_sign/dilithium3/m4fstack/sign.h        |  37 +
 crypto_sign/dilithium3/m4fstack/smallntt.S    | 837 +++++++++++++++++
 crypto_sign/dilithium3/m4fstack/smallntt.h    |  53 ++
 crypto_sign/dilithium3/m4fstack/smallpoly.c   |  84 ++
 crypto_sign/dilithium3/m4fstack/smallpoly.h   |  39 +
 .../dilithium3/m4fstack/symmetric-shake.c     |  28 +
 crypto_sign/dilithium3/m4fstack/symmetric.h   |  65 ++
 crypto_sign/dilithium3/m4fstack/vector.h      |  20 +
 crypto_sign/dilithium3/m4fstack/vector.s      | 210 +++++
 28 files changed, 4698 insertions(+)
 create mode 100644 crypto_sign/dilithium3/m4fstack/api.h
 create mode 100644 crypto_sign/dilithium3/m4fstack/config.h
 create mode 100644 crypto_sign/dilithium3/m4fstack/macros.i
 create mode 100644 crypto_sign/dilithium3/m4fstack/macros_fnt.i
 create mode 100644 crypto_sign/dilithium3/m4fstack/ntt.S
 create mode 100644 crypto_sign/dilithium3/m4fstack/ntt.h
 create mode 100644 crypto_sign/dilithium3/m4fstack/packing.c
 create mode 100644 crypto_sign/dilithium3/m4fstack/packing.h
 create mode 100644 crypto_sign/dilithium3/m4fstack/params.h
 create mode 100644 crypto_sign/dilithium3/m4fstack/pointwise_mont.h
 create mode 100644 crypto_sign/dilithium3/m4fstack/pointwise_mont.s
 create mode 100644 crypto_sign/dilithium3/m4fstack/poly.c
 create mode 100644 crypto_sign/dilithium3/m4fstack/poly.h
 create mode 100644 crypto_sign/dilithium3/m4fstack/polyvec.c
 create mode 100644 crypto_sign/dilithium3/m4fstack/polyvec.h
 create mode 100644 crypto_sign/dilithium3/m4fstack/reduce.h
 create mode 100644 crypto_sign/dilithium3/m4fstack/rounding.c
 create mode 100644 crypto_sign/dilithium3/m4fstack/rounding.h
 create mode 100644 crypto_sign/dilithium3/m4fstack/sign.c
 create mode 100644 crypto_sign/dilithium3/m4fstack/sign.h
 create mode 100644 crypto_sign/dilithium3/m4fstack/smallntt.S
 create mode 100644 crypto_sign/dilithium3/m4fstack/smallntt.h
 create mode 100644 crypto_sign/dilithium3/m4fstack/smallpoly.c
 create mode 100644 crypto_sign/dilithium3/m4fstack/smallpoly.h
 create mode 100644 crypto_sign/dilithium3/m4fstack/symmetric-shake.c
 create mode 100644 crypto_sign/dilithium3/m4fstack/symmetric.h
 create mode 100644 crypto_sign/dilithium3/m4fstack/vector.h
 create mode 100644 crypto_sign/dilithium3/m4fstack/vector.s

diff --git a/crypto_sign/dilithium3/m4fstack/api.h b/crypto_sign/dilithium3/m4fstack/api.h
new file mode 100644
index 00000000..a289632c
--- /dev/null
+++ b/crypto_sign/dilithium3/m4fstack/api.h
@@ -0,0 +1,26 @@
+#ifndef API_H
+#define API_H
+
+#include <stddef.h>
+#include <stdint.h>
+#include "params.h"
+// Presumably key generation: pk and sk are both non-const (output) pointers -- confirm in sign.c.
+int crypto_sign_keypair(uint8_t *pk, uint8_t *sk);
+// Writes through sig and siglen; m (mlen bytes) and sk are read-only inputs.
+int crypto_sign_signature(uint8_t *sig, size_t *siglen,
+                          const uint8_t *m, size_t mlen,
+                          const uint8_t *sk);
+// Writes through sm and smlen; m (mlen bytes) and sk are read-only inputs.
+int crypto_sign(uint8_t *sm, size_t *smlen,
+                const uint8_t *m, size_t mlen,
+                const uint8_t *sk);
+// All pointers are read-only; the int return is the only output (presumably 0 on success -- TODO confirm).
+int crypto_sign_verify(const uint8_t *sig, size_t siglen,
+                       const uint8_t *m, size_t mlen,
+                       const uint8_t *pk);
+// Writes through m and mlen; sm (smlen bytes) and pk are read-only inputs.
+int crypto_sign_open(uint8_t *m, size_t *mlen,
+                     const uint8_t *sm, size_t smlen,
+                     const uint8_t *pk);
+
+#endif /* API_H */
diff --git a/crypto_sign/dilithium3/m4fstack/config.h b/crypto_sign/dilithium3/m4fstack/config.h
new file mode 100644
index 00000000..55724079
--- /dev/null
+++ b/crypto_sign/dilithium3/m4fstack/config.h
@@ -0,0 +1,7 @@
+#ifndef CONFIG_H
+#define CONFIG_H
+// Build-time configuration for the dilithium3/m4fstack implementation.
+#define DILITHIUM_MODE 3
+// #define SIGN_STACKSTRATEGY 2  (left disabled; NOTE(review): confirm whether this variant still reads it)
+
+#endif /* CONFIG_H */
diff --git a/crypto_sign/dilithium3/m4fstack/macros.i b/crypto_sign/dilithium3/m4fstack/macros.i
new file mode 100644
index 00000000..25d98c2b
--- /dev/null
+++ b/crypto_sign/dilithium3/m4fstack/macros.i
@@ -0,0 +1,191 @@
+#ifndef MACROS_I
+#define MACROS_I
+// 3 instructions. Montgomery multiplication: a <- high word of (a*b + m*Q), m = lo32(a*b)*Qprime. Clobbers tmp, tmp2.
+.macro montgomery_mul_32 a, b, Qprime, Q, tmp, tmp2
+    smull \tmp, \a, \a, \b       // signed 64-bit product a*b: low word in tmp, high word in a
+    mul \tmp2, \tmp, \Qprime     // m = low word * Qprime, truncated to 32 bits
+    smlal \tmp, \a, \tmp2, \Q    // accumulate m*Q onto the 64-bit value; the result is the high word left in a
+.endm
+
+// 2 instructions. In-place butterfly: (c0, c1) <- (c0 + c1, c0 - c1)
+.macro addSub1 c0, c1
+    add.w \c0, \c1
+    sub.w \c1, \c0, \c1, lsl #1    // (c0 + c1) - 2*c1 = old c0 - c1
+.endm
+
+// 3 -- NOTE(review): the body has 4 instructions; confirm what this count measures. Two addSub1 butterflies: (c0,c1), (c2,c3).
+.macro addSub2 c0, c1, c2, c3
+    add \c0, \c1
+    add \c2, \c3
+    sub.w \c1, \c0, \c1, lsl #1    // old c0 - c1
+    sub.w \c3, \c2, \c3, lsl #1    // old c2 - c3
+.endm
+
+// 6 -- NOTE(review): the body has 8 instructions; confirm what this count measures. Four addSub1 butterflies on (c0,c1) (c2,c3) (c4,c5) (c6,c7).
+.macro addSub4 c0, c1, c2, c3, c4, c5, c6, c7
+    add \c0, \c1
+    add \c2, \c3
+    add \c4, \c5
+    add \c6, \c7
+    sub.w \c1, \c0, \c1, lsl #1    // each sub recovers old even - old odd from the fresh sum
+    sub.w \c3, \c2, \c3, lsl #1
+    sub.w \c5, \c4, \c5, lsl #1
+    sub.w \c7, \c6, \c7, lsl #1
+.endm
+// Two butterfly layers on four coefficients; the odd input of each pair is Montgomery-multiplied by its zeta before the add/sub.
+.macro _2_layer_CT_32 c0, c1, c2, c3, zeta0, zeta1, zeta2, Qprime, Q, tmp, tmp2
+    montgomery_mul_32 \c2, \zeta0, \Qprime, \Q, \tmp, \tmp2
+    montgomery_mul_32 \c3, \zeta0, \Qprime, \Q, \tmp, \tmp2
+    addSub2 \c0, \c2, \c1, \c3    // layer 1: pairs (c0,c2) and (c1,c3), both with zeta0
+
+    montgomery_mul_32 \c1, \zeta1, \Qprime, \Q, \tmp, \tmp2
+    montgomery_mul_32 \c3, \zeta2, \Qprime, \Q, \tmp, \tmp2
+    addSub2 \c0, \c1, \c2, \c3    // layer 2: pair (c0,c1) with zeta1, pair (c2,c3) with zeta2
+.endm
+// Two butterfly layers on four coefficients with the pairing order reversed relative to _2_layer_CT_32 (adjacent pairs first).
+.macro _2_layer_inv_CT_32 c0, c1, c2, c3, zeta0, zeta1, zeta2, Qprime, Q, tmp, tmp2
+    montgomery_mul_32 \c1, \zeta0, \Qprime, \Q, \tmp, \tmp2
+    montgomery_mul_32 \c3, \zeta0, \Qprime, \Q, \tmp, \tmp2
+    addSub2 \c0, \c1, \c2, \c3    // layer 1: pairs (c0,c1) and (c2,c3), both with zeta0
+
+    montgomery_mul_32 \c2, \zeta1, \Qprime, \Q, \tmp, \tmp2
+    montgomery_mul_32 \c3, \zeta2, \Qprime, \Q, \tmp, \tmp2
+    addSub2 \c0, \c2, \c1, \c3    // layer 2: pair (c0,c2) with zeta1, pair (c1,c3) with zeta2
+.endm
+// Three butterfly layers on eight coefficients. xi0-xi6 are copied with vmov into the core register twiddle before use (callers pass s-registers).
+.macro _3_layer_CT_32 c0, c1, c2, c3, c4, c5, c6, c7, xi0, xi1, xi2, xi3, xi4, xi5, xi6, twiddle, Qprime, Q, tmp, tmp2
+    vmov.w \twiddle, \xi0
+    montgomery_mul_32 \c4, \twiddle, \Qprime, \Q, \tmp, \tmp2
+    montgomery_mul_32 \c5, \twiddle, \Qprime, \Q, \tmp, \tmp2
+    montgomery_mul_32 \c6, \twiddle, \Qprime, \Q, \tmp, \tmp2
+    montgomery_mul_32 \c7, \twiddle, \Qprime, \Q, \tmp, \tmp2
+    addSub4 \c0, \c4, \c1, \c5, \c2, \c6, \c3, \c7    // layer 1: pairs (c0,c4) (c1,c5) (c2,c6) (c3,c7), all with xi0
+
+    vmov.w \twiddle, \xi1
+    montgomery_mul_32 \c2, \twiddle, \Qprime, \Q, \tmp, \tmp2
+    montgomery_mul_32 \c3, \twiddle, \Qprime, \Q, \tmp, \tmp2
+    vmov.w \twiddle, \xi2
+    montgomery_mul_32 \c6, \twiddle, \Qprime, \Q, \tmp, \tmp2
+    montgomery_mul_32 \c7, \twiddle, \Qprime, \Q, \tmp, \tmp2
+    addSub4 \c0, \c2, \c1, \c3, \c4, \c6, \c5, \c7    // layer 2: (c0,c2) (c1,c3) with xi1, (c4,c6) (c5,c7) with xi2
+
+    vmov.w \twiddle, \xi3
+    montgomery_mul_32 \c1, \twiddle, \Qprime, \Q, \tmp, \tmp2
+    vmov.w \twiddle, \xi4
+    montgomery_mul_32 \c3, \twiddle, \Qprime, \Q, \tmp, \tmp2
+    vmov.w \twiddle, \xi5
+    montgomery_mul_32 \c5, \twiddle, \Qprime, \Q, \tmp, \tmp2
+    vmov.w \twiddle, \xi6
+    montgomery_mul_32 \c7, \twiddle, \Qprime, \Q, \tmp, \tmp2
+    addSub4 \c0, \c1, \c2, \c3, \c4, \c5, \c6, \c7    // layer 3: adjacent pairs, one twiddle each (xi3, xi4, xi5, xi6)
+.endm
+// Three butterfly layers on eight coefficients, pairing adjacent elements first and the widest pairs last. xi0-xi6 are read via vmov.
+.macro _3_layer_inv_CT_32 c0, c1, c2, c3, c4, c5, c6, c7, xi0, xi1, xi2, xi3, xi4, xi5, xi6, twiddle, Qprime, Q, tmp, tmp2
+    vmov.w \twiddle, \xi0
+    montgomery_mul_32 \c1, \twiddle, \Qprime, \Q, \tmp, \tmp2
+    montgomery_mul_32 \c3, \twiddle, \Qprime, \Q, \tmp, \tmp2
+    montgomery_mul_32 \c5, \twiddle, \Qprime, \Q, \tmp, \tmp2
+    montgomery_mul_32 \c7, \twiddle, \Qprime, \Q, \tmp, \tmp2
+    addSub4 \c0, \c1, \c2, \c3, \c4, \c5, \c6, \c7    // layer 1: adjacent pairs, all with xi0
+
+    vmov.w \twiddle, \xi1
+    montgomery_mul_32 \c2, \twiddle, \Qprime, \Q, \tmp, \tmp2
+    montgomery_mul_32 \c6, \twiddle, \Qprime, \Q, \tmp, \tmp2
+    vmov.w \twiddle, \xi2
+    montgomery_mul_32 \c3, \twiddle, \Qprime, \Q, \tmp, \tmp2
+    montgomery_mul_32 \c7, \twiddle, \Qprime, \Q, \tmp, \tmp2
+    addSub4 \c0, \c2, \c1, \c3, \c4, \c6, \c5, \c7    // layer 2: (c0,c2) (c4,c6) with xi1, (c1,c3) (c5,c7) with xi2
+
+    vmov.w \twiddle, \xi3
+    montgomery_mul_32 \c4, \twiddle, \Qprime, \Q, \tmp, \tmp2
+    vmov.w \twiddle, \xi4
+    montgomery_mul_32 \c5, \twiddle, \Qprime, \Q, \tmp, \tmp2
+    vmov.w \twiddle, \xi5
+    montgomery_mul_32 \c6, \twiddle, \Qprime, \Q, \tmp, \tmp2
+    vmov.w \twiddle, \xi6
+    montgomery_mul_32 \c7, \twiddle, \Qprime, \Q, \tmp, \tmp2
+    addSub4 \c0, \c4, \c1, \c5, \c2, \c6, \c3, \c7    // layer 3: (c0,c4) (c1,c5) (c2,c6) (c3,c7) with xi3, xi4, xi5, xi6
+.endm
+
+/************************************************************
+* Name:         _3_layer_inv_butterfly_light_fast_first
+*
+* Description:  upper half of 3-layer inverse butterfly
+*               defined over X^8 - 1
+*
+* Input:        (c4, c1, c6, c3) = coefficients on the upper half;
+*               (xi0, xi1, xi2, xi3, xi4, xi5, xi6) =
+*               (  1,  1,  w_4,   1, w_8, w_4, w_8^3) in
+*               Montgomery domain; only xi4 and xi6 are read here
+*
+* Symbols:      R = 2^32
+*
+* Constants:    Qprime = -MOD^{-1} mod^{+-} R, Q = MOD
+* Clobbers:     c0, c1, c2, c3, twiddle, tmp, tmp2
+* Output:       (c5, c7: one Montgomery reduction of the 64-bit sum)
+*               c4 =  c4 + c1      + (c6 + c3)
+*               c5 = (c4 - c1) xi4 + (c6 - c3) xi6
+*               c6 =  c4 + c1      - (c6 + c3)
+*               c7 = (c4 - c1) xi6 + (c6 - c3) xi4
+************************************************************/
+// 15 -- NOTE(review): the expansion has 16 instructions (this count treats addSub2 as 3)
+.macro _3_layer_inv_butterfly_light_fast_first c0, c1, c2, c3, c4, c5, c6, c7, xi0, xi1, xi2, xi3, xi4, xi5, xi6, twiddle, Qprime, Q, tmp, tmp2
+    addSub2 \c4, \c1, \c6, \c3    // c4 = c4 + c1, c1 = c4 - c1, c6 = c6 + c3, c3 = c6 - c3
+    addSub1 \c4, \c6              // final c4 and c6 (sum and difference of the two sums)
+
+    vmov.w \tmp, \xi4
+    vmov.w \tmp2, \xi6
+
+    smull.w \c0, \c5, \c1, \tmp      // 64-bit accumulator c5:c0 = c1 * xi4
+    smlal.w \c0, \c5, \c3, \tmp2     //                         + c3 * xi6
+    mul.w \twiddle, \c0, \Qprime     // Montgomery factor from the low word (twiddle is scratch here)
+    smlal.w \c0, \c5, \twiddle, \Q   // reduced result is the high word, left in c5
+
+    smull.w \c2, \c7, \c1, \tmp2     // 64-bit accumulator c7:c2 = c1 * xi6
+    smlal.w \c2, \c7, \c3, \tmp      //                         + c3 * xi4
+    mul.w \twiddle, \c2, \Qprime
+    smlal.w \c2, \c7, \twiddle, \Q   // reduced result is the high word, left in c7
+.endm
+
+/************************************************************
+* Name:         _3_layer_inv_butterfly_light_fast_second
+*
+* Description:  lower half of 3-layer inverse butterfly
+*               defined over X^8 - 1, and the 2nd
+*               layer of butterflies
+*
+* Input:
+*               (c4, c5, c6, c7) = results of the upper half;
+*               (c0, c1, c2, c3) = coefficients on the lower half;
+*               (xi0, xi1, xi2, xi3, xi4, xi5, xi6) =
+*               (  1,  1,  w_4,   1, w_8, w_4, w_8^3) in
+*               Montgomery domain; only xi2 is read here
+*
+* Symbols:      R = 2^32
+*
+* Constants:    Qprime = -MOD^{-1} mod^{+-} R, Q = MOD
+*
+* Output:       (in terms of the inputs; "x xi2" is a Montgomery product)
+*               c0 = (c0 + c1) + (c2 + c3)     + c4
+*               c1 = (c0 - c1) + (c2 - c3) xi2 + c5
+*               c2 = (c0 + c1) - (c2 + c3)     + c6 xi2
+*               c3 = (c0 - c1) - (c2 - c3) xi2 + c7
+*               c4 = (c0 + c1) + (c2 + c3)     - c4
+*               c5 = (c0 - c1) + (c2 - c3) xi2 - c5
+*               c6 = (c0 + c1) - (c2 + c3)     - c6 xi2
+*               c7 = (c0 - c1) - (c2 - c3) xi2 - c7
+************************************************************/
+// 19 -- NOTE(review): the expansion has 23 instructions (this count treats addSub2 as 3 and addSub4 as 6)
+.macro _3_layer_inv_butterfly_light_fast_second c0, c1, c2, c3, c4, c5, c6, c7, xi0, xi1, xi2, xi3, xi4, xi5, xi6, twiddle, Qprime, Q, tmp, tmp2
+    addSub2 \c0, \c1, \c2, \c3    // lower half, first layer: no twiddle multiplication
+
+    vmov.w \twiddle, \xi2
+    montgomery_mul_32 \c3, \twiddle, \Qprime, \Q, \tmp, \tmp2
+    addSub2 \c0, \c2, \c1, \c3    // lower half, second layer: only the (c1,c3) pair carries xi2
+
+    montgomery_mul_32 \c6, \twiddle, \Qprime, \Q, \tmp, \tmp2    // twiddle still holds xi2
+
+    addSub4 \c0, \c4, \c1, \c5, \c2, \c6, \c3, \c7    // combine lower half with the upper-half results
+.endm
+
+#endif /* MACROS_I */
diff --git a/crypto_sign/dilithium3/m4fstack/macros_fnt.i b/crypto_sign/dilithium3/m4fstack/macros_fnt.i
new file mode 100644
index 00000000..25903e41
--- /dev/null
+++ b/crypto_sign/dilithium3/m4fstack/macros_fnt.i
@@ -0,0 +1,158 @@
+// 2 instructions: apply the instruction passed as ldrstr to c0 and c1 at offsets mem0 and mem1 from target
+.macro ldrstr2 ldrstr, target, c0, c1, mem0, mem1
+    \ldrstr \c0, [\target, \mem0]
+    \ldrstr \c1, [\target, \mem1]
+.endm
+
+// 2 instructions: like ldrstr2 with c0 at offset 0, then target is post-incremented by jump
+.macro ldrstr2jump ldrstr, target, c0, c1, mem1, jump
+    \ldrstr \c1, [\target, \mem1]
+    \ldrstr \c0, [\target], \jump    // issued last because it moves target
+.endm
+
+// 4 instructions: apply ldrstr to c0-c3 at offsets mem0-mem3 from target
+.macro ldrstr4 ldrstr, target, c0, c1, c2, c3, mem0, mem1, mem2, mem3
+    \ldrstr \c0, [\target, \mem0]
+    \ldrstr \c1, [\target, \mem1]
+    \ldrstr \c2, [\target, \mem2]
+    \ldrstr \c3, [\target, \mem3]
+.endm
+
+// 4 instructions: like ldrstr4 with c0 at offset 0, then target is post-incremented by jump
+.macro ldrstr4jump ldrstr, target, c0, c1, c2, c3, mem1, mem2, mem3, jump
+    \ldrstr \c1, [\target, \mem1]
+    \ldrstr \c2, [\target, \mem2]
+    \ldrstr \c3, [\target, \mem3]
+    \ldrstr \c0, [\target], \jump    // issued last because it moves target
+.endm
+
+// 8 instructions: apply ldrstr to c0-c7 at offsets mem0-mem7 from target (two ldrstr4 expansions)
+.macro ldrstrvec ldrstr, target, c0, c1, c2, c3, c4, c5, c6, c7, mem0, mem1, mem2, mem3, mem4, mem5, mem6, mem7
+    ldrstr4 \ldrstr, \target, \c0, \c1, \c2, \c3, \mem0, \mem1, \mem2, \mem3
+    ldrstr4 \ldrstr, \target, \c4, \c5, \c6, \c7, \mem4, \mem5, \mem6, \mem7
+.endm
+
+// 8 instructions: like ldrstrvec with c0 at offset 0, then target is post-incremented by jump
+.macro ldrstrvecjump ldrstr, target, c0, c1, c2, c3, c4, c5, c6, c7, mem1, mem2, mem3, mem4, mem5, mem6, mem7, jump
+    ldrstr4 \ldrstr, \target, \c4, \c5, \c6, \c7, \mem4, \mem5, \mem6, \mem7    // upper four first, while target is unchanged
+    ldrstr4jump \ldrstr, \target, \c0, \c1, \c2, \c3, \mem1, \mem2, \mem3, \jump
+.endm
+
+
+// (c0, c1) <- (c0 + c1, c0 - c1). NOTE(review): macros.i defines the same macro and this file has no include guard.
+.macro addSub1 c0, c1
+    add.w \c0, \c1
+    sub.w \c1, \c0, \c1, lsl #1    // (c0 + c1) - 2*c1 = old c0 - c1
+.endm
+// Two addSub1 butterflies on (c0,c1) and (c2,c3). NOTE(review): also defined in macros.i.
+.macro addSub2 c0, c1, c2, c3
+    add \c0, \c1
+    add \c2, \c3
+    sub.w \c1, \c0, \c1, lsl #1
+    sub.w \c3, \c2, \c3, lsl #1
+.endm
+// Four addSub1 butterflies on (c0,c1) (c2,c3) (c4,c5) (c6,c7). NOTE(review): also defined in macros.i.
+.macro addSub4 c0, c1, c2, c3, c4, c5, c6, c7
+    add \c0, \c1
+    add \c2, \c3
+    add \c4, \c5
+    add \c6, \c7
+    sub.w \c1, \c0, \c1, lsl #1
+    sub.w \c3, \c2, \c3, lsl #1
+    sub.w \c5, \c4, \c5, lsl #1
+    sub.w \c7, \c6, \c7, lsl #1
+.endm
+
+// 2 instructions. Barrett-style reduction: a <- a - round(a*Qbar / 2^32) * Q. Clobbers tmp.
+.macro barrett_32 a, Qbar, Q, tmp
+    smmulr.w \tmp, \a, \Qbar    // rounded high word of the signed product a*Qbar
+    mls.w \a, \tmp, \Q, \a      // a = a - tmp*Q
+.endm
+// Butterfly whose twiddle is a power of two: (c0, c1) <- (c0 + (c1 << logW), c0 - (c1 << logW)); no reduction is performed.
+.macro FNT_CT_butterfly c0, c1, logW
+    add.w \c0, \c0, \c1, lsl #\logW
+    sub.w \c1, \c0, \c1, lsl #(\logW+1)    // subtracting twice the shifted c1 from the sum gives old c0 - (c1 << logW)
+.endm
+// (c0, c1) <- (c0 - (c1 << shlv), c0 + (c1 << shlv)); no reduction is performed.
+.macro shift_subAdd c0, c1, shlv
+    sub.w \c0, \c0, \c1, lsl #(\shlv)
+    add.w \c1, \c0, \c1, lsl #(\shlv+1)    // adding twice the shifted c1 to the difference gives old c0 + (c1 << shlv)
+.endm
+// Alias: expands to shift_subAdd with the same three arguments.
+.macro FNT_CT_ibutterfly c0, c1, shlv
+    shift_subAdd \c0, \c1, \shlv
+.endm
+
+// 46
+.macro _3_layer_CT_32_FNT c0, c1, c2, c3, c4, c5, c6, c7, xi0, xi1, xi2, xi3, xi4, xi5, xi6, twiddle, Qprime, Q, tmp, tmp2
+    vmov.w \twiddle, \xi0
+
+    // layer 1: the "i,j" notes give logical coefficient indices; results rotate through tmp and c0-c7
+    // 0,4
+    mla \tmp, \c4, \twiddle, \c0    // sum = c0 + c4*twiddle, written to tmp
+    mls \c4, \c4, \twiddle, \c0     // difference = c0 - c4*twiddle; c0 is free afterwards
+
+    // 1,5
+    mla \c0, \c5, \twiddle, \c1
+    mls \c5, \c5, \twiddle, \c1
+
+    // 2,6
+    mla \c1, \c6, \twiddle, \c2
+    mls \c6, \c6, \twiddle, \c2
+
+    // 3,7
+    mla \c2, \c7, \twiddle, \c3
+    mls \c7, \c7, \twiddle, \c3
+
+    // logical 0..7 now live in: tmp, c0, c1, c2, c4, c5, c6, c7 (c3 is free and serves as Barrett scratch)
+
+    barrett_32 \tmp, \Qprime, \Q, \c3    // the Qprime argument is used as the Barrett constant Qbar here
+    barrett_32 \c0, \Qprime, \Q, \c3
+    barrett_32 \c1, \Qprime, \Q, \c3
+    barrett_32 \c2, \Qprime, \Q, \c3
+    barrett_32 \c4, \Qprime, \Q, \c3
+    barrett_32 \c5, \Qprime, \Q, \c3
+    barrett_32 \c6, \Qprime, \Q, \c3
+    barrett_32 \c7, \Qprime, \Q, \c3
+
+    vmov.w \twiddle, \xi1
+    // 0,2
+    mla \tmp2, \c1, \twiddle, \tmp
+    mls \c3, \c1, \twiddle, \tmp
+
+    // 1,3
+    mla \tmp, \c2, \twiddle, \c0
+    mls \c0, \c2, \twiddle, \c0
+
+    vmov.w \twiddle, \xi2
+
+    // 4,6
+    mla \c2, \c6, \twiddle, \c4
+    mls \c1, \c6, \twiddle, \c4
+
+    // 5,7
+    mla \c6, \c7, \twiddle, \c5
+    mls \c7, \c7, \twiddle, \c5
+
+    // logical 0..7 now live in: tmp2, tmp, c3, c0 | c2, c6, c1, c7
+
+    // 4,5
+    vmov.w \twiddle, \xi5
+    mla \c4, \c6, \twiddle, \c2
+    mls \c5, \c6, \twiddle, \c2
+
+    // 6,7
+    vmov.w \twiddle, \xi6
+    mla \c6, \c7, \twiddle, \c1
+    mls \c7, \c7, \twiddle, \c1
+
+    // 2,3
+    vmov.w \twiddle, \xi4
+    mla \c2, \c0, \twiddle, \c3
+    mls \c3, \c0, \twiddle, \c3
+
+    // 0,1 (done last so every output ends up in its own register c0..c7; no Barrett step follows layers 2 and 3)
+    vmov.w \twiddle, \xi3
+    mla \c0, \tmp, \twiddle, \tmp2
+    mls \c1, \tmp, \twiddle, \tmp2
+.endm
\ No newline at end of file
diff --git a/crypto_sign/dilithium3/m4fstack/ntt.S b/crypto_sign/dilithium3/m4fstack/ntt.S
new file mode 100644
index 00000000..bfd5f7a4
--- /dev/null
+++ b/crypto_sign/dilithium3/m4fstack/ntt.S
@@ -0,0 +1,402 @@
+// based on code by: Markus Krausz (18.03.18)
+// date 23.07.21: Now licensed under CC0 with permission of the authors.
+
+.syntax unified
+#include "macros.i"
+
+// This code uses SMULL/SMLAL (long multiplies) - which are constant time on the M4, but not on the M3.
+// Make sure that this code is never used on an M3: the SMLAD below is a DSP instruction that the M3 does not implement.
+smlad r0,r0,r0,r0
+
+// ##############################
+// ##########   NTT    ##########
+// ##############################
+
+//void pqcrystals_dilithium_ntt(int32_t p[N]); in place, eight butterfly layers done as three passes (3 + 3 + 2)
+.global pqcrystals_dilithium_ntt
+.type pqcrystals_dilithium_ntt,%function
+.align 2
+pqcrystals_dilithium_ntt:
+  //bind aliases (R1, R4, R11, R12 and R14 each carry two names, so those values are never live together)
+  ptr_p     .req R0
+  ptr_zeta  .req R1
+  zeta      .req R1
+  qinv      .req R2
+  q         .req R3
+  cntr      .req R4
+  pol4      .req R4
+  pol0      .req R5
+  pol1      .req R6
+  pol2      .req R7
+  pol3      .req R8
+  temp_h    .req R9
+  temp_l    .req R10
+  zeta0     .req R11
+  zeta1     .req R12
+  zeta2     .req R14
+  pol5     .req R11
+  pol6     .req R12
+  pol7     .req R14
+
+  //preserve registers (R14 = LR is pushed so that the final pop can load it straight into PC)
+  push {R4-R11, R14}
+  
+  //load constants, ptr
+  ldr.w qinv, inv_ntt_asm_smull_qinv  //-qinv_signed
+  ldr.w q, inv_ntt_asm_smull_q
+
+  //stage 1 - 3 (butterfly inputs are distance/4 = 128 bytes apart)
+  .equ distance, 512
+  .equ strincr, 4
+  
+  ldr ptr_zeta, =#zetas_new332
+  vldm ptr_zeta!, {s2-s8} // s2-s8 = the seven twiddles consumed by _3_layer_CT_32
+  vmov s0, ptr_zeta       // park the table pointer in s0: R1 doubles as the macro scratch register zeta
+  
+  add.w temp_l, ptr_p, #32*strincr // 32 butterflies = 16 loop passes of two (.rept 2)
+  vmov s9, temp_l                  // end address lives in s9 because the macro clobbers temp_l
+  1:
+    .rept 2
+    ldr.w pol0, [ptr_p]
+    ldr.w pol1, [ptr_p, #1*distance/4]
+    ldr.w pol2, [ptr_p, #2*distance/4]
+    ldr.w pol3, [ptr_p, #3*distance/4]
+    ldr.w pol4, [ptr_p, #4*distance/4]
+    ldr.w pol5, [ptr_p, #5*distance/4]
+    ldr.w pol6, [ptr_p, #6*distance/4]
+    ldr.w pol7, [ptr_p, #7*distance/4]
+
+    _3_layer_CT_32 pol0, pol1, pol2, pol3, pol4, pol5, pol6, pol7, s2, s3, s4, s5, s6, s7, s8, zeta, qinv, q, temp_h, temp_l
+
+    str.w pol1, [ptr_p, #1*distance/4]
+    str.w pol2, [ptr_p, #2*distance/4]
+    str.w pol3, [ptr_p, #3*distance/4]
+    str.w pol4, [ptr_p, #4*distance/4]
+    str.w pol5, [ptr_p, #5*distance/4]
+    str.w pol6, [ptr_p, #6*distance/4]
+    str.w pol7, [ptr_p, #7*distance/4]
+    str.w pol0, [ptr_p], #strincr
+    .endr
+    vmov temp_l, s9
+    cmp.w ptr_p, temp_l
+    bne 1b
+  
+  sub ptr_p, #32*4 // rewind ptr_p to the start of p
+
+  // stage 4 - 6  
+  .equ distance, 64
+  add.w temp_l, ptr_p, #8*112+8*4*4 // 8 iterations
+  vmov s9, temp_l
+  1:
+    add.w temp_l, ptr_p, #4*strincr // 4 iterations
+    vmov s10, temp_l
+    vmov ptr_zeta, s0
+    vldm ptr_zeta!, {s2-s8} // fresh set of seven twiddles for every outer iteration
+    vmov s0, ptr_zeta
+    2:
+      .rept 2
+      ldr.w pol0, [ptr_p]
+      ldr.w pol1, [ptr_p, #1*distance/4]
+      ldr.w pol2, [ptr_p, #2*distance/4]
+      ldr.w pol3, [ptr_p, #3*distance/4]
+      ldr.w pol4, [ptr_p, #4*distance/4]
+      ldr.w pol5, [ptr_p, #5*distance/4]
+      ldr.w pol6, [ptr_p, #6*distance/4]
+      ldr.w pol7, [ptr_p, #7*distance/4]
+
+      _3_layer_CT_32 pol0, pol1, pol2, pol3, pol4, pol5, pol6, pol7, s2, s3, s4, s5, s6, s7, s8, zeta, qinv, q, temp_h, temp_l
+      
+      str.w pol1, [ptr_p, #1*distance/4]
+      str.w pol2, [ptr_p, #2*distance/4]
+      str.w pol3, [ptr_p, #3*distance/4]
+      str.w pol4, [ptr_p, #4*distance/4]
+      str.w pol5, [ptr_p, #5*distance/4]
+      str.w pol6, [ptr_p, #6*distance/4]
+      str.w pol7, [ptr_p, #7*distance/4]
+      str.w pol0, [ptr_p], #4
+      .endr
+      vmov temp_l, s10
+      cmp.w ptr_p, temp_l
+      bne 2b
+
+    add.w ptr_p, #112 // the inner loop advanced 16 bytes; 16 + 112 = 128 bytes per outer iteration
+    vmov temp_l, s9
+    cmp.w ptr_p, temp_l
+    bne 1b
+  
+    sub ptr_p, #4*4*8+112*8 // rewind ptr_p by 1024 bytes to the start of p
+    vmov ptr_zeta, s0
+    //stage 7 and 8
+    add cntr, ptr_p, #1024 // 64 iterations
+    1:
+      ldr.w zeta1, [ptr_zeta, #4]  //z128,..., z254
+      ldr.w zeta2, [ptr_zeta, #8]  //z129,..., z255
+      ldr zeta0, [ptr_zeta], #12  //z64, ..., z127
+      ldr.w pol0, [ptr_p]  //1*4
+      ldr.w pol1, [ptr_p, #4]
+      ldr.w pol2, [ptr_p, #8]
+      ldr.w pol3, [ptr_p, #12] 
+
+      _2_layer_CT_32 pol0, pol1, pol2, pol3, zeta0, zeta1, zeta2, qinv, q, temp_h, temp_l
+
+      str.w pol1, [ptr_p, #4]
+      str.w pol2, [ptr_p, #8]
+      str.w pol3, [ptr_p, #12]
+      str pol0, [ptr_p], #16
+
+      cmp.w cntr, ptr_p
+      bne.w 1b
+
+    //restore registers
+    pop {R4-R11, PC}
+
+    //unbind aliases -- NOTE(review): zeta and pol4-pol7 are bound above but not released here
+    .unreq ptr_p
+    .unreq ptr_zeta
+    .unreq qinv
+    .unreq q
+    .unreq cntr
+    .unreq pol0
+    .unreq pol1
+    .unreq pol2
+    .unreq pol3
+    .unreq temp_h
+    .unreq temp_l
+    .unreq zeta0
+    .unreq zeta1
+    .unreq zeta2
+
+.ltorg
+// ##############################
+// ##########  NTT^-1  ##########
+// ##############################
+
+//void pqcrystals_dilithium_invntt_tomont(int32_t p[N]); in place, three passes (3 + 3 + 2 layers); the last pass also scales each coefficient by a table value
+.global pqcrystals_dilithium_invntt_tomont
+.type pqcrystals_dilithium_invntt_tomont,%function
+.align 2
+pqcrystals_dilithium_invntt_tomont:
+  //bind aliases (same register assignment as in pqcrystals_dilithium_ntt)
+  ptr_p     .req R0
+  ptr_zeta  .req R1
+  zeta      .req R1
+  qinv      .req R2
+  q         .req R3
+  cntr      .req R4
+  pol4      .req R4
+  pol0      .req R5
+  pol1      .req R6
+  pol2      .req R7
+  pol3      .req R8
+  temp_h    .req R9
+  temp_l    .req R10
+  zeta0     .req R11
+  zeta1     .req R12
+  zeta2     .req R14
+  pol5     .req R11
+  pol6     .req R12
+  pol7     .req R14
+
+  //preserve registers (R14 = LR is pushed so that the final pop can load it straight into PC)
+  push {R4-R11, R14}
+    
+  //load constants, ptr
+  ldr.w qinv, inv_ntt_asm_smull_qinv  //-qinv_signed
+  ldr.w q, inv_ntt_asm_smull_q
+
+  //stage 1 - 3 (eight consecutive coefficients per butterfly: distance/4 = 4 bytes apart, 32 bytes per step)
+  .equ distance, 16
+  .equ strincr, 32
+
+  ldr ptr_zeta, =#zetas_new332inv
+  vldm ptr_zeta!, {s2-s8} // s2-s8 = seven twiddles; the light macros below read only s4, s6 and s8
+  vmov s0, ptr_zeta       // park the table pointer in s0: R1 doubles as the macro scratch register zeta
+  
+  add.w temp_l, ptr_p, #32*strincr // 32 iterations
+  vmov s9, temp_l
+  1:
+    ldr.w pol4, [ptr_p, #4*distance/4]
+    ldr.w pol1, [ptr_p, #5*distance/4]
+    ldr.w pol6, [ptr_p, #6*distance/4]
+    ldr.w pol3, [ptr_p, #7*distance/4]
+    _3_layer_inv_butterfly_light_fast_first pol0, pol1, pol2, pol3, pol4, pol5, pol6, pol7, s2, s3, s4, s5, s6, s7, s8, zeta, qinv, q, temp_h, temp_l
+    
+    ldr.w pol0, [ptr_p]
+    ldr.w pol1, [ptr_p, #1*distance/4]
+    ldr.w pol2, [ptr_p, #2*distance/4]
+    ldr.w pol3, [ptr_p, #3*distance/4]
+    _3_layer_inv_butterfly_light_fast_second pol0, pol1, pol2, pol3, pol4, pol5, pol6, pol7, s2, s3, s4, s5, s6, s7, s8, zeta, qinv, q, temp_h, temp_l
+    
+    str.w pol1, [ptr_p, #1*distance/4]
+    str.w pol2, [ptr_p, #2*distance/4]
+    str.w pol3, [ptr_p, #3*distance/4]
+    str.w pol4, [ptr_p, #4*distance/4]
+    str.w pol5, [ptr_p, #5*distance/4]
+    str.w pol6, [ptr_p, #6*distance/4]
+    str.w pol7, [ptr_p, #7*distance/4]
+    str.w pol0, [ptr_p], #strincr
+    vmov temp_l, s9
+    cmp.w ptr_p, temp_l
+  bne.w 1b
+  
+  sub ptr_p, #32*strincr // rewind ptr_p to the start of p
+
+  // stage 4 - 6  
+  .equ distance, 128
+  .equ strincr, 256
+  
+  // iteration 0 (uses the light butterfly macros; iterations 1-7 below use the general _3_layer_inv_CT_32)
+  movw temp_l, #4 // NOTE(review): dead store, temp_l is overwritten by the next instruction
+  add.w temp_l, ptr_p, #4*256 // 4 iterations
+  vmov s10, temp_l
+	
+  vmov ptr_zeta, s0
+  vldm ptr_zeta!, {s2-s8}
+  vmov s0, ptr_zeta
+
+  2:
+    ldr.w pol4, [ptr_p, #4*distance/4]
+    ldr.w pol1, [ptr_p, #5*distance/4]
+    ldr.w pol6, [ptr_p, #6*distance/4]
+    ldr.w pol3, [ptr_p, #7*distance/4]
+    _3_layer_inv_butterfly_light_fast_first pol0, pol1, pol2, pol3, pol4, pol5, pol6, pol7, s2, s3, s4, s5, s6, s7, s8, zeta, qinv, q, temp_h, temp_l
+    
+    ldr.w pol0, [ptr_p]
+    ldr.w pol1, [ptr_p, #1*distance/4]
+    ldr.w pol2, [ptr_p, #2*distance/4]
+    ldr.w pol3, [ptr_p, #3*distance/4]
+    _3_layer_inv_butterfly_light_fast_second pol0, pol1, pol2, pol3, pol4, pol5, pol6, pol7, s2, s3, s4, s5, s6, s7, s8, zeta, qinv, q, temp_h, temp_l
+
+    str.w pol1, [ptr_p, #1*distance/4]
+    str.w pol2, [ptr_p, #2*distance/4]
+    str.w pol3, [ptr_p, #3*distance/4]
+    str.w pol4, [ptr_p, #4*distance/4]
+    str.w pol5, [ptr_p, #5*distance/4]
+    str.w pol6, [ptr_p, #6*distance/4]
+    str.w pol7, [ptr_p, #7*distance/4]
+    str.w pol0, [ptr_p]
+    add.w ptr_p, #strincr
+
+    vmov temp_l, s10
+    cmp.w temp_l, ptr_p
+  bne.w 2b
+
+  sub.w ptr_p, #4*256-4 // back to the start of p, plus 4 bytes: first coefficient of iteration 1
+
+  // iteration 1-7
+  add.w temp_l, ptr_p, #7*4 // 7 iterations
+  vmov s9, temp_l
+  1:
+    add.w temp_l, ptr_p, #4*strincr // 4 iterations
+    vmov s10, temp_l
+
+	  vmov ptr_zeta, s0
+    vldm ptr_zeta!, {s2-s8} // fresh set of seven twiddles for every outer iteration
+    vmov s0, ptr_zeta
+    2:     
+	    ldr.w pol0, [ptr_p]
+	    ldr.w pol1, [ptr_p, #1*distance/4]
+	    ldr.w pol2, [ptr_p, #2*distance/4]
+	    ldr.w pol3, [ptr_p, #3*distance/4]
+	    ldr.w pol4, [ptr_p, #4*distance/4]
+	    ldr.w pol5, [ptr_p, #5*distance/4]
+	    ldr.w pol6, [ptr_p, #6*distance/4]
+	    ldr.w pol7, [ptr_p, #7*distance/4]
+
+	    _3_layer_inv_CT_32 pol0, pol1, pol2, pol3, pol4, pol5, pol6, pol7, s2, s3, s4, s5, s6, s7, s8, zeta, qinv, q, temp_h, temp_l
+
+	    str.w pol1, [ptr_p, #1*distance/4]
+	    str.w pol2, [ptr_p, #2*distance/4]
+	    str.w pol3, [ptr_p, #3*distance/4]
+	    str.w pol4, [ptr_p, #4*distance/4]
+	    str.w pol5, [ptr_p, #5*distance/4]
+	    str.w pol6, [ptr_p, #6*distance/4]
+	    str.w pol7, [ptr_p, #7*distance/4]
+	    str.w pol0, [ptr_p]
+	    add.w ptr_p, #strincr
+
+      vmov temp_l, s10
+      cmp.w ptr_p, temp_l
+    bne 2b
+    sub.w ptr_p, #4*strincr-4 // net advance of 4 bytes per outer iteration
+
+    vmov temp_l, s9
+    cmp.w temp_l, ptr_p
+  bne 1b
+  
+  sub ptr_p, #8*4 // rewind ptr_p to the start of p
+  vmov ptr_zeta, s0
+  
+  //stage 7 and 8 (consumes seven table words per iteration: three butterfly twiddles, then four scaling factors)
+  .equ strincr, 4
+
+  add.w cntr, ptr_p, #64*strincr // 64 iterations 
+  vmov s9, cntr // end address kept in s9 because R4 (cntr) is reused for a table value inside the loop
+  1:
+    ldr.w zeta1, [ptr_zeta, #4]
+    ldr.w zeta2, [ptr_zeta, #8]
+    ldr zeta0, [ptr_zeta], #12
+    ldr.w pol0, [ptr_p]
+    ldr.w pol1, [ptr_p, #256]
+    ldr.w pol2, [ptr_p, #512]
+    ldr.w pol3, [ptr_p, #768]
+
+    _2_layer_inv_CT_32 pol0, pol1, pol2, pol3, zeta0, zeta1, zeta2, qinv, q, temp_h, temp_l
+
+    ldr.w zeta1, [ptr_zeta, #4]
+    ldr.w zeta2, [ptr_zeta, #8]
+    ldr.w zeta0, [ptr_zeta, #12]
+    ldr.w cntr, [ptr_zeta], #16 // factor for pol0, loaded into R4 (cntr/pol4 are not live here)
+    montgomery_mul_32 pol0, cntr, qinv, q, temp_h, temp_l
+    montgomery_mul_32 pol1, zeta1, qinv, q, temp_h, temp_l
+    montgomery_mul_32 pol2, zeta2, qinv, q, temp_h, temp_l
+    montgomery_mul_32 pol3, zeta0, qinv, q, temp_h, temp_l
+
+    str.w pol1, [ptr_p, #256]
+    str.w pol2, [ptr_p, #512]
+    str.w pol3, [ptr_p, #768]
+    str pol0, [ptr_p], #strincr
+
+    vmov cntr, s9
+    cmp.w cntr, ptr_p
+    bne.w 1b
+
+    //restore registers
+    pop {R4-R11, PC}
+
+    //unbind aliases -- NOTE(review): zeta and pol4-pol7 are bound above but not released here
+    .unreq ptr_p
+    .unreq ptr_zeta
+    .unreq qinv
+    .unreq q
+    .unreq cntr
+    .unreq pol0
+    .unreq pol1
+    .unreq pol2
+    .unreq pol3
+    .unreq temp_h
+    .unreq temp_l
+    .unreq zeta0
+    .unreq zeta1
+    .unreq zeta2
+
+.align 2
+inv_ntt_asm_smull_qinv:
+.word 0xfc7fdfff // -q^{-1} mod 2^32 (q^{-1} = 0x03802001); loaded by both NTT directions despite the inv_ prefix
+.align 2
+inv_ntt_asm_smull_q:
+.word 8380417 // q = 2^23 - 2^13 + 1
+
+.section .rodata
+
+.type zetas_new332, %object
+.align 2
+zetas_new332:
+.word 25847, -2608894, -518909, 237124, -777960, -876248, 466468, 1826347, 2725464, 1024112, 2706023, 95776, 3077325, 3530437, 2353451, -1079900, 3585928, -1661693, -3592148, -2537516, 3915439, -359251, -549488, -1119584, -3861115, -3043716, 3574422, -2867647, -2091905, 2619752, -2108549, 3539968, -300467, 2348700, -539299, 3119733, -2118186, -3859737, -1699267, -1643818, 3505694, -3821735, -2884855, -1399561, -3277672, 3507263, -2140649, -1600420, 3699596, 3111497, 1757237, -19422, 811944, 531354, 954230, 3881043, 2680103, 4010497, 280005, 3900724, -2556880, 2071892, -2797779, -3930395, 2091667, 3407706, -1528703, 2316500, 3817976, -3677745, -3342478, 2244091, -3041255, -2446433, -3562462, -1452451, 266997, 2434439, 3475950, -1235728, 3513181, 2176455, -3520352, -3759364, -1585221, -1197226, -3193378, -1257611, 900702, 1859098, 1939314, 909542, 819034, -4083598, 495491, -1613174, -1000202, -43260, -522500, -3190144, -655327, -3122442, -3157330, 2031748, 3207046, -3632928, -3556995, -525098, 126922, -768622, -3595838, 3412210, 342297, 286988, -983419, -2437823, 4108315, 2147896, 3437287, -3342277, 2715295, 1735879, 203044, -2967645, 2842341, 2691481, -3693493, -2590150, 1265009, -411027, 4055324, 1247620, -2477047, 2486353, 1595974, -671102, -3767016, 1250494, -1228525, 2635921, -3548272, -22981, -2994039, 1869119, -1308169, 1903435, -1050970, -381987, -1333058, 1237275, 1349076, -3318210, -1430225, 1852771, -451100, 1312455, -1430430, 3306115, -1962642, -3343383, -1279661, 1917081, 264944, -2546312, -1374803, 508951, 1500165, 777191, 3097992, 2235880, 3406031, 44288, -542412, -2831860, -1100098, -1671176, -1846953, 904516, -2584293, -3724270, 3958618, 594136, -3776993, -3724342, -2013608, 2432395, -8578, 2454455, -164721, 1653064, 1957272, 3369112, -3249728, 185531, -1207385, 2389356, -3183426, 162844, -210977, 1616392, 3014001, 759969, 810149, 1652634, -1316856, -3694233, -1799107, 189548, -3038916, 3523897, -3553272, 3866901, 269760, 3159746, 2213111, -975884, 
-1851402, 1717735, 472078, -2409325, -426683, 1723600, -177440, -1803090, 1910376, 1315589, -1667432, -1104333, 1341330, -260646, -3833893, 1285669, -2939036, -2235985, -1584928, -420899, -2286327, -812732, 183443, -976891, -1439742, 1612842, -3545687, -3019102, -554416, 3919660, -3881060, -48306, -1362209, -3628969, 3937738, 1400424, 3839961, -846154, 1976782
+.size zetas_new332,.-zetas_new332
+
+.type zetas_new332inv, %object
+.align 2
+zetas_new332inv:
+.word 4193792, 4193792, -25847, 4193792, 518909, -25847, 2608894, 4193792, 4193792, -25847, 4193792, 518909, -25847, 2608894, -466468, -2680103, -3111497, -280005, 19422, -4010497, -1757237, 518909, -466468, 876248, -2680103, 2884855, -3111497, -3119733, 777960, 2091905, 359251, 2108549, 1119584, -2619752, 549488, -25847, 518909, 2608894, -466468, 777960, 876248, -237124, 876248, 2884855, -3119733, 3277672, 3859737, 1399561, 2118186, 2608894, 777960, -237124, 2091905, -2353451, 359251, -1826347, -237124, -2353451, -1826347, -3585928, -1024112, 1079900, -2725464, 4193792, 4193792, -25847, 41978, 3024400, 3975713, -1225192, 2797779, -3839961, 3628969, -1711436, 3835778, 485110, -3954267, -280005, 2797779, -2071892, -2831100, -2698859, -908040, -2292170, 539299, 1430430, -1852771, -3658785, 3512212, 1859141, -1607594, -2680103, -280005, -4010497, 715005, 1483994, -1045894, -980943, -3699596, 1316856, -759969, -955715, 3677139, 3933849, 2719610, 2108549, 539299, -2348700, 1658328, -1403403, 1775852, -2460465, -3915439, -126922, 3632928, 1067023, 3847594, 4179270, 1652689, -466468, -2680103, -3111497, -2953811, -284642, 2507426, -324139, -3881043, -1341330, -1315589, 3990128, -2137097, -4109898, 4092021, 3277672, -3699596, 1600420, 1541634, 3493410, 3487504, 2497815, 2867647, 2477047, 411027, 1654972, 1326223, -2608226, -2752209, 2091905, 2108549, -2619752, 1836700, 2945615, -1908953, 729864, 3821735, -3958618, -904516, 2080615, 1555380, -3471815, -1978758, -3585928, -3915439, 2537516, -892788, -553664, -3095038, 658596, -3530437, 1585221, -2176455, 3355482, -1783485, 2780552, -3623330, 518909, -466468, 876248, -442683, 2523147, -2847660, -3683140, 2556880, 1439742, 812732, 774207, -3168108, 1877157, 3406477, 19422, -3881043, -954230, -214686, -1182619, 2453526, -2201920, 300467, 1308169, 22981, 3614022, 2136260, 1459487, -2233803, 2884855, 3277672, 1399561, 394072, -3933227, 4136064, 156486, 2140649, 3249728, -1653064, 1596950, 633578, 2722529, -554462, 1119584, 
2867647, -3574422, 1004840, 191586, 3969463, 1161373, 3592148, 1000202, 4083598, 3189243, 3561667, -3650125, 3490511, 777960, 2091905, 359251, -1829156, -3707725, -661807, 1144558, -531354, 1851402, -3159746, 1543095, -2903948, 1505516, -1500460, 3859737, 3821735, -3505694, -2413330, 3908886, -1203856, 3570263, 3043716, -2715295, -2147896, 758741, 3917553, -2414897, -1613811, -2353451, -3585928, 1079900, 990020, -719638, 2718792, 2260310, 1643818, -3097992, -508951, -783456, -2089539, 2616547, 4060031, -1024112, -3530437, -3077325, -1821861, 1920615, 3988525, 2048419, -95776, 3041255, 3677745, -971504, 2190617, 2311312, -1170082, -25847, 518909, 2608894, 1261528, -2073537, -959585, 3948120, -2071892, 3881060, 3019102, -1342633, -1115066, 3589694, -1929116, -4010497, 2556880, -3900724, 3360006, 1758630, -2306989, -1841637, -2348700, -1349076, 381987, -1699982, 3189673, 3531558, -1210546, -3111497, 19422, -1757237, 2977353, 2612035, -2718155, -1544829, 1600420, 210977, -2389356, 2052582, -2737802, 2383976, -450259, -2619752, 300467, -3539968, 1698289, -4065084, -644023, -1114140, 2537516, 3157330, 3190144, -993399, -2220524, 2920588, 252737, 876248, 2884855, -3119733, 1490985, -34731, -1212610, -3183745, -954230, 177440, 2409325, -3302554, -2390327, -2749545, 653128, 1399561, 2140649, -3507263, -3745105, -1942293, -3367121, 2734884, -3574422, 3693493, 2967645, 1393803, -2467905, 1786029, -1633410, 359251, 1119584, 549488, -2824548, -1325638, -2207625, -2601586, -3505694, 1100098, -44288, 3478676, -2457992, -1617107, 2551364, 1079900, 3592148, 1661693, 1593929, 318899, -3366475, 3118416, -3077325, -3475950, 1452451, 3772814, 1424805, -3391376, 632820, 2608894, 777960, -237124, 2062597, 4064335, 2197148, -1127864, -3900724, 1584928, -1285669, 2525341, -896437, -1915773, 1792087, -1757237, -531354, -811944, 938441, -674578, 2876837, 3959371, -3539968, 1228525, 671102, 1219592, -3853560, 2630979, -2134676, -3119733, 3859737, 2118186, -2432637, 2746655, 718593, -2353280, 
-3507263, 8578, 3724342, -34852, 1387945, 358956, 1604944, 549488, 3043716, 3861115, 1290746, 3208584, 2538711, -1442830, 1661693, -1939314, 1257611, -367371, -1308058, 264382, 2614173, -237124, -2353451, -1826347, 2050674, 592050, -138487, 2310528, -811944, 3553272, -189548, -2728561, -4168358, -79, 3844932, 2118186, 1643818, 1699267, 500408, 743398, 879633, -3105206, 3861115, 983419, -3412210, 712597, -23479, 3729381, -1010481, -1826347, -1024112, -2725464, -2361217, -1864453, 3850522, 2337144, 1699267, -264944, 3343383, 3842267, 4181974, -4032642, 3983585, -2725464, -95776, -2706023, 260345, 2526550, 2000777, 987079, -2706023, 1528703, 3930395, -3030761, -3082055, -2374824, 1836319
+.size zetas_new332inv,.-zetas_new332inv
diff --git a/crypto_sign/dilithium3/m4fstack/ntt.h b/crypto_sign/dilithium3/m4fstack/ntt.h
new file mode 100644
index 00000000..731132d5
--- /dev/null
+++ b/crypto_sign/dilithium3/m4fstack/ntt.h
@@ -0,0 +1,13 @@
+#ifndef NTT_H
+#define NTT_H
+
+#include <stdint.h>
+#include "params.h"
+
+#define ntt DILITHIUM_NAMESPACE(ntt)
+void ntt(int32_t a[N]); /* forward NTT on N coefficients; wrapped by poly_ntt() */
+
+#define invntt_tomont DILITHIUM_NAMESPACE(invntt_tomont)
+void invntt_tomont(int32_t a[N]); /* inverse NTT; wrapped by poly_invntt_tomont() */
+
+#endif /* NTT_H */
diff --git a/crypto_sign/dilithium3/m4fstack/packing.c b/crypto_sign/dilithium3/m4fstack/packing.c
new file mode 100644
index 00000000..8aaff2a3
--- /dev/null
+++ b/crypto_sign/dilithium3/m4fstack/packing.c
@@ -0,0 +1,286 @@
+#include "params.h"
+#include "packing.h"
+#include "polyvec.h"
+#include "poly.h"
+
+/*************************************************
+* Name:        pack_pk
+*
+* Description: Bit-pack public key pk = (rho, t1): rho first, then the K packed t1 polynomials.
+*
+* Arguments:   - uint8_t pk[]: output byte array of CRYPTO_PUBLICKEYBYTES bytes
+*              - const uint8_t rho[]: byte array containing rho (SEEDBYTES bytes)
+*              - const polyveck *t1: pointer to vector t1
+**************************************************/
+void pack_pk(uint8_t pk[CRYPTO_PUBLICKEYBYTES],
+             const uint8_t rho[SEEDBYTES],
+             const polyveck *t1)
+{
+  unsigned int i;
+
+  for(i = 0; i < SEEDBYTES; ++i)
+    pk[i] = rho[i];
+  pk += SEEDBYTES; /* t1 is stored directly after rho */
+
+  for(i = 0; i < K; ++i)
+    polyt1_pack(pk + i*POLYT1_PACKEDBYTES, &t1->vec[i]);
+}
+
+/*************************************************
+* Name:        unpack_pk
+*
+* Description: Unpack public key pk = (rho, t1); inverse of pack_pk.
+*
+* Arguments:   - uint8_t rho[]: output byte array for rho (SEEDBYTES bytes)
+*              - polyveck *t1: pointer to output vector t1
+*              - const uint8_t pk[]: byte array containing bit-packed pk
+**************************************************/
+void unpack_pk(uint8_t rho[SEEDBYTES],
+               polyveck *t1,
+               const uint8_t pk[CRYPTO_PUBLICKEYBYTES])
+{
+  unsigned int i;
+
+  for(i = 0; i < SEEDBYTES; ++i)
+    rho[i] = pk[i];
+  pk += SEEDBYTES; /* packed t1 starts directly after rho */
+
+  for(i = 0; i < K; ++i)
+    polyt1_unpack(&t1->vec[i], pk + i*POLYT1_PACKEDBYTES);
+}
+
+/*************************************************
+* Name:        pack_sk
+*
+* Description: Bit-pack secret key. Serialized order is rho, key, tr, s1, s2, t0.
+*
+* Arguments:   - uint8_t sk[]: output byte array of CRYPTO_SECRETKEYBYTES bytes
+*              - const uint8_t rho[]: byte array containing rho (SEEDBYTES bytes)
+*              - const uint8_t tr[]: byte array containing tr (TRBYTES bytes)
+*              - const uint8_t key[]: byte array containing key (SEEDBYTES bytes)
+*              - const polyveck *t0: pointer to vector t0
+*              - const polyvecl *s1: pointer to vector s1
+*              - const polyveck *s2: pointer to vector s2
+**************************************************/
+void pack_sk(uint8_t sk[CRYPTO_SECRETKEYBYTES],
+             const uint8_t rho[SEEDBYTES],
+             const uint8_t tr[TRBYTES],
+             const uint8_t key[SEEDBYTES],
+             const polyveck *t0,
+             const polyvecl *s1,
+             const polyveck *s2)
+{
+  unsigned int i;
+
+  for(i = 0; i < SEEDBYTES; ++i)
+    sk[i] = rho[i];
+  sk += SEEDBYTES;
+
+  for(i = 0; i < SEEDBYTES; ++i)
+    sk[i] = key[i];
+  sk += SEEDBYTES;
+
+  for(i = 0; i < TRBYTES; ++i)
+    sk[i] = tr[i];
+  sk += TRBYTES;
+
+  for(i = 0; i < L; ++i)
+    polyeta_pack(sk + i*POLYETA_PACKEDBYTES, &s1->vec[i]);
+  sk += L*POLYETA_PACKEDBYTES;
+
+  for(i = 0; i < K; ++i)
+    polyeta_pack(sk + i*POLYETA_PACKEDBYTES, &s2->vec[i]);
+  sk += K*POLYETA_PACKEDBYTES;
+
+  for(i = 0; i < K; ++i)
+    polyt0_pack(sk + i*POLYT0_PACKEDBYTES, &t0->vec[i]);
+}
+
+/*************************************************
+* Name:        unpack_sk
+*
+* Description: Unpack secret key. Fields are read in the order rho, key, tr, s1, s2, t0.
+*
+* Arguments:   - uint8_t rho[]: output byte array for rho (SEEDBYTES bytes)
+*              - uint8_t tr[]: output byte array for tr (TRBYTES bytes)
+*              - uint8_t key[]: output byte array for key (SEEDBYTES bytes)
+*              - polyveck *t0: pointer to output vector t0
+*              - smallpoly s1[]: output array of L small polynomials (s1)
+*              - smallpoly s2[]: output array of K small polynomials (s2)
+*              - const uint8_t sk[]: byte array containing bit-packed sk
+**************************************************/
+void unpack_sk(uint8_t rho[SEEDBYTES],
+               uint8_t tr[TRBYTES],
+               uint8_t key[SEEDBYTES],
+               polyveck *t0,
+               smallpoly s1[L],
+               smallpoly s2[K],
+               const uint8_t sk[CRYPTO_SECRETKEYBYTES])
+{
+  unsigned int i;
+
+  for(i = 0; i < SEEDBYTES; ++i)
+    rho[i] = sk[i];
+  sk += SEEDBYTES;
+
+  for(i = 0; i < SEEDBYTES; ++i)
+    key[i] = sk[i];
+  sk += SEEDBYTES;
+
+  for(i = 0; i < TRBYTES; ++i)
+    tr[i] = sk[i];
+  sk += TRBYTES;
+
+  for(i=0; i < L; ++i)
+    small_polyeta_unpack(&s1[i], sk + i*POLYETA_PACKEDBYTES);
+  sk += L*POLYETA_PACKEDBYTES;
+
+  for(i=0; i < K; ++i)
+    small_polyeta_unpack(&s2[i], sk + i*POLYETA_PACKEDBYTES);
+  sk += K*POLYETA_PACKEDBYTES;
+
+  for(i=0; i < K; ++i)
+    polyt0_unpack(&t0->vec[i], sk + i*POLYT0_PACKEDBYTES);
+}
+
+
+/*************************************************
+* Name:        pack_sig
+*
+* Description: Bit-pack signature sig = (c, z, h).
+*
+* Arguments:   - uint8_t sig[]: output byte array
+*              - const uint8_t *c: pointer to challenge hash of length CTILDEBYTES
+*              - const polyvecl *z: pointer to vector z
+*              - const polyveck *h: pointer to hint vector h
+**************************************************/
+void pack_sig(uint8_t sig[CRYPTO_BYTES],
+              const uint8_t c[CTILDEBYTES],
+              const polyvecl *z,
+              const polyveck *h)
+{
+  unsigned int i, j, k;
+
+  for(i=0; i < CTILDEBYTES; ++i)
+    sig[i] = c[i];
+  sig += CTILDEBYTES;
+
+  for(i = 0; i < L; ++i)
+    polyz_pack(sig + i*POLYZ_PACKEDBYTES, &z->vec[i]);
+  sig += L*POLYZ_PACKEDBYTES;
+
+  /* Encode h: OMEGA index bytes followed by K running hint counts */
+  for(i = 0; i < OMEGA + K; ++i)
+    sig[i] = 0;
+
+  k = 0; /* number of hint indices written so far */
+  for(i = 0; i < K; ++i) {
+    for(j = 0; j < N; ++j)
+      if(h->vec[i].coeffs[j] != 0)
+        sig[k++] = j; /* NOTE(review): k is not bounded by OMEGA here; presumably the caller guarantees it - confirm */
+
+    sig[OMEGA + i] = k;
+  }
+}
+
+void pack_sig_c(uint8_t sig[CRYPTO_BYTES],
+              const uint8_t c[CTILDEBYTES])
+{
+  unsigned int i;
+
+  /* Write only the challenge hash: it occupies the first CTILDEBYTES bytes of sig. */
+  for(i=0; i < CTILDEBYTES; ++i)
+    sig[i] = c[i];
+}
+
+void pack_sig_z(uint8_t sig[CRYPTO_BYTES], /* signature buffer; only the z region is written */
+              const polyvecl *z)
+{
+  unsigned int i;
+  sig += CTILDEBYTES; /* skip the challenge hash at the start of the signature */
+  for(i = 0; i < L; ++i)
+    polyz_pack(sig + i*POLYZ_PACKEDBYTES, &z->vec[i]);
+}
+
+/* Append the hint positions of one polynomial of h (row idx) to the hint region of sig. */
+void pack_sig_h(unsigned char sig[CRYPTO_BYTES],
+                const poly *h_elem,
+                const unsigned int idx,
+                unsigned int *hints_written)
+{
+  sig += CTILDEBYTES;
+  sig += L*POLYZ_PACKEDBYTES;
+
+  // Record the index of every nonzero coefficient; *hints_written is the running total across calls.
+  for (unsigned int j = 0; j < N; j++) {
+      if (h_elem->coeffs[j] != 0) {
+          sig[*hints_written] = (uint8_t)j; // NOTE(review): no check against OMEGA; presumably enforced by the caller - confirm
+          (*hints_written)++;
+      }
+  }
+  sig[OMEGA + idx] = (uint8_t)*hints_written; // running count after row idx
+}
+
+void pack_sig_h_zero(unsigned char sig[CRYPTO_BYTES], /* zero the unused hint index bytes */
+                unsigned int *hints_written) {
+    sig += CTILDEBYTES;
+    sig += L * POLYZ_PACKEDBYTES;
+    while (*hints_written < OMEGA) { /* on return *hints_written is at least OMEGA */
+        sig[*hints_written] = 0;
+        (*hints_written)++;
+    }
+}
+
+/*************************************************
+* Name:        unpack_sig
+*
+* Description: Unpack signature sig = (c, z, h). Only the hint encoding is validated here.
+*
+* Arguments:   - uint8_t *c: pointer to output challenge hash (CTILDEBYTES bytes)
+*              - polyvecl *z: pointer to output vector z
+*              - polyveck *h: pointer to output hint vector h
+*              - const uint8_t sig[]: byte array containing
+*                bit-packed signature
+*
+* Returns 1 in case of malformed signature; otherwise 0.
+**************************************************/
+int unpack_sig(uint8_t c[CTILDEBYTES],
+               polyvecl *z,
+               polyveck *h,
+               const uint8_t sig[CRYPTO_BYTES])
+{
+  unsigned int i, j, k;
+
+  for(i = 0; i < CTILDEBYTES; ++i)
+    c[i] = sig[i];
+  sig += CTILDEBYTES;
+
+  for(i = 0; i < L; ++i)
+    polyz_unpack(&z->vec[i], sig + i*POLYZ_PACKEDBYTES);
+  sig += L*POLYZ_PACKEDBYTES;
+
+  /* Decode h: sig[0..OMEGA-1] are indices, sig[OMEGA+i] is the running count after row i */
+  k = 0;
+  for(i = 0; i < K; ++i) {
+    for(j = 0; j < N; ++j)
+      h->vec[i].coeffs[j] = 0;
+
+    if(sig[OMEGA + i] < k || sig[OMEGA + i] > OMEGA) /* count must not decrease nor exceed OMEGA */
+      return 1;
+
+    for(j = k; j < sig[OMEGA + i]; ++j) {
+      /* Coefficients are ordered for strong unforgeability */
+      if(j > k && sig[j] <= sig[j-1]) return 1;
+      h->vec[i].coeffs[sig[j]] = 1; /* sig[j] is a uint8_t, so the index is at most 255 */
+    }
+
+    k = sig[OMEGA + i];
+  }
+
+  /* Extra indices are zero for strong unforgeability */
+  for(j = k; j < OMEGA; ++j)
+    if(sig[j])
+      return 1;
+
+  return 0;
+}
\ No newline at end of file
diff --git a/crypto_sign/dilithium3/m4fstack/packing.h b/crypto_sign/dilithium3/m4fstack/packing.h
new file mode 100644
index 00000000..35553545
--- /dev/null
+++ b/crypto_sign/dilithium3/m4fstack/packing.h
@@ -0,0 +1,55 @@
+#ifndef PACKING_H
+#define PACKING_H
+
+#include <stdint.h>
+#include "params.h"
+#include "polyvec.h"
+#include "smallpoly.h"
+
+#define pack_pk DILITHIUM_NAMESPACE(pack_pk)
+void pack_pk(uint8_t pk[CRYPTO_PUBLICKEYBYTES], const uint8_t rho[SEEDBYTES], const polyveck *t1);
+
+#define pack_sk DILITHIUM_NAMESPACE(pack_sk)
+void pack_sk(uint8_t sk[CRYPTO_SECRETKEYBYTES],
+             const uint8_t rho[SEEDBYTES],
+             const uint8_t tr[TRBYTES],
+             const uint8_t key[SEEDBYTES],
+             const polyveck *t0,
+             const polyvecl *s1,
+             const polyveck *s2);
+
+#define pack_sig DILITHIUM_NAMESPACE(pack_sig)
+void pack_sig(uint8_t sig[CRYPTO_BYTES], const uint8_t c[CTILDEBYTES], const polyvecl *z, const polyveck *h);
+
+#define unpack_pk DILITHIUM_NAMESPACE(unpack_pk)
+void unpack_pk(uint8_t rho[SEEDBYTES], polyveck *t1, const uint8_t pk[CRYPTO_PUBLICKEYBYTES]);
+
+#define unpack_sk DILITHIUM_NAMESPACE(unpack_sk)
+void unpack_sk(uint8_t rho[SEEDBYTES],
+               uint8_t tr[TRBYTES],
+               uint8_t key[SEEDBYTES],
+               polyveck *t0,
+               smallpoly s1[L], /* s1 and s2 are unpacked into smallpoly arrays, not polyvecl/polyveck */
+               smallpoly s2[K],
+               const uint8_t sk[CRYPTO_SECRETKEYBYTES]);
+
+#define unpack_sig DILITHIUM_NAMESPACE(unpack_sig)
+int unpack_sig(uint8_t c[CTILDEBYTES], polyvecl *z, polyveck *h, const uint8_t sig[CRYPTO_BYTES]); /* returns 1 if malformed, else 0 */
+
+#define pack_sig_c DILITHIUM_NAMESPACE(pack_sig_c)
+void pack_sig_c(uint8_t sig[CRYPTO_BYTES], const uint8_t c[CTILDEBYTES]); /* writes only the challenge hash */
+
+#define pack_sig_z DILITHIUM_NAMESPACE(pack_sig_z)
+void pack_sig_z(uint8_t sig[CRYPTO_BYTES], const polyvecl *z); /* writes only the z part */
+
+#define pack_sig_h DILITHIUM_NAMESPACE(pack_sig_h)
+void pack_sig_h(unsigned char sig[CRYPTO_BYTES],
+                const poly *h_elem,
+                const unsigned int idx,
+                unsigned int *hints_written); /* in/out: running number of hint indices written */
+
+#define pack_sig_h_zero DILITHIUM_NAMESPACE(pack_sig_h_zero)
+void pack_sig_h_zero(unsigned char sig[CRYPTO_BYTES],
+                unsigned int *hints_written); /* in/out: advanced to OMEGA while zero-filling */
+
+#endif /* PACKING_H */
diff --git a/crypto_sign/dilithium3/m4fstack/params.h b/crypto_sign/dilithium3/m4fstack/params.h
new file mode 100644
index 00000000..507de467
--- /dev/null
+++ b/crypto_sign/dilithium3/m4fstack/params.h
@@ -0,0 +1,83 @@
+#ifndef PARAMS_H
+#define PARAMS_H
+
+#include "config.h"
+
+#define DILITHIUM_NAMESPACE(s) pqcrystals_dilithium_##s /* the assembly files hard-code this prefix in their symbol names */
+
+#define SEEDBYTES 32
+#define CRHBYTES 64
+#define TRBYTES 64
+#define RNDBYTES 32
+#define N 256 /* coefficients per polynomial */
+#define Q 8380417 /* modulus */
+#define D 13
+#define ROOT_OF_UNITY 1753
+
+#if DILITHIUM_MODE == 2 /* DILITHIUM_MODE is presumably set in config.h; any other value is rejected below */
+#define K 4
+#define L 4
+#define ETA 2
+#define TAU 39
+#define BETA 78
+#define GAMMA1 (1 << 17)
+#define GAMMA2 ((Q-1)/88)
+#define OMEGA 80
+#define CTILDEBYTES 32
+
+#elif DILITHIUM_MODE == 3
+#define K 6
+#define L 5
+#define ETA 4
+#define TAU 49
+#define BETA 196
+#define GAMMA1 (1 << 19)
+#define GAMMA2 ((Q-1)/32)
+#define OMEGA 55
+#define CTILDEBYTES 48
+
+#elif DILITHIUM_MODE == 5
+#define K 8
+#define L 7
+#define ETA 2
+#define TAU 60
+#define BETA 120
+#define GAMMA1 (1 << 19)
+#define GAMMA2 ((Q-1)/32)
+#define OMEGA 75
+#define CTILDEBYTES 64
+#else
+#error "DILITHIUM_MODE must be defined as 2, 3 or 5"
+#endif
+
+#define POLYT1_PACKEDBYTES  320
+#define POLYT0_PACKEDBYTES  416
+#define POLYVECH_PACKEDBYTES (OMEGA + K)
+
+#if GAMMA1 == (1 << 17)
+#define POLYZ_PACKEDBYTES   576
+#elif GAMMA1 == (1 << 19)
+#define POLYZ_PACKEDBYTES   640
+#endif
+
+#if GAMMA2 == (Q-1)/88
+#define POLYW1_PACKEDBYTES  192
+#elif GAMMA2 == (Q-1)/32
+#define POLYW1_PACKEDBYTES  128
+#endif
+
+#if ETA == 2
+#define POLYETA_PACKEDBYTES  96
+#elif ETA == 4
+#define POLYETA_PACKEDBYTES 128
+#endif
+
+#define CRYPTO_PUBLICKEYBYTES (SEEDBYTES + K*POLYT1_PACKEDBYTES)
+#define CRYPTO_SECRETKEYBYTES (2*SEEDBYTES \
+                               + TRBYTES \
+                               + L*POLYETA_PACKEDBYTES \
+                               + K*POLYETA_PACKEDBYTES \
+                               + K*POLYT0_PACKEDBYTES)
+#define CRYPTO_BYTES (CTILDEBYTES + L*POLYZ_PACKEDBYTES + POLYVECH_PACKEDBYTES)
+
+#endif
\ No newline at end of file
diff --git a/crypto_sign/dilithium3/m4fstack/pointwise_mont.h b/crypto_sign/dilithium3/m4fstack/pointwise_mont.h
new file mode 100644
index 00000000..2647a110
--- /dev/null
+++ b/crypto_sign/dilithium3/m4fstack/pointwise_mont.h
@@ -0,0 +1,13 @@
+#ifndef POINTWISE_MONT_H
+#define POINTWISE_MONT_H
+
+#include <stdint.h>
+#include "params.h"
+
+
+#define asm_pointwise_montgomery DILITHIUM_NAMESPACE(asm_pointwise_montgomery)
+void asm_pointwise_montgomery(int32_t c[N], const int32_t a[N], const int32_t b[N]); /* c[i] = Montgomery product of a[i] and b[i] */
+#define asm_pointwise_acc_montgomery DILITHIUM_NAMESPACE(asm_pointwise_acc_montgomery)
+void asm_pointwise_acc_montgomery(int32_t c[N], const int32_t a[N], const int32_t b[N]); /* c[i] += Montgomery product of a[i] and b[i] */
+
+#endif /* POINTWISE_MONT_H */
diff --git a/crypto_sign/dilithium3/m4fstack/pointwise_mont.s b/crypto_sign/dilithium3/m4fstack/pointwise_mont.s
new file mode 100644
index 00000000..e21125d7
--- /dev/null
+++ b/crypto_sign/dilithium3/m4fstack/pointwise_mont.s
@@ -0,0 +1,128 @@
+.syntax unified
+.thumb
+// res = high 32 bits of (pa*pb + m*q), where m = low32(pa*pb) * qinv; clobbers pa and pb.
+.macro montgomery_multiplication res, pa, pb, q, qinv
+    smull \pa, \res, \pa, \pb
+    mul \pb, \pa, \qinv
+    smlal \pa, \res, \pb, \q
+.endm
+
+
+// void asm_pointwise_montgomery(int32_t c[N], const int32_t a[N], const int32_t b[N]);
+.global pqcrystals_dilithium_asm_pointwise_montgomery
+.type pqcrystals_dilithium_asm_pointwise_montgomery,%function
+.align 2
+pqcrystals_dilithium_asm_pointwise_montgomery:
+    push.w {r4-r11, r14}
+    c_ptr .req r0
+    a_ptr .req r1
+    b_ptr .req r2
+    qinv  .req r3
+    q     .req r4
+    pa0   .req r5
+    pa1   .req r6
+    pa2   .req r7
+    pb0   .req r8
+    pb1   .req r9
+    pb2   .req r10
+    tmp0  .req r11
+    ctr   .req r12
+    res   .req r14
+    // c[i] = montgomery_multiplication(a[i], b[i]) for i = 0..255; tmp0 is not used below.
+    movw qinv, #:lower16:0xfc7fdfff
+    movt qinv, #:upper16:0xfc7fdfff
+    movw q, #0xE001
+    movt q, #0x7F
+    // q = 0x7FE001 = 8380417, qinv = 0xfc7fdfff = -q^{-1} mod 2^32
+
+    // 85 iterations x 3 coefficients = 255 coefficients
+    movw ctr, #85
+    1:
+        ldr.w pa1, [a_ptr, #4]
+        ldr.w pa2, [a_ptr, #8]
+        ldr pa0, [a_ptr], #12
+        ldr.w pb1, [b_ptr, #4]
+        ldr.w pb2, [b_ptr, #8]
+        ldr pb0, [b_ptr], #12
+
+        montgomery_multiplication res, pa0, pb0, q, qinv
+        str res, [c_ptr], #4
+        montgomery_multiplication res, pa1, pb1, q, qinv
+        str res, [c_ptr], #4
+        montgomery_multiplication res, pa2, pb2, q, qinv
+        str res, [c_ptr], #4
+    subs ctr, #1
+    bne.w 1b
+
+    // final (256th) coefficient, handled outside the 3-way unrolled loop
+    ldr.w pa0, [a_ptr]
+    ldr.w pb0, [b_ptr]
+    montgomery_multiplication res, pa0, pb0, q, qinv
+    str.w res, [c_ptr]
+
+    pop.w {r4-r11, pc}
+.size pqcrystals_dilithium_asm_pointwise_montgomery, .-pqcrystals_dilithium_asm_pointwise_montgomery
+
+// void asm_pointwise_acc_montgomery(int32_t c[N], const int32_t a[N], const int32_t b[N]);
+.global pqcrystals_dilithium_asm_pointwise_acc_montgomery
+.type pqcrystals_dilithium_asm_pointwise_acc_montgomery,%function
+.align 2
+pqcrystals_dilithium_asm_pointwise_acc_montgomery:
+    push.w {r4-r11, r14}
+    c_ptr .req r0
+    a_ptr .req r1
+    b_ptr .req r2
+    qinv  .req r3
+    q     .req r4
+    pa0   .req r5
+    pa1   .req r6
+    pa2   .req r7
+    pb0   .req r8
+    pb1   .req r9
+    pb2   .req r10
+    tmp0  .req r11
+    ctr   .req r12
+    res   .req r14
+    // c[i] += montgomery_multiplication(a[i], b[i]) for i = 0..255; tmp0 is not used below.
+    movw qinv, #:lower16:0xfc7fdfff
+    movt qinv, #:upper16:0xfc7fdfff
+    movw q, #0xE001
+    movt q, #0x7F
+    // q = 0x7FE001 = 8380417, qinv = 0xfc7fdfff = -q^{-1} mod 2^32
+
+    // 85 iterations x 3 coefficients = 255 coefficients
+    movw ctr, #85
+    1:
+        ldr.w pa1, [a_ptr, #4]
+        ldr.w pa2, [a_ptr, #8]
+        ldr pa0, [a_ptr], #12
+        ldr.w pb1, [b_ptr, #4]
+        ldr.w pb2, [b_ptr, #8]
+        ldr pb0, [b_ptr], #12
+
+        montgomery_multiplication res, pa0, pb0, q, qinv
+        montgomery_multiplication pa0, pa1, pb1, q, qinv
+        montgomery_multiplication pa1, pa2, pb2, q, qinv
+        // the three products are now in res, pa0, pa1; add them onto the current c values (no reduction)
+        ldr.w pb0, [c_ptr]
+        ldr.w pb1, [c_ptr, #4]
+        ldr.w pb2, [c_ptr, #8]
+        add.w res, res, pb0
+        str res, [c_ptr], #12
+        add.w pa0, pa0, pb1
+        str pa0, [c_ptr, #-8]
+        add.w pa1, pa1, pb2
+        str pa1, [c_ptr, #-4]
+    subs ctr, #1
+    bne.w 1b
+
+    // final (256th) coefficient, handled outside the 3-way unrolled loop
+    ldr.w pa0, [a_ptr]
+    ldr.w pb0, [b_ptr]
+    ldr.w pa1, [c_ptr]
+    montgomery_multiplication res, pa0, pb0, q, qinv
+    add.w res, res, pa1
+    str.w res, [c_ptr]
+
+    pop.w {r4-r11, pc}
+.size pqcrystals_dilithium_asm_pointwise_acc_montgomery, .-pqcrystals_dilithium_asm_pointwise_acc_montgomery
diff --git a/crypto_sign/dilithium3/m4fstack/poly.c b/crypto_sign/dilithium3/m4fstack/poly.c
new file mode 100644
index 00000000..0d40fda3
--- /dev/null
+++ b/crypto_sign/dilithium3/m4fstack/poly.c
@@ -0,0 +1,851 @@
+#include <stdint.h>
+#include "params.h"
+#include "poly.h"
+#include "vector.h"
+#include "ntt.h"
+#include "pointwise_mont.h"
+#include "rounding.h"
+#include "symmetric.h"
+
+#include <stdio.h>
+#include "hal.h"
+
+#ifdef DBENCH /* optional cycle-count instrumentation */
+#include "test/cpucycles.h"
+extern const uint64_t timing_overhead;
+extern uint64_t *tred, *tadd, *tmul, *tround, *tsample, *tpack;
+#define DBENCH_START() uint64_t time = cpucycles()
+#define DBENCH_STOP(t) t += cpucycles() - time - timing_overhead
+#else /* instrumentation disabled: both hooks expand to nothing */
+#define DBENCH_START()
+#define DBENCH_STOP(t)
+#endif /* DBENCH */
+
+/*************************************************
+* Name:        poly_reduce
+*
+* Description: In-place reduction of all coefficients of the polynomial to a
+*              representative in [-6283009,6283007]; delegates to asm_reduce32().
+*
+* Arguments:   - poly *a: pointer to input/output polynomial
+**************************************************/
+void poly_reduce(poly *a) {
+  asm_reduce32(a->coeffs);
+}
+
+/*************************************************
+* Name:        poly_caddq
+*
+* Description: For all coefficients of in/out polynomial add Q if
+*              coefficient is negative; delegates to asm_caddq().
+*
+* Arguments:   - poly *a: pointer to input/output polynomial
+**************************************************/
+void poly_caddq(poly *a) {
+  asm_caddq(a->coeffs);
+}
+
+#if 0 /* poly_freeze is compiled out in this variant */
+/*************************************************
+* Name:        poly_freeze
+*
+* Description: In-place reduction of all coefficients of the polynomial to
+*              standard representatives; delegates to asm_freeze().
+*
+* Arguments:   - poly *a: pointer to input/output polynomial
+**************************************************/
+void poly_freeze(poly *a) {
+    asm_freeze(a->coeffs);
+}
+#endif /* 0 */
+
+/*************************************************
+* Name:        poly_add
+*
+* Description: Add polynomials coefficient-wise. No modular reduction is performed.
+*
+* Arguments:   - poly *c: pointer to output polynomial
+*              - const poly *a: pointer to first summand
+*              - const poly *b: pointer to second summand
+**************************************************/
+void poly_add(poly *c, const poly *a, const poly *b)  {
+  unsigned int i;
+  DBENCH_START();
+
+  for(i = 0; i < N; ++i)
+    c->coeffs[i] = a->coeffs[i] + b->coeffs[i];
+
+  DBENCH_STOP(*tadd); /* expands to nothing unless DBENCH is defined */
+}
+
+/*************************************************
+* Name:        poly_sub
+*
+* Description: Subtract polynomials coefficient-wise. No modular reduction is
+*              performed.
+*
+* Arguments:   - poly *c: pointer to output polynomial
+*              - const poly *a: pointer to first input polynomial
+*              - const poly *b: pointer to second input polynomial to be
+*                               subtracted from first input polynomial
+**************************************************/
+void poly_sub(poly *c, const poly *a, const poly *b) {
+  unsigned int i;
+  DBENCH_START();
+
+  for(i = 0; i < N; ++i)
+    c->coeffs[i] = a->coeffs[i] - b->coeffs[i];
+
+  DBENCH_STOP(*tadd); /* subtraction is accounted to the same counter as addition */
+}
+
+/*************************************************
+* Name:        poly_shiftl
+*
+* Description: Multiply polynomial by 2^D without modular reduction. Assumes
+*              input coefficients to be less than 2^{31-D} in absolute value.
+*
+* Arguments:   - poly *a: pointer to input/output polynomial
+**************************************************/
+void poly_shiftl(poly *a) {
+  unsigned int i;
+  DBENCH_START();
+
+  for(i = 0; i < N; ++i)
+    a->coeffs[i] *= (1 << D); /* multiply instead of <<=: left-shifting a negative signed value is undefined in C */
+
+  DBENCH_STOP(*tmul);
+}
+
+/*************************************************
+* Name:        poly_ntt
+*
+* Description: In-place forward NTT (wrapper around ntt()). Coefficients can grow by
+*              8*Q in absolute value.
+*
+* Arguments:   - poly *a: pointer to input/output polynomial
+**************************************************/
+void poly_ntt(poly *a) {
+  DBENCH_START();
+
+  ntt(a->coeffs);
+
+  DBENCH_STOP(*tmul); /* expands to nothing unless DBENCH is defined */
+}
+
+
+/*************************************************
+* Name:        poly_invntt_tomont
+*
+* Description: In-place inverse NTT and multiplication by 2^{32} (wrapper around
+*              invntt_tomont()). Input coefficients need to be less than Q in
+*              absolute value and output coefficients are again bounded by Q.
+*
+* Arguments:   - poly *a: pointer to input/output polynomial
+**************************************************/
+void poly_invntt_tomont(poly *a) {
+  DBENCH_START();
+
+  invntt_tomont(a->coeffs);
+
+  DBENCH_STOP(*tmul); /* expands to nothing unless DBENCH is defined */
+}
+
+
+/*************************************************
+* Name:        poly_pointwise_montgomery
+*
+* Description: Pointwise multiplication of polynomials in NTT domain
+*              representation and multiplication of resulting polynomial
+*              by 2^{-32}; delegates to asm_pointwise_montgomery().
+*
+* Arguments:   - poly *c: pointer to output polynomial
+*              - const poly *a: pointer to first input polynomial
+*              - const poly *b: pointer to second input polynomial
+**************************************************/
+void poly_pointwise_montgomery(poly *c, const poly *a, const poly *b) {
+  DBENCH_START();
+
+  asm_pointwise_montgomery(c->coeffs, a->coeffs, b->coeffs);
+
+  DBENCH_STOP(*tmul); /* expands to nothing unless DBENCH is defined */
+}
+
+/*************************************************
+* Name:        poly_pointwise_acc_montgomery
+*
+* Description: Pointwise multiplication of polynomials in NTT domain
+*              representation, multiplication of resulting polynomial
+*              by 2^{-32} and accumulation onto c (c must hold valid data on entry).
+*
+* Arguments:   - poly *c: pointer to output (accumulating) polynomial
+*              - const poly *a: pointer to first input polynomial
+*              - const poly *b: pointer to second input polynomial
+**************************************************/
+void poly_pointwise_acc_montgomery(poly *c, const poly *a, const poly *b) {
+  DBENCH_START();
+
+  asm_pointwise_acc_montgomery(c->coeffs, a->coeffs, b->coeffs);
+
+  DBENCH_STOP(*tmul); /* expands to nothing unless DBENCH is defined */
+}
+
+
+/*************************************************
+* Name:        poly_power2round
+*
+* Description: For all coefficients c of the input polynomial,
+*              compute c0, c1 such that c mod Q = c1*2^D + c0
+*              with -2^{D-1} < c0 <= 2^{D-1}. Assumes coefficients to be
+*              standard representatives.
+*
+* Arguments:   - poly *a1: pointer to output polynomial with coefficients c1
+*              - poly *a0: pointer to output polynomial with coefficients c0
+*              - const poly *a: pointer to input polynomial
+**************************************************/
+void poly_power2round(poly *a1, poly *a0, const poly *a) {
+  unsigned int i;
+  DBENCH_START();
+
+  for(i = 0; i < N; ++i)
+    a1->coeffs[i] = power2round(&a0->coeffs[i], a->coeffs[i]); /* c0 is written through the pointer, c1 is returned */
+
+  DBENCH_STOP(*tround);
+}
+
+/*************************************************
+* Name:        poly_decompose
+*
+* Description: For all coefficients c of the input polynomial,
+*              compute high and low bits c0, c1 such that c mod Q = c1*ALPHA + c0
+*              with -ALPHA/2 < c0 <= ALPHA/2 except c1 = (Q-1)/ALPHA where we
+*              set c1 = 0 and -ALPHA/2 <= c0 = c mod Q - Q < 0.
+*              Assumes coefficients to be standard representatives.
+*
+* Arguments:   - poly *a1: pointer to output polynomial with coefficients c1
+*              - poly *a0: pointer to output polynomial with coefficients c0
+*              - const poly *a: pointer to input polynomial
+**************************************************/
+void poly_decompose(poly *a1, poly *a0, const poly *a) {
+  unsigned int i;
+  DBENCH_START();
+
+  for(i = 0; i < N; ++i)
+    a1->coeffs[i] = decompose(&a0->coeffs[i], a->coeffs[i]); /* c0 is written through the pointer, c1 is returned */
+
+  DBENCH_STOP(*tround);
+}
+
+/*************************************************
+* Name:        poly_make_hint
+*
+* Description: Compute hint polynomial. The coefficients of which indicate
+*              whether the low bits of the corresponding coefficient of
+*              the input polynomial overflow into the high bits.
+*
+* Arguments:   - poly *h: pointer to output hint polynomial
+*              - const poly *a0: pointer to low part of input polynomial
+*              - const poly *a1: pointer to high part of input polynomial
+*
+* Returns number of 1 bits.
+**************************************************/
+unsigned int poly_make_hint(poly *h, const poly *a0, const poly *a1) {
+  unsigned int i, s = 0;
+  DBENCH_START();
+
+  for(i = 0; i < N; ++i) {
+    h->coeffs[i] = make_hint(a0->coeffs[i], a1->coeffs[i]);
+    s += h->coeffs[i]; /* running sum of the hint coefficients */
+  }
+
+  DBENCH_STOP(*tround);
+  return s;
+}
+
+/*************************************************
+* Name:        poly_use_hint
+*
+* Description: Use hint polynomial to correct the high bits of a polynomial, coefficient by coefficient.
+*
+* Arguments:   - poly *b: pointer to output polynomial with corrected high bits
+*              - const poly *a: pointer to input polynomial
+*              - const poly *h: pointer to input hint polynomial
+**************************************************/
+void poly_use_hint(poly *b, const poly *a, const poly *h) {
+  unsigned int i;
+  DBENCH_START();
+
+  for(i = 0; i < N; ++i)
+    b->coeffs[i] = use_hint(a->coeffs[i], h->coeffs[i]);
+
+  DBENCH_STOP(*tround); /* expands to nothing unless DBENCH is defined */
+}
+
+/*************************************************
+* Name:        poly_chknorm
+*
+* Description: Check infinity norm of polynomial against given bound.
+*              Assumes input coefficients were reduced by reduce32().
+*
+* Arguments:   - const poly *a: pointer to polynomial
+*              - int32_t B: norm bound
+*
+* Returns 0 if norm is strictly smaller than B <= (Q-1)/8 and 1 otherwise.
+**************************************************/
+int poly_chknorm(const poly *a, int32_t B) {
+  unsigned int i;
+  int32_t t;
+  DBENCH_START();
+
+  if(B > (Q-1)/8) /* bounds above (Q-1)/8 are rejected outright */
+    return 1;
+
+  /* It is ok to leak which coefficient violates the bound since
+     the probability for each coefficient is independent of secret
+     data but we must not leak the sign of the centralized representative. */
+  for(i = 0; i < N; ++i) {
+    /* Branch-free absolute value: t is a sign mask, then a - 2a = -a for negative a */
+    t = a->coeffs[i] >> 31;
+    t = a->coeffs[i] - (t & 2*a->coeffs[i]);
+
+    if(t >= B) {
+      DBENCH_STOP(*tsample);
+      return 1;
+    }
+  }
+
+  DBENCH_STOP(*tsample);
+  return 0;
+}
+
+/*************************************************
+* Name:        poly_uniform
+*
+* Description: Sample polynomial with uniformly random coefficients
+*              in [0,Q-1] by performing rejection sampling on the
+*              output of the stream128 XOF keyed with (seed, nonce).
+*
+* Arguments:   - poly *a: pointer to output polynomial
+*              - const uint8_t seed[]: byte array with seed of length SEEDBYTES
+*              - uint16_t nonce: 2-byte nonce
+**************************************************/
+#define POLY_UNIFORM_NBLOCKS ((768 + STREAM128_BLOCKBYTES - 1)/STREAM128_BLOCKBYTES)
+void poly_uniform(poly *a,
+                  const uint8_t seed[SEEDBYTES],
+                  uint16_t nonce)
+{
+  unsigned int i, ctr, off;
+  unsigned int buflen = POLY_UNIFORM_NBLOCKS*STREAM128_BLOCKBYTES;
+  uint8_t buf[POLY_UNIFORM_NBLOCKS*STREAM128_BLOCKBYTES + 2]; /* +2: room for up to two carried-over bytes */
+  stream128_state state;
+
+  stream128_init(&state, seed, nonce);
+  stream128_squeezeblocks(buf, POLY_UNIFORM_NBLOCKS, &state);
+
+  ctr = asm_rej_uniform(a->coeffs, N, buf, buflen);
+
+  while(ctr < N) {
+    off = buflen % 3; /* trailing bytes that do not fill a 3-byte group */
+    for(i = 0; i < off; ++i)
+      buf[i] = buf[buflen - off + i]; /* move them to the front, ahead of the next block */
+
+    stream128_squeezeblocks(buf + off, 1, &state);
+    buflen = STREAM128_BLOCKBYTES + off;
+    ctr += asm_rej_uniform(a->coeffs + ctr, N - ctr, buf, buflen);
+  }
+}
+
+/*************************************************
+* Name:        rej_eta
+*
+* Description: Sample uniformly random coefficients in [-ETA, ETA] by
+*              performing rejection sampling on array of random bytes.
+*
+* Arguments:   - int32_t *a: pointer to output array (allocated)
+*              - unsigned int len: number of coefficients to be sampled
+*              - const uint8_t *buf: array of random bytes
+*              - unsigned int buflen: length of array of random bytes
+*
+* Returns number of sampled coefficients. Can be smaller than len if not enough
+* random bytes were given.
+**************************************************/
+static unsigned int rej_eta(int32_t *a,
+                            unsigned int len,
+                            const uint8_t *buf,
+                            unsigned int buflen)
+{
+  unsigned int ctr, pos;
+  uint32_t t0, t1;
+  DBENCH_START();
+
+  ctr = pos = 0;
+  while(ctr < len && pos < buflen) {
+    t0 = buf[pos] & 0x0F; /* low nibble */
+    t1 = buf[pos++] >> 4; /* high nibble */
+
+#if ETA == 2
+    if(t0 < 15) {
+      t0 = t0 - (205*t0 >> 10)*5; /* t0 mod 5 for t0 < 15, without a division */
+      a[ctr++] = 2 - t0;
+    }
+    if(t1 < 15 && ctr < len) {
+      t1 = t1 - (205*t1 >> 10)*5; /* t1 mod 5 for t1 < 15, without a division */
+      a[ctr++] = 2 - t1;
+    }
+#elif ETA == 4
+    if(t0 < 9) /* accept nibbles 0..8, mapped to 4..-4 */
+      a[ctr++] = 4 - t0;
+    if(t1 < 9 && ctr < len)
+      a[ctr++] = 4 - t1;
+#endif
+  }
+
+  DBENCH_STOP(*tsample);
+  return ctr;
+}
+
+/*************************************************
+* Name:        poly_uniform_eta
+*
+* Description: Sample polynomial with uniformly random coefficients
+*              in [-ETA,ETA] by performing rejection sampling on the
+*              output of the stream256 XOF keyed with (seed, nonce).
+*
+* Arguments:   - poly *a: pointer to output polynomial
+*              - const uint8_t seed[]: byte array with seed of length CRHBYTES
+*              - uint16_t nonce: 2-byte nonce
+**************************************************/
+#if ETA == 2
+#define POLY_UNIFORM_ETA_NBLOCKS ((136 + STREAM256_BLOCKBYTES - 1)/STREAM256_BLOCKBYTES)
+#elif ETA == 4
+#define POLY_UNIFORM_ETA_NBLOCKS ((227 + STREAM256_BLOCKBYTES - 1)/STREAM256_BLOCKBYTES)
+#endif
+void poly_uniform_eta(poly *a,
+        const uint8_t seed[CRHBYTES],
+        uint16_t nonce) {
+  unsigned int ctr;
+  unsigned int buflen = POLY_UNIFORM_ETA_NBLOCKS * STREAM256_BLOCKBYTES;
+  uint8_t buf[POLY_UNIFORM_ETA_NBLOCKS * STREAM256_BLOCKBYTES];
+  stream256_state state;
+
+  stream256_init(&state, seed, nonce);
+  stream256_squeezeblocks(buf, POLY_UNIFORM_ETA_NBLOCKS, &state);
+
+  ctr = rej_eta(a->coeffs, N, buf, buflen);
+
+  while(ctr < N) { /* not enough accepted samples yet: squeeze one more block at a time */
+    stream256_squeezeblocks(buf, 1, &state);
+    ctr += rej_eta(a->coeffs + ctr, N - ctr, buf, STREAM256_BLOCKBYTES);
+  }
+}
+
+/*************************************************
+* Name:        poly_uniform_gamma1
+*
+* Description: Sample polynomial with uniformly random coefficients
+*              in [-(GAMMA1 - 1), GAMMA1] by unpacking the output of
+*              the stream256 XOF keyed with (seed, nonce) via polyz_unpack().
+*
+* Arguments:   - poly *a: pointer to output polynomial
+*              - const uint8_t seed[]: byte array with seed of length CRHBYTES
+*              - uint16_t nonce: 16-bit nonce
+**************************************************/
+#define POLY_UNIFORM_GAMMA1_NBLOCKS ((POLYZ_PACKEDBYTES + STREAM256_BLOCKBYTES - 1)/STREAM256_BLOCKBYTES)
+void poly_uniform_gamma1(poly *a,
+                         const uint8_t seed[CRHBYTES],
+                         uint16_t nonce)
+{
+  uint8_t buf[POLY_UNIFORM_GAMMA1_NBLOCKS*STREAM256_BLOCKBYTES]; /* at least POLYZ_PACKEDBYTES bytes, rounded up to whole blocks */
+  stream256_state state;
+
+  stream256_init(&state, seed, nonce);
+  stream256_squeezeblocks(buf, POLY_UNIFORM_GAMMA1_NBLOCKS, &state);
+  polyz_unpack(a, buf);
+}
+
+/*************************************************
+* Name:        poly_challenge
+*
+* Description: Implementation of H. Samples polynomial with TAU nonzero
+*              coefficients in {-1,1} using the output stream of
+*              SHAKE256(seed).
+*
+* Arguments:   - poly *c: pointer to output polynomial
+*              - const uint8_t seed[]: byte array containing seed of length SEEDBYTES
+**************************************************/
+void poly_challenge(poly *c, const uint8_t seed[SEEDBYTES]) {
+  unsigned int k, idx, offset;
+  uint64_t signbits = 0;
+  uint8_t block[SHAKE256_RATE];
+  shake256incctx xof;
+
+  shake256_inc_init(&xof);
+  shake256_inc_absorb(&xof, seed, SEEDBYTES);
+  shake256_inc_finalize(&xof);
+  shake256_inc_squeezeblocks(block, 1, &xof);
+
+  /* Bytes 0..7, little-endian, supply the sign bits; offset ends at 8. */
+  for(offset = 0; offset < 8; ++offset)
+    signbits |= (uint64_t)block[offset] << 8*offset;
+
+  for(k = 0; k < N; ++k)
+    c->coeffs[k] = 0;
+  for(k = N-TAU; k < N; ++k) {
+    /* Draw bytes until one is <= k, refilling the block as needed. */
+    for(;;) {
+      if(offset >= SHAKE256_RATE) {
+        shake256_inc_squeezeblocks(block, 1, &xof);
+        offset = 0;
+      }
+      idx = block[offset++];
+      if(idx <= k) break;
+    }
+
+    c->coeffs[k] = c->coeffs[idx];
+    c->coeffs[idx] = 1 - 2*(signbits & 1);
+    signbits >>= 1;
+  }
+}
+
+/*************************************************
+* Name:        polyeta_pack
+*
+* Description: Bit-pack polynomial with coefficients in [-ETA,ETA].
+*              Each coefficient c is stored as ETA - c, using 3 bits
+*              per value when ETA == 2 and 4 bits when ETA == 4.
+*
+* Arguments:   - uint8_t *r: pointer to output byte array with at least
+*                            POLYETA_PACKEDBYTES bytes
+*              - const poly *a: pointer to input polynomial
+**************************************************/
+void polyeta_pack(uint8_t *r, const poly *a) {
+  const int32_t *src = a->coeffs;
+  unsigned int i, j;
+  uint8_t t[8];
+  DBENCH_START();
+
+#if ETA == 2
+  /* Eight 3-bit values are squeezed into every three output bytes. */
+  for(i = 0; i < N/8; ++i, src += 8, r += 3) {
+    for(j = 0; j < 8; ++j)
+      t[j] = ETA - src[j];
+
+    r[0] = (t[0] >> 0) | (t[1] << 3) | (t[2] << 6);
+    r[1] = (t[2] >> 2) | (t[3] << 1) | (t[4] << 4) | (t[5] << 7);
+    r[2] = (t[5] >> 1) | (t[6] << 2) | (t[7] << 5);
+  }
+#elif ETA == 4
+  /* Two 4-bit values share each output byte, low nibble first. */
+  for(i = 0; i < N/2; ++i, src += 2) {
+    for(j = 0; j < 2; ++j)
+      t[j] = ETA - src[j];
+
+    r[i] = t[0] | (t[1] << 4);
+  }
+#endif
+
+  DBENCH_STOP(*tpack);
+}
+
+
+/*************************************************
+* Name:        polyt1_pack
+*
+* Description: Bit-pack polynomial t1, four 10-bit coefficients per five
+*              output bytes. Inputs are assumed to be standard representatives.
+*
+* Arguments:   - uint8_t *r: pointer to output byte array with at least
+*                            POLYT1_PACKEDBYTES bytes
+*              - const poly *a: pointer to input polynomial
+**************************************************/
+void polyt1_pack(uint8_t *r, const poly *a) {
+  const int32_t *c = a->coeffs;
+  unsigned int j;
+  DBENCH_START();
+  for(j = 0; j < N/4; ++j, c += 4, r += 5) {
+    r[0] = (c[0] >> 0);
+    r[1] = (c[0] >> 8) | (c[1] << 2);
+    r[2] = (c[1] >> 6) | (c[2] << 4);
+    r[3] = (c[2] >> 4) | (c[3] << 6);
+    r[4] = (c[3] >> 2);
+  }
+
+  DBENCH_STOP(*tpack);
+}
+
+/*************************************************
+* Name:        polyt1_unpack
+*
+* Description: Unpack polynomial t1, reading four 10-bit coefficients from
+*              every five input bytes. Outputs are standard representatives.
+*
+* Arguments:   - poly *r: pointer to output polynomial
+*              - const uint8_t *a: byte array with bit-packed polynomial
+**************************************************/
+void polyt1_unpack(poly *r, const uint8_t *a) {
+  int32_t *c = r->coeffs;
+  unsigned int j;
+  DBENCH_START();
+  for(j = 0; j < N/4; ++j, c += 4, a += 5) {
+    c[0] = ((a[0] >> 0) | ((uint32_t)a[1] << 8)) & 0x3FF;
+    c[1] = ((a[1] >> 2) | ((uint32_t)a[2] << 6)) & 0x3FF;
+    c[2] = ((a[2] >> 4) | ((uint32_t)a[3] << 4)) & 0x3FF;
+    c[3] = ((a[3] >> 6) | ((uint32_t)a[4] << 2)) & 0x3FF;
+  }
+
+  DBENCH_STOP(*tpack);
+}
+
+/*************************************************
+* Name:        polyt0_pack
+*
+* Description: Bit-pack polynomial t0 with coefficients in ]-2^{D-1}, 2^{D-1}].
+*              Every coefficient c is first mapped to 2^{D-1} - c. Eight
+*              mapped values are then laid out back to back, least
+*              significant bits first, across 13 output bytes.
+*
+* Arguments:   - uint8_t *r: pointer to output byte array with at least
+*                            POLYT0_PACKEDBYTES bytes
+*              - const poly *a: pointer to input polynomial
+**************************************************/
+void polyt0_pack(uint8_t *r, const poly *a) {
+  const int32_t *src = a->coeffs;
+  unsigned int i;
+  uint32_t t[8];
+  DBENCH_START();
+
+  for(i = 0; i < N/8; ++i, src += 8, r += 13) {
+    /* 2^{D-1} - c is non-negative for c in the documented input range. */
+    t[0] = (1 << (D-1)) - src[0];
+    t[1] = (1 << (D-1)) - src[1];
+    t[2] = (1 << (D-1)) - src[2];
+    t[3] = (1 << (D-1)) - src[3];
+    t[4] = (1 << (D-1)) - src[4];
+    t[5] = (1 << (D-1)) - src[5];
+    t[6] = (1 << (D-1)) - src[6];
+    t[7] = (1 << (D-1)) - src[7];
+
+    /* A byte straddling two values takes the top bits of the first
+     * and the low bits of the second. */
+    r[ 0] =  t[0];
+    r[ 1] = (t[0] >>  8) | (t[1] << 5);
+    r[ 2] =  t[1] >>  3;
+    r[ 3] = (t[1] >> 11) | (t[2] << 2);
+    r[ 4] = (t[2] >>  6) | (t[3] << 7);
+    r[ 5] =  t[3] >>  1;
+    r[ 6] = (t[3] >>  9) | (t[4] << 4);
+    r[ 7] =  t[4] >>  4;
+    r[ 8] = (t[4] >> 12) | (t[5] << 1);
+    r[ 9] = (t[5] >>  7) | (t[6] << 6);
+    r[10] =  t[6] >>  2;
+    r[11] = (t[6] >> 10) | (t[7] << 3);
+    r[12] =  t[7] >>  5;
+  }
+
+  DBENCH_STOP(*tpack);
+}
+
+/*************************************************
+* Name:        polyt0_unpack
+*
+* Description: Unpack polynomial t0 with coefficients in ]-2^{D-1}, 2^{D-1}].
+*              Every group of 13 input bytes holds eight 13-bit fields,
+*              least significant bits first; a field v decodes to the
+*              coefficient 2^{D-1} - v.
+*
+* Arguments:   - poly *r: pointer to output polynomial
+*              - const uint8_t *a: byte array with bit-packed polynomial
+**************************************************/
+void polyt0_unpack(poly *r, const uint8_t *a) {
+  int32_t *c = r->coeffs;
+  unsigned int i, j;
+  DBENCH_START();
+
+  for(i = 0; i < N/8; ++i, c += 8, a += 13) {
+    /* Reassemble the eight fields of this group, masking each to 13 bits. */
+    c[0]  = a[0];
+    c[0] |= (uint32_t)a[1] << 8;
+    c[0] &= 0x1FFF;
+
+    c[1]  = a[1] >> 5;
+    c[1] |= (uint32_t)a[2] << 3;
+    c[1] |= (uint32_t)a[3] << 11;
+    c[1] &= 0x1FFF;
+
+    c[2]  = a[3] >> 2;
+    c[2] |= (uint32_t)a[4] << 6;
+    c[2] &= 0x1FFF;
+
+    c[3]  = a[4] >> 7;
+    c[3] |= (uint32_t)a[5] << 1;
+    c[3] |= (uint32_t)a[6] << 9;
+    c[3] &= 0x1FFF;
+
+    c[4]  = a[6] >> 4;
+    c[4] |= (uint32_t)a[7] << 4;
+    c[4] |= (uint32_t)a[8] << 12;
+    c[4] &= 0x1FFF;
+
+    c[5]  = a[8] >> 1;
+    c[5] |= (uint32_t)a[9] << 7;
+    c[5] &= 0x1FFF;
+
+    c[6]  = a[9] >> 6;
+    c[6] |= (uint32_t)a[10] << 2;
+    c[6] |= (uint32_t)a[11] << 10;
+    c[6] &= 0x1FFF;
+
+    c[7]  = a[11] >> 3;
+    c[7] |= (uint32_t)a[12] << 5;
+    c[7] &= 0x1FFF;
+
+    /* Undo the 2^{D-1} - c mapping used by the packed form. */
+    for(j = 0; j < 8; ++j)
+      c[j] = (1 << (D-1)) - c[j];
+  }
+
+  DBENCH_STOP(*tpack);
+}
+
+/*************************************************
+* Name:        polyz_pack
+*
+* Description: Bit-pack polynomial with coefficients
+*              in [-(GAMMA1 - 1), GAMMA1]. Each coefficient c is
+*              stored as GAMMA1 - c.
+*
+* Arguments:   - uint8_t *r: pointer to output byte array with at least
+*                            POLYZ_PACKEDBYTES bytes
+*              - const poly *a: pointer to input polynomial
+**************************************************/
+void polyz_pack(uint8_t *r, const poly *a) {
+  const int32_t *src = a->coeffs;
+  unsigned int i;
+  uint32_t t[4];
+  DBENCH_START();
+
+#if GAMMA1 == (1 << 17)
+  /* Four 18-bit values occupy nine bytes. */
+  for(i = 0; i < N/4; ++i, src += 4, r += 9) {
+    t[0] = GAMMA1 - src[0];
+    t[1] = GAMMA1 - src[1];
+    t[2] = GAMMA1 - src[2];
+    t[3] = GAMMA1 - src[3];
+
+    r[0] =  t[0];
+    r[1] =  t[0] >>  8;
+    r[2] = (t[0] >> 16) | (t[1] << 2);
+    r[3] =  t[1] >>  6;
+    r[4] = (t[1] >> 14) | (t[2] << 4);
+    r[5] =  t[2] >>  4;
+    r[6] = (t[2] >> 12) | (t[3] << 6);
+    r[7] =  t[3] >>  2;
+    r[8] =  t[3] >> 10;
+  }
+#elif GAMMA1 == (1 << 19)
+  /* Two 20-bit values occupy five bytes. */
+  for(i = 0; i < N/2; ++i, src += 2, r += 5) {
+    t[0] = GAMMA1 - src[0];
+    t[1] = GAMMA1 - src[1];
+
+    r[0] =  t[0];
+    r[1] =  t[0] >>  8;
+    r[2] = (t[0] >> 16) | (t[1] << 4);
+    r[3] =  t[1] >>  4;
+    r[4] =  t[1] >> 12;
+  }
+#endif
+
+  DBENCH_STOP(*tpack);
+}
+
+/*************************************************
+* Name:        polyz_unpack
+*
+* Description: Unpack polynomial z with coefficients in
+*              [-(GAMMA1 - 1), GAMMA1]; a packed field v decodes to GAMMA1 - v.
+*
+* Arguments:   - poly *r: pointer to output polynomial
+*              - const uint8_t *a: byte array with bit-packed polynomial
+**************************************************/
+void polyz_unpack(poly *r, const uint8_t *a) {
+  unsigned int i;
+  DBENCH_START();
+
+#if GAMMA1 == (1 << 17)
+  for(i = 0; i < N/4; ++i) {
+    r->coeffs[4*i+0]  = a[9*i+0];
+    r->coeffs[4*i+0] |= (uint32_t)a[9*i+1] << 8;
+    r->coeffs[4*i+0] |= (uint32_t)a[9*i+2] << 16;
+    r->coeffs[4*i+0] &= 0x3FFFF;
+
+    r->coeffs[4*i+1]  = a[9*i+2] >> 2;
+    r->coeffs[4*i+1] |= (uint32_t)a[9*i+3] << 6;
+    r->coeffs[4*i+1] |= (uint32_t)a[9*i+4] << 14;
+    r->coeffs[4*i+1] &= 0x3FFFF;
+
+    r->coeffs[4*i+2]  = a[9*i+4] >> 4;
+    r->coeffs[4*i+2] |= (uint32_t)a[9*i+5] << 4;
+    r->coeffs[4*i+2] |= (uint32_t)a[9*i+6] << 12;
+    r->coeffs[4*i+2] &= 0x3FFFF;
+
+    r->coeffs[4*i+3]  = a[9*i+6] >> 6;
+    r->coeffs[4*i+3] |= (uint32_t)a[9*i+7] << 2;
+    r->coeffs[4*i+3] |= (uint32_t)a[9*i+8] << 10;
+    r->coeffs[4*i+3] &= 0x3FFFF;
+
+    r->coeffs[4*i+0] = GAMMA1 - r->coeffs[4*i+0];
+    r->coeffs[4*i+1] = GAMMA1 - r->coeffs[4*i+1];
+    r->coeffs[4*i+2] = GAMMA1 - r->coeffs[4*i+2];
+    r->coeffs[4*i+3] = GAMMA1 - r->coeffs[4*i+3];
+  }
+#elif GAMMA1 == (1 << 19)
+  for(i = 0; i < N/2; ++i) {
+    r->coeffs[2*i+0]  = a[5*i+0];
+    r->coeffs[2*i+0] |= (uint32_t)a[5*i+1] << 8;
+    r->coeffs[2*i+0] |= (uint32_t)a[5*i+2] << 16;
+    r->coeffs[2*i+0] &= 0xFFFFF;
+
+    r->coeffs[2*i+1]  = a[5*i+2] >> 4;
+    r->coeffs[2*i+1] |= (uint32_t)a[5*i+3] << 4;
+    r->coeffs[2*i+1] |= (uint32_t)a[5*i+4] << 12;
+    r->coeffs[2*i+1] &= 0xFFFFF; /* value already fits in 20 bits; kept for symmetry */
+
+    r->coeffs[2*i+0] = GAMMA1 - r->coeffs[2*i+0];
+    r->coeffs[2*i+1] = GAMMA1 - r->coeffs[2*i+1];
+  }
+#endif
+
+  DBENCH_STOP(*tpack);
+}
+
+/*************************************************
+* Name:        polyw1_pack
+*
+* Description: Bit-pack polynomial w1 with coefficients in [0,15] or [0,43].
+*              Input coefficients are assumed to be standard representatives.
+*
+* Arguments:   - uint8_t *r: pointer to output byte array with at least
+*                            POLYW1_PACKEDBYTES bytes
+*              - const poly *a: pointer to input polynomial
+**************************************************/
+void polyw1_pack(uint8_t *r, const poly *a) {
+  const int32_t *c = a->coeffs;
+  unsigned int i;
+  DBENCH_START();
+
+#if GAMMA2 == (Q-1)/88
+  /* Four 6-bit values per three bytes. */
+  for(i = 0; i < N/4; ++i, c += 4, r += 3) {
+    r[0] =  c[0]       | (c[1] << 6);
+    r[1] = (c[1] >> 2) | (c[2] << 4);
+    r[2] = (c[2] >> 4) | (c[3] << 2);
+  }
+#elif GAMMA2 == (Q-1)/32
+  /* Two 4-bit values per byte, low nibble first. */
+  for(i = 0; i < N/2; ++i, c += 2)
+    r[i] = c[0] | (c[1] << 4);
+#endif
+
+  DBENCH_STOP(*tpack);
+}
diff --git a/crypto_sign/dilithium3/m4fstack/poly.h b/crypto_sign/dilithium3/m4fstack/poly.h
new file mode 100644
index 00000000..8f8819b0
--- /dev/null
+++ b/crypto_sign/dilithium3/m4fstack/poly.h
@@ -0,0 +1,82 @@
+#ifndef POLY_H
+#define POLY_H
+
+#include <stdint.h>
+#include "params.h"
+
+typedef struct {  /* polynomial held as a flat coefficient array */
+  int32_t coeffs[N];  /* N signed 32-bit coefficients */
+} poly;
+
+#define poly_reduce DILITHIUM_NAMESPACE(poly_reduce)
+void poly_reduce(poly *a);
+#define poly_caddq DILITHIUM_NAMESPACE(poly_caddq)
+void poly_caddq(poly *a);
+#define poly_freeze DILITHIUM_NAMESPACE(poly_freeze)
+void poly_freeze(poly *a);
+
+#define poly_add DILITHIUM_NAMESPACE(poly_add)
+void poly_add(poly *c, const poly *a, const poly *b);
+#define poly_sub DILITHIUM_NAMESPACE(poly_sub)
+void poly_sub(poly *c, const poly *a, const poly *b);
+#define poly_shiftl DILITHIUM_NAMESPACE(poly_shiftl)
+void poly_shiftl(poly *a);
+
+#define poly_ntt DILITHIUM_NAMESPACE(poly_ntt)
+void poly_ntt(poly *a);
+
+#define poly_invntt_tomont DILITHIUM_NAMESPACE(poly_invntt_tomont)
+void poly_invntt_tomont(poly *a);
+#define poly_pointwise_montgomery DILITHIUM_NAMESPACE(poly_pointwise_montgomery)
+void poly_pointwise_montgomery(poly *c, const poly *a, const poly *b);
+#define poly_pointwise_acc_montgomery DILITHIUM_NAMESPACE(poly_pointwise_acc_montgomery)
+void poly_pointwise_acc_montgomery(poly *c, const poly *a, const poly *b);
+
+#define poly_power2round DILITHIUM_NAMESPACE(poly_power2round)
+void poly_power2round(poly *a1, poly *a0, const poly *a);
+#define poly_decompose DILITHIUM_NAMESPACE(poly_decompose)
+void poly_decompose(poly *a1, poly *a0, const poly *a);
+#define poly_make_hint DILITHIUM_NAMESPACE(poly_make_hint)
+unsigned int poly_make_hint(poly *h, const poly *a0, const poly *a1);
+#define poly_use_hint DILITHIUM_NAMESPACE(poly_use_hint)
+void poly_use_hint(poly *b, const poly *a, const poly *h);
+
+#define poly_chknorm DILITHIUM_NAMESPACE(poly_chknorm)
+int poly_chknorm(const poly *a, int32_t B);
+#define poly_uniform DILITHIUM_NAMESPACE(poly_uniform)
+void poly_uniform(poly *a,
+                  const uint8_t seed[SEEDBYTES],
+                  uint16_t nonce);
+#define poly_uniform_eta DILITHIUM_NAMESPACE(poly_uniform_eta)
+void poly_uniform_eta(poly *a,
+                      const uint8_t seed[CRHBYTES],
+                      uint16_t nonce);
+#define poly_uniform_gamma1 DILITHIUM_NAMESPACE(poly_uniform_gamma1)
+void poly_uniform_gamma1(poly *a,
+                         const uint8_t seed[CRHBYTES],
+                         uint16_t nonce);
+#define poly_challenge DILITHIUM_NAMESPACE(poly_challenge)
+void poly_challenge(poly *c, const uint8_t seed[SEEDBYTES]);
+
+#define polyeta_pack DILITHIUM_NAMESPACE(polyeta_pack)
+void polyeta_pack(uint8_t *r, const poly *a);
+
+#define polyt1_pack DILITHIUM_NAMESPACE(polyt1_pack)
+void polyt1_pack(uint8_t *r, const poly *a);
+#define polyt1_unpack DILITHIUM_NAMESPACE(polyt1_unpack)
+void polyt1_unpack(poly *r, const uint8_t *a);
+
+#define polyt0_pack DILITHIUM_NAMESPACE(polyt0_pack)
+void polyt0_pack(uint8_t *r, const poly *a);
+#define polyt0_unpack DILITHIUM_NAMESPACE(polyt0_unpack)
+void polyt0_unpack(poly *r, const uint8_t *a);
+
+#define polyz_pack DILITHIUM_NAMESPACE(polyz_pack)
+void polyz_pack(uint8_t *r, const poly *a);
+#define polyz_unpack DILITHIUM_NAMESPACE(polyz_unpack)
+void polyz_unpack(poly *r, const uint8_t *a);
+
+#define polyw1_pack DILITHIUM_NAMESPACE(polyw1_pack)
+void polyw1_pack(uint8_t *r, const poly *a);
+
+#endif
diff --git a/crypto_sign/dilithium3/m4fstack/polyvec.c b/crypto_sign/dilithium3/m4fstack/polyvec.c
new file mode 100644
index 00000000..e20749c0
--- /dev/null
+++ b/crypto_sign/dilithium3/m4fstack/polyvec.c
@@ -0,0 +1,429 @@
+#include <stdint.h>
+#include "params.h"
+#include "polyvec.h"
+#include "poly.h"
+
+#include <stdio.h>
+#include "hal.h"
+
+/*************************************************
+* Name:        polyvec_matrix_expand
+*
+* Description: Implementation of ExpandA. Generates matrix A with uniformly
+*              random coefficients a_{i,j}; entry (i,j) is produced by
+*              poly_uniform from rho with nonce (i << 8) + j.
+*
+* Arguments:   - polyvecl mat[K]: output matrix
+*              - const uint8_t rho[]: byte array containing seed rho
+**************************************************/
+void polyvec_matrix_expand(polyvecl mat[K], const uint8_t rho[SEEDBYTES]) {
+  unsigned int row, col;
+
+  for(row = 0; row < K; ++row)
+    for(col = 0; col < L; ++col)
+      poly_uniform(mat[row].vec + col, rho, (row << 8) + col);
+}
+
+void polyvec_matrix_pointwise_montgomery(polyveck *t, const polyvecl mat[K], const polyvecl *v) {
+  unsigned int row = 0;
+
+  for(; row < K; ++row)
+    polyvecl_pointwise_acc_montgomery(t->vec + row, mat + row, v);
+}
+
+/**************************************************************/
+/************ Vectors of polynomials of length L **************/
+/**************************************************************/
+
+void polyvecl_uniform_eta(polyvecl *v, const uint8_t seed[CRHBYTES], uint16_t nonce) {
+  poly *p;
+
+  for(p = v->vec; p < v->vec + L; ++p, ++nonce)
+    poly_uniform_eta(p, seed, nonce);
+}
+
+void polyvecl_uniform_gamma1(polyvecl *v, const uint8_t seed[CRHBYTES], uint16_t nonce) {
+  unsigned int k = 0;
+
+  for(; k < L; ++k)
+    poly_uniform_gamma1(v->vec + k, seed, L*nonce + k);
+}
+
+void polyvecl_reduce(polyvecl *v) {
+  poly *p;
+
+  for(p = v->vec; p < v->vec + L; ++p)
+    poly_reduce(p);
+}
+
+#if 0
+/*************************************************
+* Name:        polyvecl_freeze
+*
+* Description: Reduce all coefficients in a vector of length L to standard
+*              representatives. Excluded from the build by the #if 0 guard.
+*
+* Arguments:   - polyvecl *v: pointer to input/output vector
+**************************************************/
+void polyvecl_freeze(polyvecl *v) {
+  poly *p;
+
+  for(p = v->vec; p < v->vec + L; ++p)
+    poly_freeze(p);
+}
+#endif
+
+/*************************************************
+* Name:        polyvecl_add
+*
+* Description: Add vectors of polynomials of length L.
+*              No modular reduction is performed.
+*
+* Arguments:   - polyvecl *w: pointer to output vector
+*              - const polyvecl *u: pointer to first summand
+*              - const polyvecl *v: pointer to second summand
+**************************************************/
+void polyvecl_add(polyvecl *w, const polyvecl *u, const polyvecl *v) {
+  unsigned int k = 0;
+
+  for(; k < L; ++k)
+    poly_add(w->vec + k, u->vec + k, v->vec + k);
+}
+
+/*************************************************
+* Name:        polyvecl_ntt
+*
+* Description: Forward NTT of all polynomials in vector of length L. Output
+*              coefficients can be up to 16*Q larger than input coefficients.
+*
+* Arguments:   - polyvecl *v: pointer to input/output vector
+**************************************************/
+void polyvecl_ntt(polyvecl *v) {
+  poly *p;
+
+  for(p = v->vec; p < v->vec + L; ++p)
+    poly_ntt(p);
+}
+
+void polyvecl_invntt_tomont(polyvecl *v) {
+  poly *p;
+
+  for(p = v->vec; p < v->vec + L; ++p)
+    poly_invntt_tomont(p);
+}
+
+void polyvecl_pointwise_poly_montgomery(polyvecl *r, const poly *a, const polyvecl *v) {
+  unsigned int k = 0;
+
+  for(; k < L; ++k)
+    poly_pointwise_montgomery(r->vec + k, a, v->vec + k);
+}
+
+
+
+/*************************************************
+* Name:        polyvecl_pointwise_acc_montgomery
+*
+* Description: Pointwise multiply vectors of polynomials of length L, multiply
+*              resulting vector by 2^{-32} and add (accumulate) polynomials
+*              in it. Input/output vectors are in NTT domain representation.
+*
+* Arguments:   - poly *w: output polynomial
+*              - const polyvecl *u: pointer to first input vector
+*              - const polyvecl *v: pointer to second input vector
+**************************************************/
+void polyvecl_pointwise_acc_montgomery(poly *w,
+                                       const polyvecl *u,
+                                       const polyvecl *v)
+{
+  unsigned int k = 0;
+
+  /* Element 0 goes through the plain product; the rest use the acc variant. */
+  poly_pointwise_montgomery(w, u->vec, v->vec);
+  while(++k < L)
+    poly_pointwise_acc_montgomery(w, u->vec + k, v->vec + k);
+}
+
+/*************************************************
+* Name:        polyvecl_chknorm
+*
+* Description: Check infinity norm of polynomials in vector of length L.
+*              Assumes input polyvecl to be reduced by polyvecl_reduce().
+*
+* Arguments:   - const polyvecl *v: pointer to vector
+*              - int32_t bound: norm bound
+*
+* Returns 0 if norm of all polynomials is strictly smaller than
+* bound <= (Q-1)/8 and 1 otherwise.
+**************************************************/
+int polyvecl_chknorm(const polyvecl *v, int32_t bound) {
+  const poly *p = v->vec;
+  const poly *const end = v->vec + L;
+
+  /* Advance past every polynomial that passes; stop at the first failure. */
+  while(p < end && !poly_chknorm(p, bound))
+    ++p;
+  return p != end;
+}
+
+/**************************************************************/
+/************ Vectors of polynomials of length K **************/
+/**************************************************************/
+
+void polyveck_uniform_eta(polyveck *v, const uint8_t seed[CRHBYTES], uint16_t nonce) {
+  poly *p;
+
+  for(p = v->vec; p < v->vec + K; ++p, ++nonce)
+    poly_uniform_eta(p, seed, nonce);
+}
+
+/*************************************************
+* Name:        polyveck_reduce
+*
+* Description: Reduce coefficients of polynomials in vector of length K
+*              to representatives in [-6283009,6283007].
+*
+* Arguments:   - polyveck *v: pointer to input/output vector
+**************************************************/
+void polyveck_reduce(polyveck *v) {
+  poly *p;
+
+  for(p = v->vec; p < v->vec + K; ++p)
+    poly_reduce(p);
+}
+
+/*************************************************
+* Name:        polyveck_caddq
+*
+* Description: For all coefficients of polynomials in vector of length K
+*              add Q if coefficient is negative.
+*
+* Arguments:   - polyveck *v: pointer to input/output vector
+**************************************************/
+void polyveck_caddq(polyveck *v) {
+  poly *p;
+
+  for(p = v->vec; p < v->vec + K; ++p)
+    poly_caddq(p);
+}
+
+#if 0
+/*************************************************
+* Name:        polyveck_freeze
+*
+* Description: Reduce all coefficients in a vector of length K to standard
+*              representatives. Excluded from the build by the #if 0 guard.
+*
+* Arguments:   - polyveck *v: pointer to input/output vector
+**************************************************/
+void polyveck_freeze(polyveck *v)  {
+  poly *p;
+
+  for(p = v->vec; p < v->vec + K; ++p)
+    poly_freeze(p);
+}
+#endif
+
+/*************************************************
+* Name:        polyveck_add
+*
+* Description: Add vectors of polynomials of length K.
+*              No modular reduction is performed.
+*
+* Arguments:   - polyveck *w: pointer to output vector
+*              - const polyveck *u: pointer to first summand
+*              - const polyveck *v: pointer to second summand
+**************************************************/
+void polyveck_add(polyveck *w, const polyveck *u, const polyveck *v) {
+  unsigned int k = 0;
+
+  for(; k < K; ++k)
+    poly_add(w->vec + k, u->vec + k, v->vec + k);
+}
+
+/*************************************************
+* Name:        polyveck_sub
+*
+* Description: Subtract vectors of polynomials of length K.
+*              No modular reduction is performed.
+*
+* Arguments:   - polyveck *w: pointer to output vector
+*              - const polyveck *u: pointer to first input vector
+*              - const polyveck *v: pointer to second input vector to be
+*                                   subtracted from first input vector
+**************************************************/
+void polyveck_sub(polyveck *w, const polyveck *u, const polyveck *v) {
+  unsigned int k = 0;
+
+  for(; k < K; ++k)
+    poly_sub(w->vec + k, u->vec + k, v->vec + k);
+}
+
+/*************************************************
+* Name:        polyveck_shiftl
+*
+* Description: Multiply vector of polynomials of length K by 2^D without modular
+*              reduction. Assumes input coefficients to be less than 2^{31-D}.
+*
+* Arguments:   - polyveck *v: pointer to input/output vector
+**************************************************/
+void polyveck_shiftl(polyveck *v) {
+  poly *p;
+
+  for(p = v->vec; p < v->vec + K; ++p)
+    poly_shiftl(p);
+}
+
+/*************************************************
+* Name:        polyveck_ntt
+*
+* Description: Forward NTT of all polynomials in vector of length K. Output
+*              coefficients can be up to 16*Q larger than input coefficients.
+*
+* Arguments:   - polyveck *v: pointer to input/output vector
+**************************************************/
+void polyveck_ntt(polyveck *v) {
+  poly *p;
+
+  for(p = v->vec; p < v->vec + K; ++p)
+    poly_ntt(p);
+}
+
+
+
+/*************************************************
+* Name:        polyveck_invntt_tomont
+*
+* Description: Inverse NTT and multiplication by 2^{32} of polynomials
+*              in vector of length K. Input coefficients need to be less
+*              than 2*Q.
+*
+* Arguments:   - polyveck *v: pointer to input/output vector
+**************************************************/
+void polyveck_invntt_tomont(polyveck *v) {
+  poly *p;
+
+  for(p = v->vec; p < v->vec + K; ++p)
+    poly_invntt_tomont(p);
+}
+
+
+void polyveck_pointwise_poly_montgomery(polyveck *r, const poly *a, const polyveck *v) {
+  unsigned int k = 0;
+
+  for(; k < K; ++k)
+    poly_pointwise_montgomery(r->vec + k, a, v->vec + k);
+}
+
+
+/*************************************************
+* Name:        polyveck_chknorm
+*
+* Description: Check infinity norm of polynomials in vector of length K.
+*              Assumes input polyveck to be reduced by polyveck_reduce().
+*
+* Arguments:   - const polyveck *v: pointer to vector
+*              - int32_t bound: norm bound
+*
+* Returns 0 if norm of all polynomials is strictly smaller than
+* bound <= (Q-1)/8 and 1 otherwise.
+**************************************************/
+int polyveck_chknorm(const polyveck *v, int32_t bound) {
+  const poly *p = v->vec;
+  const poly *const end = v->vec + K;
+
+  /* Advance past every polynomial that passes; stop at the first failure. */
+  while(p < end && !poly_chknorm(p, bound))
+    ++p;
+  return p != end;
+}
+
+/*************************************************
+* Name:        polyveck_power2round
+*
+* Description: For all coefficients a of polynomials in vector of length K,
+*              compute a0, a1 such that a mod^+ Q = a1*2^D + a0
+*              with -2^{D-1} < a0 <= 2^{D-1}. Assumes coefficients to be
+*              standard representatives.
+*
+* Arguments:   - polyveck *v1: pointer to output vector of polynomials with
+*                              coefficients a1
+*              - polyveck *v0: pointer to output vector of polynomials with
+*                              coefficients a0
+*              - const polyveck *v: pointer to input vector
+**************************************************/
+void polyveck_power2round(polyveck *v1, polyveck *v0, const polyveck *v) {
+  unsigned int k = 0;
+
+  for(; k < K; ++k)
+    poly_power2round(v1->vec + k, v0->vec + k, v->vec + k);
+}
+
+/*************************************************
+* Name:        polyveck_decompose
+*
+* Description: For all coefficients a of polynomials in vector of length K,
+*              compute high and low bits a1, a0 such that a mod^+ Q = a1*ALPHA + a0
+*              with -ALPHA/2 < a0 <= ALPHA/2 except a1 = (Q-1)/ALPHA where we
+*              set a1 = 0 and -ALPHA/2 <= a0 = a mod Q - Q < 0.
+*              Assumes coefficients to be standard representatives.
+*
+* Arguments:   - polyveck *v1: pointer to output vector of polynomials with
+*                              coefficients a1
+*              - polyveck *v0: pointer to output vector of polynomials with
+*                              coefficients a0
+*              - const polyveck *v: pointer to input vector
+**************************************************/
+void polyveck_decompose(polyveck *v1, polyveck *v0, const polyveck *v) {
+  unsigned int k = 0;
+
+  for(; k < K; ++k)
+    poly_decompose(v1->vec + k, v0->vec + k, v->vec + k);
+}
+
+/*************************************************
+* Name:        polyveck_make_hint
+*
+* Description: Compute hint vector.
+*
+* Arguments:   - polyveck *h: pointer to output vector
+*              - const polyveck *v0: pointer to low part of input vector
+*              - const polyveck *v1: pointer to high part of input vector
+*
+* Returns number of 1 bits.
+**************************************************/
+unsigned int polyveck_make_hint(polyveck *h,
+                                const polyveck *v0,
+                                const polyveck *v1)
+{
+  unsigned int ones = 0;
+  unsigned int k;
+
+  for(k = 0; k < K; ++k)
+    ones += poly_make_hint(h->vec + k, v0->vec + k, v1->vec + k);
+  return ones;
+}
+
+/*************************************************
+* Name:        polyveck_use_hint
+*
+* Description: Use hint vector to correct the high bits of input vector.
+*
+* Arguments:   - polyveck *w: pointer to output vector of polynomials with
+*                             corrected high bits
+*              - const polyveck *u: pointer to input vector
+*              - const polyveck *h: pointer to input hint vector
+**************************************************/
+void polyveck_use_hint(polyveck *w, const polyveck *u, const polyveck *h) {
+  unsigned int k = 0;
+
+  for(; k < K; ++k)
+    poly_use_hint(w->vec + k, u->vec + k, h->vec + k);
+}
+
+void polyveck_pack_w1(uint8_t r[K*POLYW1_PACKEDBYTES], const polyveck *w1) {
+  unsigned int k = 0;
+
+  for(; k < K; ++k, r += POLYW1_PACKEDBYTES)
+    polyw1_pack(r, w1->vec + k);
+}
diff --git a/crypto_sign/dilithium3/m4fstack/polyvec.h b/crypto_sign/dilithium3/m4fstack/polyvec.h
new file mode 100644
index 00000000..d92cd753
--- /dev/null
+++ b/crypto_sign/dilithium3/m4fstack/polyvec.h
@@ -0,0 +1,99 @@
+#ifndef POLYVEC_H
+#define POLYVEC_H
+
+#include <stdint.h>
+#include "params.h"
+#include "poly.h"
+
+/* Vector holding L polynomials (L is the number of entries, not a polynomial length) */
+typedef struct {
+  poly vec[L];
+} polyvecl;
+
+#define polyvecl_uniform_eta DILITHIUM_NAMESPACE(polyvecl_uniform_eta)
+void polyvecl_uniform_eta(polyvecl *v, const uint8_t seed[CRHBYTES], uint16_t nonce);
+
+#define polyvecl_uniform_gamma1 DILITHIUM_NAMESPACE(polyvecl_uniform_gamma1)
+void polyvecl_uniform_gamma1(polyvecl *v, const uint8_t seed[CRHBYTES], uint16_t nonce);
+
+#define polyvecl_reduce DILITHIUM_NAMESPACE(polyvecl_reduce)
+void polyvecl_reduce(polyvecl *v);
+
+#define polyvecl_freeze DILITHIUM_NAMESPACE(polyvecl_freeze)
+void polyvecl_freeze(polyvecl *v);
+
+#define polyvecl_add DILITHIUM_NAMESPACE(polyvecl_add)
+void polyvecl_add(polyvecl *w, const polyvecl *u, const polyvecl *v);
+
+#define polyvecl_ntt DILITHIUM_NAMESPACE(polyvecl_ntt)
+void polyvecl_ntt(polyvecl *v);
+#define polyvecl_invntt_tomont DILITHIUM_NAMESPACE(polyvecl_invntt_tomont)
+void polyvecl_invntt_tomont(polyvecl *v);
+#define polyvecl_pointwise_poly_montgomery DILITHIUM_NAMESPACE(polyvecl_pointwise_poly_montgomery)
+void polyvecl_pointwise_poly_montgomery(polyvecl *r, const poly *a, const polyvecl *v);
+#define polyvecl_pointwise_acc_montgomery \
+        DILITHIUM_NAMESPACE(polyvecl_pointwise_acc_montgomery)
+void polyvecl_pointwise_acc_montgomery(poly *w,
+                                       const polyvecl *u,
+                                       const polyvecl *v);
+
+
+#define polyvecl_chknorm DILITHIUM_NAMESPACE(polyvecl_chknorm)
+int polyvecl_chknorm(const polyvecl *v, int32_t B);
+
+
+
+/* Vector holding K polynomials (K is the number of entries, not a polynomial length) */
+typedef struct {
+  poly vec[K];
+} polyveck;
+
+#define polyveck_uniform_eta DILITHIUM_NAMESPACE(polyveck_uniform_eta)
+void polyveck_uniform_eta(polyveck *v, const uint8_t seed[CRHBYTES], uint16_t nonce);
+
+#define polyveck_reduce DILITHIUM_NAMESPACE(polyveck_reduce)
+void polyveck_reduce(polyveck *v);
+#define polyveck_caddq DILITHIUM_NAMESPACE(polyveck_caddq)
+void polyveck_caddq(polyveck *v);
+#define polyveck_freeze DILITHIUM_NAMESPACE(polyveck_freeze)
+void polyveck_freeze(polyveck *v);
+
+#define polyveck_add DILITHIUM_NAMESPACE(polyveck_add)
+void polyveck_add(polyveck *w, const polyveck *u, const polyveck *v);
+#define polyveck_sub DILITHIUM_NAMESPACE(polyveck_sub)
+void polyveck_sub(polyveck *w, const polyveck *u, const polyveck *v);
+#define polyveck_shiftl DILITHIUM_NAMESPACE(polyveck_shiftl)
+void polyveck_shiftl(polyveck *v);
+
+#define polyveck_ntt DILITHIUM_NAMESPACE(polyveck_ntt)
+void polyveck_ntt(polyveck *v);
+#define polyveck_invntt_tomont DILITHIUM_NAMESPACE(polyveck_invntt_tomont)
+void polyveck_invntt_tomont(polyveck *v);
+#define polyveck_pointwise_poly_montgomery DILITHIUM_NAMESPACE(polyveck_pointwise_poly_montgomery)
+void polyveck_pointwise_poly_montgomery(polyveck *r, const poly *a, const polyveck *v);
+
+
+#define polyveck_chknorm DILITHIUM_NAMESPACE(polyveck_chknorm)
+int polyveck_chknorm(const polyveck *v, int32_t B);
+
+#define polyveck_power2round DILITHIUM_NAMESPACE(polyveck_power2round)
+void polyveck_power2round(polyveck *v1, polyveck *v0, const polyveck *v);
+#define polyveck_decompose DILITHIUM_NAMESPACE(polyveck_decompose)
+void polyveck_decompose(polyveck *v1, polyveck *v0, const polyveck *v);
+#define polyveck_make_hint DILITHIUM_NAMESPACE(polyveck_make_hint)
+unsigned int polyveck_make_hint(polyveck *h,
+                                const polyveck *v0,
+                                const polyveck *v1);
+#define polyveck_use_hint DILITHIUM_NAMESPACE(polyveck_use_hint)
+void polyveck_use_hint(polyveck *w, const polyveck *v, const polyveck *h);
+
+#define polyveck_pack_w1 DILITHIUM_NAMESPACE(polyveck_pack_w1)
+void polyveck_pack_w1(uint8_t r[K*POLYW1_PACKEDBYTES], const polyveck *w1);
+
+#define polyvec_matrix_expand DILITHIUM_NAMESPACE(polyvec_matrix_expand)
+void polyvec_matrix_expand(polyvecl mat[K], const uint8_t rho[SEEDBYTES]);
+
+#define polyvec_matrix_pointwise_montgomery DILITHIUM_NAMESPACE(polyvec_matrix_pointwise_montgomery)
+void polyvec_matrix_pointwise_montgomery(polyveck *t, const polyvecl mat[K], const polyvecl *v);
+
+#endif
diff --git a/crypto_sign/dilithium3/m4fstack/reduce.h b/crypto_sign/dilithium3/m4fstack/reduce.h
new file mode 100644
index 00000000..02df5500
--- /dev/null
+++ b/crypto_sign/dilithium3/m4fstack/reduce.h
@@ -0,0 +1,29 @@
+#ifndef REDUCE_H
+#define REDUCE_H
+
+#include <stdint.h>
+#include "params.h"
+
+#define MONT -4186625 // 2^32 mod Q, centered representative (4193792 - Q, assuming Q = 8380417)
+#define QINV 58728449 // q^(-1) mod 2^32
+
+#define montgomery_reduce DILITHIUM_NAMESPACE(montgomery_reduce)
+/*************************************************
+* Name:        montgomery_reduce
+*
+* Description: For finite field element a with -2^{31}Q <= a <= Q*2^31,
+*              compute r \equiv a*2^{-32} (mod Q) such that -Q < r < Q.
+*
+* Arguments:   - int64_t: finite field element a
+*
+* Returns r.
+**************************************************/
+static inline int32_t montgomery_reduce(int64_t a) {
+  /* m = low 32 bits of a times QINV, truncated back to 32 bits */
+  const int32_t m = (int64_t)(int32_t)a*QINV;
+  /* with QINV = Q^(-1) mod 2^32 the low word of a - m*Q cancels; keep the high word */
+  const int32_t r = (a - (int64_t)m*Q) >> 32;
+  return r;
+}
+
+#endif
diff --git a/crypto_sign/dilithium3/m4fstack/rounding.c b/crypto_sign/dilithium3/m4fstack/rounding.c
new file mode 100644
index 00000000..889f0a29
--- /dev/null
+++ b/crypto_sign/dilithium3/m4fstack/rounding.c
@@ -0,0 +1,102 @@
+#include <stdint.h>
+#include "params.h"
+#include "rounding.h"
+
+/*************************************************
+* Name:        power2round
+*
+* Description: For finite field element a, compute a0, a1 such that
+*              a mod^+ Q = a1*2^D + a0 with -2^{D-1} < a0 <= 2^{D-1}.
+*              Assumes a to be standard representative.
+*
+* Arguments:   - int32_t a: input element
+*              - int32_t *a0: pointer to output element a0
+*
+* Returns a1.
+**************************************************/
+int32_t power2round(int32_t *a0, int32_t a)  {
+  /* High part: a / 2^D rounded to nearest, with an exact tie rounding down */
+  const int32_t high = (a + (1 << (D-1)) - 1) >> D;
+  /* Low part: whatever the high part does not account for */
+  *a0 = a - (high << D);
+  return high;
+}
+
+/*************************************************
+* Name:        decompose
+*
+* Description: For finite field element a, compute high and low bits a0, a1 such
+*              that a mod^+ Q = a1*ALPHA + a0 with -ALPHA/2 < a0 <= ALPHA/2 except
+*              if a1 = (Q-1)/ALPHA where we set a1 = 0 and
+*              -ALPHA/2 <= a0 = a mod^+ Q - Q < 0. Assumes a to be standard
+*              representative.
+*
+* Arguments:   - int32_t a: input element
+*              - int32_t *a0: pointer to output element a0
+*
+* Returns a1.
+**************************************************/
+int32_t decompose(int32_t *a0, int32_t a) {
+  int32_t a1;
+
+  a1  = (a + 127) >> 7;  /* ceil(a / 128) */
+#if GAMMA2 == (Q-1)/32
+  a1  = (a1*1025 + (1 << 21)) >> 22;  /* round(a1 * 1025 / 2^22); 2^22 / 1025 is about 4092 */
+  a1 &= 15;  /* keep the low 4 bits, so a value of 16 wraps to 0 */
+#elif GAMMA2 == (Q-1)/88
+  a1  = (a1*11275 + (1 << 23)) >> 24;  /* round(a1 * 11275 / 2^24); 2^24 / 11275 is about 1488 */
+  a1 ^= ((43 - a1) >> 31) & a1;  /* branch-free: a1 > 43 becomes 0 (assumes arithmetic >> on negatives) */
+#endif
+
+  *a0  = a - a1*2*GAMMA2;  /* low part: a minus a1 * (2*GAMMA2) */
+  *a0 -= (((Q-1)/2 - *a0) >> 31) & Q;  /* branch-free: subtract Q when *a0 > (Q-1)/2 */
+  return a1;
+}
+
+/*************************************************
+* Name:        make_hint
+*
+* Description: Compute hint bit indicating whether the low bits of the
+*              input element overflow into the high bits.
+*
+* Arguments:   - int32_t a0: low bits of input element
+*              - int32_t a1: high bits of input element
+*
+* Returns 1 if overflow.
+**************************************************/
+unsigned int make_hint(int32_t a0, int32_t a1) {
+  /* No hint while a0 lies in (-GAMMA2, GAMMA2]; the value -GAMMA2 itself
+   * is only hint-free when the high part a1 is zero. */
+  const int quiet = (a0 <= GAMMA2) && (a0 > -GAMMA2 || (a0 == -GAMMA2 && a1 == 0));
+  return quiet ? 0 : 1;
+}
+
+/*************************************************
+* Name:        use_hint
+*
+* Description: Correct high bits according to hint.
+*
+* Arguments:   - int32_t a: input element
+*              - unsigned int hint: hint bit
+*
+* Returns corrected high bits.
+**************************************************/
+int32_t use_hint(int32_t a, unsigned int hint) {
+  int32_t a0, a1;
+
+  a1 = decompose(&a0, a);
+  if(hint == 0)
+    return a1;
+
+#if GAMMA2 == (Q-1)/32
+  if(a0 > 0)
+    return (a1 + 1) & 15;
+  else
+    return (a1 - 1) & 15;
+#elif GAMMA2 == (Q-1)/88
+  if(a0 > 0)
+    return (a1 == 43) ?  0 : a1 + 1;
+  else
+    return (a1 ==  0) ? 43 : a1 - 1;
+#endif
+}
diff --git a/crypto_sign/dilithium3/m4fstack/rounding.h b/crypto_sign/dilithium3/m4fstack/rounding.h
new file mode 100644
index 00000000..b72e8e8d
--- /dev/null
+++ b/crypto_sign/dilithium3/m4fstack/rounding.h
@@ -0,0 +1,19 @@
+#ifndef ROUNDING_H
+#define ROUNDING_H
+
+#include <stdint.h>
+#include "params.h"
+
+#define power2round DILITHIUM_NAMESPACE(power2round)
+int32_t power2round(int32_t *a0, int32_t a);
+
+#define decompose DILITHIUM_NAMESPACE(decompose)
+int32_t decompose(int32_t *a0, int32_t a);
+
+#define make_hint DILITHIUM_NAMESPACE(make_hint)
+unsigned int make_hint(int32_t a0, int32_t a1);
+
+#define use_hint DILITHIUM_NAMESPACE(use_hint)
+int32_t use_hint(int32_t a, unsigned int hint);
+
+#endif
diff --git a/crypto_sign/dilithium3/m4fstack/sign.c b/crypto_sign/dilithium3/m4fstack/sign.c
new file mode 100644
index 00000000..04bec45c
--- /dev/null
+++ b/crypto_sign/dilithium3/m4fstack/sign.c
@@ -0,0 +1,352 @@
+#include <stdint.h>
+#include "params.h"
+#include "sign.h"
+#include "packing.h"
+#include "polyvec.h"
+#include "poly.h"
+#include "randombytes.h"
+#include "symmetric.h"
+#include "smallpoly.h"
+
+/*************************************************
+* Name:        crypto_sign_keypair
+*
+* Description: Generates public and private key.
+*
+* Arguments:   - uint8_t *pk: pointer to output public key (allocated
+*                             array of CRYPTO_PUBLICKEYBYTES bytes)
+*              - uint8_t *sk: pointer to output private key (allocated
+*                             array of CRYPTO_SECRETKEYBYTES bytes)
+*
+* Returns 0 (success)
+**************************************************/
+int crypto_sign_keypair(uint8_t *pk, uint8_t *sk) {
+  uint8_t seedbuf[2*SEEDBYTES + CRHBYTES];  /* rho | rhoprime | key after expansion */
+  uint8_t tr[TRBYTES];
+  const uint8_t *rho, *rhoprime, *key;
+  polyvecl mat[K];
+  polyvecl s1, s1hat;
+  polyveck s2, t1, t0;
+
+  /* Draw SEEDBYTES of randomness and expand it in place into rho, rhoprime and key */
+  randombytes(seedbuf, SEEDBYTES);  /* NOTE(review): any status from randombytes is ignored - confirm it cannot fail */
+  shake256(seedbuf, 2*SEEDBYTES + CRHBYTES, seedbuf, SEEDBYTES);
+  rho = seedbuf;
+  rhoprime = rho + SEEDBYTES;
+  key = rhoprime + CRHBYTES;  /* the remaining SEEDBYTES of seedbuf */
+
+  /* Expand matrix */
+  polyvec_matrix_expand(mat, rho);
+
+  /* Sample short vectors s1 and s2 */
+  polyvecl_uniform_eta(&s1, rhoprime, 0);
+  polyveck_uniform_eta(&s2, rhoprime, L);  /* nonce L; presumably s1 consumed nonces 0..L-1 - confirm */
+
+  /* Matrix-vector multiplication */
+  s1hat = s1;  /* work on a copy: the untransformed s1 is packed into sk below */
+  polyvecl_ntt(&s1hat);
+  polyvec_matrix_pointwise_montgomery(&t1, mat, &s1hat);
+  polyveck_reduce(&t1);
+  polyveck_invntt_tomont(&t1);
+
+  /* Add error vector s2 */
+  polyveck_add(&t1, &t1, &s2);
+
+  /* Extract t1 and write public key */
+  polyveck_caddq(&t1);
+  polyveck_power2round(&t1, &t0, &t1);  /* t1 is both input and high-part output */
+  pack_pk(pk, rho, &t1);
+
+  /* tr = SHAKE256 of the packed public key (TRBYTES of output); then write secret key */
+  shake256(tr, TRBYTES, pk, CRYPTO_PUBLICKEYBYTES);
+  pack_sk(sk, rho, tr, key, &t0, &s1, &s2);
+
+  return 0;
+}
+
+
+/*************************************************
+* Name:        crypto_sign_signature
+*
+* Description: Computes signature.
+*
+* Arguments:   - uint8_t *sig:   pointer to output signature (of length CRYPTO_BYTES)
+*              - size_t *siglen: pointer to output length of signature
+*              - uint8_t *m:     pointer to message to be signed
+*              - size_t mlen:    length of message
+*              - uint8_t *sk:    pointer to bit-packed secret key
+*
+* Returns 0 (success)
+**************************************************/
+int crypto_sign_signature(uint8_t *sig,
+                          size_t *siglen,
+                          const uint8_t *m,
+                          size_t mlen,
+                          const uint8_t *sk)
+{
+  uint8_t seedbuf[2 * SEEDBYTES + TRBYTES + RNDBYTES + 2 * CRHBYTES];  /* rho | tr | key | rnd | mu | rhoprime */
+  uint8_t *rho, *tr, *key, *mu, *rhoprime, *rnd;
+  uint16_t nonce = 0;
+  unsigned int n;
+  polyvecl mat[K], y, z;
+  polyveck t0, w1, w0;
+  poly cp;
+  shake256incctx state;
+
+  smallpoly s1_prime[L];
+  smallpoly s2_prime[K];
+  smallpoly cp_small;
+  smallhalfpoly cp_small_prime;
+
+  rho = seedbuf;
+  tr = rho + SEEDBYTES;
+  key = tr + TRBYTES;
+  rnd = key + SEEDBYTES;
+  mu = rnd + RNDBYTES;
+  rhoprime = mu + CRHBYTES;
+  unpack_sk(rho, tr, key, &t0, s1_prime, s2_prime, sk);
+
+  /* Compute mu = CRH(tr, msg) */
+  shake256_inc_init(&state);
+  shake256_inc_absorb(&state, tr, TRBYTES);
+  shake256_inc_absorb(&state, m, mlen);
+  shake256_inc_finalize(&state);
+  shake256_inc_squeeze(mu, CRHBYTES, &state);
+
+  for (n = 0; n < RNDBYTES; n++) {
+     rnd[n] = 0;  /* rnd is fixed to all-zero bytes, no fresh randomness is mixed in */
+  }
+  shake256(rhoprime, CRHBYTES, key, SEEDBYTES + RNDBYTES + CRHBYTES);  /* input is key | rnd | mu, contiguous in seedbuf */
+
+  /* Expand matrix and transform vectors */
+  polyvec_matrix_expand(mat, rho);
+  polyvecl_small_ntt(s1_prime);
+  polyveck_small_ntt(s2_prime);
+
+  polyveck_ntt(&t0);
+
+rej:
+  /* Sample intermediate vector y; nonce advances on every (re)try */
+  polyvecl_uniform_gamma1(&y, rhoprime, nonce++);
+
+  /* Matrix-vector multiplication */
+  z = y;  /* transform a copy, y itself is needed again for z = cs1 + y */
+  polyvecl_ntt(&z);
+  polyvec_matrix_pointwise_montgomery(&w1, mat, &z);
+  polyveck_reduce(&w1);
+  polyveck_invntt_tomont(&w1);
+
+  /* Decompose w and call the random oracle */
+  polyveck_caddq(&w1);
+  polyveck_decompose(&w1, &w0, &w1);
+  polyveck_pack_w1(sig, &w1);  /* sig serves as scratch space for the packed w1 */
+
+  shake256_inc_init(&state);
+  shake256_inc_absorb(&state, mu, CRHBYTES);
+  shake256_inc_absorb(&state, sig, K*POLYW1_PACKEDBYTES);
+  shake256_inc_finalize(&state);
+  shake256_inc_squeeze(sig, CTILDEBYTES, &state);  /* challenge seed overwrites the start of sig */
+  poly_challenge(&cp, sig);
+  /* cp_small and cp_small_prime are derived from cp before poly_ntt transforms cp */
+  poly_small_ntt_precomp(&cp_small, &cp_small_prime, &cp);
+  poly_ntt(&cp);
+
+  /* Compute z, reject if it reveals secret */
+  polyvecl_small_basemul_invntt(&z, &cp_small, &cp_small_prime, s1_prime);
+
+  polyvecl_add(&z, &z, &y);
+  polyvecl_reduce(&z);
+  if(polyvecl_chknorm(&z, GAMMA1 - BETA))
+    goto rej;
+
+
+  /* Write signature */
+  pack_sig_z(sig, &z);
+  unsigned int hint_n = 0;  /* reset on every pass: rej lies above these declarations */
+  unsigned int hints_written = 0;
+  /* Check that subtracting cs2 does not change high bits of w and low bits
+   * do not reveal secret information */
+  for(unsigned int i = 0; i < K; ++i) {
+    poly *tmp = &z.vec[0];  /* z is already packed into sig, so its first entry is reused as scratch */
+    poly_small_basemul_invntt(tmp, &cp_small, &cp_small_prime, &s2_prime[i]);
+
+    poly_sub(&w0.vec[i], &w0.vec[i], tmp);
+    poly_reduce(&w0.vec[i]);
+    if(poly_chknorm(&w0.vec[i], GAMMA2 - BETA))
+      goto rej;
+
+    /* Compute hints for w1 */
+    poly_pointwise_montgomery(tmp, &cp, &t0.vec[i]);  /* cp and t0 were both passed through the NTT above */
+
+    poly_invntt_tomont(tmp);
+    poly_reduce(tmp);
+
+    if(poly_chknorm(tmp, GAMMA2))
+      goto rej;
+    poly_add(&w0.vec[i], &w0.vec[i], tmp);
+    hint_n += poly_make_hint(tmp, &w0.vec[i], &w1.vec[i]);  /* tmp now holds the hint polynomial */
+    if (hint_n > OMEGA) {
+      goto rej;
+    }
+    pack_sig_h(sig, tmp, i, &hints_written);
+  }
+  pack_sig_h_zero(sig, &hints_written);
+  *siglen = CRYPTO_BYTES;
+  return 0;
+}
+
+/*************************************************
+* Name:        crypto_sign
+*
+* Description: Compute signed message.
+*
+* Arguments:   - uint8_t *sm: pointer to output signed message (allocated
+*                             array with CRYPTO_BYTES + mlen bytes),
+*                             can be equal to m
+*              - size_t *smlen: pointer to output length of signed
+*                               message
+*              - const uint8_t *m: pointer to message to be signed
+*              - size_t mlen: length of message
+*              - const uint8_t *sk: pointer to bit-packed secret key
+*
+* Returns 0 (success)
+**************************************************/
+int crypto_sign(uint8_t *sm,
+                size_t *smlen,
+                const uint8_t *m,
+                size_t mlen,
+                const uint8_t *sk)
+{
+  size_t pos = mlen;
+  /* Copy back to front so the move stays correct when sm is equal to m */
+  while(pos-- > 0)
+    sm[CRYPTO_BYTES + pos] = m[pos];
+  crypto_sign_signature(sm, smlen, sm + CRYPTO_BYTES, mlen, sk);
+  *smlen += mlen;
+  return 0;
+}
+
+/*************************************************
+* Name:        crypto_sign_verify
+*
+* Description: Verifies signature.
+*
+* Arguments:   - uint8_t *m: pointer to input signature
+*              - size_t siglen: length of signature
+*              - const uint8_t *m: pointer to message
+*              - size_t mlen: length of message
+*              - const uint8_t *pk: pointer to bit-packed public key
+*
+* Returns 0 if signature could be verified correctly and -1 otherwise
+**************************************************/
+int crypto_sign_verify(const uint8_t *sig,
+                       size_t siglen,
+                       const uint8_t *m,
+                       size_t mlen,
+                       const uint8_t *pk)
+{
+  unsigned int i;
+  uint8_t buf[K*POLYW1_PACKEDBYTES];
+  uint8_t rho[SEEDBYTES];
+  uint8_t mu[CRHBYTES];
+  uint8_t c[CTILDEBYTES];
+  uint8_t c2[CTILDEBYTES];
+  poly cp;
+  polyvecl mat[K], z;
+  polyveck t1, w1, h;
+  shake256incctx state;
+
+  if(siglen != CRYPTO_BYTES)  /* only signatures of exactly CRYPTO_BYTES are accepted */
+    return -1;
+
+  unpack_pk(rho, &t1, pk);
+  if(unpack_sig(c, &z, &h, sig))
+    return -1;
+  if(polyvecl_chknorm(&z, GAMMA1 - BETA))
+    return -1;
+
+  /* Compute mu = CRH(H(pk), msg); mu first holds the hash of pk, then the final value */
+  shake256(mu, CRHBYTES, pk, CRYPTO_PUBLICKEYBYTES);  /* NOTE(review): keygen/sign use TRBYTES for this hash; same only if TRBYTES == CRHBYTES - confirm */
+  shake256_inc_init(&state);
+  shake256_inc_absorb(&state, mu, CRHBYTES);
+  shake256_inc_absorb(&state, m, mlen);
+  shake256_inc_finalize(&state);
+  shake256_inc_squeeze(mu, CRHBYTES, &state);
+
+  /* Matrix-vector multiplication; compute Az - c2^dt1 */
+  poly_challenge(&cp, c);
+  polyvec_matrix_expand(mat, rho);
+
+  polyvecl_ntt(&z);
+  polyvec_matrix_pointwise_montgomery(&w1, mat, &z);
+
+  poly_ntt(&cp);
+  polyveck_shiftl(&t1);
+  polyveck_ntt(&t1);
+  polyveck_pointwise_poly_montgomery(&t1, &cp, &t1);  /* t1 is overwritten with the product */
+
+  polyveck_sub(&w1, &w1, &t1);
+  polyveck_reduce(&w1);
+  polyveck_invntt_tomont(&w1);
+
+  /* Reconstruct w1 */
+  polyveck_caddq(&w1);
+  polyveck_use_hint(&w1, &w1, &h);
+  polyveck_pack_w1(buf, &w1);
+
+  /* Call random oracle and verify challenge */
+  shake256_inc_init(&state);
+  shake256_inc_absorb(&state, mu, CRHBYTES);
+  shake256_inc_absorb(&state, buf, K*POLYW1_PACKEDBYTES);
+  shake256_inc_finalize(&state);
+  shake256_inc_squeeze(c2, CTILDEBYTES, &state);
+  for(i = 0; i < CTILDEBYTES; ++i)  /* recomputed c2 must match the c taken from the signature */
+    if(c[i] != c2[i])
+      return -1;
+
+  return 0;
+}
+
+/*************************************************
+* Name:        crypto_sign_open
+*
+* Description: Verify signed message.
+*
+* Arguments:   - uint8_t *m: pointer to output message (allocated
+*                            array with smlen bytes), can be equal to sm
+*              - size_t *mlen: pointer to output length of message
+*              - const uint8_t *sm: pointer to signed message
+*              - size_t smlen: length of signed message
+*              - const uint8_t *pk: pointer to bit-packed public key
+*
+* Returns 0 if signed message could be verified correctly and -1 otherwise
+**************************************************/
+int crypto_sign_open(uint8_t *m,
+                     size_t *mlen,
+                     const uint8_t *sm,
+                     size_t smlen,
+                     const uint8_t *pk)
+{
+  size_t pos;
+  int verified = 0;
+
+  /* The signature occupies the first CRYPTO_BYTES of sm, the message follows */
+  if(smlen >= CRYPTO_BYTES) {
+    *mlen = smlen - CRYPTO_BYTES;
+    verified = !crypto_sign_verify(sm, CRYPTO_BYTES, sm + CRYPTO_BYTES, *mlen, pk);
+  }
+
+  if(verified) {
+    /* Valid: hand the message back to the caller */
+    for(pos = 0; pos < *mlen; ++pos)
+      m[pos] = sm[CRYPTO_BYTES + pos];
+    return 0;
+  }
+
+  /* Too short or not verified: report failure and clear smlen bytes of m */
+  *mlen = -1;
+  for(pos = 0; pos < smlen; ++pos)
+    m[pos] = 0;
+
+  return -1;
+}
diff --git a/crypto_sign/dilithium3/m4fstack/sign.h b/crypto_sign/dilithium3/m4fstack/sign.h
new file mode 100644
index 00000000..42240b30
--- /dev/null
+++ b/crypto_sign/dilithium3/m4fstack/sign.h
@@ -0,0 +1,37 @@
+#ifndef SIGN_H
+#define SIGN_H
+
+#include <stddef.h>
+#include <stdint.h>
+#include "params.h"
+#include "api.h"
+#include "polyvec.h"
+#include "poly.h"
+
+#define challenge DILITHIUM_NAMESPACE(challenge)
+void challenge(poly *c, const uint8_t seed[SEEDBYTES]);
+
+// #define crypto_sign_keypair DILITHIUM_NAMESPACE(crypto_sign_keypair)
+// int crypto_sign_keypair(uint8_t *pk, uint8_t *sk);
+
+// #define crypto_sign_signature DILITHIUM_NAMESPACE(signature)
+// int crypto_sign_signature(uint8_t *sig, size_t *siglen,
+//                           const uint8_t *m, size_t mlen,
+//                           const uint8_t *sk);
+
+// #define crypto_sign DILITHIUM_NAMESPACE(crypto_sign)
+// int crypto_sign(uint8_t *sm, size_t *smlen,
+//                 const uint8_t *m, size_t mlen,
+//                 const uint8_t *sk);
+
+// #define crypto_sign_verify DILITHIUM_NAMESPACE(verify)
+// int crypto_sign_verify(const uint8_t *sig, size_t siglen,
+//                        const uint8_t *m, size_t mlen,
+//                        const uint8_t *pk);
+
+// #define crypto_sign_open DILITHIUM_NAMESPACE(crypto_sign_open)
+// int crypto_sign_open(uint8_t *m, size_t *mlen,
+//                      const uint8_t *sm, size_t smlen,
+//                      const uint8_t *pk);
+
+#endif
diff --git a/crypto_sign/dilithium3/m4fstack/smallntt.S b/crypto_sign/dilithium3/m4fstack/smallntt.S
new file mode 100644
index 00000000..747c111c
--- /dev/null
+++ b/crypto_sign/dilithium3/m4fstack/smallntt.S
@@ -0,0 +1,837 @@
+#include "macros.i"
+
+.syntax unified
+.cpu cortex-m4
+.thumb
+
+// general macros
+.macro load a, a0, a1, a2, a3, mem0, mem1, mem2, mem3 // four 32-bit loads from base a at offsets mem0..mem3
+  ldr.w \a0, [\a, \mem0]
+  ldr.w \a1, [\a, \mem1]
+  ldr.w \a2, [\a, \mem2]
+  ldr.w \a3, [\a, \mem3]
+.endm
+
+.macro store a, a0, a1, a2, a3, mem0, mem1, mem2, mem3 // four 32-bit stores to base a at offsets mem0..mem3
+  str.w \a0, [\a, \mem0]
+  str.w \a1, [\a, \mem1]
+  str.w \a2, [\a, \mem2]
+  str.w \a3, [\a, \mem3]
+.endm
+
+.macro montgomery q, qinv, a, tmp // result is left in tmp, a is not written
+  smulbt \tmp, \a, \qinv // tmp = bottom halfword of a * top halfword of qinv
+  smlabb \tmp, \q, \tmp, \a // tmp = bottom(q) * bottom(tmp) + a; callers take the upper halfword as the result
+.endm
+
+.macro montgomery_inplace q, qinv, a, tmp // same computation as montgomery, but the result overwrites a
+  smulbt \tmp, \a, \qinv // tmp = bottom halfword of a * top halfword of qinv
+  smlabb \a, \q, \tmp, \a // a = bottom(q) * bottom(tmp) + a
+.endm
+
+.macro doublemontgomery a, tmp, tmp2, q, qinv, montconst // both halfwords of a times bottom(montconst), each reduced
+  smulbb \tmp2, \a, \montconst // bottom(a) * bottom(montconst)
+  montgomery \q, \qinv, \tmp2, \tmp // reduce -> result in tmp
+  smultb \a, \a, \montconst // top(a) * bottom(montconst)
+  montgomery \q, \qinv, \a, \tmp2 // reduce -> result in tmp2
+  pkhtb \a, \tmp2, \tmp, asr#16 // a = top(tmp2) : top(tmp)
+.endm
+
+// #######
+// #######
+// # NTT #
+// #######
+// #######
+
+.macro mul_twiddle tb, a, twiddle, tmp, tmp2, q, qinv // both halfwords of a times the tb (b or t) halfword of twiddle
+    smulb\tb \tmp, \a, \twiddle
+    smult\tb \a, \a, \twiddle
+    montgomery \q, \qinv, \tmp, \tmp2 // reduce -> result in tmp2
+    montgomery \q, \qinv, \a, \tmp // reduce -> result in tmp
+    pkhtb \a, \tmp, \tmp2, asr#16 // combine results from above in one register as 16bit halves
+.endm
+
+.macro doublebutterfly tb, a0, a1, twiddle, tmp, tmp2, q, qinv // tb (b or t) selects the halfword of twiddle that is used
+  smulb\tb \tmp, \a1, \twiddle // a1_b * twiddle_tb
+  smult\tb \a1, \a1, \twiddle // a1_t * twiddle_tb
+  montgomery \q, \qinv, \tmp, \tmp2 // reduce -> result in tmp2
+  montgomery \q, \qinv, \a1, \tmp // reduce -> result in tmp
+  pkhtb \tmp2, \tmp, \tmp2, asr#16 // combine results from above in one register as 16bit halves
+  usub16 \a1, \a0, \tmp2 // a0 - a1 * twiddle (a0, a1 contain 2 coeffs)
+  uadd16 \a0, \a0, \tmp2 // a0 + a1 * twiddle (a0, a1 contain 2 coeffs)
+.endm
+
+.macro two_doublebutterfly tb1, tb2, a0, a1, a2, a3, twiddle, tmp, tmp2, q, qinv // two butterflies sharing one twiddle register
+  doublebutterfly \tb1, \a0, \a1, \twiddle, \tmp, \tmp2, \q, \qinv // pair (a0, a1) with halfword tb1
+  doublebutterfly \tb2, \a2, \a3, \twiddle, \tmp, \tmp2, \q, \qinv // pair (a2, a3) with halfword tb2
+.endm
+
+.macro _3_layer_double_CT_16 c0, c1, c2, c3, c4, c5, c6, c7, twiddle, twiddle_ptr, Qprime, Q, tmp, tmp2 // twiddles are read from memory; twiddle_ptr advances by 14 bytes in total
+    // layer 3
+    ldrh.w \twiddle, [\twiddle_ptr], #2 // one 16-bit twiddle, post-increment by 2
+    two_doublebutterfly b, b, \c0, \c4, \c1, \c5, \twiddle, \tmp, \tmp2, \Q, \Qprime
+    two_doublebutterfly b, b, \c2, \c6, \c3, \c7, \twiddle, \tmp, \tmp2, \Q, \Qprime
+
+    // layer 2
+    ldr.w \twiddle, [\twiddle_ptr], #4 // two packed 16-bit twiddles, post-increment by 4
+    two_doublebutterfly b, b, \c0, \c2, \c1, \c3, \twiddle, \tmp, \tmp2, \Q, \Qprime
+
+    two_doublebutterfly t, t, \c4, \c6, \c5, \c7, \twiddle, \tmp, \tmp2, \Q, \Qprime
+
+    // layer 1
+    ldr.w \twiddle, [\twiddle_ptr], #4 // bottom halfword for (c0, c1), top halfword for (c2, c3)
+    two_doublebutterfly b, t, \c0, \c1, \c2, \c3, \twiddle, \tmp, \tmp2, \Q, \Qprime
+
+    ldr.w \twiddle, [\twiddle_ptr], #4 // bottom halfword for (c4, c5), top halfword for (c6, c7)
+    two_doublebutterfly b, t, \c4, \c5, \c6, \c7, \twiddle, \tmp, \tmp2, \Q, \Qprime
+.endm
+
+.macro _3_layer_double_CT_16_fp c0, c1, c2, c3, c4, c5, c6, c7, xi01, xi23, xi45, xi67, twiddle, Qprime, Q, tmp, tmp2 // twiddles come from the registers xi01..xi67 via vmov
+    // layer 3
+    vmov \twiddle, \xi01 // only the top halfword of xi01 is used in this layer
+    two_doublebutterfly t, t, \c0, \c4, \c1, \c5, \twiddle, \tmp, \tmp2, \Q, \Qprime
+    two_doublebutterfly t, t, \c2, \c6, \c3, \c7, \twiddle, \tmp, \tmp2, \Q, \Qprime
+
+    // layer 2
+    vmov \twiddle, \xi23 // bottom halfword for c0..c3, top halfword for c4..c7
+    two_doublebutterfly b, b, \c0, \c2, \c1, \c3, \twiddle, \tmp, \tmp2, \Q, \Qprime
+
+    two_doublebutterfly t, t, \c4, \c6, \c5, \c7, \twiddle, \tmp, \tmp2, \Q, \Qprime
+
+    // layer 1
+    vmov \twiddle, \xi45 // bottom halfword for (c0, c1), top halfword for (c2, c3)
+    two_doublebutterfly b, t, \c0, \c1, \c2, \c3, \twiddle, \tmp, \tmp2, \Q, \Qprime
+
+    vmov \twiddle, \xi67 // bottom halfword for (c4, c5), top halfword for (c6, c7)
+    two_doublebutterfly b, t, \c4, \c5, \c6, \c7, \twiddle, \tmp, \tmp2, \Q, \Qprime
+.endm
+
+.global small_ntt_asm
+.type small_ntt_asm, %function
+.align 2
+small_ntt_asm:
+  push {r4-r11, r14}
+  vpush.w {s16} // s16 is overwritten below (end pointer of the first loop), so it is saved here
+
+  poly        .req r0
+  twiddle_ptr .req r1
+  poly0       .req r2
+  poly1       .req r3
+  poly2       .req r4
+  poly3       .req r5
+  poly4       .req r6
+  poly5       .req r7
+  poly6       .req r8
+  poly7       .req r9
+  twiddle     .req r10
+  qinv        .req r11 // shares r11 with q: the montgomery macro reads qinv from the top halfword (smulbt)
+  q           .req r11 // shares r11 with qinv: the montgomery macro reads q from the bottom halfword (smlabb)
+  tmp         .req r12
+  tmp2        .req r14
+
+  movw q, #769 // r11[15:0] = 769
+  movt qinv, #767 // r11[31:16] = 767; 769 * 767 = 589823 = -1 mod 2^16
+
+  ### LAYER 7+6+5+4
+  .equ distance, 256
+  .equ offset, 32
+  .equ strincr, 4
+  // pre-load twiddle factors to FPU registers
+  vldm twiddle_ptr!, {s8-s15} // 8 words; twiddle_ptr is advanced by 32 bytes
+
+
+  add tmp, poly, #strincr*8 // end pointer: poly + 32 bytes, poly advances 4 bytes per iteration
+  vmov s16, tmp
+  1:
+    // load a1, a3, ..., a15
+    load poly, poly0, poly1, poly2, poly3, #offset, #distance/4+offset, #2*distance/4+offset, #3*distance/4+offset
+    load poly, poly4, poly5, poly6, poly7, #distance+offset, #5*distance/4+offset, #6*distance/4+offset, #7*distance/4+offset
+
+    // 8-NTT on a1, a3, ..., a15
+    _3_layer_double_CT_16_fp poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s8, s9, s10, s11, twiddle, qinv, q, tmp, tmp2
+
+    // multiply coeffs by layer 4 twiddles for later use
+    vmov twiddle, s12
+    mul_twiddle b, poly0, twiddle, tmp, tmp2, q, qinv
+    mul_twiddle t, poly1, twiddle, tmp, tmp2, q, qinv
+
+    vmov twiddle, s13
+    mul_twiddle b, poly2, twiddle, tmp, tmp2, q, qinv
+    mul_twiddle t, poly3, twiddle, tmp, tmp2, q, qinv
+
+    vmov twiddle, s14
+    mul_twiddle b, poly4, twiddle, tmp, tmp2, q, qinv
+    mul_twiddle t, poly5, twiddle, tmp, tmp2, q, qinv
+
+    vmov twiddle, s15
+    mul_twiddle b, poly6, twiddle, tmp, tmp2, q, qinv
+    mul_twiddle t, poly7, twiddle, tmp, tmp2, q, qinv
+
+    vmov s0, poly0 // a1
+    vmov s1, poly1 // a3
+    vmov s2, poly2 // a5
+    vmov s3, poly3 // a7
+    vmov s4, poly4 // a9
+    vmov s5, poly5 // a11
+    vmov s6, poly6 // a13
+    vmov s7, poly7 // a15
+
+    // ----------
+
+    // load a0, a2, ..., a14
+    load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4
+    load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
+
+    // 8-NTT on a0, a2, ..., a14
+    _3_layer_double_CT_16_fp poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s8, s9, s10, s11, twiddle, qinv, q, tmp, tmp2
+
+    // layer 4 - 1
+    // addsub: (a2, a6, a10, a14), (a3, a7, a11, a15)
+    vmov tmp2, s1 // load a3
+    vmov s1, poly0 // preserve a0
+    uadd16 poly0, poly1, tmp2
+    usub16 poly1, poly1, tmp2
+
+    vmov tmp2, s3 // load a7
+    vmov s3, poly2 // preserve a4
+    uadd16 poly2, poly3, tmp2
+    usub16 poly3, poly3, tmp2
+
+    vmov tmp2, s5 // load a11
+    vmov s5, poly4 // preserve a8
+    uadd16 poly4, poly5, tmp2
+    usub16 poly5, poly5, tmp2
+
+    vmov tmp2, s7 // load a15
+    vmov s7, poly6 // preserve a12
+    uadd16 poly6, poly7, tmp2
+    usub16 poly7, poly7, tmp2
+
+    str.w poly0, [poly, #1*distance/4]
+    str.w poly1, [poly, #1*distance/4+offset]
+    str.w poly2, [poly, #3*distance/4]
+    str.w poly3, [poly, #3*distance/4+offset]
+    str.w poly4, [poly, #5*distance/4]
+    str.w poly5, [poly, #5*distance/4+offset]
+    str.w poly6, [poly, #7*distance/4]
+    str.w poly7, [poly, #7*distance/4+offset]
+
+    // layer 4 - 2
+    // addsub: (a0, a4, a8, a12), (a1, a5, a9, a13)
+    vmov tmp2, s1 // load a0
+    vmov poly1, s0 // load a1
+    uadd16 poly0, tmp2, poly1
+    usub16 poly1, tmp2, poly1
+
+    vmov tmp2, s3 // load a4
+    vmov poly3, s2 // load a5
+    uadd16 poly2, tmp2, poly3
+    usub16 poly3, tmp2, poly3
+
+    vmov tmp2, s5 // load a8
+    vmov poly5, s4 // load a9
+    uadd16 poly4, tmp2, poly5
+    usub16 poly5, tmp2, poly5
+
+    vmov tmp2, s7 // load a12
+    vmov poly7, s6 // load a13
+    uadd16 poly6, tmp2, poly7
+    usub16 poly7, tmp2, poly7
+
+    str.w poly1, [poly, #offset]
+    str.w poly2, [poly, #2*distance/4]
+    str.w poly3, [poly, #2*distance/4+offset]
+    str.w poly4, [poly, #4*distance/4]
+    str.w poly5, [poly, #4*distance/4+offset]
+    str.w poly6, [poly, #6*distance/4]
+    str.w poly7, [poly, #6*distance/4+offset]
+    str.w poly0, [poly], #4 // stored last: the post-increment moves poly to the next word
+
+    vmov tmp, s16 // reload the end pointer
+    cmp.w poly, tmp
+  bne.w 1b
+
+  sub.w poly, #8*strincr // rewind the 32 bytes that the first loop advanced
+
+  ### LAYER 3+2+1
+
+  .equ distance, distance/16 // 256/16 = 16
+  .equ strincr, 32
+
+  add.w tmp, poly, #strincr*16 // end pointer: poly + 512 bytes, poly advances 32 bytes per iteration
+  vmov s13, tmp // s13 is free here: its twiddle was only read inside the first loop
+
+  2:
+    load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4
+    load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
+
+    _3_layer_double_CT_16 poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle, twiddle_ptr, qinv, q, tmp, tmp2
+
+    store poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
+    str.w poly1, [poly, #distance/4]
+    str.w poly2, [poly, #2*distance/4]
+    str.w poly3, [poly, #3*distance/4]
+    str.w poly0, [poly], #strincr // stored last: the post-increment moves poly to the next 32-byte group
+
+    vmov tmp, s13 // reload the end pointer
+    cmp.w poly, tmp
+  bne.w 2b
+
+  vpop.w {s16}
+  pop {r4-r11, pc}
+
+
+.unreq poly
+.unreq twiddle_ptr
+.unreq poly0
+.unreq poly1
+.unreq poly2
+.unreq poly3
+.unreq poly4
+.unreq poly5
+.unreq poly6
+.unreq poly7
+.unreq twiddle
+.unreq qinv
+.unreq q
+.unreq tmp
+.unreq tmp2
+
+// ########
+// ########
+// # INTT #
+// ########
+// ########
+
+.macro doublebutterfly_light a0, a1, tmp, tmp2, q, qinv // twiddle-free butterfly; tmp2, q and qinv are not referenced
+  uadd16 \tmp, \a0, \a1 // tmp = a0 + a1 on both halfwords
+  usub16 \a1, \a0, \a1 // a1 = a0 - a1 on both halfwords
+  mov.w \a0, \tmp // a0 = the sum computed first
+.endm
+
+.macro two_doublebutterfly_light a0, a1, a2, a3, tmp, tmp2, q, qinv // two twiddle-free butterflies back to back
+  doublebutterfly_light \a0, \a1, \tmp, \tmp2, \q, \qinv // pair (a0, a1)
+  doublebutterfly_light \a2, \a3, \tmp, \tmp2, \q, \qinv // pair (a2, a3)
+.endm
+
+.macro _3_layer_double_inv_CT_16_light c0, c1, c2, c3, c4, c5, c6, c7, xi0, xi12, xi34, xi56, twiddle, q, qinv, tmp, tmp2 // xi0 is not referenced in the body
+
+  // layer 1
+  sadd16.w \tmp, \c0, \c1 // c0, c1
+  ssub16.w \c1, \c0, \c1
+  sadd16.w \tmp2, \c2, \c3 // c2, c3
+  ssub16.w \c3, \c2, \c3
+
+  sadd16.w \c0, \c4, \c5 // c4, c5
+  ssub16.w \c5, \c4, \c5
+  sadd16.w \c2, \c6, \c7 // c6, c7
+  ssub16.w \c7, \c6, \c7
+  // c4, c6 are free at this point
+
+  // layer 2
+  sadd16.w \c6, \tmp, \tmp2 // c0, c2
+  ssub16.w \tmp2, \tmp, \tmp2
+  sadd16.w \c4, \c0, \c2 // c4, c6
+  ssub16.w \c2, \c0, \c2
+
+  vmov.w \twiddle, \xi12 // only the top halfword of xi12 is used in this macro
+  doublebutterfly t, \c1, \c3, \twiddle, \tmp, \c0, \q, \qinv // c0 has been used and c6 still free
+  doublebutterfly t, \c5, \c7, \twiddle, \tmp, \c0, \q, \qinv
+  // c0, c6 are free at this point
+
+  // layer 3
+  sadd16.w \c0, \c6, \c4 // c0, c4
+  ssub16.w \c4, \c6, \c4
+
+  vmov.w \twiddle, \xi34 // only the top halfword of xi34 is used
+  doublebutterfly t, \c1, \c5, \twiddle, \tmp, \c6, \q, \qinv
+
+  vmov.w \twiddle, \xi56 // bottom halfword for the inlined butterfly, top halfword for (c3, c7)
+  // this block is one doublebutterfly
+  smulbb \tmp, \c2, \twiddle // c2, c6
+  smultb \c2, \c2, \twiddle
+  montgomery_inplace \q, \qinv, \tmp, \c6
+  montgomery_inplace \q, \qinv, \c2, \c6
+  pkhtb \tmp, \c2, \tmp, asr #16
+  ssub16.w \c6, \tmp2, \tmp
+  sadd16.w \c2, \tmp2, \tmp
+
+  doublebutterfly t, \c3, \c7, \twiddle, \tmp, \tmp2, \q, \qinv
+
+.endm
+
+.macro _3_layer_double_inv_CT_16_light_reduce c0, c1, c2, c3, c4, c5, c6, c7, xi0, xi12, xi34, xi56, twiddle, q, qinv, tmp, tmp2
+
+  // layer 1
+  sadd16.w \tmp, \c0, \c1 // c0, c1
+  ssub16.w \c1, \c0, \c1
+  sadd16.w \tmp2, \c2, \c3 // c2, c3
+  ssub16.w \c3, \c2, \c3
+
+  sadd16.w \c0, \c4, \c5 // c4, c5
+  ssub16.w \c5, \c4, \c5
+  sadd16.w \c2, \c6, \c7 // c6, c7
+  ssub16.w \c7, \c6, \c7
+  // c4, c6 are free at this point
+
+  mov.w \c6, \tmp
+  mov.w \c4, \c0
+
+  // layer 2
+  vmov.w \twiddle, \xi12
+  doublebutterfly b, \c6, \tmp2, \twiddle, \tmp, \c0, \q, \qinv
+  doublebutterfly b, \c4, \c2, \twiddle, \tmp, \c0, \q, \qinv
+  doublebutterfly t, \c1, \c3, \twiddle, \tmp, \c0, \q, \qinv // c0 has been used and c6 still free
+  doublebutterfly t, \c5, \c7, \twiddle, \tmp, \c0, \q, \qinv
+  // c0, c6 are free at this point
+
+  // layer 3
+  sadd16.w \c0, \c6, \c4 // c0, c4
+  ssub16.w \c4, \c6, \c4
+
+  vmov.w \twiddle, \xi34
+  doublebutterfly t, \c1, \c5, \twiddle, \tmp, \c6, \q, \qinv
+
+  vmov.w \twiddle, \xi56
+  // this block is one doublebutterfly
+  smulbb \tmp, \c2, \twiddle // c2, c6
+  smultb \c2, \c2, \twiddle
+  montgomery_inplace \q, \qinv, \tmp, \c6
+  montgomery_inplace \q, \qinv, \c2, \c6
+  pkhtb \tmp, \c2, \tmp, asr #16
+  ssub16.w \c6, \tmp2, \tmp
+  sadd16.w \c2, \tmp2, \tmp
+
+  doublebutterfly t, \c3, \c7, \twiddle, \tmp, \tmp2, \q, \qinv
+
+.endm
+
+.macro _3_layer_double_inv_CT_16 c0, c1, c2, c3, c4, c5, c6, c7, twiddle, twiddle_ptr, Qprime, Q, tmp, tmp2
+    // layer 3
+    ldrh.w twiddle, [twiddle_ptr], #2
+    two_doublebutterfly b, b, \c0, \c1, \c2, \c3, \twiddle, \tmp, \tmp2, \Q, \Qprime
+    two_doublebutterfly b, b, \c4, \c5, \c6, \c7, \twiddle, \tmp, \tmp2, \Q, \Qprime
+
+    // layer 2
+    ldr.w twiddle, [twiddle_ptr], #4
+    two_doublebutterfly b, t, \c0, \c2, \c1, \c3, \twiddle, \tmp, \tmp2, \Q, \Qprime
+
+    two_doublebutterfly b, t, \c4, \c6, \c5, \c7, \twiddle, \tmp, \tmp2, \Q, \Qprime
+
+    // layer 1
+    ldr.w twiddle, [twiddle_ptr], #4
+    two_doublebutterfly b, t, \c0, \c4, \c1, \c5, \twiddle, \tmp, \tmp2, \Q, \Qprime
+
+    ldr.w twiddle, [twiddle_ptr], #4
+    two_doublebutterfly b, t, \c2, \c6, \c3, \c7, \twiddle, \tmp, \tmp2, \Q, \Qprime
+.endm
+
+.macro mul_twiddle_barrett_32 tb a, twiddle, Qbar, Q, tmp, tmp2
+    smulb\tb \tmp, \a, \twiddle
+    smmulr.w \tmp2, \tmp, \Qbar
+    mls.w \tmp, \tmp2, \Q, \tmp
+    smult\tb \a, \a, \twiddle
+    smmulr.w \tmp2, \a, \Qbar
+    mls.w \a, \tmp2, \Q, \a
+    pkhbt \a, \tmp, \a, lsl #16
+.endm
+
+.macro _3_layer_double_inv_twist_16 c0, c1, c2, c3, c4, c5, c6, c7, twiddle, twiddle_ptr, Qbar, Q, tmp, tmp2
+
+    movt \Q, #0
+
+    ldr.w \twiddle, [\twiddle_ptr], #4
+
+    mul_twiddle_barrett_32 b, \c0, \twiddle, \Qbar, \Q, \tmp, \tmp2
+    mul_twiddle_barrett_32 t, \c1, \twiddle, \Qbar, \Q, \tmp, \tmp2
+
+    ldr.w \twiddle, [\twiddle_ptr], #4
+
+    mul_twiddle_barrett_32 b, \c2, \twiddle, \Qbar, \Q, \tmp, \tmp2
+    mul_twiddle_barrett_32 t, \c3, \twiddle, \Qbar, \Q, \tmp, \tmp2
+
+    ldr.w \twiddle, [\twiddle_ptr], #4
+
+    mul_twiddle_barrett_32 b, \c4, \twiddle, \Qbar, \Q, \tmp, \tmp2
+    mul_twiddle_barrett_32 t, \c5, \twiddle, \Qbar, \Q, \tmp, \tmp2
+
+    ldr.w \twiddle, [\twiddle_ptr], #4
+
+    mul_twiddle_barrett_32 b, \c6, \twiddle, \Qbar, \Q, \tmp, \tmp2
+    mul_twiddle_barrett_32 t, \c7, \twiddle, \Qbar, \Q, \tmp, \tmp2
+
+    movt \Q, #767
+
+.endm
+
+.global small_invntt_tomont_asm
+.type small_invntt_tomont_asm, %function
+.align 2
+small_invntt_tomont_asm:
+  push {r4-r11, r14}
+
+  poly        .req r0
+  twiddle_ptr .req r1
+  poly0       .req r2
+  poly1       .req r3
+  poly2       .req r4
+  poly3       .req r5
+  poly4       .req r6
+  poly5       .req r7
+  poly6       .req r8
+  poly7       .req r9
+  twiddle     .req r10
+  qinv        .req r11
+  q           .req r11
+  tmp         .req r12
+  tmp2        .req r14
+
+  movw q, #769
+  movt qinv, #767
+
+  ### LAYER 7+6+5+4
+  .equ distance, 16
+  .equ offset, 32
+  .equ strincr, 64
+
+  // pre-load twiddle factors to FPU registers
+  vldm twiddle_ptr!, {s8-s15}
+
+  add.w tmp, poly, #8*strincr
+  vmov s8, tmp
+  1:
+    // load a1, a3, ..., a15
+    load poly, poly0, poly1, poly2, poly3, #offset, #distance/4+offset, #2*distance/4+offset, #3*distance/4+offset
+    load poly, poly4, poly5, poly6, poly7, #distance+offset, #5*distance/4+offset, #6*distance/4+offset, #7*distance/4+offset
+
+    // NTT on a1, a3, ..., a15
+    _3_layer_double_inv_CT_16_light poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s8, s9, s10, s11, twiddle, q, qinv, tmp, tmp2
+
+    // multiply coeffs by layer 4 twiddles for later use
+    vmov twiddle, s12
+    mul_twiddle b, poly0, twiddle, tmp, tmp2, q, qinv // could be omitted but kept for reduction only
+    mul_twiddle t, poly1, twiddle, tmp, tmp2, q, qinv
+
+    vmov twiddle, s13
+    mul_twiddle b, poly2, twiddle, tmp, tmp2, q, qinv
+    mul_twiddle t, poly3, twiddle, tmp, tmp2, q, qinv
+
+    vmov twiddle, s14
+    mul_twiddle b, poly4, twiddle, tmp, tmp2, q, qinv
+    mul_twiddle t, poly5, twiddle, tmp, tmp2, q, qinv
+
+    vmov twiddle, s15
+    mul_twiddle b, poly6, twiddle, tmp, tmp2, q, qinv
+    mul_twiddle t, poly7, twiddle, tmp, tmp2, q, qinv
+
+    vmov s0, poly0 // a1
+    vmov s1, poly1 // a3
+    vmov s2, poly2 // a5
+    vmov s3, poly3 // a7
+    vmov s4, poly4 // a9
+    vmov s5, poly5 // a11
+    vmov s6, poly6 // a13
+    vmov s7, poly7 // a15
+
+    // ----------
+
+    // load a0, a2, ..., a14
+    load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4
+    load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
+
+    // NTT on a0, a2, ..., a14
+    _3_layer_double_inv_CT_16_light poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s8, s9, s10, s11, twiddle, q, qinv, tmp, tmp2
+
+    // layer 4 - 1
+    // addsub: (a2, a6, a10, a14), (a3, a7, a11, a15)
+    vmov tmp2, s1 // load a3
+    vmov s1, poly0 // preserve a0
+    uadd16 poly0, poly1, tmp2
+    usub16 poly1, poly1, tmp2
+
+    vmov tmp2, s3 // load a7
+    vmov s3, poly2 // preserve a4
+    uadd16 poly2, poly3, tmp2
+    usub16 poly3, poly3, tmp2
+
+    vmov tmp2, s5 // load a11
+    vmov s5, poly4 // preserve a8
+    uadd16 poly4, poly5, tmp2
+    usub16 poly5, poly5, tmp2
+
+    vmov tmp2, s7 // load a15
+    vmov s7, poly6 // preserve a12
+    uadd16 poly6, poly7, tmp2
+    usub16 poly7, poly7, tmp2
+
+    str.w poly0, [poly, #1*distance/4]
+    str.w poly1, [poly, #1*distance/4+offset]
+    str.w poly2, [poly, #3*distance/4]
+    str.w poly3, [poly, #3*distance/4+offset]
+    str.w poly4, [poly, #5*distance/4]
+    str.w poly5, [poly, #5*distance/4+offset]
+    str.w poly6, [poly, #7*distance/4]
+    str.w poly7, [poly, #7*distance/4+offset]
+
+    // layer 4 - 2
+    // addsub: (a0, a4, a8, a12), (a1, a5, a9, a13)
+    vmov tmp2, s1 // load a0
+    vmov poly1, s0 // load a1
+    uadd16 poly0, tmp2, poly1
+    usub16 poly1, tmp2, poly1
+
+    vmov tmp2, s3 // load a4
+    vmov poly3, s2 // load a5
+    uadd16 poly2, tmp2, poly3
+    usub16 poly3, tmp2, poly3
+
+    vmov tmp2, s5 // load a8
+    vmov poly5, s4 // load a9
+    uadd16 poly4, tmp2, poly5
+    usub16 poly5, tmp2, poly5
+
+    vmov tmp2, s7 // load a12
+    vmov poly7, s6 // load a13
+    uadd16 poly6, tmp2, poly7
+    usub16 poly7, tmp2, poly7
+
+    str.w poly1, [poly, #offset]
+    str.w poly2, [poly, #2*distance/4]
+    str.w poly3, [poly, #2*distance/4+offset]
+    str.w poly4, [poly, #4*distance/4]
+    str.w poly5, [poly, #4*distance/4+offset]
+    str.w poly6, [poly, #6*distance/4]
+    str.w poly7, [poly, #6*distance/4+offset]
+    str.w poly0, [poly], #strincr // increase 2*8*4 = 64 (2 * 8 loads of 4 bytes each)
+
+    vmov tmp, s8
+    cmp.w poly, tmp
+  bne.w 1b
+
+  sub.w poly, #8*strincr
+
+  ### LAYER 3+2+1
+  .equ distance, distance*16
+  .equ strincr, 4
+
+  // ITER 0
+  load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4
+  load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
+
+  vldm twiddle_ptr!, {s5-s7}
+
+  _3_layer_double_inv_CT_16_light_reduce poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s5, s5, s6, s7, twiddle, q, qinv, tmp, tmp2
+
+  vmov.w s2, poly
+  movw poly, #:lower16:5585133
+  movt poly, #:upper16:5585133
+
+  // twisting
+  _3_layer_double_inv_twist_16 poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle, twiddle_ptr, poly, q, tmp, tmp2
+
+  vmov.w poly, s2
+
+  store poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
+  str.w poly1, [poly, #distance/4]
+  str.w poly2, [poly, #2*distance/4]
+  str.w poly3, [poly, #3*distance/4]
+  str.w poly0, [poly], #4
+
+  // ITER 1-12
+  add.w tmp, poly, #strincr*3*(3+1)
+  vmov s14, tmp
+  3:
+    add.w tmp, poly, #strincr*3
+    vmov s13, tmp
+    2:
+      // polys upto 6q
+      load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4
+      load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
+
+
+      _3_layer_double_inv_CT_16 poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle, twiddle_ptr, qinv, q, tmp, tmp2
+
+      vmov.w s2, poly
+      movw poly, #:lower16:5585133
+      movt poly, #:upper16:5585133
+
+      // twisting
+      _3_layer_double_inv_twist_16 poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle, twiddle_ptr, poly, q, tmp, tmp2
+
+      vmov.w poly, s2
+
+      store poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
+      str.w poly1, [poly, #distance/4]
+      str.w poly2, [poly, #2*distance/4]
+      str.w poly3, [poly, #3*distance/4]
+      str.w poly0, [poly], #4
+
+      vmov tmp, s13
+      cmp.w poly, tmp
+    bne.w 2b
+
+    // polys upto 9q
+    load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4
+    load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
+
+    _3_layer_double_inv_CT_16 poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle, twiddle_ptr, qinv, q, tmp, tmp2
+
+    vmov.w s2, poly
+    movw poly, #:lower16:5585133
+    movt poly, #:upper16:5585133
+
+    // twisting
+    _3_layer_double_inv_twist_16 poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle, twiddle_ptr, poly, q, tmp, tmp2
+
+    vmov.w poly, s2
+
+    store poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
+    str.w poly1, [poly, #distance/4]
+    str.w poly2, [poly, #2*distance/4]
+    str.w poly3, [poly, #3*distance/4]
+    str.w poly0, [poly], #4
+
+    vmov tmp, s14
+    cmp.w poly, tmp
+  bne.w 3b
+
+  // ITER 13-15
+  add tmp, poly, #3*strincr
+  vmov s13, tmp
+  2:
+    // polys upto 6q
+    load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4
+    load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
+
+    _3_layer_double_inv_CT_16 poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle, twiddle_ptr, qinv, q, tmp, tmp2
+
+    vmov.w s2, poly
+    movw poly, #:lower16:5585133
+    movt poly, #:upper16:5585133
+
+    // twisting
+    _3_layer_double_inv_twist_16 poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle, twiddle_ptr, poly, q, tmp, tmp2
+
+    vmov.w poly, s2
+
+    store poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
+    str.w poly1, [poly, #distance/4]
+    str.w poly2, [poly, #2*distance/4]
+    str.w poly3, [poly, #3*distance/4]
+    str.w poly0, [poly], #strincr
+
+    vmov tmp, s13
+    cmp.w poly, tmp
+  bne.w 2b
+
+  pop {r4-r11, pc}
+
+.unreq poly
+.unreq twiddle_ptr
+.unreq poly0
+.unreq poly1
+.unreq poly2
+.unreq poly3
+.unreq poly4
+.unreq poly5
+.unreq poly6
+.unreq poly7
+.unreq twiddle
+.unreq qinv
+.unreq q
+.unreq tmp
+.unreq tmp2
+
+.align 2
+.global small_pointmul_asm
+.type small_pointmul_asm, %function
+small_pointmul_asm:
+    push.w {r4-r11, lr}
+
+    movw r14, #769
+    movt r14, #767
+
+    .equ width, 4
+
+    add.w r12, r2, #64*2
+    _point_mul_16_loop:
+
+    ldr.w r7, [r1, #2*width]
+    ldr.w r8, [r1, #3*width]
+    ldrsh.w r9, [r2, #1*2]
+    ldr.w r5, [r1, #1*width]
+    ldr.w r4, [r1], #4*width
+    ldrsh.w r6, [r2], #2*2
+
+    smultb r10, r4, r6
+    montgomery r14, r14, r10, r11
+    pkhbt r4, r4, r11
+
+
+    neg.w r6, r6
+
+    smultb r10, r5, r6
+    montgomery r14, r14, r10, r11
+    pkhbt r5, r5, r11
+
+    str.w r5, [r0, #1*width]
+    str.w r4, [r0], #2*width
+
+    smultb r10, r7, r9
+    montgomery r14, r14, r10, r11
+    pkhbt r7, r7, r11
+
+    neg.w r9, r9
+
+    smultb r10, r8, r9
+    montgomery r14, r14, r10, r11
+    pkhbt r8, r8, r11
+
+    str.w r8, [r0, #1*width]
+    str.w r7, [r0], #2*width
+
+    cmp.w r2, r12
+    bne.w _point_mul_16_loop
+
+    pop.w {r4-r11, pc}
+
+  .align 2
+.global small_asymmetric_mul_asm
+.type small_asymmetric_mul_asm, %function
+small_asymmetric_mul_asm:
+    push.w {r4-r11, lr}
+
+    movw r14, #769
+    movt r14, #767
+    .equ width, 4
+    add.w r12, r0, #256*2
+    _asymmetric_mul_16_loop:
+    ldr.w r7, [r1, #width]
+    ldr.w r4, [r1], #2*width
+    ldr.w r8, [r2, #width]
+    ldr.w r5, [r2], #2*width
+    ldr.w r9, [r3, #width]
+    ldr.w r6, [r3], #2*width
+
+    smuad r10, r4, r6
+    montgomery r14, r14, r10, r6
+    smuadx r11, r4, r5
+    montgomery r14, r14, r11, r10
+
+    pkhtb r10, r10, r6, asr#16
+
+    str.w r10, [r0], #width
+
+    smuad r10, r7, r9
+    montgomery r14, r14, r10, r6
+    smuadx r11, r7, r8
+    montgomery r14, r14, r11, r10
+
+    pkhtb r10, r10, r6, asr#16
+    str.w r10, [r0], #width
+
+
+    cmp.w r0, r12
+    bne.w _asymmetric_mul_16_loop
+
+    pop.w {r4-r11, pc}
\ No newline at end of file
diff --git a/crypto_sign/dilithium3/m4fstack/smallntt.h b/crypto_sign/dilithium3/m4fstack/smallntt.h
new file mode 100644
index 00000000..0aa0ce9b
--- /dev/null
+++ b/crypto_sign/dilithium3/m4fstack/smallntt.h
@@ -0,0 +1,53 @@
+#ifndef SMALLNTT_H
+#define SMALLNTT_H
+
+#include <stdint.h>
+#include "params.h"
+
+static const int16_t zetas[64] = {
+-23, 112, -151, -134, -52, -148, 227, 232,
+-71, 212, 236, 21, 341, 379, -202, -220,
+352, 292, 238, 145, 194, -276, 70, -274,
+117, 333, 66, 247, -237, -83, -252, -244,
+331, -241, 167, 357, -355, 291, -358, 105, -115, -209, 14, 99, -260, 29, 366, -378, -318, 278, 353, 354, -184, 127, 330, -303, 222, -78, -348, -44, 201, 158, 350, 168
+};
+
+static const int16_t zetas_asm[128] = {
+0, -164, -81, 361, 186, -3, -250, -120, -308, 129, -16, -223, -362, -143, 131, -337,
+-76, 147, -114, -23, 112, -151, -134,
+-98, -272, 54, -52, -148, 227, 232,
+36, -2, -124, -71, 212, 236, 21,
+-75, -80, -346, 341, 379, -202, -220,
+-339, 86, -51, 352, 292, 238, 145,
+-255, 364, 267, 194, -276, 70, -274,
+282, 161, -15, 117, 333, 66, 247,
+-203, 288, 169, -237, -83, -252, -244,
+-34, 191, 307, 331, -241, 167, 357,
+199, -50, -24, -355, 291, -358, 105,
+178, -170, 226, -115, -209, 14, 99,
+270, 121, -188, -260, 29, 366, -378,
+-10, -380, 279, -318, 278, 353, 354,
+149, 180, -375, -184, 127, 330, -303,
+369, -157, 263, 222, -78, -348, -44,
+-192, -128, -246, 201, 158, 350, 168
+};
+
+static const int16_t zetas_inv_CT_asm[256] = {
+0, 171, 171, 164, 171, -361, 164, 81, 171, 120, -361, 3, 164, 250, 81, -186,
+171, 164, 171, -361, 164, 81, -257, 49, -141, -18, -215, 38, 283, 347, 337, 192, -369, 246, -263, 128, 157, 239, -264, 179, 301, -207, 219, -332, -206, 120, 337, -131, 192, -149, -369, 10, 62, 57, 40, 136, 1, 311, -173, 27, 223, 203, -282, -169, 15, -288, -161, 74, -56, 271, -309, 26, -373, 116, -67, -361, 120, 250, 337, 143, -131, 362, -383, 82, 125, -344, -93, 299, -60, -204, 143, -270, -178, 188, -226, -121, 170, 39, -175, 174, 284, -111, 84, -22, 79, 3, 223, 16, 203, 255, -282, 339, 245, 64, -90, -306, 190, -123, 197, -253, -129, 75, -36, 346, 124, 80, 2, 218, 126, -33, -266, 326, -122, -261, 343, 164, -361, 81, 120, 3, 250, -186, 285, 200, -89, 5, 17, -96, 135, -310, -131, -149, 10, 375, -279, -180, 380, -280, -183, -7, 130, -327, -189, -335, -370, 250, 143, 362, -270, -199, -178, 34, -359, -144, -182, 304, -43, -300, -251, 377, 16, 255, 339, -267, 51, -364, -86, -106, 101, -118, 214, -349, -110, -374, -195, 81, 3, -186, 223, -129, 16, 308, 320, 319, 8, 181, 154, 216, 273, 313, 362, -199, 34, 24, -307, 50, -191, -139, -165, 208, 92, 159, 233, 177, -321, -186, -129, 308, 75, 98, -36, 76, 231, 324, 25, 85, 289, -94, -12, 113, 308, 98, 76, -54, 114, 272, -147, -146, -35, -119, -97, -176, -137, -312, -138,
+};
+
+
+#define SMALL_Q 769
+
+void small_ntt_asm(int16_t a[N], const int16_t * zetas);
+void small_invntt_tomont_asm(int16_t a[N], const int16_t * zetas);
+void small_pointmul_asm(int16_t out[N], const int16_t in[N], const int16_t *zetas);
+void small_asymmetric_mul_asm(int16_t c[256], const int16_t a[256], const int16_t b[256], const int16_t b_prime[256]);
+
+#define small_ntt(a) small_ntt_asm(a, zetas_asm)
+#define small_invntt_tomont(a) small_invntt_tomont_asm(a, zetas_inv_CT_asm)
+#define small_point_mul(out, in) small_pointmul_asm(out, in, zetas)
+#define small_asymmetric_mul(c, a, b, b_prime) small_asymmetric_mul_asm(c, a, b, b_prime)
+
+#endif
diff --git a/crypto_sign/dilithium3/m4fstack/smallpoly.c b/crypto_sign/dilithium3/m4fstack/smallpoly.c
new file mode 100644
index 00000000..9e1f6c85
--- /dev/null
+++ b/crypto_sign/dilithium3/m4fstack/smallpoly.c
@@ -0,0 +1,84 @@
+#include "smallpoly.h"
+#include "smallntt.h"
+
+void poly_small_ntt_precomp(smallpoly *out, smallhalfpoly *out2, poly *in) {
+  for (int i = 0; i < N; i++)
+  {
+    out->coeffs[i] = in->coeffs[i];
+  }
+  small_ntt(out->coeffs);
+  small_point_mul(out2->coeffs, out->coeffs);
+}
+
+
+void polyvecl_small_ntt(smallpoly v[L]) {
+  unsigned int i;
+
+  for(i = 0; i < L; ++i)
+    small_ntt(v[i].coeffs);
+}
+
+
+void polyveck_small_ntt(smallpoly v[K]) {
+  unsigned int i;
+
+  for(i = 0; i < K; ++i)
+    small_ntt(v[i].coeffs);
+}
+
+
+
+void poly_small_basemul_invntt(poly *r, const smallpoly *a, const smallhalfpoly *aprime, const smallpoly *b){
+    // re-use the storage of r as scratch space; NOTE(review): r is accessed through a smallpoly pointer here (type punning) - confirm this is safe with the build's aliasing flags
+    smallpoly *tmp = (smallpoly *)r;
+    small_asymmetric_mul(tmp->coeffs, b->coeffs, a->coeffs, aprime->coeffs);
+    small_invntt_tomont(tmp->coeffs);
+
+    #ifdef SMALL_POLY_16_BIT
+    int j;
+    // tmp aliases r, so copy from the highest index down; assuming poly coefficients are wider than int16_t, no unread tmp entry is overwritten
+    for(j=N-1;j>=0;j--){
+        r->coeffs[j] = tmp->coeffs[j];
+    }
+    #endif
+}
+
+void polyvecl_small_basemul_invntt(polyvecl *r, const smallpoly *a, const smallhalfpoly *aprime, const smallpoly b[L]){
+    unsigned int i;
+    for(i=0;i<L;i++){
+        poly_small_basemul_invntt(&r->vec[i], a, aprime, &b[i]);
+    }
+}
+
+void small_polyeta_unpack(smallpoly *r, const uint8_t *a) {
+  unsigned int i;
+
+#if ETA == 2
+  for(i = 0; i < N/8; ++i) {
+    r->coeffs[8*i+0] =  (a[3*i+0] >> 0) & 7;
+    r->coeffs[8*i+1] =  (a[3*i+0] >> 3) & 7;
+    r->coeffs[8*i+2] = ((a[3*i+0] >> 6) | (a[3*i+1] << 2)) & 7;
+    r->coeffs[8*i+3] =  (a[3*i+1] >> 1) & 7;
+    r->coeffs[8*i+4] =  (a[3*i+1] >> 4) & 7;
+    r->coeffs[8*i+5] = ((a[3*i+1] >> 7) | (a[3*i+2] << 1)) & 7;
+    r->coeffs[8*i+6] =  (a[3*i+2] >> 2) & 7;
+    r->coeffs[8*i+7] =  (a[3*i+2] >> 5) & 7;
+
+    r->coeffs[8*i+0] = ETA - r->coeffs[8*i+0];
+    r->coeffs[8*i+1] = ETA - r->coeffs[8*i+1];
+    r->coeffs[8*i+2] = ETA - r->coeffs[8*i+2];
+    r->coeffs[8*i+3] = ETA - r->coeffs[8*i+3];
+    r->coeffs[8*i+4] = ETA - r->coeffs[8*i+4];
+    r->coeffs[8*i+5] = ETA - r->coeffs[8*i+5];
+    r->coeffs[8*i+6] = ETA - r->coeffs[8*i+6];
+    r->coeffs[8*i+7] = ETA - r->coeffs[8*i+7];
+  }
+#elif ETA == 4
+  for(i = 0; i < N/2; ++i) {
+    r->coeffs[2*i+0] = a[i] & 0x0F;
+    r->coeffs[2*i+1] = a[i] >> 4;
+    r->coeffs[2*i+0] = ETA - r->coeffs[2*i+0];
+    r->coeffs[2*i+1] = ETA - r->coeffs[2*i+1];
+  }
+#endif
+}
diff --git a/crypto_sign/dilithium3/m4fstack/smallpoly.h b/crypto_sign/dilithium3/m4fstack/smallpoly.h
new file mode 100644
index 00000000..caa26261
--- /dev/null
+++ b/crypto_sign/dilithium3/m4fstack/smallpoly.h
@@ -0,0 +1,39 @@
+#ifndef SMALLPOLY_H
+#define SMALLPOLY_H
+#include "params.h"
+#include "poly.h"
+#include "polyvec.h"
+
+
+
+#if DILITHIUM_MODE == 3 // use q=769
+#define SMALL_POLY_16_BIT
+typedef struct {
+    int16_t coeffs[N];
+} smallpoly;
+
+typedef smallpoly smallhalfpoly;
+
+#else // use q=257
+#define SMALL_POLY_32_BIT
+typedef struct {
+    int32_t coeffs[N];
+} smallpoly;
+
+typedef struct {
+    int16_t coeffs[N];
+} smallhalfpoly;
+#endif
+
+
+void poly_small_ntt_precomp(smallpoly *out, smallhalfpoly *out2, poly *in);
+void polyvecl_small_ntt(smallpoly v[L]);
+void polyveck_small_ntt(smallpoly v[K]);
+
+
+void polyvecl_small_basemul_invntt(polyvecl *r, const smallpoly *a, const smallhalfpoly *aprime, const smallpoly b[L]);
+void poly_small_basemul_invntt(poly *r, const smallpoly *a, const smallhalfpoly *aprime, const smallpoly *b);
+
+void small_polyeta_unpack(smallpoly *r, const uint8_t *a);
+
+#endif
\ No newline at end of file
diff --git a/crypto_sign/dilithium3/m4fstack/symmetric-shake.c b/crypto_sign/dilithium3/m4fstack/symmetric-shake.c
new file mode 100644
index 00000000..963f6498
--- /dev/null
+++ b/crypto_sign/dilithium3/m4fstack/symmetric-shake.c
@@ -0,0 +1,28 @@
+#include <stdint.h>
+#include "params.h"
+#include "symmetric.h"
+#include "fips202.h"
+
+void dilithium_shake128_stream_init(shake128incctx *state, const uint8_t seed[SEEDBYTES], uint16_t nonce)
+{
+  uint8_t t[2];
+  t[0] = nonce;
+  t[1] = nonce >> 8;
+
+  shake128_inc_init(state);
+  shake128_inc_absorb(state, seed, SEEDBYTES);
+  shake128_inc_absorb(state, t, 2);
+  shake128_inc_finalize(state);
+}
+
+void dilithium_shake256_stream_init(shake256incctx *state, const uint8_t seed[CRHBYTES], uint16_t nonce)
+{
+  uint8_t t[2];
+  t[0] = nonce;
+  t[1] = nonce >> 8;
+
+  shake256_inc_init(state);
+  shake256_inc_absorb(state, seed, CRHBYTES);
+  shake256_inc_absorb(state, t, 2);
+  shake256_inc_finalize(state);
+}
diff --git a/crypto_sign/dilithium3/m4fstack/symmetric.h b/crypto_sign/dilithium3/m4fstack/symmetric.h
new file mode 100644
index 00000000..47037377
--- /dev/null
+++ b/crypto_sign/dilithium3/m4fstack/symmetric.h
@@ -0,0 +1,65 @@
+#ifndef SYMMETRIC_H
+#define SYMMETRIC_H
+
+#include <stdint.h>
+#include "params.h"
+
+#ifdef DILITHIUM_USE_AES
+
+#include "aes256ctr.h"
+#include "fips202.h"
+
+typedef aes256ctr_ctx stream128_state;
+typedef aes256ctr_ctx stream256_state;
+
+#define dilithium_aes256ctr_init DILITHIUM_NAMESPACE(dilithium_aes256ctr_init)
+void dilithium_aes256ctr_init(aes256ctr_ctx *state,
+                              const uint8_t key[32],
+                              uint16_t nonce);
+
+#define STREAM128_BLOCKBYTES AES256CTR_BLOCKBYTES
+#define STREAM256_BLOCKBYTES AES256CTR_BLOCKBYTES
+
+#define stream128_init(STATE, SEED, NONCE) \
+        dilithium_aes256ctr_init(STATE, SEED, NONCE)
+#define stream128_squeezeblocks(OUT, OUTBLOCKS, STATE) \
+        aes256ctr_squeezeblocks(OUT, OUTBLOCKS, STATE)
+#define stream256_init(STATE, SEED, NONCE) \
+        dilithium_aes256ctr_init(STATE, SEED, NONCE)
+#define stream256_squeezeblocks(OUT, OUTBLOCKS, STATE) \
+        aes256ctr_squeezeblocks(OUT, OUTBLOCKS, STATE)
+
+#else
+
+#include "fips202.h"
+typedef shake128incctx stream128_state;
+typedef shake256incctx stream256_state;
+
+#define shake256_inc_squeezeblocks(OUT, OUTBLOCKS, STATE) \
+        shake256_inc_squeeze(OUT, (OUTBLOCKS)*SHAKE256_RATE, STATE) /* block count -> byte count */
+
+#define dilithium_shake128_stream_init DILITHIUM_NAMESPACE(dilithium_shake128_stream_init)
+void dilithium_shake128_stream_init(stream128_state *state,
+                                    const uint8_t seed[SEEDBYTES],
+                                    uint16_t nonce);
+
+#define dilithium_shake256_stream_init DILITHIUM_NAMESPACE(dilithium_shake256_stream_init)
+void dilithium_shake256_stream_init(stream256_state *state,
+                                    const uint8_t seed[CRHBYTES],
+                                    uint16_t nonce);
+
+#define STREAM128_BLOCKBYTES SHAKE128_RATE
+#define STREAM256_BLOCKBYTES SHAKE256_RATE
+
+#define stream128_init(STATE, SEED, NONCE) \
+        dilithium_shake128_stream_init(STATE, SEED, NONCE)
+#define stream128_squeezeblocks(OUT, OUTBLOCKS, STATE) \
+        shake128_inc_squeeze(OUT, (OUTBLOCKS)*SHAKE128_RATE, STATE) /* block count -> byte count */
+#define stream256_init(STATE, SEED, NONCE) \
+        dilithium_shake256_stream_init(STATE, SEED, NONCE)
+#define stream256_squeezeblocks(OUT, OUTBLOCKS, STATE) \
+        shake256_inc_squeeze(OUT, (OUTBLOCKS)*SHAKE256_RATE, STATE) /* block count -> byte count */
+
+#endif
+
+#endif
diff --git a/crypto_sign/dilithium3/m4fstack/vector.h b/crypto_sign/dilithium3/m4fstack/vector.h
new file mode 100644
index 00000000..e5c5dda3
--- /dev/null
+++ b/crypto_sign/dilithium3/m4fstack/vector.h
@@ -0,0 +1,20 @@
+#ifndef VECTOR_H
+#define VECTOR_H
+
+#include <stdint.h>
+#include "params.h"
+
+#define asm_reduce32 DILITHIUM_NAMESPACE(asm_reduce32)
+void asm_reduce32(int32_t a[N]);
+#define small_asm_reduce32_central DILITHIUM_NAMESPACE(small_asm_reduce32_central)
+void small_asm_reduce32_central(int32_t a[N]);
+#define asm_caddq DILITHIUM_NAMESPACE(asm_caddq)
+void asm_caddq(int32_t a[N]);
+#define asm_freeze DILITHIUM_NAMESPACE(asm_freeze)
+void asm_freeze(int32_t a[N]);
+#define asm_rej_uniform DILITHIUM_NAMESPACE(asm_rej_uniform)
+unsigned int asm_rej_uniform(int32_t *a,
+                         unsigned int len,
+                         const unsigned char *buf,
+                         unsigned int buflen);
+#endif
diff --git a/crypto_sign/dilithium3/m4fstack/vector.s b/crypto_sign/dilithium3/m4fstack/vector.s
new file mode 100644
index 00000000..559f11b0
--- /dev/null
+++ b/crypto_sign/dilithium3/m4fstack/vector.s
@@ -0,0 +1,210 @@
+.syntax unified
+.thumb
+.macro redq a, tmp, q
+    add     \tmp, \a,  #4194304
+    asrs    \tmp, \tmp, #23
+    mls     \a, \tmp, \q, \a
+.endm
+
+// void asm_reduce32(int32_t a[N]);
+.global pqcrystals_dilithium_asm_reduce32
+.type pqcrystals_dilithium_asm_reduce32, %function
+.align 2
+pqcrystals_dilithium_asm_reduce32:
+    push {r4-r10}
+
+    movw r12,#:lower16:8380417
+    movt r12,#:upper16:8380417
+    movw r10, #32
+    1:
+        ldr.w r1, [r0]
+        ldr.w r2, [r0, #1*4]
+        ldr.w r3, [r0, #2*4]
+        ldr.w r4, [r0, #3*4]
+        ldr.w r5, [r0, #4*4]
+        ldr.w r6, [r0, #5*4]
+        ldr.w r7, [r0, #6*4]
+        ldr.w r8, [r0, #7*4]
+
+        redq r1, r9, r12
+        redq r2, r9, r12
+        redq r3, r9, r12
+        redq r4, r9, r12
+        redq r5, r9, r12
+        redq r6, r9, r12
+        redq r7, r9, r12
+        redq r8, r9, r12
+
+        str.w r2, [r0, #1*4]
+        str.w r3, [r0, #2*4]
+        str.w r4, [r0, #3*4]
+        str.w r5, [r0, #4*4]
+        str.w r6, [r0, #5*4]
+        str.w r7, [r0, #6*4]
+        str.w r8, [r0, #7*4]
+        str r1, [r0], #8*4
+        subs r10, #1
+        bne.w 1b
+
+    pop {r4-r10}
+    bx lr
+.size pqcrystals_dilithium_asm_reduce32, .-pqcrystals_dilithium_asm_reduce32
+
+.macro barrett_32 a, Qbar, Q, tmp
+    smmulr.w \tmp, \a, \Qbar
+    mls.w \a, \tmp, \Q, \a
+.endm
+
+// INPUT: target (signed), Q (signed modulus); both compares are signed because target may be negative
+// OUTPUT: target moved towards [-Q/2, Q/2] by at most one subtraction and one addition of Q
+.macro central_reduce target, Q
+  cmp \target, \Q, lsr #1
+  it gt
+  subgt \target, \Q
+  cmn \target, \Q, lsr #1
+  it lt
+  addlt \target, \Q
+.endm
+
+// void small_asm_reduce32_central(int32_t a[N]);
+.global pqcrystals_dilithium_small_asm_reduce32_central
+.type pqcrystals_dilithium_small_asm_reduce32_central, %function
+.align 2
+pqcrystals_dilithium_small_asm_reduce32_central:
+    push {r4-r12, lr}
+
+
+    movw r9, #:lower16:5585133
+    movt r9, #:upper16:5585133
+    mov.w r10,#769
+
+    movw r12, #32
+    1:
+        ldr.w r1, [r0]
+        ldr.w r2, [r0, #1*4]
+        ldr.w r3, [r0, #2*4]
+        ldr.w r4, [r0, #3*4]
+        ldr.w r5, [r0, #4*4]
+        ldr.w r6, [r0, #5*4]
+        ldr.w r7, [r0, #6*4]
+        ldr.w r8, [r0, #7*4]
+
+        barrett_32 r1, r9, r10, r11
+        barrett_32 r2, r9, r10, r11
+        barrett_32 r3, r9, r10, r11
+        barrett_32 r4, r9, r10, r11
+        barrett_32 r5, r9, r10, r11
+        barrett_32 r6, r9, r10, r11
+        barrett_32 r7, r9, r10, r11
+        barrett_32 r8, r9, r10, r11
+
+
+        str.w r2, [r0, #1*4]
+        str.w r3, [r0, #2*4]
+        str.w r4, [r0, #3*4]
+        str.w r5, [r0, #4*4]
+        str.w r6, [r0, #5*4]
+        str.w r7, [r0, #6*4]
+        str.w r8, [r0, #7*4]
+        str r1, [r0], #8*4
+        subs r12, #1
+        bne.w 1b
+
+    pop {r4-r12, pc}
+
+.size pqcrystals_dilithium_small_asm_reduce32_central, .-pqcrystals_dilithium_small_asm_reduce32_central
+
+.macro caddq a, tmp, q
+    and     \tmp, \q, \a, asr #31
+    add     \a, \a, \tmp
+.endm
+
+.macro freezeq a, tmp, q
+    redq \a, \tmp, \q
+    caddq \a, \tmp, \q
+.endm
+
+// void asm_caddq(int32_t a[N]);
+.global pqcrystals_dilithium_asm_caddq
+.type pqcrystals_dilithium_asm_caddq, %function
+.align 2
+pqcrystals_dilithium_asm_caddq:
+    push {r4-r10}
+
+    movw r12,#:lower16:8380417
+    movt r12,#:upper16:8380417
+
+    movw r10, #32
+    1:
+        ldr.w r1, [r0]
+        ldr.w r2, [r0, #1*4]
+        ldr.w r3, [r0, #2*4]
+        ldr.w r4, [r0, #3*4]
+        ldr.w r5, [r0, #4*4]
+        ldr.w r6, [r0, #5*4]
+        ldr.w r7, [r0, #6*4]
+        ldr.w r8, [r0, #7*4]
+
+        caddq r1, r9, r12
+        caddq r2, r9, r12
+        caddq r3, r9, r12
+        caddq r4, r9, r12
+        caddq r5, r9, r12
+        caddq r6, r9, r12
+        caddq r7, r9, r12
+        caddq r8, r9, r12
+
+        str.w r2, [r0, #1*4]
+        str.w r3, [r0, #2*4]
+        str.w r4, [r0, #3*4]
+        str.w r5, [r0, #4*4]
+        str.w r6, [r0, #5*4]
+        str.w r7, [r0, #6*4]
+        str.w r8, [r0, #7*4]
+        str r1, [r0], #8*4
+        subs r10, #1
+        bne.w 1b
+
+    pop {r4-r10}
+    bx lr
+.size pqcrystals_dilithium_asm_caddq, .-pqcrystals_dilithium_asm_caddq
+
+
+// unsigned int asm_rej_uniform(int32_t *a, unsigned int len, const unsigned char *buf, unsigned int buflen); returns the number of coefficients written to a
+.global pqcrystals_dilithium_asm_rej_uniform
+.type pqcrystals_dilithium_asm_rej_uniform, %function
+.align 2
+pqcrystals_dilithium_asm_rej_uniform:
+    push.w {r4-r6}
+    push.w {r1}
+    // Store Q-1 in r12 (largest candidate that is accepted).
+    movw r12,#:lower16:8380416
+    movt r12,#:upper16:8380416
+
+    add.w r6, r0, r1, lsl #2
+    add.w r3, r2, r3
+    sub.w r3, r3, #2
+
+1:
+    // If fewer than 3 bytes are available, return. NOTE(review): the ldr below loads 4 bytes while advancing by 3, so the last group reads 1 byte past buf+buflen.
+    cmp.w r3, r2
+    ble.w end
+
+    ldr r5, [r2], #3
+    ubfx r5, r5, #0, #23
+
+    cmp.n r5, r12
+    it le
+    strle r5, [r0], #4
+
+    cmp.n r0, r6
+    bne.n 1b
+
+end:
+    pop.w {r5}
+
+    sub.w r0, r6, r0
+    sub.w r0, r5, r0, lsr #2
+    pop.w {r4-r6}
+    bx lr
+.size pqcrystals_dilithium_asm_rej_uniform, .-pqcrystals_dilithium_asm_rej_uniform

From 80c9e07ff1882f5b773c40d4fadceb790f767b2a Mon Sep 17 00:00:00 2001
From: Amin Abdulrahman <amin@abdulrahman.de>
Date: Fri, 15 Mar 2024 15:40:26 +0100
Subject: [PATCH 02/32] Start stack optimization [Passing] * Based on ideas
 from https://eprint.iacr.org/2022/323.pdf, based on code by Matthias J.
 Kannwischer * Sample A on-the-fly * Compressed c * Schoolbook mul for ct1

---
 crypto_sign/dilithium3/m4fstack/reduce.h    |  50 +++
 crypto_sign/dilithium3/m4fstack/sign.c      |  35 +-
 crypto_sign/dilithium3/m4fstack/smallpoly.c |   2 +-
 crypto_sign/dilithium3/m4fstack/stack.c     | 404 ++++++++++++++++++++
 crypto_sign/dilithium3/m4fstack/stack.h     |  40 ++
 5 files changed, 517 insertions(+), 14 deletions(-)
 create mode 100644 crypto_sign/dilithium3/m4fstack/stack.c
 create mode 100644 crypto_sign/dilithium3/m4fstack/stack.h

diff --git a/crypto_sign/dilithium3/m4fstack/reduce.h b/crypto_sign/dilithium3/m4fstack/reduce.h
index 02df5500..5990918a 100644
--- a/crypto_sign/dilithium3/m4fstack/reduce.h
+++ b/crypto_sign/dilithium3/m4fstack/reduce.h
@@ -26,4 +26,54 @@ static inline int32_t montgomery_reduce(int64_t a) {
   return t;
 }
 
+/*************************************************
+* Name:        reduce32
+*
+* Description: For finite field element a with a <= 2^{31} - 2^{22} - 1,
+*              compute r \equiv a (mod Q) such that -6283009 <= r <= 6283007.
+*
+* Arguments:   - int32_t: finite field element a
+*
+* Returns r.
+**************************************************/
+static inline int32_t reduce32(int32_t a) {
+  int32_t t;
+
+  t = (a + (1 << 22)) >> 23; /* rounded quotient a / 2^23 */
+  t = a - t*Q;               /* subtract that multiple of Q */
+  return t;
+}
+
+/*************************************************
+* Name:        caddq
+*
+* Description: Add Q if input coefficient is negative.
+*
+* Arguments:   - int32_t: finite field element a
+*
+* Returns r.
+**************************************************/
+static inline int32_t caddq(int32_t a) {
+  a += (a >> 31) & Q; /* a >> 31 serves as an all-ones mask for negative a (arithmetic shift assumed) */
+  return a;
+}
+
+/*************************************************
+* Name:        freeze
+*
+* Description: For finite field element a, compute standard
+*              representative r = a mod^+ Q.
+*
+* Arguments:   - int32_t: finite field element a
+*
+* Returns r.
+**************************************************/
+static inline int32_t freeze(int32_t a) {
+  a = reduce32(a); /* bring a into the range documented for reduce32 */
+  a = caddq(a);    /* then add Q if the reduced value is negative */
+  return a;
+}
+
+
+
 #endif
diff --git a/crypto_sign/dilithium3/m4fstack/sign.c b/crypto_sign/dilithium3/m4fstack/sign.c
index 04bec45c..eaecb29f 100644
--- a/crypto_sign/dilithium3/m4fstack/sign.c
+++ b/crypto_sign/dilithium3/m4fstack/sign.c
@@ -7,6 +7,7 @@
 #include "randombytes.h"
 #include "symmetric.h"
 #include "smallpoly.h"
+#include "stack.h"
 
 /*************************************************
 * Name:        crypto_sign_keypair
@@ -88,9 +89,11 @@ int crypto_sign_signature(uint8_t *sig,
   uint8_t *rho, *tr, *key, *mu, *rhoprime, *rnd;
   uint16_t nonce = 0;
   unsigned int n;
-  polyvecl mat[K], y, z;
-  polyveck t0, w1, w0;
+  polyvecl y, z;
+  polyveck w1, w0;
   poly cp;
+  uint8_t ccomp[68];
+  poly matel;
   shake256incctx state;
 
   smallpoly s1_prime[L];
@@ -104,7 +107,7 @@ int crypto_sign_signature(uint8_t *sig,
   rnd = key + SEEDBYTES;
   mu = rnd + RNDBYTES;
   rhoprime = mu + CRHBYTES;
-  unpack_sk(rho, tr, key, &t0, s1_prime, s2_prime, sk);
+  unpack_sk_stack(rho, tr, key, s1_prime, s2_prime, sk);
 
   /* Compute mu = CRH(tr, msg) */
   shake256_inc_init(&state);
@@ -118,13 +121,10 @@ int crypto_sign_signature(uint8_t *sig,
   }
   shake256(rhoprime, CRHBYTES, key, SEEDBYTES + RNDBYTES + CRHBYTES);
 
-  /* Expand matrix and transform vectors */
-  polyvec_matrix_expand(mat, rho);
+  /* Transform vectors */
   polyvecl_small_ntt(s1_prime);
   polyveck_small_ntt(s2_prime);
 
-  polyveck_ntt(&t0);
-
 rej:
   /* Sample intermediate vector y */
   polyvecl_uniform_gamma1(&y, rhoprime, nonce++);
@@ -132,7 +132,16 @@ int crypto_sign_signature(uint8_t *sig,
   /* Matrix-vector multiplication */
   z = y;
   polyvecl_ntt(&z);
-  polyvec_matrix_pointwise_montgomery(&w1, mat, &z);
+  
+  for (size_t k_idx = 0; k_idx < K; k_idx++) {
+      poly_uniform(&matel, rho, (k_idx << 8) + 0);
+      poly_pointwise_montgomery(&w1.vec[k_idx],  &matel, &z.vec[0]);
+      for (size_t l_idx = 1; l_idx < L; l_idx++) {
+        poly_uniform(&matel, rho, (k_idx << 8) + l_idx);
+        poly_pointwise_acc_montgomery(&w1.vec[k_idx],  &matel, &z.vec[l_idx]);
+      }
+  }
+
   polyveck_reduce(&w1);
   polyveck_invntt_tomont(&w1);
 
@@ -147,9 +156,10 @@ int crypto_sign_signature(uint8_t *sig,
   shake256_inc_finalize(&state);
   shake256_inc_squeeze(sig, CTILDEBYTES, &state);
   poly_challenge(&cp, sig);
+
+  poly_challenge_compress(ccomp, &cp);
   
   poly_small_ntt_precomp(&cp_small, &cp_small_prime, &cp);
-  poly_ntt(&cp);
 
   /* Compute z, reject if it reveals secret */
   polyvecl_small_basemul_invntt(&z, &cp_small, &cp_small_prime, s1_prime);
@@ -175,11 +185,10 @@ int crypto_sign_signature(uint8_t *sig,
     if(poly_chknorm(&w0.vec[i], GAMMA2 - BETA))
       goto rej;
 
-    /* Compute hints for w1 */
-    poly_pointwise_montgomery(tmp, &cp, &t0.vec[i]);
+    poly_schoolbook(tmp, ccomp, sk + SEEDBYTES + TRBYTES + SEEDBYTES +
+      L*POLYETA_PACKEDBYTES + K*POLYETA_PACKEDBYTES + i*POLYT0_PACKEDBYTES);
 
-    poly_invntt_tomont(tmp);
-    poly_reduce(tmp);
+    /* Compute hints for w1 */
 
     if(poly_chknorm(tmp, GAMMA2))
       goto rej;
diff --git a/crypto_sign/dilithium3/m4fstack/smallpoly.c b/crypto_sign/dilithium3/m4fstack/smallpoly.c
index 9e1f6c85..1f7fab17 100644
--- a/crypto_sign/dilithium3/m4fstack/smallpoly.c
+++ b/crypto_sign/dilithium3/m4fstack/smallpoly.c
@@ -2,7 +2,7 @@
 #include "smallntt.h"
 
 void poly_small_ntt_precomp(smallpoly *out, smallhalfpoly *out2, poly *in) {
-  for (int i = 0; i < N; i++)
+  for (int i = N - 1; i >= 0; i--) /* start at the last valid index; i = N would access coeffs[N] out of bounds */
   {
     out->coeffs[i] = in->coeffs[i];
   }
diff --git a/crypto_sign/dilithium3/m4fstack/stack.c b/crypto_sign/dilithium3/m4fstack/stack.c
new file mode 100644
index 00000000..2beb0f46
--- /dev/null
+++ b/crypto_sign/dilithium3/m4fstack/stack.c
@@ -0,0 +1,404 @@
+#include "stack.h"
+#include "fips202.h"
+#include "symmetric.h"
+#include "vector.h"
+#include "reduce.h"
+
+void poly_challenge_compress(uint8_t c[68], const poly *cp){
+  unsigned int i, pos;
+  uint64_t signs;
+  uint64_t mask;
+  /* Encode cp into c: indices of nonzero coefficients from c[0] on, sign bits in c[60..67] */
+  for(i=0;i<68;i++) c[i] = 0;
+  signs = 0;
+  mask = 1;
+  pos = 0;
+  for(i = 0; i < N; ++i){
+    if(cp->coeffs[i] != 0){
+      c[pos++] = i; /* position of the next nonzero coefficient, in ascending order */
+      if(cp->coeffs[i] == -1){
+        signs |= mask; /* bit k of signs is set when the k-th nonzero coefficient is -1 */
+      }
+      mask <<= 1;
+    }
+  }
+  /* Store the 64-bit sign word, least significant byte first */
+  for (i = 0; i < 8; ++i) {
+    c[60+i] = (unsigned char) (signs >> 8 * i);
+  }
+}
+
+void poly_challenge_decompress(poly *cp, const uint8_t c[68]){ /* rebuild cp from the layout written by poly_challenge_compress */
+  unsigned int i;
+  unsigned pos;
+  uint64_t signs = 0;
+  for(i = 0; i < N; i++) cp->coeffs[i] = 0;
+  for(i = 0; i < 8; i++) { /* reassemble the sign word from c[60..67] */
+    signs |= ((uint64_t)c[60+i]) << (8*i);
+  }
+
+  for(i = 0; i < TAU; i++){ /* exactly TAU index bytes are consumed */
+    pos = c[i];
+    if(signs & 1){
+      cp->coeffs[pos] = -1;
+    } else {
+      cp->coeffs[pos] = 1;
+    }
+    signs >>= 1; /* advance to the next sign bit */
+  }
+}
+
+/* Unpack only coefficient idx of a packed t0 polynomial: extract its 13-bit field, return 2^(D-1) - field. */
+// TODO: buffer at most 8 coeffs at once
+static inline int32_t polyt0_unpack_idx(const uint8_t *t0, unsigned idx){
+    int32_t coeff;
+    // 8 coefficients are packed in 13 bytes
+    t0 += 13*(idx >> 3);
+    // each branch gathers the bytes overlapping the (idx % 8)-th 13-bit field; excess high bits are masked off below
+    if(idx % 8 == 0){
+        coeff  = t0[0];
+        coeff |= (uint32_t)t0[1] << 8;
+    } else if(idx % 8 == 1){
+        coeff  = t0[1] >> 5;
+        coeff |= (uint32_t)t0[2] << 3;
+        coeff |= (uint32_t)t0[3] << 11;
+    } else if(idx % 8 == 2){
+        coeff  = t0[3] >> 2;
+        coeff |= (uint32_t)t0[4] << 6;
+    } else if(idx % 8 == 3){
+        coeff  = t0[4] >> 7;
+        coeff |= (uint32_t)t0[5] << 1;
+        coeff |= (uint32_t)t0[6] << 9;
+    } else if(idx % 8 == 4){
+        coeff  = t0[6] >> 4;
+        coeff |= (uint32_t)t0[7] << 4;
+        coeff |= (uint32_t)t0[8] << 12;
+    } else if(idx % 8 == 5){
+        coeff  = t0[8] >> 1;
+        coeff |= (uint32_t)t0[9] << 7;
+    } else if(idx % 8 == 6){
+        coeff  = t0[9] >> 6;
+        coeff |= (uint32_t)t0[10] << 2;
+        coeff |= (uint32_t)t0[11] << 10;
+    } else if(idx % 8 == 7){
+        coeff  = t0[11] >> 3;
+        coeff |= (uint32_t)t0[12] << 5;
+    }
+    coeff &= 0x1FFF;
+    return (1 << (D-1)) - coeff;
+}
+
+void poly_schoolbook(poly *c, const uint8_t ccomp[68], const uint8_t *t0){ /* c = (compressed challenge) * t0 in Z[X]/(X^N+1), unreduced */
+  unsigned i,j,idx;
+  uint64_t signs = 0;
+  for(i = 0; i < N; i++) c->coeffs[i] = 0;
+  for(i = 0; i < 8; i++) { /* sign word lives in ccomp[60..67] */
+    signs |= ((uint64_t)ccomp[60+i]) << (8*i);
+  }
+
+  for(idx = 0; idx < TAU; idx++){
+    i = ccomp[idx]; /* exponent of the idx-th challenge monomial */
+    if(!(signs & 1)){ /* sign bit clear: add X^i * t0 */
+        for(j = 0; i+j < N; j++){
+            c->coeffs[i+j] += polyt0_unpack_idx(t0, j);
+        }
+        for(j = N-i; j<N; j++){ /* terms wrapping past X^N enter with flipped sign */
+            c->coeffs[i+j-N] -= polyt0_unpack_idx(t0, j);
+        }
+    } else { /* sign bit set: subtract X^i * t0 */
+        for(j = 0; i+j < N; j++){
+            c->coeffs[i+j] -= polyt0_unpack_idx(t0, j);
+        }
+        for(j = N-i; j<N; j++){
+            c->coeffs[i+j-N] += polyt0_unpack_idx(t0, j);
+        }
+    }
+
+    signs >>= 1;
+  }
+}
+
+/* Reduce w in place (poly_reduce, then poly_caddq) and write each coefficient to buf as 3 bytes, low byte first. */
+void polyw_pack(uint8_t buf[3*256], poly *w){
+  poly_reduce(w);
+  poly_caddq(w);
+  unsigned int i;
+  for(i = 0; i < N; i++){
+    buf[i*3 + 0] = w->coeffs[i];
+    buf[i*3 + 1] = w->coeffs[i] >> 8;
+    buf[i*3 + 2] = w->coeffs[i] >> 16;
+  }
+}
+
+void polyw_unpack(poly *w, const uint8_t buf[3*256]) { /* read N coefficients from buf, 3 bytes each, low byte first */
+  unsigned int i;
+  for(i = 0; i < N; i++){
+    w->coeffs[i] =  buf[i*3 + 0];
+    w->coeffs[i] |= (int32_t)buf[i*3 + 1] << 8;
+    w->coeffs[i] |= (int32_t)buf[i*3 + 2] << 16;
+  }
+}
+
+/* Add a to the i-th 3-byte coefficient stored in buf and write freeze(sum) back to the same 3 bytes. */
+static void polyw_add_idx(uint8_t buf[3*256], int32_t a, size_t i){
+  int32_t coeff;
+  coeff =  buf[i*3 + 0];
+  coeff |= (int32_t)buf[i*3 + 1] << 8;
+  coeff |= (int32_t)buf[i*3 + 2] << 16;
+
+  coeff += a;
+
+  coeff = freeze(coeff);
+
+  buf[i*3 + 0] = coeff;
+  buf[i*3 + 1] = coeff >> 8;
+  buf[i*3 + 2] = coeff >> 16;
+}
+
+void polyw_add(uint8_t buf[3*256], poly *p){ /* buf[i] += p[i] for all N packed coefficients, via polyw_add_idx */
+  unsigned int i;
+  for(i = 0; i < N; i++){
+    polyw_add_idx(buf, p->coeffs[i], i);
+  }
+}
+void polyw_sub(poly* c, uint8_t buf[3*256], poly *a){
+  int32_t coeff;
+  /* c[i] = (24-bit value unpacked from buf at index i) - a[i]; the result is not reduced */
+
+  for(size_t i=0;i<N;i++){
+    coeff =  buf[i*3 + 0];
+    coeff |= (int32_t)buf[i*3 + 1] << 8;
+    coeff |= (int32_t)buf[i*3 + 2] << 16;
+
+    c->coeffs[i] = coeff - a->coeffs[i];
+  }
+}
+
+static int32_t highbits(int32_t a){
+  int32_t a1;
+  /* Returns the high part a1 of a. NOTE(review): presumably mirrors the reference decompose() and expects 0 <= a < Q - confirm */
+  a1  = (a + 127) >> 7;
+#if GAMMA2 == (Q-1)/32
+  a1  = (a1*1025 + (1 << 21)) >> 22;
+  a1 &= 15;
+#elif GAMMA2 == (Q-1)/88
+  a1  = (a1*11275 + (1 << 23)) >> 24;
+  a1 ^= ((43 - a1) >> 31) & a1;
+#endif
+
+  return a1;
+}
+
+void poly_highbits(poly *a1, const poly *a) {
+  unsigned int i;
+  /* Apply highbits() to each of the N coefficients of a */
+  for(i = 0; i < N; ++i)
+    a1->coeffs[i] = highbits(a->coeffs[i]);
+}
+
+static int32_t lowbits(int32_t a){
+  int32_t a1;
+  int32_t a0;
+  /* First compute the high part a1 with the same steps as highbits(), then return the remainder a0 */
+  a1  = (a + 127) >> 7;
+#if GAMMA2 == (Q-1)/32
+  a1  = (a1*1025 + (1 << 21)) >> 22;
+  a1 &= 15;
+#elif GAMMA2 == (Q-1)/88
+  a1  = (a1*11275 + (1 << 23)) >> 24;
+  a1 ^= ((43 - a1) >> 31) & a1;
+#endif
+
+  a0  = a - a1*2*GAMMA2;
+  a0 -= (((Q-1)/2 - a0) >> 31) & Q; /* subtract Q when a0 > (Q-1)/2 (arithmetic shift assumed) */
+  return a0;
+}
+
+void poly_lowbits(poly *a0, const poly *a){
+  unsigned int i;
+  /* Apply lowbits() to each of the N coefficients of a */
+  for(i = 0; i < N; ++i)
+    a0->coeffs[i] = lowbits(a->coeffs[i]);
+}
+
+void unpack_sk_s1(smallpoly *a, const uint8_t *sk, size_t idx) {
+  small_polyeta_unpack(a, sk + 3*SEEDBYTES + idx*POLYETA_PACKEDBYTES);
+}
+void unpack_sk_s2(smallpoly *a, const uint8_t *sk, size_t idx) {
+  small_polyeta_unpack(a, sk + 3*SEEDBYTES + L*POLYETA_PACKEDBYTES + idx*POLYETA_PACKEDBYTES);
+}
+
+
+// TODO: in the end increase this buffer size as far as possible
+#define POLY_UNIFORM_BUFFERSIZE 3
+void poly_uniform_pointwise_montgomery_polywadd_stack(uint8_t wcomp[3*N], poly *b, uint8_t seed[SEEDBYTES], uint16_t nonce, shake128incctx *state){
+  int32_t t;
+  uint8_t buf[POLY_UNIFORM_BUFFERSIZE*3];
+  {
+    size_t ctr = 0;
+    stream128_init(state, seed, nonce);
+
+    do {
+      shake128_inc_squeeze(buf, sizeof buf, state);
+
+      for(size_t pos=0; pos < sizeof buf && ctr < N; pos += 3){
+        t  = buf[pos];
+        t |= (uint32_t)buf[pos+1] << 8;
+        t |= (uint32_t)buf[pos+2] << 16;
+        t &= 0x7FFFFF;
+
+        if(t < Q) {
+          t = montgomery_reduce((int64_t)t * b->coeffs[ctr]);
+          polyw_add_idx(wcomp, t, ctr);
+          ctr++;
+        }
+      }
+    } while(ctr < N);
+
+  }
+}
+
+#define POLY_UNIFORM_GAMMA1_BUFFERSIZE 1
+#if GAMMA1 == (1 << 17)
+#define POLY_UNIFORM_GAMMA1_BUFFERSIZE_COEFFS (POLY_UNIFORM_GAMMA1_BUFFERSIZE*4)
+#define POLY_UNIFORM_GAMMA1_BUFFERSIZE_BYTES  (POLY_UNIFORM_GAMMA1_BUFFERSIZE*9)
+#elif GAMMA1 == (1 << 19)
+#define POLY_UNIFORM_GAMMA1_BUFFERSIZE_COEFFS (POLY_UNIFORM_GAMMA1_BUFFERSIZE*2)
+#define POLY_UNIFORM_GAMMA1_BUFFERSIZE_BYTES  (POLY_UNIFORM_GAMMA1_BUFFERSIZE*5)
+#endif
+/* Unpack POLY_UNIFORM_GAMMA1_BUFFERSIZE groups of packed z-coefficients in place: r holds the packed bytes on entry and GAMMA1 - value per coefficient on return. */
+static void polyz_unpack_inplace(int32_t *r){
+  uint8_t *a = (uint8_t *)r;
+  /* a aliases r byte-wise: the packed input occupies the leading bytes of the output buffer */
+  unsigned int i,j;
+  #if GAMMA1 == (1 << 17)
+  for(j = 0; j < POLY_UNIFORM_GAMMA1_BUFFERSIZE; ++j) {
+    i = POLY_UNIFORM_GAMMA1_BUFFERSIZE-1-j;
+    int32_t t0;
+    /* Groups and coefficients are processed from last to first so that each */
+    /* 32-bit store only overwrites input bytes that are no longer needed. */
+    r[4*i+3]  = a[9*i+6] >> 6;
+    r[4*i+3] |= (uint32_t)a[9*i+7] << 2;
+    r[4*i+3] |= (uint32_t)a[9*i+8] << 10;
+    r[4*i+3] &= 0x3FFFF;
+
+    r[4*i+2]   = a[9*i+4] >> 4;
+    r[4*i+2]  |= (uint32_t)a[9*i+5] << 4;
+    r[4*i+2]  |= (uint32_t)a[9*i+6] << 12;
+    r[4*i+2]  &= 0x3FFFF;
+
+    /* a[9*i+4] must be read first: the store to r[4*i+1] overwrites that byte */
+    r[4*i+1] = (uint32_t)a[9*i+4] << 14;
+    r[4*i+1] |= a[9*i+2] >> 2;
+    r[4*i+1] |= (uint32_t)a[9*i+3] << 6;
+    r[4*i+1] &= 0x3FFFF;
+
+    t0  = a[9*i+0];
+    t0 |= (uint32_t)a[9*i+1] << 8;
+    t0 |= (uint32_t)a[9*i+2] << 16;
+    t0 &= 0x3FFFF;
+
+    r[4*i+0] = GAMMA1 - t0;
+    r[4*i+1] = GAMMA1 - r[4*i+1];
+    r[4*i+2] = GAMMA1 - r[4*i+2];
+    r[4*i+3] = GAMMA1 - r[4*i+3];
+
+  }
+#elif GAMMA1 == (1 << 19)
+  for(j = 0; j < POLY_UNIFORM_GAMMA1_BUFFERSIZE; ++j) {
+    i = POLY_UNIFORM_GAMMA1_BUFFERSIZE-1-j;
+    int32_t tmp0, tmp1;
+    /* Both 20-bit fields are read into temporaries before any store to r */
+    tmp0  = a[5*i+2] >> 4;
+    tmp0 |= (uint32_t)a[5*i+3] << 4;
+    tmp0 |= (uint32_t)a[5*i+4] << 12;
+    tmp0 &= 0xFFFFF;
+
+    tmp1  = a[5*i+0];
+    tmp1 |= (uint32_t)a[5*i+1] << 8;
+    tmp1 |= (uint32_t)a[5*i+2] << 16;
+    tmp1 &= 0xFFFFF;
+    /* tmp1 is the first packed field (bytes 0..2), tmp0 the second: store them in packing order */
+    r[2*i+0] = GAMMA1 - tmp1;
+    r[2*i+1] = GAMMA1 - tmp0;
+  }
+#endif
+}
+
+/* a = b + (polynomial expanded from seed and nonce), produced POLY_UNIFORM_GAMMA1_BUFFERSIZE_COEFFS coefficients at a time. */
+void poly_uniform_gamma1_add_stack(poly *a, poly *b, const uint8_t seed[CRHBYTES], uint16_t nonce, shake256incctx *state){
+  int32_t buf[POLY_UNIFORM_GAMMA1_BUFFERSIZE_COEFFS];
+
+  stream256_init(state, seed, nonce);
+  for(size_t i = 0; i < N/POLY_UNIFORM_GAMMA1_BUFFERSIZE_COEFFS; i++){
+    shake256_inc_squeeze((uint8_t *)buf, POLY_UNIFORM_GAMMA1_BUFFERSIZE_BYTES, state);
+    polyz_unpack_inplace(buf);
+    /* buf now holds unpacked coefficients; add them to the matching slice of b */
+    for(size_t j = 0; j < POLY_UNIFORM_GAMMA1_BUFFERSIZE_COEFFS; j++){
+      a->coeffs[i*POLY_UNIFORM_GAMMA1_BUFFERSIZE_COEFFS + j] = buf[j] + b->coeffs[i*POLY_UNIFORM_GAMMA1_BUFFERSIZE_COEFFS + j];
+    }
+  }
+}
+
+/* Return 1 if highbits(r) differs from highbits(r+z), otherwise 0. */
+static inline int32_t make_hint(int32_t z, int32_t r){
+  int32_t r1, v1;
+
+  r1 = highbits(r);
+  v1 = highbits(r+z);
+
+  if(r1 != v1) return 1;
+  return 0;
+}
+
+size_t poly_make_hint_stack(poly *a, poly *t, uint8_t w[768]){ /* writes hint bits to a, returns how many are 1 */
+  int32_t coeff;
+  size_t hints_n = 0;
+  for(size_t i=0;i<N;i++){
+    // unpack coeff from w (contains w - cs2)
+    coeff =  w[i*3 + 0];
+    coeff |= (int32_t)w[i*3 + 1] << 8;
+    coeff |= (int32_t)w[i*3 + 2] << 16;
+
+
+    // compute w - cs2 + c*t0
+    coeff  = coeff + t->coeffs[i];
+    // hint bit is 1 iff highbits(coeff) != highbits(coeff - t[i])
+    a->coeffs[i] = make_hint(-t->coeffs[i], coeff);
+    if(a->coeffs[i] == 1){
+      hints_n++;
+    }
+  }
+  return hints_n;
+}
+
+// TODO: remove this later
+void unpack_sk_stack(uint8_t rho[SEEDBYTES],
+               uint8_t tr[TRBYTES],
+               uint8_t key[SEEDBYTES],
+               smallpoly s1[L],
+               smallpoly s2[K],
+               const uint8_t sk[CRYPTO_SECRETKEYBYTES])
+{
+  unsigned int i;
+
+  for(i = 0; i < SEEDBYTES; ++i)
+    rho[i] = sk[i];
+  sk += SEEDBYTES;
+
+  for(i = 0; i < SEEDBYTES; ++i)
+    key[i] = sk[i];
+  sk += SEEDBYTES;
+
+  for(i = 0; i < TRBYTES; ++i)
+    tr[i] = sk[i];
+  sk += TRBYTES;
+
+  for(i=0; i < L; ++i)
+    small_polyeta_unpack(&s1[i], sk + i*POLYETA_PACKEDBYTES);
+  sk += L*POLYETA_PACKEDBYTES;
+
+  for(i=0; i < K; ++i)
+    small_polyeta_unpack(&s2[i], sk + i*POLYETA_PACKEDBYTES);
+  sk += K*POLYETA_PACKEDBYTES;
+}
diff --git a/crypto_sign/dilithium3/m4fstack/stack.h b/crypto_sign/dilithium3/m4fstack/stack.h
new file mode 100644
index 00000000..9d36b105
--- /dev/null
+++ b/crypto_sign/dilithium3/m4fstack/stack.h
@@ -0,0 +1,40 @@
+#ifndef STACK_H
+#define STACK_H
+
+#include "poly.h"
+#include "smallpoly.h"
+#include <stdint.h>
+#include <stddef.h>
+#include "fips202.h"
+
+void poly_challenge_compress(uint8_t c[68], const poly *cp);
+void poly_challenge_decompress(poly *cp, const uint8_t c[68]);
+
+
+void poly_schoolbook(poly *c, const uint8_t ccomp[68], const uint8_t *t0);
+void polyw_pack(uint8_t buf[3*256], poly *w);
+void polyw_unpack(poly *w, const uint8_t buf[3*256]);
+
+void polyw_add(uint8_t buf[3*256], poly *p);
+void polyw_sub(poly* c, uint8_t buf[3*256], poly *a);
+
+void poly_highbits(poly *a1, const poly *a);
+void poly_lowbits(poly *a0, const poly *a);
+
+void unpack_sk_s1(smallpoly *a, const uint8_t *sk, size_t idx);
+void unpack_sk_s2(smallpoly *a, const uint8_t *sk, size_t idx);
+
+
+void poly_uniform_pointwise_montgomery_polywadd_stack(uint8_t wcomp[3*N], poly *b, uint8_t  seed[SEEDBYTES], uint16_t nonce, shake128incctx *state);
+void poly_uniform_gamma1_add_stack(poly *a, poly *b, const uint8_t seed[CRHBYTES], uint16_t nonce, shake256incctx *state);
+
+size_t poly_make_hint_stack(poly *a, poly *t, uint8_t w[768]);
+
+// TODO: replace this with individual functions later
+void unpack_sk_stack(uint8_t rho[SEEDBYTES],
+               uint8_t tr[TRBYTES],
+               uint8_t key[SEEDBYTES],
+               smallpoly s1[L],
+               smallpoly s2[K],
+               const uint8_t sk[CRYPTO_SECRETKEYBYTES]);
+#endif
\ No newline at end of file

From 5c5b86829cec268c440d318ee941370abc121137 Mon Sep 17 00:00:00 2001
From: Amin Abdulrahman <amin@abdulrahman.de>
Date: Fri, 15 Mar 2024 16:41:09 +0100
Subject: [PATCH 03/32] Compress w

---
 crypto_sign/dilithium3/m4fstack/sign.c  | 69 +++++++++++++++----------
 crypto_sign/dilithium3/m4fstack/stack.c | 22 ++++++++
 crypto_sign/dilithium3/m4fstack/stack.h |  1 +
 3 files changed, 65 insertions(+), 27 deletions(-)

diff --git a/crypto_sign/dilithium3/m4fstack/sign.c b/crypto_sign/dilithium3/m4fstack/sign.c
index eaecb29f..edfc6a81 100644
--- a/crypto_sign/dilithium3/m4fstack/sign.c
+++ b/crypto_sign/dilithium3/m4fstack/sign.c
@@ -90,10 +90,10 @@ int crypto_sign_signature(uint8_t *sig,
   uint16_t nonce = 0;
   unsigned int n;
   polyvecl y, z;
-  polyveck w1, w0;
+  uint8_t wcomp[K][768];
   poly cp;
   uint8_t ccomp[68];
-  poly matel;
+  poly tmp0, tmp1;
   shake256incctx state;
 
   smallpoly s1_prime[L];
@@ -133,22 +133,27 @@ int crypto_sign_signature(uint8_t *sig,
   z = y;
   polyvecl_ntt(&z);
   
-  for (size_t k_idx = 0; k_idx < K; k_idx++) {
-      poly_uniform(&matel, rho, (k_idx << 8) + 0);
-      poly_pointwise_montgomery(&w1.vec[k_idx],  &matel, &z.vec[0]);
-      for (size_t l_idx = 1; l_idx < L; l_idx++) {
-        poly_uniform(&matel, rho, (k_idx << 8) + l_idx);
-        poly_pointwise_acc_montgomery(&w1.vec[k_idx],  &matel, &z.vec[l_idx]);
+    for (size_t k_idx = 0; k_idx < K; k_idx++) {
+      for(size_t i=0;i<768;i++){
+        wcomp[k_idx][i] = 0;
       }
-  }
 
-  polyveck_reduce(&w1);
-  polyveck_invntt_tomont(&w1);
 
-  /* Decompose w and call the random oracle */
-  polyveck_caddq(&w1);
-  polyveck_decompose(&w1, &w0, &w1);
-  polyveck_pack_w1(sig, &w1);
+      for (size_t l_idx = 0; l_idx < L; l_idx++) {
+        poly_uniform(&tmp0, rho, (k_idx << 8) + l_idx);
+        poly_pointwise_montgomery(&tmp0,  &tmp0, &z.vec[l_idx]);
+        polyw_add(wcomp[k_idx], &tmp0);
+      }
+
+      polyw_unpack(&tmp0, wcomp[k_idx]);
+      poly_invntt_tomont(&tmp0);
+      poly_caddq(&tmp0);
+
+      polyw_pack(wcomp[k_idx], &tmp0);
+
+      poly_decompose_w1(&tmp0, &tmp0);
+      polyw1_pack(&sig[k_idx*POLYW1_PACKEDBYTES], &tmp0);
+  }
 
   shake256_inc_init(&state);
   shake256_inc_absorb(&state, mu, CRHBYTES);
@@ -176,28 +181,38 @@ int crypto_sign_signature(uint8_t *sig,
   unsigned int hints_written = 0;
   /* Check that subtracting cs2 does not change high bits of w and low bits
    * do not reveal secret information */
-  for(unsigned int i = 0; i < K; ++i) {
-    poly *tmp = &z.vec[0];
-    poly_small_basemul_invntt(tmp, &cp_small, &cp_small_prime, &s2_prime[i]);
+  
+  for(unsigned int k_idx = 0; k_idx < K; ++k_idx) {
+    polyw_unpack(&tmp0, wcomp[k_idx]);
+    poly_decompose(&tmp1, &tmp0, &tmp0);
 
-    poly_sub(&w0.vec[i], &w0.vec[i], tmp);
-    poly_reduce(&w0.vec[i]);
-    if(poly_chknorm(&w0.vec[i], GAMMA2 - BETA))
+    poly_small_basemul_invntt(&tmp1, &cp_small, &cp_small_prime, &s2_prime[k_idx]);
+
+    poly_sub(&tmp0, &tmp0, &tmp1);
+    poly_reduce(&tmp0);
+    if(poly_chknorm(&tmp0, GAMMA2 - BETA))
       goto rej;
 
-    poly_schoolbook(tmp, ccomp, sk + SEEDBYTES + TRBYTES + SEEDBYTES +
-      L*POLYETA_PACKEDBYTES + K*POLYETA_PACKEDBYTES + i*POLYT0_PACKEDBYTES);
+    poly_schoolbook(&tmp1, ccomp, sk + SEEDBYTES + TRBYTES + SEEDBYTES +
+      L*POLYETA_PACKEDBYTES + K*POLYETA_PACKEDBYTES + k_idx*POLYT0_PACKEDBYTES);
 
     /* Compute hints for w1 */
 
-    if(poly_chknorm(tmp, GAMMA2))
+    if(poly_chknorm(&tmp1, GAMMA2))
       goto rej;
-    poly_add(&w0.vec[i], &w0.vec[i], tmp);
-    hint_n += poly_make_hint(tmp, &w0.vec[i], &w1.vec[i]);
+
+    poly_add(&tmp0, &tmp0, &tmp1);
+
+
+    polyw_unpack(&tmp1, wcomp[k_idx]);
+    poly_decompose_w1(&tmp1, &tmp1);
+
+
+    hint_n += poly_make_hint(&tmp1, &tmp0, &tmp1);
     if (hint_n > OMEGA) {
       goto rej;
     }
-    pack_sig_h(sig, tmp, i, &hints_written);
+    pack_sig_h(sig, &tmp1, k_idx, &hints_written);
   }
   pack_sig_h_zero(sig, &hints_written);
   *siglen = CRYPTO_BYTES;
diff --git a/crypto_sign/dilithium3/m4fstack/stack.c b/crypto_sign/dilithium3/m4fstack/stack.c
index 2beb0f46..d7d59f1c 100644
--- a/crypto_sign/dilithium3/m4fstack/stack.c
+++ b/crypto_sign/dilithium3/m4fstack/stack.c
@@ -402,3 +402,25 @@ void unpack_sk_stack(uint8_t rho[SEEDBYTES],
     small_polyeta_unpack(&s2[i], sk + i*POLYETA_PACKEDBYTES);
   sk += K*POLYETA_PACKEDBYTES;
 }
+
+static int32_t decompose_w1(int32_t a){
+  int32_t a1;
+  /* Same computation as highbits() in this file: returns only the high part a1 of a */
+  a1  = (a + 127) >> 7;
+#if GAMMA2 == (Q-1)/32
+  a1  = (a1*1025 + (1 << 21)) >> 22;
+  a1 &= 15;
+#elif GAMMA2 == (Q-1)/88
+  a1  = (a1*11275 + (1 << 23)) >> 24;
+  a1 ^= ((43 - a1) >> 31) & a1;
+#endif
+
+  return a1;
+}
+
+void poly_decompose_w1(poly *a1, const poly *a) {
+  unsigned int i;
+  /* Apply decompose_w1() to each of the N coefficients; a1 may alias a, as each element is read before it is written */
+  for(i = 0; i < N; ++i)
+    a1->coeffs[i] = decompose_w1(a->coeffs[i]);
+}
diff --git a/crypto_sign/dilithium3/m4fstack/stack.h b/crypto_sign/dilithium3/m4fstack/stack.h
index 9d36b105..c9ddbe61 100644
--- a/crypto_sign/dilithium3/m4fstack/stack.h
+++ b/crypto_sign/dilithium3/m4fstack/stack.h
@@ -37,4 +37,5 @@ void unpack_sk_stack(uint8_t rho[SEEDBYTES],
                smallpoly s1[L],
                smallpoly s2[K],
                const uint8_t sk[CRYPTO_SECRETKEYBYTES]);
+void poly_decompose_w1(poly *a1, const poly *a);
 #endif
\ No newline at end of file

From 926e957d2fe8e11635ea55650ab3c384d597f8b6 Mon Sep 17 00:00:00 2001
From: Amin Abdulrahman <amin@abdulrahman.de>
Date: Fri, 15 Mar 2024 16:50:42 +0100
Subject: [PATCH 04/32] Eliminate z, y

---
 crypto_sign/dilithium3/m4fstack/sign.c | 61 ++++++++++++++------------
 1 file changed, 32 insertions(+), 29 deletions(-)

diff --git a/crypto_sign/dilithium3/m4fstack/sign.c b/crypto_sign/dilithium3/m4fstack/sign.c
index edfc6a81..515e1419 100644
--- a/crypto_sign/dilithium3/m4fstack/sign.c
+++ b/crypto_sign/dilithium3/m4fstack/sign.c
@@ -89,7 +89,6 @@ int crypto_sign_signature(uint8_t *sig,
   uint8_t *rho, *tr, *key, *mu, *rhoprime, *rnd;
   uint16_t nonce = 0;
   unsigned int n;
-  polyvecl y, z;
   uint8_t wcomp[K][768];
   poly cp;
   uint8_t ccomp[68];
@@ -125,35 +124,33 @@ int crypto_sign_signature(uint8_t *sig,
   polyvecl_small_ntt(s1_prime);
   polyveck_small_ntt(s2_prime);
 
-rej:
-  /* Sample intermediate vector y */
-  polyvecl_uniform_gamma1(&y, rhoprime, nonce++);
-
-  /* Matrix-vector multiplication */
-  z = y;
-  polyvecl_ntt(&z);
-  
+rej:  
     for (size_t k_idx = 0; k_idx < K; k_idx++) {
       for(size_t i=0;i<768;i++){
         wcomp[k_idx][i] = 0;
       }
-
+    }
 
       for (size_t l_idx = 0; l_idx < L; l_idx++) {
-        poly_uniform(&tmp0, rho, (k_idx << 8) + l_idx);
-        poly_pointwise_montgomery(&tmp0,  &tmp0, &z.vec[l_idx]);
-        polyw_add(wcomp[k_idx], &tmp0);
+        /* Sample intermediate vector y */
+        poly_uniform_gamma1(&tmp1, rhoprime, L*nonce + l_idx);
+        poly_ntt(&tmp1);
+        for (size_t k_idx = 0; k_idx < K; k_idx++) {
+          poly_uniform(&tmp0, rho, (k_idx << 8) + l_idx);
+          poly_pointwise_montgomery(&tmp0,  &tmp0, &tmp1);
+          polyw_add(wcomp[k_idx], &tmp0);
+        }
+      }
+      nonce++;
+      for (size_t k_idx = 0; k_idx < K; k_idx++) {
+        polyw_unpack(&tmp0, wcomp[k_idx]);
+        poly_invntt_tomont(&tmp0);
+        poly_caddq(&tmp0);
+
+        polyw_pack(wcomp[k_idx], &tmp0);
+        poly_decompose_w1(&tmp0, &tmp0);
+        polyw1_pack(&sig[k_idx*POLYW1_PACKEDBYTES], &tmp0);
       }
-
-      polyw_unpack(&tmp0, wcomp[k_idx]);
-      poly_invntt_tomont(&tmp0);
-      poly_caddq(&tmp0);
-
-      polyw_pack(wcomp[k_idx], &tmp0);
-
-      poly_decompose_w1(&tmp0, &tmp0);
-      polyw1_pack(&sig[k_idx*POLYW1_PACKEDBYTES], &tmp0);
-  }
 
   shake256_inc_init(&state);
   shake256_inc_absorb(&state, mu, CRHBYTES);
@@ -167,16 +164,22 @@ int crypto_sign_signature(uint8_t *sig,
   poly_small_ntt_precomp(&cp_small, &cp_small_prime, &cp);
 
   /* Compute z, reject if it reveals secret */
-  polyvecl_small_basemul_invntt(&z, &cp_small, &cp_small_prime, s1_prime);
+    for(size_t l_idx=0;l_idx < L; l_idx++){
+      poly_small_basemul_invntt(&tmp0, &cp_small, &cp_small_prime, &s1_prime[l_idx]);
+      poly_uniform_gamma1(&tmp1, rhoprime, L*(nonce-1) + l_idx);
 
-  polyvecl_add(&z, &z, &y);
-  polyvecl_reduce(&z);
-  if(polyvecl_chknorm(&z, GAMMA1 - BETA))
-    goto rej;
+      poly_add(&tmp0, &tmp0, &tmp1);
+
+      poly_reduce(&tmp0);
+
+      if(poly_chknorm(&tmp0, GAMMA1 - BETA))
+        goto rej;
+
+      polyz_pack(sig + CTILDEBYTES + l_idx*POLYZ_PACKEDBYTES, &tmp0);
+  }
 
 
   /* Write signature */
-  pack_sig_z(sig, &z);
   unsigned int hint_n = 0;
   unsigned int hints_written = 0;
   /* Check that subtracting cs2 does not change high bits of w and low bits

From 302f7f203865513d183bc664f1dccc7542cf9f26 Mon Sep 17 00:00:00 2001
From: Amin Abdulrahman <amin@abdulrahman.de>
Date: Fri, 15 Mar 2024 16:53:37 +0100
Subject: [PATCH 05/32] Eliminate cp

---
 crypto_sign/dilithium3/m4fstack/sign.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/crypto_sign/dilithium3/m4fstack/sign.c b/crypto_sign/dilithium3/m4fstack/sign.c
index 515e1419..bfcd824a 100644
--- a/crypto_sign/dilithium3/m4fstack/sign.c
+++ b/crypto_sign/dilithium3/m4fstack/sign.c
@@ -90,7 +90,6 @@ int crypto_sign_signature(uint8_t *sig,
   uint16_t nonce = 0;
   unsigned int n;
   uint8_t wcomp[K][768];
-  poly cp;
   uint8_t ccomp[68];
   poly tmp0, tmp1;
   shake256incctx state;
@@ -157,11 +156,11 @@ int crypto_sign_signature(uint8_t *sig,
   shake256_inc_absorb(&state, sig, K*POLYW1_PACKEDBYTES);
   shake256_inc_finalize(&state);
   shake256_inc_squeeze(sig, CTILDEBYTES, &state);
-  poly_challenge(&cp, sig);
+  poly_challenge(&tmp0, sig);
 
-  poly_challenge_compress(ccomp, &cp);
+  poly_challenge_compress(ccomp, &tmp0);
   
-  poly_small_ntt_precomp(&cp_small, &cp_small_prime, &cp);
+  poly_small_ntt_precomp(&cp_small, &cp_small_prime, &tmp0);
 
   /* Compute z, reject if it reveals secret */
     for(size_t l_idx=0;l_idx < L; l_idx++){

From 3c36dbea3aa8d328d7689f45e593af16dc7b7558 Mon Sep 17 00:00:00 2001
From: Amin Abdulrahman <amin@abdulrahman.de>
Date: Fri, 15 Mar 2024 17:07:25 +0100
Subject: [PATCH 06/32] Eliminate s1, s2

---
 crypto_sign/dilithium3/m4fstack/sign.c  | 20 +++++++++++---------
 crypto_sign/dilithium3/m4fstack/stack.c | 14 ++------------
 crypto_sign/dilithium3/m4fstack/stack.h |  2 --
 3 files changed, 13 insertions(+), 23 deletions(-)

diff --git a/crypto_sign/dilithium3/m4fstack/sign.c b/crypto_sign/dilithium3/m4fstack/sign.c
index bfcd824a..8f001f2a 100644
--- a/crypto_sign/dilithium3/m4fstack/sign.c
+++ b/crypto_sign/dilithium3/m4fstack/sign.c
@@ -9,6 +9,8 @@
 #include "smallpoly.h"
 #include "stack.h"
 
+#include "smallntt.h"
+
 /*************************************************
 * Name:        crypto_sign_keypair
 *
@@ -94,8 +96,7 @@ int crypto_sign_signature(uint8_t *sig,
   poly tmp0, tmp1;
   shake256incctx state;
 
-  smallpoly s1_prime[L];
-  smallpoly s2_prime[K];
+  smallpoly stmp0, stmp1;
   smallpoly cp_small;
   smallhalfpoly cp_small_prime;
 
@@ -105,7 +106,7 @@ int crypto_sign_signature(uint8_t *sig,
   rnd = key + SEEDBYTES;
   mu = rnd + RNDBYTES;
   rhoprime = mu + CRHBYTES;
-  unpack_sk_stack(rho, tr, key, s1_prime, s2_prime, sk);
+  unpack_sk_stack(rho, tr, key, sk);
 
   /* Compute mu = CRH(tr, msg) */
   shake256_inc_init(&state);
@@ -119,10 +120,6 @@ int crypto_sign_signature(uint8_t *sig,
   }
   shake256(rhoprime, CRHBYTES, key, SEEDBYTES + RNDBYTES + CRHBYTES);
 
-  /* Transform vectors */
-  polyvecl_small_ntt(s1_prime);
-  polyveck_small_ntt(s2_prime);
-
 rej:  
     for (size_t k_idx = 0; k_idx < K; k_idx++) {
       for(size_t i=0;i<768;i++){
@@ -164,7 +161,10 @@ int crypto_sign_signature(uint8_t *sig,
 
   /* Compute z, reject if it reveals secret */
     for(size_t l_idx=0;l_idx < L; l_idx++){
-      poly_small_basemul_invntt(&tmp0, &cp_small, &cp_small_prime, &s1_prime[l_idx]);
+      unpack_sk_s1(&stmp0, sk, l_idx);
+      small_ntt(&stmp0);
+      poly_small_basemul_invntt(&tmp0, &cp_small, &cp_small_prime, &stmp0);
+
       poly_uniform_gamma1(&tmp1, rhoprime, L*(nonce-1) + l_idx);
 
       poly_add(&tmp0, &tmp0, &tmp1);
@@ -188,7 +188,9 @@ int crypto_sign_signature(uint8_t *sig,
     polyw_unpack(&tmp0, wcomp[k_idx]);
     poly_decompose(&tmp1, &tmp0, &tmp0);
 
-    poly_small_basemul_invntt(&tmp1, &cp_small, &cp_small_prime, &s2_prime[k_idx]);
+    unpack_sk_s2(&stmp0, sk, k_idx);
+    small_ntt(&stmp0);
+    poly_small_basemul_invntt(&tmp1, &cp_small, &cp_small_prime, &stmp0);
 
     poly_sub(&tmp0, &tmp0, &tmp1);
     poly_reduce(&tmp0);
diff --git a/crypto_sign/dilithium3/m4fstack/stack.c b/crypto_sign/dilithium3/m4fstack/stack.c
index d7d59f1c..b1e89bb5 100644
--- a/crypto_sign/dilithium3/m4fstack/stack.c
+++ b/crypto_sign/dilithium3/m4fstack/stack.c
@@ -222,10 +222,10 @@ void poly_lowbits(poly *a0, const poly *a){
 }
 
 void unpack_sk_s1(smallpoly *a, const uint8_t *sk, size_t idx) {
-  small_polyeta_unpack(a, sk + 3*SEEDBYTES + idx*POLYETA_PACKEDBYTES);
+  small_polyeta_unpack(a, sk + 2*SEEDBYTES + TRBYTES + idx*POLYETA_PACKEDBYTES);
 }
 void unpack_sk_s2(smallpoly *a, const uint8_t *sk, size_t idx) {
-  small_polyeta_unpack(a, sk + 3*SEEDBYTES + L*POLYETA_PACKEDBYTES + idx*POLYETA_PACKEDBYTES);
+  small_polyeta_unpack(a, sk + 2*SEEDBYTES + TRBYTES + L*POLYETA_PACKEDBYTES + idx*POLYETA_PACKEDBYTES);
 }
 
 
@@ -376,8 +376,6 @@ size_t poly_make_hint_stack(poly *a, poly *t, uint8_t w[768]){
 void unpack_sk_stack(uint8_t rho[SEEDBYTES],
                uint8_t tr[TRBYTES],
                uint8_t key[SEEDBYTES],
-               smallpoly s1[L],
-               smallpoly s2[K],
                const uint8_t sk[CRYPTO_SECRETKEYBYTES])
 {
   unsigned int i;
@@ -393,14 +391,6 @@ void unpack_sk_stack(uint8_t rho[SEEDBYTES],
   for(i = 0; i < TRBYTES; ++i)
     tr[i] = sk[i];
   sk += TRBYTES;
-
-  for(i=0; i < L; ++i)
-    small_polyeta_unpack(&s1[i], sk + i*POLYETA_PACKEDBYTES);
-  sk += L*POLYETA_PACKEDBYTES;
-
-  for(i=0; i < K; ++i)
-    small_polyeta_unpack(&s2[i], sk + i*POLYETA_PACKEDBYTES);
-  sk += K*POLYETA_PACKEDBYTES;
 }
 
 static int32_t decompose_w1(int32_t a){
diff --git a/crypto_sign/dilithium3/m4fstack/stack.h b/crypto_sign/dilithium3/m4fstack/stack.h
index c9ddbe61..5998cfd8 100644
--- a/crypto_sign/dilithium3/m4fstack/stack.h
+++ b/crypto_sign/dilithium3/m4fstack/stack.h
@@ -34,8 +34,6 @@ size_t poly_make_hint_stack(poly *a, poly *t, uint8_t w[768]);
 void unpack_sk_stack(uint8_t rho[SEEDBYTES],
                uint8_t tr[TRBYTES],
                uint8_t key[SEEDBYTES],
-               smallpoly s1[L],
-               smallpoly s2[K],
                const uint8_t sk[CRYPTO_SECRETKEYBYTES]);
 void poly_decompose_w1(poly *a1, const poly *a);
 #endif
\ No newline at end of file

From f71e025311417516911d5d090377ff6933893c5f Mon Sep 17 00:00:00 2001
From: Amin Abdulrahman <amin@abdulrahman.de>
Date: Fri, 15 Mar 2024 17:12:39 +0100
Subject: [PATCH 07/32] Eliminate second poly needed for A*y (note: reverts
 poly_uniform_pointwise_montgomery_polywadd_stack to prior state)

---
 crypto_sign/dilithium3/m4fstack/sign.c  | 8 +++++---
 crypto_sign/dilithium3/m4fstack/stack.c | 8 +++++---
 crypto_sign/dilithium3/m4fstack/stack.h | 2 +-
 3 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/crypto_sign/dilithium3/m4fstack/sign.c b/crypto_sign/dilithium3/m4fstack/sign.c
index 8f001f2a..2fe1cdf7 100644
--- a/crypto_sign/dilithium3/m4fstack/sign.c
+++ b/crypto_sign/dilithium3/m4fstack/sign.c
@@ -131,10 +131,11 @@ int crypto_sign_signature(uint8_t *sig,
         /* Sample intermediate vector y */
         poly_uniform_gamma1(&tmp1, rhoprime, L*nonce + l_idx);
         poly_ntt(&tmp1);
+
+        /* Matrix-vector multiplication */
         for (size_t k_idx = 0; k_idx < K; k_idx++) {
-          poly_uniform(&tmp0, rho, (k_idx << 8) + l_idx);
-          poly_pointwise_montgomery(&tmp0,  &tmp0, &tmp1);
-          polyw_add(wcomp[k_idx], &tmp0);
+          // sampling of y and packing into wcomp inlined into the basemul
+          poly_uniform_pointwise_montgomery_polywadd_stack(&wcomp[k_idx], &tmp1, rho, (k_idx << 8) + l_idx);
         }
       }
       nonce++;
@@ -165,6 +166,7 @@ int crypto_sign_signature(uint8_t *sig,
       small_ntt(&stmp0);
       poly_small_basemul_invntt(&tmp0, &cp_small, &cp_small_prime, &stmp0);
 
+      // TODO: eliminate tmp1
       poly_uniform_gamma1(&tmp1, rhoprime, L*(nonce-1) + l_idx);
 
       poly_add(&tmp0, &tmp0, &tmp1);
diff --git a/crypto_sign/dilithium3/m4fstack/stack.c b/crypto_sign/dilithium3/m4fstack/stack.c
index b1e89bb5..536ce472 100644
--- a/crypto_sign/dilithium3/m4fstack/stack.c
+++ b/crypto_sign/dilithium3/m4fstack/stack.c
@@ -231,15 +231,17 @@ void unpack_sk_s2(smallpoly *a, const uint8_t *sk, size_t idx) {
 
 // TODO: in the end increase this buffer size as far as possible
 #define POLY_UNIFORM_BUFFERSIZE 3
-void poly_uniform_pointwise_montgomery_polywadd_stack(uint8_t wcomp[3*N], poly *b, uint8_t seed[SEEDBYTES], uint16_t nonce, shake128incctx *state){
+void poly_uniform_pointwise_montgomery_polywadd_stack(uint8_t wcomp[3*N], poly *b, uint8_t seed[SEEDBYTES], uint16_t nonce){
+  //externalize the Keccak state
+  shake128incctx state;
   int32_t t;
   uint8_t buf[POLY_UNIFORM_BUFFERSIZE*3];
   {
     size_t ctr = 0;
-    stream128_init(state, seed, nonce);
+    stream128_init(&state, seed, nonce);
 
     do {
-      shake128_inc_squeeze(buf, sizeof buf, state);
+      shake128_inc_squeeze(buf, sizeof buf, &state);
 
       for(size_t pos=0; pos < sizeof buf && ctr < N; pos += 3){
         t  = buf[pos];
diff --git a/crypto_sign/dilithium3/m4fstack/stack.h b/crypto_sign/dilithium3/m4fstack/stack.h
index 5998cfd8..64726a80 100644
--- a/crypto_sign/dilithium3/m4fstack/stack.h
+++ b/crypto_sign/dilithium3/m4fstack/stack.h
@@ -25,7 +25,7 @@ void unpack_sk_s1(smallpoly *a, const uint8_t *sk, size_t idx);
 void unpack_sk_s2(smallpoly *a, const uint8_t *sk, size_t idx);
 
 
-void poly_uniform_pointwise_montgomery_polywadd_stack(uint8_t wcomp[3*N], poly *b, uint8_t  seed[SEEDBYTES], uint16_t nonce, shake128incctx *state);
+void poly_uniform_pointwise_montgomery_polywadd_stack(uint8_t wcomp[3*N], poly *b, uint8_t  seed[SEEDBYTES], uint16_t nonce);
 void poly_uniform_gamma1_add_stack(poly *a, poly *b, const uint8_t seed[CRHBYTES], uint16_t nonce, shake256incctx *state);
 
 size_t poly_make_hint_stack(poly *a, poly *t, uint8_t w[768]);

From deeababc49c3f4605b470543190772dbc9e3ff1b Mon Sep 17 00:00:00 2001
From: Amin Abdulrahman <amin@abdulrahman.de>
Date: Mon, 18 Mar 2024 13:27:20 +0100
Subject: [PATCH 08/32] Inline sampling uniform and uniform_gamma1

---
 crypto_sign/dilithium3/m4fstack/sign.c  | 15 ++++++---------
 crypto_sign/dilithium3/m4fstack/stack.c | 15 +++++++++------
 crypto_sign/dilithium3/m4fstack/stack.h |  2 +-
 3 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/crypto_sign/dilithium3/m4fstack/sign.c b/crypto_sign/dilithium3/m4fstack/sign.c
index 2fe1cdf7..a0d43790 100644
--- a/crypto_sign/dilithium3/m4fstack/sign.c
+++ b/crypto_sign/dilithium3/m4fstack/sign.c
@@ -129,13 +129,13 @@ int crypto_sign_signature(uint8_t *sig,
 
       for (size_t l_idx = 0; l_idx < L; l_idx++) {
         /* Sample intermediate vector y */
-        poly_uniform_gamma1(&tmp1, rhoprime, L*nonce + l_idx);
-        poly_ntt(&tmp1);
+        poly_uniform_gamma1(&tmp0, rhoprime, L*nonce + l_idx);
+        poly_ntt(&tmp0);
 
         /* Matrix-vector multiplication */
         for (size_t k_idx = 0; k_idx < K; k_idx++) {
           // sampling of y and packing into wcomp inlined into the basemul
-          poly_uniform_pointwise_montgomery_polywadd_stack(&wcomp[k_idx], &tmp1, rho, (k_idx << 8) + l_idx);
+          poly_uniform_pointwise_montgomery_polywadd_stack(&wcomp[k_idx], &tmp0, rho, (k_idx << 8) + l_idx);
         }
       }
       nonce++;
@@ -166,10 +166,7 @@ int crypto_sign_signature(uint8_t *sig,
       small_ntt(&stmp0);
       poly_small_basemul_invntt(&tmp0, &cp_small, &cp_small_prime, &stmp0);
 
-      // TODO: eliminate tmp1
-      poly_uniform_gamma1(&tmp1, rhoprime, L*(nonce-1) + l_idx);
-
-      poly_add(&tmp0, &tmp0, &tmp1);
+      poly_uniform_gamma1_add_stack(&tmp0, &tmp0, rhoprime, L*(nonce-1) + l_idx);
 
       poly_reduce(&tmp0);
 
@@ -296,9 +293,9 @@ int crypto_sign_verify(const uint8_t *sig,
     return -1;
 
   /* Compute CRH(h(rho, t1), msg) */
-  shake256(mu, CRHBYTES, pk, CRYPTO_PUBLICKEYBYTES);
+  shake256(mu, TRBYTES, pk, CRYPTO_PUBLICKEYBYTES);
   shake256_inc_init(&state);
-  shake256_inc_absorb(&state, mu, CRHBYTES);
+  shake256_inc_absorb(&state, mu, TRBYTES);
   shake256_inc_absorb(&state, m, mlen);
   shake256_inc_finalize(&state);
   shake256_inc_squeeze(mu, CRHBYTES, &state);
diff --git a/crypto_sign/dilithium3/m4fstack/stack.c b/crypto_sign/dilithium3/m4fstack/stack.c
index 536ce472..83fd1ac2 100644
--- a/crypto_sign/dilithium3/m4fstack/stack.c
+++ b/crypto_sign/dilithium3/m4fstack/stack.c
@@ -232,7 +232,7 @@ void unpack_sk_s2(smallpoly *a, const uint8_t *sk, size_t idx) {
 // TODO: in the end increase this buffer size as far as possible
 #define POLY_UNIFORM_BUFFERSIZE 3
 void poly_uniform_pointwise_montgomery_polywadd_stack(uint8_t wcomp[3*N], poly *b, uint8_t seed[SEEDBYTES], uint16_t nonce){
-  //externalize the Keccak state
+  // TODO: externalize the Keccak state
   shake128incctx state;
   int32_t t;
   uint8_t buf[POLY_UNIFORM_BUFFERSIZE*3];
@@ -321,19 +321,22 @@ static void polyz_unpack_inplace(int32_t *r){
     tmp1 |= (uint32_t)a[5*i+2] << 16;
     tmp1 &= 0xFFFFF;
 
-    r[2*i+0] = GAMMA1 - tmp0;
-    r[2*i+1] = GAMMA1 - tmp1;
+    r[2*i+0] = GAMMA1 - tmp1;
+    r[2*i+1] = GAMMA1 - tmp0;
   }
 #endif
 }
 
 
-void poly_uniform_gamma1_add_stack(poly *a, poly *b, const uint8_t seed[CRHBYTES], uint16_t nonce, shake256incctx *state){
+#define POLY_UNIFORM_GAMMA1_NBLOCKS ((POLYZ_PACKEDBYTES + STREAM256_BLOCKBYTES - 1)/STREAM256_BLOCKBYTES)
+void poly_uniform_gamma1_add_stack(poly *a, poly *b, const uint8_t seed[CRHBYTES], uint16_t nonce){
+  // TODO: externalize the state
+  shake256incctx state;
   int32_t buf[POLY_UNIFORM_GAMMA1_BUFFERSIZE_COEFFS];
 
-  stream256_init(state, seed, nonce);
+  stream256_init(&state, seed, nonce);
   for(size_t i = 0; i < N/POLY_UNIFORM_GAMMA1_BUFFERSIZE_COEFFS; i++){
-    shake256_inc_squeeze((uint8_t *)buf, POLY_UNIFORM_GAMMA1_BUFFERSIZE_BYTES, state);
+    shake256_inc_squeeze((uint8_t *)buf, POLY_UNIFORM_GAMMA1_BUFFERSIZE_BYTES, &state);
     polyz_unpack_inplace(buf);
 
     for(size_t j = 0; j < POLY_UNIFORM_GAMMA1_BUFFERSIZE_COEFFS; j++){
diff --git a/crypto_sign/dilithium3/m4fstack/stack.h b/crypto_sign/dilithium3/m4fstack/stack.h
index 64726a80..64504593 100644
--- a/crypto_sign/dilithium3/m4fstack/stack.h
+++ b/crypto_sign/dilithium3/m4fstack/stack.h
@@ -26,7 +26,7 @@ void unpack_sk_s2(smallpoly *a, const uint8_t *sk, size_t idx);
 
 
 void poly_uniform_pointwise_montgomery_polywadd_stack(uint8_t wcomp[3*N], poly *b, uint8_t  seed[SEEDBYTES], uint16_t nonce);
-void poly_uniform_gamma1_add_stack(poly *a, poly *b, const uint8_t seed[CRHBYTES], uint16_t nonce, shake256incctx *state);
+void poly_uniform_gamma1_add_stack(poly *a, poly *b, const uint8_t seed[CRHBYTES], uint16_t nonce);
 
 size_t poly_make_hint_stack(poly *a, poly *t, uint8_t w[768]);
 

From cbc29cf4d86320929124c84041a6552857966340 Mon Sep 17 00:00:00 2001
From: Amin Abdulrahman <amin@abdulrahman.de>
Date: Mon, 18 Mar 2024 13:30:25 +0100
Subject: [PATCH 09/32] Inline hint generation

---
 crypto_sign/dilithium3/m4fstack/sign.c  | 23 +++++++++++++++--------
 crypto_sign/dilithium3/m4fstack/stack.c |  1 -
 2 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/crypto_sign/dilithium3/m4fstack/sign.c b/crypto_sign/dilithium3/m4fstack/sign.c
index a0d43790..89ffa765 100644
--- a/crypto_sign/dilithium3/m4fstack/sign.c
+++ b/crypto_sign/dilithium3/m4fstack/sign.c
@@ -184,25 +184,31 @@ int crypto_sign_signature(uint8_t *sig,
    * do not reveal secret information */
   
   for(unsigned int k_idx = 0; k_idx < K; ++k_idx) {
-    polyw_unpack(&tmp0, wcomp[k_idx]);
-    poly_decompose(&tmp1, &tmp0, &tmp0);
-
     unpack_sk_s2(&stmp0, sk, k_idx);
     small_ntt(&stmp0);
     poly_small_basemul_invntt(&tmp1, &cp_small, &cp_small_prime, &stmp0);
 
+    polyw_unpack(&tmp0, wcomp[k_idx]);
+
     poly_sub(&tmp0, &tmp0, &tmp1);
     poly_reduce(&tmp0);
-    if(poly_chknorm(&tmp0, GAMMA2 - BETA))
+
+    polyw_pack(wcomp[k_idx], &tmp0);
+
+    poly_decompose(&tmp1, &tmp0, &tmp0);
+    poly_reduce(&tmp0);
+    if(poly_chknorm(&tmp0, GAMMA2 - BETA)){
       goto rej;
+    }
 
-    poly_schoolbook(&tmp1, ccomp, sk + SEEDBYTES + TRBYTES + SEEDBYTES +
+    poly_schoolbook(&tmp0, ccomp, sk + SEEDBYTES + TRBYTES + SEEDBYTES +
       L*POLYETA_PACKEDBYTES + K*POLYETA_PACKEDBYTES + k_idx*POLYT0_PACKEDBYTES);
 
     /* Compute hints for w1 */
 
-    if(poly_chknorm(&tmp1, GAMMA2))
+    if(poly_chknorm(&tmp0, GAMMA2)) {
       goto rej;
+    }
 
     poly_add(&tmp0, &tmp0, &tmp1);
 
@@ -211,11 +217,12 @@ int crypto_sign_signature(uint8_t *sig,
     poly_decompose_w1(&tmp1, &tmp1);
 
 
-    hint_n += poly_make_hint(&tmp1, &tmp0, &tmp1);
+    hint_n += poly_make_hint_stack(&tmp0, &tmp0, wcomp[k_idx]);
+
     if (hint_n > OMEGA) {
       goto rej;
     }
-    pack_sig_h(sig, &tmp1, k_idx, &hints_written);
+    pack_sig_h(sig, &tmp0, k_idx, &hints_written);
   }
   pack_sig_h_zero(sig, &hints_written);
   *siglen = CRYPTO_BYTES;
diff --git a/crypto_sign/dilithium3/m4fstack/stack.c b/crypto_sign/dilithium3/m4fstack/stack.c
index 83fd1ac2..04f8ffbc 100644
--- a/crypto_sign/dilithium3/m4fstack/stack.c
+++ b/crypto_sign/dilithium3/m4fstack/stack.c
@@ -328,7 +328,6 @@ static void polyz_unpack_inplace(int32_t *r){
 }
 
 
-#define POLY_UNIFORM_GAMMA1_NBLOCKS ((POLYZ_PACKEDBYTES + STREAM256_BLOCKBYTES - 1)/STREAM256_BLOCKBYTES)
 void poly_uniform_gamma1_add_stack(poly *a, poly *b, const uint8_t seed[CRHBYTES], uint16_t nonce){
   // TODO: externalize the state
   shake256incctx state;

From 8468d602f4d1abe345c6e8b20c8574ada287d1e8 Mon Sep 17 00:00:00 2001
From: Amin Abdulrahman <amin@abdulrahman.de>
Date: Mon, 18 Mar 2024 13:37:44 +0100
Subject: [PATCH 10/32] Inline polyw subtraction

---
 crypto_sign/dilithium3/m4fstack/sign.c  | 17 ++++-------------
 crypto_sign/dilithium3/m4fstack/stack.c | 25 +++++++++++++++++++++++++
 crypto_sign/dilithium3/m4fstack/stack.h |  1 +
 3 files changed, 30 insertions(+), 13 deletions(-)

diff --git a/crypto_sign/dilithium3/m4fstack/sign.c b/crypto_sign/dilithium3/m4fstack/sign.c
index 89ffa765..edb40e8b 100644
--- a/crypto_sign/dilithium3/m4fstack/sign.c
+++ b/crypto_sign/dilithium3/m4fstack/sign.c
@@ -93,7 +93,7 @@ int crypto_sign_signature(uint8_t *sig,
   unsigned int n;
   uint8_t wcomp[K][768];
   uint8_t ccomp[68];
-  poly tmp0, tmp1;
+  poly tmp0;
   shake256incctx state;
 
   smallpoly stmp0, stmp1;
@@ -186,16 +186,14 @@ int crypto_sign_signature(uint8_t *sig,
   for(unsigned int k_idx = 0; k_idx < K; ++k_idx) {
     unpack_sk_s2(&stmp0, sk, k_idx);
     small_ntt(&stmp0);
-    poly_small_basemul_invntt(&tmp1, &cp_small, &cp_small_prime, &stmp0);
+    poly_small_basemul_invntt(&tmp0, &cp_small, &cp_small_prime, &stmp0);
 
-    polyw_unpack(&tmp0, wcomp[k_idx]);
-
-    poly_sub(&tmp0, &tmp0, &tmp1);
+    polyw_sub(&tmp0, wcomp[k_idx], &tmp0);
     poly_reduce(&tmp0);
 
     polyw_pack(wcomp[k_idx], &tmp0);
 
-    poly_decompose(&tmp1, &tmp0, &tmp0);
+    poly_decompose_w0(&tmp0, &tmp0);
     poly_reduce(&tmp0);
     if(poly_chknorm(&tmp0, GAMMA2 - BETA)){
       goto rej;
@@ -210,13 +208,6 @@ int crypto_sign_signature(uint8_t *sig,
       goto rej;
     }
 
-    poly_add(&tmp0, &tmp0, &tmp1);
-
-
-    polyw_unpack(&tmp1, wcomp[k_idx]);
-    poly_decompose_w1(&tmp1, &tmp1);
-
-
     hint_n += poly_make_hint_stack(&tmp0, &tmp0, wcomp[k_idx]);
 
     if (hint_n > OMEGA) {
diff --git a/crypto_sign/dilithium3/m4fstack/stack.c b/crypto_sign/dilithium3/m4fstack/stack.c
index 04f8ffbc..600e2a39 100644
--- a/crypto_sign/dilithium3/m4fstack/stack.c
+++ b/crypto_sign/dilithium3/m4fstack/stack.c
@@ -418,3 +418,28 @@ void poly_decompose_w1(poly *a1, const poly *a) {
   for(i = 0; i < N; ++i)
     a1->coeffs[i] = decompose_w1(a->coeffs[i]);
 }
+
+static int32_t decompose_w0(int32_t a){
+  int32_t a1;
+  int32_t a0;
+
+  a1  = (a + 127) >> 7;
+#if GAMMA2 == (Q-1)/32
+  a1  = (a1*1025 + (1 << 21)) >> 22;
+  a1 &= 15;
+#elif GAMMA2 == (Q-1)/88
+  a1  = (a1*11275 + (1 << 23)) >> 24;
+  a1 ^= ((43 - a1) >> 31) & a1;
+#endif
+
+  a0  = a - a1*2*GAMMA2;
+  a0 -= (((Q-1)/2 - a0) >> 31) & Q;
+  return a0;
+}
+
+void poly_decompose_w0(poly *a0, const poly *a){
+  unsigned int i;
+
+  for(i = 0; i < N; ++i)
+    a0->coeffs[i] = decompose_w0(a->coeffs[i]);
+}
\ No newline at end of file
diff --git a/crypto_sign/dilithium3/m4fstack/stack.h b/crypto_sign/dilithium3/m4fstack/stack.h
index 64504593..591f8ea5 100644
--- a/crypto_sign/dilithium3/m4fstack/stack.h
+++ b/crypto_sign/dilithium3/m4fstack/stack.h
@@ -36,4 +36,5 @@ void unpack_sk_stack(uint8_t rho[SEEDBYTES],
                uint8_t key[SEEDBYTES],
                const uint8_t sk[CRYPTO_SECRETKEYBYTES]);
 void poly_decompose_w1(poly *a1, const poly *a);
+void poly_decompose_w0(poly *a0, const poly *a);
 #endif
\ No newline at end of file

From b4505e734fe79c923e06080816fc2c94332493e4 Mon Sep 17 00:00:00 2001
From: Amin Abdulrahman <amin@abdulrahman.de>
Date: Mon, 18 Mar 2024 13:42:06 +0100
Subject: [PATCH 11/32] Refactor decompose to high/lowbits

---
 crypto_sign/dilithium3/m4fstack/sign.c  | 12 +++----
 crypto_sign/dilithium3/m4fstack/stack.c | 47 -------------------------
 crypto_sign/dilithium3/m4fstack/stack.h |  2 --
 3 files changed, 6 insertions(+), 55 deletions(-)

diff --git a/crypto_sign/dilithium3/m4fstack/sign.c b/crypto_sign/dilithium3/m4fstack/sign.c
index edb40e8b..ee2fcab6 100644
--- a/crypto_sign/dilithium3/m4fstack/sign.c
+++ b/crypto_sign/dilithium3/m4fstack/sign.c
@@ -96,7 +96,7 @@ int crypto_sign_signature(uint8_t *sig,
   poly tmp0;
   shake256incctx state;
 
-  smallpoly stmp0, stmp1;
+  smallpoly stmp0;
   smallpoly cp_small;
   smallhalfpoly cp_small_prime;
 
@@ -135,7 +135,7 @@ int crypto_sign_signature(uint8_t *sig,
         /* Matrix-vector multiplication */
         for (size_t k_idx = 0; k_idx < K; k_idx++) {
           // sampling of y and packing into wcomp inlined into the basemul
-          poly_uniform_pointwise_montgomery_polywadd_stack(&wcomp[k_idx], &tmp0, rho, (k_idx << 8) + l_idx);
+          poly_uniform_pointwise_montgomery_polywadd_stack(wcomp[k_idx], &tmp0, rho, (k_idx << 8) + l_idx);
         }
       }
       nonce++;
@@ -145,7 +145,7 @@ int crypto_sign_signature(uint8_t *sig,
         poly_caddq(&tmp0);
 
         polyw_pack(wcomp[k_idx], &tmp0);
-        poly_decompose_w1(&tmp0, &tmp0);
+        poly_highbits(&tmp0, &tmp0);
         polyw1_pack(&sig[k_idx*POLYW1_PACKEDBYTES], &tmp0);
       }
 
@@ -163,7 +163,7 @@ int crypto_sign_signature(uint8_t *sig,
   /* Compute z, reject if it reveals secret */
     for(size_t l_idx=0;l_idx < L; l_idx++){
       unpack_sk_s1(&stmp0, sk, l_idx);
-      small_ntt(&stmp0);
+      small_ntt(stmp0.coeffs);
       poly_small_basemul_invntt(&tmp0, &cp_small, &cp_small_prime, &stmp0);
 
       poly_uniform_gamma1_add_stack(&tmp0, &tmp0, rhoprime, L*(nonce-1) + l_idx);
@@ -185,7 +185,7 @@ int crypto_sign_signature(uint8_t *sig,
   
   for(unsigned int k_idx = 0; k_idx < K; ++k_idx) {
     unpack_sk_s2(&stmp0, sk, k_idx);
-    small_ntt(&stmp0);
+    small_ntt(stmp0.coeffs);
     poly_small_basemul_invntt(&tmp0, &cp_small, &cp_small_prime, &stmp0);
 
     polyw_sub(&tmp0, wcomp[k_idx], &tmp0);
@@ -193,7 +193,7 @@ int crypto_sign_signature(uint8_t *sig,
 
     polyw_pack(wcomp[k_idx], &tmp0);
 
-    poly_decompose_w0(&tmp0, &tmp0);
+    poly_lowbits(&tmp0, &tmp0);
     poly_reduce(&tmp0);
     if(poly_chknorm(&tmp0, GAMMA2 - BETA)){
       goto rej;
diff --git a/crypto_sign/dilithium3/m4fstack/stack.c b/crypto_sign/dilithium3/m4fstack/stack.c
index 600e2a39..2bf0b97e 100644
--- a/crypto_sign/dilithium3/m4fstack/stack.c
+++ b/crypto_sign/dilithium3/m4fstack/stack.c
@@ -395,51 +395,4 @@ void unpack_sk_stack(uint8_t rho[SEEDBYTES],
   for(i = 0; i < TRBYTES; ++i)
     tr[i] = sk[i];
   sk += TRBYTES;
-}
-
-static int32_t decompose_w1(int32_t a){
-  int32_t a1;
-
-  a1  = (a + 127) >> 7;
-#if GAMMA2 == (Q-1)/32
-  a1  = (a1*1025 + (1 << 21)) >> 22;
-  a1 &= 15;
-#elif GAMMA2 == (Q-1)/88
-  a1  = (a1*11275 + (1 << 23)) >> 24;
-  a1 ^= ((43 - a1) >> 31) & a1;
-#endif
-
-  return a1;
-}
-
-void poly_decompose_w1(poly *a1, const poly *a) {
-  unsigned int i;
-
-  for(i = 0; i < N; ++i)
-    a1->coeffs[i] = decompose_w1(a->coeffs[i]);
-}
-
-static int32_t decompose_w0(int32_t a){
-  int32_t a1;
-  int32_t a0;
-
-  a1  = (a + 127) >> 7;
-#if GAMMA2 == (Q-1)/32
-  a1  = (a1*1025 + (1 << 21)) >> 22;
-  a1 &= 15;
-#elif GAMMA2 == (Q-1)/88
-  a1  = (a1*11275 + (1 << 23)) >> 24;
-  a1 ^= ((43 - a1) >> 31) & a1;
-#endif
-
-  a0  = a - a1*2*GAMMA2;
-  a0 -= (((Q-1)/2 - a0) >> 31) & Q;
-  return a0;
-}
-
-void poly_decompose_w0(poly *a0, const poly *a){
-  unsigned int i;
-
-  for(i = 0; i < N; ++i)
-    a0->coeffs[i] = decompose_w0(a->coeffs[i]);
 }
\ No newline at end of file
diff --git a/crypto_sign/dilithium3/m4fstack/stack.h b/crypto_sign/dilithium3/m4fstack/stack.h
index 591f8ea5..e64f73a4 100644
--- a/crypto_sign/dilithium3/m4fstack/stack.h
+++ b/crypto_sign/dilithium3/m4fstack/stack.h
@@ -35,6 +35,4 @@ void unpack_sk_stack(uint8_t rho[SEEDBYTES],
                uint8_t tr[TRBYTES],
                uint8_t key[SEEDBYTES],
                const uint8_t sk[CRYPTO_SECRETKEYBYTES]);
-void poly_decompose_w1(poly *a1, const poly *a);
-void poly_decompose_w0(poly *a0, const poly *a);
 #endif
\ No newline at end of file

From f5a8a6588a8e0b837445e80b14dd50eb127d4dfa Mon Sep 17 00:00:00 2001
From: Amin Abdulrahman <amin@abdulrahman.de>
Date: Mon, 18 Mar 2024 13:53:31 +0100
Subject: [PATCH 12/32] Inline Keccak state

---
 crypto_sign/dilithium3/m4fstack/sign.c  | 29 ++++++++++++++-----------
 crypto_sign/dilithium3/m4fstack/stack.c | 16 +++++---------
 crypto_sign/dilithium3/m4fstack/stack.h |  5 +++--
 3 files changed, 25 insertions(+), 25 deletions(-)

diff --git a/crypto_sign/dilithium3/m4fstack/sign.c b/crypto_sign/dilithium3/m4fstack/sign.c
index ee2fcab6..5398e0e1 100644
--- a/crypto_sign/dilithium3/m4fstack/sign.c
+++ b/crypto_sign/dilithium3/m4fstack/sign.c
@@ -94,7 +94,10 @@ int crypto_sign_signature(uint8_t *sig,
   uint8_t wcomp[K][768];
   uint8_t ccomp[68];
   poly tmp0;
-  shake256incctx state;
+  union {
+    shake128incctx s128;
+    shake256incctx s256;
+  } state;
 
   smallpoly stmp0;
   smallpoly cp_small;
@@ -109,11 +112,11 @@ int crypto_sign_signature(uint8_t *sig,
   unpack_sk_stack(rho, tr, key, sk);
 
   /* Compute mu = CRH(tr, msg) */
-  shake256_inc_init(&state);
-  shake256_inc_absorb(&state, tr, TRBYTES);
-  shake256_inc_absorb(&state, m, mlen);
-  shake256_inc_finalize(&state);
-  shake256_inc_squeeze(mu, CRHBYTES, &state);
+  shake256_inc_init(&state.s256);
+  shake256_inc_absorb(&state.s256, tr, TRBYTES);
+  shake256_inc_absorb(&state.s256, m, mlen);
+  shake256_inc_finalize(&state.s256);
+  shake256_inc_squeeze(mu, CRHBYTES, &state.s256);
 
   for (n = 0; n < RNDBYTES; n++) {
      rnd[n] = 0;
@@ -135,7 +138,7 @@ int crypto_sign_signature(uint8_t *sig,
         /* Matrix-vector multiplication */
         for (size_t k_idx = 0; k_idx < K; k_idx++) {
           // sampling of y and packing into wcomp inlined into the basemul
-          poly_uniform_pointwise_montgomery_polywadd_stack(wcomp[k_idx], &tmp0, rho, (k_idx << 8) + l_idx);
+          poly_uniform_pointwise_montgomery_polywadd_stack(wcomp[k_idx], &tmp0, rho, (k_idx << 8) + l_idx, &state.s128);
         }
       }
       nonce++;
@@ -149,11 +152,11 @@ int crypto_sign_signature(uint8_t *sig,
         polyw1_pack(&sig[k_idx*POLYW1_PACKEDBYTES], &tmp0);
       }
 
-  shake256_inc_init(&state);
-  shake256_inc_absorb(&state, mu, CRHBYTES);
-  shake256_inc_absorb(&state, sig, K*POLYW1_PACKEDBYTES);
-  shake256_inc_finalize(&state);
-  shake256_inc_squeeze(sig, CTILDEBYTES, &state);
+  shake256_inc_init(&state.s256);
+  shake256_inc_absorb(&state.s256, mu, CRHBYTES);
+  shake256_inc_absorb(&state.s256, sig, K*POLYW1_PACKEDBYTES);
+  shake256_inc_finalize(&state.s256);
+  shake256_inc_squeeze(sig, CTILDEBYTES, &state.s256);
   poly_challenge(&tmp0, sig);
 
   poly_challenge_compress(ccomp, &tmp0);
@@ -166,7 +169,7 @@ int crypto_sign_signature(uint8_t *sig,
       small_ntt(stmp0.coeffs);
       poly_small_basemul_invntt(&tmp0, &cp_small, &cp_small_prime, &stmp0);
 
-      poly_uniform_gamma1_add_stack(&tmp0, &tmp0, rhoprime, L*(nonce-1) + l_idx);
+      poly_uniform_gamma1_add_stack(&tmp0, &tmp0, rhoprime, L*(nonce-1) + l_idx, &state.s256);
 
       poly_reduce(&tmp0);
 
diff --git a/crypto_sign/dilithium3/m4fstack/stack.c b/crypto_sign/dilithium3/m4fstack/stack.c
index 2bf0b97e..2824bc77 100644
--- a/crypto_sign/dilithium3/m4fstack/stack.c
+++ b/crypto_sign/dilithium3/m4fstack/stack.c
@@ -231,17 +231,15 @@ void unpack_sk_s2(smallpoly *a, const uint8_t *sk, size_t idx) {
 
 // TODO: in the end increase this buffer size as far as possible
 #define POLY_UNIFORM_BUFFERSIZE 3
-void poly_uniform_pointwise_montgomery_polywadd_stack(uint8_t wcomp[3*N], poly *b, uint8_t seed[SEEDBYTES], uint16_t nonce){
-  // TODO: externalize the Keccak state
-  shake128incctx state;
+void poly_uniform_pointwise_montgomery_polywadd_stack(uint8_t wcomp[3*N], poly *b, uint8_t seed[SEEDBYTES], uint16_t nonce, shake128incctx *state){
   int32_t t;
   uint8_t buf[POLY_UNIFORM_BUFFERSIZE*3];
   {
     size_t ctr = 0;
-    stream128_init(&state, seed, nonce);
+    stream128_init(state, seed, nonce);
 
     do {
-      shake128_inc_squeeze(buf, sizeof buf, &state);
+      shake128_inc_squeeze(buf, sizeof buf, state);
 
       for(size_t pos=0; pos < sizeof buf && ctr < N; pos += 3){
         t  = buf[pos];
@@ -328,14 +326,12 @@ static void polyz_unpack_inplace(int32_t *r){
 }
 
 
-void poly_uniform_gamma1_add_stack(poly *a, poly *b, const uint8_t seed[CRHBYTES], uint16_t nonce){
-  // TODO: externalize the state
-  shake256incctx state;
+void poly_uniform_gamma1_add_stack(poly *a, poly *b, const uint8_t seed[CRHBYTES], uint16_t nonce, shake256incctx *state){
   int32_t buf[POLY_UNIFORM_GAMMA1_BUFFERSIZE_COEFFS];
 
-  stream256_init(&state, seed, nonce);
+  stream256_init(state, seed, nonce);
   for(size_t i = 0; i < N/POLY_UNIFORM_GAMMA1_BUFFERSIZE_COEFFS; i++){
-    shake256_inc_squeeze((uint8_t *)buf, POLY_UNIFORM_GAMMA1_BUFFERSIZE_BYTES, &state);
+    shake256_inc_squeeze((uint8_t *)buf, POLY_UNIFORM_GAMMA1_BUFFERSIZE_BYTES, state);
     polyz_unpack_inplace(buf);
 
     for(size_t j = 0; j < POLY_UNIFORM_GAMMA1_BUFFERSIZE_COEFFS; j++){
diff --git a/crypto_sign/dilithium3/m4fstack/stack.h b/crypto_sign/dilithium3/m4fstack/stack.h
index e64f73a4..38626a61 100644
--- a/crypto_sign/dilithium3/m4fstack/stack.h
+++ b/crypto_sign/dilithium3/m4fstack/stack.h
@@ -8,6 +8,7 @@
 #include "fips202.h"
 
 void poly_challenge_compress(uint8_t c[68], const poly *cp);
+// TODO: remove this one
 void poly_challenge_decompress(poly *cp, const uint8_t c[68]);
 
 
@@ -25,8 +26,8 @@ void unpack_sk_s1(smallpoly *a, const uint8_t *sk, size_t idx);
 void unpack_sk_s2(smallpoly *a, const uint8_t *sk, size_t idx);
 
 
-void poly_uniform_pointwise_montgomery_polywadd_stack(uint8_t wcomp[3*N], poly *b, uint8_t  seed[SEEDBYTES], uint16_t nonce);
-void poly_uniform_gamma1_add_stack(poly *a, poly *b, const uint8_t seed[CRHBYTES], uint16_t nonce);
+void poly_uniform_pointwise_montgomery_polywadd_stack(uint8_t wcomp[3*N], poly *b, uint8_t  seed[SEEDBYTES], uint16_t nonce, shake128incctx *state);
+void poly_uniform_gamma1_add_stack(poly *a, poly *b, const uint8_t seed[CRHBYTES], uint16_t nonce, shake256incctx *state);
 
 size_t poly_make_hint_stack(poly *a, poly *t, uint8_t w[768]);
 

From 10d4766ea59d07683989e1eb255f490e2892d8ac Mon Sep 17 00:00:00 2001
From: Amin Abdulrahman <amin@abdulrahman.de>
Date: Mon, 18 Mar 2024 14:02:48 +0100
Subject: [PATCH 13/32] Shared buffer for polynomials

---
 crypto_sign/dilithium3/m4fstack/sign.c | 81 +++++++++++++++-----------
 1 file changed, 46 insertions(+), 35 deletions(-)

diff --git a/crypto_sign/dilithium3/m4fstack/sign.c b/crypto_sign/dilithium3/m4fstack/sign.c
index 5398e0e1..eff33f33 100644
--- a/crypto_sign/dilithium3/m4fstack/sign.c
+++ b/crypto_sign/dilithium3/m4fstack/sign.c
@@ -93,14 +93,25 @@ int crypto_sign_signature(uint8_t *sig,
   unsigned int n;
   uint8_t wcomp[K][768];
   uint8_t ccomp[68];
-  poly tmp0;
+
   union {
     shake128incctx s128;
     shake256incctx s256;
   } state;
 
-  smallpoly stmp0;
-  smallpoly cp_small;
+    // TODO: change this to union
+  struct {
+    poly full;
+    struct {
+      smallpoly stmp0;
+      smallpoly stmp1;
+    } small;
+  } polybuffer;
+
+  poly      *tmp0  = &polybuffer.full;
+  smallpoly *stmp0 = &polybuffer.small.stmp0;
+  smallpoly *scp   = &polybuffer.small.stmp1;
+
   smallhalfpoly cp_small_prime;
 
   rho = seedbuf;
@@ -132,24 +143,24 @@ int crypto_sign_signature(uint8_t *sig,
 
       for (size_t l_idx = 0; l_idx < L; l_idx++) {
         /* Sample intermediate vector y */
-        poly_uniform_gamma1(&tmp0, rhoprime, L*nonce + l_idx);
-        poly_ntt(&tmp0);
+        poly_uniform_gamma1(tmp0, rhoprime, L*nonce + l_idx);
+        poly_ntt(tmp0);
 
         /* Matrix-vector multiplication */
         for (size_t k_idx = 0; k_idx < K; k_idx++) {
           // sampling of y and packing into wcomp inlined into the basemul
-          poly_uniform_pointwise_montgomery_polywadd_stack(wcomp[k_idx], &tmp0, rho, (k_idx << 8) + l_idx, &state.s128);
+          poly_uniform_pointwise_montgomery_polywadd_stack(wcomp[k_idx], tmp0, rho, (k_idx << 8) + l_idx, &state.s128);
         }
       }
       nonce++;
       for (size_t k_idx = 0; k_idx < K; k_idx++) {
-        polyw_unpack(&tmp0, wcomp[k_idx]);
-        poly_invntt_tomont(&tmp0);
-        poly_caddq(&tmp0);
+        polyw_unpack(tmp0, wcomp[k_idx]);
+        poly_invntt_tomont(tmp0);
+        poly_caddq(tmp0);
 
-        polyw_pack(wcomp[k_idx], &tmp0);
-        poly_highbits(&tmp0, &tmp0);
-        polyw1_pack(&sig[k_idx*POLYW1_PACKEDBYTES], &tmp0);
+        polyw_pack(wcomp[k_idx], tmp0);
+        poly_highbits(tmp0, tmp0);
+        polyw1_pack(&sig[k_idx*POLYW1_PACKEDBYTES], tmp0);
       }
 
   shake256_inc_init(&state.s256);
@@ -157,26 +168,26 @@ int crypto_sign_signature(uint8_t *sig,
   shake256_inc_absorb(&state.s256, sig, K*POLYW1_PACKEDBYTES);
   shake256_inc_finalize(&state.s256);
   shake256_inc_squeeze(sig, CTILDEBYTES, &state.s256);
-  poly_challenge(&tmp0, sig);
+  poly_challenge(tmp0, sig);
 
-  poly_challenge_compress(ccomp, &tmp0);
+  poly_challenge_compress(ccomp, tmp0);
   
-  poly_small_ntt_precomp(&cp_small, &cp_small_prime, &tmp0);
+  poly_small_ntt_precomp(scp, &cp_small_prime, tmp0);
 
   /* Compute z, reject if it reveals secret */
     for(size_t l_idx=0;l_idx < L; l_idx++){
-      unpack_sk_s1(&stmp0, sk, l_idx);
-      small_ntt(stmp0.coeffs);
-      poly_small_basemul_invntt(&tmp0, &cp_small, &cp_small_prime, &stmp0);
+      unpack_sk_s1(stmp0, sk, l_idx);
+      small_ntt(stmp0->coeffs);
+      poly_small_basemul_invntt(tmp0, scp, &cp_small_prime, stmp0);
 
-      poly_uniform_gamma1_add_stack(&tmp0, &tmp0, rhoprime, L*(nonce-1) + l_idx, &state.s256);
+      poly_uniform_gamma1_add_stack(tmp0, tmp0, rhoprime, L*(nonce-1) + l_idx, &state.s256);
 
-      poly_reduce(&tmp0);
+      poly_reduce(tmp0);
 
-      if(poly_chknorm(&tmp0, GAMMA1 - BETA))
+      if(poly_chknorm(tmp0, GAMMA1 - BETA))
         goto rej;
 
-      polyz_pack(sig + CTILDEBYTES + l_idx*POLYZ_PACKEDBYTES, &tmp0);
+      polyz_pack(sig + CTILDEBYTES + l_idx*POLYZ_PACKEDBYTES, tmp0);
   }
 
 
@@ -187,36 +198,36 @@ int crypto_sign_signature(uint8_t *sig,
    * do not reveal secret information */
   
   for(unsigned int k_idx = 0; k_idx < K; ++k_idx) {
-    unpack_sk_s2(&stmp0, sk, k_idx);
-    small_ntt(stmp0.coeffs);
-    poly_small_basemul_invntt(&tmp0, &cp_small, &cp_small_prime, &stmp0);
+    unpack_sk_s2(stmp0, sk, k_idx);
+    small_ntt(stmp0->coeffs);
+    poly_small_basemul_invntt(tmp0, scp, &cp_small_prime, stmp0);
 
-    polyw_sub(&tmp0, wcomp[k_idx], &tmp0);
-    poly_reduce(&tmp0);
+    polyw_sub(tmp0, wcomp[k_idx], tmp0);
+    poly_reduce(tmp0);
 
-    polyw_pack(wcomp[k_idx], &tmp0);
+    polyw_pack(wcomp[k_idx], tmp0);
 
-    poly_lowbits(&tmp0, &tmp0);
-    poly_reduce(&tmp0);
-    if(poly_chknorm(&tmp0, GAMMA2 - BETA)){
+    poly_lowbits(tmp0, tmp0);
+    poly_reduce(tmp0);
+    if(poly_chknorm(tmp0, GAMMA2 - BETA)){
       goto rej;
     }
 
-    poly_schoolbook(&tmp0, ccomp, sk + SEEDBYTES + TRBYTES + SEEDBYTES +
+    poly_schoolbook(tmp0, ccomp, sk + SEEDBYTES + TRBYTES + SEEDBYTES +
       L*POLYETA_PACKEDBYTES + K*POLYETA_PACKEDBYTES + k_idx*POLYT0_PACKEDBYTES);
 
     /* Compute hints for w1 */
 
-    if(poly_chknorm(&tmp0, GAMMA2)) {
+    if(poly_chknorm(tmp0, GAMMA2)) {
       goto rej;
     }
 
-    hint_n += poly_make_hint_stack(&tmp0, &tmp0, wcomp[k_idx]);
+    hint_n += poly_make_hint_stack(tmp0, tmp0, wcomp[k_idx]);
 
     if (hint_n > OMEGA) {
       goto rej;
     }
-    pack_sig_h(sig, &tmp0, k_idx, &hints_written);
+    pack_sig_h(sig, tmp0, k_idx, &hints_written);
   }
   pack_sig_h_zero(sig, &hints_written);
   *siglen = CRYPTO_BYTES;

From 280423741d94e335182059c122272508758a9edf Mon Sep 17 00:00:00 2001
From: Amin Abdulrahman <amin@abdulrahman.de>
Date: Mon, 18 Mar 2024 14:02:58 +0100
Subject: [PATCH 14/32] Remove smallpoly types for the q=257 FFT

---
 crypto_sign/dilithium3/m4fstack/smallpoly.h | 13 -------------
 1 file changed, 13 deletions(-)

diff --git a/crypto_sign/dilithium3/m4fstack/smallpoly.h b/crypto_sign/dilithium3/m4fstack/smallpoly.h
index caa26261..f2cf843b 100644
--- a/crypto_sign/dilithium3/m4fstack/smallpoly.h
+++ b/crypto_sign/dilithium3/m4fstack/smallpoly.h
@@ -6,7 +6,6 @@
 
 
 
-#if DILITHIUM_MODE == 3 // use q=769
 #define SMALL_POLY_16_BIT
 typedef struct {
     int16_t coeffs[N];
@@ -14,18 +13,6 @@ typedef struct {
 
 typedef smallpoly smallhalfpoly;
 
-#else // use q=257
-#define SMALL_POLY_32_BIT
-typedef struct {
-    int32_t coeffs[N];
-} smallpoly;
-
-typedef struct {
-    int16_t coeffs[N];
-} smallhalfpoly;
-#endif
-
-
 void poly_small_ntt_precomp(smallpoly *out, smallhalfpoly *out2, poly *in);
 void polyvecl_small_ntt(smallpoly v[L]);
 void polyveck_small_ntt(smallpoly v[K]);

From d30a7662f1fa6df0f09880436e3581ac95718431 Mon Sep 17 00:00:00 2001
From: Amin Abdulrahman <amin@abdulrahman.de>
Date: Mon, 18 Mar 2024 14:05:25 +0100
Subject: [PATCH 15/32] Union for small and big poly

---
 crypto_sign/dilithium3/m4fstack/sign.c  | 12 ++++++++----
 crypto_sign/dilithium3/m4fstack/stack.h |  1 -
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/crypto_sign/dilithium3/m4fstack/sign.c b/crypto_sign/dilithium3/m4fstack/sign.c
index eff33f33..9a3346c5 100644
--- a/crypto_sign/dilithium3/m4fstack/sign.c
+++ b/crypto_sign/dilithium3/m4fstack/sign.c
@@ -99,8 +99,7 @@ int crypto_sign_signature(uint8_t *sig,
     shake256incctx s256;
   } state;
 
-    // TODO: change this to union
-  struct {
+  union {
     poly full;
     struct {
       smallpoly stmp0;
@@ -172,10 +171,12 @@ int crypto_sign_signature(uint8_t *sig,
 
   poly_challenge_compress(ccomp, tmp0);
   
-  poly_small_ntt_precomp(scp, &cp_small_prime, tmp0);
-
   /* Compute z, reject if it reveals secret */
     for(size_t l_idx=0;l_idx < L; l_idx++){
+    if(l_idx != 0){
+      poly_challenge_decompress(tmp0, ccomp);
+    }
+    poly_small_ntt_precomp(scp, &cp_small_prime, tmp0);
       unpack_sk_s1(stmp0, sk, l_idx);
       small_ntt(stmp0->coeffs);
       poly_small_basemul_invntt(tmp0, scp, &cp_small_prime, stmp0);
@@ -198,6 +199,9 @@ int crypto_sign_signature(uint8_t *sig,
    * do not reveal secret information */
   
   for(unsigned int k_idx = 0; k_idx < K; ++k_idx) {
+    poly_challenge_decompress(tmp0, ccomp);
+    poly_small_ntt_precomp(scp, &cp_small_prime, tmp0);
+
     unpack_sk_s2(stmp0, sk, k_idx);
     small_ntt(stmp0->coeffs);
     poly_small_basemul_invntt(tmp0, scp, &cp_small_prime, stmp0);
diff --git a/crypto_sign/dilithium3/m4fstack/stack.h b/crypto_sign/dilithium3/m4fstack/stack.h
index 38626a61..6597b78e 100644
--- a/crypto_sign/dilithium3/m4fstack/stack.h
+++ b/crypto_sign/dilithium3/m4fstack/stack.h
@@ -8,7 +8,6 @@
 #include "fips202.h"
 
 void poly_challenge_compress(uint8_t c[68], const poly *cp);
-// TODO: remove this one
 void poly_challenge_decompress(poly *cp, const uint8_t c[68]);
 
 

From a37b5a627a966a247fe994a7e089b557e7e7b47b Mon Sep 17 00:00:00 2001
From: Amin Abdulrahman <amin@abdulrahman.de>
Date: Mon, 18 Mar 2024 15:23:42 +0100
Subject: [PATCH 16/32] Eliminate some smaller buffers

---
 crypto_sign/dilithium3/m4fstack/sign.c | 26 ++++++++++++++++++--------
 1 file changed, 18 insertions(+), 8 deletions(-)

diff --git a/crypto_sign/dilithium3/m4fstack/sign.c b/crypto_sign/dilithium3/m4fstack/sign.c
index 9a3346c5..2e0a66f9 100644
--- a/crypto_sign/dilithium3/m4fstack/sign.c
+++ b/crypto_sign/dilithium3/m4fstack/sign.c
@@ -87,8 +87,9 @@ int crypto_sign_signature(uint8_t *sig,
                           size_t mlen,
                           const uint8_t *sk)
 {
-  uint8_t seedbuf[2 * SEEDBYTES + TRBYTES + RNDBYTES + 2 * CRHBYTES];
-  uint8_t *rho, *tr, *key, *mu, *rhoprime, *rnd;
+  uint8_t buf[2 * CRHBYTES];
+  uint8_t *mu, *rhoprime, *rnd;
+  const uint8_t *rho, *tr, *key;
   uint16_t nonce = 0;
   unsigned int n;
   uint8_t wcomp[K][768];
@@ -113,11 +114,12 @@ int crypto_sign_signature(uint8_t *sig,
 
   smallhalfpoly cp_small_prime;
 
-  rho = seedbuf;
-  tr = rho + SEEDBYTES;
-  key = tr + TRBYTES;
-  rnd = key + SEEDBYTES;
-  mu = rnd + RNDBYTES;
+  rho = sk;
+  tr = sk + SEEDBYTES*2;
+  key = sk + SEEDBYTES;
+  
+  mu = buf;
+  rnd = mu + CRHBYTES;
   rhoprime = mu + CRHBYTES;
   unpack_sk_stack(rho, tr, key, sk);
 
@@ -128,10 +130,18 @@ int crypto_sign_signature(uint8_t *sig,
   shake256_inc_finalize(&state.s256);
   shake256_inc_squeeze(mu, CRHBYTES, &state.s256);
 
+  // Note: RNDBYTES < CRHBYTES, so buffer has proper size
   for (n = 0; n < RNDBYTES; n++) {
      rnd[n] = 0;
   }
-  shake256(rhoprime, CRHBYTES, key, SEEDBYTES + RNDBYTES + CRHBYTES);
+
+  shake256_inc_init(&state.s256);
+  shake256_inc_absorb(&state.s256, key, SEEDBYTES);
+  shake256_inc_absorb(&state.s256, rnd, RNDBYTES);
+  shake256_inc_absorb(&state.s256, mu, CRHBYTES);
+  shake256_inc_finalize(&state.s256);
+  // rnd can be overwritten here
+  shake256_inc_squeeze(rhoprime, CRHBYTES, &state.s256);
 
 rej:  
     for (size_t k_idx = 0; k_idx < K; k_idx++) {

From 2bd00ad4dafb09c9abbd4f98f3ff278ed4624d55 Mon Sep 17 00:00:00 2001
From: Amin Abdulrahman <amin@abdulrahman.de>
Date: Mon, 18 Mar 2024 16:31:36 +0100
Subject: [PATCH 17/32] Remove asym small mul

---
 crypto_sign/dilithium3/m4fstack/sign.c      |  10 +-
 crypto_sign/dilithium3/m4fstack/smallntt.S  | 195 +++++++++-----------
 crypto_sign/dilithium3/m4fstack/smallntt.h  |   6 +-
 crypto_sign/dilithium3/m4fstack/smallpoly.c |  13 +-
 crypto_sign/dilithium3/m4fstack/smallpoly.h |   7 +-
 5 files changed, 99 insertions(+), 132 deletions(-)

diff --git a/crypto_sign/dilithium3/m4fstack/sign.c b/crypto_sign/dilithium3/m4fstack/sign.c
index 2e0a66f9..ff4096e1 100644
--- a/crypto_sign/dilithium3/m4fstack/sign.c
+++ b/crypto_sign/dilithium3/m4fstack/sign.c
@@ -112,8 +112,6 @@ int crypto_sign_signature(uint8_t *sig,
   smallpoly *stmp0 = &polybuffer.small.stmp0;
   smallpoly *scp   = &polybuffer.small.stmp1;
 
-  smallhalfpoly cp_small_prime;
-
   rho = sk;
   tr = sk + SEEDBYTES*2;
   key = sk + SEEDBYTES;
@@ -186,10 +184,10 @@ int crypto_sign_signature(uint8_t *sig,
     if(l_idx != 0){
       poly_challenge_decompress(tmp0, ccomp);
     }
-    poly_small_ntt_precomp(scp, &cp_small_prime, tmp0);
+      poly_small_ntt_copy(scp, tmp0);
       unpack_sk_s1(stmp0, sk, l_idx);
       small_ntt(stmp0->coeffs);
-      poly_small_basemul_invntt(tmp0, scp, &cp_small_prime, stmp0);
+      poly_small_basemul_invntt(tmp0, scp, stmp0);
 
       poly_uniform_gamma1_add_stack(tmp0, tmp0, rhoprime, L*(nonce-1) + l_idx, &state.s256);
 
@@ -210,11 +208,11 @@ int crypto_sign_signature(uint8_t *sig,
   
   for(unsigned int k_idx = 0; k_idx < K; ++k_idx) {
     poly_challenge_decompress(tmp0, ccomp);
-    poly_small_ntt_precomp(scp, &cp_small_prime, tmp0);
+    poly_small_ntt_copy(scp, tmp0);
 
     unpack_sk_s2(stmp0, sk, k_idx);
     small_ntt(stmp0->coeffs);
-    poly_small_basemul_invntt(tmp0, scp, &cp_small_prime, stmp0);
+    poly_small_basemul_invntt(tmp0, scp, stmp0);
 
     polyw_sub(tmp0, wcomp[k_idx], tmp0);
     poly_reduce(tmp0);
diff --git a/crypto_sign/dilithium3/m4fstack/smallntt.S b/crypto_sign/dilithium3/m4fstack/smallntt.S
index 747c111c..a9a4a576 100644
--- a/crypto_sign/dilithium3/m4fstack/smallntt.S
+++ b/crypto_sign/dilithium3/m4fstack/smallntt.S
@@ -111,7 +111,6 @@
 .align 2
 small_ntt_asm:
   push {r4-r11, r14}
-  vpush.w {s16}
 
   poly        .req r0
   twiddle_ptr .req r1
@@ -137,33 +136,33 @@ small_ntt_asm:
   .equ offset, 32
   .equ strincr, 4
   // pre-load twiddle factors to FPU registers
-  vldm twiddle_ptr!, {s8-s15}
+  vldm twiddle_ptr!, {s20-s27}
 
 
   add tmp, poly, #strincr*8
-  vmov s16, tmp
+  vmov s12, tmp
   1:
     // load a1, a3, ..., a15
     load poly, poly0, poly1, poly2, poly3, #offset, #distance/4+offset, #2*distance/4+offset, #3*distance/4+offset
     load poly, poly4, poly5, poly6, poly7, #distance+offset, #5*distance/4+offset, #6*distance/4+offset, #7*distance/4+offset
 
     // 8-NTT on a1, a3, ..., a15
-    _3_layer_double_CT_16_fp poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s8, s9, s10, s11, twiddle, qinv, q, tmp, tmp2
+    _3_layer_double_CT_16_fp poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s20, s21, s22, s23, twiddle, qinv, q, tmp, tmp2
 
     // multiply coeffs by layer 4 twiddles for later use
-    vmov twiddle, s12
+    vmov twiddle, s24
     mul_twiddle b, poly0, twiddle, tmp, tmp2, q, qinv
     mul_twiddle t, poly1, twiddle, tmp, tmp2, q, qinv
 
-    vmov twiddle, s13
+    vmov twiddle, s25
     mul_twiddle b, poly2, twiddle, tmp, tmp2, q, qinv
     mul_twiddle t, poly3, twiddle, tmp, tmp2, q, qinv
 
-    vmov twiddle, s14
+    vmov twiddle, s26
     mul_twiddle b, poly4, twiddle, tmp, tmp2, q, qinv
     mul_twiddle t, poly5, twiddle, tmp, tmp2, q, qinv
 
-    vmov twiddle, s15
+    vmov twiddle, s27
     mul_twiddle b, poly6, twiddle, tmp, tmp2, q, qinv
     mul_twiddle t, poly7, twiddle, tmp, tmp2, q, qinv
 
@@ -183,7 +182,7 @@ small_ntt_asm:
     load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
 
     // 8-NTT on a0, a2, ..., a14
-    _3_layer_double_CT_16_fp poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s8, s9, s10, s11, twiddle, qinv, q, tmp, tmp2
+    _3_layer_double_CT_16_fp poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s20, s21, s22, s23, twiddle, qinv, q, tmp, tmp2
 
     // layer 4 - 1
     // addsub: (a2, a6, a10, a14), (a3, a7, a11, a15)
@@ -247,7 +246,7 @@ small_ntt_asm:
     str.w poly7, [poly, #6*distance/4+offset]
     str.w poly0, [poly], #4
 
-    vmov tmp, s16
+    vmov tmp, s12
     cmp.w poly, tmp
   bne.w 1b
 
@@ -277,7 +276,6 @@ small_ntt_asm:
     cmp.w poly, tmp
   bne.w 2b
 
-  vpop.w {s16}
   pop {r4-r11, pc}
 
 
@@ -495,32 +493,32 @@ small_invntt_tomont_asm:
   .equ strincr, 64
 
   // pre-load twiddle factors to FPU registers
-  vldm twiddle_ptr!, {s8-s15}
+  vldm twiddle_ptr!, {s20-s27}
 
   add.w tmp, poly, #8*strincr
-  vmov s8, tmp
+  vmov s12, tmp
   1:
     // load a1, a3, ..., a15
     load poly, poly0, poly1, poly2, poly3, #offset, #distance/4+offset, #2*distance/4+offset, #3*distance/4+offset
     load poly, poly4, poly5, poly6, poly7, #distance+offset, #5*distance/4+offset, #6*distance/4+offset, #7*distance/4+offset
 
     // NTT on a1, a3, ..., a15
-    _3_layer_double_inv_CT_16_light poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s8, s9, s10, s11, twiddle, q, qinv, tmp, tmp2
+    _3_layer_double_inv_CT_16_light poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s20, s21, s22, s23, twiddle, q, qinv, tmp, tmp2
 
     // multiply coeffs by layer 4 twiddles for later use
-    vmov twiddle, s12
+    vmov twiddle, s24
     mul_twiddle b, poly0, twiddle, tmp, tmp2, q, qinv // could be omitted but kept for reduction only
     mul_twiddle t, poly1, twiddle, tmp, tmp2, q, qinv
 
-    vmov twiddle, s13
+    vmov twiddle, s25
     mul_twiddle b, poly2, twiddle, tmp, tmp2, q, qinv
     mul_twiddle t, poly3, twiddle, tmp, tmp2, q, qinv
 
-    vmov twiddle, s14
+    vmov twiddle, s26
     mul_twiddle b, poly4, twiddle, tmp, tmp2, q, qinv
     mul_twiddle t, poly5, twiddle, tmp, tmp2, q, qinv
 
-    vmov twiddle, s15
+    vmov twiddle, s27
     mul_twiddle b, poly6, twiddle, tmp, tmp2, q, qinv
     mul_twiddle t, poly7, twiddle, tmp, tmp2, q, qinv
 
@@ -540,7 +538,7 @@ small_invntt_tomont_asm:
     load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
 
     // NTT on a0, a2, ..., a14
-    _3_layer_double_inv_CT_16_light poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s8, s9, s10, s11, twiddle, q, qinv, tmp, tmp2
+    _3_layer_double_inv_CT_16_light poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s20, s21, s22, s23, twiddle, q, qinv, tmp, tmp2
 
     // layer 4 - 1
     // addsub: (a2, a6, a10, a14), (a3, a7, a11, a15)
@@ -604,7 +602,7 @@ small_invntt_tomont_asm:
     str.w poly7, [poly, #6*distance/4+offset]
     str.w poly0, [poly], #strincr // increase 2*8*4 = 64 (2 * 8 loads of 4 bytes each)
 
-    vmov tmp, s8
+    vmov tmp, s12
     cmp.w poly, tmp
   bne.w 1b
 
@@ -618,9 +616,9 @@ small_invntt_tomont_asm:
   load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4
   load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
 
-  vldm twiddle_ptr!, {s5-s7}
+  vldm twiddle_ptr!, {s21-s23}
 
-  _3_layer_double_inv_CT_16_light_reduce poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s5, s5, s6, s7, twiddle, q, qinv, tmp, tmp2
+  _3_layer_double_inv_CT_16_light_reduce poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s20, s21, s22, s23, twiddle, q, qinv, tmp, tmp2
 
   vmov.w s2, poly
   movw poly, #:lower16:5585133
@@ -742,96 +740,69 @@ small_invntt_tomont_asm:
 .unreq tmp
 .unreq tmp2
 
-.align 2
-.global small_pointmul_asm
-.type small_pointmul_asm, %function
-small_pointmul_asm:
-    push.w {r4-r11, lr}
-
-    movw r14, #769
-    movt r14, #767
-
-    .equ width, 4
-
-    add.w r12, r2, #64*2
-    _point_mul_16_loop:
-
-    ldr.w r7, [r1, #2*width]
-    ldr.w r8, [r1, #3*width]
-    ldrsh.w r9, [r2, #1*2]
-    ldr.w r5, [r1, #1*width]
-    ldr.w r4, [r1], #4*width
-    ldrsh.w r6, [r2], #2*2
-
-    smultb r10, r4, r6
-    montgomery r14, r14, r10, r11
-    pkhbt r4, r4, r11
-
-
-    neg.w r6, r6
-
-    smultb r10, r5, r6
-    montgomery r14, r14, r10, r11
-    pkhbt r5, r5, r11
-
-    str.w r5, [r0, #1*width]
-    str.w r4, [r0], #2*width
-
-    smultb r10, r7, r9
-    montgomery r14, r14, r10, r11
-    pkhbt r7, r7, r11
-
-    neg.w r9, r9
-
-    smultb r10, r8, r9
-    montgomery r14, r14, r10, r11
-    pkhbt r8, r8, r11
+// BASEMUL
 
-    str.w r8, [r0, #1*width]
-    str.w r7, [r0], #2*width
 
-    cmp.w r2, r12
-    bne.w _point_mul_16_loop
-
-    pop.w {r4-r11, pc}
-
-  .align 2
-.global small_asymmetric_mul_asm
-.type small_asymmetric_mul_asm, %function
-small_asymmetric_mul_asm:
-    push.w {r4-r11, lr}
-
-    movw r14, #769
-    movt r14, #767
-    .equ width, 4
-    add.w r12, r0, #256*2
-    _asymmetric_mul_16_loop:
-    ldr.w r7, [r1, #width]
-    ldr.w r4, [r1], #2*width
-    ldr.w r8, [r2, #width]
-    ldr.w r5, [r2], #2*width
-    ldr.w r9, [r3, #width]
-    ldr.w r6, [r3], #2*width
-
-    smuad r10, r4, r6
-    montgomery r14, r14, r10, r6
-    smuadx r11, r4, r5
-    montgomery r14, r14, r11, r10
-
-    pkhtb r10, r10, r6, asr#16
-
-    str.w r10, [r0], #width
-
-    smuad r10, r7, r9
-    montgomery r14, r14, r10, r6
-    smuadx r11, r7, r8
-    montgomery r14, r14, r11, r10
-
-    pkhtb r10, r10, r6, asr#16
-    str.w r10, [r0], #width
-
-
-    cmp.w r0, r12
-    bne.w _asymmetric_mul_16_loop
+.global small_basemul_asm
+.type small_basemul_asm, %function
+.align 2
+small_basemul_asm: // r0 = result, r1 = a, r2 = b, r3 = zeta table; multiplies a and b in groups of four 16-bit coefficients (two degree-1 products per group, using zeta and -zeta)
+  push {r4-r11, lr}
+
+  rptr  .req r0
+  aptr  .req r1
+  bptr  .req r2
+  zeta_ptr  .req r3
+  poly0 .req r4
+  poly1 .req r6
+  poly2 .req r5
+  poly3 .req r7 // TODO: remove poly3
+  q     .req r8 // q and qinv deliberately alias r8: q lives in the low halfword, qinv in the high halfword
+  qinv  .req r8
+  tmp   .req r9
+  tmp2  .req r10
+  tmp3  .req r11
+  zeta  .req r12
+  ctr  .req r14
+
+  movw  q, #769 // low halfword of r8 = q = 769
+  movt qinv, #767 // high halfword of r8 = 767; 769*767 = 9*2^16 - 1, so 767 = -q^-1 mod 2^16
+  add ctr, rptr, #64*2*4 // end-of-output pointer: 64 iterations, each storing two 4-byte words (512 bytes total)
+  1:
 
-    pop.w {r4-r11, pc}
\ No newline at end of file
+    ldr poly2, [aptr, #4] // a2 (bottom) and a3 (top) packed in one word
+    ldr poly3, [bptr, #4]  // b2 (bottom) and b3 (top)
+    ldrh.w zeta, [zeta_ptr], #2 // one 16-bit zeta per iteration; only the low halfword is read by smultb below
+    ldr poly0, [aptr], #8 // a0, a1; advance a by four coefficients
+    ldr poly1, [bptr], #8 // b0, b1; advance b by four coefficients
+    
+    //basemul(r->coeffs + 4 * i, a->coeffs + 4 * i, b->coeffs + 4 * i, zetas[64 + i]);
+    smultt tmp, poly0, poly1 // a1*b1
+    montgomery q, qinv, tmp, tmp2 // reduction macro (defined elsewhere); its result is consumed from the top halfword of tmp2
+    smultb tmp2, tmp2, zeta // reduced(a1*b1) * zeta
+    smlabb tmp2, poly0, poly1, tmp2 // + a0*b0
+    montgomery q, qinv, tmp2, tmp // first output coefficient, taken from the top halfword of tmp
+
+    smuadx tmp2, poly0, poly1 // a0*b1 + a1*b0
+    montgomery q, qinv, tmp2, tmp3 // second output coefficient, taken from the top halfword of tmp3
+    pkhtb tmp, tmp3, tmp, asr#16 // pack: top = second coefficient, bottom = first coefficient
+    str tmp, [rptr], #4
+    
+    neg zeta, zeta // the second product of the group uses -zeta
+    
+    //basemul(r->coeffs + 4 * i + 2, a->coeffs + 4 * i + 2, b->coeffs + 4 * i + 2, - zetas[64 + i]);
+    smultt tmp, poly2, poly3 // a3*b3
+    montgomery q, qinv, tmp, tmp2
+    smultb tmp2, tmp2, zeta // reduced(a3*b3) * (-zeta)
+    smlabb tmp2, poly2, poly3, tmp2 // + a2*b2
+    montgomery q, qinv, tmp2, tmp
+
+    smuadx tmp2, poly2, poly3 // a2*b3 + a3*b2
+    montgomery q, qinv, tmp2, tmp3
+    pkhtb tmp, tmp3, tmp, asr#16 // same packing as for the first product
+    
+    str tmp, [rptr], #4
+    cmp.w rptr, ctr // loop until the output pointer reaches the precomputed end
+    bne.w 1b
+
+  pop {r4-r11, pc}
\ No newline at end of file
diff --git a/crypto_sign/dilithium3/m4fstack/smallntt.h b/crypto_sign/dilithium3/m4fstack/smallntt.h
index 0aa0ce9b..048d5df5 100644
--- a/crypto_sign/dilithium3/m4fstack/smallntt.h
+++ b/crypto_sign/dilithium3/m4fstack/smallntt.h
@@ -42,12 +42,10 @@ static const int16_t zetas_inv_CT_asm[256] = {
 
 void small_ntt_asm(int16_t a[N], const int16_t * zetas);
 void small_invntt_tomont_asm(int16_t a[N], const int16_t * zetas);
-void small_pointmul_asm(int16_t out[N], const int16_t in[N], const int16_t *zetas);
-void small_asymmetric_mul_asm(int16_t c[256], const int16_t a[256], const int16_t b[256], const int16_t b_prime[256]);
+void small_basemul_asm(int16_t *c, const int16_t *a, const int16_t *b, const int16_t *zetas);
 
 #define small_ntt(a) small_ntt_asm(a, zetas_asm)
 #define small_invntt_tomont(a) small_invntt_tomont_asm(a, zetas_inv_CT_asm)
-#define small_point_mul(out, in) small_pointmul_asm(out, in, zetas)
-#define small_asymmetric_mul(c, a, b, b_prime) small_asymmetric_mul_asm(c, a, b, b_prime);
+#define small_basemul(r,a,b) small_basemul_asm(r, a, b, zetas)
 
 #endif
diff --git a/crypto_sign/dilithium3/m4fstack/smallpoly.c b/crypto_sign/dilithium3/m4fstack/smallpoly.c
index 1f7fab17..433d98af 100644
--- a/crypto_sign/dilithium3/m4fstack/smallpoly.c
+++ b/crypto_sign/dilithium3/m4fstack/smallpoly.c
@@ -1,13 +1,12 @@
 #include "smallpoly.h"
 #include "smallntt.h"
 
-void poly_small_ntt_precomp(smallpoly *out, smallhalfpoly *out2, poly *in) {
-  for (int i = N; i >= 0; i--)
+void poly_small_ntt_copy(smallpoly *out, poly *in) { // copy the N coefficients of in into the 16-bit poly out, then apply small_ntt to out
+  for (int i = N - 1; i >= 0; i--) // starts at the last valid index N-1; NOTE(review): descending order presumably matters because sign.c overlays out (stmp1) on the upper half of the same union as in -- confirm against callers
   {
     out->coeffs[i] = in->coeffs[i];
   }
   small_ntt(out->coeffs);
-  small_point_mul(out2->coeffs, out->coeffs);
 }
 
 
@@ -28,10 +27,10 @@ void polyveck_small_ntt(smallpoly v[K]) {
 
 
 
-void poly_small_basemul_invntt(poly *r, const smallpoly *a, const smallhalfpoly *aprime, const smallpoly *b){
+void poly_small_basemul_invntt(poly *r, const smallpoly *a, const smallpoly *b){
     // re-use the buffer
     smallpoly *tmp = (smallpoly *)r;
-    small_asymmetric_mul(tmp->coeffs, b->coeffs, a->coeffs, aprime->coeffs);
+    small_basemul(tmp->coeffs, a->coeffs, b->coeffs);
     small_invntt_tomont(tmp->coeffs);
 
     #ifdef SMALL_POLY_16_BIT
@@ -43,10 +42,10 @@ void poly_small_basemul_invntt(poly *r, const smallpoly *a, const smallhalfpoly
     #endif
 }
 
-void polyvecl_small_basemul_invntt(polyvecl *r, const smallpoly *a, const smallhalfpoly *aprime, const smallpoly b[L]){
+void polyvecl_small_basemul_invntt(polyvecl *r, const smallpoly *a, const smallpoly b[L]){ // applies poly_small_basemul_invntt to each of the L entries: r->vec[i] is computed from a and b[i]
     unsigned int i;
     for(i=0;i<L;i++){
-        poly_small_basemul_invntt(&r->vec[i], a, aprime, &b[i]);
+        poly_small_basemul_invntt(&r->vec[i], a, &b[i]); // the same a is reused for every entry
     }
 }
 
diff --git a/crypto_sign/dilithium3/m4fstack/smallpoly.h b/crypto_sign/dilithium3/m4fstack/smallpoly.h
index f2cf843b..1aac98fa 100644
--- a/crypto_sign/dilithium3/m4fstack/smallpoly.h
+++ b/crypto_sign/dilithium3/m4fstack/smallpoly.h
@@ -13,13 +13,14 @@ typedef struct {
 
 typedef smallpoly smallhalfpoly;
 
-void poly_small_ntt_precomp(smallpoly *out, smallhalfpoly *out2, poly *in);
+void poly_small_ntt_copy(smallpoly*, poly*);
+
 void polyvecl_small_ntt(smallpoly v[L]);
 void polyveck_small_ntt(smallpoly v[K]);
 
 
-void polyvecl_small_basemul_invntt(polyvecl *r, const smallpoly *a, const smallhalfpoly *aprime, const smallpoly b[L]);
-void poly_small_basemul_invntt(poly *r, const smallpoly *a, const smallhalfpoly *aprime, const smallpoly *b);
+void polyvecl_small_basemul_invntt(polyvecl *r, const smallpoly *a, const smallpoly b[L]);
+void poly_small_basemul_invntt(poly *r, const smallpoly *a, const smallpoly *b);
 
 void small_polyeta_unpack(smallpoly *r, const uint8_t *a);
 

From 77a75728bca65be08f897f5dbeb2704cd34e8d6e Mon Sep 17 00:00:00 2001
From: Amin Abdulrahman <amin@abdulrahman.de>
Date: Mon, 18 Mar 2024 16:45:16 +0100
Subject: [PATCH 18/32] Stack friendly uniform_gamma1 w/o add

---
 crypto_sign/dilithium3/m4fstack/sign.c  |  2 +-
 crypto_sign/dilithium3/m4fstack/stack.c | 13 +++++++++++++
 crypto_sign/dilithium3/m4fstack/stack.h |  1 +
 3 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/crypto_sign/dilithium3/m4fstack/sign.c b/crypto_sign/dilithium3/m4fstack/sign.c
index ff4096e1..ab1426ce 100644
--- a/crypto_sign/dilithium3/m4fstack/sign.c
+++ b/crypto_sign/dilithium3/m4fstack/sign.c
@@ -150,7 +150,7 @@ int crypto_sign_signature(uint8_t *sig,
 
       for (size_t l_idx = 0; l_idx < L; l_idx++) {
         /* Sample intermediate vector y */
-        poly_uniform_gamma1(tmp0, rhoprime, L*nonce + l_idx);
+        poly_uniform_gamma1_stack(tmp0, rhoprime, L*nonce + l_idx, &state.s256);
         poly_ntt(tmp0);
 
         /* Matrix-vector multiplication */
diff --git a/crypto_sign/dilithium3/m4fstack/stack.c b/crypto_sign/dilithium3/m4fstack/stack.c
index 2824bc77..d3256c8b 100644
--- a/crypto_sign/dilithium3/m4fstack/stack.c
+++ b/crypto_sign/dilithium3/m4fstack/stack.c
@@ -325,6 +325,19 @@ static void polyz_unpack_inplace(int32_t *r){
 #endif
 }
 
+void poly_uniform_gamma1_stack(poly *a, const uint8_t seed[CRHBYTES], uint16_t nonce, shake256incctx *state){ // fills a from the SHAKE256 stream for (seed, nonce), chunk by chunk, using the caller-provided context
+  int32_t buf[POLY_UNIFORM_GAMMA1_BUFFERSIZE_COEFFS]; // staging buffer holding one chunk of coefficients
+
+  stream256_init(state, seed, nonce); // NOTE(review): presumably (re)initializes state and absorbs seed and nonce -- defined elsewhere, confirm
+  for(size_t i = 0; i < N/POLY_UNIFORM_GAMMA1_BUFFERSIZE_COEFFS; i++){ // integer division: assumes the chunk size divides N -- TODO confirm
+    shake256_inc_squeeze((uint8_t *)buf, POLY_UNIFORM_GAMMA1_BUFFERSIZE_BYTES, state); // squeezed bytes are written to the start of buf
+    polyz_unpack_inplace(buf); // in-place unpack within the same buffer (static helper defined above in this file)
+
+    for(size_t j = 0; j < POLY_UNIFORM_GAMMA1_BUFFERSIZE_COEFFS; j++){
+      a->coeffs[i*POLY_UNIFORM_GAMMA1_BUFFERSIZE_COEFFS + j] = buf[j]; // plain assignment: previous contents of a are overwritten
+    }
+  }
+}
 
 void poly_uniform_gamma1_add_stack(poly *a, poly *b, const uint8_t seed[CRHBYTES], uint16_t nonce, shake256incctx *state){
   int32_t buf[POLY_UNIFORM_GAMMA1_BUFFERSIZE_COEFFS];
diff --git a/crypto_sign/dilithium3/m4fstack/stack.h b/crypto_sign/dilithium3/m4fstack/stack.h
index 6597b78e..c21714c7 100644
--- a/crypto_sign/dilithium3/m4fstack/stack.h
+++ b/crypto_sign/dilithium3/m4fstack/stack.h
@@ -26,6 +26,7 @@ void unpack_sk_s2(smallpoly *a, const uint8_t *sk, size_t idx);
 
 
 void poly_uniform_pointwise_montgomery_polywadd_stack(uint8_t wcomp[3*N], poly *b, uint8_t  seed[SEEDBYTES], uint16_t nonce, shake128incctx *state);
+void poly_uniform_gamma1_stack(poly *a, const uint8_t seed[CRHBYTES], uint16_t nonce, shake256incctx *state);
 void poly_uniform_gamma1_add_stack(poly *a, poly *b, const uint8_t seed[CRHBYTES], uint16_t nonce, shake256incctx *state);
 
 size_t poly_make_hint_stack(poly *a, poly *t, uint8_t w[768]);

From 6609f829d81fc7b556944218a274f4e7e6524ce4 Mon Sep 17 00:00:00 2001
From: Amin Abdulrahman <amin@abdulrahman.de>
Date: Mon, 18 Mar 2024 17:55:15 +0100
Subject: [PATCH 19/32] Stack optimized Dilithium{2,5}

---
 crypto_sign/dilithium2/m4fstack/api.h             | 1 +
 crypto_sign/dilithium2/m4fstack/config.h          | 1 +
 crypto_sign/dilithium2/m4fstack/macros.i          | 1 +
 crypto_sign/dilithium2/m4fstack/ntt.S             | 1 +
 crypto_sign/dilithium2/m4fstack/ntt.h             | 1 +
 crypto_sign/dilithium2/m4fstack/packing.c         | 1 +
 crypto_sign/dilithium2/m4fstack/packing.h         | 1 +
 crypto_sign/dilithium2/m4fstack/params.h          | 1 +
 crypto_sign/dilithium2/m4fstack/pointwise_mont.h  | 1 +
 crypto_sign/dilithium2/m4fstack/pointwise_mont.s  | 1 +
 crypto_sign/dilithium2/m4fstack/poly.c            | 1 +
 crypto_sign/dilithium2/m4fstack/poly.h            | 1 +
 crypto_sign/dilithium2/m4fstack/polyvec.c         | 1 +
 crypto_sign/dilithium2/m4fstack/polyvec.h         | 1 +
 crypto_sign/dilithium2/m4fstack/reduce.h          | 1 +
 crypto_sign/dilithium2/m4fstack/rounding.c        | 1 +
 crypto_sign/dilithium2/m4fstack/rounding.h        | 1 +
 crypto_sign/dilithium2/m4fstack/sign.c            | 1 +
 crypto_sign/dilithium2/m4fstack/sign.h            | 1 +
 crypto_sign/dilithium2/m4fstack/smallntt.S        | 1 +
 crypto_sign/dilithium2/m4fstack/smallntt.h        | 1 +
 crypto_sign/dilithium2/m4fstack/smallpoly.c       | 1 +
 crypto_sign/dilithium2/m4fstack/smallpoly.h       | 1 +
 crypto_sign/dilithium2/m4fstack/stack.c           | 1 +
 crypto_sign/dilithium2/m4fstack/stack.h           | 1 +
 crypto_sign/dilithium2/m4fstack/symmetric-shake.c | 1 +
 crypto_sign/dilithium2/m4fstack/symmetric.h       | 1 +
 crypto_sign/dilithium2/m4fstack/vector.h          | 1 +
 crypto_sign/dilithium2/m4fstack/vector.s          | 1 +
 crypto_sign/dilithium5/m4fstack/api.h             | 1 +
 crypto_sign/dilithium5/m4fstack/config.h          | 1 +
 crypto_sign/dilithium5/m4fstack/macros.i          | 1 +
 crypto_sign/dilithium5/m4fstack/ntt.S             | 1 +
 crypto_sign/dilithium5/m4fstack/ntt.h             | 1 +
 crypto_sign/dilithium5/m4fstack/packing.c         | 1 +
 crypto_sign/dilithium5/m4fstack/packing.h         | 1 +
 crypto_sign/dilithium5/m4fstack/params.h          | 1 +
 crypto_sign/dilithium5/m4fstack/pointwise_mont.h  | 1 +
 crypto_sign/dilithium5/m4fstack/pointwise_mont.s  | 1 +
 crypto_sign/dilithium5/m4fstack/poly.c            | 1 +
 crypto_sign/dilithium5/m4fstack/poly.h            | 1 +
 crypto_sign/dilithium5/m4fstack/polyvec.c         | 1 +
 crypto_sign/dilithium5/m4fstack/polyvec.h         | 1 +
 crypto_sign/dilithium5/m4fstack/reduce.h          | 1 +
 crypto_sign/dilithium5/m4fstack/rounding.c        | 1 +
 crypto_sign/dilithium5/m4fstack/rounding.h        | 1 +
 crypto_sign/dilithium5/m4fstack/sign.c            | 1 +
 crypto_sign/dilithium5/m4fstack/sign.h            | 1 +
 crypto_sign/dilithium5/m4fstack/smallntt.S        | 1 +
 crypto_sign/dilithium5/m4fstack/smallntt.h        | 1 +
 crypto_sign/dilithium5/m4fstack/smallpoly.c       | 1 +
 crypto_sign/dilithium5/m4fstack/smallpoly.h       | 1 +
 crypto_sign/dilithium5/m4fstack/stack.c           | 1 +
 crypto_sign/dilithium5/m4fstack/stack.h           | 1 +
 crypto_sign/dilithium5/m4fstack/symmetric-shake.c | 1 +
 crypto_sign/dilithium5/m4fstack/symmetric.h       | 1 +
 crypto_sign/dilithium5/m4fstack/vector.h          | 1 +
 crypto_sign/dilithium5/m4fstack/vector.s          | 1 +
 58 files changed, 58 insertions(+)
 create mode 120000 crypto_sign/dilithium2/m4fstack/api.h
 create mode 120000 crypto_sign/dilithium2/m4fstack/config.h
 create mode 120000 crypto_sign/dilithium2/m4fstack/macros.i
 create mode 120000 crypto_sign/dilithium2/m4fstack/ntt.S
 create mode 120000 crypto_sign/dilithium2/m4fstack/ntt.h
 create mode 120000 crypto_sign/dilithium2/m4fstack/packing.c
 create mode 120000 crypto_sign/dilithium2/m4fstack/packing.h
 create mode 120000 crypto_sign/dilithium2/m4fstack/params.h
 create mode 120000 crypto_sign/dilithium2/m4fstack/pointwise_mont.h
 create mode 120000 crypto_sign/dilithium2/m4fstack/pointwise_mont.s
 create mode 120000 crypto_sign/dilithium2/m4fstack/poly.c
 create mode 120000 crypto_sign/dilithium2/m4fstack/poly.h
 create mode 120000 crypto_sign/dilithium2/m4fstack/polyvec.c
 create mode 120000 crypto_sign/dilithium2/m4fstack/polyvec.h
 create mode 120000 crypto_sign/dilithium2/m4fstack/reduce.h
 create mode 120000 crypto_sign/dilithium2/m4fstack/rounding.c
 create mode 120000 crypto_sign/dilithium2/m4fstack/rounding.h
 create mode 120000 crypto_sign/dilithium2/m4fstack/sign.c
 create mode 120000 crypto_sign/dilithium2/m4fstack/sign.h
 create mode 120000 crypto_sign/dilithium2/m4fstack/smallntt.S
 create mode 120000 crypto_sign/dilithium2/m4fstack/smallntt.h
 create mode 120000 crypto_sign/dilithium2/m4fstack/smallpoly.c
 create mode 120000 crypto_sign/dilithium2/m4fstack/smallpoly.h
 create mode 120000 crypto_sign/dilithium2/m4fstack/stack.c
 create mode 120000 crypto_sign/dilithium2/m4fstack/stack.h
 create mode 120000 crypto_sign/dilithium2/m4fstack/symmetric-shake.c
 create mode 120000 crypto_sign/dilithium2/m4fstack/symmetric.h
 create mode 120000 crypto_sign/dilithium2/m4fstack/vector.h
 create mode 120000 crypto_sign/dilithium2/m4fstack/vector.s
 create mode 120000 crypto_sign/dilithium5/m4fstack/api.h
 create mode 120000 crypto_sign/dilithium5/m4fstack/config.h
 create mode 120000 crypto_sign/dilithium5/m4fstack/macros.i
 create mode 120000 crypto_sign/dilithium5/m4fstack/ntt.S
 create mode 120000 crypto_sign/dilithium5/m4fstack/ntt.h
 create mode 120000 crypto_sign/dilithium5/m4fstack/packing.c
 create mode 120000 crypto_sign/dilithium5/m4fstack/packing.h
 create mode 120000 crypto_sign/dilithium5/m4fstack/params.h
 create mode 120000 crypto_sign/dilithium5/m4fstack/pointwise_mont.h
 create mode 120000 crypto_sign/dilithium5/m4fstack/pointwise_mont.s
 create mode 120000 crypto_sign/dilithium5/m4fstack/poly.c
 create mode 120000 crypto_sign/dilithium5/m4fstack/poly.h
 create mode 120000 crypto_sign/dilithium5/m4fstack/polyvec.c
 create mode 120000 crypto_sign/dilithium5/m4fstack/polyvec.h
 create mode 120000 crypto_sign/dilithium5/m4fstack/reduce.h
 create mode 120000 crypto_sign/dilithium5/m4fstack/rounding.c
 create mode 120000 crypto_sign/dilithium5/m4fstack/rounding.h
 create mode 120000 crypto_sign/dilithium5/m4fstack/sign.c
 create mode 120000 crypto_sign/dilithium5/m4fstack/sign.h
 create mode 120000 crypto_sign/dilithium5/m4fstack/smallntt.S
 create mode 120000 crypto_sign/dilithium5/m4fstack/smallntt.h
 create mode 120000 crypto_sign/dilithium5/m4fstack/smallpoly.c
 create mode 120000 crypto_sign/dilithium5/m4fstack/smallpoly.h
 create mode 120000 crypto_sign/dilithium5/m4fstack/stack.c
 create mode 120000 crypto_sign/dilithium5/m4fstack/stack.h
 create mode 120000 crypto_sign/dilithium5/m4fstack/symmetric-shake.c
 create mode 120000 crypto_sign/dilithium5/m4fstack/symmetric.h
 create mode 120000 crypto_sign/dilithium5/m4fstack/vector.h
 create mode 120000 crypto_sign/dilithium5/m4fstack/vector.s

diff --git a/crypto_sign/dilithium2/m4fstack/api.h b/crypto_sign/dilithium2/m4fstack/api.h
new file mode 120000
index 00000000..d29362d1
--- /dev/null
+++ b/crypto_sign/dilithium2/m4fstack/api.h
@@ -0,0 +1 @@
+../m4f/api.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium2/m4fstack/config.h b/crypto_sign/dilithium2/m4fstack/config.h
new file mode 120000
index 00000000..f3892d90
--- /dev/null
+++ b/crypto_sign/dilithium2/m4fstack/config.h
@@ -0,0 +1 @@
+../m4f/config.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium2/m4fstack/macros.i b/crypto_sign/dilithium2/m4fstack/macros.i
new file mode 120000
index 00000000..d615b854
--- /dev/null
+++ b/crypto_sign/dilithium2/m4fstack/macros.i
@@ -0,0 +1 @@
+../m4f/macros.i
\ No newline at end of file
diff --git a/crypto_sign/dilithium2/m4fstack/ntt.S b/crypto_sign/dilithium2/m4fstack/ntt.S
new file mode 120000
index 00000000..40cd5d40
--- /dev/null
+++ b/crypto_sign/dilithium2/m4fstack/ntt.S
@@ -0,0 +1 @@
+../m4f/ntt.S
\ No newline at end of file
diff --git a/crypto_sign/dilithium2/m4fstack/ntt.h b/crypto_sign/dilithium2/m4fstack/ntt.h
new file mode 120000
index 00000000..8e99caeb
--- /dev/null
+++ b/crypto_sign/dilithium2/m4fstack/ntt.h
@@ -0,0 +1 @@
+../m4f/ntt.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium2/m4fstack/packing.c b/crypto_sign/dilithium2/m4fstack/packing.c
new file mode 120000
index 00000000..1052fe26
--- /dev/null
+++ b/crypto_sign/dilithium2/m4fstack/packing.c
@@ -0,0 +1 @@
+../m4f/packing.c
\ No newline at end of file
diff --git a/crypto_sign/dilithium2/m4fstack/packing.h b/crypto_sign/dilithium2/m4fstack/packing.h
new file mode 120000
index 00000000..643cc32a
--- /dev/null
+++ b/crypto_sign/dilithium2/m4fstack/packing.h
@@ -0,0 +1 @@
+../m4f/packing.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium2/m4fstack/params.h b/crypto_sign/dilithium2/m4fstack/params.h
new file mode 120000
index 00000000..1f91a364
--- /dev/null
+++ b/crypto_sign/dilithium2/m4fstack/params.h
@@ -0,0 +1 @@
+../m4f/params.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium2/m4fstack/pointwise_mont.h b/crypto_sign/dilithium2/m4fstack/pointwise_mont.h
new file mode 120000
index 00000000..32558852
--- /dev/null
+++ b/crypto_sign/dilithium2/m4fstack/pointwise_mont.h
@@ -0,0 +1 @@
+../m4f/pointwise_mont.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium2/m4fstack/pointwise_mont.s b/crypto_sign/dilithium2/m4fstack/pointwise_mont.s
new file mode 120000
index 00000000..3597ffdc
--- /dev/null
+++ b/crypto_sign/dilithium2/m4fstack/pointwise_mont.s
@@ -0,0 +1 @@
+../m4f/pointwise_mont.s
\ No newline at end of file
diff --git a/crypto_sign/dilithium2/m4fstack/poly.c b/crypto_sign/dilithium2/m4fstack/poly.c
new file mode 120000
index 00000000..2544e75b
--- /dev/null
+++ b/crypto_sign/dilithium2/m4fstack/poly.c
@@ -0,0 +1 @@
+../../dilithium2/m4f/poly.c
\ No newline at end of file
diff --git a/crypto_sign/dilithium2/m4fstack/poly.h b/crypto_sign/dilithium2/m4fstack/poly.h
new file mode 120000
index 00000000..7ef70e53
--- /dev/null
+++ b/crypto_sign/dilithium2/m4fstack/poly.h
@@ -0,0 +1 @@
+../../dilithium2/m4f/poly.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium2/m4fstack/polyvec.c b/crypto_sign/dilithium2/m4fstack/polyvec.c
new file mode 120000
index 00000000..569a9a1b
--- /dev/null
+++ b/crypto_sign/dilithium2/m4fstack/polyvec.c
@@ -0,0 +1 @@
+../m4f/polyvec.c
\ No newline at end of file
diff --git a/crypto_sign/dilithium2/m4fstack/polyvec.h b/crypto_sign/dilithium2/m4fstack/polyvec.h
new file mode 120000
index 00000000..d02c99c3
--- /dev/null
+++ b/crypto_sign/dilithium2/m4fstack/polyvec.h
@@ -0,0 +1 @@
+../m4f/polyvec.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium2/m4fstack/reduce.h b/crypto_sign/dilithium2/m4fstack/reduce.h
new file mode 120000
index 00000000..45fbf228
--- /dev/null
+++ b/crypto_sign/dilithium2/m4fstack/reduce.h
@@ -0,0 +1 @@
+../../dilithium3/m4fstack/reduce.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium2/m4fstack/rounding.c b/crypto_sign/dilithium2/m4fstack/rounding.c
new file mode 120000
index 00000000..ec780689
--- /dev/null
+++ b/crypto_sign/dilithium2/m4fstack/rounding.c
@@ -0,0 +1 @@
+../m4f/rounding.c
\ No newline at end of file
diff --git a/crypto_sign/dilithium2/m4fstack/rounding.h b/crypto_sign/dilithium2/m4fstack/rounding.h
new file mode 120000
index 00000000..e64114bc
--- /dev/null
+++ b/crypto_sign/dilithium2/m4fstack/rounding.h
@@ -0,0 +1 @@
+../m4f/rounding.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium2/m4fstack/sign.c b/crypto_sign/dilithium2/m4fstack/sign.c
new file mode 120000
index 00000000..ae3b84fa
--- /dev/null
+++ b/crypto_sign/dilithium2/m4fstack/sign.c
@@ -0,0 +1 @@
+../../dilithium3/m4fstack/sign.c
\ No newline at end of file
diff --git a/crypto_sign/dilithium2/m4fstack/sign.h b/crypto_sign/dilithium2/m4fstack/sign.h
new file mode 120000
index 00000000..551f979a
--- /dev/null
+++ b/crypto_sign/dilithium2/m4fstack/sign.h
@@ -0,0 +1 @@
+../m4f/sign.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium2/m4fstack/smallntt.S b/crypto_sign/dilithium2/m4fstack/smallntt.S
new file mode 120000
index 00000000..7e2174f9
--- /dev/null
+++ b/crypto_sign/dilithium2/m4fstack/smallntt.S
@@ -0,0 +1 @@
+../../dilithium3/m4fstack/smallntt.S
\ No newline at end of file
diff --git a/crypto_sign/dilithium2/m4fstack/smallntt.h b/crypto_sign/dilithium2/m4fstack/smallntt.h
new file mode 120000
index 00000000..cfd626b9
--- /dev/null
+++ b/crypto_sign/dilithium2/m4fstack/smallntt.h
@@ -0,0 +1 @@
+../../dilithium3/m4fstack/smallntt.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium2/m4fstack/smallpoly.c b/crypto_sign/dilithium2/m4fstack/smallpoly.c
new file mode 120000
index 00000000..7dbf4992
--- /dev/null
+++ b/crypto_sign/dilithium2/m4fstack/smallpoly.c
@@ -0,0 +1 @@
+../../dilithium3/m4fstack/smallpoly.c
\ No newline at end of file
diff --git a/crypto_sign/dilithium2/m4fstack/smallpoly.h b/crypto_sign/dilithium2/m4fstack/smallpoly.h
new file mode 120000
index 00000000..366391d9
--- /dev/null
+++ b/crypto_sign/dilithium2/m4fstack/smallpoly.h
@@ -0,0 +1 @@
+../../dilithium3/m4fstack/smallpoly.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium2/m4fstack/stack.c b/crypto_sign/dilithium2/m4fstack/stack.c
new file mode 120000
index 00000000..c89dc5a0
--- /dev/null
+++ b/crypto_sign/dilithium2/m4fstack/stack.c
@@ -0,0 +1 @@
+../../dilithium3/m4fstack/stack.c
\ No newline at end of file
diff --git a/crypto_sign/dilithium2/m4fstack/stack.h b/crypto_sign/dilithium2/m4fstack/stack.h
new file mode 120000
index 00000000..c9aed5d7
--- /dev/null
+++ b/crypto_sign/dilithium2/m4fstack/stack.h
@@ -0,0 +1 @@
+../../dilithium3/m4fstack/stack.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium2/m4fstack/symmetric-shake.c b/crypto_sign/dilithium2/m4fstack/symmetric-shake.c
new file mode 120000
index 00000000..b95855bb
--- /dev/null
+++ b/crypto_sign/dilithium2/m4fstack/symmetric-shake.c
@@ -0,0 +1 @@
+../m4f/symmetric-shake.c
\ No newline at end of file
diff --git a/crypto_sign/dilithium2/m4fstack/symmetric.h b/crypto_sign/dilithium2/m4fstack/symmetric.h
new file mode 120000
index 00000000..e89ae955
--- /dev/null
+++ b/crypto_sign/dilithium2/m4fstack/symmetric.h
@@ -0,0 +1 @@
+../m4f/symmetric.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium2/m4fstack/vector.h b/crypto_sign/dilithium2/m4fstack/vector.h
new file mode 120000
index 00000000..0793594b
--- /dev/null
+++ b/crypto_sign/dilithium2/m4fstack/vector.h
@@ -0,0 +1 @@
+../m4f/vector.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium2/m4fstack/vector.s b/crypto_sign/dilithium2/m4fstack/vector.s
new file mode 120000
index 00000000..1a496055
--- /dev/null
+++ b/crypto_sign/dilithium2/m4fstack/vector.s
@@ -0,0 +1 @@
+../m4f/vector.s
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4fstack/api.h b/crypto_sign/dilithium5/m4fstack/api.h
new file mode 120000
index 00000000..d29362d1
--- /dev/null
+++ b/crypto_sign/dilithium5/m4fstack/api.h
@@ -0,0 +1 @@
+../m4f/api.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4fstack/config.h b/crypto_sign/dilithium5/m4fstack/config.h
new file mode 120000
index 00000000..f3892d90
--- /dev/null
+++ b/crypto_sign/dilithium5/m4fstack/config.h
@@ -0,0 +1 @@
+../m4f/config.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4fstack/macros.i b/crypto_sign/dilithium5/m4fstack/macros.i
new file mode 120000
index 00000000..d615b854
--- /dev/null
+++ b/crypto_sign/dilithium5/m4fstack/macros.i
@@ -0,0 +1 @@
+../m4f/macros.i
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4fstack/ntt.S b/crypto_sign/dilithium5/m4fstack/ntt.S
new file mode 120000
index 00000000..40cd5d40
--- /dev/null
+++ b/crypto_sign/dilithium5/m4fstack/ntt.S
@@ -0,0 +1 @@
+../m4f/ntt.S
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4fstack/ntt.h b/crypto_sign/dilithium5/m4fstack/ntt.h
new file mode 120000
index 00000000..8e99caeb
--- /dev/null
+++ b/crypto_sign/dilithium5/m4fstack/ntt.h
@@ -0,0 +1 @@
+../m4f/ntt.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4fstack/packing.c b/crypto_sign/dilithium5/m4fstack/packing.c
new file mode 120000
index 00000000..1052fe26
--- /dev/null
+++ b/crypto_sign/dilithium5/m4fstack/packing.c
@@ -0,0 +1 @@
+../m4f/packing.c
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4fstack/packing.h b/crypto_sign/dilithium5/m4fstack/packing.h
new file mode 120000
index 00000000..643cc32a
--- /dev/null
+++ b/crypto_sign/dilithium5/m4fstack/packing.h
@@ -0,0 +1 @@
+../m4f/packing.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4fstack/params.h b/crypto_sign/dilithium5/m4fstack/params.h
new file mode 120000
index 00000000..1f91a364
--- /dev/null
+++ b/crypto_sign/dilithium5/m4fstack/params.h
@@ -0,0 +1 @@
+../m4f/params.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4fstack/pointwise_mont.h b/crypto_sign/dilithium5/m4fstack/pointwise_mont.h
new file mode 120000
index 00000000..32558852
--- /dev/null
+++ b/crypto_sign/dilithium5/m4fstack/pointwise_mont.h
@@ -0,0 +1 @@
+../m4f/pointwise_mont.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4fstack/pointwise_mont.s b/crypto_sign/dilithium5/m4fstack/pointwise_mont.s
new file mode 120000
index 00000000..3597ffdc
--- /dev/null
+++ b/crypto_sign/dilithium5/m4fstack/pointwise_mont.s
@@ -0,0 +1 @@
+../m4f/pointwise_mont.s
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4fstack/poly.c b/crypto_sign/dilithium5/m4fstack/poly.c
new file mode 120000
index 00000000..b5bdaa81
--- /dev/null
+++ b/crypto_sign/dilithium5/m4fstack/poly.c
@@ -0,0 +1 @@
+../m4f/poly.c
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4fstack/poly.h b/crypto_sign/dilithium5/m4fstack/poly.h
new file mode 120000
index 00000000..bd94e469
--- /dev/null
+++ b/crypto_sign/dilithium5/m4fstack/poly.h
@@ -0,0 +1 @@
+../m4f/poly.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4fstack/polyvec.c b/crypto_sign/dilithium5/m4fstack/polyvec.c
new file mode 120000
index 00000000..569a9a1b
--- /dev/null
+++ b/crypto_sign/dilithium5/m4fstack/polyvec.c
@@ -0,0 +1 @@
+../m4f/polyvec.c
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4fstack/polyvec.h b/crypto_sign/dilithium5/m4fstack/polyvec.h
new file mode 120000
index 00000000..d02c99c3
--- /dev/null
+++ b/crypto_sign/dilithium5/m4fstack/polyvec.h
@@ -0,0 +1 @@
+../m4f/polyvec.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4fstack/reduce.h b/crypto_sign/dilithium5/m4fstack/reduce.h
new file mode 120000
index 00000000..45fbf228
--- /dev/null
+++ b/crypto_sign/dilithium5/m4fstack/reduce.h
@@ -0,0 +1 @@
+../../dilithium3/m4fstack/reduce.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4fstack/rounding.c b/crypto_sign/dilithium5/m4fstack/rounding.c
new file mode 120000
index 00000000..ec780689
--- /dev/null
+++ b/crypto_sign/dilithium5/m4fstack/rounding.c
@@ -0,0 +1 @@
+../m4f/rounding.c
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4fstack/rounding.h b/crypto_sign/dilithium5/m4fstack/rounding.h
new file mode 120000
index 00000000..e64114bc
--- /dev/null
+++ b/crypto_sign/dilithium5/m4fstack/rounding.h
@@ -0,0 +1 @@
+../m4f/rounding.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4fstack/sign.c b/crypto_sign/dilithium5/m4fstack/sign.c
new file mode 120000
index 00000000..ae3b84fa
--- /dev/null
+++ b/crypto_sign/dilithium5/m4fstack/sign.c
@@ -0,0 +1 @@
+../../dilithium3/m4fstack/sign.c
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4fstack/sign.h b/crypto_sign/dilithium5/m4fstack/sign.h
new file mode 120000
index 00000000..551f979a
--- /dev/null
+++ b/crypto_sign/dilithium5/m4fstack/sign.h
@@ -0,0 +1 @@
+../m4f/sign.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4fstack/smallntt.S b/crypto_sign/dilithium5/m4fstack/smallntt.S
new file mode 120000
index 00000000..7e2174f9
--- /dev/null
+++ b/crypto_sign/dilithium5/m4fstack/smallntt.S
@@ -0,0 +1 @@
+../../dilithium3/m4fstack/smallntt.S
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4fstack/smallntt.h b/crypto_sign/dilithium5/m4fstack/smallntt.h
new file mode 120000
index 00000000..cfd626b9
--- /dev/null
+++ b/crypto_sign/dilithium5/m4fstack/smallntt.h
@@ -0,0 +1 @@
+../../dilithium3/m4fstack/smallntt.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4fstack/smallpoly.c b/crypto_sign/dilithium5/m4fstack/smallpoly.c
new file mode 120000
index 00000000..7dbf4992
--- /dev/null
+++ b/crypto_sign/dilithium5/m4fstack/smallpoly.c
@@ -0,0 +1 @@
+../../dilithium3/m4fstack/smallpoly.c
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4fstack/smallpoly.h b/crypto_sign/dilithium5/m4fstack/smallpoly.h
new file mode 120000
index 00000000..366391d9
--- /dev/null
+++ b/crypto_sign/dilithium5/m4fstack/smallpoly.h
@@ -0,0 +1 @@
+../../dilithium3/m4fstack/smallpoly.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4fstack/stack.c b/crypto_sign/dilithium5/m4fstack/stack.c
new file mode 120000
index 00000000..c89dc5a0
--- /dev/null
+++ b/crypto_sign/dilithium5/m4fstack/stack.c
@@ -0,0 +1 @@
+../../dilithium3/m4fstack/stack.c
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4fstack/stack.h b/crypto_sign/dilithium5/m4fstack/stack.h
new file mode 120000
index 00000000..c9aed5d7
--- /dev/null
+++ b/crypto_sign/dilithium5/m4fstack/stack.h
@@ -0,0 +1 @@
+../../dilithium3/m4fstack/stack.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4fstack/symmetric-shake.c b/crypto_sign/dilithium5/m4fstack/symmetric-shake.c
new file mode 120000
index 00000000..b95855bb
--- /dev/null
+++ b/crypto_sign/dilithium5/m4fstack/symmetric-shake.c
@@ -0,0 +1 @@
+../m4f/symmetric-shake.c
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4fstack/symmetric.h b/crypto_sign/dilithium5/m4fstack/symmetric.h
new file mode 120000
index 00000000..e89ae955
--- /dev/null
+++ b/crypto_sign/dilithium5/m4fstack/symmetric.h
@@ -0,0 +1 @@
+../m4f/symmetric.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4fstack/vector.h b/crypto_sign/dilithium5/m4fstack/vector.h
new file mode 120000
index 00000000..0793594b
--- /dev/null
+++ b/crypto_sign/dilithium5/m4fstack/vector.h
@@ -0,0 +1 @@
+../m4f/vector.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4fstack/vector.s b/crypto_sign/dilithium5/m4fstack/vector.s
new file mode 120000
index 00000000..1a496055
--- /dev/null
+++ b/crypto_sign/dilithium5/m4fstack/vector.s
@@ -0,0 +1 @@
+../m4f/vector.s
\ No newline at end of file

From 59724a7b309ec9acdfae2f0dca65f06541d4ffa3 Mon Sep 17 00:00:00 2001
From: Amin Abdulrahman <amin@abdulrahman.de>
Date: Tue, 19 Mar 2024 16:39:48 +0100
Subject: [PATCH 20/32] Switch to Plantard-based 769 NTT

---
 .../dilithium2/m4fstack/macros_smallntt.i     |    1 +
 .../dilithium3/m4fstack/macros_smallntt.i     |   77 +
 crypto_sign/dilithium3/m4fstack/smallntt.S    | 1341 ++++++++---------
 crypto_sign/dilithium3/m4fstack/smallntt.h    |   55 +-
 .../dilithium5/m4fstack/macros_smallntt.i     |    1 +
 5 files changed, 701 insertions(+), 774 deletions(-)
 create mode 120000 crypto_sign/dilithium2/m4fstack/macros_smallntt.i
 create mode 100644 crypto_sign/dilithium3/m4fstack/macros_smallntt.i
 create mode 120000 crypto_sign/dilithium5/m4fstack/macros_smallntt.i

diff --git a/crypto_sign/dilithium2/m4fstack/macros_smallntt.i b/crypto_sign/dilithium2/m4fstack/macros_smallntt.i
new file mode 120000
index 00000000..fc731f12
--- /dev/null
+++ b/crypto_sign/dilithium2/m4fstack/macros_smallntt.i
@@ -0,0 +1 @@
+../../dilithium3/m4fstack/macros_smallntt.i
\ No newline at end of file
diff --git a/crypto_sign/dilithium3/m4fstack/macros_smallntt.i b/crypto_sign/dilithium3/m4fstack/macros_smallntt.i
new file mode 100644
index 00000000..b97f4d52
--- /dev/null
+++ b/crypto_sign/dilithium3/m4fstack/macros_smallntt.i
@@ -0,0 +1,77 @@
+/* 
+* NTT and inverse NTT code from: 
+* Huang, J. et al. 2024. Revisiting Keccak and Dilithium Implementations on ARMv7-M. 
+* IACR Transactions on Cryptographic Hardware and Embedded Systems. 2024, 2 (Mar. 2024), 1–24.
+* DOI:https://doi.org/10.46586/tches.v2024.i2.1-24.
+* https://github.com/UIC-ESLAS/Dilithium-Multi-Moduli/blob/332a32cc02d407020e48a4f9b3a0dc78d4c8b0bc/M4/crypto_sign/dilithium3/m4plant/smallntt_769.S
+*/
+
+#ifndef MACROS_SMALLNTT_I
+#define MACROS_SMALLNTT_I
+
+// general macros
+.macro load a, a0, a1, a2, a3, mem0, mem1, mem2, mem3 // Load four 32-bit words from base register a at offsets mem0..mem3 into a0..a3.
+  ldr.w \a0, [\a, \mem0] // offset addressing: the base register is not written back
+  ldr.w \a1, [\a, \mem1]
+  ldr.w \a2, [\a, \mem2]
+  ldr.w \a3, [\a, \mem3]
+.endm
+
+.macro store a, a0, a1, a2, a3, mem0, mem1, mem2, mem3 // Store the four 32-bit words a0..a3 to base register a at offsets mem0..mem3.
+  str.w \a0, [\a, \mem0] // offset addressing: the base register is not written back
+  str.w \a1, [\a, \mem1]
+  str.w \a2, [\a, \mem2]
+  str.w \a3, [\a, \mem3]
+.endm
+
+.macro doubleplant a, tmp, q, qa, plantconst // Multiply both packed 16-bit halves of a by the 32-bit constant plantconst with Plantard reduction; the packed result replaces a, tmp is clobbered.
+  smulwb \tmp, \plantconst, \a // tmp = (plantconst * a.lo) >> 16
+  smulwt \a, \plantconst, \a // a = (plantconst * a.hi) >> 16
+  smlabt \tmp, \tmp, \q, \qa // tmp = tmp.lo * q.hi + qa; the reduced low coefficient ends up in tmp.hi
+  smlabt \a, \a, \q, \qa // a = a.lo * q.hi + qa; the reduced high coefficient ends up in a.hi
+  pkhtb \a, \a, \tmp, asr#16 // repack: a = a.hi : tmp.hi
+.endm
+
+.macro doublebarrett a, tmp, tmp2, q, barrettconst // Barrett-style reduction of both packed 16-bit halves of a; clobbers tmp and tmp2.
+  smulbb \tmp, \a, \barrettconst // tmp = a.lo * barrettconst.lo
+  smultb \tmp2, \a, \barrettconst // tmp2 = a.hi * barrettconst.lo
+  asr \tmp, \tmp, #26 // quotient estimate for the low coefficient
+  asr \tmp2, \tmp2, #26 // quotient estimate for the high coefficient
+  smulbb \tmp, \tmp, \q // tmp = tmp.lo * q.lo; NOTE: q is read from the BOTTOM halfword here, while the Plantard macros read q.hi
+  smulbb \tmp2, \tmp2, \q // tmp2 = tmp2.lo * q.lo
+  pkhbt \tmp, \tmp, \tmp2, lsl#16 // repack: tmp = tmp2.lo : tmp.lo
+  usub16 \a, \a, \tmp // per-halfword a = a - quotient * q
+.endm
+
+// Plantard reduction of the 32-bit value in tmp (in place); q is read from the top halfword of its register.
+.macro plant_red q, qa, qinv, tmp
+  mul \tmp, \tmp, \qinv     
+  // tmp = tmp * qinv mod 2^32 (low 32 bits of the product); only its high halfword is consumed below
+  smlatt \tmp, \tmp, \q, \qa
+  // tmp = tmp.hi * q.hi + qa; the reduced result is in the high halfword of tmp
+.endm
+
+.macro mul_twiddle_plant a, twiddle, tmp, q, qa // Multiply both packed 16-bit halves of a by the 32-bit twiddle with Plantard reduction (same instruction sequence as doubleplant); clobbers tmp.
+	smulwb \tmp, \twiddle, \a // tmp = (twiddle * a.lo) >> 16
+	smulwt \a,   \twiddle, \a // a = (twiddle * a.hi) >> 16
+	smlabt \tmp, \tmp, \q, \qa // tmp = tmp.lo * q.hi + qa; reduced low product in tmp.hi
+	smlabt \a, \a, \q, \qa // a = a.lo * q.hi + qa; reduced high product in a.hi
+	pkhtb \a, \a, \tmp, asr#16 // repack: a = a.hi : tmp.hi
+.endm
+
+.macro doublebutterfly_plant a0, a1, twiddle, tmp, q, qa // Butterfly on two packed 16-bit lanes: (a0, a1) becomes (a0 + a1*twiddle, a0 - a1*twiddle), with the product Plantard-reduced; clobbers tmp.
+	smulwb \tmp, \twiddle, \a1 // tmp = (twiddle * a1.lo) >> 16
+	smulwt \a1, \twiddle, \a1 // a1 = (twiddle * a1.hi) >> 16
+	smlabt \tmp, \tmp, \q, \qa // tmp = tmp.lo * q.hi + qa; reduced low product in tmp.hi
+	smlabt \a1, \a1, \q, \qa // a1 = a1.lo * q.hi + qa; reduced high product in a1.hi
+	pkhtb \tmp, \a1, \tmp, asr#16 // tmp = a1.hi : tmp.hi, i.e. both reduced products packed
+	usub16 \a1, \a0, \tmp // difference first, because a0 is overwritten on the next line
+	uadd16 \a0, \a0, \tmp // per-halfword sum, no reduction applied here
+.endm
+
+.macro two_doublebutterfly_plant a0, a1, a2, a3, twiddle0, twiddle1, tmp, q, qa // Two back-to-back butterflies sharing tmp: (a0, a1) with twiddle0, then (a2, a3) with twiddle1.
+	doublebutterfly_plant \a0, \a1, \twiddle0, \tmp, \q, \qa // tmp is free again after this expansion
+	doublebutterfly_plant \a2, \a3, \twiddle1, \tmp, \q, \qa
+.endm
+
+#endif
\ No newline at end of file
diff --git a/crypto_sign/dilithium3/m4fstack/smallntt.S b/crypto_sign/dilithium3/m4fstack/smallntt.S
index a9a4a576..9f048042 100644
--- a/crypto_sign/dilithium3/m4fstack/smallntt.S
+++ b/crypto_sign/dilithium3/m4fstack/smallntt.S
@@ -1,283 +1,247 @@
+/* 
+* NTT and inverse NTT code from: 
+* Huang, J. et al. 2024. Revisiting Keccak and Dilithium Implementations on ARMv7-M. 
+* IACR Transactions on Cryptographic Hardware and Embedded Systems. 2024, 2 (Mar. 2024), 1–24.
+* DOI:https://doi.org/10.46586/tches.v2024.i2.1-24.
+* https://github.com/UIC-ESLAS/Dilithium-Multi-Moduli/blob/332a32cc02d407020e48a4f9b3a0dc78d4c8b0bc/M4/crypto_sign/dilithium3/m4plant/smallntt_769.S
+*/
+
 #include "macros.i"
 
 .syntax unified
 .cpu cortex-m4
 .thumb
 
-// general macros
-.macro load a, a0, a1, a2, a3, mem0, mem1, mem2, mem3
-  ldr.w \a0, [\a, \mem0]
-  ldr.w \a1, [\a, \mem1]
-  ldr.w \a2, [\a, \mem2]
-  ldr.w \a3, [\a, \mem3]
-.endm
-
-.macro store a, a0, a1, a2, a3, mem0, mem1, mem2, mem3
-  str.w \a0, [\a, \mem0]
-  str.w \a1, [\a, \mem1]
-  str.w \a2, [\a, \mem2]
-  str.w \a3, [\a, \mem3]
-.endm
-
-.macro montgomery q, qinv, a, tmp
-  smulbt \tmp, \a, \qinv
-  smlabb \tmp, \q, \tmp, \a
-.endm
-
-.macro montgomery_inplace q, qinv, a, tmp
-  smulbt \tmp, \a, \qinv
-  smlabb \a, \q, \tmp, \a
-.endm
-
-.macro doublemontgomery a, tmp, tmp2, q, qinv, montconst
-  smulbb \tmp2, \a, \montconst
-  montgomery \q, \qinv, \tmp2, \tmp
-  smultb \a, \a, \montconst
-  montgomery \q, \qinv, \a, \tmp2
-  pkhtb \a, \tmp2, \tmp, asr#16
-.endm
-
+#include "macros_smallntt.i"
 // #######
 // #######
 // # NTT #
 // #######
 // #######
 
-.macro mul_twiddle tb, a, twiddle, tmp, tmp2, q, qinv
-    smulb\tb \tmp, \a, \twiddle
-    smult\tb \a, \a, \twiddle
-    montgomery \q, \qinv, \tmp, \tmp2 // reduce -> result in tmp2
-    montgomery \q, \qinv, \a, \tmp // reduce -> result in tmp2
-    pkhtb \a, \tmp, \tmp2, asr#16 // combine results from above in one register as 16bit halves
-.endm
+.macro _3_layer_double_CT_16_plant c0, c1, c2, c3, c4, c5, c6, c7, twiddle1, twiddle2, twiddle_ptr, q, qa, tmp
+	// layer 3
+	ldr.w \twiddle1, [\twiddle_ptr], #4
+	two_doublebutterfly_plant \c0, \c4, \c1, \c5, \twiddle1, \twiddle1, \tmp, \q, \qa
+	two_doublebutterfly_plant \c2, \c6, \c3, \c7, \twiddle1, \twiddle1, \tmp, \q, \qa
 
-.macro doublebutterfly tb, a0, a1, twiddle, tmp, tmp2, q, qinv
-  smulb\tb \tmp, \a1, \twiddle // a1_b * twiddle_tb
-  smult\tb \a1, \a1, \twiddle // a1_t * twiddle_tb
-  montgomery \q, \qinv, \tmp, \tmp2 // reduce -> result in tmp2
-  montgomery \q, \qinv, \a1, \tmp // reduce -> result in tmp
-  pkhtb \tmp2, \tmp, \tmp2, asr#16 // combine results from above in one register as 16bit halves
-  usub16 \a1, \a0, \tmp2 // a0 - a1 * twiddle (a0, a1 contain 2 coeffs)
-  uadd16 \a0, \a0, \tmp2 // a0 + a1 * twiddle (a0, a1 contain 2 coeffs)
-.endm
-
-.macro two_doublebutterfly tb1, tb2, a0, a1, a2, a3, twiddle, tmp, tmp2, q, qinv
-  doublebutterfly \tb1, \a0, \a1, \twiddle, \tmp, \tmp2, \q, \qinv
-  doublebutterfly \tb2, \a2, \a3, \twiddle, \tmp, \tmp2, \q, \qinv
-.endm
+	// layer 2
+	ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8
+	two_doublebutterfly_plant \c0, \c2, \c1, \c3, \twiddle1, \twiddle1, \tmp, \q, \qa
 
-.macro _3_layer_double_CT_16 c0, c1, c2, c3, c4, c5, c6, c7, twiddle, twiddle_ptr, Qprime, Q, tmp, tmp2
-    // layer 3
-    ldrh.w \twiddle, [\twiddle_ptr], #2
-    two_doublebutterfly b, b, \c0, \c4, \c1, \c5, \twiddle, \tmp, \tmp2, \Q, \Qprime
-    two_doublebutterfly b, b, \c2, \c6, \c3, \c7, \twiddle, \tmp, \tmp2, \Q, \Qprime
+	two_doublebutterfly_plant \c4, \c6, \c5, \c7, \twiddle2, \twiddle2, \tmp, \q, \qa
 
-    // layer 2
-    ldr.w \twiddle, [\twiddle_ptr], #4
-    two_doublebutterfly b, b, \c0, \c2, \c1, \c3, \twiddle, \tmp, \tmp2, \Q, \Qprime
+	// layer 1
+	ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8
+	two_doublebutterfly_plant \c0, \c1, \c2, \c3, \twiddle1, \twiddle2, \tmp, \q, \qa
 
-    two_doublebutterfly t, t, \c4, \c6, \c5, \c7, \twiddle, \tmp, \tmp2, \Q, \Qprime
-
-    // layer 1
-    ldr.w \twiddle, [\twiddle_ptr], #4
-    two_doublebutterfly b, t, \c0, \c1, \c2, \c3, \twiddle, \tmp, \tmp2, \Q, \Qprime
-
-    ldr.w \twiddle, [\twiddle_ptr], #4
-    two_doublebutterfly b, t, \c4, \c5, \c6, \c7, \twiddle, \tmp, \tmp2, \Q, \Qprime
+	ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8
+	two_doublebutterfly_plant \c4, \c5, \c6, \c7, \twiddle1, \twiddle2, \tmp, \q, \qa
 .endm
 
-.macro _3_layer_double_CT_16_fp c0, c1, c2, c3, c4, c5, c6, c7, xi01, xi23, xi45, xi67, twiddle, Qprime, Q, tmp, tmp2
-    // layer 3
-    vmov \twiddle, \xi01
-    two_doublebutterfly t, t, \c0, \c4, \c1, \c5, \twiddle, \tmp, \tmp2, \Q, \Qprime
-    two_doublebutterfly t, t, \c2, \c6, \c3, \c7, \twiddle, \tmp, \tmp2, \Q, \Qprime
+.macro _3_layer_double_CT_16_plant_fp c0, c1, c2, c3, c4, c5, c6, c7, xi0, xi1, xi2, xi3, xi4, xi5, xi6, twiddle1, twiddle2, q, qa, tmp
+	// layer 3
+	vmov \twiddle1, \xi0
+	two_doublebutterfly_plant \c0, \c4, \c1, \c5, \twiddle1, \twiddle1, \tmp, \q, \qa
+	two_doublebutterfly_plant \c2, \c6, \c3, \c7, \twiddle1, \twiddle1, \tmp, \q, \qa
 
-    // layer 2
-    vmov \twiddle, \xi23
-    two_doublebutterfly b, b, \c0, \c2, \c1, \c3, \twiddle, \tmp, \tmp2, \Q, \Qprime
+	// layer 2
+	vmov \twiddle1, \xi1
+	vmov \twiddle2, \xi2
+	two_doublebutterfly_plant \c0, \c2, \c1, \c3, \twiddle1, \twiddle1, \tmp, \q, \qa
 
-    two_doublebutterfly t, t, \c4, \c6, \c5, \c7, \twiddle, \tmp, \tmp2, \Q, \Qprime
+	two_doublebutterfly_plant \c4, \c6, \c5, \c7, \twiddle2, \twiddle2, \tmp, \q, \qa
 
-    // layer 1
-    vmov \twiddle, \xi45
-    two_doublebutterfly b, t, \c0, \c1, \c2, \c3, \twiddle, \tmp, \tmp2, \Q, \Qprime
+	// layer 1
+	vmov \twiddle1, \xi3
+	vmov \twiddle2, \xi4
+	two_doublebutterfly_plant \c0, \c1, \c2, \c3, \twiddle1, \twiddle2, \tmp, \q, \qa
 
-    vmov \twiddle, \xi67
-    two_doublebutterfly b, t, \c4, \c5, \c6, \c7, \twiddle, \tmp, \tmp2, \Q, \Qprime
+	vmov \twiddle1, \xi5
+	vmov \twiddle2, \xi6
+	two_doublebutterfly_plant \c4, \c5, \c6, \c7, \twiddle1, \twiddle2, \tmp, \q, \qa
 .endm
 
-.global small_ntt_asm
-.type small_ntt_asm, %function
+.global small_ntt_asm_769
+.type small_ntt_asm_769, %function
 .align 2
-small_ntt_asm:
-  push {r4-r11, r14}
-
-  poly        .req r0
-  twiddle_ptr .req r1
-  poly0       .req r2
-  poly1       .req r3
-  poly2       .req r4
-  poly3       .req r5
-  poly4       .req r6
-  poly5       .req r7
-  poly6       .req r8
-  poly7       .req r9
-  twiddle     .req r10
-  qinv        .req r11
-  q           .req r11
-  tmp         .req r12
-  tmp2        .req r14
-
-  movw q, #769
-  movt qinv, #767
-
-  ### LAYER 7+6+5+4
-  .equ distance, 256
-  .equ offset, 32
-  .equ strincr, 4
-  // pre-load twiddle factors to FPU registers
-  vldm twiddle_ptr!, {s20-s27}
-
-
-  add tmp, poly, #strincr*8
-  vmov s12, tmp
-  1:
-    // load a1, a3, ..., a15
-    load poly, poly0, poly1, poly2, poly3, #offset, #distance/4+offset, #2*distance/4+offset, #3*distance/4+offset
-    load poly, poly4, poly5, poly6, poly7, #distance+offset, #5*distance/4+offset, #6*distance/4+offset, #7*distance/4+offset
-
-    // 8-NTT on a1, a3, ..., a15
-    _3_layer_double_CT_16_fp poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s20, s21, s22, s23, twiddle, qinv, q, tmp, tmp2
-
-    // multiply coeffs by layer 4 twiddles for later use
-    vmov twiddle, s24
-    mul_twiddle b, poly0, twiddle, tmp, tmp2, q, qinv
-    mul_twiddle t, poly1, twiddle, tmp, tmp2, q, qinv
-
-    vmov twiddle, s25
-    mul_twiddle b, poly2, twiddle, tmp, tmp2, q, qinv
-    mul_twiddle t, poly3, twiddle, tmp, tmp2, q, qinv
-
-    vmov twiddle, s26
-    mul_twiddle b, poly4, twiddle, tmp, tmp2, q, qinv
-    mul_twiddle t, poly5, twiddle, tmp, tmp2, q, qinv
-
-    vmov twiddle, s27
-    mul_twiddle b, poly6, twiddle, tmp, tmp2, q, qinv
-    mul_twiddle t, poly7, twiddle, tmp, tmp2, q, qinv
-
-    vmov s0, poly0 // a1
-    vmov s1, poly1 // a3
-    vmov s2, poly2 // a5
-    vmov s3, poly3 // a7
-    vmov s4, poly4 // a9
-    vmov s5, poly5 // a11
-    vmov s6, poly6 // a13
-    vmov s7, poly7 // a15
-
-    // ----------
-
-    // load a0, a2, ..., a14
-    load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4
-    load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
-
-    // 8-NTT on a0, a2, ..., a14
-    _3_layer_double_CT_16_fp poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s20, s21, s22, s23, twiddle, qinv, q, tmp, tmp2
-
-    // layer 4 - 1
-    // addsub: (a2, a6, a10, a14), (a3, a7, a11, a15)
-    vmov tmp2, s1 // load a3
-    vmov s1, poly0 // preserve a0
-    uadd16 poly0, poly1, tmp2
-    usub16 poly1, poly1, tmp2
-
-    vmov tmp2, s3 // load a7
-    vmov s3, poly2 // preserve a4
-    uadd16 poly2, poly3, tmp2
-    usub16 poly3, poly3, tmp2
-
-    vmov tmp2, s5 // load a11
-    vmov s5, poly4 // preserve a8
-    uadd16 poly4, poly5, tmp2
-    usub16 poly5, poly5, tmp2
-
-    vmov tmp2, s7 // load a15
-    vmov s7, poly6 // preserve a12
-    uadd16 poly6, poly7, tmp2
-    usub16 poly7, poly7, tmp2
-
-    str.w poly0, [poly, #1*distance/4]
-    str.w poly1, [poly, #1*distance/4+offset]
-    str.w poly2, [poly, #3*distance/4]
-    str.w poly3, [poly, #3*distance/4+offset]
-    str.w poly4, [poly, #5*distance/4]
-    str.w poly5, [poly, #5*distance/4+offset]
-    str.w poly6, [poly, #7*distance/4]
-    str.w poly7, [poly, #7*distance/4+offset]
-
-    // layer 4 - 2
-    // addsub: (a0, a4, a8, a12), (a1, a5, a9, a13)
-    vmov tmp2, s1 // load a0
-    vmov poly1, s0 // load a1
-    uadd16 poly0, tmp2, poly1
-    usub16 poly1, tmp2, poly1
-
-    vmov tmp2, s3 // load a4
-    vmov poly3, s2 // load a5
-    uadd16 poly2, tmp2, poly3
-    usub16 poly3, tmp2, poly3
-
-    vmov tmp2, s5 // load a8
-    vmov poly5, s4 // load a9
-    uadd16 poly4, tmp2, poly5
-    usub16 poly5, tmp2, poly5
-
-    vmov tmp2, s7 // load a12
-    vmov poly7, s6 // load a13
-    uadd16 poly6, tmp2, poly7
-    usub16 poly7, tmp2, poly7
-
-    str.w poly1, [poly, #offset]
-    str.w poly2, [poly, #2*distance/4]
-    str.w poly3, [poly, #2*distance/4+offset]
-    str.w poly4, [poly, #4*distance/4]
-    str.w poly5, [poly, #4*distance/4+offset]
-    str.w poly6, [poly, #6*distance/4]
-    str.w poly7, [poly, #6*distance/4+offset]
-    str.w poly0, [poly], #4
-
-    vmov tmp, s12
-    cmp.w poly, tmp
-  bne.w 1b
-
-  sub.w poly, #8*strincr
-
-  ### LAYER 3+2+1
-
-  .equ distance, distance/16
-  .equ strincr, 32
-
-  add.w tmp, poly, #strincr*16
-  vmov s13, tmp
-
-  2:
-    load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4
-    load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
-
-    _3_layer_double_CT_16 poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle, twiddle_ptr, qinv, q, tmp, tmp2
-
-    store poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
-    str.w poly1, [poly, #distance/4]
-    str.w poly2, [poly, #2*distance/4]
-    str.w poly3, [poly, #3*distance/4]
-    str.w poly0, [poly], #strincr
-
-    vmov tmp, s13
-    cmp.w poly, tmp
-  bne.w 2b
-
-  pop {r4-r11, pc}
-
+small_ntt_asm_769:
+	push {r4-r11, r14}
+	vpush.w {s16-s24}
+	poly         .req r0
+	twiddle_ptr  .req r1
+	poly0        .req r2
+	poly1        .req r3
+	poly2        .req r4
+	poly3        .req r5
+	poly4        .req r6
+	poly5        .req r7
+	poly6        .req r8
+	poly7        .req r9
+	twiddle1     .req r10
+	twiddle2     .req r11
+	###  qinv        .req r11 ### q^-1 mod 2^2n; n=16
+	q           .req r12 
+	### at the top of r12
+	qa          .req r0
+	### qa = 2^a * q with a = 5 (24608 = 32 * 769); held in r0, which aliases poly, so poly is parked in s23 while qa is live
+	tmp         .req r14
+
+	// movw qa, #24608
+	// Why movt? q is only ever read from the top halfword (smlabt in the Plantard macros); qa was originally kept in the bottom half of the same register but now lives in r0.
+	movt q, #769
+
+	### LAYER 7+6+5+4
+	.equ distance, 256
+	.equ offset, 32
+	.equ strincr, 4
+	// pre-load 15 twiddle factors to 15 FPU registers
+	// s0-s7 temporarily hold 16 packed 16-bit coefficients (two per register).
+	vldm twiddle_ptr!, {s8-s22}
+ 
+	add tmp, poly, #strincr*8
+	// s23: poly addr
+	// s24: loop end address (poly + 8*strincr)
+	vmov s24, tmp  
+	1:
+		// load a1, a3, ..., a15
+		vmov s23, poly
+		load poly, poly0, poly1, poly2, poly3, #offset, #distance/4+offset, #2*distance/4+offset, #3*distance/4+offset
+		load poly, poly4, poly5, poly6, poly7, #distance+offset, #5*distance/4+offset, #6*distance/4+offset, #7*distance/4+offset
+		
+		movw qa, #24608
+
+		// 8-NTT on a1, a3, ..., a15
+		_3_layer_double_CT_16_plant_fp poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s8, s9, s10, s11, s12, s13, s14, twiddle1, twiddle2, q, qa, tmp
+
+		// s15, s16, s17, s18, s19, s20, s21, s22 left
+		// multiply coeffs by layer 4 twiddles for later use
+		vmov twiddle1, s15 
+		vmov twiddle2, s16 
+		mul_twiddle_plant poly0, twiddle1, tmp, q, qa
+		mul_twiddle_plant poly1, twiddle2, tmp, q, qa
+
+		vmov twiddle1, s17 
+		vmov twiddle2, s18 
+		mul_twiddle_plant poly2, twiddle1, tmp, q, qa
+		mul_twiddle_plant poly3, twiddle2, tmp, q, qa
+
+		vmov twiddle1, s19 
+		vmov twiddle2, s20 
+		mul_twiddle_plant poly4, twiddle1, tmp, q, qa
+		mul_twiddle_plant poly5, twiddle2, tmp, q, qa
+
+		vmov twiddle1, s21 
+		vmov twiddle2, s22 
+		mul_twiddle_plant poly6, twiddle1, tmp, q, qa
+		mul_twiddle_plant poly7, twiddle2, tmp, q, qa
+
+		vmov s0, poly0 // a1
+		vmov s1, poly1 // a3
+		vmov s2, poly2 // a5
+		vmov s3, poly3 // a7
+		vmov s4, poly4 // a9
+		vmov s5, poly5 // a11
+		vmov s6, poly6 // a13
+		vmov s7, poly7 // a15
+
+		vmov poly, s23
+	
+		// load a0, a2, ..., a14
+		load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4
+		load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
+		
+		movw qa, #24608
+		// 8-NTT on a0, a2, ..., a14
+		_3_layer_double_CT_16_plant_fp poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s8, s9, s10, s11, s12, s13, s14, twiddle1, twiddle2, q, qa, tmp
+
+		
+		// layer 4 - 1
+		// addsub: (a2, a6, a10, a14), (a3, a7, a11, a15)
+		vmov poly, s23
+		vmov twiddle1, s1 // load a3
+		uadd16 tmp, poly1, twiddle1
+		usub16 poly1, poly1, twiddle1
+		str.w tmp, [poly, #1*distance/4]
+		str.w poly1, [poly, #1*distance/4+offset]
+
+		vmov twiddle1, s3 // load a7
+		uadd16 tmp, poly3, twiddle1
+		usub16 poly3, poly3, twiddle1
+		str.w tmp, [poly, #3*distance/4]
+		str.w poly3, [poly, #3*distance/4+offset]
+		
+		vmov twiddle1, s5 // load a11
+		uadd16 tmp, poly5, twiddle1
+		usub16 poly5, poly5, twiddle1
+		str.w tmp, [poly, #5*distance/4]
+		str.w poly5, [poly, #5*distance/4+offset]
+		
+		vmov twiddle1, s7 // load a15
+		uadd16 tmp, poly7, twiddle1
+		usub16 poly7, poly7, twiddle1
+		str.w tmp, [poly, #7*distance/4]
+		str.w poly7, [poly, #7*distance/4+offset]
+		
+		// layer 4 - 2    
+		// addsub: (a0, a4, a8, a12), (a1, a5, a9, a13)
+		vmov poly3, s2 // load a5
+		uadd16 tmp, poly2, poly3
+		usub16 twiddle1, poly2, poly3
+		str.w tmp, [poly, #2*distance/4]
+		str.w twiddle1, [poly, #2*distance/4+offset]
+
+		vmov poly5, s4 // load a9
+		uadd16 tmp, poly4, poly5
+		usub16 twiddle1, poly4, poly5
+		str.w tmp, [poly, #4*distance/4]
+		str.w twiddle1, [poly, #4*distance/4+offset]
+
+		vmov poly7, s6 // load a13
+		uadd16 tmp, poly6, poly7
+		usub16 twiddle1, poly6, poly7
+		str.w tmp, [poly, #6*distance/4]
+		str.w twiddle1, [poly, #6*distance/4+offset]
+		
+		vmov poly1, s0 // load a1
+		uadd16 tmp, poly0, poly1
+		usub16 twiddle1, poly0, poly1
+		str.w twiddle1, [poly, #offset]
+		str.w tmp, [poly], #4
+
+	vmov tmp, s24
+	cmp.w poly, tmp
+	bne.w 1b
+
+	sub.w poly, #8*strincr
+
+	### LAYER 3+2+1
+
+	.equ distance, distance/16
+	.equ strincr, 32
+
+	add.w tmp, poly, #strincr*16
+	vmov s13, tmp
+	2:
+		vmov s23, poly
+		load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4
+		load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
+		
+		movw qa, #24608
+		_3_layer_double_CT_16_plant poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle1, twiddle2, twiddle_ptr, q, qa, tmp
+		
+		vmov poly, s23
+		store poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
+		str.w poly1, [poly, #distance/4]
+		str.w poly2, [poly, #2*distance/4]
+		str.w poly3, [poly, #3*distance/4]
+		str.w poly0, [poly], #strincr
+
+	vmov tmp, s13
+	cmp.w poly, tmp
+	bne.w 2b
+	vpop.w {s16-s24}
+	pop {r4-r11, pc}
 
 .unreq poly
 .unreq twiddle_ptr
@@ -289,11 +253,12 @@ small_ntt_asm:
 .unreq poly5
 .unreq poly6
 .unreq poly7
-.unreq twiddle
-.unreq qinv
+.unreq twiddle1
+.unreq twiddle2
 .unreq q
+.unreq qa
 .unreq tmp
-.unreq tmp2
+
 
 // ########
 // ########
@@ -301,428 +266,296 @@ small_ntt_asm:
 // ########
 // ########
 
-.macro doublebutterfly_light a0, a1, tmp, tmp2, q, qinv
-  uadd16 \tmp, \a0, \a1
-  usub16 \a1, \a0, \a1
-  mov.w \a0, \tmp
-.endm
-
-.macro two_doublebutterfly_light a0, a1, a2, a3, tmp, tmp2, q, qinv
-  doublebutterfly_light \a0, \a1, \tmp, \tmp2, \q, \qinv
-  doublebutterfly_light \a2, \a3, \tmp, \tmp2, \q, \qinv
-.endm
-
-.macro _3_layer_double_inv_CT_16_light c0, c1, c2, c3, c4, c5, c6, c7, xi0, xi12, xi34, xi56, twiddle, q, qinv, tmp, tmp2
-
-  // layer 1
-  sadd16.w \tmp, \c0, \c1 // c0, c1
-  ssub16.w \c1, \c0, \c1
-  sadd16.w \tmp2, \c2, \c3 // c2, c3
-  ssub16.w \c3, \c2, \c3
-
-  sadd16.w \c0, \c4, \c5 // c4, c5
-  ssub16.w \c5, \c4, \c5
-  sadd16.w \c2, \c6, \c7 // c6, c7
-  ssub16.w \c7, \c6, \c7
-  // c4, c6 are free at this point
-
-  // layer 2
-  sadd16.w \c6, \tmp, \tmp2 // c0, c2
-  ssub16.w \tmp2, \tmp, \tmp2
-  sadd16.w \c4, \c0, \c2 // c4, c6
-  ssub16.w \c2, \c0, \c2
-
-  vmov.w \twiddle, \xi12
-  doublebutterfly t, \c1, \c3, \twiddle, \tmp, \c0, \q, \qinv // c0 has been used and c6 still free
-  doublebutterfly t, \c5, \c7, \twiddle, \tmp, \c0, \q, \qinv
-  // c0, c6 are free at this point
-
-  // layer 3
-  sadd16.w \c0, \c6, \c4 // c0, c4
-  ssub16.w \c4, \c6, \c4
-
-  vmov.w \twiddle, \xi34
-  doublebutterfly t, \c1, \c5, \twiddle, \tmp, \c6, \q, \qinv
-
-  vmov.w \twiddle, \xi56
-  // this block is one doublebutterfly
-  smulbb \tmp, \c2, \twiddle // c2, c6
-  smultb \c2, \c2, \twiddle
-  montgomery_inplace \q, \qinv, \tmp, \c6
-  montgomery_inplace \q, \qinv, \c2, \c6
-  pkhtb \tmp, \c2, \tmp, asr #16
-  ssub16.w \c6, \tmp2, \tmp
-  sadd16.w \c2, \tmp2, \tmp
-
-  doublebutterfly t, \c3, \c7, \twiddle, \tmp, \tmp2, \q, \qinv
-
+// Three inverse-NTT layers on eight registers of packed halfwords; twiddles xi2/xi4/xi5/xi6 arrive in FPU registers. Input bound: 0.5q/1q
+.macro _3_layer_double_inv_CT_16_plant_light c0, c1, c2, c3, c4, c5, c6, c7, xi2, xi4, xi5, xi6, twiddle1, tmp2, q, qa, tmp
+
+	// layer 1: plain halfword add/sub on adjacent register pairs, no twiddle multiplication
+	sadd16.w \tmp, \c0, \c1 // c0, c1
+	ssub16.w \c1, \c0, \c1
+	sadd16.w \tmp2, \c2, \c3 // c2, c3
+	ssub16.w \c3, \c2, \c3
+	// tmp, c1, tmp2, c3: 1q maximum
+	sadd16.w \c0, \c4, \c5 // c4, c5
+	ssub16.w \c5, \c4, \c5
+	sadd16.w \c2, \c6, \c7 // c6, c7
+	ssub16.w \c7, \c6, \c7
+	// c4, c6 are free at this point (their sums now live in c0 and c2)
+	// c0, c5, c2, c7: 1q maximum
+
+	// layer 2: the layer-1 sums are combined by plain add/sub; the layer-1 differences use twiddle xi2
+	sadd16.w \c6, \tmp, \tmp2 // c0, c2
+	ssub16.w \tmp2, \tmp, \tmp2
+	sadd16.w \c4, \c0, \c2 // c4, c6
+	ssub16.w \c2, \c0, \c2
+	// c6, tmp2, c4, c2: 2q maximum
+
+	vmov.w \twiddle1, \xi2
+	doublebutterfly_plant \c1, \c3, \twiddle1, \tmp, \q, \qa
+	doublebutterfly_plant \c5, \c7, \twiddle1, \tmp, \q, \qa 
+	// c1, c3, c7, c5: 1.5q maximum
+
+	// tmp and c0 are free at this point
+	// layer 3: first pair is plain add/sub; the other three pairs use xi4, xi5 and xi6
+	sadd16.w \c0, \c6, \c4 // c0, c4
+	ssub16.w \c4, \c6, \c4
+	// c0, c4: 4q
+	// c6 is free at this point
+	vmov.w \twiddle1, \xi4
+	doublebutterfly_plant \c1, \c5, \twiddle1, \tmp, \q, \qa
+	// c1, c5: 2q maximum
+
+	vmov.w \twiddle1, \xi5
+	// this block is one doublebutterfly, written out inline: twiddle-multiply c2, then add/sub against tmp2 into c2 and c6
+	smulwb \tmp, \twiddle1, \c2  // c2, c6
+	smulwt \c2,  \twiddle1, \c2
+	smlabt \tmp, \tmp, \q, \qa // bottom halfword of the product times top halfword of q, plus qa
+	smlabt \c2, \c2, \q, \qa
+	pkhtb \tmp, \c2, \tmp, asr#16 // repack the two top halfwords into one register
+	ssub16.w \c6, \tmp2, \tmp 
+	sadd16.w \c2, \tmp2, \tmp
+	// c6, c2: 4.5q
+	vmov.w \twiddle1, \xi6
+	doublebutterfly_plant \c3, \c7, \twiddle1, \tmp, \q, \qa
+	// c3, c7: 2.5q maximum
 .endm
+.macro _3_layer_double_inv_CT_16_plant c0, c1, c2, c3, c4, c5, c6, c7, twiddle1, twiddle2, twiddle_ptr, q, qa, tmp
+	// layer 3: one 32-bit twiddle word is loaded and passed as both twiddles to all four butterflies (pairs c0/c1, c2/c3, c4/c5, c6/c7)
+	ldr.w \twiddle1, [\twiddle_ptr], #4
+	two_doublebutterfly_plant \c0, \c1, \c2, \c3, \twiddle1, \twiddle1, \tmp, \q, \qa
+	two_doublebutterfly_plant \c4, \c5, \c6, \c7, \twiddle1, \twiddle1, \tmp, \q, \qa
 
-.macro _3_layer_double_inv_CT_16_light_reduce c0, c1, c2, c3, c4, c5, c6, c7, xi0, xi12, xi34, xi56, twiddle, q, qinv, tmp, tmp2
-
-  // layer 1
-  sadd16.w \tmp, \c0, \c1 // c0, c1
-  ssub16.w \c1, \c0, \c1
-  sadd16.w \tmp2, \c2, \c3 // c2, c3
-  ssub16.w \c3, \c2, \c3
-
-  sadd16.w \c0, \c4, \c5 // c4, c5
-  ssub16.w \c5, \c4, \c5
-  sadd16.w \c2, \c6, \c7 // c6, c7
-  ssub16.w \c7, \c6, \c7
-  // c4, c6 are free at this point
-
-  mov.w \c6, \tmp
-  mov.w \c4, \c0
-
-  // layer 2
-  vmov.w \twiddle, \xi12
-  doublebutterfly b, \c6, \tmp2, \twiddle, \tmp, \c0, \q, \qinv
-  doublebutterfly b, \c4, \c2, \twiddle, \tmp, \c0, \q, \qinv
-  doublebutterfly t, \c1, \c3, \twiddle, \tmp, \c0, \q, \qinv // c0 has been used and c6 still free
-  doublebutterfly t, \c5, \c7, \twiddle, \tmp, \c0, \q, \qinv
-  // c0, c6 are free at this point
-
-  // layer 3
-  sadd16.w \c0, \c6, \c4 // c0, c4
-  ssub16.w \c4, \c6, \c4
-
-  vmov.w \twiddle, \xi34
-  doublebutterfly t, \c1, \c5, \twiddle, \tmp, \c6, \q, \qinv
-
-  vmov.w \twiddle, \xi56
-  // this block is one doublebutterfly
-  smulbb \tmp, \c2, \twiddle // c2, c6
-  smultb \c2, \c2, \twiddle
-  montgomery_inplace \q, \qinv, \tmp, \c6
-  montgomery_inplace \q, \qinv, \c2, \c6
-  pkhtb \tmp, \c2, \tmp, asr #16
-  ssub16.w \c6, \tmp2, \tmp
-  sadd16.w \c2, \tmp2, \tmp
-
-  doublebutterfly t, \c3, \c7, \twiddle, \tmp, \tmp2, \q, \qinv
-
-.endm
+	// layer 2: one pair of twiddle words is loaded once and reused for both register quadruples (pairs c0/c2, c1/c3, then c4/c6, c5/c7)
+	ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8
+	two_doublebutterfly_plant \c0, \c2, \c1, \c3, \twiddle1, \twiddle2, \tmp, \q, \qa
 
-.macro _3_layer_double_inv_CT_16 c0, c1, c2, c3, c4, c5, c6, c7, twiddle, twiddle_ptr, Qprime, Q, tmp, tmp2
-    // layer 3
-    ldrh.w twiddle, [twiddle_ptr], #2
-    two_doublebutterfly b, b, \c0, \c1, \c2, \c3, \twiddle, \tmp, \tmp2, \Q, \Qprime
-    two_doublebutterfly b, b, \c4, \c5, \c6, \c7, \twiddle, \tmp, \tmp2, \Q, \Qprime
+	two_doublebutterfly_plant \c4, \c6, \c5, \c7, \twiddle1, \twiddle2, \tmp, \q, \qa
 
-    // layer 2
-    ldr.w twiddle, [twiddle_ptr], #4
-    two_doublebutterfly b, t, \c0, \c2, \c1, \c3, \twiddle, \tmp, \tmp2, \Q, \Qprime
+	// layer 1: a fresh pair of twiddle words is loaded for each two_doublebutterfly (pairs c0/c4, c1/c5, then c2/c6, c3/c7)
+	ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8
+	two_doublebutterfly_plant \c0, \c4, \c1, \c5, \twiddle1, \twiddle2, \tmp, \q, \qa
 
-    two_doublebutterfly b, t, \c4, \c6, \c5, \c7, \twiddle, \tmp, \tmp2, \Q, \Qprime
-
-    // layer 1
-    ldr.w twiddle, [twiddle_ptr], #4
-    two_doublebutterfly b, t, \c0, \c4, \c1, \c5, \twiddle, \tmp, \tmp2, \Q, \Qprime
-
-    ldr.w twiddle, [twiddle_ptr], #4
-    two_doublebutterfly b, t, \c2, \c6, \c3, \c7, \twiddle, \tmp, \tmp2, \Q, \Qprime
+	ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8
+	two_doublebutterfly_plant \c2, \c6, \c3, \c7, \twiddle1, \twiddle2, \tmp, \q, \qa
 .endm
 
-.macro mul_twiddle_barrett_32 tb a, twiddle, Qbar, Q, tmp, tmp2
-    smulb\tb \tmp, \a, \twiddle
-    smmulr.w \tmp2, \tmp, \Qbar
-    mls.w \tmp, \tmp2, \Q, \tmp
-    smult\tb \a, \a, \twiddle
-    smmulr.w \tmp2, \a, \Qbar
-    mls.w \a, \tmp2, \Q, \a
-    pkhbt \a, \tmp, \a, lsl #16
+.macro _3_layer_double_inv_twist_16_plant c0, c1, c2, c3, c4, c5, c6, c7, twiddle1, twiddle2, twiddle_ptr, q, qa, tmp
+	ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 // twisting step: each ldrd fetches two 32-bit twiddle words and advances twiddle_ptr by 8
+	mul_twiddle_plant \c0, \twiddle1, \tmp, \q, \qa // every register c0..c7 is multiplied in place by its own twiddle word
+	mul_twiddle_plant \c1, \twiddle2, \tmp, \q, \qa
+	ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8
+	mul_twiddle_plant \c2, \twiddle1, \tmp, \q, \qa
+	mul_twiddle_plant \c3, \twiddle2, \tmp, \q, \qa
+	ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8
+	mul_twiddle_plant \c4, \twiddle1, \tmp, \q, \qa
+	mul_twiddle_plant \c5, \twiddle2, \tmp, \q, \qa
+	ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8
+	mul_twiddle_plant \c6, \twiddle1, \tmp, \q, \qa
+	mul_twiddle_plant \c7, \twiddle2, \tmp, \q, \qa // in total eight twiddle words (32 bytes) are consumed per invocation
 .endm
-
-.macro _3_layer_double_inv_twist_16 c0, c1, c2, c3, c4, c5, c6, c7, twiddle, twiddle_ptr, Qbar, Q, tmp, tmp2
-
-    movt \Q, #0
-
-    ldr.w \twiddle, [\twiddle_ptr], #4
-
-    mul_twiddle_barrett_32 b, \c0, \twiddle, \Qbar, \Q, \tmp, \tmp2
-    mul_twiddle_barrett_32 t, \c1, \twiddle, \Qbar, \Q, \tmp, \tmp2
-
-    ldr.w \twiddle, [\twiddle_ptr], #4
-
-    mul_twiddle_barrett_32 b, \c2, \twiddle, \Qbar, \Q, \tmp, \tmp2
-    mul_twiddle_barrett_32 t, \c3, \twiddle, \Qbar, \Q, \tmp, \tmp2
-
-    ldr.w \twiddle, [\twiddle_ptr], #4
-
-    mul_twiddle_barrett_32 b, \c4, \twiddle, \Qbar, \Q, \tmp, \tmp2
-    mul_twiddle_barrett_32 t, \c5, \twiddle, \Qbar, \Q, \tmp, \tmp2
-
-    ldr.w \twiddle, [\twiddle_ptr], #4
-
-    mul_twiddle_barrett_32 b, \c6, \twiddle, \Qbar, \Q, \tmp, \tmp2
-    mul_twiddle_barrett_32 t, \c7, \twiddle, \Qbar, \Q, \tmp, \tmp2
-
-    movt \Q, #767
-
-.endm
-
-.global small_invntt_tomont_asm
-.type small_invntt_tomont_asm, %function
+# input coefficients < 0.5q
+.global small_invntt_asm_769
+.type small_invntt_asm_769, %function
 .align 2
-small_invntt_tomont_asm:
-  push {r4-r11, r14}
-
-  poly        .req r0
-  twiddle_ptr .req r1
-  poly0       .req r2
-  poly1       .req r3
-  poly2       .req r4
-  poly3       .req r5
-  poly4       .req r6
-  poly5       .req r7
-  poly6       .req r8
-  poly7       .req r9
-  twiddle     .req r10
-  qinv        .req r11
-  q           .req r11
-  tmp         .req r12
-  tmp2        .req r14
-
-  movw q, #769
-  movt qinv, #767
-
-  ### LAYER 7+6+5+4
-  .equ distance, 16
-  .equ offset, 32
-  .equ strincr, 64
-
-  // pre-load twiddle factors to FPU registers
-  vldm twiddle_ptr!, {s20-s27}
-
-  add.w tmp, poly, #8*strincr
-  vmov s12, tmp
-  1:
-    // load a1, a3, ..., a15
-    load poly, poly0, poly1, poly2, poly3, #offset, #distance/4+offset, #2*distance/4+offset, #3*distance/4+offset
-    load poly, poly4, poly5, poly6, poly7, #distance+offset, #5*distance/4+offset, #6*distance/4+offset, #7*distance/4+offset
-
-    // NTT on a1, a3, ..., a15
-    _3_layer_double_inv_CT_16_light poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s20, s21, s22, s23, twiddle, q, qinv, tmp, tmp2
-
-    // multiply coeffs by layer 4 twiddles for later use
-    vmov twiddle, s24
-    mul_twiddle b, poly0, twiddle, tmp, tmp2, q, qinv // could be omitted but kept for reduction only
-    mul_twiddle t, poly1, twiddle, tmp, tmp2, q, qinv
-
-    vmov twiddle, s25
-    mul_twiddle b, poly2, twiddle, tmp, tmp2, q, qinv
-    mul_twiddle t, poly3, twiddle, tmp, tmp2, q, qinv
-
-    vmov twiddle, s26
-    mul_twiddle b, poly4, twiddle, tmp, tmp2, q, qinv
-    mul_twiddle t, poly5, twiddle, tmp, tmp2, q, qinv
-
-    vmov twiddle, s27
-    mul_twiddle b, poly6, twiddle, tmp, tmp2, q, qinv
-    mul_twiddle t, poly7, twiddle, tmp, tmp2, q, qinv
-
-    vmov s0, poly0 // a1
-    vmov s1, poly1 // a3
-    vmov s2, poly2 // a5
-    vmov s3, poly3 // a7
-    vmov s4, poly4 // a9
-    vmov s5, poly5 // a11
-    vmov s6, poly6 // a13
-    vmov s7, poly7 // a15
-
-    // ----------
-
-    // load a0, a2, ..., a14
-    load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4
-    load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
-
-    // NTT on a0, a2, ..., a14
-    _3_layer_double_inv_CT_16_light poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s20, s21, s22, s23, twiddle, q, qinv, tmp, tmp2
-
-    // layer 4 - 1
-    // addsub: (a2, a6, a10, a14), (a3, a7, a11, a15)
-    vmov tmp2, s1 // load a3
-    vmov s1, poly0 // preserve a0
-    uadd16 poly0, poly1, tmp2
-    usub16 poly1, poly1, tmp2
-
-    vmov tmp2, s3 // load a7
-    vmov s3, poly2 // preserve a4
-    uadd16 poly2, poly3, tmp2
-    usub16 poly3, poly3, tmp2
-
-    vmov tmp2, s5 // load a11
-    vmov s5, poly4 // preserve a8
-    uadd16 poly4, poly5, tmp2
-    usub16 poly5, poly5, tmp2
-
-    vmov tmp2, s7 // load a15
-    vmov s7, poly6 // preserve a12
-    uadd16 poly6, poly7, tmp2
-    usub16 poly7, poly7, tmp2
-
-    str.w poly0, [poly, #1*distance/4]
-    str.w poly1, [poly, #1*distance/4+offset]
-    str.w poly2, [poly, #3*distance/4]
-    str.w poly3, [poly, #3*distance/4+offset]
-    str.w poly4, [poly, #5*distance/4]
-    str.w poly5, [poly, #5*distance/4+offset]
-    str.w poly6, [poly, #7*distance/4]
-    str.w poly7, [poly, #7*distance/4+offset]
-
-    // layer 4 - 2
-    // addsub: (a0, a4, a8, a12), (a1, a5, a9, a13)
-    vmov tmp2, s1 // load a0
-    vmov poly1, s0 // load a1
-    uadd16 poly0, tmp2, poly1
-    usub16 poly1, tmp2, poly1
-
-    vmov tmp2, s3 // load a4
-    vmov poly3, s2 // load a5
-    uadd16 poly2, tmp2, poly3
-    usub16 poly3, tmp2, poly3
-
-    vmov tmp2, s5 // load a8
-    vmov poly5, s4 // load a9
-    uadd16 poly4, tmp2, poly5
-    usub16 poly5, tmp2, poly5
-
-    vmov tmp2, s7 // load a12
-    vmov poly7, s6 // load a13
-    uadd16 poly6, tmp2, poly7
-    usub16 poly7, tmp2, poly7
-
-    str.w poly1, [poly, #offset]
-    str.w poly2, [poly, #2*distance/4]
-    str.w poly3, [poly, #2*distance/4+offset]
-    str.w poly4, [poly, #4*distance/4]
-    str.w poly5, [poly, #4*distance/4+offset]
-    str.w poly6, [poly, #6*distance/4]
-    str.w poly7, [poly, #6*distance/4+offset]
-    str.w poly0, [poly], #strincr // increase 2*8*4 = 64 (2 * 8 loads of 4 bytes each)
-
-    vmov tmp, s12
-    cmp.w poly, tmp
-  bne.w 1b
-
-  sub.w poly, #8*strincr
-
-  ### LAYER 3+2+1
-  .equ distance, distance*16
-  .equ strincr, 4
-
-  // ITER 0
-  load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4
-  load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
-
-  vldm twiddle_ptr!, {s21-s23}
-
-  _3_layer_double_inv_CT_16_light_reduce poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s20, s21, s22, s23, twiddle, q, qinv, tmp, tmp2
-
-  vmov.w s2, poly
-  movw poly, #:lower16:5585133
-  movt poly, #:upper16:5585133
-
-  // twisting
-  _3_layer_double_inv_twist_16 poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle, twiddle_ptr, poly, q, tmp, tmp2
-
-  vmov.w poly, s2
-
-  store poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
-  str.w poly1, [poly, #distance/4]
-  str.w poly2, [poly, #2*distance/4]
-  str.w poly3, [poly, #3*distance/4]
-  str.w poly0, [poly], #4
-
-  // ITER 1-12
-  add.w tmp, poly, #strincr*3*(3+1)
-  vmov s14, tmp
-  3:
-    add.w tmp, poly, #strincr*3
-    vmov s13, tmp
-    2:
-      // polys upto 6q
-      load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4
-      load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
-
-
-      _3_layer_double_inv_CT_16 poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle, twiddle_ptr, qinv, q, tmp, tmp2
-
-      vmov.w s2, poly
-      movw poly, #:lower16:5585133
-      movt poly, #:upper16:5585133
-
-      // twisting
-      _3_layer_double_inv_twist_16 poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle, twiddle_ptr, poly, q, tmp, tmp2
-
-      vmov.w poly, s2
-
-      store poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
-      str.w poly1, [poly, #distance/4]
-      str.w poly2, [poly, #2*distance/4]
-      str.w poly3, [poly, #3*distance/4]
-      str.w poly0, [poly], #4
-
-      vmov tmp, s13
-      cmp.w poly, tmp
-    bne.w 2b
-
-    // polys upto 9q
-    load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4
-    load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
-
-    _3_layer_double_inv_CT_16 poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle, twiddle_ptr, qinv, q, tmp, tmp2
-
-    vmov.w s2, poly
-    movw poly, #:lower16:5585133
-    movt poly, #:upper16:5585133
-
-    // twisting
-    _3_layer_double_inv_twist_16 poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle, twiddle_ptr, poly, q, tmp, tmp2
-
-    vmov.w poly, s2
-
-    store poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
-    str.w poly1, [poly, #distance/4]
-    str.w poly2, [poly, #2*distance/4]
-    str.w poly3, [poly, #3*distance/4]
-    str.w poly0, [poly], #4
-
-    vmov tmp, s14
-    cmp.w poly, tmp
-  bne.w 3b
-
-  // ITER 13-15
-  add tmp, poly, #3*strincr
-  vmov s13, tmp
-  2:
-    // polys upto 6q
-    load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4
-    load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
-
-    _3_layer_double_inv_CT_16 poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle, twiddle_ptr, qinv, q, tmp, tmp2
-
-    vmov.w s2, poly
-    movw poly, #:lower16:5585133
-    movt poly, #:upper16:5585133
-
-    // twisting
-    _3_layer_double_inv_twist_16 poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle, twiddle_ptr, poly, q, tmp, tmp2
-
-    vmov.w poly, s2
-
-    store poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
-    str.w poly1, [poly, #distance/4]
-    str.w poly2, [poly, #2*distance/4]
-    str.w poly3, [poly, #3*distance/4]
-    str.w poly0, [poly], #strincr
-
-    vmov tmp, s13
-    cmp.w poly, tmp
-  bne.w 2b
-
-  pop {r4-r11, pc}
+small_invntt_asm_769:
+	push {r4-r11, r14}
+	vpush.w {s16-s23}
+	poly         .req r0
+	twiddle_ptr  .req r1
+	poly0        .req r2
+	poly1        .req r3
+	poly2        .req r4
+	poly3        .req r5
+	poly4        .req r6
+	poly5        .req r7
+	poly6        .req r8
+	poly7        .req r9
+	twiddle1     .req r10
+	twiddle2     .req r11
+	q            .req r12 
+	// q = 769 is kept in the top halfword of r12 (set by the movt below; the bottom halfword is never initialised)
+	qa           .req r0
+	// qa = 2^a*q = 24608 with a = 5 (32*769); it aliases r0 with poly, so poly is parked in an FPU register (s23 / s6) whenever qa is live
+	tmp          .req r14
+
+	movt q, #769
+
+	### LAYER 7+6+5+4
+	.equ distance, 16
+	.equ offset, 32
+	.equ strincr, 64
+
+	// pre-load twiddle factors to FPU registers
+	vldm twiddle_ptr!, {s8-s22}
+
+	add.w tmp, poly, #8*strincr
+	vmov s8, tmp // s8 is reused as the loop-end pointer, overwriting the first pre-loaded word
+	1:
+		vmov s23, poly // park poly: r0 is overwritten by qa below
+		// load a1, a3, ..., a15
+		load poly, poly0, poly1, poly2, poly3, #offset, #distance/4+offset, #2*distance/4+offset, #3*distance/4+offset
+		load poly, poly4, poly5, poly6, poly7, #distance+offset, #5*distance/4+offset, #6*distance/4+offset, #7*distance/4+offset
+
+		movw qa, #24608
+
+		// NTT on a1, a3, ..., a15   
+		// twiddle2 is passed as the macro's tmp2 scratch register
+		_3_layer_double_inv_CT_16_plant_light poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s10, s12, s13, s14, twiddle1, twiddle2, q, qa, tmp
+
+		// multiply coeffs by layer 4 twiddles for later use
+		// vmov twiddle1, s15 
+		vmov twiddle2, s16
+		// mul_twiddle_plant poly0, twiddle1, tmp, q, qa // could be omitted but kept for reduction only
+		mul_twiddle_plant poly1, twiddle2, tmp, q, qa
+
+		vmov twiddle1, s17 
+		vmov twiddle2, s18
+		mul_twiddle_plant poly2, twiddle1, tmp, q, qa
+		mul_twiddle_plant poly3, twiddle2, tmp, q, qa
+
+		vmov twiddle1, s19 
+		vmov twiddle2, s20
+		mul_twiddle_plant poly4, twiddle1, tmp, q, qa
+		mul_twiddle_plant poly5, twiddle2, tmp, q, qa
+
+		vmov twiddle1, s21 
+		vmov twiddle2, s22
+		mul_twiddle_plant poly6, twiddle1, tmp, q, qa
+		mul_twiddle_plant poly7, twiddle2, tmp, q, qa
+
+		vmov s0, poly0 // a1
+		vmov s1, poly1 // a3
+		vmov s2, poly2 // a5
+		vmov s3, poly3 // a7
+		vmov s4, poly4 // a9
+		vmov s5, poly5 // a11
+		vmov s6, poly6 // a13
+		vmov s7, poly7 // a15
+		// 0.5q
+		// ----------
+
+		vmov poly, s23 // restore poly (r0 held qa during the macro above)
+		// load a0, a2, ..., a14
+		load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4
+		load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
+		
+		movw qa, #24608
+		// NTT on a0, a2, ..., a14
+		// twiddle2 is passed as the macro's tmp2 scratch register
+		_3_layer_double_inv_CT_16_plant_light poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s10, s12, s13, s14, twiddle1, twiddle2, q, qa, tmp
+		// 1,3,5,7: <5q; 0,2,4,6:<1q
+		// layer 4 - 1
+		// addsub: (a2, a6, a10, a14), (a3, a7, a11, a15)
+		vmov poly, s23
+		vmov twiddle2, s1 // load a3
+		uadd16 tmp, poly1, twiddle2
+		usub16 poly1, poly1, twiddle2
+		str.w tmp, [poly, #1*distance/4]
+		str.w poly1, [poly, #1*distance/4+offset]
+
+		vmov twiddle2, s3 // load a7
+		uadd16 tmp, poly3, twiddle2
+		usub16 poly3, poly3, twiddle2
+		str.w tmp, [poly, #3*distance/4]
+		str.w poly3, [poly, #3*distance/4+offset]
+		
+		vmov twiddle2, s5 // load a11
+		uadd16 tmp, poly5, twiddle2
+		usub16 poly5, poly5, twiddle2
+		str.w tmp, [poly, #5*distance/4]
+		str.w poly5, [poly, #5*distance/4+offset]
+		
+		vmov twiddle2, s7 // load a15
+		uadd16 tmp, poly7, twiddle2
+		usub16 poly7, poly7, twiddle2
+		str.w tmp, [poly, #7*distance/4]
+		str.w poly7, [poly, #7*distance/4+offset]
+		// 1,3,5,7: < 5.5q
+
+		// layer 4 - 2    
+		// addsub: (a0, a4, a8, a12), (a1, a5, a9, a13)
+		vmov poly3, s2 // load a5
+		uadd16 tmp, poly2, poly3
+		usub16 twiddle2, poly2, poly3
+		str.w tmp, [poly, #2*distance/4]
+		str.w twiddle2, [poly, #2*distance/4+offset]
+
+		vmov poly5, s4 // load a9
+		uadd16 tmp, poly4, poly5
+		usub16 twiddle2, poly4, poly5
+		str.w tmp, [poly, #4*distance/4]
+		str.w twiddle2, [poly, #4*distance/4+offset]
+
+		vmov poly7, s6 // load a13
+		uadd16 tmp, poly6, poly7
+		usub16 twiddle2, poly6, poly7
+		str.w tmp, [poly, #6*distance/4]
+		str.w twiddle2, [poly, #6*distance/4+offset]
+		
+		vmov poly1, s0 // load a1
+		uadd16 tmp, poly0, poly1
+		usub16 twiddle2, poly0, poly1
+		str.w twiddle2, [poly, #offset]    
+		str.w tmp, [poly], #strincr // increase 2*8*4 = 64 (2 * 8 loads of 4 bytes each)
+		// 0,2,4,6: < 1.5q
+	vmov tmp, s8
+	cmp.w poly, tmp
+	bne.w 1b
+
+	sub.w poly, #8*strincr  
+
+	### LAYER 3+2+1
+
+	.equ distance, distance*16
+	.equ strincr, 4
+
+	// ITER 0: uses the light macro with six twiddle words pre-loaded into s0-s5 (only s1, s3, s4, s5 are passed on)
+	vmov s6, poly
+	load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4
+	load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
+
+	vldm twiddle_ptr!, {s0-s5}
+	movw qa, #24608
+	// twiddle2 is passed as the macro's tmp2 scratch register
+	_3_layer_double_inv_CT_16_plant_light poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s1, s3, s4, s5, twiddle1, twiddle2, q, qa, tmp
+
+	// twisting
+	_3_layer_double_inv_twist_16_plant poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle1, twiddle2, twiddle_ptr, q, qa, tmp
+	
+	vmov poly, s6
+	store poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
+	str.w poly1, [poly, #distance/4]
+	str.w poly2, [poly, #2*distance/4]
+	str.w poly3, [poly, #3*distance/4]
+	str.w poly0, [poly], #4
+
+	// ITER 1-15: loop end = poly + 15 * 4 bytes; each pass post-increments poly by 4
+	add.w tmp, poly, #strincr*3*(5)
+	vmov s14, tmp
+	2:
+		vmov s6, poly
+		// polys upto 5.5q
+		load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4
+		load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
+		
+		movw qa, #24608
+		_3_layer_double_inv_CT_16_plant poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle1, twiddle2, twiddle_ptr, q, qa, tmp
+
+		// twisting
+		_3_layer_double_inv_twist_16_plant poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle1, twiddle2, twiddle_ptr, q, qa, tmp
+
+		vmov poly, s6
+		store poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
+		str.w poly1, [poly, #distance/4]
+		str.w poly2, [poly, #2*distance/4]
+		str.w poly3, [poly, #3*distance/4]
+		str.w poly0, [poly], #4
+
+	vmov tmp, s14
+	cmp.w poly, tmp
+	bne.w 2b
+
+	vpop.w {s16-s23}
+	pop {r4-r11, pc}
 
 .unreq poly
 .unreq twiddle_ptr
@@ -734,75 +567,111 @@ small_invntt_tomont_asm:
 .unreq poly5
 .unreq poly6
 .unreq poly7
-.unreq twiddle
-.unreq qinv
+.unreq twiddle1
+.unreq twiddle2
 .unreq q
+.unreq qa
 .unreq tmp
-.unreq tmp2
 
 // BASEMUL
 
+/* 
+* Basemul code (adapted to q=769) from: 
+* Huang, J. et al. 2022. Improved Plantard Arithmetic for Lattice-based Cryptography.
+* IACR Transactions on Cryptographic Hardware and Embedded Systems. 2022, 4 (Aug. 2022), 614–636.
+* DOI:https://doi.org/10.46586/tches.v2022.i4.614-636.
+* https://github.com/UIC-ESLAS/ImprovedPlantardArithmetic/blob/f3482cfd09dda8f1f55b95e13616147e3b6dd008/crypto_kem/kyber768/m4fstack/fastbasemul.S
+*/
 
-.global small_basemul_asm
-.type small_basemul_asm, %function
+.global small_basemul_asm_769
+.type small_basemul_asm_769, %function
 .align 2
-small_basemul_asm:
-  push {r4-r11, lr}
-
-  rptr  .req r0
-  aptr  .req r1
-  bptr  .req r2
-  zeta_ptr  .req r3
-  poly0 .req r4
-  poly1 .req r6
-  poly2 .req r5
-  poly3 .req r7 // TODO: remove poly3
-  q     .req r8
-  qinv  .req r8
-  tmp   .req r9
-  tmp2  .req r10
-  tmp3  .req r11
-  zeta  .req r12
-  ctr  .req r14
-
-  movw  q, #769
-  movt qinv, #767
-  add ctr, rptr, #64*2*4
-  1:
-
-    ldr poly2, [aptr, #4]
-    ldr poly3, [bptr, #4]  
-    ldrh.w zeta, [zeta_ptr], #2
-    ldr poly0, [aptr], #8
-    ldr poly1, [bptr], #8
-    
-    //basemul(r->coeffs + 4 * i, a->coeffs + 4 * i, b->coeffs + 4 * i, zetas[64 + i]);
-    smultt tmp, poly0, poly1
-    montgomery q, qinv, tmp, tmp2
-    smultb tmp2, tmp2, zeta
-    smlabb tmp2, poly0, poly1, tmp2
-    montgomery q, qinv, tmp2, tmp
-
-    smuadx tmp2, poly0, poly1
-    montgomery q, qinv, tmp2, tmp3
-    pkhtb tmp, tmp3, tmp, asr#16
-    str tmp, [rptr], #4
-    
-    neg zeta, zeta
-    
-    //basemul(r->coeffs + 4 * i + 2, a->coeffs + 4 * i + 2, b->coeffs + 4 * i + 2, - zetas[64 + i]);
-    smultt tmp, poly2, poly3
-    montgomery q, qinv, tmp, tmp2
-    smultb tmp2, tmp2, zeta
-    smlabb tmp2, poly2, poly3, tmp2
-    montgomery q, qinv, tmp2, tmp
-
-    smuadx tmp2, poly2, poly3
-    montgomery q, qinv, tmp2, tmp3
-    pkhtb tmp, tmp3, tmp, asr#16
-    
-    str tmp, [rptr], #4
-    cmp.w rptr, ctr
-    bne.w 1b
-
-  pop {r4-r11, pc}
\ No newline at end of file
+small_basemul_asm_769:
+	push {r4-r11, lr}
+
+	rptr    .req r0
+	aptr    .req r1
+	bptr    .req r2
+	zetaptr .req r3
+	poly0   .req r4
+	poly1   .req r6
+	poly2   .req r5
+	poly3   .req r7
+	q       .req r8
+	qa      .req r14 // shares r14 with loop; the counter is parked in s0 while qa is live
+	qinv    .req r9
+	tmp     .req r10
+	tmp2    .req r11
+	zeta    .req r12
+	loop    .req r14
+
+	movt  q, #769 // q occupies the top halfword only
+	movw qinv, #64769
+	movt qinv, #58632
+
+	movw loop, #64 // 64 iterations, each storing two 32-bit result words
+	1:
+	vmov.w s0,loop
+	movw qa, #24608
+			
+	ldrd poly0, poly2, [aptr], #8
+	ldrd poly1, poly3, [bptr], #8 
+	// ldr poly0, [aptr], #4
+	// ldr poly1, [bptr], #4
+	// ldr poly2, [aptr], #4
+	// ldr poly3, [bptr], #4
+
+	ldr.w zeta, [zetaptr], #4
+
+	// basemul(r->coeffs + 4 * i, a->coeffs + 4 * i, b->coeffs + 4 * i, zetas[64 + i]);
+	smulwt tmp, zeta, poly1 // zeta times the top halfword of poly1 (upper 32 bits of the 48-bit product)
+	smlabt tmp, tmp, q, qa  
+	smultt tmp, poly0, tmp  // top halfword of poly0 times the top halfword of tmp
+	smlabb tmp, poly0, poly1, tmp // plus the product of the two bottom halfwords
+	plant_red q, qa, qinv, tmp
+	// r[0] in upper half of tmp
+	
+	smuadx tmp2, poly0, poly1 // cross terms: bottom*top + top*bottom
+	plant_red q, qa, qinv, tmp2
+	// r[1] in upper half of tmp2
+	pkhtb tmp, tmp2, tmp, asr#16 // pack r[1] (top) and r[0] (bottom) into one word
+	str tmp, [rptr], #4
+
+	neg zeta, zeta
+
+	// basemul(r->coeffs + 4 * i + 2, a->coeffs + 4 * i + 2, b->coeffs + 4 * i + 2, - zetas[64 + i]);
+	smulwt tmp, zeta, poly3 
+	smlabt tmp, tmp, q, qa  
+	smultt tmp, poly2, tmp  
+	smlabb tmp, poly2, poly3, tmp 
+	plant_red q, qa, qinv, tmp
+	// r[2] in upper half of tmp
+	
+	smuadx tmp2, poly2, poly3 
+	plant_red q, qa, qinv, tmp2
+	// r[3] in upper half of tmp2
+	pkhtb tmp, tmp2, tmp, asr#16
+	str tmp, [rptr], #4
+		
+	vmov.w loop,s0 // restore the counter that qa displaced from r14
+	subs.w loop, #1
+	bne.w 1b
+
+	.unreq rptr   
+	.unreq aptr   
+	.unreq bptr   
+	.unreq zetaptr
+	.unreq poly0  
+	.unreq poly1  
+	.unreq poly2  
+	.unreq poly3  
+	.unreq q      
+	.unreq qa     
+	.unreq qinv   
+	.unreq tmp    
+	.unreq tmp2   
+	.unreq zeta   
+	.unreq loop   
+
+	pop {r4-r11, pc}
+//-0.5p~0.5p
\ No newline at end of file
diff --git a/crypto_sign/dilithium3/m4fstack/smallntt.h b/crypto_sign/dilithium3/m4fstack/smallntt.h
index 048d5df5..c3fd065f 100644
--- a/crypto_sign/dilithium3/m4fstack/smallntt.h
+++ b/crypto_sign/dilithium3/m4fstack/smallntt.h
@@ -4,48 +4,27 @@
 #include <stdint.h>
 #include "params.h"
 
-static const int16_t zetas[64] = {
--23, 112, -151, -134, -52, -148, 227, 232,
--71, 212, 236, 21, 341, 379, -202, -220,
-352, 292, 238, 145, 194, -276, 70, -274,
-117, 333, 66, 247, -237, -83, -252, -244,
-331, -241, 167, 357, -355, 291, -358, 105, -115, -209, 14, 99, -260, 29, 366, -378, -318, 278, 353, 354, -184, 127, 330, -303, 222, -78, -348, -44, 201, 158, 350, 168
-};
-
-static const int16_t zetas_asm[128] = {
-0, -164, -81, 361, 186, -3, -250, -120, -308, 129, -16, -223, -362, -143, 131, -337,
--76, 147, -114, -23, 112, -151, -134,
--98, -272, 54, -52, -148, 227, 232,
-36, -2, -124, -71, 212, 236, 21,
--75, -80, -346, 341, 379, -202, -220,
--339, 86, -51, 352, 292, 238, 145,
--255, 364, 267, 194, -276, 70, -274,
-282, 161, -15, 117, 333, 66, 247,
--203, 288, 169, -237, -83, -252, -244,
--34, 191, 307, 331, -241, 167, 357,
-199, -50, -24, -355, 291, -358, 105,
-178, -170, 226, -115, -209, 14, 99,
-270, 121, -188, -260, 29, 366, -378,
--10, -380, 279, -318, 278, 353, 354,
-149, 180, -375, -184, 127, 330, -303,
-369, -157, 263, 222, -78, -348, -44,
--192, -128, -246, 201, 158, 350, 168
-};
-
-static const int16_t zetas_inv_CT_asm[256] = {
-0, 171, 171, 164, 171, -361, 164, 81, 171, 120, -361, 3, 164, 250, 81, -186,
-171, 164, 171, -361, 164, 81, -257, 49, -141, -18, -215, 38, 283, 347, 337, 192, -369, 246, -263, 128, 157, 239, -264, 179, 301, -207, 219, -332, -206, 120, 337, -131, 192, -149, -369, 10, 62, 57, 40, 136, 1, 311, -173, 27, 223, 203, -282, -169, 15, -288, -161, 74, -56, 271, -309, 26, -373, 116, -67, -361, 120, 250, 337, 143, -131, 362, -383, 82, 125, -344, -93, 299, -60, -204, 143, -270, -178, 188, -226, -121, 170, 39, -175, 174, 284, -111, 84, -22, 79, 3, 223, 16, 203, 255, -282, 339, 245, 64, -90, -306, 190, -123, 197, -253, -129, 75, -36, 346, 124, 80, 2, 218, 126, -33, -266, 326, -122, -261, 343, 164, -361, 81, 120, 3, 250, -186, 285, 200, -89, 5, 17, -96, 135, -310, -131, -149, 10, 375, -279, -180, 380, -280, -183, -7, 130, -327, -189, -335, -370, 250, 143, 362, -270, -199, -178, 34, -359, -144, -182, 304, -43, -300, -251, 377, 16, 255, 339, -267, 51, -364, -86, -106, 101, -118, 214, -349, -110, -374, -195, 81, 3, -186, 223, -129, 16, 308, 320, 319, 8, 181, 154, 216, 273, 313, 362, -199, 34, 24, -307, 50, -191, -139, -165, 208, 92, 159, 233, 177, -321, -186, -129, 308, 75, 98, -36, 76, 231, 324, 25, 85, 289, -94, -12, 113, 308, 98, 76, -54, 114, 272, -147, -146, -35, -119, -97, -176, -137, -312, -138,
-};
+static const int32_t zetas_769[64] = { /* 32-bit twiddle words; values above INT32_MAX are spelled as their two's-complement signed equivalents so every initializer fits int32_t */
+	-1156122536, 1334846793, 999738812, 1854264165, 1681125041, 1150537404, -1474475118, -1223144132, 726067294, 2066499220, -1022079343, 1055590142, -39095931, 1871019564, -1563837246, 1826338500, 513832239, 1792827701, -921546949, -1301335994, 1161707670, -988568545, -776333490, -888036150, 1586177780, -441225508, -977398279, -469151174, 971813147, 122872927, 217820188, 619949766, -541757903, 770748358, -195479655, 765163225, -664630829, 1742561504, -815429421, 982983413, -1485645384, -1915700627, 703726762, 681386230, -184309389, 1457719720, 1217559000, -1820753366, 1195218468, 1089100940, 564098436, 614364633, -659045696, 2088839752, -592024100, 1949211426, -1725806104, 374203913, -312767449, 2083254619, 1513571050, -647875430, 413299844, -145213458};
+
+static const int32_t zetas_asm_769[128] = { /* 128 signed 32-bit twiddle words, all within int32_t range; the final entry is 0. Presumably ordered as the forward NTT assembly loads them - TODO confirm */
+	346278248, 223405321, 966228013, 759578091, -150798592, 318352582, -1736976371, 1697880440, -2105595150, -804259156, 1675539907, -1016494210, 1401868389, -2005062756, 240160720, 474736307, -1200803600, -1435379187, -1156122536, 1334846793, 999738811, 1854264164, -631120032, -787503756, -1580592646, 1681125040, 1150537403, -1474475119, -1223144132, 1809583100, -100532394, -1938041160, 726067293, 2066499219, -1022079344, 1055590142, 525002504, 273671518, -212235055, -39095931, 1871019563, -1563837247, 1826338499, 139628326, 27925665, 1731391238, 513832238, 1792827701, -921546949, -1301335995, 67021596, 1117026605, 536172770, 1161707669, -988568545, -776333490, -888036151, 1290165729, -497076839, -753992958, 1586177779, -441225509, -977398279, -469151174, -1614103444, 1591762912, -94947261, 971813146, 122872927, 217820188, 619949766, -1709050706, 1010909077, -1748146637, -541757903, 770748357, -195479656, 765163224, 1413038655, 1781657435, -1206388733, -664630830, 1742561504, -815429422, 982983412, 357448514, 44681064, -1524741316, -1485645385, -1915700627, 703726761, 681386229, 686971362, 1787242568, -860110486, -184309390, 1457719719, 1217558999, -1820753366, -502661972, -1921285760, 1139367137, 1195218467, 1089100940, 564098435, 614364633, -1100271206, 457980908, -1669954774, -659045697, 2088839751, -592024101, 1949211426, 1368357591, 698141628, 335107981, -1725806105, 374203913, -312767449, 2083254618, -1061175275, -2139105948, 519417371, 1513571050, -647875431, 413299844, -145213459, 0};
+
+// INTT with CT butterfly
+static const int32_t zetas_inv_asm_769[256] = {
+	5585134, 5585134, -346278248, 5585134, -966228013, -346278248, -223405321, 5585134, 1736976371, -966228013, 150798592, -346278248, -318352582, -223405321, -759578091,
+	// removed first "2285" + LAYER 3+2+1 - 1 - butterfly
+	5585134, -346278248, 5585134, -966228013, -346278248, -223405321, 636705165, 446810642, 1519156183, 11170266, -821014555, -1932456027, 301597183, -692556495, -240160720, 1061175275, -1368357591, -519417371, -335107981, 2139105948, -698141628, -625534899, -1267825197, 843355087, 290426917, 128458060, 1295750862, -748407825, -826599688, 1736976371, -240160720, 2005062756, 1061175275, 1100271206, -1368357591, 502661972, 915961816, 1396283256, 452395775, -1038834743, -955057747, -670215963, 2016233022, -16755399, -1675539907, 1614103444, -1290165729, 94947261, 753992958, -1591762912, 497076839, -1954796559, 1943626293, -1122611738, -1239899531, 938302348, -245745853, 882451018, -435640376, -966228013, 1736976371, -318352582, -240160720, -1401868389, 2005062756, 1016494210, 714897027, -1005323944, 876865885, 2122350549, -1373942724, -2094424884, 1468889985, 1558252114, -1401868389, -686971362, -357448514, 860110486, 1524741316, -1787242568, -44681064, 1407453522, -368618780, 1323676527, -653460564, -1362772458, 1379527857, -463566041, 1859849297, 150798592, -1675539907, 804259156, 1614103444, -67021596, -1290165729, -139628326, -2060914086, -994153678, 55851330, 189894523, -1072345541, 1507985917, 832184821, 1111441472, 2105595150, -525002504, -1809583100, 212235055, 1938041160, -273671518, 100532394, -2044158687, -78191862, 1452134586, 642290298, -2111180283, 552928169, 161968858, -1167292802, -346278248, -966228013, -223405321, 1736976371, 150798592, -318352582, -759578091, -1608518311, -2032988421, -899206417, -480321440, 943887481, 1491230518, -83776995, -284841784, 2005062756, 1100271206, 502661972, 1669954774, -1139367137, -457980908, 1921285760, 1128196871, -1318091394, -1904530361, 396544445, -1228729265, 117287794, 2116765416, 1184048201, -318352582, -1401868389, 1016494210, -686971362, -1413038655, -357448514, 1709050706, -731652426, 89362128, 2021818155, 1720220972, -1882189829, -1245484665, -798674023, 720482160, 804259156, -67021596, -139628326, 
-536172770, -1731391238, -1117026605, -27925665, -1843093898, -1971551958, 1027664477, 1776072302, -1692295306, 1977137091, 709311894, 1552666981, -223405321, 150798592, -759578091, -1675539907, 2105595150, 804259156, -1697880440, -675801096, 279256651, 949472614, -1066760408, -1050005009, -134043193, 1262240064, 1714635839, 1016494210, -1413038655, 1709050706, 1206388733, 1748146637, -1781657435, -1010909077, -390959312, -1329261660, -1083515807, -1965966825, -1530326449, 809844289, -1541496715, 1630858843, -759578091, 2105595150, -1697880440, -525002504, 631120032, -1809583100, -474736307, -1575007513, -201064789, 1893360095, 424470110, -1133782004, -418884977, -1424208921, -547343036, -1697880440, 631120032, -474736307, 1580592646, 1435379187, 787503756, 1200803600, 1999477623, -932717215, 1982722224, -1848679031, 586438968, 1993892490, 1625273710, -1346017059, 0};
 
 
 #define SMALL_Q 769
 
-void small_ntt_asm(int16_t a[N], const int16_t * zetas);
-void small_invntt_tomont_asm(int16_t a[N], const int16_t * zetas);
-void small_basemul_asm(int16_t *c, const int16_t *a, const int16_t *b, const int16_t *zetas);
+void small_ntt_asm_769(int16_t a[N], const int32_t * zetas);
+void small_invntt_asm_769(int16_t a[N], const int32_t * zetas);
+void small_basemul_asm_769(int16_t *c, const int16_t *a, const int16_t *b, const int32_t *zetas);
 
-#define small_ntt(a) small_ntt_asm(a, zetas_asm)
-#define small_invntt_tomont(a) small_invntt_tomont_asm(a, zetas_inv_CT_asm)
-#define small_basemul(r,a,b) small_basemul_asm(r, a, b, zetas)
+#define small_ntt(a) small_ntt_asm_769(a, zetas_asm_769)
+#define small_invntt_tomont(a) small_invntt_asm_769(a, zetas_inv_asm_769)
+#define small_basemul(r,a,b) small_basemul_asm_769(r, a, b, zetas_769)
 
 #endif
diff --git a/crypto_sign/dilithium5/m4fstack/macros_smallntt.i b/crypto_sign/dilithium5/m4fstack/macros_smallntt.i
new file mode 120000
index 00000000..fc731f12
--- /dev/null
+++ b/crypto_sign/dilithium5/m4fstack/macros_smallntt.i
@@ -0,0 +1 @@
+../../dilithium3/m4fstack/macros_smallntt.i
\ No newline at end of file

From 0dd789b5fe2138f40ff741bf1641bc3c683e7090 Mon Sep 17 00:00:00 2001
From: Amin Abdulrahman <amin@abdulrahman.de>
Date: Wed, 20 Mar 2024 16:07:22 +0100
Subject: [PATCH 21/32] First batch of stack opt for Verify * On-the-fly matrix
 generation * Schoolbook for ct1 * Challenge compression

---
 crypto_sign/dilithium3/m4fstack/sign.c  | 59 +++++++++++++++----------
 crypto_sign/dilithium3/m4fstack/stack.c | 52 ++++++++++++++++++++++
 crypto_sign/dilithium3/m4fstack/stack.h |  1 +
 3 files changed, 89 insertions(+), 23 deletions(-)

diff --git a/crypto_sign/dilithium3/m4fstack/sign.c b/crypto_sign/dilithium3/m4fstack/sign.c
index ab1426ce..2876a9a2 100644
--- a/crypto_sign/dilithium3/m4fstack/sign.c
+++ b/crypto_sign/dilithium3/m4fstack/sign.c
@@ -297,16 +297,19 @@ int crypto_sign_verify(const uint8_t *sig,
                        const uint8_t *pk)
 {
   unsigned int i;
-  uint8_t buf[K*POLYW1_PACKEDBYTES];
+  uint8_t w1_packed[POLYW1_PACKEDBYTES];
   uint8_t rho[SEEDBYTES];
   uint8_t mu[CRHBYTES];
   uint8_t c[CTILDEBYTES];
   uint8_t c2[CTILDEBYTES];
-  poly cp;
-  polyvecl mat[K], z;
-  polyveck t1, w1, h;
+  polyvecl z;
+  polyveck h, t1;
+  poly w1, cp, tmp0;
   shake256incctx state;
 
+  uint8_t wcomp[768];
+  uint8_t ccomp[68];
+
   if(siglen != CRYPTO_BYTES)
     return -1;
 
@@ -325,30 +328,40 @@ int crypto_sign_verify(const uint8_t *sig,
   shake256_inc_squeeze(mu, CRHBYTES, &state);
 
   /* Matrix-vector multiplication; compute Az - c2^dt1 */
-  poly_challenge(&cp, c);
-  polyvec_matrix_expand(mat, rho);
-
-  polyvecl_ntt(&z);
-  polyvec_matrix_pointwise_montgomery(&w1, mat, &z);
-
+  poly_challenge(&cp, sig);
+  poly_challenge_compress(ccomp, &cp);
   poly_ntt(&cp);
-  polyveck_shiftl(&t1);
-  polyveck_ntt(&t1);
-  polyveck_pointwise_poly_montgomery(&t1, &cp, &t1);
-
-  polyveck_sub(&w1, &w1, &t1);
-  polyveck_reduce(&w1);
-  polyveck_invntt_tomont(&w1);
 
-  /* Reconstruct w1 */
-  polyveck_caddq(&w1);
-  polyveck_use_hint(&w1, &w1, &h);
-  polyveck_pack_w1(buf, &w1);
+  polyvecl_ntt(&z);
 
-  /* Call random oracle and verify challenge */
   shake256_inc_init(&state);
   shake256_inc_absorb(&state, mu, CRHBYTES);
-  shake256_inc_absorb(&state, buf, K*POLYW1_PACKEDBYTES);
+
+  for (size_t k_idx = 0; k_idx < K; k_idx++) {
+    poly_uniform(&tmp0, rho, (k_idx << 8) + 0);
+    poly_pointwise_montgomery(&w1,  &tmp0, &z.vec[0]);
+    for (size_t l_idx = 1; l_idx < L; l_idx++) {
+      poly_uniform(&tmp0, rho, (k_idx << 8) + l_idx);
+      poly_pointwise_acc_montgomery(&w1,  &tmp0, &z.vec[l_idx]);
+    }
+    
+    poly_reduce(&w1);
+    poly_invntt_tomont(&w1);
+    
+    poly_schoolbook_t1(&tmp0, ccomp, pk + SEEDBYTES + k_idx*POLYT1_PACKEDBYTES);
+
+    // TODO invNTT before sub because of schoolbook
+    poly_sub(&w1, &w1, &tmp0);
+    poly_reduce(&w1);
+
+    /* Reconstruct w1 */
+    poly_caddq(&w1);
+    poly_use_hint(&w1, &w1, &h.vec[k_idx]);
+    polyw1_pack(w1_packed, &w1);
+
+    shake256_inc_absorb(&state, w1_packed, POLYW1_PACKEDBYTES);
+  }
+  /* Call random oracle and verify challenge */
   shake256_inc_finalize(&state);
   shake256_inc_squeeze(c2, CTILDEBYTES, &state);
   for(i = 0; i < CTILDEBYTES; ++i)
diff --git a/crypto_sign/dilithium3/m4fstack/stack.c b/crypto_sign/dilithium3/m4fstack/stack.c
index d3256c8b..d7469d93 100644
--- a/crypto_sign/dilithium3/m4fstack/stack.c
+++ b/crypto_sign/dilithium3/m4fstack/stack.c
@@ -88,6 +88,28 @@ static inline int32_t polyt0_unpack_idx(const uint8_t *t0, unsigned idx){
     return (1 << (D-1)) - coeff;
 }
 
+static inline int32_t polyt1_unpack_idx(const uint8_t *t1, unsigned idx){
+    int32_t coeff;
+    // 4 coefficients are packed in 5 bytes
+    t1 += 5*(idx >> 2);
+
+    if(idx % 4 == 0){
+        coeff  = (t1[0] >> 0);
+        coeff |= ((uint32_t)t1[1] << 8);
+    } else if(idx % 4 == 1){
+        coeff  =  (t1[1] >> 2);
+        coeff |= ((uint32_t)t1[2] << 6);
+    } else if(idx % 4 == 2){
+        coeff  = (t1[2] >> 4);
+        coeff |= ((uint32_t)t1[3] << 4);
+    } else if(idx % 4 == 3){
+        coeff  = (t1[3] >> 6);
+        coeff |= ((uint32_t)t1[4] << 2);
+    }
+    coeff &= 0x3FF;
+    return coeff;
+}
+
 void poly_schoolbook(poly *c, const uint8_t ccomp[68], const uint8_t *t0){
   unsigned i,j,idx;
   uint64_t signs = 0;
@@ -118,6 +140,36 @@ void poly_schoolbook(poly *c, const uint8_t ccomp[68], const uint8_t *t0){
   }
 }
 
+void poly_schoolbook_t1(poly *c, const uint8_t ccomp[68], const uint8_t *t1){
+  unsigned i,j,idx;
+  uint64_t signs = 0;
+  for(i = 0; i < N; i++) c->coeffs[i] = 0;
+  for(i = 0; i < 8; i++) {
+    signs |= ((uint64_t)ccomp[60+i]) << (8*i);
+  }
+
+  for(idx = 0; idx < TAU; idx++){
+    i = ccomp[idx];
+    if(!(signs & 1)){
+        for(j = 0; i+j < N; j++){
+            c->coeffs[i+j] += (polyt1_unpack_idx(t1, j) << D);
+        }
+        for(j = N-i; j<N; j++){
+            c->coeffs[i+j-N] -= (polyt1_unpack_idx(t1, j) << D);
+        }
+    } else {
+        for(j = 0; i+j < N; j++){
+            c->coeffs[i+j] -= (polyt1_unpack_idx(t1, j) << D);
+        }
+        for(j = N-i; j<N; j++){
+            c->coeffs[i+j-N] += (polyt1_unpack_idx(t1, j) << D);
+        }
+    }
+
+    signs >>= 1;
+  }
+}
+
 
 void polyw_pack(uint8_t buf[3*256], poly *w){
   poly_reduce(w);
diff --git a/crypto_sign/dilithium3/m4fstack/stack.h b/crypto_sign/dilithium3/m4fstack/stack.h
index c21714c7..37c659bc 100644
--- a/crypto_sign/dilithium3/m4fstack/stack.h
+++ b/crypto_sign/dilithium3/m4fstack/stack.h
@@ -12,6 +12,7 @@ void poly_challenge_decompress(poly *cp, const uint8_t c[68]);
 
 
 void poly_schoolbook(poly *c, const uint8_t ccomp[68], const uint8_t *t0);
+void poly_schoolbook_t1(poly *c, const uint8_t ccomp[68], const uint8_t *t1);
 void polyw_pack(uint8_t buf[3*256], poly *w);
 void polyw_unpack(poly *w, const uint8_t buf[3*256]);
 

From a8c993fc8f7038de5fd757c505eb43eb6e10d010 Mon Sep 17 00:00:00 2001
From: Amin Abdulrahman <amin@abdulrahman.de>
Date: Wed, 20 Mar 2024 16:38:20 +0100
Subject: [PATCH 22/32] On-the-fly unpacking for z, h

---
 crypto_sign/dilithium3/m4fstack/sign.c  | 40 +++++++++++---------
 crypto_sign/dilithium3/m4fstack/stack.c | 49 +++++++++++++++++++++++++
 crypto_sign/dilithium3/m4fstack/stack.h |  2 +-
 3 files changed, 72 insertions(+), 19 deletions(-)

diff --git a/crypto_sign/dilithium3/m4fstack/sign.c b/crypto_sign/dilithium3/m4fstack/sign.c
index 2876a9a2..e81d0f44 100644
--- a/crypto_sign/dilithium3/m4fstack/sign.c
+++ b/crypto_sign/dilithium3/m4fstack/sign.c
@@ -302,9 +302,7 @@ int crypto_sign_verify(const uint8_t *sig,
   uint8_t mu[CRHBYTES];
   uint8_t c[CTILDEBYTES];
   uint8_t c2[CTILDEBYTES];
-  polyvecl z;
-  polyveck h, t1;
-  poly w1, cp, tmp0;
+  poly w1, tmp0, tmp1;
   shake256incctx state;
 
   uint8_t wcomp[768];
@@ -313,11 +311,8 @@ int crypto_sign_verify(const uint8_t *sig,
   if(siglen != CRYPTO_BYTES)
     return -1;
 
-  unpack_pk(rho, &t1, pk);
-  if(unpack_sig(c, &z, &h, sig))
-    return -1;
-  if(polyvecl_chknorm(&z, GAMMA1 - BETA))
-    return -1;
+  for(i = 0; i < SEEDBYTES; ++i)
+    rho[i] = pk[i];
 
   /* Compute CRH(h(rho, t1), msg) */
   shake256(mu, TRBYTES, pk, CRYPTO_PUBLICKEYBYTES);
@@ -328,21 +323,27 @@ int crypto_sign_verify(const uint8_t *sig,
   shake256_inc_squeeze(mu, CRHBYTES, &state);
 
   /* Matrix-vector multiplication; compute Az - c2^dt1 */
-  poly_challenge(&cp, sig);
-  poly_challenge_compress(ccomp, &cp);
-  poly_ntt(&cp);
-
-  polyvecl_ntt(&z);
+  poly_challenge(&tmp0, sig);
+  poly_challenge_compress(ccomp, &tmp0);
 
   shake256_inc_init(&state);
   shake256_inc_absorb(&state, mu, CRHBYTES);
 
   for (size_t k_idx = 0; k_idx < K; k_idx++) {
+    polyz_unpack(&tmp1, sig + CTILDEBYTES);
+    if(poly_chknorm(&tmp1, GAMMA1 - BETA))
+      return -1;
+    poly_ntt(&tmp1);
+    
     poly_uniform(&tmp0, rho, (k_idx << 8) + 0);
-    poly_pointwise_montgomery(&w1,  &tmp0, &z.vec[0]);
+    poly_pointwise_montgomery(&w1,  &tmp0, &tmp1);
     for (size_t l_idx = 1; l_idx < L; l_idx++) {
+      polyz_unpack(&tmp1, sig + CTILDEBYTES + l_idx*POLYZ_PACKEDBYTES);
+      if(poly_chknorm(&tmp1, GAMMA1 - BETA))
+        return -1;
+      poly_ntt(&tmp1);
       poly_uniform(&tmp0, rho, (k_idx << 8) + l_idx);
-      poly_pointwise_acc_montgomery(&w1,  &tmp0, &z.vec[l_idx]);
+      poly_pointwise_acc_montgomery(&w1,  &tmp0, &tmp1);
     }
     
     poly_reduce(&w1);
@@ -350,13 +351,16 @@ int crypto_sign_verify(const uint8_t *sig,
     
     poly_schoolbook_t1(&tmp0, ccomp, pk + SEEDBYTES + k_idx*POLYT1_PACKEDBYTES);
 
-    // TODO invNTT before sub because of schoolbook
     poly_sub(&w1, &w1, &tmp0);
     poly_reduce(&w1);
 
     /* Reconstruct w1 */
     poly_caddq(&w1);
-    poly_use_hint(&w1, &w1, &h.vec[k_idx]);
+
+    if (unpack_sig_h(&tmp0, k_idx, sig) != 0) {
+      return -1;
+    };
+    poly_use_hint(&w1, &w1, &tmp0);
     polyw1_pack(w1_packed, &w1);
 
     shake256_inc_absorb(&state, w1_packed, POLYW1_PACKEDBYTES);
@@ -365,7 +369,7 @@ int crypto_sign_verify(const uint8_t *sig,
   shake256_inc_finalize(&state);
   shake256_inc_squeeze(c2, CTILDEBYTES, &state);
   for(i = 0; i < CTILDEBYTES; ++i)
-    if(c[i] != c2[i])
+    if(sig[i] != c2[i])
       return -1;
 
   return 0;
diff --git a/crypto_sign/dilithium3/m4fstack/stack.c b/crypto_sign/dilithium3/m4fstack/stack.c
index d7469d93..0c7d2b41 100644
--- a/crypto_sign/dilithium3/m4fstack/stack.c
+++ b/crypto_sign/dilithium3/m4fstack/stack.c
@@ -456,4 +456,53 @@ void unpack_sk_stack(uint8_t rho[SEEDBYTES],
   for(i = 0; i < TRBYTES; ++i)
     tr[i] = sk[i];
   sk += TRBYTES;
+}
+
+/*************************************************
+* Name:        unpack_sig_h
+*
+* Description: Unpack only h from signature sig = (c, z, h).
+*
+* Arguments:   - polyveck *h: pointer to output hint vector h
+*              - const unsigned char sig[]: byte array containing
+*                bit-packed signature
+*
+* Returns 1 in case of malformed signature; otherwise 0.
+**************************************************/
+int unpack_sig_h(poly *h, unsigned int idx, const unsigned char sig[CRYPTO_BYTES]) {
+    sig += L * POLYZ_PACKEDBYTES;
+    sig += CTILDEBYTES;
+    /* Decode h */
+    unsigned int k = 0;
+    for (unsigned int i = 0; i < K; ++i) {
+        for (unsigned int j = 0; j < N; ++j) {
+            if (i == idx) {
+                h->coeffs[j] = 0;
+            }
+        }
+
+        if (sig[OMEGA + i] < k || sig[OMEGA + i] > OMEGA) {
+            return 1;
+        }
+
+        for (unsigned int j = k; j < sig[OMEGA + i]; ++j) {
+            /* Coefficients are ordered for strong unforgeability */
+            if (j > k && sig[j] <= sig[j - 1]) {
+                return 1;
+            }
+            if (i == idx) {
+                h->coeffs[sig[j]] = 1;
+            }
+        }
+
+        k = sig[OMEGA + i];
+    }
+
+    /* Extra indices are zero for strong unforgeability */
+    for (unsigned int j = k; j < OMEGA; ++j) {
+        if (sig[j]) {
+            return 1;
+        }
+    }
+    return 0;
 }
\ No newline at end of file
diff --git a/crypto_sign/dilithium3/m4fstack/stack.h b/crypto_sign/dilithium3/m4fstack/stack.h
index 37c659bc..2893b2b5 100644
--- a/crypto_sign/dilithium3/m4fstack/stack.h
+++ b/crypto_sign/dilithium3/m4fstack/stack.h
@@ -24,7 +24,7 @@ void poly_lowbits(poly *a0, const poly *a);
 
 void unpack_sk_s1(smallpoly *a, const uint8_t *sk, size_t idx);
 void unpack_sk_s2(smallpoly *a, const uint8_t *sk, size_t idx);
-
+unpack_sig_h(poly *h, unsigned int idx, const unsigned char sig[CRYPTO_BYTES]);
 
 void poly_uniform_pointwise_montgomery_polywadd_stack(uint8_t wcomp[3*N], poly *b, uint8_t  seed[SEEDBYTES], uint16_t nonce, shake128incctx *state);
 void poly_uniform_gamma1_stack(poly *a, const uint8_t seed[CRHBYTES], uint16_t nonce, shake256incctx *state);

From b7ded849ff5133bb33314cf7d565ce90d22ad7d0 Mon Sep 17 00:00:00 2001
From: Amin Abdulrahman <amin@abdulrahman.de>
Date: Wed, 20 Mar 2024 17:00:11 +0100
Subject: [PATCH 23/32] Compress w

---
 crypto_sign/dilithium3/m4fstack/sign.c | 37 +++++++++++++++-----------
 1 file changed, 21 insertions(+), 16 deletions(-)

diff --git a/crypto_sign/dilithium3/m4fstack/sign.c b/crypto_sign/dilithium3/m4fstack/sign.c
index e81d0f44..9709f7fc 100644
--- a/crypto_sign/dilithium3/m4fstack/sign.c
+++ b/crypto_sign/dilithium3/m4fstack/sign.c
@@ -303,11 +303,13 @@ int crypto_sign_verify(const uint8_t *sig,
   uint8_t c[CTILDEBYTES];
   uint8_t c2[CTILDEBYTES];
   poly w1, tmp0, tmp1;
-  shake256incctx state;
 
   uint8_t wcomp[768];
   uint8_t ccomp[68];
 
+  shake128incctx s128;
+  shake256incctx s256;
+
   if(siglen != CRYPTO_BYTES)
     return -1;
 
@@ -316,36 +318,39 @@ int crypto_sign_verify(const uint8_t *sig,
 
   /* Compute CRH(h(rho, t1), msg) */
   shake256(mu, TRBYTES, pk, CRYPTO_PUBLICKEYBYTES);
-  shake256_inc_init(&state);
-  shake256_inc_absorb(&state, mu, TRBYTES);
-  shake256_inc_absorb(&state, m, mlen);
-  shake256_inc_finalize(&state);
-  shake256_inc_squeeze(mu, CRHBYTES, &state);
+  shake256_inc_init(&s256);
+  shake256_inc_absorb(&s256, mu, TRBYTES);
+  shake256_inc_absorb(&s256, m, mlen);
+  shake256_inc_finalize(&s256);
+  shake256_inc_squeeze(mu, CRHBYTES, &s256);
 
   /* Matrix-vector multiplication; compute Az - c2^dt1 */
   poly_challenge(&tmp0, sig);
   poly_challenge_compress(ccomp, &tmp0);
 
-  shake256_inc_init(&state);
-  shake256_inc_absorb(&state, mu, CRHBYTES);
+  shake256_inc_init(&s256);
+  shake256_inc_absorb(&s256, mu, CRHBYTES);
 
   for (size_t k_idx = 0; k_idx < K; k_idx++) {
+    for(size_t i=0;i<768;i++){
+        wcomp[i] = 0;
+    }
+
     polyz_unpack(&tmp1, sig + CTILDEBYTES);
     if(poly_chknorm(&tmp1, GAMMA1 - BETA))
       return -1;
     poly_ntt(&tmp1);
     
-    poly_uniform(&tmp0, rho, (k_idx << 8) + 0);
-    poly_pointwise_montgomery(&w1,  &tmp0, &tmp1);
+    poly_uniform_pointwise_montgomery_polywadd_stack(wcomp, &tmp1, rho, (k_idx << 8) + 0, &s128);
+
     for (size_t l_idx = 1; l_idx < L; l_idx++) {
       polyz_unpack(&tmp1, sig + CTILDEBYTES + l_idx*POLYZ_PACKEDBYTES);
       if(poly_chknorm(&tmp1, GAMMA1 - BETA))
         return -1;
       poly_ntt(&tmp1);
-      poly_uniform(&tmp0, rho, (k_idx << 8) + l_idx);
-      poly_pointwise_acc_montgomery(&w1,  &tmp0, &tmp1);
+      poly_uniform_pointwise_montgomery_polywadd_stack(wcomp, &tmp1, rho, (k_idx << 8) + l_idx, &s128);
     }
-    
+    polyw_unpack(&w1, wcomp);
     poly_reduce(&w1);
     poly_invntt_tomont(&w1);
     
@@ -363,11 +368,11 @@ int crypto_sign_verify(const uint8_t *sig,
     poly_use_hint(&w1, &w1, &tmp0);
     polyw1_pack(w1_packed, &w1);
 
-    shake256_inc_absorb(&state, w1_packed, POLYW1_PACKEDBYTES);
+    shake256_inc_absorb(&s256, w1_packed, POLYW1_PACKEDBYTES);
   }
   /* Call random oracle and verify challenge */
-  shake256_inc_finalize(&state);
-  shake256_inc_squeeze(c2, CTILDEBYTES, &state);
+  shake256_inc_finalize(&s256);
+  shake256_inc_squeeze(c2, CTILDEBYTES, &s256);
   for(i = 0; i < CTILDEBYTES; ++i)
     if(sig[i] != c2[i])
       return -1;

From e6e164bcedac6dd669fa0a7dc54e1430bf129349 Mon Sep 17 00:00:00 2001
From: Amin Abdulrahman <amin@abdulrahman.de>
Date: Wed, 20 Mar 2024 17:34:40 +0100
Subject: [PATCH 24/32] rm tmp poly, subtract on wcomp

---
 crypto_sign/dilithium3/m4fstack/sign.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/crypto_sign/dilithium3/m4fstack/sign.c b/crypto_sign/dilithium3/m4fstack/sign.c
index 9709f7fc..cbc332cb 100644
--- a/crypto_sign/dilithium3/m4fstack/sign.c
+++ b/crypto_sign/dilithium3/m4fstack/sign.c
@@ -300,9 +300,8 @@ int crypto_sign_verify(const uint8_t *sig,
   uint8_t w1_packed[POLYW1_PACKEDBYTES];
   uint8_t rho[SEEDBYTES];
   uint8_t mu[CRHBYTES];
-  uint8_t c[CTILDEBYTES];
   uint8_t c2[CTILDEBYTES];
-  poly w1, tmp0, tmp1;
+  poly w1, tmp0;
 
   uint8_t wcomp[768];
   uint8_t ccomp[68];
@@ -336,27 +335,28 @@ int crypto_sign_verify(const uint8_t *sig,
         wcomp[i] = 0;
     }
 
-    polyz_unpack(&tmp1, sig + CTILDEBYTES);
-    if(poly_chknorm(&tmp1, GAMMA1 - BETA))
+    polyz_unpack(&tmp0, sig + CTILDEBYTES);
+    if(poly_chknorm(&tmp0, GAMMA1 - BETA))
       return -1;
-    poly_ntt(&tmp1);
+    poly_ntt(&tmp0);
     
-    poly_uniform_pointwise_montgomery_polywadd_stack(wcomp, &tmp1, rho, (k_idx << 8) + 0, &s128);
+    poly_uniform_pointwise_montgomery_polywadd_stack(wcomp, &tmp0, rho, (k_idx << 8) + 0, &s128);
 
     for (size_t l_idx = 1; l_idx < L; l_idx++) {
-      polyz_unpack(&tmp1, sig + CTILDEBYTES + l_idx*POLYZ_PACKEDBYTES);
-      if(poly_chknorm(&tmp1, GAMMA1 - BETA))
+      polyz_unpack(&tmp0, sig + CTILDEBYTES + l_idx*POLYZ_PACKEDBYTES);
+      if(poly_chknorm(&tmp0, GAMMA1 - BETA))
         return -1;
-      poly_ntt(&tmp1);
-      poly_uniform_pointwise_montgomery_polywadd_stack(wcomp, &tmp1, rho, (k_idx << 8) + l_idx, &s128);
+      poly_ntt(&tmp0);
+      poly_uniform_pointwise_montgomery_polywadd_stack(wcomp, &tmp0, rho, (k_idx << 8) + l_idx, &s128);
     }
     polyw_unpack(&w1, wcomp);
     poly_reduce(&w1);
     poly_invntt_tomont(&w1);
+    polyw_pack(wcomp, &w1);
     
     poly_schoolbook_t1(&tmp0, ccomp, pk + SEEDBYTES + k_idx*POLYT1_PACKEDBYTES);
 
-    poly_sub(&w1, &w1, &tmp0);
+    polyw_sub(&w1, wcomp, &tmp0);
     poly_reduce(&w1);
 
     /* Reconstruct w1 */

From 6ef4fbc30c4cc92f734c54a97bcbb8806a6ab254 Mon Sep 17 00:00:00 2001
From: Amin Abdulrahman <amin@abdulrahman.de>
Date: Sat, 30 Mar 2024 18:27:22 -0400
Subject: [PATCH 25/32] Verify Stack Optimizations * Stack friendly hint
 decoding * Eliminate second full poly * Remove K-loop from hint unpacking

---
 crypto_sign/dilithium3/m4fstack/sign.c  |  60 +++++++-----
 crypto_sign/dilithium3/m4fstack/stack.c | 122 ++++++++++++++++++++----
 crypto_sign/dilithium3/m4fstack/stack.h |   4 +-
 3 files changed, 141 insertions(+), 45 deletions(-)

diff --git a/crypto_sign/dilithium3/m4fstack/sign.c b/crypto_sign/dilithium3/m4fstack/sign.c
index cbc332cb..c754b286 100644
--- a/crypto_sign/dilithium3/m4fstack/sign.c
+++ b/crypto_sign/dilithium3/m4fstack/sign.c
@@ -297,11 +297,13 @@ int crypto_sign_verify(const uint8_t *sig,
                        const uint8_t *pk)
 {
   unsigned int i;
+  unsigned int number_of_hints;
   uint8_t w1_packed[POLYW1_PACKEDBYTES];
   uint8_t rho[SEEDBYTES];
   uint8_t mu[CRHBYTES];
   uint8_t c2[CTILDEBYTES];
-  poly w1, tmp0;
+  uint8_t hint_ones[OMEGA];
+  poly p;
 
   uint8_t wcomp[768];
   uint8_t ccomp[68];
@@ -316,7 +318,11 @@ int crypto_sign_verify(const uint8_t *sig,
     rho[i] = pk[i];
 
   /* Compute CRH(h(rho, t1), msg) */
-  shake256(mu, TRBYTES, pk, CRYPTO_PUBLICKEYBYTES);
+  shake256_inc_init(&s256);
+  shake256_inc_absorb(&s256, pk, CRYPTO_PUBLICKEYBYTES);
+  shake256_inc_finalize(&s256);
+  shake256_inc_squeeze(mu, CRHBYTES, &s256);
+
   shake256_inc_init(&s256);
   shake256_inc_absorb(&s256, mu, TRBYTES);
   shake256_inc_absorb(&s256, m, mlen);
@@ -324,49 +330,51 @@ int crypto_sign_verify(const uint8_t *sig,
   shake256_inc_squeeze(mu, CRHBYTES, &s256);
 
   /* Matrix-vector multiplication; compute Az - c2^dt1 */
-  poly_challenge(&tmp0, sig);
-  poly_challenge_compress(ccomp, &tmp0);
+  poly_challenge(&p, sig);
+  poly_challenge_compress(ccomp, &p);
 
   shake256_inc_init(&s256);
   shake256_inc_absorb(&s256, mu, CRHBYTES);
 
   for (size_t k_idx = 0; k_idx < K; k_idx++) {
-    for(size_t i=0;i<768;i++){
-        wcomp[i] = 0;
+    for(size_t widx=0;widx<768;widx++){
+        wcomp[widx] = 0;
     }
 
-    polyz_unpack(&tmp0, sig + CTILDEBYTES);
-    if(poly_chknorm(&tmp0, GAMMA1 - BETA))
+    polyz_unpack(&p, sig + CTILDEBYTES);
+    if(poly_chknorm(&p, GAMMA1 - BETA))
       return -1;
-    poly_ntt(&tmp0);
+    poly_ntt(&p);
     
-    poly_uniform_pointwise_montgomery_polywadd_stack(wcomp, &tmp0, rho, (k_idx << 8) + 0, &s128);
+    poly_uniform_pointwise_montgomery_polywadd_stack(wcomp, &p, rho, (k_idx << 8) + 0, &s128);
 
     for (size_t l_idx = 1; l_idx < L; l_idx++) {
-      polyz_unpack(&tmp0, sig + CTILDEBYTES + l_idx*POLYZ_PACKEDBYTES);
-      if(poly_chknorm(&tmp0, GAMMA1 - BETA))
+      polyz_unpack(&p, sig + CTILDEBYTES + l_idx*POLYZ_PACKEDBYTES);
+      if(poly_chknorm(&p, GAMMA1 - BETA))
         return -1;
-      poly_ntt(&tmp0);
-      poly_uniform_pointwise_montgomery_polywadd_stack(wcomp, &tmp0, rho, (k_idx << 8) + l_idx, &s128);
+      poly_ntt(&p);
+      poly_uniform_pointwise_montgomery_polywadd_stack(wcomp, &p, rho, (k_idx << 8) + l_idx, &s128);
     }
-    polyw_unpack(&w1, wcomp);
-    poly_reduce(&w1);
-    poly_invntt_tomont(&w1);
-    polyw_pack(wcomp, &w1);
+    polyw_unpack(&p, wcomp);
+    poly_reduce(&p);
+    poly_invntt_tomont(&p);
+    polyw_pack(wcomp, &p);
     
-    poly_schoolbook_t1(&tmp0, ccomp, pk + SEEDBYTES + k_idx*POLYT1_PACKEDBYTES);
+    poly_schoolbook_t1(&p, ccomp, pk + SEEDBYTES + k_idx*POLYT1_PACKEDBYTES);
 
-    polyw_sub(&w1, wcomp, &tmp0);
-    poly_reduce(&w1);
+    polyw_sub(&p, wcomp, &p);
+    poly_reduce(&p);
 
     /* Reconstruct w1 */
-    poly_caddq(&w1);
+    poly_caddq(&p);
 
-    if (unpack_sig_h(&tmp0, k_idx, sig) != 0) {
+    if (unpack_sig_h_indices(&hint_ones, &number_of_hints, k_idx, sig) != 0)
+    {
       return -1;
-    };
-    poly_use_hint(&w1, &w1, &tmp0);
-    polyw1_pack(w1_packed, &w1);
+    }
+    poly_use_hint_stack(&p, &p, &hint_ones, number_of_hints);
+
+    polyw1_pack(w1_packed, &p);
 
     shake256_inc_absorb(&s256, w1_packed, POLYW1_PACKEDBYTES);
   }
diff --git a/crypto_sign/dilithium3/m4fstack/stack.c b/crypto_sign/dilithium3/m4fstack/stack.c
index 0c7d2b41..716eccf6 100644
--- a/crypto_sign/dilithium3/m4fstack/stack.c
+++ b/crypto_sign/dilithium3/m4fstack/stack.c
@@ -3,6 +3,7 @@
 #include "symmetric.h"
 #include "vector.h"
 #include "reduce.h"
+#include "rounding.h"
 
 void poly_challenge_compress(uint8_t c[68], const poly *cp){
   unsigned int i, pos;
@@ -406,7 +407,7 @@ void poly_uniform_gamma1_add_stack(poly *a, poly *b, const uint8_t seed[CRHBYTES
 }
 
 
-static inline int32_t make_hint(int32_t z, int32_t r){
+static inline int32_t make_hint_stack(int32_t z, int32_t r){
   int32_t r1, v1;
 
   r1 = highbits(r);
@@ -429,7 +430,7 @@ size_t poly_make_hint_stack(poly *a, poly *t, uint8_t w[768]){
     // compute w - cs2 + c*t0
     coeff  = coeff + t->coeffs[i];
 
-    a->coeffs[i] = make_hint(-t->coeffs[i], coeff);
+    a->coeffs[i] = make_hint_stack(-t->coeffs[i], coeff);
     if(a->coeffs[i] == 1){
       hints_n++;
     }
@@ -458,6 +459,7 @@ void unpack_sk_stack(uint8_t rho[SEEDBYTES],
   sk += TRBYTES;
 }
 
+/* TODO: remove this function */
 /*************************************************
 * Name:        unpack_sig_h
 *
@@ -474,30 +476,78 @@ int unpack_sig_h(poly *h, unsigned int idx, const unsigned char sig[CRYPTO_BYTES
     sig += CTILDEBYTES;
     /* Decode h */
     unsigned int k = 0;
-    for (unsigned int i = 0; i < K; ++i) {
-        for (unsigned int j = 0; j < N; ++j) {
-            if (i == idx) {
-                h->coeffs[j] = 0;
-            }
-        }
 
-        if (sig[OMEGA + i] < k || sig[OMEGA + i] > OMEGA) {
+    if (idx > 0)
+    {
+        k = sig[OMEGA + (idx - 1)];
+    }
+    
+    for (unsigned int j = 0; j < N; ++j) {
+        h->coeffs[j] = 0;
+    }
+
+    if (sig[OMEGA + idx] < k || sig[OMEGA + idx] > OMEGA) {
+        return 1;
+    }
+
+    for (unsigned int j = k; j < sig[OMEGA + idx]; ++j) {
+        /* Coefficients are ordered for strong unforgeability */
+        if (j > k && sig[j] <= sig[j - 1]) {
             return 1;
         }
+        h->coeffs[sig[j]] = 1;
+    }
 
-        for (unsigned int j = k; j < sig[OMEGA + i]; ++j) {
-            /* Coefficients are ordered for strong unforgeability */
-            if (j > k && sig[j] <= sig[j - 1]) {
-                return 1;
-            }
-            if (i == idx) {
-                h->coeffs[sig[j]] = 1;
-            }
+    /* TODO: extract this check, redundant here */
+    k = sig[OMEGA + (K - 1)];
+    /* Extra indices are zero for strong unforgeability */
+    for (unsigned int j = k; j < OMEGA; ++j) {
+        if (sig[j]) {
+            return 1;
         }
+    }
+    return 0;
+}
 
-        k = sig[OMEGA + i];
+/*************************************************
+* Name:        unpack_sig_h_indices
+*
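+* Description: Extract the positions of the set hint bits of h[idx] from sig = (c, z, h).
+*
+* Arguments:   - uint8_t h_i[OMEGA], unsigned int *number_of_hints: output positions and their count
+*              - unsigned int idx: index of the hint polynomial to decode
+*              - const unsigned char sig[]: byte array containing bit-packed signature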
+*
+* Returns 1 in case of malformed signature; otherwise 0.
+**************************************************/
+int unpack_sig_h_indices(uint8_t h_i[OMEGA], unsigned int * number_of_hints, unsigned int idx, const unsigned char sig[CRYPTO_BYTES]) {
+    sig += L * POLYZ_PACKEDBYTES;
+    sig += CTILDEBYTES;
+    /* Decode h */
+    unsigned int k = 0;
+    unsigned int hidx = 0;
+
+    if (idx > 0)
+    {
+        k = sig[OMEGA + (idx - 1)];
     }
 
+    if (sig[OMEGA + idx] < k || sig[OMEGA + idx] > OMEGA) {
+        return 1;
+    }
+
+    for (unsigned int j = k; j < sig[OMEGA + idx]; ++j) {
+        /* Coefficients are ordered for strong unforgeability */
+        if (j > k && sig[j] <= sig[j - 1]) {
+            return 1;
+        }
+        h_i[hidx++] = sig[j];
+    }
+
+    *number_of_hints = hidx;
+
+    /* TODO: extract this check, redundant here */
+    k = sig[OMEGA + (K - 1)];
     /* Extra indices are zero for strong unforgeability */
     for (unsigned int j = k; j < OMEGA; ++j) {
         if (sig[j]) {
@@ -505,4 +555,40 @@ int unpack_sig_h(poly *h, unsigned int idx, const unsigned char sig[CRYPTO_BYTES
         }
     }
     return 0;
+}
+
+/*************************************************
+* Name:        poly_use_hint_stack
+*
+* Description: Use hint polynomial to correct the high bits of a polynomial.
+*
+* Arguments:   - poly *b: pointer to output polynomial with corrected high bits
+*              - const poly *a: pointer to input polynomial
+*              - uint8_t h_i[OMEGA], unsigned int number_of_hints: positions of the set hint bits and their count
+**************************************************/
+void poly_use_hint_stack(poly *b, const poly *a, uint8_t h_i[OMEGA], unsigned int number_of_hints) {
+  unsigned int i;
+  unsigned int in_list;
+
+  for(i = 0; i < N; ++i)
+  {
+    in_list = 0;
+    for (size_t hidx = 0; hidx < number_of_hints; hidx++)
+    {
+      if (i == h_i[hidx])
+      {
+        in_list = 1;
+        break;
+      }
+    }
+    if (in_list)
+    {
+      b->coeffs[i] = use_hint(a->coeffs[i], 1);
+    }
+    else
+    {
+      b->coeffs[i] = use_hint(a->coeffs[i], 0);
+    }
+    
+  }
 }
\ No newline at end of file
diff --git a/crypto_sign/dilithium3/m4fstack/stack.h b/crypto_sign/dilithium3/m4fstack/stack.h
index 2893b2b5..e07d8716 100644
--- a/crypto_sign/dilithium3/m4fstack/stack.h
+++ b/crypto_sign/dilithium3/m4fstack/stack.h
@@ -24,13 +24,15 @@ void poly_lowbits(poly *a0, const poly *a);
 
 void unpack_sk_s1(smallpoly *a, const uint8_t *sk, size_t idx);
 void unpack_sk_s2(smallpoly *a, const uint8_t *sk, size_t idx);
-unpack_sig_h(poly *h, unsigned int idx, const unsigned char sig[CRYPTO_BYTES]);
+int unpack_sig_h(poly *h, unsigned int idx, const unsigned char sig[CRYPTO_BYTES]);
 
 void poly_uniform_pointwise_montgomery_polywadd_stack(uint8_t wcomp[3*N], poly *b, uint8_t  seed[SEEDBYTES], uint16_t nonce, shake128incctx *state);
 void poly_uniform_gamma1_stack(poly *a, const uint8_t seed[CRHBYTES], uint16_t nonce, shake256incctx *state);
 void poly_uniform_gamma1_add_stack(poly *a, poly *b, const uint8_t seed[CRHBYTES], uint16_t nonce, shake256incctx *state);
 
 size_t poly_make_hint_stack(poly *a, poly *t, uint8_t w[768]);
+int unpack_sig_h_indices(uint8_t h_i[OMEGA], unsigned int * number_of_hints, unsigned int idx, const unsigned char sig[CRYPTO_BYTES]);
+void poly_use_hint_stack(poly *b, const poly *a, uint8_t h_i[OMEGA], unsigned int number_of_hints);
 
 // TODO: replace this with individual functions later
 void unpack_sk_stack(uint8_t rho[SEEDBYTES],

From 9870bec37c0846fb5ec8c8608f225a62137fb081 Mon Sep 17 00:00:00 2001
From: Amin Abdulrahman <amin@abdulrahman.de>
Date: Sun, 31 Mar 2024 14:53:50 -0400
Subject: [PATCH 26/32] rm buffers/unionize in Verify

---
 crypto_sign/dilithium3/m4fstack/sign.c | 50 ++++++++++++++++----------
 1 file changed, 31 insertions(+), 19 deletions(-)

diff --git a/crypto_sign/dilithium3/m4fstack/sign.c b/crypto_sign/dilithium3/m4fstack/sign.c
index c754b286..a509bf76 100644
--- a/crypto_sign/dilithium3/m4fstack/sign.c
+++ b/crypto_sign/dilithium3/m4fstack/sign.c
@@ -297,26 +297,38 @@ int crypto_sign_verify(const uint8_t *sig,
                        const uint8_t *pk)
 {
   unsigned int i;
-  unsigned int number_of_hints;
-  uint8_t w1_packed[POLYW1_PACKEDBYTES];
-  uint8_t rho[SEEDBYTES];
-  uint8_t mu[CRHBYTES];
-  uint8_t c2[CTILDEBYTES];
-  uint8_t hint_ones[OMEGA];
+  
   poly p;
 
-  uint8_t wcomp[768];
-  uint8_t ccomp[68];
+  union {
+    uint8_t w1_packed[POLYW1_PACKEDBYTES];
+    uint8_t wcomp[768];
+  } w1_packed_comp;
+  uint8_t *w1_packed = &w1_packed_comp.w1_packed;
+  uint8_t *wcomp  = &w1_packed_comp.wcomp;
+
+  union {
+    uint8_t ccomp[68];
+    uint8_t mu[CRHBYTES];
+  } ccomp_mu;
+  uint8_t *ccomp = &ccomp_mu.ccomp;
+  uint8_t *mu  = &ccomp_mu.mu;
 
-  shake128incctx s128;
   shake256incctx s256;
 
+  union {
+    uint8_t hint_ones[OMEGA];
+    shake128incctx s128;
+    uint8_t c2[CTILDEBYTES];
+  } shake_hint;
+
+  uint8_t *hint_ones   = &shake_hint.hint_ones;
+  shake128incctx *s128 = &shake_hint.s128;
+  uint8_t *c2          = &shake_hint.c2;
+
   if(siglen != CRYPTO_BYTES)
     return -1;
 
-  for(i = 0; i < SEEDBYTES; ++i)
-    rho[i] = pk[i];
-
   /* Compute CRH(h(rho, t1), msg) */
   shake256_inc_init(&s256);
   shake256_inc_absorb(&s256, pk, CRYPTO_PUBLICKEYBYTES);
@@ -329,13 +341,13 @@ int crypto_sign_verify(const uint8_t *sig,
   shake256_inc_finalize(&s256);
   shake256_inc_squeeze(mu, CRHBYTES, &s256);
 
+  shake256_inc_init(&s256);
+  shake256_inc_absorb(&s256, mu, CRHBYTES);
+
   /* Matrix-vector multiplication; compute Az - c2^dt1 */
   poly_challenge(&p, sig);
   poly_challenge_compress(ccomp, &p);
 
-  shake256_inc_init(&s256);
-  shake256_inc_absorb(&s256, mu, CRHBYTES);
-
   for (size_t k_idx = 0; k_idx < K; k_idx++) {
     for(size_t widx=0;widx<768;widx++){
         wcomp[widx] = 0;
@@ -346,14 +358,14 @@ int crypto_sign_verify(const uint8_t *sig,
       return -1;
     poly_ntt(&p);
     
-    poly_uniform_pointwise_montgomery_polywadd_stack(wcomp, &p, rho, (k_idx << 8) + 0, &s128);
+    poly_uniform_pointwise_montgomery_polywadd_stack(wcomp, &p, pk, (k_idx << 8) + 0, s128);
 
     for (size_t l_idx = 1; l_idx < L; l_idx++) {
       polyz_unpack(&p, sig + CTILDEBYTES + l_idx*POLYZ_PACKEDBYTES);
       if(poly_chknorm(&p, GAMMA1 - BETA))
         return -1;
       poly_ntt(&p);
-      poly_uniform_pointwise_montgomery_polywadd_stack(wcomp, &p, rho, (k_idx << 8) + l_idx, &s128);
+      poly_uniform_pointwise_montgomery_polywadd_stack(wcomp, &p, pk, (k_idx << 8) + l_idx, s128);
     }
     polyw_unpack(&p, wcomp);
     poly_reduce(&p);
@@ -368,11 +380,11 @@ int crypto_sign_verify(const uint8_t *sig,
     /* Reconstruct w1 */
     poly_caddq(&p);
 
-    if (unpack_sig_h_indices(&hint_ones, &number_of_hints, k_idx, sig) != 0)
+    if (unpack_sig_h_indices(hint_ones, &i, k_idx, sig) != 0)
     {
       return -1;
     }
-    poly_use_hint_stack(&p, &p, &hint_ones, number_of_hints);
+    poly_use_hint_stack(&p, &p, hint_ones, i);
 
     polyw1_pack(w1_packed, &p);
 

From 1d21996b1edea0a80f03baa1790ae6c16265a3f1 Mon Sep 17 00:00:00 2001
From: Amin Abdulrahman <amin@abdulrahman.de>
Date: Mon, 8 Apr 2024 15:42:28 +0200
Subject: [PATCH 27/32] Stack opt key pair * Minor clean up

---
 crypto_sign/dilithium3/m4fstack/sign.c  |  68 ++++++---
 crypto_sign/dilithium3/m4fstack/stack.c | 179 +++++++++++++++++-------
 crypto_sign/dilithium3/m4fstack/stack.h |  32 ++++-
 3 files changed, 203 insertions(+), 76 deletions(-)

diff --git a/crypto_sign/dilithium3/m4fstack/sign.c b/crypto_sign/dilithium3/m4fstack/sign.c
index a509bf76..edb4eaa7 100644
--- a/crypto_sign/dilithium3/m4fstack/sign.c
+++ b/crypto_sign/dilithium3/m4fstack/sign.c
@@ -24,12 +24,12 @@
 * Returns 0 (success)
 **************************************************/
 int crypto_sign_keypair(uint8_t *pk, uint8_t *sk) {
+  unsigned int i, j;
   uint8_t seedbuf[2*SEEDBYTES + CRHBYTES];
   uint8_t tr[TRBYTES];
   const uint8_t *rho, *rhoprime, *key;
-  polyvecl mat[K];
-  polyvecl s1, s1hat;
-  polyveck s2, t1, t0;
+
+  poly tA, tB, tC;
 
   /* Get randomness for rho, rhoprime and key */
   randombytes(seedbuf, SEEDBYTES);
@@ -38,31 +38,57 @@ int crypto_sign_keypair(uint8_t *pk, uint8_t *sk) {
   rhoprime = rho + SEEDBYTES;
   key = rhoprime + CRHBYTES;
 
-  /* Expand matrix */
-  polyvec_matrix_expand(mat, rho);
-
-  /* Sample short vectors s1 and s2 */
-  polyvecl_uniform_eta(&s1, rhoprime, 0);
-  polyveck_uniform_eta(&s2, rhoprime, L);
+  pack_sk_rho(sk, rho);
+  pack_sk_key(sk, key);
+  pack_pk_rho(pk, rho);
 
   /* Matrix-vector multiplication */
-  s1hat = s1;
-  polyvecl_ntt(&s1hat);
-  polyvec_matrix_pointwise_montgomery(&t1, mat, &s1hat);
-  polyveck_reduce(&t1);
-  polyveck_invntt_tomont(&t1);
+  for (i = 0; i < K; i++)
+  {
+    /* Expand part of s1 */
+    poly_uniform_eta(&tC, rhoprime, 0);
+    if (i == 0)
+    {
+      pack_sk_s1(sk, &tC, 0);
+    }
+    poly_ntt(&tC);
+    /* expand part of the matrix */
+    poly_uniform(&tB, rho, (i << 8) + 0);
+    /* partial matrix-vector multiplication */
+    poly_pointwise_montgomery(&tA, &tB, &tC);
+    for(j = 1; j < L; j++)
+    {
+      /* Expand part of s1 */
+      poly_uniform_eta(&tC, rhoprime, j);
+      if (i == 0)
+      {
+        pack_sk_s1(sk, &tC, j);
+      }
+      poly_ntt(&tC);
+      poly_uniform(&tB, rho, (i << 8) + j);
+      poly_pointwise_acc_montgomery(&tA, &tB, &tC);
+    }
+
+    poly_reduce(&tA);
+    poly_invntt_tomont(&tA);
 
-  /* Add error vector s2 */
-  polyveck_add(&t1, &t1, &s2);
+    /* Add error vector s2 */
+    /* Sample short vector s2 */
+    poly_uniform_eta(&tB, rhoprime, L + i);
+    pack_sk_s2(sk, &tB, i);
+    poly_add(&tA, &tA, &tB);
 
-  /* Extract t1 and write public key */
-  polyveck_caddq(&t1);
-  polyveck_power2round(&t1, &t0, &t1);
-  pack_pk(pk, rho, &t1);
+    /* Compute t{0,1} */
+    poly_caddq(&tA);
+    poly_power2round(&tC, &tB, &tA);
+    pack_sk_t0(sk, &tB, i);
+    pack_pk_t1(pk, &tC, i);
+
+  }
 
   /* Compute H(rho, t1) and write secret key */
   shake256(tr, TRBYTES, pk, CRYPTO_PUBLICKEYBYTES);
-  pack_sk(sk, rho, tr, key, &t0, &s1, &s2);
+  pack_sk_tr(sk, tr);
 
   return 0;
 }
diff --git a/crypto_sign/dilithium3/m4fstack/stack.c b/crypto_sign/dilithium3/m4fstack/stack.c
index 716eccf6..b1e09325 100644
--- a/crypto_sign/dilithium3/m4fstack/stack.c
+++ b/crypto_sign/dilithium3/m4fstack/stack.c
@@ -284,7 +284,7 @@ void unpack_sk_s2(smallpoly *a, const uint8_t *sk, size_t idx) {
 
 // TODO: in the end increase this buffer size as far as possible
 #define POLY_UNIFORM_BUFFERSIZE 3
-void poly_uniform_pointwise_montgomery_polywadd_stack(uint8_t wcomp[3*N], poly *b, uint8_t seed[SEEDBYTES], uint16_t nonce, shake128incctx *state){
+void poly_uniform_pointwise_montgomery_polywadd_stack(uint8_t wcomp[3*N], poly *b, const uint8_t seed[SEEDBYTES], uint16_t nonce, shake128incctx *state){
   int32_t t;
   uint8_t buf[POLY_UNIFORM_BUFFERSIZE*3];
   {
@@ -438,7 +438,6 @@ size_t poly_make_hint_stack(poly *a, poly *t, uint8_t w[768]){
   return hints_n;
 }
 
-// TODO: remove this later
 void unpack_sk_stack(uint8_t rho[SEEDBYTES],
                uint8_t tr[TRBYTES],
                uint8_t key[SEEDBYTES],
@@ -459,56 +458,6 @@ void unpack_sk_stack(uint8_t rho[SEEDBYTES],
   sk += TRBYTES;
 }
 
-/* TODO: remove this function */
-/*************************************************
-* Name:        unpack_sig_h
-*
-* Description: Unpack only h from signature sig = (c, z, h).
-*
-* Arguments:   - polyveck *h: pointer to output hint vector h
-*              - const unsigned char sig[]: byte array containing
-*                bit-packed signature
-*
-* Returns 1 in case of malformed signature; otherwise 0.
-**************************************************/
-int unpack_sig_h(poly *h, unsigned int idx, const unsigned char sig[CRYPTO_BYTES]) {
-    sig += L * POLYZ_PACKEDBYTES;
-    sig += CTILDEBYTES;
-    /* Decode h */
-    unsigned int k = 0;
-
-    if (idx > 0)
-    {
-        k = sig[OMEGA + (idx - 1)];
-    }
-    
-    for (unsigned int j = 0; j < N; ++j) {
-        h->coeffs[j] = 0;
-    }
-
-    if (sig[OMEGA + idx] < k || sig[OMEGA + idx] > OMEGA) {
-        return 1;
-    }
-
-    for (unsigned int j = k; j < sig[OMEGA + idx]; ++j) {
-        /* Coefficients are ordered for strong unforgeability */
-        if (j > k && sig[j] <= sig[j - 1]) {
-            return 1;
-        }
-        h->coeffs[sig[j]] = 1;
-    }
-
-    /* TODO: extract this check, redundant here */
-    k = sig[OMEGA + (K - 1)];
-    /* Extra indices are zero for strong unforgeability */
-    for (unsigned int j = k; j < OMEGA; ++j) {
-        if (sig[j]) {
-            return 1;
-        }
-    }
-    return 0;
-}
-
 /*************************************************
 * Name:        unpack_sig_h_indices
 *
@@ -591,4 +540,130 @@ void poly_use_hint_stack(poly *b, const poly *a, uint8_t h_i[OMEGA], unsigned in
     }
     
   }
+}
+
+/*************************************************
+* Name:        pack_pk_rho
+*
+* Description: Bit-pack only rho in public key pk = (rho, t1).
+*
+* Arguments:   - unsigned char pk[]: output byte array
+*              - const unsigned char rho[]: byte array containing rho
+**************************************************/
+void pack_pk_rho(unsigned char pk[CRYPTO_PUBLICKEYBYTES],
+                 const unsigned char rho[SEEDBYTES]) {
+    for (unsigned int i = 0; i < SEEDBYTES; ++i) {
+        pk[i] = rho[i];
+    }
+}
+
+/*************************************************
+* Name:        pack_pk_t1
+*
+* Description: Bit-pack only the t1 elem at idx in public key pk = (rho, t1).
+*
+* Arguments:   - unsigned char pk[]: output byte array
+*              - const poly *t1: pointer to the polynomial of t1 to pack
+*              - const unsigned int idx: index to the elem to pack
+**************************************************/
+void pack_pk_t1(unsigned char pk[CRYPTO_PUBLICKEYBYTES],
+             const poly *t1,
+             const unsigned int idx) {
+    pk += SEEDBYTES;
+    polyt1_pack(pk + idx * POLYT1_PACKEDBYTES, t1);
+}
+
+/*************************************************
+* Name:        pack_sk_s1
+*
+* Description: Bit-pack only some element of s1 in secret key sk = (rho, key, tr, s1, s2, t0).
+*
+* Arguments:   - unsigned char sk[]: output byte array
+*              - const poly *s1_elem: pointer to vector element idx in s1
+*              - const unsigned int idx: index to the element of s1 that should be packed
+**************************************************/
+void pack_sk_s1(unsigned char sk[CRYPTO_SECRETKEYBYTES],
+                const poly *s1_elem,
+                const unsigned int idx) {
+    sk += 2 * SEEDBYTES + TRBYTES;
+    polyeta_pack(sk + idx * POLYETA_PACKEDBYTES, s1_elem);
+}
+
+/*************************************************
+* Name:        pack_sk_s2
+*
+* Description: Bit-pack only some element of s2 in secret key sk = (rho, key, tr, s1, s2, t0).
+*
+* Arguments:   - unsigned char sk[]: output byte array
+*              - const poly *s2_elem: pointer to vector element idx in s2
+*              - const unsigned int idx: index to the element of s2 that should be packed
+**************************************************/
+void pack_sk_s2(unsigned char sk[CRYPTO_SECRETKEYBYTES],
+                const poly *s2_elem,
+                const unsigned int idx) {
+    sk += 2 * SEEDBYTES + TRBYTES + L * POLYETA_PACKEDBYTES;
+    polyeta_pack(sk + idx * POLYETA_PACKEDBYTES, s2_elem);
+}
+
+/*************************************************
+* Name:        pack_sk_t0
+*
+* Description: Bit-pack only some element of t0 in secret key sk = (rho, key, tr, s1, s2, t0).
+*
+* Arguments:   - unsigned char sk[]: output byte array
+*              - const poly *t0_elem: pointer to vector element idx in t0
+*              - const unsigned int idx: index to the element of t0 that should be packed
+**************************************************/
+void pack_sk_t0(unsigned char sk[CRYPTO_SECRETKEYBYTES],
+                const poly *t0_elem,
+                const unsigned int idx) {
+    sk += 2 * SEEDBYTES + TRBYTES + L * POLYETA_PACKEDBYTES + K * POLYETA_PACKEDBYTES;
+    polyt0_pack(sk + idx * POLYT0_PACKEDBYTES, t0_elem);
+}
+
+/*************************************************
+* Name:        pack_sk_rho
+*
+* Description: Bit-pack only rho in secret key sk = (rho, key, tr, s1, s2, t0).
+*
+* Arguments:   - unsigned char sk[]: output byte array
+*              - const unsigned char rho[]: byte array containing rho
+**************************************************/
+void pack_sk_rho(unsigned char sk[CRYPTO_SECRETKEYBYTES],
+                 const unsigned char rho[SEEDBYTES]) {
+  for (unsigned int i = 0; i < SEEDBYTES; ++i) {
+    sk[i] = rho[i];
+  }
+}
+
+/*************************************************
+* Name:        pack_sk_key
+*
+* Description: Bit-pack only key in secret key sk = (rho, key, tr, s1, s2, t0).
+*
+* Arguments:   - unsigned char sk[]: output byte array
+*              - const unsigned char key[]: byte array containing key
+**************************************************/
+void pack_sk_key(unsigned char sk[CRYPTO_SECRETKEYBYTES],
+                 const unsigned char key[SEEDBYTES]) {
+    sk += SEEDBYTES;
+    for (unsigned int i = 0; i < SEEDBYTES; ++i) {
+      sk[i] = key[i];
+    }
+}
+
+/*************************************************
+* Name:        pack_sk_tr
+*
+* Description: Bit-pack only tr in secret key sk = (rho, key, tr, s1, s2, t0).
+*
+* Arguments:   - unsigned char sk[]: output byte array
+*              - const unsigned char tr[]: byte array containing tr
+**************************************************/
+void pack_sk_tr(unsigned char sk[CRYPTO_SECRETKEYBYTES],
+                const unsigned char tr[TRBYTES]) {
+    sk += 2*SEEDBYTES;
+    for (unsigned int i = 0; i < TRBYTES; ++i) {
+        sk[i] = tr[i];
+    }
 }
\ No newline at end of file
diff --git a/crypto_sign/dilithium3/m4fstack/stack.h b/crypto_sign/dilithium3/m4fstack/stack.h
index e07d8716..47dbe50b 100644
--- a/crypto_sign/dilithium3/m4fstack/stack.h
+++ b/crypto_sign/dilithium3/m4fstack/stack.h
@@ -24,9 +24,8 @@ void poly_lowbits(poly *a0, const poly *a);
 
 void unpack_sk_s1(smallpoly *a, const uint8_t *sk, size_t idx);
 void unpack_sk_s2(smallpoly *a, const uint8_t *sk, size_t idx);
-int unpack_sig_h(poly *h, unsigned int idx, const unsigned char sig[CRYPTO_BYTES]);
 
-void poly_uniform_pointwise_montgomery_polywadd_stack(uint8_t wcomp[3*N], poly *b, uint8_t  seed[SEEDBYTES], uint16_t nonce, shake128incctx *state);
+void poly_uniform_pointwise_montgomery_polywadd_stack(uint8_t wcomp[3*N], poly *b, const uint8_t  seed[SEEDBYTES], uint16_t nonce, shake128incctx *state);
 void poly_uniform_gamma1_stack(poly *a, const uint8_t seed[CRHBYTES], uint16_t nonce, shake256incctx *state);
 void poly_uniform_gamma1_add_stack(poly *a, poly *b, const uint8_t seed[CRHBYTES], uint16_t nonce, shake256incctx *state);
 
@@ -34,9 +33,36 @@ size_t poly_make_hint_stack(poly *a, poly *t, uint8_t w[768]);
 int unpack_sig_h_indices(uint8_t h_i[OMEGA], unsigned int * number_of_hints, unsigned int idx, const unsigned char sig[CRYPTO_BYTES]);
 void poly_use_hint_stack(poly *b, const poly *a, uint8_t h_i[OMEGA], unsigned int number_of_hints);
 
-// TODO: replace this with individual functions later
 void unpack_sk_stack(uint8_t rho[SEEDBYTES],
                uint8_t tr[TRBYTES],
                uint8_t key[SEEDBYTES],
                const uint8_t sk[CRYPTO_SECRETKEYBYTES]);
+
+void pack_pk_rho(unsigned char pk[CRYPTO_PUBLICKEYBYTES],
+                 const unsigned char rho[SEEDBYTES]);
+
+void pack_pk_t1(unsigned char pk[CRYPTO_PUBLICKEYBYTES],
+             const poly *t1,
+             const unsigned int idx);
+
+void pack_sk_s1(unsigned char sk[CRYPTO_SECRETKEYBYTES],
+                const poly *s1_elem,
+                const unsigned int idx);
+
+void pack_sk_s2(unsigned char sk[CRYPTO_SECRETKEYBYTES],
+                const poly *s2_elem,
+                const unsigned int idx);
+
+void pack_sk_t0(unsigned char sk[CRYPTO_SECRETKEYBYTES],
+                const poly *t0_elem,
+                const unsigned int idx);
+
+void pack_sk_rho(unsigned char sk[CRYPTO_SECRETKEYBYTES],
+                 const unsigned char rho[SEEDBYTES]);
+
+void pack_sk_key(unsigned char sk[CRYPTO_SECRETKEYBYTES],
+                 const unsigned char key[SEEDBYTES]);
+
+void pack_sk_tr(unsigned char sk[CRYPTO_SECRETKEYBYTES],
+                const unsigned char tr[TRBYTES]);
 #endif
\ No newline at end of file

From 76b16c1bb0a8513757c1999158ab99bb252e13da Mon Sep 17 00:00:00 2001
From: Amin Abdulrahman <amin@abdulrahman.de>
Date: Mon, 8 Apr 2024 15:57:45 +0200
Subject: [PATCH 28/32] Overlap buffers

---
 crypto_sign/dilithium3/m4fstack/sign.c | 39 +++++++++++++++++---------
 1 file changed, 26 insertions(+), 13 deletions(-)

diff --git a/crypto_sign/dilithium3/m4fstack/sign.c b/crypto_sign/dilithium3/m4fstack/sign.c
index edb4eaa7..33df06fe 100644
--- a/crypto_sign/dilithium3/m4fstack/sign.c
+++ b/crypto_sign/dilithium3/m4fstack/sign.c
@@ -26,14 +26,27 @@
 int crypto_sign_keypair(uint8_t *pk, uint8_t *sk) {
   unsigned int i, j;
   uint8_t seedbuf[2*SEEDBYTES + CRHBYTES];
-  uint8_t tr[TRBYTES];
   const uint8_t *rho, *rhoprime, *key;
 
-  poly tA, tB, tC;
+  poly tA, tB;
+
+  union {
+    uint8_t tr[TRBYTES];
+    shake256incctx s256;
+    poly tC;
+  } data;
+
+  shake256incctx *s256 = &data.s256;
+  uint8_t *tr          = &data.tr;
+  poly *tC             = &data.tC;
 
   /* Get randomness for rho, rhoprime and key */
   randombytes(seedbuf, SEEDBYTES);
-  shake256(seedbuf, 2*SEEDBYTES + CRHBYTES, seedbuf, SEEDBYTES);
+  shake256_inc_init(s256);
+  shake256_inc_absorb(s256, seedbuf, SEEDBYTES);
+  shake256_inc_finalize(s256);
+  shake256_inc_squeeze(seedbuf, 2*SEEDBYTES + CRHBYTES, s256);
+
   rho = seedbuf;
   rhoprime = rho + SEEDBYTES;
   key = rhoprime + CRHBYTES;
@@ -46,27 +59,27 @@ int crypto_sign_keypair(uint8_t *pk, uint8_t *sk) {
   for (i = 0; i < K; i++)
   {
     /* Expand part of s1 */
-    poly_uniform_eta(&tC, rhoprime, 0);
+    poly_uniform_eta(tC, rhoprime, 0);
     if (i == 0)
     {
-      pack_sk_s1(sk, &tC, 0);
+      pack_sk_s1(sk, tC, 0);
     }
-    poly_ntt(&tC);
+    poly_ntt(tC);
     /* expand part of the matrix */
     poly_uniform(&tB, rho, (i << 8) + 0);
     /* partial matrix-vector multiplication */
-    poly_pointwise_montgomery(&tA, &tB, &tC);
+    poly_pointwise_montgomery(&tA, &tB, tC);
     for(j = 1; j < L; j++)
     {
       /* Expand part of s1 */
-      poly_uniform_eta(&tC, rhoprime, j);
+      poly_uniform_eta(tC, rhoprime, j);
       if (i == 0)
       {
-        pack_sk_s1(sk, &tC, j);
+        pack_sk_s1(sk, tC, j);
       }
-      poly_ntt(&tC);
+      poly_ntt(tC);
       poly_uniform(&tB, rho, (i << 8) + j);
-      poly_pointwise_acc_montgomery(&tA, &tB, &tC);
+      poly_pointwise_acc_montgomery(&tA, &tB, tC);
     }
 
     poly_reduce(&tA);
@@ -80,9 +93,9 @@ int crypto_sign_keypair(uint8_t *pk, uint8_t *sk) {
 
     /* Compute t{0,1} */
     poly_caddq(&tA);
-    poly_power2round(&tC, &tB, &tA);
+    poly_power2round(tC, &tB, &tA);
     pack_sk_t0(sk, &tB, i);
-    pack_pk_t1(pk, &tC, i);
+    pack_pk_t1(pk, tC, i);
 
   }
 

From e718f2eb3d4728e246ea3b5ecd9c848e9f017124 Mon Sep 17 00:00:00 2001
From: Amin Abdulrahman <amin@abdulrahman.de>
Date: Mon, 8 Apr 2024 17:05:53 +0200
Subject: [PATCH 29/32] Stack optimized challenge generation

---
 crypto_sign/dilithium3/m4fstack/sign.c  |  2 +-
 crypto_sign/dilithium3/m4fstack/stack.c | 46 +++++++++++++++++++++++++
 crypto_sign/dilithium3/m4fstack/stack.h |  1 +
 3 files changed, 48 insertions(+), 1 deletion(-)

diff --git a/crypto_sign/dilithium3/m4fstack/sign.c b/crypto_sign/dilithium3/m4fstack/sign.c
index 33df06fe..71cff9bb 100644
--- a/crypto_sign/dilithium3/m4fstack/sign.c
+++ b/crypto_sign/dilithium3/m4fstack/sign.c
@@ -384,7 +384,7 @@ int crypto_sign_verify(const uint8_t *sig,
   shake256_inc_absorb(&s256, mu, CRHBYTES);
 
   /* Matrix-vector multiplication; compute Az - c2^dt1 */
-  poly_challenge(&p, sig);
+  poly_challenge_stack(&p, sig);
   poly_challenge_compress(ccomp, &p);
 
   for (size_t k_idx = 0; k_idx < K; k_idx++) {
diff --git a/crypto_sign/dilithium3/m4fstack/stack.c b/crypto_sign/dilithium3/m4fstack/stack.c
index b1e09325..b45f7021 100644
--- a/crypto_sign/dilithium3/m4fstack/stack.c
+++ b/crypto_sign/dilithium3/m4fstack/stack.c
@@ -666,4 +666,50 @@ void pack_sk_tr(unsigned char sk[CRYPTO_SECRETKEYBYTES],
     for (unsigned int i = 0; i < TRBYTES; ++i) {
         sk[i] = tr[i];
     }
+}
+
+/*************************************************
+* Name:        poly_challenge_stack
+*
+* Description: Implementation of H. Samples polynomial with TAU nonzero
+*              coefficients in {-1,1} using the output stream of
+*              SHAKE256(seed). Stack optimized.
+*
+* Arguments:   - poly *c: pointer to output polynomial
+*              - const uint8_t seed[]: byte array containing seed of length SEEDBYTES
+**************************************************/
+#define CHALLENGE_STACK_BUF_SIZE 8
+void poly_challenge_stack(poly *c, const uint8_t seed[SEEDBYTES]) {
+  unsigned int i, b, pos;
+  uint64_t signs;
+  uint8_t buf[CHALLENGE_STACK_BUF_SIZE];
+  shake256incctx state;
+
+  shake256_inc_init(&state);
+  shake256_inc_absorb(&state, seed, SEEDBYTES);
+  shake256_inc_finalize(&state);
+  shake256_inc_squeeze(buf, CHALLENGE_STACK_BUF_SIZE, &state);
+  signs = 0;
+  for(i = 0; i < 8; ++i)
+  {
+    signs |= (uint64_t)buf[i] << 8*i;
+  }
+  pos = 8;
+
+  for(i = 0; i < N; ++i)
+    c->coeffs[i] = 0;
+  for(i = N-TAU; i < N; ++i) {
+    do {
+      if(pos >= CHALLENGE_STACK_BUF_SIZE) {
+        shake256_inc_squeeze(buf, CHALLENGE_STACK_BUF_SIZE, &state);
+        pos = 0;
+      }
+
+      b = buf[pos++];
+    } while(b > i);
+
+    c->coeffs[i] = c->coeffs[b];
+    c->coeffs[b] = 1 - 2*(signs & 1);
+    signs >>= 1;
+  }
 }
\ No newline at end of file
diff --git a/crypto_sign/dilithium3/m4fstack/stack.h b/crypto_sign/dilithium3/m4fstack/stack.h
index 47dbe50b..06c8c576 100644
--- a/crypto_sign/dilithium3/m4fstack/stack.h
+++ b/crypto_sign/dilithium3/m4fstack/stack.h
@@ -28,6 +28,7 @@ void unpack_sk_s2(smallpoly *a, const uint8_t *sk, size_t idx);
 void poly_uniform_pointwise_montgomery_polywadd_stack(uint8_t wcomp[3*N], poly *b, const uint8_t  seed[SEEDBYTES], uint16_t nonce, shake128incctx *state);
 void poly_uniform_gamma1_stack(poly *a, const uint8_t seed[CRHBYTES], uint16_t nonce, shake256incctx *state);
 void poly_uniform_gamma1_add_stack(poly *a, poly *b, const uint8_t seed[CRHBYTES], uint16_t nonce, shake256incctx *state);
+void poly_challenge_stack(poly *c, const uint8_t seed[SEEDBYTES]);
 
 size_t poly_make_hint_stack(poly *a, poly *t, uint8_t w[768]);
 int unpack_sig_h_indices(uint8_t h_i[OMEGA], unsigned int * number_of_hints, unsigned int idx, const unsigned char sig[CRYPTO_BYTES]);

From a37b31186f7d7702a50c4816cc9eea5982faee19 Mon Sep 17 00:00:00 2001
From: Amin Abdulrahman <amin@abdulrahman.de>
Date: Tue, 9 Apr 2024 16:11:37 +0200
Subject: [PATCH 30/32] Match 769 Plantard to m4f code

---
 crypto_sign/dilithium2/m4fstack/smallntt.S    |   1 -
 .../dilithium2/m4fstack/smallntt_769.S        |   1 +
 crypto_sign/dilithium3/m4fstack/macros_fnt.i  | 158 ------------------
 .../dilithium3/m4fstack/macros_smallntt.i     |  24 ++-
 crypto_sign/dilithium3/m4fstack/smallntt.h    |  23 ++-
 .../m4fstack/{smallntt.S => smallntt_769.S}   |  24 ++-
 crypto_sign/dilithium5/m4fstack/smallntt.S    |   1 -
 .../dilithium5/m4fstack/smallntt_769.S        |   1 +
 8 files changed, 60 insertions(+), 173 deletions(-)
 delete mode 120000 crypto_sign/dilithium2/m4fstack/smallntt.S
 create mode 120000 crypto_sign/dilithium2/m4fstack/smallntt_769.S
 delete mode 100644 crypto_sign/dilithium3/m4fstack/macros_fnt.i
 rename crypto_sign/dilithium3/m4fstack/{smallntt.S => smallntt_769.S} (94%)
 delete mode 120000 crypto_sign/dilithium5/m4fstack/smallntt.S
 create mode 120000 crypto_sign/dilithium5/m4fstack/smallntt_769.S

diff --git a/crypto_sign/dilithium2/m4fstack/smallntt.S b/crypto_sign/dilithium2/m4fstack/smallntt.S
deleted file mode 120000
index 7e2174f9..00000000
--- a/crypto_sign/dilithium2/m4fstack/smallntt.S
+++ /dev/null
@@ -1 +0,0 @@
-../../dilithium3/m4fstack/smallntt.S
\ No newline at end of file
diff --git a/crypto_sign/dilithium2/m4fstack/smallntt_769.S b/crypto_sign/dilithium2/m4fstack/smallntt_769.S
new file mode 120000
index 00000000..6300683f
--- /dev/null
+++ b/crypto_sign/dilithium2/m4fstack/smallntt_769.S
@@ -0,0 +1 @@
+../../dilithium3/m4fstack/smallntt_769.S
\ No newline at end of file
diff --git a/crypto_sign/dilithium3/m4fstack/macros_fnt.i b/crypto_sign/dilithium3/m4fstack/macros_fnt.i
deleted file mode 100644
index 25903e41..00000000
--- a/crypto_sign/dilithium3/m4fstack/macros_fnt.i
+++ /dev/null
@@ -1,158 +0,0 @@
-// 2
-.macro ldrstr2 ldrstr, target, c0, c1, mem0, mem1
-    \ldrstr \c0, [\target, \mem0]
-    \ldrstr \c1, [\target, \mem1]
-.endm
-
-// 2
-.macro ldrstr2jump ldrstr, target, c0, c1, mem1, jump
-    \ldrstr \c1, [\target, \mem1]
-    \ldrstr \c0, [\target], \jump
-.endm
-
-// 4
-.macro ldrstr4 ldrstr, target, c0, c1, c2, c3, mem0, mem1, mem2, mem3
-    \ldrstr \c0, [\target, \mem0]
-    \ldrstr \c1, [\target, \mem1]
-    \ldrstr \c2, [\target, \mem2]
-    \ldrstr \c3, [\target, \mem3]
-.endm
-
-// 4
-.macro ldrstr4jump ldrstr, target, c0, c1, c2, c3, mem1, mem2, mem3, jump
-    \ldrstr \c1, [\target, \mem1]
-    \ldrstr \c2, [\target, \mem2]
-    \ldrstr \c3, [\target, \mem3]
-    \ldrstr \c0, [\target], \jump
-.endm
-
-// 8
-.macro ldrstrvec ldrstr, target, c0, c1, c2, c3, c4, c5, c6, c7, mem0, mem1, mem2, mem3, mem4, mem5, mem6, mem7
-    ldrstr4 \ldrstr, \target, \c0, \c1, \c2, \c3, \mem0, \mem1, \mem2, \mem3
-    ldrstr4 \ldrstr, \target, \c4, \c5, \c6, \c7, \mem4, \mem5, \mem6, \mem7
-.endm
-
-// 8
-.macro ldrstrvecjump ldrstr, target, c0, c1, c2, c3, c4, c5, c6, c7, mem1, mem2, mem3, mem4, mem5, mem6, mem7, jump
-    ldrstr4 \ldrstr, \target, \c4, \c5, \c6, \c7, \mem4, \mem5, \mem6, \mem7
-    ldrstr4jump \ldrstr, \target, \c0, \c1, \c2, \c3, \mem1, \mem2, \mem3, \jump
-.endm
-
-
-
-.macro addSub1 c0, c1
-    add.w \c0, \c1
-    sub.w \c1, \c0, \c1, lsl #1
-.endm
-
-.macro addSub2 c0, c1, c2, c3
-    add \c0, \c1
-    add \c2, \c3
-    sub.w \c1, \c0, \c1, lsl #1
-    sub.w \c3, \c2, \c3, lsl #1
-.endm
-
-.macro addSub4 c0, c1, c2, c3, c4, c5, c6, c7
-    add \c0, \c1
-    add \c2, \c3
-    add \c4, \c5
-    add \c6, \c7
-    sub.w \c1, \c0, \c1, lsl #1
-    sub.w \c3, \c2, \c3, lsl #1
-    sub.w \c5, \c4, \c5, lsl #1
-    sub.w \c7, \c6, \c7, lsl #1
-.endm
-
-// 2
-.macro barrett_32 a, Qbar, Q, tmp
-    smmulr.w \tmp, \a, \Qbar
-    mls.w \a, \tmp, \Q, \a
-.endm
-
-.macro FNT_CT_butterfly c0, c1, logW
-    add.w \c0, \c0, \c1, lsl #\logW
-    sub.w \c1, \c0, \c1, lsl #(\logW+1)
-.endm
-
-.macro shift_subAdd c0, c1, shlv
-    sub.w \c0, \c0, \c1, lsl #(\shlv)
-    add.w \c1, \c0, \c1, lsl #(\shlv+1)
-.endm
-
-.macro FNT_CT_ibutterfly c0, c1, shlv
-    shift_subAdd \c0, \c1, \shlv
-.endm
-
-// 46
-.macro _3_layer_CT_32_FNT c0, c1, c2, c3, c4, c5, c6, c7, xi0, xi1, xi2, xi3, xi4, xi5, xi6, twiddle, Qprime, Q, tmp, tmp2
-    vmov.w \twiddle, \xi0
-
-    // c0, c1, c2, c3, c4, c5, c6, c7, c8
-    // 0,4
-    mla \tmp, \c4, \twiddle, \c0
-    mls \c4, \c4, \twiddle, \c0
-
-    // 1,5
-    mla \c0, \c5, \twiddle, \c1
-    mls \c5, \c5, \twiddle, \c1
-
-    // 2,6
-    mla \c1, \c6, \twiddle, \c2
-    mls \c6, \c6, \twiddle, \c2
-
-    // 3,7
-    mla \c2, \c7, \twiddle, \c3
-    mls \c7, \c7, \twiddle, \c3
-
-    // tmp, c0, c1, c2, c4, c5, c6, c7
-
-    barrett_32 \tmp, \Qprime, \Q, \c3
-    barrett_32 \c0, \Qprime, \Q, \c3
-    barrett_32 \c1, \Qprime, \Q, \c3
-    barrett_32 \c2, \Qprime, \Q, \c3
-    barrett_32 \c4, \Qprime, \Q, \c3
-    barrett_32 \c5, \Qprime, \Q, \c3
-    barrett_32 \c6, \Qprime, \Q, \c3
-    barrett_32 \c7, \Qprime, \Q, \c3
-
-    vmov.w \twiddle, \xi1
-    // 0,2
-    mla \tmp2, \c1, \twiddle, \tmp
-    mls \c3, \c1, \twiddle, \tmp
-
-    // 1,3
-    mla \tmp, \c2, \twiddle, \c0
-    mls \c0, \c2, \twiddle, \c0
-
-    vmov.w \twiddle, \xi2
-
-    // 4,6
-    mla \c2, \c6, \twiddle, \c4
-    mls \c1, \c6, \twiddle, \c4
-
-    // 5,7
-    mla \c6, \c7, \twiddle, \c5
-    mls \c7, \c7, \twiddle, \c5
-
-    // tmp2, tmp, c3, c0 | c2, c6, c1, c7
-
-    // 4,5
-    vmov.w \twiddle, \xi5
-    mla \c4, \c6, \twiddle, \c2
-    mls \c5, \c6, \twiddle, \c2
-
-    // 6,7
-    vmov.w \twiddle, \xi6
-    mla \c6, \c7, \twiddle, \c1
-    mls \c7, \c7, \twiddle, \c1
-
-    // 2,3
-    vmov.w \twiddle, \xi4
-    mla \c2, \c0, \twiddle, \c3
-    mls \c3, \c0, \twiddle, \c3
-
-    // 0,1
-    vmov.w \twiddle, \xi3
-    mla \c0, \tmp, \twiddle, \tmp2
-    mls \c1, \tmp, \twiddle, \tmp2
-.endm
\ No newline at end of file
diff --git a/crypto_sign/dilithium3/m4fstack/macros_smallntt.i b/crypto_sign/dilithium3/m4fstack/macros_smallntt.i
index b97f4d52..7c9a387c 100644
--- a/crypto_sign/dilithium3/m4fstack/macros_smallntt.i
+++ b/crypto_sign/dilithium3/m4fstack/macros_smallntt.i
@@ -1,9 +1,23 @@
 /* 
-* NTT and inverse NTT code from: 
-* Huang, J. et al. 2024. Revisiting Keccak and Dilithium Implementations on ARMv7-M. 
-* IACR Transactions on Cryptographic Hardware and Embedded Systems. 2024, 2 (Mar. 2024), 1–24.
-* DOI:https://doi.org/10.46586/tches.v2024.i2.1-24.
-* https://github.com/UIC-ESLAS/Dilithium-Multi-Moduli/blob/332a32cc02d407020e48a4f9b3a0dc78d4c8b0bc/M4/crypto_sign/dilithium3/m4plant/smallntt_769.S
+ * Copyright (c) 2023 Junhao Huang (jhhuang_nuaa@126.com)
+ *
+ * Licensed under the Apache License, Version 2.0(the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * NTT and inverse NTT code from: 
+ * Huang, J. et al. 2024. Revisiting Keccak and Dilithium Implementations on ARMv7-M. 
+ * IACR Transactions on Cryptographic Hardware and Embedded Systems. 2024, 2 (Mar. 2024), 1–24.
+ * DOI:https://doi.org/10.46586/tches.v2024.i2.1-24.
+ * https://github.com/UIC-ESLAS/Dilithium-Multi-Moduli/blob/332a32cc02d407020e48a4f9b3a0dc78d4c8b0bc/M4/crypto_sign/dilithium3/m4plant/smallntt_769.S
 */
 
 #ifndef MACROS_SMALLNTT_I
diff --git a/crypto_sign/dilithium3/m4fstack/smallntt.h b/crypto_sign/dilithium3/m4fstack/smallntt.h
index c3fd065f..244fad24 100644
--- a/crypto_sign/dilithium3/m4fstack/smallntt.h
+++ b/crypto_sign/dilithium3/m4fstack/smallntt.h
@@ -1,9 +1,27 @@
+/**
+ * Copyright (c) 2023 Junhao Huang (jhhuang_nuaa@126.com)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
 #ifndef SMALLNTT_H
 #define SMALLNTT_H
 
 #include <stdint.h>
 #include "params.h"
 
+#define SMALL_Q 769
+
 static const int32_t zetas_769[64] = {
 	3138844760, 1334846793, 999738812, 1854264165, 1681125041, 1150537404, 2820492178, 3071823164, 726067294, 2066499220, 3272887953, 1055590142, 4255871365, 1871019564, 2731130050, 1826338500, 513832239, 1792827701, 3373420347, 2993631302, 1161707670, 3306398751, 3518633806, 3406931146, 1586177780, 3853741788, 3317569017, 3825816122, 971813147, 122872927, 217820188, 619949766, 3753209393, 770748358, 4099487641, 765163225, 3630336467, 1742561504, 3479537875, 982983413, 2809321912, 2379266669, 703726762, 681386230, 4110657907, 1457719720, 1217559000, 2474213930, 1195218468, 1089100940, 564098436, 614364633, 3635921600, 2088839752, 3702943196, 1949211426, 2569161192, 374203913, 3982199847, 2083254619, 1513571050, 3647091866, 413299844, 4149753838};
 
@@ -16,13 +34,12 @@ static const int32_t zetas_inv_asm_769[256] = {
 	// removed first "2285" + LAYER 3+2+1 - 1 - butterfly
 	5585134, -346278248, 5585134, -966228013, -346278248, -223405321, 636705165, 446810642, 1519156183, 11170266, -821014555, -1932456027, 301597183, -692556495, -240160720, 1061175275, -1368357591, -519417371, -335107981, 2139105948, -698141628, -625534899, -1267825197, 843355087, 290426917, 128458060, 1295750862, -748407825, -826599688, 1736976371, -240160720, 2005062756, 1061175275, 1100271206, -1368357591, 502661972, 915961816, 1396283256, 452395775, -1038834743, -955057747, -670215963, 2016233022, -16755399, -1675539907, 1614103444, -1290165729, 94947261, 753992958, -1591762912, 497076839, -1954796559, 1943626293, -1122611738, -1239899531, 938302348, -245745853, 882451018, -435640376, -966228013, 1736976371, -318352582, -240160720, -1401868389, 2005062756, 1016494210, 714897027, -1005323944, 876865885, 2122350549, -1373942724, -2094424884, 1468889985, 1558252114, -1401868389, -686971362, -357448514, 860110486, 1524741316, -1787242568, -44681064, 1407453522, -368618780, 1323676527, -653460564, -1362772458, 1379527857, -463566041, 1859849297, 150798592, -1675539907, 804259156, 1614103444, -67021596, -1290165729, -139628326, -2060914086, -994153678, 55851330, 189894523, -1072345541, 1507985917, 832184821, 1111441472, 2105595150, -525002504, -1809583100, 212235055, 1938041160, -273671518, 100532394, -2044158687, -78191862, 1452134586, 642290298, -2111180283, 552928169, 161968858, -1167292802, -346278248, -966228013, -223405321, 1736976371, 150798592, -318352582, -759578091, -1608518311, -2032988421, -899206417, -480321440, 943887481, 1491230518, -83776995, -284841784, 2005062756, 1100271206, 502661972, 1669954774, -1139367137, -457980908, 1921285760, 1128196871, -1318091394, -1904530361, 396544445, -1228729265, 117287794, 2116765416, 1184048201, -318352582, -1401868389, 1016494210, -686971362, -1413038655, -357448514, 1709050706, -731652426, 89362128, 2021818155, 1720220972, -1882189829, -1245484665, -798674023, 720482160, 804259156, -67021596, -139628326, 
-536172770, -1731391238, -1117026605, -27925665, -1843093898, -1971551958, 1027664477, 1776072302, -1692295306, 1977137091, 709311894, 1552666981, -223405321, 150798592, -759578091, -1675539907, 2105595150, 804259156, -1697880440, -675801096, 279256651, 949472614, -1066760408, -1050005009, -134043193, 1262240064, 1714635839, 1016494210, -1413038655, 1709050706, 1206388733, 1748146637, -1781657435, -1010909077, -390959312, -1329261660, -1083515807, -1965966825, -1530326449, 809844289, -1541496715, 1630858843, -759578091, 2105595150, -1697880440, -525002504, 631120032, -1809583100, -474736307, -1575007513, -201064789, 1893360095, 424470110, -1133782004, -418884977, -1424208921, -547343036, -1697880440, 631120032, -474736307, 1580592646, 1435379187, 787503756, 1200803600, 1999477623, -932717215, 1982722224, -1848679031, 586438968, 1993892490, 1625273710, -1346017059, 0};
 
-
-#define SMALL_Q 769
-
+// Q1=769
 void small_ntt_asm_769(int16_t a[N], const int32_t * zetas);
 void small_invntt_asm_769(int16_t a[N], const int32_t * zetas);
 void small_basemul_asm_769(int16_t *c, const int16_t *a, const int16_t *b, const int32_t *zetas);
 
+// small NTT for computing cs0 and cs1
 #define small_ntt(a) small_ntt_asm_769(a, zetas_asm_769)
 #define small_invntt_tomont(a) small_invntt_asm_769(a, zetas_inv_asm_769)
 #define small_basemul(r,a,b) small_basemul_asm_769(r, a, b, zetas_769)
diff --git a/crypto_sign/dilithium3/m4fstack/smallntt.S b/crypto_sign/dilithium3/m4fstack/smallntt_769.S
similarity index 94%
rename from crypto_sign/dilithium3/m4fstack/smallntt.S
rename to crypto_sign/dilithium3/m4fstack/smallntt_769.S
index 9f048042..1c3c9a88 100644
--- a/crypto_sign/dilithium3/m4fstack/smallntt.S
+++ b/crypto_sign/dilithium3/m4fstack/smallntt_769.S
@@ -1,9 +1,23 @@
 /* 
-* NTT and inverse NTT code from: 
-* Huang, J. et al. 2024. Revisiting Keccak and Dilithium Implementations on ARMv7-M. 
-* IACR Transactions on Cryptographic Hardware and Embedded Systems. 2024, 2 (Mar. 2024), 1–24.
-* DOI:https://doi.org/10.46586/tches.v2024.i2.1-24.
-* https://github.com/UIC-ESLAS/Dilithium-Multi-Moduli/blob/332a32cc02d407020e48a4f9b3a0dc78d4c8b0bc/M4/crypto_sign/dilithium3/m4plant/smallntt_769.S
+ * Copyright (c) 2023 Junhao Huang (jhhuang_nuaa@126.com)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * NTT and inverse NTT code from: 
+ * Huang, J. et al. 2024. Revisiting Keccak and Dilithium Implementations on ARMv7-M. 
+ * IACR Transactions on Cryptographic Hardware and Embedded Systems. 2024, 2 (Mar. 2024), 1–24.
+ * DOI:https://doi.org/10.46586/tches.v2024.i2.1-24.
+ * https://github.com/UIC-ESLAS/Dilithium-Multi-Moduli/blob/332a32cc02d407020e48a4f9b3a0dc78d4c8b0bc/M4/crypto_sign/dilithium3/m4plant/smallntt_769.S
 */
 
 #include "macros.i"
diff --git a/crypto_sign/dilithium5/m4fstack/smallntt.S b/crypto_sign/dilithium5/m4fstack/smallntt.S
deleted file mode 120000
index 7e2174f9..00000000
--- a/crypto_sign/dilithium5/m4fstack/smallntt.S
+++ /dev/null
@@ -1 +0,0 @@
-../../dilithium3/m4fstack/smallntt.S
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4fstack/smallntt_769.S b/crypto_sign/dilithium5/m4fstack/smallntt_769.S
new file mode 120000
index 00000000..6300683f
--- /dev/null
+++ b/crypto_sign/dilithium5/m4fstack/smallntt_769.S
@@ -0,0 +1 @@
+../../dilithium3/m4fstack/smallntt_769.S
\ No newline at end of file

From d401a156c7a725674c06cfab9ab9e23163054367 Mon Sep 17 00:00:00 2001
From: "Matthias J. Kannwischer" <matthias@kannwischer.eu>
Date: Mon, 15 Apr 2024 15:32:51 +0800
Subject: [PATCH 31/32] update skiplist

---
 skiplist.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/skiplist.py b/skiplist.py
index 47192e42..b97c1b84 100644
--- a/skiplist.py
+++ b/skiplist.py
@@ -237,4 +237,7 @@
     {'scheme': 'tuov_is_pkc_skc', 'implementation': 'ref', 'estmemory': 1275904},
     {'scheme': 'tuov_v_pkc', 'implementation': 'ref', 'estmemory': 7083008},
     {'scheme': 'tuov_v_pkc_skc', 'implementation': 'ref', 'estmemory': 4639744},
+    {'scheme': 'dilithium2', 'implementation': 'm4fstack', 'estmemory': 12288},
+    {'scheme': 'dilithium5', 'implementation': 'm4fstack', 'estmemory': 21504},
+    {'scheme': 'dilithium3', 'implementation': 'm4fstack', 'estmemory': 17408},
 ]

From c013920b8028db39fc0ef52f62dd62088cf54d9f Mon Sep 17 00:00:00 2001
From: "Matthias J. Kannwischer" <matthias@kannwischer.eu>
Date: Tue, 16 Apr 2024 06:55:18 +0800
Subject: [PATCH 32/32] update benchmarks

---
 benchmarks.csv | 32 ++++++++++++++++++++++----------
 benchmarks.md  | 34 +++++++++++++++++++++++-----------
 2 files changed, 45 insertions(+), 21 deletions(-)

diff --git a/benchmarks.csv b/benchmarks.csv
index 981accb6..e3bb1bbb 100644
--- a/benchmarks.csv
+++ b/benchmarks.csv
@@ -42,12 +42,15 @@ cross-sha3-r-sdpg-1-fast (10 executions),ref,290136,287742,297758,29963868,29960
 cross-sha3-r-sdpg-1-small (10 executions),ref,290135,287741,297757,102853622,102847774,102861948,75137510,75126803,75159685
 cross-sha3-r-sdpg-3-fast (10 executions),ref,627948,625525,637639,43573841,43565461,43582933,27513830,27493024,27525746
 cross-sha3-r-sdpg-5-fast (10 executions),ref,1146280,1142409,1153794,93557878,93547167,93566329,59948216,59857434,60043852
-dilithium2 (90 executions),clean,1873447,1838554,1903845,7846622,3321671,28761609,2062804,2062332,2063181
-dilithium2 (100 executions),m4f,1427684,1390524,1466437,4219137,1813668,12587382,1417706,1417251,1418128
+dilithium2 (1000 executions),clean,1874167,1827645,1914566,7493877,3321630,40762756,2062795,2062255,2063222
+dilithium2 (1000 executions),m4f,1426036,1379636,1466394,3807970,1813656,18528070,1417745,1417203,1418192
+dilithium2 (1000 executions),m4fstack,1801523,1684895,1902114,12170976,3900911,86281518,3241353,3194028,3281144
 dilithium3 (1000 executions),clean,3205551,3204090,3207411,12696585,5097364,74392293,3376992,3376581,3377393
 dilithium3 (1000 executions),m4f,2515969,2514498,2517634,5884832,2917322,25268693,2411257,2410858,2411717
-dilithium5 (90 executions),clean,5346066,5287239,5395626,15205929,7953360,49173429,5609664,5609137,5610119
-dilithium5 (100 executions),m4f,4273211,4210308,4329697,8062110,4882708,18398575,4185407,4184878,4185954
+dilithium3 (1000 executions),m4fstack,3412759,3406659,3419247,23673016,6733971,145803146,5733307,5688893,5778120
+dilithium5 (1000 executions),clean,5341477,5286872,5395822,15710371,7953367,75940093,5609679,5609217,5610183
+dilithium5 (1000 executions),m4f,4275029,4210286,4329519,7977781,4882524,25936176,4185417,4184925,4185896
+dilithium5 (1000 executions),m4fstack,5816287,5474236,6115061,33452872,11170780,185259803,9912851,9845789,9981834
 falcon-1024 (10 executions),m4-ct,354880005,284902033,635131652,87741288,87506676,87922628,991320,982548,997219
 falcon-1024 (10 executions),opt-ct,555202324,284912829,1157528581,87710190,87606677,87841235,993584,983066,997523
 falcon-1024 (10 executions),opt-leaktime,438412062,334858742,625013074,80139483,79891200,80551967,994127,984891,997390
@@ -190,11 +193,14 @@ cross-sha3-r-sdpg-1-small,ref,2328,466400,245512,,,,,,
 cross-sha3-r-sdpg-3-fast,ref,4032,205080,108236,,,,,,
 cross-sha3-r-sdpg-5-fast,ref,6824,398600,213436,,,,,,
 dilithium2,clean,38304,51968,36192,,,,,,
-dilithium2,m4f,38296,49416,36184,,,,,,
+dilithium2,m4f,38296,49416,36220,,,,,,
+dilithium2,m4fstack,4408,5072,2704,,,,,,
 dilithium3,clean,60832,79616,57728,,,,,,
 dilithium3,m4f,60824,68864,57720,,,,,,
+dilithium3,m4fstack,4408,6608,2704,,,,,,
 dilithium5,clean,97696,122724,92940,,,,,,
-dilithium5,m4f,97688,116076,92824,,,,,,
+dilithium5,m4f,97688,116076,92932,,,,,,
+dilithium5,m4fstack,4408,8136,2712,,,,,,
 falcon-1024,clean,34988,84604,8784,,,,,,
 falcon-1024,m4-ct,1156,2508,376,,,,,,
 falcon-1024,opt-ct,1156,2508,376,,,,,,
@@ -339,12 +345,15 @@ cross-sha3-r-sdpg-1-fast,ref,71.8,74.8,77.1,,,,,,
 cross-sha3-r-sdpg-1-small,ref,71.8,74.7,78.4,,,,,,
 cross-sha3-r-sdpg-3-fast,ref,71.7,68.2,68.7,,,,,,
 cross-sha3-r-sdpg-5-fast,ref,71.1,66.1,66.8,,,,,,
-dilithium2,clean,60.9,30.2,52.9,,,,,,
-dilithium2,m4f,79.9,62.2,76.8,,,,,,
+dilithium2,clean,61.0,30.9,52.9,,,,,,
+dilithium2,m4f,79.9,60.6,76.8,,,,,,
+dilithium2,m4fstack,74.8,55.2,40.8,,,,,,
 dilithium3,clean,64.7,31.3,56.8,,,,,,
 dilithium3,m4f,82.3,60.3,79.4,,,,,,
-dilithium5,clean,67.0,38.4,61.1,,,,,,
-dilithium5,m4f,83.4,63.5,81.7,,,,,,
+dilithium3,m4fstack,77.1,54.6,41.0,,,,,,
+dilithium5,clean,67.0,35.7,61.1,,,,,,
+dilithium5,m4f,83.5,65.0,81.7,,,,,,
+dilithium5,m4fstack,76.1,54.5,42.6,,,,,,
 falcon-1024,clean,6.5,0.3,23.7,,,,,,
 falcon-1024,m4-ct,7.4,0.4,32.4,,,,,,
 falcon-1024,opt-ct,11.7,0.4,32.2,,,,,,
@@ -490,10 +499,13 @@ cross-sha3-r-sdpg-3-fast,ref,19689,0,208,19897,,,,,
 cross-sha3-r-sdpg-5-fast,ref,18593,0,208,18801,,,,,
 dilithium2,clean,8064,0,0,8064,,,,,
 dilithium2,m4f,18596,0,0,18596,,,,,
+dilithium2,m4fstack,24184,0,0,24184,,,,,
 dilithium3,clean,7580,0,0,7580,,,,,
 dilithium3,m4f,18588,0,0,18588,,,,,
+dilithium3,m4fstack,23448,0,0,23448,,,,,
 dilithium5,clean,7808,0,0,7808,,,,,
 dilithium5,m4f,18468,0,0,18468,,,,,
+dilithium5,m4fstack,23820,0,0,23820,,,,,
 falcon-1024,clean,82647,0,0,82647,,,,,
 falcon-1024,m4-ct,81825,0,79872,161697,,,,,
 falcon-1024,opt-ct,81825,0,79872,161697,,,,,
diff --git a/benchmarks.md b/benchmarks.md
index 5574fe2c..5aef4137 100644
--- a/benchmarks.md
+++ b/benchmarks.md
@@ -44,12 +44,15 @@
 | cross-sha3-r-sdpg-1-small (10 executions) | ref | AVG: 290,135 <br /> MIN: 287,741 <br /> MAX: 297,757 | AVG: 102,853,622 <br /> MIN: 102,847,774 <br /> MAX: 102,861,948 | AVG: 75,137,510 <br /> MIN: 75,126,803 <br /> MAX: 75,159,685 |
 | cross-sha3-r-sdpg-3-fast (10 executions) | ref | AVG: 627,948 <br /> MIN: 625,525 <br /> MAX: 637,639 | AVG: 43,573,841 <br /> MIN: 43,565,461 <br /> MAX: 43,582,933 | AVG: 27,513,830 <br /> MIN: 27,493,024 <br /> MAX: 27,525,746 |
 | cross-sha3-r-sdpg-5-fast (10 executions) | ref | AVG: 1,146,280 <br /> MIN: 1,142,409 <br /> MAX: 1,153,794 | AVG: 93,557,878 <br /> MIN: 93,547,167 <br /> MAX: 93,566,329 | AVG: 59,948,216 <br /> MIN: 59,857,434 <br /> MAX: 60,043,852 |
-| dilithium2 (90 executions) | clean | AVG: 1,873,447 <br /> MIN: 1,838,554 <br /> MAX: 1,903,845 | AVG: 7,846,622 <br /> MIN: 3,321,671 <br /> MAX: 28,761,609 | AVG: 2,062,804 <br /> MIN: 2,062,332 <br /> MAX: 2,063,181 |
-| dilithium2 (100 executions) | m4f | AVG: 1,427,684 <br /> MIN: 1,390,524 <br /> MAX: 1,466,437 | AVG: 4,219,137 <br /> MIN: 1,813,668 <br /> MAX: 12,587,382 | AVG: 1,417,706 <br /> MIN: 1,417,251 <br /> MAX: 1,418,128 |
+| dilithium2 (1000 executions) | clean | AVG: 1,874,167 <br /> MIN: 1,827,645 <br /> MAX: 1,914,566 | AVG: 7,493,877 <br /> MIN: 3,321,630 <br /> MAX: 40,762,756 | AVG: 2,062,795 <br /> MIN: 2,062,255 <br /> MAX: 2,063,222 |
+| dilithium2 (1000 executions) | m4f | AVG: 1,426,036 <br /> MIN: 1,379,636 <br /> MAX: 1,466,394 | AVG: 3,807,970 <br /> MIN: 1,813,656 <br /> MAX: 18,528,070 | AVG: 1,417,745 <br /> MIN: 1,417,203 <br /> MAX: 1,418,192 |
+| dilithium2 (1000 executions) | m4fstack | AVG: 1,801,523 <br /> MIN: 1,684,895 <br /> MAX: 1,902,114 | AVG: 12,170,976 <br /> MIN: 3,900,911 <br /> MAX: 86,281,518 | AVG: 3,241,353 <br /> MIN: 3,194,028 <br /> MAX: 3,281,144 |
 | dilithium3 (1000 executions) | clean | AVG: 3,205,551 <br /> MIN: 3,204,090 <br /> MAX: 3,207,411 | AVG: 12,696,585 <br /> MIN: 5,097,364 <br /> MAX: 74,392,293 | AVG: 3,376,992 <br /> MIN: 3,376,581 <br /> MAX: 3,377,393 |
 | dilithium3 (1000 executions) | m4f | AVG: 2,515,969 <br /> MIN: 2,514,498 <br /> MAX: 2,517,634 | AVG: 5,884,832 <br /> MIN: 2,917,322 <br /> MAX: 25,268,693 | AVG: 2,411,257 <br /> MIN: 2,410,858 <br /> MAX: 2,411,717 |
-| dilithium5 (90 executions) | clean | AVG: 5,346,066 <br /> MIN: 5,287,239 <br /> MAX: 5,395,626 | AVG: 15,205,929 <br /> MIN: 7,953,360 <br /> MAX: 49,173,429 | AVG: 5,609,664 <br /> MIN: 5,609,137 <br /> MAX: 5,610,119 |
-| dilithium5 (100 executions) | m4f | AVG: 4,273,211 <br /> MIN: 4,210,308 <br /> MAX: 4,329,697 | AVG: 8,062,110 <br /> MIN: 4,882,708 <br /> MAX: 18,398,575 | AVG: 4,185,407 <br /> MIN: 4,184,878 <br /> MAX: 4,185,954 |
+| dilithium3 (1000 executions) | m4fstack | AVG: 3,412,759 <br /> MIN: 3,406,659 <br /> MAX: 3,419,247 | AVG: 23,673,016 <br /> MIN: 6,733,971 <br /> MAX: 145,803,146 | AVG: 5,733,307 <br /> MIN: 5,688,893 <br /> MAX: 5,778,120 |
+| dilithium5 (1000 executions) | clean | AVG: 5,341,477 <br /> MIN: 5,286,872 <br /> MAX: 5,395,822 | AVG: 15,710,371 <br /> MIN: 7,953,367 <br /> MAX: 75,940,093 | AVG: 5,609,679 <br /> MIN: 5,609,217 <br /> MAX: 5,610,183 |
+| dilithium5 (1000 executions) | m4f | AVG: 4,275,029 <br /> MIN: 4,210,286 <br /> MAX: 4,329,519 | AVG: 7,977,781 <br /> MIN: 4,882,524 <br /> MAX: 25,936,176 | AVG: 4,185,417 <br /> MIN: 4,184,925 <br /> MAX: 4,185,896 |
+| dilithium5 (1000 executions) | m4fstack | AVG: 5,816,287 <br /> MIN: 5,474,236 <br /> MAX: 6,115,061 | AVG: 33,452,872 <br /> MIN: 11,170,780 <br /> MAX: 185,259,803 | AVG: 9,912,851 <br /> MIN: 9,845,789 <br /> MAX: 9,981,834 |
 | falcon-1024 (10 executions) | m4-ct | AVG: 354,880,005 <br /> MIN: 284,902,033 <br /> MAX: 635,131,652 | AVG: 87,741,288 <br /> MIN: 87,506,676 <br /> MAX: 87,922,628 | AVG: 991,320 <br /> MIN: 982,548 <br /> MAX: 997,219 |
 | falcon-1024 (10 executions) | opt-ct | AVG: 555,202,324 <br /> MIN: 284,912,829 <br /> MAX: 1,157,528,581 | AVG: 87,710,190 <br /> MIN: 87,606,677 <br /> MAX: 87,841,235 | AVG: 993,584 <br /> MIN: 983,066 <br /> MAX: 997,523 |
 | falcon-1024 (10 executions) | opt-leaktime | AVG: 438,412,062 <br /> MIN: 334,858,742 <br /> MAX: 625,013,074 | AVG: 80,139,483 <br /> MIN: 79,891,200 <br /> MAX: 80,551,967 | AVG: 994,127 <br /> MIN: 984,891 <br /> MAX: 997,390 |
@@ -194,11 +197,14 @@
 | cross-sha3-r-sdpg-3-fast | ref | 4,032 | 205,080 | 108,236 |
 | cross-sha3-r-sdpg-5-fast | ref | 6,824 | 398,600 | 213,436 |
 | dilithium2 | clean | 38,304 | 51,968 | 36,192 |
-| dilithium2 | m4f | 38,296 | 49,416 | 36,184 |
+| dilithium2 | m4f | 38,296 | 49,416 | 36,220 |
+| dilithium2 | m4fstack | 4,408 | 5,072 | 2,704 |
 | dilithium3 | clean | 60,832 | 79,616 | 57,728 |
 | dilithium3 | m4f | 60,824 | 68,864 | 57,720 |
+| dilithium3 | m4fstack | 4,408 | 6,608 | 2,704 |
 | dilithium5 | clean | 97,696 | 122,724 | 92,940 |
-| dilithium5 | m4f | 97,688 | 116,076 | 92,824 |
+| dilithium5 | m4f | 97,688 | 116,076 | 92,932 |
+| dilithium5 | m4fstack | 4,408 | 8,136 | 2,712 |
 | falcon-1024 | clean | 34,988 | 84,604 | 8,784 |
 | falcon-1024 | m4-ct | 1,156 | 2,508 | 376 |
 | falcon-1024 | opt-ct | 1,156 | 2,508 | 376 |
@@ -345,12 +351,15 @@
 | cross-sha3-r-sdpg-1-small | ref | 71.8% | 74.7% | 78.4% |
 | cross-sha3-r-sdpg-3-fast | ref | 71.7% | 68.2% | 68.7% |
 | cross-sha3-r-sdpg-5-fast | ref | 71.1% | 66.1% | 66.8% |
-| dilithium2 | clean | 60.9% | 30.2% | 52.9% |
-| dilithium2 | m4f | 79.9% | 62.2% | 76.8% |
+| dilithium2 | clean | 61.0% | 30.9% | 52.9% |
+| dilithium2 | m4f | 79.9% | 60.6% | 76.8% |
+| dilithium2 | m4fstack | 74.8% | 55.2% | 40.8% |
 | dilithium3 | clean | 64.7% | 31.3% | 56.8% |
-| dilithium3 | m4f | 82.3% | 60.3% | 79.4% |
-| dilithium5 | clean | 67.0% | 38.4% | 61.1% |
-| dilithium5 | m4f | 83.4% | 63.5% | 81.7% |
+| dilithium3 | m4f | 82.3% | 61.4% | 79.4% |
+| dilithium3 | m4fstack | 77.1% | 54.6% | 41.0% |
+| dilithium5 | clean | 67.0% | 35.7% | 61.1% |
+| dilithium5 | m4f | 83.5% | 65.0% | 81.7% |
+| dilithium5 | m4fstack | 76.1% | 54.5% | 42.6% |
 | falcon-1024 | clean | 6.5% | 0.3% | 23.7% |
 | falcon-1024 | m4-ct | 7.4% | 0.4% | 32.4% |
 | falcon-1024 | opt-ct | 11.7% | 0.4% | 32.2% |
@@ -498,10 +507,13 @@
 | cross-sha3-r-sdpg-5-fast | ref | 18,593 | 0 | 208 | 18,801 |
 | dilithium2 | clean | 8,064 | 0 | 0 | 8,064 |
 | dilithium2 | m4f | 18,596 | 0 | 0 | 18,596 |
+| dilithium2 | m4fstack | 24,184 | 0 | 0 | 24,184 |
 | dilithium3 | clean | 7,580 | 0 | 0 | 7,580 |
 | dilithium3 | m4f | 18,588 | 0 | 0 | 18,588 |
+| dilithium3 | m4fstack | 23,448 | 0 | 0 | 23,448 |
 | dilithium5 | clean | 7,808 | 0 | 0 | 7,808 |
 | dilithium5 | m4f | 18,468 | 0 | 0 | 18,468 |
+| dilithium5 | m4fstack | 23,820 | 0 | 0 | 23,820 |
 | falcon-1024 | clean | 82,647 | 0 | 0 | 82,647 |
 | falcon-1024 | m4-ct | 81,825 | 0 | 79,872 | 161,697 |
 | falcon-1024 | opt-ct | 81,825 | 0 | 79,872 | 161,697 |