From 44e901cc7e3d21bcd410e6a2960e16bcf741fc3d Mon Sep 17 00:00:00 2001 From: Amin Abdulrahman Date: Fri, 15 Mar 2024 12:18:15 +0100 Subject: [PATCH 01/32] Init dilithium3 stack optimized variant --- crypto_sign/dilithium3/m4fstack/api.h | 26 + crypto_sign/dilithium3/m4fstack/config.h | 7 + crypto_sign/dilithium3/m4fstack/macros.i | 191 ++++ crypto_sign/dilithium3/m4fstack/macros_fnt.i | 158 ++++ crypto_sign/dilithium3/m4fstack/ntt.S | 402 +++++++++ crypto_sign/dilithium3/m4fstack/ntt.h | 13 + crypto_sign/dilithium3/m4fstack/packing.c | 286 ++++++ crypto_sign/dilithium3/m4fstack/packing.h | 55 ++ crypto_sign/dilithium3/m4fstack/params.h | 83 ++ .../dilithium3/m4fstack/pointwise_mont.h | 13 + .../dilithium3/m4fstack/pointwise_mont.s | 128 +++ crypto_sign/dilithium3/m4fstack/poly.c | 851 ++++++++++++++++++ crypto_sign/dilithium3/m4fstack/poly.h | 82 ++ crypto_sign/dilithium3/m4fstack/polyvec.c | 429 +++++++++ crypto_sign/dilithium3/m4fstack/polyvec.h | 99 ++ crypto_sign/dilithium3/m4fstack/reduce.h | 29 + crypto_sign/dilithium3/m4fstack/rounding.c | 102 +++ crypto_sign/dilithium3/m4fstack/rounding.h | 19 + crypto_sign/dilithium3/m4fstack/sign.c | 352 ++++++++ crypto_sign/dilithium3/m4fstack/sign.h | 37 + crypto_sign/dilithium3/m4fstack/smallntt.S | 837 +++++++++++++++++ crypto_sign/dilithium3/m4fstack/smallntt.h | 53 ++ crypto_sign/dilithium3/m4fstack/smallpoly.c | 84 ++ crypto_sign/dilithium3/m4fstack/smallpoly.h | 39 + .../dilithium3/m4fstack/symmetric-shake.c | 28 + crypto_sign/dilithium3/m4fstack/symmetric.h | 65 ++ crypto_sign/dilithium3/m4fstack/vector.h | 20 + crypto_sign/dilithium3/m4fstack/vector.s | 210 +++++ 28 files changed, 4698 insertions(+) create mode 100644 crypto_sign/dilithium3/m4fstack/api.h create mode 100644 crypto_sign/dilithium3/m4fstack/config.h create mode 100644 crypto_sign/dilithium3/m4fstack/macros.i create mode 100644 crypto_sign/dilithium3/m4fstack/macros_fnt.i create mode 100644 crypto_sign/dilithium3/m4fstack/ntt.S create mode 100644 
crypto_sign/dilithium3/m4fstack/ntt.h create mode 100644 crypto_sign/dilithium3/m4fstack/packing.c create mode 100644 crypto_sign/dilithium3/m4fstack/packing.h create mode 100644 crypto_sign/dilithium3/m4fstack/params.h create mode 100644 crypto_sign/dilithium3/m4fstack/pointwise_mont.h create mode 100644 crypto_sign/dilithium3/m4fstack/pointwise_mont.s create mode 100644 crypto_sign/dilithium3/m4fstack/poly.c create mode 100644 crypto_sign/dilithium3/m4fstack/poly.h create mode 100644 crypto_sign/dilithium3/m4fstack/polyvec.c create mode 100644 crypto_sign/dilithium3/m4fstack/polyvec.h create mode 100644 crypto_sign/dilithium3/m4fstack/reduce.h create mode 100644 crypto_sign/dilithium3/m4fstack/rounding.c create mode 100644 crypto_sign/dilithium3/m4fstack/rounding.h create mode 100644 crypto_sign/dilithium3/m4fstack/sign.c create mode 100644 crypto_sign/dilithium3/m4fstack/sign.h create mode 100644 crypto_sign/dilithium3/m4fstack/smallntt.S create mode 100644 crypto_sign/dilithium3/m4fstack/smallntt.h create mode 100644 crypto_sign/dilithium3/m4fstack/smallpoly.c create mode 100644 crypto_sign/dilithium3/m4fstack/smallpoly.h create mode 100644 crypto_sign/dilithium3/m4fstack/symmetric-shake.c create mode 100644 crypto_sign/dilithium3/m4fstack/symmetric.h create mode 100644 crypto_sign/dilithium3/m4fstack/vector.h create mode 100644 crypto_sign/dilithium3/m4fstack/vector.s diff --git a/crypto_sign/dilithium3/m4fstack/api.h b/crypto_sign/dilithium3/m4fstack/api.h new file mode 100644 index 00000000..a289632c --- /dev/null +++ b/crypto_sign/dilithium3/m4fstack/api.h @@ -0,0 +1,26 @@ +#ifndef API_H +#define API_H + +#include +#include +#include "params.h" + +int crypto_sign_keypair(uint8_t *pk, uint8_t *sk); + +int crypto_sign_signature(uint8_t *sig, size_t *siglen, + const uint8_t *m, size_t mlen, + const uint8_t *sk); + +int crypto_sign(uint8_t *sm, size_t *smlen, + const uint8_t *m, size_t mlen, + const uint8_t *sk); + +int crypto_sign_verify(const uint8_t *sig, 
size_t siglen, + const uint8_t *m, size_t mlen, + const uint8_t *pk); + +int crypto_sign_open(uint8_t *m, size_t *mlen, + const uint8_t *sm, size_t smlen, + const uint8_t *pk); + +#endif diff --git a/crypto_sign/dilithium3/m4fstack/config.h b/crypto_sign/dilithium3/m4fstack/config.h new file mode 100644 index 00000000..55724079 --- /dev/null +++ b/crypto_sign/dilithium3/m4fstack/config.h @@ -0,0 +1,7 @@ +#ifndef CONFIG_H +#define CONFIG_H + +#define DILITHIUM_MODE 3 +// #define SIGN_STACKSTRATEGY 2 + +#endif diff --git a/crypto_sign/dilithium3/m4fstack/macros.i b/crypto_sign/dilithium3/m4fstack/macros.i new file mode 100644 index 00000000..25d98c2b --- /dev/null +++ b/crypto_sign/dilithium3/m4fstack/macros.i @@ -0,0 +1,191 @@ +#ifndef MACROS_I +#define MACROS_I +// 3 +.macro montgomery_mul_32 a, b, Qprime, Q, tmp, tmp2 + smull \tmp, \a, \a, \b + mul \tmp2, \tmp, \Qprime + smlal \tmp, \a, \tmp2, \Q +.endm + +// 2 +.macro addSub1 c0, c1 + add.w \c0, \c1 + sub.w \c1, \c0, \c1, lsl #1 +.endm + +// 3 +.macro addSub2 c0, c1, c2, c3 + add \c0, \c1 + add \c2, \c3 + sub.w \c1, \c0, \c1, lsl #1 + sub.w \c3, \c2, \c3, lsl #1 +.endm + +// 6 +.macro addSub4 c0, c1, c2, c3, c4, c5, c6, c7 + add \c0, \c1 + add \c2, \c3 + add \c4, \c5 + add \c6, \c7 + sub.w \c1, \c0, \c1, lsl #1 + sub.w \c3, \c2, \c3, lsl #1 + sub.w \c5, \c4, \c5, lsl #1 + sub.w \c7, \c6, \c7, lsl #1 +.endm + +.macro _2_layer_CT_32 c0, c1, c2, c3, zeta0, zeta1, zeta2, Qprime, Q, tmp, tmp2 + montgomery_mul_32 \c2, \zeta0, \Qprime, \Q, \tmp, \tmp2 + montgomery_mul_32 \c3, \zeta0, \Qprime, \Q, \tmp, \tmp2 + addSub2 \c0, \c2, \c1, \c3 + + montgomery_mul_32 \c1, \zeta1, \Qprime, \Q, \tmp, \tmp2 + montgomery_mul_32 \c3, \zeta2, \Qprime, \Q, \tmp, \tmp2 + addSub2 \c0, \c1, \c2, \c3 +.endm + +.macro _2_layer_inv_CT_32 c0, c1, c2, c3, zeta0, zeta1, zeta2, Qprime, Q, tmp, tmp2 + montgomery_mul_32 \c1, \zeta0, \Qprime, \Q, \tmp, \tmp2 + montgomery_mul_32 \c3, \zeta0, \Qprime, \Q, \tmp, \tmp2 + addSub2 \c0, \c1, \c2, \c3 + + 
montgomery_mul_32 \c2, \zeta1, \Qprime, \Q, \tmp, \tmp2 + montgomery_mul_32 \c3, \zeta2, \Qprime, \Q, \tmp, \tmp2 + addSub2 \c0, \c2, \c1, \c3 +.endm + +.macro _3_layer_CT_32 c0, c1, c2, c3, c4, c5, c6, c7, xi0, xi1, xi2, xi3, xi4, xi5, xi6, twiddle, Qprime, Q, tmp, tmp2 + vmov.w \twiddle, \xi0 + montgomery_mul_32 \c4, \twiddle, \Qprime, \Q, \tmp, \tmp2 + montgomery_mul_32 \c5, \twiddle, \Qprime, \Q, \tmp, \tmp2 + montgomery_mul_32 \c6, \twiddle, \Qprime, \Q, \tmp, \tmp2 + montgomery_mul_32 \c7, \twiddle, \Qprime, \Q, \tmp, \tmp2 + addSub4 \c0, \c4, \c1, \c5, \c2, \c6, \c3, \c7 + + vmov.w \twiddle, \xi1 + montgomery_mul_32 \c2, \twiddle, \Qprime, \Q, \tmp, \tmp2 + montgomery_mul_32 \c3, \twiddle, \Qprime, \Q, \tmp, \tmp2 + vmov.w \twiddle, \xi2 + montgomery_mul_32 \c6, \twiddle, \Qprime, \Q, \tmp, \tmp2 + montgomery_mul_32 \c7, \twiddle, \Qprime, \Q, \tmp, \tmp2 + addSub4 \c0, \c2, \c1, \c3, \c4, \c6, \c5, \c7 + + vmov.w \twiddle, \xi3 + montgomery_mul_32 \c1, \twiddle, \Qprime, \Q, \tmp, \tmp2 + vmov.w \twiddle, \xi4 + montgomery_mul_32 \c3, \twiddle, \Qprime, \Q, \tmp, \tmp2 + vmov.w \twiddle, \xi5 + montgomery_mul_32 \c5, \twiddle, \Qprime, \Q, \tmp, \tmp2 + vmov.w \twiddle, \xi6 + montgomery_mul_32 \c7, \twiddle, \Qprime, \Q, \tmp, \tmp2 + addSub4 \c0, \c1, \c2, \c3, \c4, \c5, \c6, \c7 +.endm + +.macro _3_layer_inv_CT_32 c0, c1, c2, c3, c4, c5, c6, c7, xi0, xi1, xi2, xi3, xi4, xi5, xi6, twiddle, Qprime, Q, tmp, tmp2 + vmov.w \twiddle, \xi0 + montgomery_mul_32 \c1, \twiddle, \Qprime, \Q, \tmp, \tmp2 + montgomery_mul_32 \c3, \twiddle, \Qprime, \Q, \tmp, \tmp2 + montgomery_mul_32 \c5, \twiddle, \Qprime, \Q, \tmp, \tmp2 + montgomery_mul_32 \c7, \twiddle, \Qprime, \Q, \tmp, \tmp2 + addSub4 \c0, \c1, \c2, \c3, \c4, \c5, \c6, \c7 + + vmov.w \twiddle, \xi1 + montgomery_mul_32 \c2, \twiddle, \Qprime, \Q, \tmp, \tmp2 + montgomery_mul_32 \c6, \twiddle, \Qprime, \Q, \tmp, \tmp2 + vmov.w \twiddle, \xi2 + montgomery_mul_32 \c3, \twiddle, \Qprime, \Q, \tmp, \tmp2 + 
montgomery_mul_32 \c7, \twiddle, \Qprime, \Q, \tmp, \tmp2 + addSub4 \c0, \c2, \c1, \c3, \c4, \c6, \c5, \c7 + + vmov.w \twiddle, \xi3 + montgomery_mul_32 \c4, \twiddle, \Qprime, \Q, \tmp, \tmp2 + vmov.w \twiddle, \xi4 + montgomery_mul_32 \c5, \twiddle, \Qprime, \Q, \tmp, \tmp2 + vmov.w \twiddle, \xi5 + montgomery_mul_32 \c6, \twiddle, \Qprime, \Q, \tmp, \tmp2 + vmov.w \twiddle, \xi6 + montgomery_mul_32 \c7, \twiddle, \Qprime, \Q, \tmp, \tmp2 + addSub4 \c0, \c4, \c1, \c5, \c2, \c6, \c3, \c7 +.endm + +/************************************************************ +* Name: _3_layer_inv_butterfly_light_fast_first +* +* Description: upper half of 3-layer inverse butterfly +* defined over X^8 - 1 +* +* Input: (c4, c1, c6, c3) = coefficients on the upper half; +* (xi0, xi1, xi2, xi3, xi4, xi5, xi6) = +* ( 1, 1, w_4, 1, w_8, w_4, w_8^3) in +* Montgomery domain +* +* Symbols: R = 2^32 +* +* Constants: Qprime = -MOD^{-1} mod^{+-} R, Q = MOD +* +* Output: +* c4 = c4 + c1 + (c6 + c3) +* c5 = (c4 - c1) w_4 + (c6 + c3) w_8^3 +* c6 = c4 + c1 - (c6 + c3) +* c7 = (c4 - c1) w_8^3 + (c6 + c3) w_4 +************************************************************/ +// 15 +.macro _3_layer_inv_butterfly_light_fast_first c0, c1, c2, c3, c4, c5, c6, c7, xi0, xi1, xi2, xi3, xi4, xi5, xi6, twiddle, Qprime, Q, tmp, tmp2 + addSub2 \c4, \c1, \c6, \c3 + addSub1 \c4, \c6 + + vmov.w \tmp, \xi4 + vmov.w \tmp2, \xi6 + + smull.w \c0, \c5, \c1, \tmp + smlal.w \c0, \c5, \c3, \tmp2 + mul.w \twiddle, \c0, \Qprime + smlal.w \c0, \c5, \twiddle, \Q + + smull.w \c2, \c7, \c1, \tmp2 + smlal.w \c2, \c7, \c3, \tmp + mul.w \twiddle, \c2, \Qprime + smlal.w \c2, \c7, \twiddle, \Q +.endm + +/************************************************************ +* Name: _3_layer_inv_butterfly_light_fast_second +* +* Description: lower half of 3-layer inverse butterfly +* defined over X^8 - 1, and the 2nd +* layer of butterflies +* +* Input: +* (c4, c5, c6, c7) = results of the upper half; +* (c0, c1, c2, c3) = coefficients on the 
lower half; +* (xi0, xi1, xi2, xi3, xi4, xi5, xi6) = +* ( 1, 1, w_4, 1, w_8, w_4, w_8^3) in +* Montgomery domain +* +* Symbols: R = 2^32 +* +* Constants: Qprime = -MOD^{-1} mod^{+-} R, Q = MOD +* +* Output: (normal order) +* c0 = c0 + c1 + (c2 + c3) + ( c4 + c5 + (c6 + c7) ) +* c1 = (c0 - c1) w3 + (c2 - c3) w4 + ( (c4 - c5) w5 + (c6 - c7) w6 ) +* c2 = ( c0 + c1 - (c2 + c3)) w1 + (( c4 + c5 - (c6 + c7) ) w2) +* c3 = ((c0 - c1) w3 - (c2 - c3) w4) w1 + (((c4 - c5) w5 - (c6 - c7) w6) w2) +* c4 = c0 + c1 - (c2 + c3) - ( c4 + c5 + (c6 + c7) ) w0 +* c5 = (c0 - c1) w3 + (c2 - c3) w4 - ( (c4 - c5) w5 + (c6 - c7) w6 ) w0 +* c6 = ( c0 + c1 - (c2 + c3)) w1 - (( c4 + c5 - (c6 + c7) ) w2) w0 +* c7 = ((c0 - c1) w3 - (c2 - c3) w4) w1 - (((c4 - c5) w5 - (c6 - c7) w6) w2) w0 +************************************************************/ +// 19 +.macro _3_layer_inv_butterfly_light_fast_second c0, c1, c2, c3, c4, c5, c6, c7, xi0, xi1, xi2, xi3, xi4, xi5, xi6, twiddle, Qprime, Q, tmp, tmp2 + addSub2 \c0, \c1, \c2, \c3 + + vmov.w \twiddle, \xi2 + montgomery_mul_32 \c3, \twiddle, \Qprime, \Q, \tmp, \tmp2 + addSub2 \c0, \c2, \c1, \c3 + + montgomery_mul_32 \c6, \twiddle, \Qprime, \Q, \tmp, \tmp2 + + addSub4 \c0, \c4, \c1, \c5, \c2, \c6, \c3, \c7 +.endm + +#endif /* MACROS_I */ diff --git a/crypto_sign/dilithium3/m4fstack/macros_fnt.i b/crypto_sign/dilithium3/m4fstack/macros_fnt.i new file mode 100644 index 00000000..25903e41 --- /dev/null +++ b/crypto_sign/dilithium3/m4fstack/macros_fnt.i @@ -0,0 +1,158 @@ +// 2 +.macro ldrstr2 ldrstr, target, c0, c1, mem0, mem1 + \ldrstr \c0, [\target, \mem0] + \ldrstr \c1, [\target, \mem1] +.endm + +// 2 +.macro ldrstr2jump ldrstr, target, c0, c1, mem1, jump + \ldrstr \c1, [\target, \mem1] + \ldrstr \c0, [\target], \jump +.endm + +// 4 +.macro ldrstr4 ldrstr, target, c0, c1, c2, c3, mem0, mem1, mem2, mem3 + \ldrstr \c0, [\target, \mem0] + \ldrstr \c1, [\target, \mem1] + \ldrstr \c2, [\target, \mem2] + \ldrstr \c3, [\target, \mem3] +.endm + +// 4 +.macro 
ldrstr4jump ldrstr, target, c0, c1, c2, c3, mem1, mem2, mem3, jump + \ldrstr \c1, [\target, \mem1] + \ldrstr \c2, [\target, \mem2] + \ldrstr \c3, [\target, \mem3] + \ldrstr \c0, [\target], \jump +.endm + +// 8 +.macro ldrstrvec ldrstr, target, c0, c1, c2, c3, c4, c5, c6, c7, mem0, mem1, mem2, mem3, mem4, mem5, mem6, mem7 + ldrstr4 \ldrstr, \target, \c0, \c1, \c2, \c3, \mem0, \mem1, \mem2, \mem3 + ldrstr4 \ldrstr, \target, \c4, \c5, \c6, \c7, \mem4, \mem5, \mem6, \mem7 +.endm + +// 8 +.macro ldrstrvecjump ldrstr, target, c0, c1, c2, c3, c4, c5, c6, c7, mem1, mem2, mem3, mem4, mem5, mem6, mem7, jump + ldrstr4 \ldrstr, \target, \c4, \c5, \c6, \c7, \mem4, \mem5, \mem6, \mem7 + ldrstr4jump \ldrstr, \target, \c0, \c1, \c2, \c3, \mem1, \mem2, \mem3, \jump +.endm + + + +.macro addSub1 c0, c1 + add.w \c0, \c1 + sub.w \c1, \c0, \c1, lsl #1 +.endm + +.macro addSub2 c0, c1, c2, c3 + add \c0, \c1 + add \c2, \c3 + sub.w \c1, \c0, \c1, lsl #1 + sub.w \c3, \c2, \c3, lsl #1 +.endm + +.macro addSub4 c0, c1, c2, c3, c4, c5, c6, c7 + add \c0, \c1 + add \c2, \c3 + add \c4, \c5 + add \c6, \c7 + sub.w \c1, \c0, \c1, lsl #1 + sub.w \c3, \c2, \c3, lsl #1 + sub.w \c5, \c4, \c5, lsl #1 + sub.w \c7, \c6, \c7, lsl #1 +.endm + +// 2 +.macro barrett_32 a, Qbar, Q, tmp + smmulr.w \tmp, \a, \Qbar + mls.w \a, \tmp, \Q, \a +.endm + +.macro FNT_CT_butterfly c0, c1, logW + add.w \c0, \c0, \c1, lsl #\logW + sub.w \c1, \c0, \c1, lsl #(\logW+1) +.endm + +.macro shift_subAdd c0, c1, shlv + sub.w \c0, \c0, \c1, lsl #(\shlv) + add.w \c1, \c0, \c1, lsl #(\shlv+1) +.endm + +.macro FNT_CT_ibutterfly c0, c1, shlv + shift_subAdd \c0, \c1, \shlv +.endm + +// 46 +.macro _3_layer_CT_32_FNT c0, c1, c2, c3, c4, c5, c6, c7, xi0, xi1, xi2, xi3, xi4, xi5, xi6, twiddle, Qprime, Q, tmp, tmp2 + vmov.w \twiddle, \xi0 + + // c0, c1, c2, c3, c4, c5, c6, c7, c8 + // 0,4 + mla \tmp, \c4, \twiddle, \c0 + mls \c4, \c4, \twiddle, \c0 + + // 1,5 + mla \c0, \c5, \twiddle, \c1 + mls \c5, \c5, \twiddle, \c1 + + // 2,6 + mla \c1, \c6, 
\twiddle, \c2 + mls \c6, \c6, \twiddle, \c2 + + // 3,7 + mla \c2, \c7, \twiddle, \c3 + mls \c7, \c7, \twiddle, \c3 + + // tmp, c0, c1, c2, c4, c5, c6, c7 + + barrett_32 \tmp, \Qprime, \Q, \c3 + barrett_32 \c0, \Qprime, \Q, \c3 + barrett_32 \c1, \Qprime, \Q, \c3 + barrett_32 \c2, \Qprime, \Q, \c3 + barrett_32 \c4, \Qprime, \Q, \c3 + barrett_32 \c5, \Qprime, \Q, \c3 + barrett_32 \c6, \Qprime, \Q, \c3 + barrett_32 \c7, \Qprime, \Q, \c3 + + vmov.w \twiddle, \xi1 + // 0,2 + mla \tmp2, \c1, \twiddle, \tmp + mls \c3, \c1, \twiddle, \tmp + + // 1,3 + mla \tmp, \c2, \twiddle, \c0 + mls \c0, \c2, \twiddle, \c0 + + vmov.w \twiddle, \xi2 + + // 4,6 + mla \c2, \c6, \twiddle, \c4 + mls \c1, \c6, \twiddle, \c4 + + // 5,7 + mla \c6, \c7, \twiddle, \c5 + mls \c7, \c7, \twiddle, \c5 + + // tmp2, tmp, c3, c0 | c2, c6, c1, c7 + + // 4,5 + vmov.w \twiddle, \xi5 + mla \c4, \c6, \twiddle, \c2 + mls \c5, \c6, \twiddle, \c2 + + // 6,7 + vmov.w \twiddle, \xi6 + mla \c6, \c7, \twiddle, \c1 + mls \c7, \c7, \twiddle, \c1 + + // 2,3 + vmov.w \twiddle, \xi4 + mla \c2, \c0, \twiddle, \c3 + mls \c3, \c0, \twiddle, \c3 + + // 0,1 + vmov.w \twiddle, \xi3 + mla \c0, \tmp, \twiddle, \tmp2 + mls \c1, \tmp, \twiddle, \tmp2 +.endm \ No newline at end of file diff --git a/crypto_sign/dilithium3/m4fstack/ntt.S b/crypto_sign/dilithium3/m4fstack/ntt.S new file mode 100644 index 00000000..bfd5f7a4 --- /dev/null +++ b/crypto_sign/dilithium3/m4fstack/ntt.S @@ -0,0 +1,402 @@ +// based on code by: Markus Krausz (18.03.18) +// date 23.07.21: Now licensed under CC0 with permission of the authors. 
+ +.syntax unified +#include "macros.i" + +// This code uses long multiplies (SMULL/SMLAL, see montgomery_mul_32 in macros.i) - which are constant time on the M4, but not on the M3 +// Make sure that this code is never used on an M3 +smlad r0,r0,r0,r0 // NOTE(review): presumably a guard - SMLAD is a DSP-extension instruction the M3 lacks; confirm + +// ############################## +// ########## NTT ########## +// ############################## + +//void pqcrystals_dilithium_ntt(int32_t p[N]); in-place forward NTT of the coefficients at p (R0), computed as 3 + 3 + 2 merged butterfly layers +.global pqcrystals_dilithium_ntt +.type pqcrystals_dilithium_ntt,%function +.align 2 +pqcrystals_dilithium_ntt: + //bind aliases (several names share a register: ptr_zeta/zeta = R1, cntr/pol4 = R4, zeta0..2/pol5..7 = R11, R12, R14) + ptr_p .req R0 + ptr_zeta .req R1 + zeta .req R1 + qinv .req R2 + q .req R3 + cntr .req R4 + pol4 .req R4 + pol0 .req R5 + pol1 .req R6 + pol2 .req R7 + pol3 .req R8 + temp_h .req R9 + temp_l .req R10 + zeta0 .req R11 + zeta1 .req R12 + zeta2 .req R14 + pol5 .req R11 + pol6 .req R12 + pol7 .req R14 + + //preserve registers (R14 is pushed so that the final pop can load it into PC) + push {R4-R11, R14} + + //load constants (literals defined after the inverse NTT) and the twiddle table pointer + ldr.w qinv, inv_ntt_asm_smull_qinv //-qinv_signed + ldr.w q, inv_ntt_asm_smull_q + + //stage 1 - 3: each butterfly group takes 8 coefficients spaced distance/4 = 128 bytes apart + .equ distance, 512 + .equ strincr, 4 + + ldr ptr_zeta, =#zetas_new332 + vldm ptr_zeta!, {s2-s8} + vmov s0, ptr_zeta // park the table pointer in s0: R1 is reused as the twiddle scratch (zeta) by the butterfly macro + + add.w temp_l, ptr_p, #32*strincr // 32 butterfly groups: 16 loop passes, each unrolled twice by .rept 2 + vmov s9, temp_l + 1: + .rept 2 + ldr.w pol0, [ptr_p] + ldr.w pol1, [ptr_p, #1*distance/4] + ldr.w pol2, [ptr_p, #2*distance/4] + ldr.w pol3, [ptr_p, #3*distance/4] + ldr.w pol4, [ptr_p, #4*distance/4] + ldr.w pol5, [ptr_p, #5*distance/4] + ldr.w pol6, [ptr_p, #6*distance/4] + ldr.w pol7, [ptr_p, #7*distance/4] + + _3_layer_CT_32 pol0, pol1, pol2, pol3, pol4, pol5, pol6, pol7, s2, s3, s4, s5, s6, s7, s8, zeta, qinv, q, temp_h, temp_l + + str.w pol1, [ptr_p, #1*distance/4] + str.w pol2, [ptr_p, #2*distance/4] + str.w pol3, [ptr_p, #3*distance/4] + str.w pol4, [ptr_p, #4*distance/4] + str.w pol5, [ptr_p, #5*distance/4] + str.w pol6, [ptr_p, #6*distance/4] + str.w pol7, [ptr_p, #7*distance/4] + str.w pol0, [ptr_p], #strincr + .endr + vmov temp_l, s9 // temp_l is scratch inside the butterfly macro, so the loop bound is reloaded from s9 + cmp.w ptr_p, temp_l + bne 1b + + sub ptr_p, #32*4 // rewind ptr_p to the start of p + + // stage 4 - 6: coefficients of a group are now distance/4 = 16 bytes apart + .equ distance, 64 + add.w temp_l, ptr_p, 
#8*112+8*4*4 // 8 iterations of the outer loop, each advancing ptr_p by 4*4 + 112 bytes + vmov s9, temp_l + 1: + add.w temp_l, ptr_p, #4*strincr // 4 butterfly groups: 2 inner passes, each unrolled twice by .rept 2 + vmov s10, temp_l + vmov ptr_zeta, s0 + vldm ptr_zeta!, {s2-s8} + vmov s0, ptr_zeta // next 7 twiddles are in s2-s8; the advanced table pointer goes back to s0 + 2: + .rept 2 + ldr.w pol0, [ptr_p] + ldr.w pol1, [ptr_p, #1*distance/4] + ldr.w pol2, [ptr_p, #2*distance/4] + ldr.w pol3, [ptr_p, #3*distance/4] + ldr.w pol4, [ptr_p, #4*distance/4] + ldr.w pol5, [ptr_p, #5*distance/4] + ldr.w pol6, [ptr_p, #6*distance/4] + ldr.w pol7, [ptr_p, #7*distance/4] + + _3_layer_CT_32 pol0, pol1, pol2, pol3, pol4, pol5, pol6, pol7, s2, s3, s4, s5, s6, s7, s8, zeta, qinv, q, temp_h, temp_l + + str.w pol1, [ptr_p, #1*distance/4] + str.w pol2, [ptr_p, #2*distance/4] + str.w pol3, [ptr_p, #3*distance/4] + str.w pol4, [ptr_p, #4*distance/4] + str.w pol5, [ptr_p, #5*distance/4] + str.w pol6, [ptr_p, #6*distance/4] + str.w pol7, [ptr_p, #7*distance/4] + str.w pol0, [ptr_p], #4 + .endr + vmov temp_l, s10 + cmp.w ptr_p, temp_l + bne 2b + + add.w ptr_p, #112 + vmov temp_l, s9 + cmp.w ptr_p, temp_l + bne 1b + + sub ptr_p, #4*4*8+112*8 // rewind ptr_p to the start of p + vmov ptr_zeta, s0 + //stage 7 and 8: 4 consecutive coefficients per iteration, 3 twiddles loaded from memory each time + add cntr, ptr_p, #1024 // 64 iterations of 16 bytes; cntr (R4) is free here because pol4 is not used in this loop + 1: + ldr.w zeta1, [ptr_zeta, #4] //z128,..., z254 + ldr.w zeta2, [ptr_zeta, #8] //z129,..., z255 + ldr zeta0, [ptr_zeta], #12 //z64, ..., z127 + ldr.w pol0, [ptr_p] //1*4 + ldr.w pol1, [ptr_p, #4] + ldr.w pol2, [ptr_p, #8] + ldr.w pol3, [ptr_p, #12] + + _2_layer_CT_32 pol0, pol1, pol2, pol3, zeta0, zeta1, zeta2, qinv, q, temp_h, temp_l + + str.w pol1, [ptr_p, #4] + str.w pol2, [ptr_p, #8] + str.w pol3, [ptr_p, #12] + str pol0, [ptr_p], #16 + + cmp.w cntr, ptr_p + bne.w 1b + + //restore registers and return (the saved R14 is popped into PC) + pop {R4-R11, PC} + + //unbind aliases (NOTE(review): zeta and pol4-pol7 are not released here; the inverse NTT binds them again to the same registers) + .unreq ptr_p + .unreq ptr_zeta + .unreq qinv + .unreq q + .unreq cntr + .unreq pol0 + .unreq pol1 + .unreq pol2 + .unreq pol3 + .unreq temp_h + .unreq temp_l + .unreq zeta0 + .unreq zeta1 + .unreq zeta2 + +.ltorg +// ############################## +// ########## NTT^-1 ########## +// ############################## + 
+//void pqcrystals_dilithium_invntt_tomont(int32_t p[N]); +.global pqcrystals_dilithium_invntt_tomont +.type pqcrystals_dilithium_invntt_tomont,%function +.align 2 +pqcrystals_dilithium_invntt_tomont: + //bind aliases + ptr_p .req R0 + ptr_zeta .req R1 + zeta .req R1 + qinv .req R2 + q .req R3 + cntr .req R4 + pol4 .req R4 + pol0 .req R5 + pol1 .req R6 + pol2 .req R7 + pol3 .req R8 + temp_h .req R9 + temp_l .req R10 + zeta0 .req R11 + zeta1 .req R12 + zeta2 .req R14 + pol5 .req R11 + pol6 .req R12 + pol7 .req R14 + + //preserve registers + push {R4-R11, R14} + + //load constants, ptr + ldr.w qinv, inv_ntt_asm_smull_qinv //-qinv_signed + ldr.w q, inv_ntt_asm_smull_q + + //stage 1 - 3 + .equ distance, 16 + .equ strincr, 32 + + ldr ptr_zeta, =#zetas_new332inv + vldm ptr_zeta!, {s2-s8} + vmov s0, ptr_zeta + + add.w temp_l, ptr_p, #32*strincr // 32 iterations + vmov s9, temp_l + 1: + ldr.w pol4, [ptr_p, #4*distance/4] + ldr.w pol1, [ptr_p, #5*distance/4] + ldr.w pol6, [ptr_p, #6*distance/4] + ldr.w pol3, [ptr_p, #7*distance/4] + _3_layer_inv_butterfly_light_fast_first pol0, pol1, pol2, pol3, pol4, pol5, pol6, pol7, s2, s3, s4, s5, s6, s7, s8, zeta, qinv, q, temp_h, temp_l + + ldr.w pol0, [ptr_p] + ldr.w pol1, [ptr_p, #1*distance/4] + ldr.w pol2, [ptr_p, #2*distance/4] + ldr.w pol3, [ptr_p, #3*distance/4] + _3_layer_inv_butterfly_light_fast_second pol0, pol1, pol2, pol3, pol4, pol5, pol6, pol7, s2, s3, s4, s5, s6, s7, s8, zeta, qinv, q, temp_h, temp_l + + str.w pol1, [ptr_p, #1*distance/4] + str.w pol2, [ptr_p, #2*distance/4] + str.w pol3, [ptr_p, #3*distance/4] + str.w pol4, [ptr_p, #4*distance/4] + str.w pol5, [ptr_p, #5*distance/4] + str.w pol6, [ptr_p, #6*distance/4] + str.w pol7, [ptr_p, #7*distance/4] + str.w pol0, [ptr_p], #strincr + vmov temp_l, s9 + cmp.w ptr_p, temp_l + bne.w 1b + + sub ptr_p, #32*strincr + + // stage 4 - 6 + .equ distance, 128 + .equ strincr, 256 + + // iteration 0 + movw temp_l, #4 + add.w temp_l, ptr_p, #4*256 // 4 iterations + vmov s10, 
temp_l + + vmov ptr_zeta, s0 + vldm ptr_zeta!, {s2-s8} + vmov s0, ptr_zeta + + 2: + ldr.w pol4, [ptr_p, #4*distance/4] + ldr.w pol1, [ptr_p, #5*distance/4] + ldr.w pol6, [ptr_p, #6*distance/4] + ldr.w pol3, [ptr_p, #7*distance/4] + _3_layer_inv_butterfly_light_fast_first pol0, pol1, pol2, pol3, pol4, pol5, pol6, pol7, s2, s3, s4, s5, s6, s7, s8, zeta, qinv, q, temp_h, temp_l + + ldr.w pol0, [ptr_p] + ldr.w pol1, [ptr_p, #1*distance/4] + ldr.w pol2, [ptr_p, #2*distance/4] + ldr.w pol3, [ptr_p, #3*distance/4] + _3_layer_inv_butterfly_light_fast_second pol0, pol1, pol2, pol3, pol4, pol5, pol6, pol7, s2, s3, s4, s5, s6, s7, s8, zeta, qinv, q, temp_h, temp_l + + str.w pol1, [ptr_p, #1*distance/4] + str.w pol2, [ptr_p, #2*distance/4] + str.w pol3, [ptr_p, #3*distance/4] + str.w pol4, [ptr_p, #4*distance/4] + str.w pol5, [ptr_p, #5*distance/4] + str.w pol6, [ptr_p, #6*distance/4] + str.w pol7, [ptr_p, #7*distance/4] + str.w pol0, [ptr_p] + add.w ptr_p, #strincr + + vmov temp_l, s10 + cmp.w temp_l, ptr_p + bne.w 2b + + sub.w ptr_p, #4*256-4 + + // iteration 1-7 + add.w temp_l, ptr_p, #7*4 // 7 iterations + vmov s9, temp_l + 1: + add.w temp_l, ptr_p, #4*strincr // 4 iterations + vmov s10, temp_l + + vmov ptr_zeta, s0 + vldm ptr_zeta!, {s2-s8} + vmov s0, ptr_zeta + 2: + ldr.w pol0, [ptr_p] + ldr.w pol1, [ptr_p, #1*distance/4] + ldr.w pol2, [ptr_p, #2*distance/4] + ldr.w pol3, [ptr_p, #3*distance/4] + ldr.w pol4, [ptr_p, #4*distance/4] + ldr.w pol5, [ptr_p, #5*distance/4] + ldr.w pol6, [ptr_p, #6*distance/4] + ldr.w pol7, [ptr_p, #7*distance/4] + + _3_layer_inv_CT_32 pol0, pol1, pol2, pol3, pol4, pol5, pol6, pol7, s2, s3, s4, s5, s6, s7, s8, zeta, qinv, q, temp_h, temp_l + + str.w pol1, [ptr_p, #1*distance/4] + str.w pol2, [ptr_p, #2*distance/4] + str.w pol3, [ptr_p, #3*distance/4] + str.w pol4, [ptr_p, #4*distance/4] + str.w pol5, [ptr_p, #5*distance/4] + str.w pol6, [ptr_p, #6*distance/4] + str.w pol7, [ptr_p, #7*distance/4] + str.w pol0, [ptr_p] + add.w ptr_p, #strincr + + 
vmov temp_l, s10 + cmp.w ptr_p, temp_l + bne 2b + sub.w ptr_p, #4*strincr-4 + + vmov temp_l, s9 + cmp.w temp_l, ptr_p + bne 1b + + sub ptr_p, #8*4 + vmov ptr_zeta, s0 + + //stage 7 and 8 + .equ strincr, 4 + + add.w cntr, ptr_p, #64*strincr // 64 iterations + vmov s9, cntr + 1: + ldr.w zeta1, [ptr_zeta, #4] + ldr.w zeta2, [ptr_zeta, #8] + ldr zeta0, [ptr_zeta], #12 + ldr.w pol0, [ptr_p] + ldr.w pol1, [ptr_p, #256] + ldr.w pol2, [ptr_p, #512] + ldr.w pol3, [ptr_p, #768] + + _2_layer_inv_CT_32 pol0, pol1, pol2, pol3, zeta0, zeta1, zeta2, qinv, q, temp_h, temp_l + + ldr.w zeta1, [ptr_zeta, #4] + ldr.w zeta2, [ptr_zeta, #8] + ldr.w zeta0, [ptr_zeta, #12] + ldr.w cntr, [ptr_zeta], #16 + montgomery_mul_32 pol0, cntr, qinv, q, temp_h, temp_l + montgomery_mul_32 pol1, zeta1, qinv, q, temp_h, temp_l + montgomery_mul_32 pol2, zeta2, qinv, q, temp_h, temp_l + montgomery_mul_32 pol3, zeta0, qinv, q, temp_h, temp_l + + str.w pol1, [ptr_p, #256] + str.w pol2, [ptr_p, #512] + str.w pol3, [ptr_p, #768] + str pol0, [ptr_p], #strincr + + vmov cntr, s9 + cmp.w cntr, ptr_p + bne.w 1b + + //restore registers + pop {R4-R11, PC} + + //unbind aliases + .unreq ptr_p + .unreq ptr_zeta + .unreq qinv + .unreq q + .unreq cntr + .unreq pol0 + .unreq pol1 + .unreq pol2 + .unreq pol3 + .unreq temp_h + .unreq temp_l + .unreq zeta0 + .unreq zeta1 + .unreq zeta2 + +.align 2 +inv_ntt_asm_smull_qinv: +.word 0xfc7fdfff +.align 2 +inv_ntt_asm_smull_q: +.word 8380417 + +.section .rodata + +.type zetas_new332, %object +.align 2 +zetas_new332: +.word 25847, -2608894, -518909, 237124, -777960, -876248, 466468, 1826347, 2725464, 1024112, 2706023, 95776, 3077325, 3530437, 2353451, -1079900, 3585928, -1661693, -3592148, -2537516, 3915439, -359251, -549488, -1119584, -3861115, -3043716, 3574422, -2867647, -2091905, 2619752, -2108549, 3539968, -300467, 2348700, -539299, 3119733, -2118186, -3859737, -1699267, -1643818, 3505694, -3821735, -2884855, -1399561, -3277672, 3507263, -2140649, -1600420, 3699596, 3111497, 
1757237, -19422, 811944, 531354, 954230, 3881043, 2680103, 4010497, 280005, 3900724, -2556880, 2071892, -2797779, -3930395, 2091667, 3407706, -1528703, 2316500, 3817976, -3677745, -3342478, 2244091, -3041255, -2446433, -3562462, -1452451, 266997, 2434439, 3475950, -1235728, 3513181, 2176455, -3520352, -3759364, -1585221, -1197226, -3193378, -1257611, 900702, 1859098, 1939314, 909542, 819034, -4083598, 495491, -1613174, -1000202, -43260, -522500, -3190144, -655327, -3122442, -3157330, 2031748, 3207046, -3632928, -3556995, -525098, 126922, -768622, -3595838, 3412210, 342297, 286988, -983419, -2437823, 4108315, 2147896, 3437287, -3342277, 2715295, 1735879, 203044, -2967645, 2842341, 2691481, -3693493, -2590150, 1265009, -411027, 4055324, 1247620, -2477047, 2486353, 1595974, -671102, -3767016, 1250494, -1228525, 2635921, -3548272, -22981, -2994039, 1869119, -1308169, 1903435, -1050970, -381987, -1333058, 1237275, 1349076, -3318210, -1430225, 1852771, -451100, 1312455, -1430430, 3306115, -1962642, -3343383, -1279661, 1917081, 264944, -2546312, -1374803, 508951, 1500165, 777191, 3097992, 2235880, 3406031, 44288, -542412, -2831860, -1100098, -1671176, -1846953, 904516, -2584293, -3724270, 3958618, 594136, -3776993, -3724342, -2013608, 2432395, -8578, 2454455, -164721, 1653064, 1957272, 3369112, -3249728, 185531, -1207385, 2389356, -3183426, 162844, -210977, 1616392, 3014001, 759969, 810149, 1652634, -1316856, -3694233, -1799107, 189548, -3038916, 3523897, -3553272, 3866901, 269760, 3159746, 2213111, -975884, -1851402, 1717735, 472078, -2409325, -426683, 1723600, -177440, -1803090, 1910376, 1315589, -1667432, -1104333, 1341330, -260646, -3833893, 1285669, -2939036, -2235985, -1584928, -420899, -2286327, -812732, 183443, -976891, -1439742, 1612842, -3545687, -3019102, -554416, 3919660, -3881060, -48306, -1362209, -3628969, 3937738, 1400424, 3839961, -846154, 1976782 +.size zetas_new332,.-zetas_new332 + +.type zetas_new332inv, %object +.align 2 +zetas_new332inv: +.word 
4193792, 4193792, -25847, 4193792, 518909, -25847, 2608894, 4193792, 4193792, -25847, 4193792, 518909, -25847, 2608894, -466468, -2680103, -3111497, -280005, 19422, -4010497, -1757237, 518909, -466468, 876248, -2680103, 2884855, -3111497, -3119733, 777960, 2091905, 359251, 2108549, 1119584, -2619752, 549488, -25847, 518909, 2608894, -466468, 777960, 876248, -237124, 876248, 2884855, -3119733, 3277672, 3859737, 1399561, 2118186, 2608894, 777960, -237124, 2091905, -2353451, 359251, -1826347, -237124, -2353451, -1826347, -3585928, -1024112, 1079900, -2725464, 4193792, 4193792, -25847, 41978, 3024400, 3975713, -1225192, 2797779, -3839961, 3628969, -1711436, 3835778, 485110, -3954267, -280005, 2797779, -2071892, -2831100, -2698859, -908040, -2292170, 539299, 1430430, -1852771, -3658785, 3512212, 1859141, -1607594, -2680103, -280005, -4010497, 715005, 1483994, -1045894, -980943, -3699596, 1316856, -759969, -955715, 3677139, 3933849, 2719610, 2108549, 539299, -2348700, 1658328, -1403403, 1775852, -2460465, -3915439, -126922, 3632928, 1067023, 3847594, 4179270, 1652689, -466468, -2680103, -3111497, -2953811, -284642, 2507426, -324139, -3881043, -1341330, -1315589, 3990128, -2137097, -4109898, 4092021, 3277672, -3699596, 1600420, 1541634, 3493410, 3487504, 2497815, 2867647, 2477047, 411027, 1654972, 1326223, -2608226, -2752209, 2091905, 2108549, -2619752, 1836700, 2945615, -1908953, 729864, 3821735, -3958618, -904516, 2080615, 1555380, -3471815, -1978758, -3585928, -3915439, 2537516, -892788, -553664, -3095038, 658596, -3530437, 1585221, -2176455, 3355482, -1783485, 2780552, -3623330, 518909, -466468, 876248, -442683, 2523147, -2847660, -3683140, 2556880, 1439742, 812732, 774207, -3168108, 1877157, 3406477, 19422, -3881043, -954230, -214686, -1182619, 2453526, -2201920, 300467, 1308169, 22981, 3614022, 2136260, 1459487, -2233803, 2884855, 3277672, 1399561, 394072, -3933227, 4136064, 156486, 2140649, 3249728, -1653064, 1596950, 633578, 2722529, -554462, 1119584, 2867647, 
-3574422, 1004840, 191586, 3969463, 1161373, 3592148, 1000202, 4083598, 3189243, 3561667, -3650125, 3490511, 777960, 2091905, 359251, -1829156, -3707725, -661807, 1144558, -531354, 1851402, -3159746, 1543095, -2903948, 1505516, -1500460, 3859737, 3821735, -3505694, -2413330, 3908886, -1203856, 3570263, 3043716, -2715295, -2147896, 758741, 3917553, -2414897, -1613811, -2353451, -3585928, 1079900, 990020, -719638, 2718792, 2260310, 1643818, -3097992, -508951, -783456, -2089539, 2616547, 4060031, -1024112, -3530437, -3077325, -1821861, 1920615, 3988525, 2048419, -95776, 3041255, 3677745, -971504, 2190617, 2311312, -1170082, -25847, 518909, 2608894, 1261528, -2073537, -959585, 3948120, -2071892, 3881060, 3019102, -1342633, -1115066, 3589694, -1929116, -4010497, 2556880, -3900724, 3360006, 1758630, -2306989, -1841637, -2348700, -1349076, 381987, -1699982, 3189673, 3531558, -1210546, -3111497, 19422, -1757237, 2977353, 2612035, -2718155, -1544829, 1600420, 210977, -2389356, 2052582, -2737802, 2383976, -450259, -2619752, 300467, -3539968, 1698289, -4065084, -644023, -1114140, 2537516, 3157330, 3190144, -993399, -2220524, 2920588, 252737, 876248, 2884855, -3119733, 1490985, -34731, -1212610, -3183745, -954230, 177440, 2409325, -3302554, -2390327, -2749545, 653128, 1399561, 2140649, -3507263, -3745105, -1942293, -3367121, 2734884, -3574422, 3693493, 2967645, 1393803, -2467905, 1786029, -1633410, 359251, 1119584, 549488, -2824548, -1325638, -2207625, -2601586, -3505694, 1100098, -44288, 3478676, -2457992, -1617107, 2551364, 1079900, 3592148, 1661693, 1593929, 318899, -3366475, 3118416, -3077325, -3475950, 1452451, 3772814, 1424805, -3391376, 632820, 2608894, 777960, -237124, 2062597, 4064335, 2197148, -1127864, -3900724, 1584928, -1285669, 2525341, -896437, -1915773, 1792087, -1757237, -531354, -811944, 938441, -674578, 2876837, 3959371, -3539968, 1228525, 671102, 1219592, -3853560, 2630979, -2134676, -3119733, 3859737, 2118186, -2432637, 2746655, 718593, -2353280, -3507263, 
8578, 3724342, -34852, 1387945, 358956, 1604944, 549488, 3043716, 3861115, 1290746, 3208584, 2538711, -1442830, 1661693, -1939314, 1257611, -367371, -1308058, 264382, 2614173, -237124, -2353451, -1826347, 2050674, 592050, -138487, 2310528, -811944, 3553272, -189548, -2728561, -4168358, -79, 3844932, 2118186, 1643818, 1699267, 500408, 743398, 879633, -3105206, 3861115, 983419, -3412210, 712597, -23479, 3729381, -1010481, -1826347, -1024112, -2725464, -2361217, -1864453, 3850522, 2337144, 1699267, -264944, 3343383, 3842267, 4181974, -4032642, 3983585, -2725464, -95776, -2706023, 260345, 2526550, 2000777, 987079, -2706023, 1528703, 3930395, -3030761, -3082055, -2374824, 1836319 +.size zetas_new332inv,.-zetas_new332inv diff --git a/crypto_sign/dilithium3/m4fstack/ntt.h b/crypto_sign/dilithium3/m4fstack/ntt.h new file mode 100644 index 00000000..731132d5 --- /dev/null +++ b/crypto_sign/dilithium3/m4fstack/ntt.h @@ -0,0 +1,13 @@ +#ifndef NTT_H +#define NTT_H + +#include +#include "params.h" + +#define ntt DILITHIUM_NAMESPACE(ntt) +void ntt(int32_t a[N]); + +#define invntt_tomont DILITHIUM_NAMESPACE(invntt_tomont) +void invntt_tomont(int32_t a[N]); + +#endif diff --git a/crypto_sign/dilithium3/m4fstack/packing.c b/crypto_sign/dilithium3/m4fstack/packing.c new file mode 100644 index 00000000..8aaff2a3 --- /dev/null +++ b/crypto_sign/dilithium3/m4fstack/packing.c @@ -0,0 +1,286 @@ +#include "params.h" +#include "packing.h" +#include "polyvec.h" +#include "poly.h" + +/************************************************* +* Name: pack_pk +* +* Description: Bit-pack public key pk = (rho, t1). 
+* +* Arguments: - uint8_t pk[]: output byte array +* - const uint8_t rho[]: byte array containing rho +* - const polyveck *t1: pointer to vector t1 +**************************************************/ +void pack_pk(uint8_t pk[CRYPTO_PUBLICKEYBYTES], + const uint8_t rho[SEEDBYTES], + const polyveck *t1) +{ + unsigned int i; + + for(i = 0; i < SEEDBYTES; ++i) + pk[i] = rho[i]; + pk += SEEDBYTES; + + for(i = 0; i < K; ++i) + polyt1_pack(pk + i*POLYT1_PACKEDBYTES, &t1->vec[i]); +} + +/************************************************* +* Name: unpack_pk +* +* Description: Unpack public key pk = (rho, t1). +* +* Arguments: - const uint8_t rho[]: output byte array for rho +* - const polyveck *t1: pointer to output vector t1 +* - uint8_t pk[]: byte array containing bit-packed pk +**************************************************/ +void unpack_pk(uint8_t rho[SEEDBYTES], + polyveck *t1, + const uint8_t pk[CRYPTO_PUBLICKEYBYTES]) +{ + unsigned int i; + + for(i = 0; i < SEEDBYTES; ++i) + rho[i] = pk[i]; + pk += SEEDBYTES; + + for(i = 0; i < K; ++i) + polyt1_unpack(&t1->vec[i], pk + i*POLYT1_PACKEDBYTES); +} + +/************************************************* +* Name: pack_sk +* +* Description: Bit-pack secret key sk = (rho, tr, key, t0, s1, s2). 
+* +* Arguments: - uint8_t sk[]: output byte array +* - const uint8_t rho[]: byte array containing rho +* - const uint8_t tr[]: byte array containing tr +* - const uint8_t key[]: byte array containing key +* - const polyveck *t0: pointer to vector t0 +* - const polyvecl *s1: pointer to vector s1 +* - const polyveck *s2: pointer to vector s2 +**************************************************/ +void pack_sk(uint8_t sk[CRYPTO_SECRETKEYBYTES], + const uint8_t rho[SEEDBYTES], + const uint8_t tr[TRBYTES], + const uint8_t key[SEEDBYTES], + const polyveck *t0, + const polyvecl *s1, + const polyveck *s2) +{ + unsigned int i; + + for(i = 0; i < SEEDBYTES; ++i) + sk[i] = rho[i]; + sk += SEEDBYTES; + + for(i = 0; i < SEEDBYTES; ++i) + sk[i] = key[i]; + sk += SEEDBYTES; + + for(i = 0; i < TRBYTES; ++i) + sk[i] = tr[i]; + sk += TRBYTES; + + for(i = 0; i < L; ++i) + polyeta_pack(sk + i*POLYETA_PACKEDBYTES, &s1->vec[i]); + sk += L*POLYETA_PACKEDBYTES; + + for(i = 0; i < K; ++i) + polyeta_pack(sk + i*POLYETA_PACKEDBYTES, &s2->vec[i]); + sk += K*POLYETA_PACKEDBYTES; + + for(i = 0; i < K; ++i) + polyt0_pack(sk + i*POLYT0_PACKEDBYTES, &t0->vec[i]); +} + +/************************************************* +* Name: unpack_sk +* +* Description: Unpack secret key sk = (rho, tr, key, t0, s1, s2). 
+* +* Arguments: - const uint8_t rho[]: output byte array for rho +* - const uint8_t tr[]: output byte array for tr +* - const uint8_t key[]: output byte array for key +* - const polyveck *t0: pointer to output vector t0 +* - const polyvecl *s1: pointer to output vector s1 +* - const polyveck *s2: pointer to output vector s2 +* - uint8_t sk[]: byte array containing bit-packed sk +**************************************************/ +void unpack_sk(uint8_t rho[SEEDBYTES], + uint8_t tr[TRBYTES], + uint8_t key[SEEDBYTES], + polyveck *t0, + smallpoly s1[L], + smallpoly s2[K], + const uint8_t sk[CRYPTO_SECRETKEYBYTES]) +{ + unsigned int i; + + for(i = 0; i < SEEDBYTES; ++i) + rho[i] = sk[i]; + sk += SEEDBYTES; + + for(i = 0; i < SEEDBYTES; ++i) + key[i] = sk[i]; + sk += SEEDBYTES; + + for(i = 0; i < TRBYTES; ++i) + tr[i] = sk[i]; + sk += TRBYTES; + + for(i=0; i < L; ++i) + small_polyeta_unpack(&s1[i], sk + i*POLYETA_PACKEDBYTES); + sk += L*POLYETA_PACKEDBYTES; + + for(i=0; i < K; ++i) + small_polyeta_unpack(&s2[i], sk + i*POLYETA_PACKEDBYTES); + sk += K*POLYETA_PACKEDBYTES; + + for(i=0; i < K; ++i) + polyt0_unpack(&t0->vec[i], sk + i*POLYT0_PACKEDBYTES); +} + + +/************************************************* +* Name: pack_sig +* +* Description: Bit-pack signature sig = (c, z, h). 
+* +* Arguments: - uint8_t sig[]: output byte array +* - const uint8_t *c: pointer to challenge hash length SEEDBYTES +* - const polyvecl *z: pointer to vector z +* - const polyveck *h: pointer to hint vector h +**************************************************/ +void pack_sig(uint8_t sig[CRYPTO_BYTES], + const uint8_t c[CTILDEBYTES], + const polyvecl *z, + const polyveck *h) +{ + unsigned int i, j, k; + + for(i=0; i < CTILDEBYTES; ++i) + sig[i] = c[i]; + sig += CTILDEBYTES; + + for(i = 0; i < L; ++i) + polyz_pack(sig + i*POLYZ_PACKEDBYTES, &z->vec[i]); + sig += L*POLYZ_PACKEDBYTES; + + /* Encode h */ + for(i = 0; i < OMEGA + K; ++i) + sig[i] = 0; + + k = 0; + for(i = 0; i < K; ++i) { + for(j = 0; j < N; ++j) + if(h->vec[i].coeffs[j] != 0) + sig[k++] = j; + + sig[OMEGA + i] = k; + } +} + +void pack_sig_c(uint8_t sig[CRYPTO_BYTES], + const uint8_t c[CTILDEBYTES]) +{ + unsigned int i; + + for(i=0; i < CTILDEBYTES; ++i) + sig[i] = c[i]; + sig += CTILDEBYTES; +} + +void pack_sig_z(uint8_t sig[CRYPTO_BYTES], + const polyvecl *z) +{ + unsigned int i; + sig += CTILDEBYTES; + for(i = 0; i < L; ++i) + polyz_pack(sig + i*POLYZ_PACKEDBYTES, &z->vec[i]); +} + + +void pack_sig_h(unsigned char sig[CRYPTO_BYTES], + const poly *h_elem, + const unsigned int idx, + unsigned int *hints_written) +{ + sig += CTILDEBYTES; + sig += L*POLYZ_PACKEDBYTES; + + // Encode h + for (unsigned int j = 0; j < N; j++) { + if (h_elem->coeffs[j] != 0) { + sig[*hints_written] = (uint8_t)j; + (*hints_written)++; + } + } + sig[OMEGA + idx] = (uint8_t)*hints_written; +} + +void pack_sig_h_zero(unsigned char sig[CRYPTO_BYTES], + unsigned int *hints_written) { + sig += CTILDEBYTES; + sig += L * POLYZ_PACKEDBYTES; + while (*hints_written < OMEGA) { + sig[*hints_written] = 0; + (*hints_written)++; + } +} + +/************************************************* +* Name: unpack_sig +* +* Description: Unpack signature sig = (c, z, h). 
+* +* Arguments: - uint8_t *c: pointer to output challenge hash +* - polyvecl *z: pointer to output vector z +* - polyveck *h: pointer to output hint vector h +* - const uint8_t sig[]: byte array containing +* bit-packed signature +* +* Returns 1 in case of malformed signature; otherwise 0. +**************************************************/ +int unpack_sig(uint8_t c[CTILDEBYTES], + polyvecl *z, + polyveck *h, + const uint8_t sig[CRYPTO_BYTES]) +{ + unsigned int i, j, k; + + for(i = 0; i < CTILDEBYTES; ++i) + c[i] = sig[i]; + sig += CTILDEBYTES; + + for(i = 0; i < L; ++i) + polyz_unpack(&z->vec[i], sig + i*POLYZ_PACKEDBYTES); + sig += L*POLYZ_PACKEDBYTES; + + /* Decode h */ + k = 0; + for(i = 0; i < K; ++i) { + for(j = 0; j < N; ++j) + h->vec[i].coeffs[j] = 0; + + if(sig[OMEGA + i] < k || sig[OMEGA + i] > OMEGA) + return 1; + + for(j = k; j < sig[OMEGA + i]; ++j) { + /* Coefficients are ordered for strong unforgeability */ + if(j > k && sig[j] <= sig[j-1]) return 1; + h->vec[i].coeffs[sig[j]] = 1; + } + + k = sig[OMEGA + i]; + } + + /* Extra indices are zero for strong unforgeability */ + for(j = k; j < OMEGA; ++j) + if(sig[j]) + return 1; + + return 0; +} \ No newline at end of file diff --git a/crypto_sign/dilithium3/m4fstack/packing.h b/crypto_sign/dilithium3/m4fstack/packing.h new file mode 100644 index 00000000..35553545 --- /dev/null +++ b/crypto_sign/dilithium3/m4fstack/packing.h @@ -0,0 +1,55 @@ +#ifndef PACKING_H +#define PACKING_H + +#include +#include "params.h" +#include "polyvec.h" +#include "smallpoly.h" + +#define pack_pk DILITHIUM_NAMESPACE(pack_pk) +void pack_pk(uint8_t pk[CRYPTO_PUBLICKEYBYTES], const uint8_t rho[SEEDBYTES], const polyveck *t1); + +#define pack_sk DILITHIUM_NAMESPACE(pack_sk) +void pack_sk(uint8_t sk[CRYPTO_SECRETKEYBYTES], + const uint8_t rho[SEEDBYTES], + const uint8_t tr[TRBYTES], + const uint8_t key[SEEDBYTES], + const polyveck *t0, + const polyvecl *s1, + const polyveck *s2); + +#define pack_sig DILITHIUM_NAMESPACE(pack_sig) 
+void pack_sig(uint8_t sig[CRYPTO_BYTES], const uint8_t c[CTILDEBYTES], const polyvecl *z, const polyveck *h); + +#define unpack_pk DILITHIUM_NAMESPACE(unpack_pk) +void unpack_pk(uint8_t rho[SEEDBYTES], polyveck *t1, const uint8_t pk[CRYPTO_PUBLICKEYBYTES]); + +#define unpack_sk DILITHIUM_NAMESPACE(unpack_sk) +void unpack_sk(uint8_t rho[SEEDBYTES], + uint8_t tr[TRBYTES], + uint8_t key[SEEDBYTES], + polyveck *t0, + smallpoly s1[L], + smallpoly s2[K], + const uint8_t sk[CRYPTO_SECRETKEYBYTES]); + +#define unpack_sig DILITHIUM_NAMESPACE(unpack_sig) +int unpack_sig(uint8_t c[CTILDEBYTES], polyvecl *z, polyveck *h, const uint8_t sig[CRYPTO_BYTES]); + +#define pack_sig_c DILITHIUM_NAMESPACE(pack_sig_c) +void pack_sig_c(uint8_t sig[CRYPTO_BYTES], const uint8_t c[CTILDEBYTES]); + +#define pack_sig_z DILITHIUM_NAMESPACE(pack_sig_z) +void pack_sig_z(uint8_t sig[CRYPTO_BYTES], const polyvecl *z); + +#define pack_sig_h DILITHIUM_NAMESPACE(pack_sig_h) +void pack_sig_h(unsigned char sig[CRYPTO_BYTES], + const poly *h_elem, + const unsigned int idx, + unsigned int *hints_written); + +#define pack_sig_h_zero DILITHIUM_NAMESPACE(pack_sig_h_zero) +void pack_sig_h_zero(unsigned char sig[CRYPTO_BYTES], + unsigned int *hints_written); + +#endif diff --git a/crypto_sign/dilithium3/m4fstack/params.h b/crypto_sign/dilithium3/m4fstack/params.h new file mode 100644 index 00000000..507de467 --- /dev/null +++ b/crypto_sign/dilithium3/m4fstack/params.h @@ -0,0 +1,83 @@ +#ifndef PARAMS_H +#define PARAMS_H + +#include "config.h" + +#define DILITHIUM_NAMESPACE(s) pqcrystals_dilithium_##s + + +#define SEEDBYTES 32 +#define CRHBYTES 64 +#define TRBYTES 64 +#define RNDBYTES 32 +#define N 256 +#define Q 8380417 +#define D 13 +#define ROOT_OF_UNITY 1753 + +#if DILITHIUM_MODE == 2 +#define K 4 +#define L 4 +#define ETA 2 +#define TAU 39 +#define BETA 78 +#define GAMMA1 (1 << 17) +#define GAMMA2 ((Q-1)/88) +#define OMEGA 80 +#define CTILDEBYTES 32 + +#elif DILITHIUM_MODE == 3 +#define K 6 +#define L 5 
+#define ETA 4 +#define TAU 49 +#define BETA 196 +#define GAMMA1 (1 << 19) +#define GAMMA2 ((Q-1)/32) +#define OMEGA 55 +#define CTILDEBYTES 48 + +#elif DILITHIUM_MODE == 5 +#define K 8 +#define L 7 +#define ETA 2 +#define TAU 60 +#define BETA 120 +#define GAMMA1 (1 << 19) +#define GAMMA2 ((Q-1)/32) +#define OMEGA 75 +#define CTILDEBYTES 64 + +#endif + +#define POLYT1_PACKEDBYTES 320 +#define POLYT0_PACKEDBYTES 416 +#define POLYVECH_PACKEDBYTES (OMEGA + K) + +#if GAMMA1 == (1 << 17) +#define POLYZ_PACKEDBYTES 576 +#elif GAMMA1 == (1 << 19) +#define POLYZ_PACKEDBYTES 640 +#endif + +#if GAMMA2 == (Q-1)/88 +#define POLYW1_PACKEDBYTES 192 +#elif GAMMA2 == (Q-1)/32 +#define POLYW1_PACKEDBYTES 128 +#endif + +#if ETA == 2 +#define POLYETA_PACKEDBYTES 96 +#elif ETA == 4 +#define POLYETA_PACKEDBYTES 128 +#endif + +#define CRYPTO_PUBLICKEYBYTES (SEEDBYTES + K*POLYT1_PACKEDBYTES) +#define CRYPTO_SECRETKEYBYTES (2*SEEDBYTES \ + + TRBYTES \ + + L*POLYETA_PACKEDBYTES \ + + K*POLYETA_PACKEDBYTES \ + + K*POLYT0_PACKEDBYTES) +#define CRYPTO_BYTES (CTILDEBYTES + L*POLYZ_PACKEDBYTES + POLYVECH_PACKEDBYTES) + +#endif \ No newline at end of file diff --git a/crypto_sign/dilithium3/m4fstack/pointwise_mont.h b/crypto_sign/dilithium3/m4fstack/pointwise_mont.h new file mode 100644 index 00000000..2647a110 --- /dev/null +++ b/crypto_sign/dilithium3/m4fstack/pointwise_mont.h @@ -0,0 +1,13 @@ +#ifndef POINTWISE_MONT_H +#define POINTWISE_MONT_H + +#include +#include "params.h" + + +#define asm_pointwise_montgomery DILITHIUM_NAMESPACE(asm_pointwise_montgomery) +void asm_pointwise_montgomery(int32_t c[N], const int32_t a[N], const int32_t b[N]); +#define asm_pointwise_acc_montgomery DILITHIUM_NAMESPACE(asm_pointwise_acc_montgomery) +void asm_pointwise_acc_montgomery(int32_t c[N], const int32_t a[N], const int32_t b[N]); + +#endif diff --git a/crypto_sign/dilithium3/m4fstack/pointwise_mont.s b/crypto_sign/dilithium3/m4fstack/pointwise_mont.s new file mode 100644 index 00000000..e21125d7 --- 
/dev/null +++ b/crypto_sign/dilithium3/m4fstack/pointwise_mont.s @@ -0,0 +1,128 @@ +.syntax unified +.thumb + +.macro montgomery_multiplication res, pa, pb, q, qinv + smull \pa, \res, \pa, \pb + mul \pb, \pa, \qinv + smlal \pa, \res, \pb, \q +.endm + + +// void asm_pointwise_montgomery(int32_t c[N], const int32_t a[N], const int32_t b[N]); +.global pqcrystals_dilithium_asm_pointwise_montgomery +.type pqcrystals_dilithium_asm_pointwise_montgomery,%function +.align 2 +pqcrystals_dilithium_asm_pointwise_montgomery: + push.w {r4-r11, r14} + c_ptr .req r0 + a_ptr .req r1 + b_ptr .req r2 + qinv .req r3 + q .req r4 + pa0 .req r5 + pa1 .req r6 + pa2 .req r7 + pb0 .req r8 + pb1 .req r9 + pb2 .req r10 + tmp0 .req r11 + ctr .req r12 + res .req r14 + + movw qinv, #:lower16:0xfc7fdfff + movt qinv, #:upper16:0xfc7fdfff + movw q, #0xE001 + movt q, #0x7F + + + // 85x3 = 255 coefficients + movw ctr, #85 + 1: + ldr.w pa1, [a_ptr, #4] + ldr.w pa2, [a_ptr, #8] + ldr pa0, [a_ptr], #12 + ldr.w pb1, [b_ptr, #4] + ldr.w pb2, [b_ptr, #8] + ldr pb0, [b_ptr], #12 + + montgomery_multiplication res, pa0, pb0, q, qinv + str res, [c_ptr], #4 + montgomery_multiplication res, pa1, pb1, q, qinv + str res, [c_ptr], #4 + montgomery_multiplication res, pa2, pb2, q, qinv + str res, [c_ptr], #4 + subs ctr, #1 + bne.w 1b + + // final coefficient + ldr.w pa0, [a_ptr] + ldr.w pb0, [b_ptr] + montgomery_multiplication res, pa0, pb0, q, qinv + str.w res, [c_ptr] + + pop.w {r4-r11, pc} +.size pqcrystals_dilithium_asm_pointwise_montgomery, .-pqcrystals_dilithium_asm_pointwise_montgomery + +// void asm_pointwise_acc_montgomery(int32_t c[N], const int32_t a[N], const int32_t b[N]); +.global pqcrystals_dilithium_asm_pointwise_acc_montgomery +.type pqcrystals_dilithium_asm_pointwise_acc_montgomery,%function +.align 2 +pqcrystals_dilithium_asm_pointwise_acc_montgomery: + push.w {r4-r11, r14} + c_ptr .req r0 + a_ptr .req r1 + b_ptr .req r2 + qinv .req r3 + q .req r4 + pa0 .req r5 + pa1 .req r6 + pa2 .req r7 + pb0 
.req r8 + pb1 .req r9 + pb2 .req r10 + tmp0 .req r11 + ctr .req r12 + res .req r14 + + movw qinv, #:lower16:0xfc7fdfff + movt qinv, #:upper16:0xfc7fdfff + movw q, #0xE001 + movt q, #0x7F + + + // 85x3 = 255 coefficients + movw ctr, #85 + 1: + ldr.w pa1, [a_ptr, #4] + ldr.w pa2, [a_ptr, #8] + ldr pa0, [a_ptr], #12 + ldr.w pb1, [b_ptr, #4] + ldr.w pb2, [b_ptr, #8] + ldr pb0, [b_ptr], #12 + + montgomery_multiplication res, pa0, pb0, q, qinv + montgomery_multiplication pa0, pa1, pb1, q, qinv + montgomery_multiplication pa1, pa2, pb2, q, qinv + + ldr.w pb0, [c_ptr] + ldr.w pb1, [c_ptr, #4] + ldr.w pb2, [c_ptr, #8] + add.w res, res, pb0 + str res, [c_ptr], #12 + add.w pa0, pa0, pb1 + str pa0, [c_ptr, #-8] + add.w pa1, pa1, pb2 + str pa1, [c_ptr, #-4] + subs ctr, #1 + bne.w 1b + + // final coefficient + ldr.w pa0, [a_ptr] + ldr.w pb0, [b_ptr] + ldr.w pa1, [c_ptr] + montgomery_multiplication res, pa0, pb0, q, qinv + add.w res, res, pa1 + str.w res, [c_ptr] + + pop.w {r4-r11, pc} +.size pqcrystals_dilithium_asm_pointwise_acc_montgomery, .-pqcrystals_dilithium_asm_pointwise_acc_montgomery diff --git a/crypto_sign/dilithium3/m4fstack/poly.c b/crypto_sign/dilithium3/m4fstack/poly.c new file mode 100644 index 00000000..0d40fda3 --- /dev/null +++ b/crypto_sign/dilithium3/m4fstack/poly.c @@ -0,0 +1,851 @@ +#include +#include "params.h" +#include "poly.h" +#include "vector.h" +#include "ntt.h" +#include "pointwise_mont.h" +#include "rounding.h" +#include "symmetric.h" + +#include +#include "hal.h" + +#ifdef DBENCH +#include "test/cpucycles.h" +extern const uint64_t timing_overhead; +extern uint64_t *tred, *tadd, *tmul, *tround, *tsample, *tpack; +#define DBENCH_START() uint64_t time = cpucycles() +#define DBENCH_STOP(t) t += cpucycles() - time - timing_overhead +#else +#define DBENCH_START() +#define DBENCH_STOP(t) +#endif + +/************************************************* +* Name: poly_reduce +* +* Description: Inplace reduction of all coefficients of polynomial to +* 
representative in [-6283009,6283007]. +* +* Arguments: - poly *a: pointer to input/output polynomial +**************************************************/ +void poly_reduce(poly *a) { + asm_reduce32(a->coeffs); +} + +/************************************************* +* Name: poly_caddq +* +* Description: For all coefficients of in/out polynomial add Q if +* coefficient is negative. +* +* Arguments: - poly *a: pointer to input/output polynomial +**************************************************/ +void poly_caddq(poly *a) { + asm_caddq(a->coeffs); +} + +#if 0 +/************************************************* +* Name: poly_freeze +* +* Description: Inplace reduction of all coefficients of polynomial to +* standard representatives. +* +* Arguments: - poly *a: pointer to input/output polynomial +**************************************************/ +void poly_freeze(poly *a) { + asm_freeze(a->coeffs); +} +#endif + +/************************************************* +* Name: poly_add +* +* Description: Add polynomials. No modular reduction is performed. +* +* Arguments: - poly *c: pointer to output polynomial +* - const poly *a: pointer to first summand +* - const poly *b: pointer to second summand +**************************************************/ +void poly_add(poly *c, const poly *a, const poly *b) { + unsigned int i; + DBENCH_START(); + + for(i = 0; i < N; ++i) + c->coeffs[i] = a->coeffs[i] + b->coeffs[i]; + + DBENCH_STOP(*tadd); +} + +/************************************************* +* Name: poly_sub +* +* Description: Subtract polynomials. No modular reduction is +* performed. 
+* +* Arguments: - poly *c: pointer to output polynomial +* - const poly *a: pointer to first input polynomial +* - const poly *b: pointer to second input polynomial to be +* subtraced from first input polynomial +**************************************************/ +void poly_sub(poly *c, const poly *a, const poly *b) { + unsigned int i; + DBENCH_START(); + + for(i = 0; i < N; ++i) + c->coeffs[i] = a->coeffs[i] - b->coeffs[i]; + + DBENCH_STOP(*tadd); +} + +/************************************************* +* Name: poly_shiftl +* +* Description: Multiply polynomial by 2^D without modular reduction. Assumes +* input coefficients to be less than 2^{31-D} in absolute value. +* +* Arguments: - poly *a: pointer to input/output polynomial +**************************************************/ +void poly_shiftl(poly *a) { + unsigned int i; + DBENCH_START(); + + for(i = 0; i < N; ++i) + a->coeffs[i] <<= D; + + DBENCH_STOP(*tmul); +} + +/************************************************* +* Name: poly_ntt +* +* Description: Inplace forward NTT. Coefficients can grow by +* 8*Q in absolute value. +* +* Arguments: - poly *a: pointer to input/output polynomial +**************************************************/ +void poly_ntt(poly *a) { + DBENCH_START(); + + ntt(a->coeffs); + + DBENCH_STOP(*tmul); +} + + +/************************************************* +* Name: poly_invntt_tomont +* +* Description: Inplace inverse NTT and multiplication by 2^{32}. +* Input coefficients need to be less than Q in absolute +* value and output coefficients are again bounded by Q. 
+* +* Arguments: - poly *a: pointer to input/output polynomial +**************************************************/ +void poly_invntt_tomont(poly *a) { + DBENCH_START(); + + invntt_tomont(a->coeffs); + + DBENCH_STOP(*tmul); +} + + +/************************************************* +* Name: poly_pointwise_montgomery +* +* Description: Pointwise multiplication of polynomials in NTT domain +* representation and multiplication of resulting polynomial +* by 2^{-32}. +* +* Arguments: - poly *c: pointer to output polynomial +* - const poly *a: pointer to first input polynomial +* - const poly *b: pointer to second input polynomial +**************************************************/ +void poly_pointwise_montgomery(poly *c, const poly *a, const poly *b) { + DBENCH_START(); + + asm_pointwise_montgomery(c->coeffs, a->coeffs, b->coeffs); + + DBENCH_STOP(*tmul); +} + +/************************************************* +* Name: poly_pointwise_acc_montgomery +* +* Description: Pointwise multiplication of polynomials in NTT domain +* representation, multiplication of resulting polynomial +* by 2^{-32} and accumulate. +* +* Arguments: - poly *c: pointer to output (accumulating) polynomial +* - const poly *a: pointer to first input polynomial +* - const poly *b: pointer to second input polynomial +**************************************************/ +void poly_pointwise_acc_montgomery(poly *c, const poly *a, const poly *b) { + DBENCH_START(); + + asm_pointwise_acc_montgomery(c->coeffs, a->coeffs, b->coeffs); + + DBENCH_STOP(*tmul); +} + + +/************************************************* +* Name: poly_power2round +* +* Description: For all coefficients c of the input polynomial, +* compute c0, c1 such that c mod Q = c1*2^D + c0 +* with -2^{D-1} < c0 <= 2^{D-1}. Assumes coefficients to be +* standard representatives. 
+* +* Arguments: - poly *a1: pointer to output polynomial with coefficients c1 +* - poly *a0: pointer to output polynomial with coefficients c0 +* - const poly *a: pointer to input polynomial +**************************************************/ +void poly_power2round(poly *a1, poly *a0, const poly *a) { + unsigned int i; + DBENCH_START(); + + for(i = 0; i < N; ++i) + a1->coeffs[i] = power2round(&a0->coeffs[i], a->coeffs[i]); + + DBENCH_STOP(*tround); +} + +/************************************************* +* Name: poly_decompose +* +* Description: For all coefficients c of the input polynomial, +* compute high and low bits c0, c1 such c mod Q = c1*ALPHA + c0 +* with -ALPHA/2 < c0 <= ALPHA/2 except c1 = (Q-1)/ALPHA where we +* set c1 = 0 and -ALPHA/2 <= c0 = c mod Q - Q < 0. +* Assumes coefficients to be standard representatives. +* +* Arguments: - poly *a1: pointer to output polynomial with coefficients c1 +* - poly *a0: pointer to output polynomial with coefficients c0 +* - const poly *a: pointer to input polynomial +**************************************************/ +void poly_decompose(poly *a1, poly *a0, const poly *a) { + unsigned int i; + DBENCH_START(); + + for(i = 0; i < N; ++i) + a1->coeffs[i] = decompose(&a0->coeffs[i], a->coeffs[i]); + + DBENCH_STOP(*tround); +} + +/************************************************* +* Name: poly_make_hint +* +* Description: Compute hint polynomial. The coefficients of which indicate +* whether the low bits of the corresponding coefficient of +* the input polynomial overflow into the high bits. +* +* Arguments: - poly *h: pointer to output hint polynomial +* - const poly *a0: pointer to low part of input polynomial +* - const poly *a1: pointer to high part of input polynomial +* +* Returns number of 1 bits. 
+**************************************************/ +unsigned int poly_make_hint(poly *h, const poly *a0, const poly *a1) { + unsigned int i, s = 0; + DBENCH_START(); + + for(i = 0; i < N; ++i) { + h->coeffs[i] = make_hint(a0->coeffs[i], a1->coeffs[i]); + s += h->coeffs[i]; + } + + DBENCH_STOP(*tround); + return s; +} + +/************************************************* +* Name: poly_use_hint +* +* Description: Use hint polynomial to correct the high bits of a polynomial. +* +* Arguments: - poly *b: pointer to output polynomial with corrected high bits +* - const poly *a: pointer to input polynomial +* - const poly *h: pointer to input hint polynomial +**************************************************/ +void poly_use_hint(poly *b, const poly *a, const poly *h) { + unsigned int i; + DBENCH_START(); + + for(i = 0; i < N; ++i) + b->coeffs[i] = use_hint(a->coeffs[i], h->coeffs[i]); + + DBENCH_STOP(*tround); +} + +/************************************************* +* Name: poly_chknorm +* +* Description: Check infinity norm of polynomial against given bound. +* Assumes input coefficients were reduced by reduce32(). +* +* Arguments: - const poly *a: pointer to polynomial +* - int32_t B: norm bound +* +* Returns 0 if norm is strictly smaller than B <= (Q-1)/8 and 1 otherwise. +**************************************************/ +int poly_chknorm(const poly *a, int32_t B) { + unsigned int i; + int32_t t; + DBENCH_START(); + + if(B > (Q-1)/8) + return 1; + + /* It is ok to leak which coefficient violates the bound since + the probability for each coefficient is independent of secret + data but we must not leak the sign of the centralized representative. 
*/ + for(i = 0; i < N; ++i) { + /* Absolute value */ + t = a->coeffs[i] >> 31; + t = a->coeffs[i] - (t & 2*a->coeffs[i]); + + if(t >= B) { + DBENCH_STOP(*tsample); + return 1; + } + } + + DBENCH_STOP(*tsample); + return 0; +} + +/************************************************* +* Name: poly_uniform +* +* Description: Sample polynomial with uniformly random coefficients +* in [0,Q-1] by performing rejection sampling on the +* output stream of SHAKE256(seed|nonce). +* +* Arguments: - poly *a: pointer to output polynomial +* - const uint8_t seed[]: byte array with seed of length SEEDBYTES +* - uint16_t nonce: 2-byte nonce +**************************************************/ +#define POLY_UNIFORM_NBLOCKS ((768 + STREAM128_BLOCKBYTES - 1)/STREAM128_BLOCKBYTES) +void poly_uniform(poly *a, + const uint8_t seed[SEEDBYTES], + uint16_t nonce) +{ + unsigned int i, ctr, off; + unsigned int buflen = POLY_UNIFORM_NBLOCKS*STREAM128_BLOCKBYTES; + uint8_t buf[POLY_UNIFORM_NBLOCKS*STREAM128_BLOCKBYTES + 2]; + stream128_state state; + + stream128_init(&state, seed, nonce); + stream128_squeezeblocks(buf, POLY_UNIFORM_NBLOCKS, &state); + + ctr = asm_rej_uniform(a->coeffs, N, buf, buflen); + + while(ctr < N) { + off = buflen % 3; + for(i = 0; i < off; ++i) + buf[i] = buf[buflen - off + i]; + + stream128_squeezeblocks(buf + off, 1, &state); + buflen = STREAM128_BLOCKBYTES + off; + ctr += asm_rej_uniform(a->coeffs + ctr, N - ctr, buf, buflen); + } +} + +/************************************************* +* Name: rej_eta +* +* Description: Sample uniformly random coefficients in [-ETA, ETA] by +* performing rejection sampling on array of random bytes. +* +* Arguments: - int32_t *a: pointer to output array (allocated) +* - unsigned int len: number of coefficients to be sampled +* - const uint8_t *buf: array of random bytes +* - unsigned int buflen: length of array of random bytes +* +* Returns number of sampled coefficients. 
Can be smaller than len if not enough +* random bytes were given. +**************************************************/ +static unsigned int rej_eta(int32_t *a, + unsigned int len, + const uint8_t *buf, + unsigned int buflen) +{ + unsigned int ctr, pos; + uint32_t t0, t1; + DBENCH_START(); + + ctr = pos = 0; + while(ctr < len && pos < buflen) { + t0 = buf[pos] & 0x0F; + t1 = buf[pos++] >> 4; + +#if ETA == 2 + if(t0 < 15) { + t0 = t0 - (205*t0 >> 10)*5; + a[ctr++] = 2 - t0; + } + if(t1 < 15 && ctr < len) { + t1 = t1 - (205*t1 >> 10)*5; + a[ctr++] = 2 - t1; + } +#elif ETA == 4 + if(t0 < 9) + a[ctr++] = 4 - t0; + if(t1 < 9 && ctr < len) + a[ctr++] = 4 - t1; +#endif + } + + DBENCH_STOP(*tsample); + return ctr; +} + +/************************************************* +* Name: poly_uniform_eta +* +* Description: Sample polynomial with uniformly random coefficients +* in [-ETA,ETA] by performing rejection sampling on the +* output stream from SHAKE256(seed|nonce). +* +* Arguments: - poly *a: pointer to output polynomial +* - const uint8_t seed[]: byte array with seed of length SEEDBYTES +* - uint16_t nonce: 2-byte nonce +**************************************************/ +#if ETA == 2 +#define POLY_UNIFORM_ETA_NBLOCKS ((136 + STREAM256_BLOCKBYTES - 1)/STREAM256_BLOCKBYTES) +#elif ETA == 4 +#define POLY_UNIFORM_ETA_NBLOCKS ((227 + STREAM256_BLOCKBYTES - 1)/STREAM256_BLOCKBYTES) +#endif +void poly_uniform_eta(poly *a, + const uint8_t seed[CRHBYTES], + uint16_t nonce) { + unsigned int ctr; + unsigned int buflen = POLY_UNIFORM_ETA_NBLOCKS * STREAM256_BLOCKBYTES; + uint8_t buf[POLY_UNIFORM_ETA_NBLOCKS * STREAM256_BLOCKBYTES]; + stream256_state state; + + stream256_init(&state, seed, nonce); + stream256_squeezeblocks(buf, POLY_UNIFORM_ETA_NBLOCKS, &state); + + ctr = rej_eta(a->coeffs, N, buf, buflen); + + while(ctr < N) { + stream256_squeezeblocks(buf, 1, &state); + ctr += rej_eta(a->coeffs + ctr, N - ctr, buf, STREAM256_BLOCKBYTES); + } +} + 
+/************************************************* +* Name: poly_uniform_gamma1m1 +* +* Description: Sample polynomial with uniformly random coefficients +* in [-(GAMMA1 - 1), GAMMA1] by unpacking output stream +* of SHAKE256(seed|nonce). +* +* Arguments: - poly *a: pointer to output polynomial +* - const uint8_t seed[]: byte array with seed of length CRHBYTES +* - uint16_t nonce: 16-bit nonce +**************************************************/ +#define POLY_UNIFORM_GAMMA1_NBLOCKS ((POLYZ_PACKEDBYTES + STREAM256_BLOCKBYTES - 1)/STREAM256_BLOCKBYTES) +void poly_uniform_gamma1(poly *a, + const uint8_t seed[CRHBYTES], + uint16_t nonce) +{ + uint8_t buf[POLY_UNIFORM_GAMMA1_NBLOCKS*STREAM256_BLOCKBYTES]; + stream256_state state; + + stream256_init(&state, seed, nonce); + stream256_squeezeblocks(buf, POLY_UNIFORM_GAMMA1_NBLOCKS, &state); + polyz_unpack(a, buf); +} + +/************************************************* +* Name: challenge +* +* Description: Implementation of H. Samples polynomial with TAU nonzero +* coefficients in {-1,1} using the output stream of +* SHAKE256(seed). 
+* +* Arguments: - poly *c: pointer to output polynomial +* - const uint8_t mu[]: byte array containing seed of length SEEDBYTES +**************************************************/ +void poly_challenge(poly *c, const uint8_t seed[SEEDBYTES]) { + unsigned int i, b, pos; + uint64_t signs; + uint8_t buf[SHAKE256_RATE]; + shake256incctx state; + + shake256_inc_init(&state); + shake256_inc_absorb(&state, seed, SEEDBYTES); + shake256_inc_finalize(&state); + shake256_inc_squeezeblocks(buf, 1, &state); + + signs = 0; + for(i = 0; i < 8; ++i) + signs |= (uint64_t)buf[i] << 8*i; + pos = 8; + + for(i = 0; i < N; ++i) + c->coeffs[i] = 0; + for(i = N-TAU; i < N; ++i) { + do { + if(pos >= SHAKE256_RATE) { + shake256_inc_squeezeblocks(buf, 1, &state); + pos = 0; + } + + b = buf[pos++]; + } while(b > i); + + c->coeffs[i] = c->coeffs[b]; + c->coeffs[b] = 1 - 2*(signs & 1); + signs >>= 1; + } +} + +/************************************************* +* Name: polyeta_pack +* +* Description: Bit-pack polynomial with coefficients in [-ETA,ETA]. 
+* +* Arguments: - uint8_t *r: pointer to output byte array with at least +* POLYETA_PACKEDBYTES bytes +* - const poly *a: pointer to input polynomial +**************************************************/ +void polyeta_pack(uint8_t *r, const poly *a) { + unsigned int i; + uint8_t t[8]; + DBENCH_START(); + +#if ETA == 2 + for(i = 0; i < N/8; ++i) { + t[0] = ETA - a->coeffs[8*i+0]; + t[1] = ETA - a->coeffs[8*i+1]; + t[2] = ETA - a->coeffs[8*i+2]; + t[3] = ETA - a->coeffs[8*i+3]; + t[4] = ETA - a->coeffs[8*i+4]; + t[5] = ETA - a->coeffs[8*i+5]; + t[6] = ETA - a->coeffs[8*i+6]; + t[7] = ETA - a->coeffs[8*i+7]; + + r[3*i+0] = (t[0] >> 0) | (t[1] << 3) | (t[2] << 6); + r[3*i+1] = (t[2] >> 2) | (t[3] << 1) | (t[4] << 4) | (t[5] << 7); + r[3*i+2] = (t[5] >> 1) | (t[6] << 2) | (t[7] << 5); + } +#elif ETA == 4 + for(i = 0; i < N/2; ++i) { + t[0] = ETA - a->coeffs[2*i+0]; + t[1] = ETA - a->coeffs[2*i+1]; + r[i] = t[0] | (t[1] << 4); + } +#endif + + DBENCH_STOP(*tpack); +} + + +/************************************************* +* Name: polyt1_pack +* +* Description: Bit-pack polynomial t1 with coefficients fitting in 10 bits. +* Input coefficients are assumed to be standard representatives. +* +* Arguments: - uint8_t *r: pointer to output byte array with at least +* POLYT1_PACKEDBYTES bytes +* - const poly *a: pointer to input polynomial +**************************************************/ +void polyt1_pack(uint8_t *r, const poly *a) { + unsigned int i; + DBENCH_START(); + + for(i = 0; i < N/4; ++i) { + r[5*i+0] = (a->coeffs[4*i+0] >> 0); + r[5*i+1] = (a->coeffs[4*i+0] >> 8) | (a->coeffs[4*i+1] << 2); + r[5*i+2] = (a->coeffs[4*i+1] >> 6) | (a->coeffs[4*i+2] << 4); + r[5*i+3] = (a->coeffs[4*i+2] >> 4) | (a->coeffs[4*i+3] << 6); + r[5*i+4] = (a->coeffs[4*i+3] >> 2); + } + + DBENCH_STOP(*tpack); +} + +/************************************************* +* Name: polyt1_unpack +* +* Description: Unpack polynomial t1 with 10-bit coefficients. 
+* Output coefficients are standard representatives. +* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *a: byte array with bit-packed polynomial +**************************************************/ +void polyt1_unpack(poly *r, const uint8_t *a) { + unsigned int i; + DBENCH_START(); + + for(i = 0; i < N/4; ++i) { + r->coeffs[4*i+0] = ((a[5*i+0] >> 0) | ((uint32_t)a[5*i+1] << 8)) & 0x3FF; + r->coeffs[4*i+1] = ((a[5*i+1] >> 2) | ((uint32_t)a[5*i+2] << 6)) & 0x3FF; + r->coeffs[4*i+2] = ((a[5*i+2] >> 4) | ((uint32_t)a[5*i+3] << 4)) & 0x3FF; + r->coeffs[4*i+3] = ((a[5*i+3] >> 6) | ((uint32_t)a[5*i+4] << 2)) & 0x3FF; + } + + DBENCH_STOP(*tpack); +} + +/************************************************* +* Name: polyt0_pack +* +* Description: Bit-pack polynomial t0 with coefficients in ]-2^{D-1}, 2^{D-1}]. +* +* Arguments: - uint8_t *r: pointer to output byte array with at least +* POLYT0_PACKEDBYTES bytes +* - const poly *a: pointer to input polynomial +**************************************************/ +void polyt0_pack(uint8_t *r, const poly *a) { + unsigned int i; + uint32_t t[8]; + DBENCH_START(); + + for(i = 0; i < N/8; ++i) { + t[0] = (1 << (D-1)) - a->coeffs[8*i+0]; + t[1] = (1 << (D-1)) - a->coeffs[8*i+1]; + t[2] = (1 << (D-1)) - a->coeffs[8*i+2]; + t[3] = (1 << (D-1)) - a->coeffs[8*i+3]; + t[4] = (1 << (D-1)) - a->coeffs[8*i+4]; + t[5] = (1 << (D-1)) - a->coeffs[8*i+5]; + t[6] = (1 << (D-1)) - a->coeffs[8*i+6]; + t[7] = (1 << (D-1)) - a->coeffs[8*i+7]; + + r[13*i+ 0] = t[0]; + r[13*i+ 1] = t[0] >> 8; + r[13*i+ 1] |= t[1] << 5; + r[13*i+ 2] = t[1] >> 3; + r[13*i+ 3] = t[1] >> 11; + r[13*i+ 3] |= t[2] << 2; + r[13*i+ 4] = t[2] >> 6; + r[13*i+ 4] |= t[3] << 7; + r[13*i+ 5] = t[3] >> 1; + r[13*i+ 6] = t[3] >> 9; + r[13*i+ 6] |= t[4] << 4; + r[13*i+ 7] = t[4] >> 4; + r[13*i+ 8] = t[4] >> 12; + r[13*i+ 8] |= t[5] << 1; + r[13*i+ 9] = t[5] >> 7; + r[13*i+ 9] |= t[6] << 6; + r[13*i+10] = t[6] >> 2; + r[13*i+11] = t[6] >> 10; + r[13*i+11] |= t[7] 
<< 3; + r[13*i+12] = t[7] >> 5; + } + + DBENCH_STOP(*tpack); +} + +/************************************************* +* Name: polyt0_unpack +* +* Description: Unpack polynomial t0 with coefficients in ]-2^{D-1}, 2^{D-1}]. +* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *a: byte array with bit-packed polynomial +**************************************************/ +void polyt0_unpack(poly *r, const uint8_t *a) { + unsigned int i; + DBENCH_START(); + + for(i = 0; i < N/8; ++i) { + r->coeffs[8*i+0] = a[13*i+0]; + r->coeffs[8*i+0] |= (uint32_t)a[13*i+1] << 8; + r->coeffs[8*i+0] &= 0x1FFF; + + r->coeffs[8*i+1] = a[13*i+1] >> 5; + r->coeffs[8*i+1] |= (uint32_t)a[13*i+2] << 3; + r->coeffs[8*i+1] |= (uint32_t)a[13*i+3] << 11; + r->coeffs[8*i+1] &= 0x1FFF; + + r->coeffs[8*i+2] = a[13*i+3] >> 2; + r->coeffs[8*i+2] |= (uint32_t)a[13*i+4] << 6; + r->coeffs[8*i+2] &= 0x1FFF; + + r->coeffs[8*i+3] = a[13*i+4] >> 7; + r->coeffs[8*i+3] |= (uint32_t)a[13*i+5] << 1; + r->coeffs[8*i+3] |= (uint32_t)a[13*i+6] << 9; + r->coeffs[8*i+3] &= 0x1FFF; + + r->coeffs[8*i+4] = a[13*i+6] >> 4; + r->coeffs[8*i+4] |= (uint32_t)a[13*i+7] << 4; + r->coeffs[8*i+4] |= (uint32_t)a[13*i+8] << 12; + r->coeffs[8*i+4] &= 0x1FFF; + + r->coeffs[8*i+5] = a[13*i+8] >> 1; + r->coeffs[8*i+5] |= (uint32_t)a[13*i+9] << 7; + r->coeffs[8*i+5] &= 0x1FFF; + + r->coeffs[8*i+6] = a[13*i+9] >> 6; + r->coeffs[8*i+6] |= (uint32_t)a[13*i+10] << 2; + r->coeffs[8*i+6] |= (uint32_t)a[13*i+11] << 10; + r->coeffs[8*i+6] &= 0x1FFF; + + r->coeffs[8*i+7] = a[13*i+11] >> 3; + r->coeffs[8*i+7] |= (uint32_t)a[13*i+12] << 5; + r->coeffs[8*i+7] &= 0x1FFF; + + r->coeffs[8*i+0] = (1 << (D-1)) - r->coeffs[8*i+0]; + r->coeffs[8*i+1] = (1 << (D-1)) - r->coeffs[8*i+1]; + r->coeffs[8*i+2] = (1 << (D-1)) - r->coeffs[8*i+2]; + r->coeffs[8*i+3] = (1 << (D-1)) - r->coeffs[8*i+3]; + r->coeffs[8*i+4] = (1 << (D-1)) - r->coeffs[8*i+4]; + r->coeffs[8*i+5] = (1 << (D-1)) - r->coeffs[8*i+5]; + r->coeffs[8*i+6] = (1 << (D-1)) - 
r->coeffs[8*i+6]; + r->coeffs[8*i+7] = (1 << (D-1)) - r->coeffs[8*i+7]; + } + + DBENCH_STOP(*tpack); +} + +/************************************************* +* Name: polyz_pack +* +* Description: Bit-pack polynomial with coefficients +* in [-(GAMMA1 - 1), GAMMA1]. +* +* Arguments: - uint8_t *r: pointer to output byte array with at least +* POLYZ_PACKEDBYTES bytes +* - const poly *a: pointer to input polynomial +**************************************************/ +void polyz_pack(uint8_t *r, const poly *a) { + unsigned int i; + uint32_t t[4]; + DBENCH_START(); + +#if GAMMA1 == (1 << 17) + for(i = 0; i < N/4; ++i) { + t[0] = GAMMA1 - a->coeffs[4*i+0]; + t[1] = GAMMA1 - a->coeffs[4*i+1]; + t[2] = GAMMA1 - a->coeffs[4*i+2]; + t[3] = GAMMA1 - a->coeffs[4*i+3]; + + r[9*i+0] = t[0]; + r[9*i+1] = t[0] >> 8; + r[9*i+2] = t[0] >> 16; + r[9*i+2] |= t[1] << 2; + r[9*i+3] = t[1] >> 6; + r[9*i+4] = t[1] >> 14; + r[9*i+4] |= t[2] << 4; + r[9*i+5] = t[2] >> 4; + r[9*i+6] = t[2] >> 12; + r[9*i+6] |= t[3] << 6; + r[9*i+7] = t[3] >> 2; + r[9*i+8] = t[3] >> 10; + } +#elif GAMMA1 == (1 << 19) + for(i = 0; i < N/2; ++i) { + t[0] = GAMMA1 - a->coeffs[2*i+0]; + t[1] = GAMMA1 - a->coeffs[2*i+1]; + + r[5*i+0] = t[0]; + r[5*i+1] = t[0] >> 8; + r[5*i+2] = t[0] >> 16; + r[5*i+2] |= t[1] << 4; + r[5*i+3] = t[1] >> 4; + r[5*i+4] = t[1] >> 12; + } +#endif + + DBENCH_STOP(*tpack); +} + +/************************************************* +* Name: polyz_unpack +* +* Description: Unpack polynomial z with coefficients +* in [-(GAMMA1 - 1), GAMMA1]. 
+* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *a: byte array with bit-packed polynomial +**************************************************/ +void polyz_unpack(poly *r, const uint8_t *a) { + unsigned int i; + DBENCH_START(); + +#if GAMMA1 == (1 << 17) /* 18-bit fields: 4 coefficients per 9 input bytes */ + for(i = 0; i < N/4; ++i) { + r->coeffs[4*i+0] = a[9*i+0]; + r->coeffs[4*i+0] |= (uint32_t)a[9*i+1] << 8; + r->coeffs[4*i+0] |= (uint32_t)a[9*i+2] << 16; + r->coeffs[4*i+0] &= 0x3FFFF; + + r->coeffs[4*i+1] = a[9*i+2] >> 2; + r->coeffs[4*i+1] |= (uint32_t)a[9*i+3] << 6; + r->coeffs[4*i+1] |= (uint32_t)a[9*i+4] << 14; + r->coeffs[4*i+1] &= 0x3FFFF; + + r->coeffs[4*i+2] = a[9*i+4] >> 4; + r->coeffs[4*i+2] |= (uint32_t)a[9*i+5] << 4; + r->coeffs[4*i+2] |= (uint32_t)a[9*i+6] << 12; + r->coeffs[4*i+2] &= 0x3FFFF; + + r->coeffs[4*i+3] = a[9*i+6] >> 6; + r->coeffs[4*i+3] |= (uint32_t)a[9*i+7] << 2; + r->coeffs[4*i+3] |= (uint32_t)a[9*i+8] << 10; + r->coeffs[4*i+3] &= 0x3FFFF; + + r->coeffs[4*i+0] = GAMMA1 - r->coeffs[4*i+0]; /* a packed field t stands for the coefficient GAMMA1 - t */ + r->coeffs[4*i+1] = GAMMA1 - r->coeffs[4*i+1]; + r->coeffs[4*i+2] = GAMMA1 - r->coeffs[4*i+2]; + r->coeffs[4*i+3] = GAMMA1 - r->coeffs[4*i+3]; + } +#elif GAMMA1 == (1 << 19) /* 20-bit fields: 2 coefficients per 5 input bytes */ + for(i = 0; i < N/2; ++i) { + r->coeffs[2*i+0] = a[5*i+0]; + r->coeffs[2*i+0] |= (uint32_t)a[5*i+1] << 8; + r->coeffs[2*i+0] |= (uint32_t)a[5*i+2] << 16; + r->coeffs[2*i+0] &= 0xFFFFF; + + r->coeffs[2*i+1] = a[5*i+2] >> 4; + r->coeffs[2*i+1] |= (uint32_t)a[5*i+3] << 4; + r->coeffs[2*i+1] |= (uint32_t)a[5*i+4] << 12; + r->coeffs[2*i+1] &= 0xFFFFF; /* mask the field just assembled (index 2*i+1); it is built from 4, 8 and 8 bits, so the mask is only a safeguard */ + + r->coeffs[2*i+0] = GAMMA1 - r->coeffs[2*i+0]; /* a packed field t stands for the coefficient GAMMA1 - t */ + r->coeffs[2*i+1] = GAMMA1 - r->coeffs[2*i+1]; + } +#endif + + DBENCH_STOP(*tpack); +} + +/************************************************* +* Name: polyw1_pack +* +* Description: Bit-pack polynomial w1 with coefficients in [0,15] or [0,43]. +* Input coefficients are assumed to be standard representatives.
+* +* Arguments: - uint8_t *r: pointer to output byte array with at least +* POLYW1_PACKEDBYTES bytes +* - const poly *a: pointer to input polynomial +**************************************************/ +void polyw1_pack(uint8_t *r, const poly *a) { + unsigned int i; + DBENCH_START(); + +#if GAMMA2 == (Q-1)/88 + for(i = 0; i < N/4; ++i) { + r[3*i+0] = a->coeffs[4*i+0]; + r[3*i+0] |= a->coeffs[4*i+1] << 6; + r[3*i+1] = a->coeffs[4*i+1] >> 2; + r[3*i+1] |= a->coeffs[4*i+2] << 4; + r[3*i+2] = a->coeffs[4*i+2] >> 4; + r[3*i+2] |= a->coeffs[4*i+3] << 2; + } +#elif GAMMA2 == (Q-1)/32 + for(i = 0; i < N/2; ++i) + r[i] = a->coeffs[2*i+0] | (a->coeffs[2*i+1] << 4); +#endif + + DBENCH_STOP(*tpack); +} diff --git a/crypto_sign/dilithium3/m4fstack/poly.h b/crypto_sign/dilithium3/m4fstack/poly.h new file mode 100644 index 00000000..8f8819b0 --- /dev/null +++ b/crypto_sign/dilithium3/m4fstack/poly.h @@ -0,0 +1,82 @@ +#ifndef POLY_H +#define POLY_H + +#include +#include "params.h" + +typedef struct { + int32_t coeffs[N]; +} poly; + +#define poly_reduce DILITHIUM_NAMESPACE(poly_reduce) +void poly_reduce(poly *a); +#define poly_caddq DILITHIUM_NAMESPACE(poly_caddq) +void poly_caddq(poly *a); +#define poly_freeze DILITHIUM_NAMESPACE(poly_freeze) +void poly_freeze(poly *a); + +#define poly_add DILITHIUM_NAMESPACE(poly_add) +void poly_add(poly *c, const poly *a, const poly *b); +#define poly_sub DILITHIUM_NAMESPACE(poly_sub) +void poly_sub(poly *c, const poly *a, const poly *b); +#define poly_shiftl DILITHIUM_NAMESPACE(poly_shiftl) +void poly_shiftl(poly *a); + +#define poly_ntt DILITHIUM_NAMESPACE(poly_ntt) +void poly_ntt(poly *a); + +#define poly_invntt_tomont DILITHIUM_NAMESPACE(poly_invntt_tomont) +void poly_invntt_tomont(poly *a); +#define poly_pointwise_montgomery DILITHIUM_NAMESPACE(poly_pointwise_montgomery) +void poly_pointwise_montgomery(poly *c, const poly *a, const poly *b); +#define poly_pointwise_acc_montgomery DILITHIUM_NAMESPACE(poly_pointwise_acc_montgomery) +void 
poly_pointwise_acc_montgomery(poly *c, const poly *a, const poly *b); + +#define poly_power2round DILITHIUM_NAMESPACE(poly_power2round) +void poly_power2round(poly *a1, poly *a0, const poly *a); +#define poly_decompose DILITHIUM_NAMESPACE(poly_decompose) +void poly_decompose(poly *a1, poly *a0, const poly *a); +#define poly_make_hint DILITHIUM_NAMESPACE(poly_make_hint) +unsigned int poly_make_hint(poly *h, const poly *a0, const poly *a1); +#define poly_use_hint DILITHIUM_NAMESPACE(poly_use_hint) +void poly_use_hint(poly *b, const poly *a, const poly *h); + +#define poly_chknorm DILITHIUM_NAMESPACE(poly_chknorm) +int poly_chknorm(const poly *a, int32_t B); +#define poly_uniform DILITHIUM_NAMESPACE(poly_uniform) +void poly_uniform(poly *a, + const uint8_t seed[SEEDBYTES], + uint16_t nonce); +#define poly_uniform_eta DILITHIUM_NAMESPACE(poly_uniform_eta) +void poly_uniform_eta(poly *a, + const uint8_t seed[CRHBYTES], + uint16_t nonce); +#define poly_uniform_gamma1 DILITHIUM_NAMESPACE(poly_uniform_gamma1) +void poly_uniform_gamma1(poly *a, + const uint8_t seed[CRHBYTES], + uint16_t nonce); +#define poly_challenge DILITHIUM_NAMESPACE(poly_challenge) +void poly_challenge(poly *c, const uint8_t seed[SEEDBYTES]); + +#define polyeta_pack DILITHIUM_NAMESPACE(polyeta_pack) +void polyeta_pack(uint8_t *r, const poly *a); + +#define polyt1_pack DILITHIUM_NAMESPACE(polyt1_pack) +void polyt1_pack(uint8_t *r, const poly *a); +#define polyt1_unpack DILITHIUM_NAMESPACE(polyt1_unpack) +void polyt1_unpack(poly *r, const uint8_t *a); + +#define polyt0_pack DILITHIUM_NAMESPACE(polyt0_pack) +void polyt0_pack(uint8_t *r, const poly *a); +#define polyt0_unpack DILITHIUM_NAMESPACE(polyt0_unpack) +void polyt0_unpack(poly *r, const uint8_t *a); + +#define polyz_pack DILITHIUM_NAMESPACE(polyz_pack) +void polyz_pack(uint8_t *r, const poly *a); +#define polyz_unpack DILITHIUM_NAMESPACE(polyz_unpack) +void polyz_unpack(poly *r, const uint8_t *a); + +#define polyw1_pack 
DILITHIUM_NAMESPACE(polyw1_pack) +void polyw1_pack(uint8_t *r, const poly *a); + +#endif diff --git a/crypto_sign/dilithium3/m4fstack/polyvec.c b/crypto_sign/dilithium3/m4fstack/polyvec.c new file mode 100644 index 00000000..e20749c0 --- /dev/null +++ b/crypto_sign/dilithium3/m4fstack/polyvec.c @@ -0,0 +1,429 @@ +#include +#include "params.h" +#include "polyvec.h" +#include "poly.h" + +#include +#include "hal.h" + +/************************************************* +* Name: polyvec_matrix_expand +* +* Description: Implementation of ExpandA. Generates matrix A with uniformly +* random coefficients a_{i,j} by performing rejection +* sampling on the output stream of SHAKE128(rho|j|i). +* +* Arguments: - polyvecl mat[K]: output matrix +* - const uint8_t rho[]: byte array containing seed rho +**************************************************/ +void polyvec_matrix_expand(polyvecl mat[K], const uint8_t rho[SEEDBYTES]) { + unsigned int i, j; + + for(i = 0; i < K; ++i) + for(j = 0; j < L; ++j) + poly_uniform(&mat[i].vec[j], rho, (i << 8) + j); /* nonce carries row index i in bits 8 and up, column index j in the low bits */ +} + +void polyvec_matrix_pointwise_montgomery(polyveck *t, const polyvecl mat[K], const polyvecl *v) { /* t->vec[i] receives polyvecl_pointwise_acc_montgomery of row mat[i] with v, for each i < K */ + unsigned int i; + + for(i = 0; i < K; ++i) + polyvecl_pointwise_acc_montgomery(&t->vec[i], &mat[i], v); +} + +/**************************************************************/ +/************ Vectors of polynomials of length L **************/ +/**************************************************************/ + +void polyvecl_uniform_eta(polyvecl *v, const uint8_t seed[CRHBYTES], uint16_t nonce) { /* fills the L polynomials via poly_uniform_eta, using consecutive nonces starting at nonce */ + unsigned int i; + + for(i = 0; i < L; ++i) + poly_uniform_eta(&v->vec[i], seed, nonce++); +} + +void polyvecl_uniform_gamma1(polyvecl *v, const uint8_t seed[CRHBYTES], uint16_t nonce) { /* fills the L polynomials via poly_uniform_gamma1; polynomial i uses nonce L*nonce + i */ + unsigned int i; + + for(i = 0; i < L; ++i) + poly_uniform_gamma1(&v->vec[i], seed, L*nonce + i); +} + +void polyvecl_reduce(polyvecl *v) { /* applies poly_reduce in place to each of the L polynomials */ + unsigned int i; + + for(i = 0; i < L; ++i) + poly_reduce(&v->vec[i]); +} + +#if 0
+/************************************************* +* Name: polyvecl_freeze +* +* Description: Reduce coefficients of polynomials in vector of length L +* to standard representatives. +* +* Arguments: - polyvecl *v: pointer to input/output vector +**************************************************/ +void polyvecl_freeze(polyvecl *v) { + unsigned int i; + + for(i = 0; i < L; ++i) + poly_freeze(&v->vec[i]); +} +#endif /* polyvecl_freeze is compiled out by the enclosing #if 0 */ + +/************************************************* +* Name: polyvecl_add +* +* Description: Add vectors of polynomials of length L. +* No modular reduction is performed. +* +* Arguments: - polyvecl *w: pointer to output vector +* - const polyvecl *u: pointer to first summand +* - const polyvecl *v: pointer to second summand +**************************************************/ +void polyvecl_add(polyvecl *w, const polyvecl *u, const polyvecl *v) { + unsigned int i; + + for(i = 0; i < L; ++i) + poly_add(&w->vec[i], &u->vec[i], &v->vec[i]); +} + +/************************************************* +* Name: polyvecl_ntt +* +* Description: Forward NTT of all polynomials in vector of length L. Output +* coefficients can be up to 16*Q larger than input coefficients. +* +* Arguments: - polyvecl *v: pointer to input/output vector +**************************************************/ +void polyvecl_ntt(polyvecl *v) { + unsigned int i; + + for(i = 0; i < L; ++i) + poly_ntt(&v->vec[i]); +} + +void polyvecl_invntt_tomont(polyvecl *v) { /* applies poly_invntt_tomont in place to each of the L polynomials */ + unsigned int i; + + for(i = 0; i < L; ++i) + poly_invntt_tomont(&v->vec[i]); +} + +void polyvecl_pointwise_poly_montgomery(polyvecl *r, const poly *a, const polyvecl *v) { /* r->vec[i] = poly_pointwise_montgomery of the single polynomial a with v->vec[i], for each i < L */ + unsigned int i; + + for(i = 0; i < L; ++i) + poly_pointwise_montgomery(&r->vec[i], a, &v->vec[i]); +} + + + +/************************************************* +* Name: polyvecl_pointwise_acc_montgomery +* +* Description: Pointwise multiply vectors of polynomials of length L, multiply +* resulting vector by 2^{-32} and add (accumulate) polynomials +* in it.
Input/output vectors are in NTT domain representation. +* +* Arguments: - poly *w: output polynomial +* - const polyvecl *u: pointer to first input vector +* - const polyvecl *v: pointer to second input vector +**************************************************/ +void polyvecl_pointwise_acc_montgomery(poly *w, + const polyvecl *u, + const polyvecl *v) +{ + unsigned int i; + + poly_pointwise_montgomery(w, &u->vec[0], &v->vec[0]); /* the first product initialises w */ + for(i = 1; i < L; ++i) { + poly_pointwise_acc_montgomery(w, &u->vec[i], &v->vec[i]); /* the remaining L-1 products are accumulated into w */ + } +} + +/************************************************* +* Name: polyvecl_chknorm +* +* Description: Check infinity norm of polynomials in vector of length L. +* Assumes input polyvecl to be reduced by polyvecl_reduce(). +* +* Arguments: - const polyvecl *v: pointer to vector +* - int32_t bound: norm bound +* +* Returns 0 if norm of all polynomials is strictly smaller than bound <= (Q-1)/8 +* and 1 otherwise. +**************************************************/ +int polyvecl_chknorm(const polyvecl *v, int32_t bound) { + unsigned int i; + + for(i = 0; i < L; ++i) + if(poly_chknorm(&v->vec[i], bound)) + return 1; /* stop at the first polynomial that fails the check */ + + return 0; +} + +/**************************************************************/ +/************ Vectors of polynomials of length K **************/ +/**************************************************************/ + +void polyveck_uniform_eta(polyveck *v, const uint8_t seed[CRHBYTES], uint16_t nonce) { /* fills the K polynomials via poly_uniform_eta, using consecutive nonces starting at nonce */ + unsigned int i; + + for(i = 0; i < K; ++i) + poly_uniform_eta(&v->vec[i], seed, nonce++); +} + +/************************************************* +* Name: polyveck_reduce +* +* Description: Reduce coefficients of polynomials in vector of length K +* to representatives in [-6283009,6283007].
+* +* Arguments: - polyveck *v: pointer to input/output vector +**************************************************/ +void polyveck_reduce(polyveck *v) { + unsigned int i; + + for(i = 0; i < K; ++i) + poly_reduce(&v->vec[i]); +} + +/************************************************* +* Name: polyveck_caddq +* +* Description: For all coefficients of polynomials in vector of length K +* add Q if coefficient is negative. +* +* Arguments: - polyveck *v: pointer to input/output vector +**************************************************/ +void polyveck_caddq(polyveck *v) { + unsigned int i; + + for(i = 0; i < K; ++i) + poly_caddq(&v->vec[i]); +} + +#if 0 +/************************************************* +* Name: polyveck_freeze +* +* Description: Reduce coefficients of polynomials in vector of length K +* to standard representatives. +* +* Arguments: - polyveck *v: pointer to input/output vector +**************************************************/ +void polyveck_freeze(polyveck *v) { + unsigned int i; + + for(i = 0; i < K; ++i) + poly_freeze(&v->vec[i]); +} +#endif + +/************************************************* +* Name: polyveck_add +* +* Description: Add vectors of polynomials of length K. +* No modular reduction is performed. +* +* Arguments: - polyveck *w: pointer to output vector +* - const polyveck *u: pointer to first summand +* - const polyveck *v: pointer to second summand +**************************************************/ +void polyveck_add(polyveck *w, const polyveck *u, const polyveck *v) { + unsigned int i; + + for(i = 0; i < K; ++i) + poly_add(&w->vec[i], &u->vec[i], &v->vec[i]); +} + +/************************************************* +* Name: polyveck_sub +* +* Description: Subtract vectors of polynomials of length K. +* No modular reduction is performed. 
+* +* Arguments: - polyveck *w: pointer to output vector +* - const polyveck *u: pointer to first input vector +* - const polyveck *v: pointer to second input vector to be +* subtracted from first input vector +**************************************************/ +void polyveck_sub(polyveck *w, const polyveck *u, const polyveck *v) { + unsigned int i; + + for(i = 0; i < K; ++i) + poly_sub(&w->vec[i], &u->vec[i], &v->vec[i]); +} + +/************************************************* +* Name: polyveck_shiftl +* +* Description: Multiply vector of polynomials of Length K by 2^D without modular +* reduction. Assumes input coefficients to be less than 2^{31-D}. +* +* Arguments: - polyveck *v: pointer to input/output vector +**************************************************/ +void polyveck_shiftl(polyveck *v) { + unsigned int i; + + for(i = 0; i < K; ++i) + poly_shiftl(&v->vec[i]); +} + +/************************************************* +* Name: polyveck_ntt +* +* Description: Forward NTT of all polynomials in vector of length K. Output +* coefficients can be up to 16*Q larger than input coefficients. +* +* Arguments: - polyveck *v: pointer to input/output vector +**************************************************/ +void polyveck_ntt(polyveck *v) { + unsigned int i; + + for(i = 0; i < K; ++i) + poly_ntt(&v->vec[i]); +} + + + +/************************************************* +* Name: polyveck_invntt_tomont +* +* Description: Inverse NTT and multiplication by 2^{32} of polynomials +* in vector of length K. Input coefficients need to be less +* than 2*Q. 
+* +* Arguments: - polyveck *v: pointer to input/output vector +**************************************************/ +void polyveck_invntt_tomont(polyveck *v) { + unsigned int i; + + for(i = 0; i < K; ++i) + poly_invntt_tomont(&v->vec[i]); +} + + +void polyveck_pointwise_poly_montgomery(polyveck *r, const poly *a, const polyveck *v) { /* r->vec[i] = poly_pointwise_montgomery of the single polynomial a with v->vec[i], for each i < K */ + unsigned int i; + + for(i = 0; i < K; ++i) + poly_pointwise_montgomery(&r->vec[i], a, &v->vec[i]); +} + + +/************************************************* +* Name: polyveck_chknorm +* +* Description: Check infinity norm of polynomials in vector of length K. +* Assumes input polyveck to be reduced by polyveck_reduce(). +* +* Arguments: - const polyveck *v: pointer to vector +* - int32_t bound: norm bound +* +* Returns 0 if norm of all polynomials is strictly smaller than bound <= (Q-1)/8 +* and 1 otherwise. +**************************************************/ +int polyveck_chknorm(const polyveck *v, int32_t bound) { + unsigned int i; + + for(i = 0; i < K; ++i) + if(poly_chknorm(&v->vec[i], bound)) + return 1; /* stop at the first polynomial that fails the check */ + + return 0; +} + +/************************************************* +* Name: polyveck_power2round +* +* Description: For all coefficients a of polynomials in vector of length K, +* compute a0, a1 such that a mod^+ Q = a1*2^D + a0 +* with -2^{D-1} < a0 <= 2^{D-1}. Assumes coefficients to be +* standard representatives.
+* +* Arguments: - polyveck *v1: pointer to output vector of polynomials with +* coefficients a1 +* - polyveck *v0: pointer to output vector of polynomials with +* coefficients a0 +* - const polyveck *v: pointer to input vector +**************************************************/ +void polyveck_power2round(polyveck *v1, polyveck *v0, const polyveck *v) { + unsigned int i; + + for(i = 0; i < K; ++i) + poly_power2round(&v1->vec[i], &v0->vec[i], &v->vec[i]); +} + +/************************************************* +* Name: polyveck_decompose +* +* Description: For all coefficients a of polynomials in vector of length K, +* compute high and low bits a0, a1 such that a mod^+ Q = a1*ALPHA + a0 +* with -ALPHA/2 < a0 <= ALPHA/2 except if a1 = (Q-1)/ALPHA where we +* set a1 = 0 and -ALPHA/2 <= a0 = a mod^+ Q - Q < 0. +* Assumes coefficients to be standard representatives. +* +* Arguments: - polyveck *v1: pointer to output vector of polynomials with +* coefficients a1 +* - polyveck *v0: pointer to output vector of polynomials with +* coefficients a0 +* - const polyveck *v: pointer to input vector +**************************************************/ +void polyveck_decompose(polyveck *v1, polyveck *v0, const polyveck *v) { + unsigned int i; + + for(i = 0; i < K; ++i) + poly_decompose(&v1->vec[i], &v0->vec[i], &v->vec[i]); +} + +/************************************************* +* Name: polyveck_make_hint +* +* Description: Compute hint vector. +* +* Arguments: - polyveck *h: pointer to output vector +* - const polyveck *v0: pointer to low part of input vector +* - const polyveck *v1: pointer to high part of input vector +* +* Returns number of 1 bits.
+**************************************************/ +unsigned int polyveck_make_hint(polyveck *h, + const polyveck *v0, + const polyveck *v1) +{ + unsigned int i, s = 0; + + for(i = 0; i < K; ++i) + s += poly_make_hint(&h->vec[i], &v0->vec[i], &v1->vec[i]); /* sum the per-polynomial return values of poly_make_hint */ + + return s; +} + +/************************************************* +* Name: polyveck_use_hint +* +* Description: Use hint vector to correct the high bits of input vector. +* +* Arguments: - polyveck *w: pointer to output vector of polynomials with +* corrected high bits +* - const polyveck *u: pointer to input vector +* - const polyveck *h: pointer to input hint vector +**************************************************/ +void polyveck_use_hint(polyveck *w, const polyveck *u, const polyveck *h) { + unsigned int i; + + for(i = 0; i < K; ++i) + poly_use_hint(&w->vec[i], &u->vec[i], &h->vec[i]); +} + +void polyveck_pack_w1(uint8_t r[K*POLYW1_PACKEDBYTES], const polyveck *w1) { /* polynomial i of w1 is packed by polyw1_pack into the POLYW1_PACKEDBYTES bytes of r starting at offset i*POLYW1_PACKEDBYTES */ + unsigned int i; + + for(i = 0; i < K; ++i) + polyw1_pack(&r[i*POLYW1_PACKEDBYTES], &w1->vec[i]); +} diff --git a/crypto_sign/dilithium3/m4fstack/polyvec.h b/crypto_sign/dilithium3/m4fstack/polyvec.h new file mode 100644 index 00000000..d92cd753 --- /dev/null +++ b/crypto_sign/dilithium3/m4fstack/polyvec.h @@ -0,0 +1,99 @@ +#ifndef POLYVEC_H +#define POLYVEC_H + +#include +#include "params.h" +#include "poly.h" + +/* Vectors of polynomials of length L */ +typedef struct { + poly vec[L]; +} polyvecl; + +#define polyvecl_uniform_eta DILITHIUM_NAMESPACE(polyvecl_uniform_eta) +void polyvecl_uniform_eta(polyvecl *v, const uint8_t seed[CRHBYTES], uint16_t nonce); + +#define polyvecl_uniform_gamma1 DILITHIUM_NAMESPACE(polyvecl_uniform_gamma1) +void polyvecl_uniform_gamma1(polyvecl *v, const uint8_t seed[CRHBYTES], uint16_t nonce); + +#define polyvecl_reduce DILITHIUM_NAMESPACE(polyvecl_reduce) +void polyvecl_reduce(polyvecl *v); + +#define polyvecl_freeze DILITHIUM_NAMESPACE(polyvecl_freeze) +void polyvecl_freeze(polyvecl *v); /* NOTE: the definition in polyvec.c is wrapped in #if 0, so this declaration currently has no implementation there */ + +#define
polyvecl_add DILITHIUM_NAMESPACE(polyvecl_add) +void polyvecl_add(polyvecl *w, const polyvecl *u, const polyvecl *v); + +#define polyvecl_ntt DILITHIUM_NAMESPACE(polyvecl_ntt) +void polyvecl_ntt(polyvecl *v); +#define polyvecl_invntt_tomont DILITHIUM_NAMESPACE(polyvecl_invntt_tomont) +void polyvecl_invntt_tomont(polyvecl *v); +#define polyvecl_pointwise_poly_montgomery DILITHIUM_NAMESPACE(polyvecl_pointwise_poly_montgomery) +void polyvecl_pointwise_poly_montgomery(polyvecl *r, const poly *a, const polyvecl *v); +#define polyvecl_pointwise_acc_montgomery \ + DILITHIUM_NAMESPACE(polyvecl_pointwise_acc_montgomery) +void polyvecl_pointwise_acc_montgomery(poly *w, + const polyvecl *u, + const polyvecl *v); + + +#define polyvecl_chknorm DILITHIUM_NAMESPACE(polyvecl_chknorm) +int polyvecl_chknorm(const polyvecl *v, int32_t B); + + + +/* Vectors of polynomials of length K */ +typedef struct { + poly vec[K]; +} polyveck; + +#define polyveck_uniform_eta DILITHIUM_NAMESPACE(polyveck_uniform_eta) +void polyveck_uniform_eta(polyveck *v, const uint8_t seed[CRHBYTES], uint16_t nonce); + +#define polyveck_reduce DILITHIUM_NAMESPACE(polyveck_reduce) +void polyveck_reduce(polyveck *v); +#define polyveck_caddq DILITHIUM_NAMESPACE(polyveck_caddq) +void polyveck_caddq(polyveck *v); +#define polyveck_freeze DILITHIUM_NAMESPACE(polyveck_freeze) +void polyveck_freeze(polyveck *v); + +#define polyveck_add DILITHIUM_NAMESPACE(polyveck_add) +void polyveck_add(polyveck *w, const polyveck *u, const polyveck *v); +#define polyveck_sub DILITHIUM_NAMESPACE(polyveck_sub) +void polyveck_sub(polyveck *w, const polyveck *u, const polyveck *v); +#define polyveck_shiftl DILITHIUM_NAMESPACE(polyveck_shiftl) +void polyveck_shiftl(polyveck *v); + +#define polyveck_ntt DILITHIUM_NAMESPACE(polyveck_ntt) +void polyveck_ntt(polyveck *v); +#define polyveck_invntt_tomont DILITHIUM_NAMESPACE(polyveck_invntt_tomont) +void polyveck_invntt_tomont(polyveck *v); +#define polyveck_pointwise_poly_montgomery 
DILITHIUM_NAMESPACE(polyveck_pointwise_poly_montgomery) +void polyveck_pointwise_poly_montgomery(polyveck *r, const poly *a, const polyveck *v); + + +#define polyveck_chknorm DILITHIUM_NAMESPACE(polyveck_chknorm) +int polyveck_chknorm(const polyveck *v, int32_t B); + +#define polyveck_power2round DILITHIUM_NAMESPACE(polyveck_power2round) +void polyveck_power2round(polyveck *v1, polyveck *v0, const polyveck *v); +#define polyveck_decompose DILITHIUM_NAMESPACE(polyveck_decompose) +void polyveck_decompose(polyveck *v1, polyveck *v0, const polyveck *v); +#define polyveck_make_hint DILITHIUM_NAMESPACE(polyveck_make_hint) +unsigned int polyveck_make_hint(polyveck *h, + const polyveck *v0, + const polyveck *v1); +#define polyveck_use_hint DILITHIUM_NAMESPACE(polyveck_use_hint) +void polyveck_use_hint(polyveck *w, const polyveck *v, const polyveck *h); + +#define polyveck_pack_w1 DILITHIUM_NAMESPACE(polyveck_pack_w1) +void polyveck_pack_w1(uint8_t r[K*POLYW1_PACKEDBYTES], const polyveck *w1); + +#define polyvec_matrix_expand DILITHIUM_NAMESPACE(polyvec_matrix_expand) +void polyvec_matrix_expand(polyvecl mat[K], const uint8_t rho[SEEDBYTES]); + +#define polyvec_matrix_pointwise_montgomery DILITHIUM_NAMESPACE(polyvec_matrix_pointwise_montgomery) +void polyvec_matrix_pointwise_montgomery(polyveck *t, const polyvecl mat[K], const polyvecl *v); + +#endif diff --git a/crypto_sign/dilithium3/m4fstack/reduce.h b/crypto_sign/dilithium3/m4fstack/reduce.h new file mode 100644 index 00000000..02df5500 --- /dev/null +++ b/crypto_sign/dilithium3/m4fstack/reduce.h @@ -0,0 +1,29 @@ +#ifndef REDUCE_H +#define REDUCE_H + +#include +#include "params.h" + +#define MONT -4186625 // 2^32 % Q +#define QINV 58728449 // q^(-1) mod 2^32 + +#define montgomery_reduce DILITHIUM_NAMESPACE(montgomery_reduce) +/************************************************* +* Name: montgomery_reduce +* +* Description: For finite field element a with -2^{31}Q <= a <= Q*2^31, +* compute r \equiv a*2^{-32} (mod Q) such 
that -Q < r < Q. +* +* Arguments: - int64_t: finite field element a +* +* Returns r. +**************************************************/ +static inline int32_t montgomery_reduce(int64_t a) { + int32_t t; + + t = (int64_t)(int32_t)a*QINV; + t = (a - (int64_t)t*Q) >> 32; + return t; +} + +#endif diff --git a/crypto_sign/dilithium3/m4fstack/rounding.c b/crypto_sign/dilithium3/m4fstack/rounding.c new file mode 100644 index 00000000..889f0a29 --- /dev/null +++ b/crypto_sign/dilithium3/m4fstack/rounding.c @@ -0,0 +1,102 @@ +#include +#include "params.h" +#include "rounding.h" + +/************************************************* +* Name: power2round +* +* Description: For finite field element a, compute a0, a1 such that +* a mod^+ Q = a1*2^D + a0 with -2^{D-1} < a0 <= 2^{D-1}. +* Assumes a to be standard representative. +* +* Arguments: - int32_t a: input element +* - int32_t *a0: pointer to output element a0 +* +* Returns a1. +**************************************************/ +int32_t power2round(int32_t *a0, int32_t a) { + int32_t a1; + + a1 = (a + (1 << (D-1)) - 1) >> D; + *a0 = a - (a1 << D); + return a1; +} + +/************************************************* +* Name: decompose +* +* Description: For finite field element a, compute high and low bits a0, a1 such +* that a mod^+ Q = a1*ALPHA + a0 with -ALPHA/2 < a0 <= ALPHA/2 except +* if a1 = (Q-1)/ALPHA where we set a1 = 0 and +* -ALPHA/2 <= a0 = a mod^+ Q - Q < 0. Assumes a to be standard +* representative. +* +* Arguments: - int32_t a: input element +* - int32_t *a0: pointer to output element a0 +* +* Returns a1. 
+**************************************************/ +int32_t decompose(int32_t *a0, int32_t a) { + int32_t a1; + + a1 = (a + 127) >> 7; /* ceil(a / 128) for non-negative a */ +#if GAMMA2 == (Q-1)/32 + a1 = (a1*1025 + (1 << 21)) >> 22; /* rounded division by roughly 4092 (2^22 / 1025); with the shift by 7 above this divides a by 128*4092 = 523776, presumably 2*GAMMA2 - confirm against params.h */ + a1 &= 15; /* wraps a1 == 16 back to 0 */ +#elif GAMMA2 == (Q-1)/88 + a1 = (a1*11275 + (1 << 23)) >> 24; /* rounded division by roughly 1488 (2^24 / 11275); with the shift by 7 above this divides a by 128*1488 = 190464, presumably 2*GAMMA2 - confirm against params.h */ + a1 ^= ((43 - a1) >> 31) & a1; /* branch-free: clears a1 when a1 > 43; relies on arithmetic right shift of a negative int32_t */ +#endif + + *a0 = a - a1*2*GAMMA2; + *a0 -= (((Q-1)/2 - *a0) >> 31) & Q; /* branch-free: subtracts Q when *a0 > (Q-1)/2; same arithmetic-shift assumption */ + return a1; +} + +/************************************************* +* Name: make_hint +* +* Description: Compute hint bit indicating whether the low bits of the +* input element overflow into the high bits. +* +* Arguments: - int32_t a0: low bits of input element +* - int32_t a1: high bits of input element +* +* Returns 1 if overflow. +**************************************************/ +unsigned int make_hint(int32_t a0, int32_t a1) { + if(a0 > GAMMA2 || a0 < -GAMMA2 || (a0 == -GAMMA2 && a1 != 0)) /* a0 == -GAMMA2 only counts as overflow when a1 is non-zero */ + return 1; + + return 0; +} + +/************************************************* +* Name: use_hint +* +* Description: Correct high bits according to hint. +* +* Arguments: - int32_t a: input element +* - unsigned int hint: hint bit +* +* Returns corrected high bits. +**************************************************/ +int32_t use_hint(int32_t a, unsigned int hint) { + int32_t a0, a1; + + a1 = decompose(&a0, a); + if(hint == 0) + return a1; /* no hint: high bits are returned unchanged */ + +#if GAMMA2 == (Q-1)/32 + if(a0 > 0) + return (a1 + 1) & 15; /* step up, wrapping modulo 16 */ + else + return (a1 - 1) & 15; /* step down, wrapping modulo 16 */ +#elif GAMMA2 == (Q-1)/88 + if(a0 > 0) + return (a1 == 43) ? 0 : a1 + 1; /* step up, wrapping 43 to 0 */ + else + return (a1 == 0) ?
43 : a1 - 1; +#endif +} diff --git a/crypto_sign/dilithium3/m4fstack/rounding.h b/crypto_sign/dilithium3/m4fstack/rounding.h new file mode 100644 index 00000000..b72e8e8d --- /dev/null +++ b/crypto_sign/dilithium3/m4fstack/rounding.h @@ -0,0 +1,19 @@ +#ifndef ROUNDING_H +#define ROUNDING_H + +#include +#include "params.h" + +#define power2round DILITHIUM_NAMESPACE(power2round) +int32_t power2round(int32_t *a0, int32_t a); + +#define decompose DILITHIUM_NAMESPACE(decompose) +int32_t decompose(int32_t *a0, int32_t a); + +#define make_hint DILITHIUM_NAMESPACE(make_hint) +unsigned int make_hint(int32_t a0, int32_t a1); + +#define use_hint DILITHIUM_NAMESPACE(use_hint) +int32_t use_hint(int32_t a, unsigned int hint); + +#endif diff --git a/crypto_sign/dilithium3/m4fstack/sign.c b/crypto_sign/dilithium3/m4fstack/sign.c new file mode 100644 index 00000000..04bec45c --- /dev/null +++ b/crypto_sign/dilithium3/m4fstack/sign.c @@ -0,0 +1,352 @@ +#include +#include "params.h" +#include "sign.h" +#include "packing.h" +#include "polyvec.h" +#include "poly.h" +#include "randombytes.h" +#include "symmetric.h" +#include "smallpoly.h" + +/************************************************* +* Name: crypto_sign_keypair +* +* Description: Generates public and private key. 
+*
+* Arguments:   - uint8_t *pk: pointer to output public key (allocated
+*                             array of CRYPTO_PUBLICKEYBYTES bytes)
+*              - uint8_t *sk: pointer to output private key (allocated
+*                             array of CRYPTO_SECRETKEYBYTES bytes)
+*
+* Returns 0 (success)
+**************************************************/
+int crypto_sign_keypair(uint8_t *pk, uint8_t *sk) {
+  uint8_t seedbuf[2*SEEDBYTES + CRHBYTES];
+  uint8_t tr[TRBYTES];
+  const uint8_t *rho, *rhoprime, *key;
+  polyvecl mat[K];
+  polyvecl s1, s1hat;
+  polyveck s2, t1, t0;
+
+  /* Draw SEEDBYTES of randomness and expand it in place into rho || rhoprime || key */
+  randombytes(seedbuf, SEEDBYTES);
+  shake256(seedbuf, 2*SEEDBYTES + CRHBYTES, seedbuf, SEEDBYTES);
+  rho = seedbuf;
+  rhoprime = rho + SEEDBYTES;
+  key = rhoprime + CRHBYTES;
+
+  /* Expand matrix from rho */
+  polyvec_matrix_expand(mat, rho);
+
+  /* Sample short vectors s1 and s2, both from rhoprime */
+  polyvecl_uniform_eta(&s1, rhoprime, 0);
+  polyveck_uniform_eta(&s2, rhoprime, L);
+
+  /* Matrix-vector multiplication on an NTT-domain copy of s1 (s1 itself is packed into sk below) */
+  s1hat = s1;
+  polyvecl_ntt(&s1hat);
+  polyvec_matrix_pointwise_montgomery(&t1, mat, &s1hat);
+  polyveck_reduce(&t1);
+  polyveck_invntt_tomont(&t1);
+
+  /* Add error vector s2 */
+  polyveck_add(&t1, &t1, &s2);
+
+  /* Split the result in place into (t1, t0) and write public key (rho, t1) */
+  polyveck_caddq(&t1);
+  polyveck_power2round(&t1, &t0, &t1);
+  pack_pk(pk, rho, &t1);
+
+  /* Compute tr = SHAKE256(pk), TRBYTES long, and write secret key */
+  shake256(tr, TRBYTES, pk, CRYPTO_PUBLICKEYBYTES);
+  pack_sk(sk, rho, tr, key, &t0, &s1, &s2);
+
+  return 0;
+}
+
+
+/*************************************************
+* Name:        crypto_sign_signature
+*
+* Description: Computes signature.
+*
+* Arguments:   - uint8_t *sig: pointer to output signature (of length CRYPTO_BYTES)
+*              - size_t *siglen: pointer to output length of signature
+*              - const uint8_t *m: pointer to message to be signed
+*              - size_t mlen: length of message
+*              - const uint8_t *sk: pointer to bit-packed secret key
+*
+* Returns 0 (success)
+**************************************************/
+int crypto_sign_signature(uint8_t *sig,
+                          size_t *siglen,
+                          const uint8_t *m,
+                          size_t mlen,
+                          const uint8_t *sk)
+{
+  uint8_t seedbuf[2 * SEEDBYTES + TRBYTES + RNDBYTES + 2 * CRHBYTES];
+  uint8_t *rho, *tr, *key, *mu, *rhoprime, *rnd;
+  uint16_t nonce = 0;
+  unsigned int n;
+  polyvecl mat[K], y, z;
+  polyveck t0, w1, w0;
+  poly cp;
+  shake256incctx state;
+
+  smallpoly s1_prime[L];
+  smallpoly s2_prime[K];
+  smallpoly cp_small;
+  smallhalfpoly cp_small_prime;
+
+  rho = seedbuf;
+  tr = rho + SEEDBYTES;
+  key = tr + TRBYTES;
+  rnd = key + SEEDBYTES;
+  mu = rnd + RNDBYTES;
+  rhoprime = mu + CRHBYTES;
+  unpack_sk(rho, tr, key, &t0, s1_prime, s2_prime, sk);
+
+  /* Compute mu = CRH(tr, msg) */
+  shake256_inc_init(&state);
+  shake256_inc_absorb(&state, tr, TRBYTES);
+  shake256_inc_absorb(&state, m, mlen);
+  shake256_inc_finalize(&state);
+  shake256_inc_squeeze(mu, CRHBYTES, &state);
+
+  for (n = 0; n < RNDBYTES; n++) { /* rnd is all-zero: no fresh randomness enters rhoprime */
+    rnd[n] = 0;
+  }
+  shake256(rhoprime, CRHBYTES, key, SEEDBYTES + RNDBYTES + CRHBYTES); /* hashes key || rnd || mu, contiguous in seedbuf */
+
+  /* Expand matrix and transform vectors */
+  polyvec_matrix_expand(mat, rho);
+  polyvecl_small_ntt(s1_prime);
+  polyveck_small_ntt(s2_prime);
+
+  polyveck_ntt(&t0);
+
+rej:
+  /* Sample intermediate vector y; nonce advances on every (re)try */
+  polyvecl_uniform_gamma1(&y, rhoprime, nonce++);
+
+  /* Matrix-vector multiplication on an NTT-domain copy of y (y is needed again below) */
+  z = y;
+  polyvecl_ntt(&z);
+  polyvec_matrix_pointwise_montgomery(&w1, mat, &z);
+  polyveck_reduce(&w1);
+  polyveck_invntt_tomont(&w1);
+
+  /* Decompose w and call the random oracle; sig first holds packed w1, then the challenge seed */
+  polyveck_caddq(&w1);
+  polyveck_decompose(&w1, &w0, &w1);
+  polyveck_pack_w1(sig, &w1);
+
+  shake256_inc_init(&state);
+  shake256_inc_absorb(&state, mu, CRHBYTES);
+  shake256_inc_absorb(&state, sig, K*POLYW1_PACKEDBYTES);
+  shake256_inc_finalize(&state);
+  shake256_inc_squeeze(sig, CTILDEBYTES, &state);
+  poly_challenge(&cp, sig);
+
+  poly_small_ntt_precomp(&cp_small, &cp_small_prime, &cp);
+  poly_ntt(&cp);
+
+  /* Compute z, reject if it reveals secret */
+  polyvecl_small_basemul_invntt(&z, &cp_small, &cp_small_prime, s1_prime);
+
+  polyvecl_add(&z, &z, &y);
+  polyvecl_reduce(&z);
+  if(polyvecl_chknorm(&z, GAMMA1 - BETA))
+    goto rej;
+
+
+  /* Write z into the signature; z.vec[0] is reused as scratch (tmp) below */
+  pack_sig_z(sig, &z);
+  unsigned int hint_n = 0;
+  unsigned int hints_written = 0;
+  /* Check that subtracting cs2 does not change high bits of w and low bits
+   * do not reveal secret information */
+  for(unsigned int i = 0; i < K; ++i) {
+    poly *tmp = &z.vec[0];
+    poly_small_basemul_invntt(tmp, &cp_small, &cp_small_prime, &s2_prime[i]);
+
+    poly_sub(&w0.vec[i], &w0.vec[i], tmp);
+    poly_reduce(&w0.vec[i]);
+    if(poly_chknorm(&w0.vec[i], GAMMA2 - BETA))
+      goto rej;
+
+    /* Compute hints for w1 */
+    poly_pointwise_montgomery(tmp, &cp, &t0.vec[i]);
+
+    poly_invntt_tomont(tmp);
+    poly_reduce(tmp);
+
+    if(poly_chknorm(tmp, GAMMA2))
+      goto rej;
+    poly_add(&w0.vec[i], &w0.vec[i], tmp);
+    hint_n += poly_make_hint(tmp, &w0.vec[i], &w1.vec[i]);
+    if (hint_n > OMEGA) { /* running total across all rows processed so far */
+      goto rej;
+    }
+    pack_sig_h(sig, tmp, i, &hints_written);
+  }
+  pack_sig_h_zero(sig, &hints_written);
+  *siglen = CRYPTO_BYTES;
+  return 0;
+}
+
+/*************************************************
+* Name:        crypto_sign
+*
+* Description: Compute signed message.
+*
+* Arguments:   - uint8_t *sm: pointer to output signed message (allocated
+*                             array with CRYPTO_BYTES + mlen bytes),
+*                             can be equal to m
+*              - size_t *smlen: pointer to output length of signed
+*                               message
+*              - const uint8_t *m: pointer to message to be signed
+*              - size_t mlen: length of message
+*              - const uint8_t *sk: pointer to bit-packed secret key
+*
+* Returns 0 (success)
+**************************************************/
+int crypto_sign(uint8_t *sm,
+                size_t *smlen,
+                const uint8_t *m,
+                size_t mlen,
+                const uint8_t *sk)
+{
+  size_t i;
+
+  for(i = 0; i < mlen; ++i) /* back-to-front copy, so sm may be equal to m */
+    sm[CRYPTO_BYTES + mlen - 1 - i] = m[mlen - 1 - i];
+  crypto_sign_signature(sm, smlen, sm + CRYPTO_BYTES, mlen, sk); /* return value ignored */
+  *smlen += mlen;
+  return 0;
+}
+
+/*************************************************
+* Name:        crypto_sign_verify
+*
+* Description: Verifies signature.
+*
+* Arguments:   - const uint8_t *sig: pointer to input signature
+*              - size_t siglen: length of signature
+*              - const uint8_t *m: pointer to message
+*              - size_t mlen: length of message
+*              - const uint8_t *pk: pointer to bit-packed public key
+*
+* Returns 0 if signature could be verified correctly and -1 otherwise
+**************************************************/
+int crypto_sign_verify(const uint8_t *sig,
+                       size_t siglen,
+                       const uint8_t *m,
+                       size_t mlen,
+                       const uint8_t *pk)
+{
+  unsigned int i;
+  uint8_t buf[K*POLYW1_PACKEDBYTES];
+  uint8_t rho[SEEDBYTES];
+  uint8_t mu[CRHBYTES];
+  uint8_t c[CTILDEBYTES];
+  uint8_t c2[CTILDEBYTES];
+  poly cp;
+  polyvecl mat[K], z;
+  polyveck t1, w1, h;
+  shake256incctx state;
+
+  if(siglen != CRYPTO_BYTES)
+    return -1;
+
+  unpack_pk(rho, &t1, pk);
+  if(unpack_sig(c, &z, &h, sig))
+    return -1;
+  if(polyvecl_chknorm(&z, GAMMA1 - BETA))
+    return -1;
+
+  /* Compute CRH(h(rho, t1), msg). NOTE(review): keypair/sign size h(rho, t1) with TRBYTES, here CRHBYTES is used -- confirm both are equal in params.h */
+  shake256(mu, CRHBYTES, pk, CRYPTO_PUBLICKEYBYTES);
+  shake256_inc_init(&state);
+  shake256_inc_absorb(&state, mu, CRHBYTES);
+  shake256_inc_absorb(&state, m, mlen);
+  shake256_inc_finalize(&state);
+  shake256_inc_squeeze(mu, CRHBYTES, &state);
+
+  /* Matrix-vector multiplication; compute Az - c * 2^d * t1 */
+  poly_challenge(&cp, c);
+  polyvec_matrix_expand(mat, rho);
+
+  polyvecl_ntt(&z);
+  polyvec_matrix_pointwise_montgomery(&w1, mat, &z);
+
+  poly_ntt(&cp);
+  polyveck_shiftl(&t1);
+  polyveck_ntt(&t1);
+  polyveck_pointwise_poly_montgomery(&t1, &cp, &t1);
+
+  polyveck_sub(&w1, &w1, &t1);
+  polyveck_reduce(&w1);
+  polyveck_invntt_tomont(&w1);
+
+  /* Reconstruct w1 from the hint vector h and pack it into buf */
+  polyveck_caddq(&w1);
+  polyveck_use_hint(&w1, &w1, &h);
+  polyveck_pack_w1(buf, &w1);
+
+  /* Call random oracle and verify challenge: recomputed c2 must equal c from the signature */
+  shake256_inc_init(&state);
+  shake256_inc_absorb(&state, mu, CRHBYTES);
+  shake256_inc_absorb(&state, buf, K*POLYW1_PACKEDBYTES);
+  shake256_inc_finalize(&state);
+  shake256_inc_squeeze(c2, CTILDEBYTES, &state);
+  for(i = 0; i < CTILDEBYTES; ++i)
+    if(c[i] != c2[i])
+      return -1;
+
+  return 0;
+}
+
+/*************************************************
+* Name:        crypto_sign_open
+*
+* Description: Verify signed message.
+*
+* Arguments:   - uint8_t *m: pointer to output message (allocated
+*                            array with smlen bytes), can be equal to sm
+*              - size_t *mlen: pointer to output length of message
+*              - const uint8_t *sm: pointer to signed message
+*              - size_t smlen: length of signed message
+*              - const uint8_t *pk: pointer to bit-packed public key
+*
+* Returns 0 if signed message could be verified correctly and -1 otherwise
+**************************************************/
+int crypto_sign_open(uint8_t *m,
+                     size_t *mlen,
+                     const uint8_t *sm,
+                     size_t smlen,
+                     const uint8_t *pk)
+{
+  size_t i;
+
+  if(smlen < CRYPTO_BYTES)
+    goto badsig;
+
+  *mlen = smlen - CRYPTO_BYTES;
+  if(crypto_sign_verify(sm, CRYPTO_BYTES, sm + CRYPTO_BYTES, *mlen, pk))
+    goto badsig;
+  else {
+    /* All good, copy msg (the bytes after the CRYPTO_BYTES signature), return 0 */
+    for(i = 0; i < *mlen; ++i)
+      m[i] = sm[CRYPTO_BYTES + i];
+    return 0;
+  }
+
+badsig:
+  /* Signature verification failed: *mlen becomes (size_t)-1 and smlen bytes of m are zeroed */
+  *mlen = -1;
+  for(i = 0; i < smlen; ++i)
+    m[i] = 0;
+
+  return -1;
+}
diff --git a/crypto_sign/dilithium3/m4fstack/sign.h b/crypto_sign/dilithium3/m4fstack/sign.h
new file mode 100644
index 00000000..42240b30
--- /dev/null
+++ b/crypto_sign/dilithium3/m4fstack/sign.h
@@ -0,0 +1,37 @@
+#ifndef SIGN_H
+#define SIGN_H
+
+#include <stddef.h>
+#include <stdint.h>
+#include "params.h"
+#include "api.h"
+#include "polyvec.h"
+#include "poly.h"
+
+#define challenge DILITHIUM_NAMESPACE(challenge)
+void challenge(poly *c, const uint8_t seed[SEEDBYTES]);
+
+// #define crypto_sign_keypair DILITHIUM_NAMESPACE(crypto_sign_keypair)
+// int crypto_sign_keypair(uint8_t *pk, uint8_t *sk);
+
+// #define crypto_sign_signature DILITHIUM_NAMESPACE(signature)
+// int crypto_sign_signature(uint8_t *sig, size_t *siglen,
+//                           const uint8_t *m, size_t mlen,
+//                           const uint8_t *sk);
+
+// #define crypto_sign DILITHIUM_NAMESPACE(crypto_sign)
+// int crypto_sign(uint8_t *sm, size_t *smlen,
+//                 const uint8_t *m, size_t mlen,
+//                 const uint8_t *sk);
+
+// #define crypto_sign_verify DILITHIUM_NAMESPACE(verify)
+// int
crypto_sign_verify(const uint8_t *sig, size_t siglen,
+//                        const uint8_t *m, size_t mlen,
+//                        const uint8_t *pk);
+
+// #define crypto_sign_open DILITHIUM_NAMESPACE(crypto_sign_open)
+// int crypto_sign_open(uint8_t *m, size_t *mlen,
+//                      const uint8_t *sm, size_t smlen,
+//                      const uint8_t *pk);
+
+#endif
diff --git a/crypto_sign/dilithium3/m4fstack/smallntt.S b/crypto_sign/dilithium3/m4fstack/smallntt.S
new file mode 100644
index 00000000..747c111c
--- /dev/null
+++ b/crypto_sign/dilithium3/m4fstack/smallntt.S
@@ -0,0 +1,837 @@
+#include "macros.i"
+
+.syntax unified
+.cpu cortex-m4
+.thumb
+
+// general macros: load/store move four words between registers and [a + memN]
+.macro load a, a0, a1, a2, a3, mem0, mem1, mem2, mem3
+  ldr.w \a0, [\a, \mem0]
+  ldr.w \a1, [\a, \mem1]
+  ldr.w \a2, [\a, \mem2]
+  ldr.w \a3, [\a, \mem3]
+.endm
+
+.macro store a, a0, a1, a2, a3, mem0, mem1, mem2, mem3
+  str.w \a0, [\a, \mem0]
+  str.w \a1, [\a, \mem1]
+  str.w \a2, [\a, \mem2]
+  str.w \a3, [\a, \mem3]
+.endm
+
+.macro montgomery q, qinv, a, tmp
+  smulbt \tmp, \a, \qinv // tmp = a[15:0] * qinv[31:16]
+  smlabb \tmp, \q, \tmp, \a // tmp = q[15:0] * tmp[15:0] + a; callers take the top halfword
+.endm
+
+.macro montgomery_inplace q, qinv, a, tmp
+  smulbt \tmp, \a, \qinv // tmp = a[15:0] * qinv[31:16]
+  smlabb \a, \q, \tmp, \a // same as montgomery, but the result overwrites a
+.endm
+
+.macro doublemontgomery a, tmp, tmp2, q, qinv, montconst
+  smulbb \tmp2, \a, \montconst
+  montgomery \q, \qinv, \tmp2, \tmp
+  smultb \a, \a, \montconst
+  montgomery \q, \qinv, \a, \tmp2
+  pkhtb \a, \tmp2, \tmp, asr#16
+.endm
+
+// #######
+// #######
+// # NTT #
+// #######
+// #######
+
+.macro mul_twiddle tb, a, twiddle, tmp, tmp2, q, qinv
+  smulb\tb \tmp, \a, \twiddle
+  smult\tb \a, \a, \twiddle
+  montgomery \q, \qinv, \tmp, \tmp2 // reduce -> result in tmp2
+  montgomery \q, \qinv, \a, \tmp // reduce -> result in tmp
+  pkhtb \a, \tmp, \tmp2, asr#16 // combine results from above in one register as 16bit halves
+.endm
+
+.macro doublebutterfly tb, a0, a1, twiddle, tmp, tmp2, q, qinv
+  smulb\tb \tmp, \a1, \twiddle // a1_b * twiddle_tb
+  smult\tb \a1, \a1, \twiddle // a1_t * twiddle_tb
+  montgomery \q, \qinv, \tmp, \tmp2 // reduce -> result
in tmp2 + montgomery \q, \qinv, \a1, \tmp // reduce -> result in tmp + pkhtb \tmp2, \tmp, \tmp2, asr#16 // combine results from above in one register as 16bit halves + usub16 \a1, \a0, \tmp2 // a0 - a1 * twiddle (a0, a1 contain 2 coeffs) + uadd16 \a0, \a0, \tmp2 // a0 + a1 * twiddle (a0, a1 contain 2 coeffs) +.endm + +.macro two_doublebutterfly tb1, tb2, a0, a1, a2, a3, twiddle, tmp, tmp2, q, qinv + doublebutterfly \tb1, \a0, \a1, \twiddle, \tmp, \tmp2, \q, \qinv + doublebutterfly \tb2, \a2, \a3, \twiddle, \tmp, \tmp2, \q, \qinv +.endm + +.macro _3_layer_double_CT_16 c0, c1, c2, c3, c4, c5, c6, c7, twiddle, twiddle_ptr, Qprime, Q, tmp, tmp2 + // layer 3 + ldrh.w \twiddle, [\twiddle_ptr], #2 + two_doublebutterfly b, b, \c0, \c4, \c1, \c5, \twiddle, \tmp, \tmp2, \Q, \Qprime + two_doublebutterfly b, b, \c2, \c6, \c3, \c7, \twiddle, \tmp, \tmp2, \Q, \Qprime + + // layer 2 + ldr.w \twiddle, [\twiddle_ptr], #4 + two_doublebutterfly b, b, \c0, \c2, \c1, \c3, \twiddle, \tmp, \tmp2, \Q, \Qprime + + two_doublebutterfly t, t, \c4, \c6, \c5, \c7, \twiddle, \tmp, \tmp2, \Q, \Qprime + + // layer 1 + ldr.w \twiddle, [\twiddle_ptr], #4 + two_doublebutterfly b, t, \c0, \c1, \c2, \c3, \twiddle, \tmp, \tmp2, \Q, \Qprime + + ldr.w \twiddle, [\twiddle_ptr], #4 + two_doublebutterfly b, t, \c4, \c5, \c6, \c7, \twiddle, \tmp, \tmp2, \Q, \Qprime +.endm + +.macro _3_layer_double_CT_16_fp c0, c1, c2, c3, c4, c5, c6, c7, xi01, xi23, xi45, xi67, twiddle, Qprime, Q, tmp, tmp2 + // layer 3 + vmov \twiddle, \xi01 + two_doublebutterfly t, t, \c0, \c4, \c1, \c5, \twiddle, \tmp, \tmp2, \Q, \Qprime + two_doublebutterfly t, t, \c2, \c6, \c3, \c7, \twiddle, \tmp, \tmp2, \Q, \Qprime + + // layer 2 + vmov \twiddle, \xi23 + two_doublebutterfly b, b, \c0, \c2, \c1, \c3, \twiddle, \tmp, \tmp2, \Q, \Qprime + + two_doublebutterfly t, t, \c4, \c6, \c5, \c7, \twiddle, \tmp, \tmp2, \Q, \Qprime + + // layer 1 + vmov \twiddle, \xi45 + two_doublebutterfly b, t, \c0, \c1, \c2, \c3, \twiddle, \tmp, \tmp2, \Q, \Qprime 
+ + vmov \twiddle, \xi67 + two_doublebutterfly b, t, \c4, \c5, \c6, \c7, \twiddle, \tmp, \tmp2, \Q, \Qprime +.endm + +.global small_ntt_asm +.type small_ntt_asm, %function +.align 2 +small_ntt_asm: + push {r4-r11, r14} + vpush.w {s16} + + poly .req r0 + twiddle_ptr .req r1 + poly0 .req r2 + poly1 .req r3 + poly2 .req r4 + poly3 .req r5 + poly4 .req r6 + poly5 .req r7 + poly6 .req r8 + poly7 .req r9 + twiddle .req r10 + qinv .req r11 + q .req r11 + tmp .req r12 + tmp2 .req r14 + + movw q, #769 + movt qinv, #767 + + ### LAYER 7+6+5+4 + .equ distance, 256 + .equ offset, 32 + .equ strincr, 4 + // pre-load twiddle factors to FPU registers + vldm twiddle_ptr!, {s8-s15} + + + add tmp, poly, #strincr*8 + vmov s16, tmp + 1: + // load a1, a3, ..., a15 + load poly, poly0, poly1, poly2, poly3, #offset, #distance/4+offset, #2*distance/4+offset, #3*distance/4+offset + load poly, poly4, poly5, poly6, poly7, #distance+offset, #5*distance/4+offset, #6*distance/4+offset, #7*distance/4+offset + + // 8-NTT on a1, a3, ..., a15 + _3_layer_double_CT_16_fp poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s8, s9, s10, s11, twiddle, qinv, q, tmp, tmp2 + + // multiply coeffs by layer 4 twiddles for later use + vmov twiddle, s12 + mul_twiddle b, poly0, twiddle, tmp, tmp2, q, qinv + mul_twiddle t, poly1, twiddle, tmp, tmp2, q, qinv + + vmov twiddle, s13 + mul_twiddle b, poly2, twiddle, tmp, tmp2, q, qinv + mul_twiddle t, poly3, twiddle, tmp, tmp2, q, qinv + + vmov twiddle, s14 + mul_twiddle b, poly4, twiddle, tmp, tmp2, q, qinv + mul_twiddle t, poly5, twiddle, tmp, tmp2, q, qinv + + vmov twiddle, s15 + mul_twiddle b, poly6, twiddle, tmp, tmp2, q, qinv + mul_twiddle t, poly7, twiddle, tmp, tmp2, q, qinv + + vmov s0, poly0 // a1 + vmov s1, poly1 // a3 + vmov s2, poly2 // a5 + vmov s3, poly3 // a7 + vmov s4, poly4 // a9 + vmov s5, poly5 // a11 + vmov s6, poly6 // a13 + vmov s7, poly7 // a15 + + // ---------- + + // load a0, a2, ..., a14 + load poly, poly0, poly1, poly2, poly3, #0, 
#distance/4, #2*distance/4, #3*distance/4 + load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 + + // 8-NTT on a0, a2, ..., a14 + _3_layer_double_CT_16_fp poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s8, s9, s10, s11, twiddle, qinv, q, tmp, tmp2 + + // layer 4 - 1 + // addsub: (a2, a6, a10, a14), (a3, a7, a11, a15) + vmov tmp2, s1 // load a3 + vmov s1, poly0 // preserve a0 + uadd16 poly0, poly1, tmp2 + usub16 poly1, poly1, tmp2 + + vmov tmp2, s3 // load a7 + vmov s3, poly2 // preserve a4 + uadd16 poly2, poly3, tmp2 + usub16 poly3, poly3, tmp2 + + vmov tmp2, s5 // load a11 + vmov s5, poly4 // preserve a8 + uadd16 poly4, poly5, tmp2 + usub16 poly5, poly5, tmp2 + + vmov tmp2, s7 // load a15 + vmov s7, poly6 // preserve a12 + uadd16 poly6, poly7, tmp2 + usub16 poly7, poly7, tmp2 + + str.w poly0, [poly, #1*distance/4] + str.w poly1, [poly, #1*distance/4+offset] + str.w poly2, [poly, #3*distance/4] + str.w poly3, [poly, #3*distance/4+offset] + str.w poly4, [poly, #5*distance/4] + str.w poly5, [poly, #5*distance/4+offset] + str.w poly6, [poly, #7*distance/4] + str.w poly7, [poly, #7*distance/4+offset] + + // layer 4 - 2 + // addsub: (a0, a4, a8, a12), (a1, a5, a9, a13) + vmov tmp2, s1 // load a0 + vmov poly1, s0 // load a1 + uadd16 poly0, tmp2, poly1 + usub16 poly1, tmp2, poly1 + + vmov tmp2, s3 // load a4 + vmov poly3, s2 // load a5 + uadd16 poly2, tmp2, poly3 + usub16 poly3, tmp2, poly3 + + vmov tmp2, s5 // load a8 + vmov poly5, s4 // load a9 + uadd16 poly4, tmp2, poly5 + usub16 poly5, tmp2, poly5 + + vmov tmp2, s7 // load a12 + vmov poly7, s6 // load a13 + uadd16 poly6, tmp2, poly7 + usub16 poly7, tmp2, poly7 + + str.w poly1, [poly, #offset] + str.w poly2, [poly, #2*distance/4] + str.w poly3, [poly, #2*distance/4+offset] + str.w poly4, [poly, #4*distance/4] + str.w poly5, [poly, #4*distance/4+offset] + str.w poly6, [poly, #6*distance/4] + str.w poly7, [poly, #6*distance/4+offset] + str.w poly0, [poly], #4 + + vmov 
tmp, s16 + cmp.w poly, tmp + bne.w 1b + + sub.w poly, #8*strincr + + ### LAYER 3+2+1 + + .equ distance, distance/16 + .equ strincr, 32 + + add.w tmp, poly, #strincr*16 + vmov s13, tmp + + 2: + load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4 + load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 + + _3_layer_double_CT_16 poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle, twiddle_ptr, qinv, q, tmp, tmp2 + + store poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 + str.w poly1, [poly, #distance/4] + str.w poly2, [poly, #2*distance/4] + str.w poly3, [poly, #3*distance/4] + str.w poly0, [poly], #strincr + + vmov tmp, s13 + cmp.w poly, tmp + bne.w 2b + + vpop.w {s16} + pop {r4-r11, pc} + + +.unreq poly +.unreq twiddle_ptr +.unreq poly0 +.unreq poly1 +.unreq poly2 +.unreq poly3 +.unreq poly4 +.unreq poly5 +.unreq poly6 +.unreq poly7 +.unreq twiddle +.unreq qinv +.unreq q +.unreq tmp +.unreq tmp2 + +// ######## +// ######## +// # INTT # +// ######## +// ######## + +.macro doublebutterfly_light a0, a1, tmp, tmp2, q, qinv + uadd16 \tmp, \a0, \a1 + usub16 \a1, \a0, \a1 + mov.w \a0, \tmp +.endm + +.macro two_doublebutterfly_light a0, a1, a2, a3, tmp, tmp2, q, qinv + doublebutterfly_light \a0, \a1, \tmp, \tmp2, \q, \qinv + doublebutterfly_light \a2, \a3, \tmp, \tmp2, \q, \qinv +.endm + +.macro _3_layer_double_inv_CT_16_light c0, c1, c2, c3, c4, c5, c6, c7, xi0, xi12, xi34, xi56, twiddle, q, qinv, tmp, tmp2 + + // layer 1 + sadd16.w \tmp, \c0, \c1 // c0, c1 + ssub16.w \c1, \c0, \c1 + sadd16.w \tmp2, \c2, \c3 // c2, c3 + ssub16.w \c3, \c2, \c3 + + sadd16.w \c0, \c4, \c5 // c4, c5 + ssub16.w \c5, \c4, \c5 + sadd16.w \c2, \c6, \c7 // c6, c7 + ssub16.w \c7, \c6, \c7 + // c4, c6 are free at this point + + // layer 2 + sadd16.w \c6, \tmp, \tmp2 // c0, c2 + ssub16.w \tmp2, \tmp, \tmp2 + sadd16.w \c4, \c0, \c2 // c4, c6 + ssub16.w \c2, \c0, \c2 + + vmov.w 
\twiddle, \xi12 + doublebutterfly t, \c1, \c3, \twiddle, \tmp, \c0, \q, \qinv // c0 has been used and c6 still free + doublebutterfly t, \c5, \c7, \twiddle, \tmp, \c0, \q, \qinv + // c0, c6 are free at this point + + // layer 3 + sadd16.w \c0, \c6, \c4 // c0, c4 + ssub16.w \c4, \c6, \c4 + + vmov.w \twiddle, \xi34 + doublebutterfly t, \c1, \c5, \twiddle, \tmp, \c6, \q, \qinv + + vmov.w \twiddle, \xi56 + // this block is one doublebutterfly + smulbb \tmp, \c2, \twiddle // c2, c6 + smultb \c2, \c2, \twiddle + montgomery_inplace \q, \qinv, \tmp, \c6 + montgomery_inplace \q, \qinv, \c2, \c6 + pkhtb \tmp, \c2, \tmp, asr #16 + ssub16.w \c6, \tmp2, \tmp + sadd16.w \c2, \tmp2, \tmp + + doublebutterfly t, \c3, \c7, \twiddle, \tmp, \tmp2, \q, \qinv + +.endm + +.macro _3_layer_double_inv_CT_16_light_reduce c0, c1, c2, c3, c4, c5, c6, c7, xi0, xi12, xi34, xi56, twiddle, q, qinv, tmp, tmp2 + + // layer 1 + sadd16.w \tmp, \c0, \c1 // c0, c1 + ssub16.w \c1, \c0, \c1 + sadd16.w \tmp2, \c2, \c3 // c2, c3 + ssub16.w \c3, \c2, \c3 + + sadd16.w \c0, \c4, \c5 // c4, c5 + ssub16.w \c5, \c4, \c5 + sadd16.w \c2, \c6, \c7 // c6, c7 + ssub16.w \c7, \c6, \c7 + // c4, c6 are free at this point + + mov.w \c6, \tmp + mov.w \c4, \c0 + + // layer 2 + vmov.w \twiddle, \xi12 + doublebutterfly b, \c6, \tmp2, \twiddle, \tmp, \c0, \q, \qinv + doublebutterfly b, \c4, \c2, \twiddle, \tmp, \c0, \q, \qinv + doublebutterfly t, \c1, \c3, \twiddle, \tmp, \c0, \q, \qinv // c0 has been used and c6 still free + doublebutterfly t, \c5, \c7, \twiddle, \tmp, \c0, \q, \qinv + // c0, c6 are free at this point + + // layer 3 + sadd16.w \c0, \c6, \c4 // c0, c4 + ssub16.w \c4, \c6, \c4 + + vmov.w \twiddle, \xi34 + doublebutterfly t, \c1, \c5, \twiddle, \tmp, \c6, \q, \qinv + + vmov.w \twiddle, \xi56 + // this block is one doublebutterfly + smulbb \tmp, \c2, \twiddle // c2, c6 + smultb \c2, \c2, \twiddle + montgomery_inplace \q, \qinv, \tmp, \c6 + montgomery_inplace \q, \qinv, \c2, \c6 + pkhtb \tmp, \c2, \tmp, asr #16 + 
ssub16.w \c6, \tmp2, \tmp
+  sadd16.w \c2, \tmp2, \tmp
+
+  doublebutterfly t, \c3, \c7, \twiddle, \tmp, \tmp2, \q, \qinv
+
+.endm
+
+.macro _3_layer_double_inv_CT_16 c0, c1, c2, c3, c4, c5, c6, c7, twiddle, twiddle_ptr, Qprime, Q, tmp, tmp2
+  // layer 3 (loads go through the macro's own twiddle/twiddle_ptr parameters)
+  ldrh.w \twiddle, [\twiddle_ptr], #2 // one halfword twiddle, pointer post-incremented
+  two_doublebutterfly b, b, \c0, \c1, \c2, \c3, \twiddle, \tmp, \tmp2, \Q, \Qprime
+  two_doublebutterfly b, b, \c4, \c5, \c6, \c7, \twiddle, \tmp, \tmp2, \Q, \Qprime
+
+  // layer 2
+  ldr.w \twiddle, [\twiddle_ptr], #4 // two packed halfword twiddles (b and t)
+  two_doublebutterfly b, t, \c0, \c2, \c1, \c3, \twiddle, \tmp, \tmp2, \Q, \Qprime
+
+  two_doublebutterfly b, t, \c4, \c6, \c5, \c7, \twiddle, \tmp, \tmp2, \Q, \Qprime
+
+  // layer 1
+  ldr.w \twiddle, [\twiddle_ptr], #4
+  two_doublebutterfly b, t, \c0, \c4, \c1, \c5, \twiddle, \tmp, \tmp2, \Q, \Qprime
+
+  ldr.w \twiddle, [\twiddle_ptr], #4
+  two_doublebutterfly b, t, \c2, \c6, \c3, \c7, \twiddle, \tmp, \tmp2, \Q, \Qprime
+.endm
+
+.macro mul_twiddle_barrett_32 tb, a, twiddle, Qbar, Q, tmp, tmp2
+  smulb\tb \tmp, \a, \twiddle
+  smmulr.w \tmp2, \tmp, \Qbar
+  mls.w \tmp, \tmp2, \Q, \tmp // tmp = tmp - tmp2 * Q
+  smult\tb \a, \a, \twiddle
+  smmulr.w \tmp2, \a, \Qbar
+  mls.w \a, \tmp2, \Q, \a // a = a - tmp2 * Q
+  pkhbt \a, \tmp, \a, lsl #16 // bottom-half result in low halfword, top-half result in high halfword
+.endm
+
+.macro _3_layer_double_inv_twist_16 c0, c1, c2, c3, c4, c5, c6, c7, twiddle, twiddle_ptr, Qbar, Q, tmp, tmp2
+
+  movt \Q, #0
+
+  ldr.w \twiddle, [\twiddle_ptr], #4
+
+  mul_twiddle_barrett_32 b, \c0, \twiddle, \Qbar, \Q, \tmp, \tmp2
+  mul_twiddle_barrett_32 t, \c1, \twiddle, \Qbar, \Q, \tmp, \tmp2
+
+  ldr.w \twiddle, [\twiddle_ptr], #4
+
+  mul_twiddle_barrett_32 b, \c2, \twiddle, \Qbar, \Q, \tmp, \tmp2
+  mul_twiddle_barrett_32 t, \c3, \twiddle, \Qbar, \Q, \tmp, \tmp2
+
+  ldr.w \twiddle, [\twiddle_ptr], #4
+
+  mul_twiddle_barrett_32 b, \c4, \twiddle, \Qbar, \Q, \tmp, \tmp2
+  mul_twiddle_barrett_32 t, \c5, \twiddle, \Qbar, \Q, \tmp, \tmp2
+
+  ldr.w \twiddle, [\twiddle_ptr], #4
+
+  mul_twiddle_barrett_32 b, \c6, \twiddle, \Qbar, \Q, \tmp, \tmp2
+  mul_twiddle_barrett_32 t, \c7,
\twiddle, \Qbar, \Q, \tmp, \tmp2 + + movt \Q, #767 + +.endm + +.global small_invntt_tomont_asm +.type small_invntt_tomont_asm, %function +.align 2 +small_invntt_tomont_asm: + push {r4-r11, r14} + + poly .req r0 + twiddle_ptr .req r1 + poly0 .req r2 + poly1 .req r3 + poly2 .req r4 + poly3 .req r5 + poly4 .req r6 + poly5 .req r7 + poly6 .req r8 + poly7 .req r9 + twiddle .req r10 + qinv .req r11 + q .req r11 + tmp .req r12 + tmp2 .req r14 + + movw q, #769 + movt qinv, #767 + + ### LAYER 7+6+5+4 + .equ distance, 16 + .equ offset, 32 + .equ strincr, 64 + + // pre-load twiddle factors to FPU registers + vldm twiddle_ptr!, {s8-s15} + + add.w tmp, poly, #8*strincr + vmov s8, tmp + 1: + // load a1, a3, ..., a15 + load poly, poly0, poly1, poly2, poly3, #offset, #distance/4+offset, #2*distance/4+offset, #3*distance/4+offset + load poly, poly4, poly5, poly6, poly7, #distance+offset, #5*distance/4+offset, #6*distance/4+offset, #7*distance/4+offset + + // NTT on a1, a3, ..., a15 + _3_layer_double_inv_CT_16_light poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s8, s9, s10, s11, twiddle, q, qinv, tmp, tmp2 + + // multiply coeffs by layer 4 twiddles for later use + vmov twiddle, s12 + mul_twiddle b, poly0, twiddle, tmp, tmp2, q, qinv // could be omitted but kept for reduction only + mul_twiddle t, poly1, twiddle, tmp, tmp2, q, qinv + + vmov twiddle, s13 + mul_twiddle b, poly2, twiddle, tmp, tmp2, q, qinv + mul_twiddle t, poly3, twiddle, tmp, tmp2, q, qinv + + vmov twiddle, s14 + mul_twiddle b, poly4, twiddle, tmp, tmp2, q, qinv + mul_twiddle t, poly5, twiddle, tmp, tmp2, q, qinv + + vmov twiddle, s15 + mul_twiddle b, poly6, twiddle, tmp, tmp2, q, qinv + mul_twiddle t, poly7, twiddle, tmp, tmp2, q, qinv + + vmov s0, poly0 // a1 + vmov s1, poly1 // a3 + vmov s2, poly2 // a5 + vmov s3, poly3 // a7 + vmov s4, poly4 // a9 + vmov s5, poly5 // a11 + vmov s6, poly6 // a13 + vmov s7, poly7 // a15 + + // ---------- + + // load a0, a2, ..., a14 + load poly, poly0, poly1, poly2, poly3, 
#0, #distance/4, #2*distance/4, #3*distance/4 + load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 + + // NTT on a0, a2, ..., a14 + _3_layer_double_inv_CT_16_light poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s8, s9, s10, s11, twiddle, q, qinv, tmp, tmp2 + + // layer 4 - 1 + // addsub: (a2, a6, a10, a14), (a3, a7, a11, a15) + vmov tmp2, s1 // load a3 + vmov s1, poly0 // preserve a0 + uadd16 poly0, poly1, tmp2 + usub16 poly1, poly1, tmp2 + + vmov tmp2, s3 // load a7 + vmov s3, poly2 // preserve a4 + uadd16 poly2, poly3, tmp2 + usub16 poly3, poly3, tmp2 + + vmov tmp2, s5 // load a11 + vmov s5, poly4 // preserve a8 + uadd16 poly4, poly5, tmp2 + usub16 poly5, poly5, tmp2 + + vmov tmp2, s7 // load a15 + vmov s7, poly6 // preserve a12 + uadd16 poly6, poly7, tmp2 + usub16 poly7, poly7, tmp2 + + str.w poly0, [poly, #1*distance/4] + str.w poly1, [poly, #1*distance/4+offset] + str.w poly2, [poly, #3*distance/4] + str.w poly3, [poly, #3*distance/4+offset] + str.w poly4, [poly, #5*distance/4] + str.w poly5, [poly, #5*distance/4+offset] + str.w poly6, [poly, #7*distance/4] + str.w poly7, [poly, #7*distance/4+offset] + + // layer 4 - 2 + // addsub: (a0, a4, a8, a12), (a1, a5, a9, a13) + vmov tmp2, s1 // load a0 + vmov poly1, s0 // load a1 + uadd16 poly0, tmp2, poly1 + usub16 poly1, tmp2, poly1 + + vmov tmp2, s3 // load a4 + vmov poly3, s2 // load a5 + uadd16 poly2, tmp2, poly3 + usub16 poly3, tmp2, poly3 + + vmov tmp2, s5 // load a8 + vmov poly5, s4 // load a9 + uadd16 poly4, tmp2, poly5 + usub16 poly5, tmp2, poly5 + + vmov tmp2, s7 // load a12 + vmov poly7, s6 // load a13 + uadd16 poly6, tmp2, poly7 + usub16 poly7, tmp2, poly7 + + str.w poly1, [poly, #offset] + str.w poly2, [poly, #2*distance/4] + str.w poly3, [poly, #2*distance/4+offset] + str.w poly4, [poly, #4*distance/4] + str.w poly5, [poly, #4*distance/4+offset] + str.w poly6, [poly, #6*distance/4] + str.w poly7, [poly, #6*distance/4+offset] + str.w poly0, [poly], 
#strincr // increase 2*8*4 = 64 (2 * 8 loads of 4 bytes each) + + vmov tmp, s8 + cmp.w poly, tmp + bne.w 1b + + sub.w poly, #8*strincr + + ### LAYER 3+2+1 + .equ distance, distance*16 + .equ strincr, 4 + + // ITER 0 + load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4 + load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 + + vldm twiddle_ptr!, {s5-s7} + + _3_layer_double_inv_CT_16_light_reduce poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s5, s5, s6, s7, twiddle, q, qinv, tmp, tmp2 + + vmov.w s2, poly + movw poly, #:lower16:5585133 + movt poly, #:upper16:5585133 + + // twisting + _3_layer_double_inv_twist_16 poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle, twiddle_ptr, poly, q, tmp, tmp2 + + vmov.w poly, s2 + + store poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 + str.w poly1, [poly, #distance/4] + str.w poly2, [poly, #2*distance/4] + str.w poly3, [poly, #3*distance/4] + str.w poly0, [poly], #4 + + // ITER 1-12 + add.w tmp, poly, #strincr*3*(3+1) + vmov s14, tmp + 3: + add.w tmp, poly, #strincr*3 + vmov s13, tmp + 2: + // polys upto 6q + load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4 + load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 + + + _3_layer_double_inv_CT_16 poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle, twiddle_ptr, qinv, q, tmp, tmp2 + + vmov.w s2, poly + movw poly, #:lower16:5585133 + movt poly, #:upper16:5585133 + + // twisting + _3_layer_double_inv_twist_16 poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle, twiddle_ptr, poly, q, tmp, tmp2 + + vmov.w poly, s2 + + store poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 + str.w poly1, [poly, #distance/4] + str.w poly2, [poly, #2*distance/4] + str.w poly3, [poly, #3*distance/4] + str.w poly0, [poly], #4 + + vmov 
tmp, s13 + cmp.w poly, tmp + bne.w 2b + + // polys upto 9q + load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4 + load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 + + _3_layer_double_inv_CT_16 poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle, twiddle_ptr, qinv, q, tmp, tmp2 + + vmov.w s2, poly + movw poly, #:lower16:5585133 + movt poly, #:upper16:5585133 + + // twisting + _3_layer_double_inv_twist_16 poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle, twiddle_ptr, poly, q, tmp, tmp2 + + vmov.w poly, s2 + + store poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 + str.w poly1, [poly, #distance/4] + str.w poly2, [poly, #2*distance/4] + str.w poly3, [poly, #3*distance/4] + str.w poly0, [poly], #4 + + vmov tmp, s14 + cmp.w poly, tmp + bne.w 3b + + // ITER 13-15 + add tmp, poly, #3*strincr + vmov s13, tmp + 2: + // polys upto 6q + load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4 + load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 + + _3_layer_double_inv_CT_16 poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle, twiddle_ptr, qinv, q, tmp, tmp2 + + vmov.w s2, poly + movw poly, #:lower16:5585133 + movt poly, #:upper16:5585133 + + // twisting + _3_layer_double_inv_twist_16 poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle, twiddle_ptr, poly, q, tmp, tmp2 + + vmov.w poly, s2 + + store poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 + str.w poly1, [poly, #distance/4] + str.w poly2, [poly, #2*distance/4] + str.w poly3, [poly, #3*distance/4] + str.w poly0, [poly], #strincr + + vmov tmp, s13 + cmp.w poly, tmp + bne.w 2b + + pop {r4-r11, pc} + +.unreq poly +.unreq twiddle_ptr +.unreq poly0 +.unreq poly1 +.unreq poly2 +.unreq poly3 +.unreq poly4 +.unreq poly5 +.unreq poly6 +.unreq poly7 
+.unreq twiddle +.unreq qinv +.unreq q +.unreq tmp +.unreq tmp2 + +.align 2 +.global small_pointmul_asm +.type small_pointmul_asm, %function +small_pointmul_asm: + push.w {r4-r11, lr} + + movw r14, #769 + movt r14, #767 + + .equ width, 4 + + add.w r12, r2, #64*2 + _point_mul_16_loop: + + ldr.w r7, [r1, #2*width] + ldr.w r8, [r1, #3*width] + ldrsh.w r9, [r2, #1*2] + ldr.w r5, [r1, #1*width] + ldr.w r4, [r1], #4*width + ldrsh.w r6, [r2], #2*2 + + smultb r10, r4, r6 + montgomery r14, r14, r10, r11 + pkhbt r4, r4, r11 + + + neg.w r6, r6 + + smultb r10, r5, r6 + montgomery r14, r14, r10, r11 + pkhbt r5, r5, r11 + + str.w r5, [r0, #1*width] + str.w r4, [r0], #2*width + + smultb r10, r7, r9 + montgomery r14, r14, r10, r11 + pkhbt r7, r7, r11 + + neg.w r9, r9 + + smultb r10, r8, r9 + montgomery r14, r14, r10, r11 + pkhbt r8, r8, r11 + + str.w r8, [r0, #1*width] + str.w r7, [r0], #2*width + + cmp.w r2, r12 + bne.w _point_mul_16_loop + + pop.w {r4-r11, pc} + + .align 2 +.global small_asymmetric_mul_asm +.type small_asymmetric_mul_asm, %function +small_asymmetric_mul_asm: + push.w {r4-r11, lr} + + movw r14, #769 + movt r14, #767 + .equ width, 4 + add.w r12, r0, #256*2 + _asymmetric_mul_16_loop: + ldr.w r7, [r1, #width] + ldr.w r4, [r1], #2*width + ldr.w r8, [r2, #width] + ldr.w r5, [r2], #2*width + ldr.w r9, [r3, #width] + ldr.w r6, [r3], #2*width + + smuad r10, r4, r6 + montgomery r14, r14, r10, r6 + smuadx r11, r4, r5 + montgomery r14, r14, r11, r10 + + pkhtb r10, r10, r6, asr#16 + + str.w r10, [r0], #width + + smuad r10, r7, r9 + montgomery r14, r14, r10, r6 + smuadx r11, r7, r8 + montgomery r14, r14, r11, r10 + + pkhtb r10, r10, r6, asr#16 + str.w r10, [r0], #width + + + cmp.w r0, r12 + bne.w _asymmetric_mul_16_loop + + pop.w {r4-r11, pc} \ No newline at end of file diff --git a/crypto_sign/dilithium3/m4fstack/smallntt.h b/crypto_sign/dilithium3/m4fstack/smallntt.h new file mode 100644 index 00000000..0aa0ce9b --- /dev/null +++ b/crypto_sign/dilithium3/m4fstack/smallntt.h 
@@ -0,0 +1,53 @@ +#ifndef SMALLNTT_H +#define SMALLNTT_H + +#include +#include "params.h" + +static const int16_t zetas[64] = { +-23, 112, -151, -134, -52, -148, 227, 232, +-71, 212, 236, 21, 341, 379, -202, -220, +352, 292, 238, 145, 194, -276, 70, -274, +117, 333, 66, 247, -237, -83, -252, -244, +331, -241, 167, 357, -355, 291, -358, 105, -115, -209, 14, 99, -260, 29, 366, -378, -318, 278, 353, 354, -184, 127, 330, -303, 222, -78, -348, -44, 201, 158, 350, 168 +}; + +static const int16_t zetas_asm[128] = { +0, -164, -81, 361, 186, -3, -250, -120, -308, 129, -16, -223, -362, -143, 131, -337, +-76, 147, -114, -23, 112, -151, -134, +-98, -272, 54, -52, -148, 227, 232, +36, -2, -124, -71, 212, 236, 21, +-75, -80, -346, 341, 379, -202, -220, +-339, 86, -51, 352, 292, 238, 145, +-255, 364, 267, 194, -276, 70, -274, +282, 161, -15, 117, 333, 66, 247, +-203, 288, 169, -237, -83, -252, -244, +-34, 191, 307, 331, -241, 167, 357, +199, -50, -24, -355, 291, -358, 105, +178, -170, 226, -115, -209, 14, 99, +270, 121, -188, -260, 29, 366, -378, +-10, -380, 279, -318, 278, 353, 354, +149, 180, -375, -184, 127, 330, -303, +369, -157, 263, 222, -78, -348, -44, +-192, -128, -246, 201, 158, 350, 168 +}; + +static const int16_t zetas_inv_CT_asm[256] = { +0, 171, 171, 164, 171, -361, 164, 81, 171, 120, -361, 3, 164, 250, 81, -186, +171, 164, 171, -361, 164, 81, -257, 49, -141, -18, -215, 38, 283, 347, 337, 192, -369, 246, -263, 128, 157, 239, -264, 179, 301, -207, 219, -332, -206, 120, 337, -131, 192, -149, -369, 10, 62, 57, 40, 136, 1, 311, -173, 27, 223, 203, -282, -169, 15, -288, -161, 74, -56, 271, -309, 26, -373, 116, -67, -361, 120, 250, 337, 143, -131, 362, -383, 82, 125, -344, -93, 299, -60, -204, 143, -270, -178, 188, -226, -121, 170, 39, -175, 174, 284, -111, 84, -22, 79, 3, 223, 16, 203, 255, -282, 339, 245, 64, -90, -306, 190, -123, 197, -253, -129, 75, -36, 346, 124, 80, 2, 218, 126, -33, -266, 326, -122, -261, 343, 164, -361, 81, 120, 3, 250, -186, 285, 200, -89, 5, 17, 
-96, 135, -310, -131, -149, 10, 375, -279, -180, 380, -280, -183, -7, 130, -327, -189, -335, -370, 250, 143, 362, -270, -199, -178, 34, -359, -144, -182, 304, -43, -300, -251, 377, 16, 255, 339, -267, 51, -364, -86, -106, 101, -118, 214, -349, -110, -374, -195, 81, 3, -186, 223, -129, 16, 308, 320, 319, 8, 181, 154, 216, 273, 313, 362, -199, 34, 24, -307, 50, -191, -139, -165, 208, 92, 159, 233, 177, -321, -186, -129, 308, 75, 98, -36, 76, 231, 324, 25, 85, 289, -94, -12, 113, 308, 98, 76, -54, 114, 272, -147, -146, -35, -119, -97, -176, -137, -312, -138, +}; + + +#define SMALL_Q 769 + +void small_ntt_asm(int16_t a[N], const int16_t * zetas); +void small_invntt_tomont_asm(int16_t a[N], const int16_t * zetas); +void small_pointmul_asm(int16_t out[N], const int16_t in[N], const int16_t *zetas); +void small_asymmetric_mul_asm(int16_t c[256], const int16_t a[256], const int16_t b[256], const int16_t b_prime[256]); + +#define small_ntt(a) small_ntt_asm(a, zetas_asm) +#define small_invntt_tomont(a) small_invntt_tomont_asm(a, zetas_inv_CT_asm) +#define small_point_mul(out, in) small_pointmul_asm(out, in, zetas) +#define small_asymmetric_mul(c, a, b, b_prime) small_asymmetric_mul_asm(c, a, b, b_prime); + +#endif diff --git a/crypto_sign/dilithium3/m4fstack/smallpoly.c b/crypto_sign/dilithium3/m4fstack/smallpoly.c new file mode 100644 index 00000000..9e1f6c85 --- /dev/null +++ b/crypto_sign/dilithium3/m4fstack/smallpoly.c @@ -0,0 +1,84 @@ +#include "smallpoly.h" +#include "smallntt.h" + +void poly_small_ntt_precomp(smallpoly *out, smallhalfpoly *out2, poly *in) { + for (int i = 0; i < N; i++) + { + out->coeffs[i] = in->coeffs[i]; + } + small_ntt(out->coeffs); + small_point_mul(out2->coeffs, out->coeffs); +} + + +void polyvecl_small_ntt(smallpoly v[L]) { + unsigned int i; + + for(i = 0; i < L; ++i) + small_ntt(v[i].coeffs); +} + + +void polyveck_small_ntt(smallpoly v[K]) { + unsigned int i; + + for(i = 0; i < K; ++i) + small_ntt(v[i].coeffs); +} + + + +void 
poly_small_basemul_invntt(poly *r, const smallpoly *a, const smallhalfpoly *aprime, const smallpoly *b){ + // re-use the buffer + smallpoly *tmp = (smallpoly *)r; + small_asymmetric_mul(tmp->coeffs, b->coeffs, a->coeffs, aprime->coeffs); + small_invntt_tomont(tmp->coeffs); + + #ifdef SMALL_POLY_16_BIT + int j; + // buffer is the same, so we need to be careful + for(j=N-1;j>=0;j--){ + r->coeffs[j] = tmp->coeffs[j]; + } + #endif +} + +void polyvecl_small_basemul_invntt(polyvecl *r, const smallpoly *a, const smallhalfpoly *aprime, const smallpoly b[L]){ + unsigned int i; + for(i=0;i<L;i++){ + poly_small_basemul_invntt(&r->vec[i], a, aprime, &b[i]); + } +} + +void small_polyeta_unpack(smallpoly *r, const uint8_t *a) { + unsigned int i; + +#if ETA == 2 + for(i = 0; i < N/8; ++i) { + r->coeffs[8*i+0] = (a[3*i+0] >> 0) & 7; + r->coeffs[8*i+1] = (a[3*i+0] >> 3) & 7; + r->coeffs[8*i+2] = ((a[3*i+0] >> 6) | (a[3*i+1] << 2)) & 7; + r->coeffs[8*i+3] = (a[3*i+1] >> 1) & 7; + r->coeffs[8*i+4] = (a[3*i+1] >> 4) & 7; + r->coeffs[8*i+5] = ((a[3*i+1] >> 7) | (a[3*i+2] << 1)) & 7; + r->coeffs[8*i+6] = (a[3*i+2] >> 2) & 7; + r->coeffs[8*i+7] = (a[3*i+2] >> 5) & 7; + + r->coeffs[8*i+0] = ETA - r->coeffs[8*i+0]; + r->coeffs[8*i+1] = ETA - r->coeffs[8*i+1]; + r->coeffs[8*i+2] = ETA - r->coeffs[8*i+2]; + r->coeffs[8*i+3] = ETA - r->coeffs[8*i+3]; + r->coeffs[8*i+4] = ETA - r->coeffs[8*i+4]; + r->coeffs[8*i+5] = ETA - r->coeffs[8*i+5]; + r->coeffs[8*i+6] = ETA - r->coeffs[8*i+6]; + r->coeffs[8*i+7] = ETA - r->coeffs[8*i+7]; + } +#elif ETA == 4 + for(i = 0; i < N/2; ++i) { + r->coeffs[2*i+0] = a[i] & 0x0F; + r->coeffs[2*i+1] = a[i] >> 4; + r->coeffs[2*i+0] = ETA - r->coeffs[2*i+0]; + r->coeffs[2*i+1] = ETA - r->coeffs[2*i+1]; + } +#endif +} diff --git a/crypto_sign/dilithium3/m4fstack/smallpoly.h b/crypto_sign/dilithium3/m4fstack/smallpoly.h new file mode 100644 index 00000000..caa26261 --- /dev/null +++ b/crypto_sign/dilithium3/m4fstack/smallpoly.h @@ -0,0 +1,39 @@ +#ifndef SMALLPOLY_H +#define SMALLPOLY_H +#include "params.h"
+#include "poly.h" +#include "polyvec.h" + + + +#if DILITHIUM_MODE == 3 // use q=769 +#define SMALL_POLY_16_BIT +typedef struct { + int16_t coeffs[N]; +} smallpoly; + +typedef smallpoly smallhalfpoly; + +#else // use q=257 +#define SMALL_POLY_32_BIT +typedef struct { + int32_t coeffs[N]; +} smallpoly; + +typedef struct { + int16_t coeffs[N]; +} smallhalfpoly; +#endif + + +void poly_small_ntt_precomp(smallpoly *out, smallhalfpoly *out2, poly *in); +void polyvecl_small_ntt(smallpoly v[L]); +void polyveck_small_ntt(smallpoly v[K]); + + +void polyvecl_small_basemul_invntt(polyvecl *r, const smallpoly *a, const smallhalfpoly *aprime, const smallpoly b[L]); +void poly_small_basemul_invntt(poly *r, const smallpoly *a, const smallhalfpoly *aprime, const smallpoly *b); + +void small_polyeta_unpack(smallpoly *r, const uint8_t *a); + +#endif \ No newline at end of file diff --git a/crypto_sign/dilithium3/m4fstack/symmetric-shake.c b/crypto_sign/dilithium3/m4fstack/symmetric-shake.c new file mode 100644 index 00000000..963f6498 --- /dev/null +++ b/crypto_sign/dilithium3/m4fstack/symmetric-shake.c @@ -0,0 +1,28 @@ +#include +#include "params.h" +#include "symmetric.h" +#include "fips202.h" + +void dilithium_shake128_stream_init(shake128incctx *state, const uint8_t seed[SEEDBYTES], uint16_t nonce) +{ + uint8_t t[2]; + t[0] = nonce; + t[1] = nonce >> 8; + + shake128_inc_init(state); + shake128_inc_absorb(state, seed, SEEDBYTES); + shake128_inc_absorb(state, t, 2); + shake128_inc_finalize(state); +} + +void dilithium_shake256_stream_init(shake256incctx *state, const uint8_t seed[CRHBYTES], uint16_t nonce) +{ + uint8_t t[2]; + t[0] = nonce; + t[1] = nonce >> 8; + + shake256_inc_init(state); + shake256_inc_absorb(state, seed, CRHBYTES); + shake256_inc_absorb(state, t, 2); + shake256_inc_finalize(state); +} diff --git a/crypto_sign/dilithium3/m4fstack/symmetric.h b/crypto_sign/dilithium3/m4fstack/symmetric.h new file mode 100644 index 00000000..47037377 --- /dev/null +++ 
b/crypto_sign/dilithium3/m4fstack/symmetric.h @@ -0,0 +1,65 @@ +#ifndef SYMMETRIC_H +#define SYMMETRIC_H + +#include +#include "params.h" + +#ifdef DILITHIUM_USE_AES + +#include "aes256ctr.h" +#include "fips202.h" + +typedef aes256ctr_ctx stream128_state; +typedef aes256ctr_ctx stream256_state; + +#define dilithium_aes256ctr_init DILITHIUM_NAMESPACE(dilithium_aes256ctr_init) +void dilithium_aes256ctr_init(aes256ctr_ctx *state, + const uint8_t key[32], + uint16_t nonce); + +#define STREAM128_BLOCKBYTES AES256CTR_BLOCKBYTES +#define STREAM256_BLOCKBYTES AES256CTR_BLOCKBYTES + +#define stream128_init(STATE, SEED, NONCE) \ + dilithium_aes256ctr_init(STATE, SEED, NONCE) +#define stream128_squeezeblocks(OUT, OUTBLOCKS, STATE) \ + aes256ctr_squeezeblocks(OUT, OUTBLOCKS, STATE) +#define stream256_init(STATE, SEED, NONCE) \ + dilithium_aes256ctr_init(STATE, SEED, NONCE) +#define stream256_squeezeblocks(OUT, OUTBLOCKS, STATE) \ + aes256ctr_squeezeblocks(OUT, OUTBLOCKS, STATE) + +#else + +#include "fips202.h" +typedef shake128incctx stream128_state; +typedef shake256incctx stream256_state; + +#define shake256_inc_squeezeblocks(OUT, OUTBLOCKS, STATE) \ + shake256_inc_squeeze(OUT, OUTBLOCKS*SHAKE256_RATE, STATE) + +#define dilithium_shake128_stream_init DILITHIUM_NAMESPACE(dilithium_shake128_stream_init) +void dilithium_shake128_stream_init(stream128_state *state, + const uint8_t seed[SEEDBYTES], + uint16_t nonce); + +#define dilithium_shake256_stream_init DILITHIUM_NAMESPACE(dilithium_shake256_stream_init) +void dilithium_shake256_stream_init(stream256_state *state, + const uint8_t seed[CRHBYTES], + uint16_t nonce); + +#define STREAM128_BLOCKBYTES SHAKE128_RATE +#define STREAM256_BLOCKBYTES SHAKE256_RATE + +#define stream128_init(STATE, SEED, NONCE) \ + dilithium_shake128_stream_init(STATE, SEED, NONCE) +#define stream128_squeezeblocks(OUT, OUTBLOCKS, STATE) \ + shake128_inc_squeeze(OUT, OUTBLOCKS*SHAKE128_RATE, STATE) +#define stream256_init(STATE, SEED, NONCE) \ + 
dilithium_shake256_stream_init(STATE, SEED, NONCE) +#define stream256_squeezeblocks(OUT, OUTBLOCKS, STATE) \ + shake256_inc_squeeze(OUT, OUTBLOCKS*SHAKE256_RATE, STATE) + +#endif + +#endif diff --git a/crypto_sign/dilithium3/m4fstack/vector.h b/crypto_sign/dilithium3/m4fstack/vector.h new file mode 100644 index 00000000..e5c5dda3 --- /dev/null +++ b/crypto_sign/dilithium3/m4fstack/vector.h @@ -0,0 +1,20 @@ +#ifndef VECTOR_H +#define VECTOR_H + +#include +#include "params.h" + +#define asm_reduce32 DILITHIUM_NAMESPACE(asm_reduce32) +void asm_reduce32(int32_t a[N]); +#define small_asm_reduce32_central DILITHIUM_NAMESPACE(small_asm_reduce32_central) +void small_asm_reduce32_central(int32_t a[N]); +#define asm_caddq DILITHIUM_NAMESPACE(asm_caddq) +void asm_caddq(int32_t a[N]); +#define asm_freeze DILITHIUM_NAMESPACE(asm_freeze) +void asm_freeze(int32_t a[N]); +#define asm_rej_uniform DILITHIUM_NAMESPACE(asm_rej_uniform) +unsigned int asm_rej_uniform(int32_t *a, + unsigned int len, + const unsigned char *buf, + unsigned int buflen); +#endif diff --git a/crypto_sign/dilithium3/m4fstack/vector.s b/crypto_sign/dilithium3/m4fstack/vector.s new file mode 100644 index 00000000..559f11b0 --- /dev/null +++ b/crypto_sign/dilithium3/m4fstack/vector.s @@ -0,0 +1,210 @@ +.syntax unified +.thumb +.macro redq a, tmp, q + add \tmp, \a, #4194304 + asrs \tmp, \tmp, #23 + mls \a, \tmp, \q, \a +.endm + +// void asm_reduce32(int32_t a[N]); +.global pqcrystals_dilithium_asm_reduce32 +.type pqcrystals_dilithium_asm_reduce32, %function +.align 2 +pqcrystals_dilithium_asm_reduce32: + push {r4-r10} + + movw r12,#:lower16:8380417 + movt r12,#:upper16:8380417 + movw r10, #32 + 1: + ldr.w r1, [r0] + ldr.w r2, [r0, #1*4] + ldr.w r3, [r0, #2*4] + ldr.w r4, [r0, #3*4] + ldr.w r5, [r0, #4*4] + ldr.w r6, [r0, #5*4] + ldr.w r7, [r0, #6*4] + ldr.w r8, [r0, #7*4] + + redq r1, r9, r12 + redq r2, r9, r12 + redq r3, r9, r12 + redq r4, r9, r12 + redq r5, r9, r12 + redq r6, r9, r12 + redq r7, r9, r12 + redq 
r8, r9, r12 + + str.w r2, [r0, #1*4] + str.w r3, [r0, #2*4] + str.w r4, [r0, #3*4] + str.w r5, [r0, #4*4] + str.w r6, [r0, #5*4] + str.w r7, [r0, #6*4] + str.w r8, [r0, #7*4] + str r1, [r0], #8*4 + subs r10, #1 + bne.w 1b + + pop {r4-r10} + bx lr +.size pqcrystals_dilithium_asm_reduce32, .-pqcrystals_dilithium_asm_reduce32 + +.macro barrett_32 a, Qbar, Q, tmp + smmulr.w \tmp, \a, \Qbar + mls.w \a, \tmp, \Q, \a +.endm + +// INPUT: target (signed), KYBER_Q (signed) +// OUTPUT: target adjusted to be between -KYBER_Q/2 and KYBER_Q/2 +.macro central_reduce target, Q + cmp \target, \Q, lsr #1 + it hi + subhi \target, \Q + cmn \target, \Q, lsr #1 + it lt + addlt \target, \Q +.endm + +// void asm_reduce32(int32_t a[N]); +.global pqcrystals_dilithium_small_asm_reduce32_central +.type pqcrystals_dilithium_small_asm_reduce32_central, %function +.align 2 +pqcrystals_dilithium_small_asm_reduce32_central: + push {r4-r12, lr} + + + movw r9, #:lower16:5585133 + movt r9, #:upper16:5585133 + mov.w r10,#769 + + movw r12, #32 + 1: + ldr.w r1, [r0] + ldr.w r2, [r0, #1*4] + ldr.w r3, [r0, #2*4] + ldr.w r4, [r0, #3*4] + ldr.w r5, [r0, #4*4] + ldr.w r6, [r0, #5*4] + ldr.w r7, [r0, #6*4] + ldr.w r8, [r0, #7*4] + + barrett_32 r1, r9, r10, r11 + barrett_32 r2, r9, r10, r11 + barrett_32 r3, r9, r10, r11 + barrett_32 r4, r9, r10, r11 + barrett_32 r5, r9, r10, r11 + barrett_32 r6, r9, r10, r11 + barrett_32 r7, r9, r10, r11 + barrett_32 r8, r9, r10, r11 + + + str.w r2, [r0, #1*4] + str.w r3, [r0, #2*4] + str.w r4, [r0, #3*4] + str.w r5, [r0, #4*4] + str.w r6, [r0, #5*4] + str.w r7, [r0, #6*4] + str.w r8, [r0, #7*4] + str r1, [r0], #8*4 + subs r12, #1 + bne.w 1b + + pop {r4-r12, pc} + +.size pqcrystals_dilithium_small_asm_reduce32_central, .-pqcrystals_dilithium_small_asm_reduce32_central + +.macro caddq a, tmp, q + and \tmp, \q, \a, asr #31 + add \a, \a, \tmp +.endm + +.macro freezeq a, tmp, q + redq \a, \tmp, \q + caddq \a, \tmp, \q +.endm + +// void asm_caddq(int32_t a[N]); +.global 
pqcrystals_dilithium_asm_caddq +.type pqcrystals_dilithium_asm_caddq, %function +.align 2 +pqcrystals_dilithium_asm_caddq: + push {r4-r10} + + movw r12,#:lower16:8380417 + movt r12,#:upper16:8380417 + + movw r10, #32 + 1: + ldr.w r1, [r0] + ldr.w r2, [r0, #1*4] + ldr.w r3, [r0, #2*4] + ldr.w r4, [r0, #3*4] + ldr.w r5, [r0, #4*4] + ldr.w r6, [r0, #5*4] + ldr.w r7, [r0, #6*4] + ldr.w r8, [r0, #7*4] + + caddq r1, r9, r12 + caddq r2, r9, r12 + caddq r3, r9, r12 + caddq r4, r9, r12 + caddq r5, r9, r12 + caddq r6, r9, r12 + caddq r7, r9, r12 + caddq r8, r9, r12 + + str.w r2, [r0, #1*4] + str.w r3, [r0, #2*4] + str.w r4, [r0, #3*4] + str.w r5, [r0, #4*4] + str.w r6, [r0, #5*4] + str.w r7, [r0, #6*4] + str.w r8, [r0, #7*4] + str r1, [r0], #8*4 + subs r10, #1 + bne.w 1b + + pop {r4-r10} + bx lr +.size pqcrystals_dilithium_asm_caddq, .-pqcrystals_dilithium_asm_caddq + + +// asm_rej_uniform(int32_t *a,unsigned int len,const unsigned char *buf, unsigned int buflen); +.global pqcrystals_dilithium_asm_rej_uniform +.type pqcrystals_dilithium_asm_rej_uniform, %function +.align 2 +pqcrystals_dilithium_asm_rej_uniform: + push.w {r4-r6} + push.w {r1} + // Store Q-1 in r12. + movw r12,#:lower16:8380416 + movt r12,#:upper16:8380416 + + add.w r6, r0, r1, lsl #2 + add.w r3, r2, r3 + sub.w r3, r3, #2 + +1: + // If there are less than 3 bytes available, return. + cmp.w r3, r2 + ble.w end + + ldr r5, [r2], #3 + ubfx r5, r5, #0, #23 + + cmp.n r5, r12 + it le + strle r5, [r0], #4 + + cmp.n r0, r6 + bne.n 1b + +end: + pop.w {r5} + + sub.w r0, r6, r0 + sub.w r0, r5, r0, lsr #2 + pop.w {r4-r6} + bx lr +.size pqcrystals_dilithium_asm_rej_uniform, .-pqcrystals_dilithium_asm_rej_uniform From 80c9e07ff1882f5b773c40d4fadceb790f767b2a Mon Sep 17 00:00:00 2001 From: Amin Abdulrahman Date: Fri, 15 Mar 2024 15:40:26 +0100 Subject: [PATCH 02/32] Start stack optimization [Passing] * Based on ideas from https://eprint.iacr.org/2022/323.pdf, based on code by Matthias J. 
Kannwischer * Sample A on-the-fly * Compressed c * Schoolbook mul for ct1 --- crypto_sign/dilithium3/m4fstack/reduce.h | 50 +++ crypto_sign/dilithium3/m4fstack/sign.c | 35 +- crypto_sign/dilithium3/m4fstack/smallpoly.c | 2 +- crypto_sign/dilithium3/m4fstack/stack.c | 404 ++++++++++++++++++++ crypto_sign/dilithium3/m4fstack/stack.h | 40 ++ 5 files changed, 517 insertions(+), 14 deletions(-) create mode 100644 crypto_sign/dilithium3/m4fstack/stack.c create mode 100644 crypto_sign/dilithium3/m4fstack/stack.h diff --git a/crypto_sign/dilithium3/m4fstack/reduce.h b/crypto_sign/dilithium3/m4fstack/reduce.h index 02df5500..5990918a 100644 --- a/crypto_sign/dilithium3/m4fstack/reduce.h +++ b/crypto_sign/dilithium3/m4fstack/reduce.h @@ -26,4 +26,54 @@ static inline int32_t montgomery_reduce(int64_t a) { return t; } +/************************************************* +* Name: reduce32 +* +* Description: For finite field element a with a <= 2^{31} - 2^{22} - 1, +* compute r \equiv a (mod Q) such that -6283009 <= r <= 6283007. +* +* Arguments: - int32_t: finite field element a +* +* Returns r. +**************************************************/ +static int32_t reduce32(int32_t a) { + int32_t t; + + t = (a + (1 << 22)) >> 23; + t = a - t*Q; + return t; +} + +/************************************************* +* Name: caddq +* +* Description: Add Q if input coefficient is negative. +* +* Arguments: - int32_t: finite field element a +* +* Returns r. +**************************************************/ +static int32_t caddq(int32_t a) { + a += (a >> 31) & Q; + return a; +} + +/************************************************* +* Name: freeze +* +* Description: For finite field element a, compute standard +* representative r = a mod^+ Q. +* +* Arguments: - int32_t: finite field element a +* +* Returns r. 
+**************************************************/ +static int32_t freeze(int32_t a) { + a = reduce32(a); + a = caddq(a); + return a; +} + + + #endif diff --git a/crypto_sign/dilithium3/m4fstack/sign.c b/crypto_sign/dilithium3/m4fstack/sign.c index 04bec45c..eaecb29f 100644 --- a/crypto_sign/dilithium3/m4fstack/sign.c +++ b/crypto_sign/dilithium3/m4fstack/sign.c @@ -7,6 +7,7 @@ #include "randombytes.h" #include "symmetric.h" #include "smallpoly.h" +#include "stack.h" /************************************************* * Name: crypto_sign_keypair @@ -88,9 +89,11 @@ int crypto_sign_signature(uint8_t *sig, uint8_t *rho, *tr, *key, *mu, *rhoprime, *rnd; uint16_t nonce = 0; unsigned int n; - polyvecl mat[K], y, z; - polyveck t0, w1, w0; + polyvecl y, z; + polyveck w1, w0; poly cp; + uint8_t ccomp[68]; + poly matel; shake256incctx state; smallpoly s1_prime[L]; @@ -104,7 +107,7 @@ int crypto_sign_signature(uint8_t *sig, rnd = key + SEEDBYTES; mu = rnd + RNDBYTES; rhoprime = mu + CRHBYTES; - unpack_sk(rho, tr, key, &t0, s1_prime, s2_prime, sk); + unpack_sk_stack(rho, tr, key, s1_prime, s2_prime, sk); /* Compute mu = CRH(tr, msg) */ shake256_inc_init(&state); @@ -118,13 +121,10 @@ int crypto_sign_signature(uint8_t *sig, } shake256(rhoprime, CRHBYTES, key, SEEDBYTES + RNDBYTES + CRHBYTES); - /* Expand matrix and transform vectors */ - polyvec_matrix_expand(mat, rho); + /* Transform vectors */ polyvecl_small_ntt(s1_prime); polyveck_small_ntt(s2_prime); - polyveck_ntt(&t0); - rej: /* Sample intermediate vector y */ polyvecl_uniform_gamma1(&y, rhoprime, nonce++); @@ -132,7 +132,16 @@ int crypto_sign_signature(uint8_t *sig, /* Matrix-vector multiplication */ z = y; polyvecl_ntt(&z); - polyvec_matrix_pointwise_montgomery(&w1, mat, &z); + + for (size_t k_idx = 0; k_idx < K; k_idx++) { + poly_uniform(&matel, rho, (k_idx << 8) + 0); + poly_pointwise_montgomery(&w1.vec[k_idx], &matel, &z.vec[0]); + for (size_t l_idx = 1; l_idx < L; l_idx++) { + poly_uniform(&matel, rho, (k_idx << 8) 
+ l_idx); + poly_pointwise_acc_montgomery(&w1.vec[k_idx], &matel, &z.vec[l_idx]); + } + } + polyveck_reduce(&w1); polyveck_invntt_tomont(&w1); @@ -147,9 +156,10 @@ int crypto_sign_signature(uint8_t *sig, shake256_inc_finalize(&state); shake256_inc_squeeze(sig, CTILDEBYTES, &state); poly_challenge(&cp, sig); + + poly_challenge_compress(ccomp, &cp); poly_small_ntt_precomp(&cp_small, &cp_small_prime, &cp); - poly_ntt(&cp); /* Compute z, reject if it reveals secret */ polyvecl_small_basemul_invntt(&z, &cp_small, &cp_small_prime, s1_prime); @@ -175,11 +185,10 @@ int crypto_sign_signature(uint8_t *sig, if(poly_chknorm(&w0.vec[i], GAMMA2 - BETA)) goto rej; - /* Compute hints for w1 */ - poly_pointwise_montgomery(tmp, &cp, &t0.vec[i]); + poly_schoolbook(tmp, ccomp, sk + SEEDBYTES + TRBYTES + SEEDBYTES + + L*POLYETA_PACKEDBYTES + K*POLYETA_PACKEDBYTES + i*POLYT0_PACKEDBYTES); - poly_invntt_tomont(tmp); - poly_reduce(tmp); + /* Compute hints for w1 */ if(poly_chknorm(tmp, GAMMA2)) goto rej; diff --git a/crypto_sign/dilithium3/m4fstack/smallpoly.c b/crypto_sign/dilithium3/m4fstack/smallpoly.c index 9e1f6c85..1f7fab17 100644 --- a/crypto_sign/dilithium3/m4fstack/smallpoly.c +++ b/crypto_sign/dilithium3/m4fstack/smallpoly.c @@ -2,7 +2,7 @@ #include "smallntt.h" void poly_small_ntt_precomp(smallpoly *out, smallhalfpoly *out2, poly *in) { - for (int i = 0; i < N; i++) + for (int i = N - 1; i >= 0; i--) /* coeffs[] holds N entries: highest valid index is N-1, not N */ { out->coeffs[i] = in->coeffs[i]; } diff --git a/crypto_sign/dilithium3/m4fstack/stack.c b/crypto_sign/dilithium3/m4fstack/stack.c new file mode 100644 index 00000000..2beb0f46 --- /dev/null +++ b/crypto_sign/dilithium3/m4fstack/stack.c @@ -0,0 +1,404 @@ +#include "stack.h" +#include "fips202.h" +#include "symmetric.h" +#include "vector.h" +#include "reduce.h" + +void poly_challenge_compress(uint8_t c[68], const poly *cp){ + unsigned int i, pos; + uint64_t signs; + uint64_t mask; + /* Encode c */ + for(i=0;i<68;i++) c[i] = 0; + signs = 0; + mask = 1; + pos = 0; + for(i = 0; i < N;
++i){ + if(cp->coeffs[i] != 0){ + c[pos++] = i; + if(cp->coeffs[i] == -1){ + signs |= mask; + } + mask <<= 1; + } + } + + for (i = 0; i < 8; ++i) { + c[60+i] = (unsigned char) (signs >> 8 * i); + } +} + +void poly_challenge_decompress(poly *cp, const uint8_t c[68]){ + unsigned int i; + unsigned pos; + uint64_t signs = 0; + for(i = 0; i < N; i++) cp->coeffs[i] = 0; + for(i = 0; i < 8; i++) { + signs |= ((uint64_t)c[60+i]) << (8*i); + } + + for(i = 0; i < TAU; i++){ + pos = c[i]; + if(signs & 1){ + cp->coeffs[pos] = -1; + } else { + cp->coeffs[pos] = 1; + } + signs >>= 1; + } +} + + +// TODO: buffer at most 8 coeffs at once +static inline int32_t polyt0_unpack_idx(const uint8_t *t0, unsigned idx){ + int32_t coeff; + // 8 coefficients are packed in 13 bytes + t0 += 13*(idx >> 3); + + if(idx % 8 == 0){ + coeff = t0[0]; + coeff |= (uint32_t)t0[1] << 8; + } else if(idx % 8 == 1){ + coeff = t0[1] >> 5; + coeff |= (uint32_t)t0[2] << 3; + coeff |= (uint32_t)t0[3] << 11; + } else if(idx % 8 == 2){ + coeff = t0[3] >> 2; + coeff |= (uint32_t)t0[4] << 6; + } else if(idx % 8 == 3){ + coeff = t0[4] >> 7; + coeff |= (uint32_t)t0[5] << 1; + coeff |= (uint32_t)t0[6] << 9; + } else if(idx % 8 == 4){ + coeff = t0[6] >> 4; + coeff |= (uint32_t)t0[7] << 4; + coeff |= (uint32_t)t0[8] << 12; + } else if(idx % 8 == 5){ + coeff = t0[8] >> 1; + coeff |= (uint32_t)t0[9] << 7; + } else if(idx % 8 == 6){ + coeff = t0[9] >> 6; + coeff |= (uint32_t)t0[10] << 2; + coeff |= (uint32_t)t0[11] << 10; + } else if(idx % 8 == 7){ + coeff = t0[11] >> 3; + coeff |= (uint32_t)t0[12] << 5; + } + coeff &= 0x1FFF; + return (1 << (D-1)) - coeff; +} + +void poly_schoolbook(poly *c, const uint8_t ccomp[68], const uint8_t *t0){ + unsigned i,j,idx; + uint64_t signs = 0; + for(i = 0; i < N; i++) c->coeffs[i] = 0; + for(i = 0; i < 8; i++) { + signs |= ((uint64_t)ccomp[60+i]) << (8*i); + } + + for(idx = 0; idx < TAU; idx++){ + i = ccomp[idx]; + if(!(signs & 1)){ + for(j = 0; i+j < N; j++){ + c->coeffs[i+j] += 
polyt0_unpack_idx(t0, j); + } + for(j = N-i; j<N; j++){ + c->coeffs[i+j-N] -= polyt0_unpack_idx(t0, j); + } + } else { + for(j = 0; i+j < N; j++){ + c->coeffs[i+j] -= polyt0_unpack_idx(t0, j); + } + for(j = N-i; j<N; j++){ + c->coeffs[i+j-N] += polyt0_unpack_idx(t0, j); + } + } + + signs >>= 1; + } +} + + +void polyw_pack(uint8_t buf[3*256], poly *w){ + poly_reduce(w); + poly_caddq(w); + unsigned int i; + for(i = 0; i < N; i++){ + buf[i*3 + 0] = w->coeffs[i]; + buf[i*3 + 1] = w->coeffs[i] >> 8; + buf[i*3 + 2] = w->coeffs[i] >> 16; + } +} + +void polyw_unpack(poly *w, const uint8_t buf[3*256]) { + unsigned int i; + for(i = 0; i < N; i++){ + w->coeffs[i] = buf[i*3 + 0]; + w->coeffs[i] |= (int32_t)buf[i*3 + 1] << 8; + w->coeffs[i] |= (int32_t)buf[i*3 + 2] << 16; + } +} + + +static void polyw_add_idx(uint8_t buf[3*256], int32_t a, size_t i){ + int32_t coeff; + coeff = buf[i*3 + 0]; + coeff |= (int32_t)buf[i*3 + 1] << 8; + coeff |= (int32_t)buf[i*3 + 2] << 16; + + coeff += a; + + coeff = freeze(coeff); + + buf[i*3 + 0] = coeff; + buf[i*3 + 1] = coeff >> 8; + buf[i*3 + 2] = coeff >> 16; +} + +void polyw_add(uint8_t buf[3*256], poly *p){ + unsigned int i; + for(i = 0; i < N; i++){ + polyw_add_idx(buf, p->coeffs[i], i); + } +} +void polyw_sub(poly* c, uint8_t buf[3*256], poly *a){ + int32_t coeff; + + + for(size_t i=0;i<N;i++){ + coeff = buf[i*3 + 0]; + coeff |= (int32_t)buf[i*3 + 1] << 8; + coeff |= (int32_t)buf[i*3 + 2] << 16; + + c->coeffs[i] = coeff - a->coeffs[i]; + } +} + +static int32_t highbits(int32_t a){ + int32_t a1; + + a1 = (a + 127) >> 7; +#if GAMMA2 == (Q-1)/32 + a1 = (a1*1025 + (1 << 21)) >> 22; + a1 &= 15; +#elif GAMMA2 == (Q-1)/88 + a1 = (a1*11275 + (1 << 23)) >> 24; + a1 ^= ((43 - a1) >> 31) & a1; +#endif + + return a1; +} + +void poly_highbits(poly *a1, const poly *a) { + unsigned int i; + + for(i = 0; i < N; ++i) + a1->coeffs[i] = highbits(a->coeffs[i]); +} + +static int32_t lowbits(int32_t a){ + int32_t a1; + int32_t a0; + + a1 = (a + 127) >> 7; +#if GAMMA2 == (Q-1)/32 + a1 = (a1*1025 + (1 << 21)) >> 22; + a1 &= 15; +#elif GAMMA2 == (Q-1)/88 + a1 = (a1*11275 + (1 << 23)) >> 24; + a1 ^= ((43 - a1)
>> 31) & a1; +#endif + + a0 = a - a1*2*GAMMA2; + a0 -= (((Q-1)/2 - a0) >> 31) & Q; + return a0; +} + +void poly_lowbits(poly *a0, const poly *a){ + unsigned int i; + + for(i = 0; i < N; ++i) + a0->coeffs[i] = lowbits(a->coeffs[i]); +} + +void unpack_sk_s1(smallpoly *a, const uint8_t *sk, size_t idx) { + small_polyeta_unpack(a, sk + 3*SEEDBYTES + idx*POLYETA_PACKEDBYTES); +} +void unpack_sk_s2(smallpoly *a, const uint8_t *sk, size_t idx) { + small_polyeta_unpack(a, sk + 3*SEEDBYTES + L*POLYETA_PACKEDBYTES + idx*POLYETA_PACKEDBYTES); +} + + +// TODO: in the end increase this buffer size as far as possible +#define POLY_UNIFORM_BUFFERSIZE 3 +void poly_uniform_pointwise_montgomery_polywadd_stack(uint8_t wcomp[3*N], poly *b, uint8_t seed[SEEDBYTES], uint16_t nonce, shake128incctx *state){ + int32_t t; + uint8_t buf[POLY_UNIFORM_BUFFERSIZE*3]; + { + size_t ctr = 0; + stream128_init(state, seed, nonce); + + do { + shake128_inc_squeeze(buf, sizeof buf, state); + + for(size_t pos=0; pos < sizeof buf && ctr < N; pos += 3){ + t = buf[pos]; + t |= (uint32_t)buf[pos+1] << 8; + t |= (uint32_t)buf[pos+2] << 16; + t &= 0x7FFFFF; + + if(t < Q) { + t = montgomery_reduce((int64_t)t * b->coeffs[ctr]); + polyw_add_idx(wcomp, t, ctr); + ctr++; + } + } + } while(ctr < N); + + } +} + +#define POLY_UNIFORM_GAMMA1_BUFFERSIZE 1 +#if GAMMA1 == (1 << 17) +#define POLY_UNIFORM_GAMMA1_BUFFERSIZE_COEFFS (POLY_UNIFORM_GAMMA1_BUFFERSIZE*4) +#define POLY_UNIFORM_GAMMA1_BUFFERSIZE_BYTES (POLY_UNIFORM_GAMMA1_BUFFERSIZE*9) +#elif GAMMA1 == (1 << 19) +#define POLY_UNIFORM_GAMMA1_BUFFERSIZE_COEFFS (POLY_UNIFORM_GAMMA1_BUFFERSIZE*2) +#define POLY_UNIFORM_GAMMA1_BUFFERSIZE_BYTES (POLY_UNIFORM_GAMMA1_BUFFERSIZE*5) +#endif + +static void polyz_unpack_inplace(int32_t *r){ + uint8_t *a = (uint8_t *)r; + + unsigned int i,j; + #if GAMMA1 == (1 << 17) + for(j = 0; j < POLY_UNIFORM_GAMMA1_BUFFERSIZE; ++j) { + i = POLY_UNIFORM_GAMMA1_BUFFERSIZE-1-j; + int32_t t0; + + + r[4*i+3] = a[9*i+6] >> 6; + r[4*i+3] |= 
(uint32_t)a[9*i+7] << 2; + r[4*i+3] |= (uint32_t)a[9*i+8] << 10; + r[4*i+3] &= 0x3FFFF; + + r[4*i+2] = a[9*i+4] >> 4; + r[4*i+2] |= (uint32_t)a[9*i+5] << 4; + r[4*i+2] |= (uint32_t)a[9*i+6] << 12; + r[4*i+2] &= 0x3FFFF; + + + r[4*i+1] = (uint32_t)a[9*i+4] << 14; + r[4*i+1] |= a[9*i+2] >> 2; + r[4*i+1] |= (uint32_t)a[9*i+3] << 6; + r[4*i+1] &= 0x3FFFF; + + t0 = a[9*i+0]; + t0 |= (uint32_t)a[9*i+1] << 8; + t0 |= (uint32_t)a[9*i+2] << 16; + t0 &= 0x3FFFF; + + r[4*i+0] = GAMMA1 - t0; + r[4*i+1] = GAMMA1 - r[4*i+1]; + r[4*i+2] = GAMMA1 - r[4*i+2]; + r[4*i+3] = GAMMA1 - r[4*i+3]; + + } +#elif GAMMA1 == (1 << 19) + for(j = 0; j < POLY_UNIFORM_GAMMA1_BUFFERSIZE; ++j) { + i = POLY_UNIFORM_GAMMA1_BUFFERSIZE-1-j; + int32_t tmp0, tmp1; + + tmp0 = a[5*i+2] >> 4; + tmp0 |= (uint32_t)a[5*i+3] << 4; + tmp0 |= (uint32_t)a[5*i+4] << 12; + tmp0 &= 0xFFFFF; + + tmp1 = a[5*i+0]; + tmp1 |= (uint32_t)a[5*i+1] << 8; + tmp1 |= (uint32_t)a[5*i+2] << 16; + tmp1 &= 0xFFFFF; + + r[2*i+0] = GAMMA1 - tmp1; /* tmp1 = low 20 bits = first packed coefficient */ + r[2*i+1] = GAMMA1 - tmp0; /* tmp0 = high 20 bits = second packed coefficient */ + } +#endif +} + + +void poly_uniform_gamma1_add_stack(poly *a, poly *b, const uint8_t seed[CRHBYTES], uint16_t nonce, shake256incctx *state){ + int32_t buf[POLY_UNIFORM_GAMMA1_BUFFERSIZE_COEFFS]; + + stream256_init(state, seed, nonce); + for(size_t i = 0; i < N/POLY_UNIFORM_GAMMA1_BUFFERSIZE_COEFFS; i++){ + shake256_inc_squeeze((uint8_t *)buf, POLY_UNIFORM_GAMMA1_BUFFERSIZE_BYTES, state); + polyz_unpack_inplace(buf); + + for(size_t j = 0; j < POLY_UNIFORM_GAMMA1_BUFFERSIZE_COEFFS; j++){ + a->coeffs[i*POLY_UNIFORM_GAMMA1_BUFFERSIZE_COEFFS + j] = buf[j] + b->coeffs[i*POLY_UNIFORM_GAMMA1_BUFFERSIZE_COEFFS + j]; + } + } +} + + +static inline int32_t make_hint(int32_t z, int32_t r){ + int32_t r1, v1; + + r1 = highbits(r); + v1 = highbits(r+z); + + if(r1 != v1) return 1; + return 0; +} + +size_t poly_make_hint_stack(poly *a, poly *t, uint8_t w[768]){ + int32_t coeff; + size_t hints_n = 0; + for(size_t i=0;i<N;i++){ + // unpack coeff from w + coeff = w[i*3 + 0]; + coeff |= (int32_t)w[i*3 + 1] << 8; + coeff |= (int32_t)w[i*3 + 2] << 16; + + coeff = coeff + t->coeffs[i]; + + a->coeffs[i] = make_hint(-t->coeffs[i], coeff); +
if(a->coeffs[i] == 1){ + hints_n++; + } + } + return hints_n; +} + +// TODO: remove this later +void unpack_sk_stack(uint8_t rho[SEEDBYTES], + uint8_t tr[TRBYTES], + uint8_t key[SEEDBYTES], + smallpoly s1[L], + smallpoly s2[K], + const uint8_t sk[CRYPTO_SECRETKEYBYTES]) +{ + unsigned int i; + + for(i = 0; i < SEEDBYTES; ++i) + rho[i] = sk[i]; + sk += SEEDBYTES; + + for(i = 0; i < SEEDBYTES; ++i) + key[i] = sk[i]; + sk += SEEDBYTES; + + for(i = 0; i < TRBYTES; ++i) + tr[i] = sk[i]; + sk += TRBYTES; + + for(i=0; i < L; ++i) + small_polyeta_unpack(&s1[i], sk + i*POLYETA_PACKEDBYTES); + sk += L*POLYETA_PACKEDBYTES; + + for(i=0; i < K; ++i) + small_polyeta_unpack(&s2[i], sk + i*POLYETA_PACKEDBYTES); + sk += K*POLYETA_PACKEDBYTES; +} diff --git a/crypto_sign/dilithium3/m4fstack/stack.h b/crypto_sign/dilithium3/m4fstack/stack.h new file mode 100644 index 00000000..9d36b105 --- /dev/null +++ b/crypto_sign/dilithium3/m4fstack/stack.h @@ -0,0 +1,40 @@ +#ifndef STACK_H +#define STACK_H + +#include "poly.h" +#include "smallpoly.h" +#include +#include +#include "fips202.h" + +void poly_challenge_compress(uint8_t c[68], const poly *cp); +void poly_challenge_decompress(poly *cp, const uint8_t c[68]); + + +void poly_schoolbook(poly *c, const uint8_t ccomp[68], const uint8_t *t0); +void polyw_pack(uint8_t buf[3*256], poly *w); +void polyw_unpack(poly *w, const uint8_t buf[3*256]); + +void polyw_add(uint8_t buf[3*256], poly *p); +void polyw_sub(poly* c, uint8_t buf[3*256], poly *a); + +void poly_highbits(poly *a1, const poly *a); +void poly_lowbits(poly *a0, const poly *a); + +void unpack_sk_s1(smallpoly *a, const uint8_t *sk, size_t idx); +void unpack_sk_s2(smallpoly *a, const uint8_t *sk, size_t idx); + + +void poly_uniform_pointwise_montgomery_polywadd_stack(uint8_t wcomp[3*N], poly *b, uint8_t seed[SEEDBYTES], uint16_t nonce, shake128incctx *state); +void poly_uniform_gamma1_add_stack(poly *a, poly *b, const uint8_t seed[CRHBYTES], uint16_t nonce, shake256incctx *state); + 
+size_t poly_make_hint_stack(poly *a, poly *t, uint8_t w[768]); + +// TODO: replace this with individual functions later +void unpack_sk_stack(uint8_t rho[SEEDBYTES], + uint8_t tr[TRBYTES], + uint8_t key[SEEDBYTES], + smallpoly s1[L], + smallpoly s2[K], + const uint8_t sk[CRYPTO_SECRETKEYBYTES]); +#endif \ No newline at end of file From 5c5b86829cec268c440d318ee941370abc121137 Mon Sep 17 00:00:00 2001 From: Amin Abdulrahman Date: Fri, 15 Mar 2024 16:41:09 +0100 Subject: [PATCH 03/32] Compress w --- crypto_sign/dilithium3/m4fstack/sign.c | 69 +++++++++++++++---------- crypto_sign/dilithium3/m4fstack/stack.c | 22 ++++++++ crypto_sign/dilithium3/m4fstack/stack.h | 1 + 3 files changed, 65 insertions(+), 27 deletions(-) diff --git a/crypto_sign/dilithium3/m4fstack/sign.c b/crypto_sign/dilithium3/m4fstack/sign.c index eaecb29f..edfc6a81 100644 --- a/crypto_sign/dilithium3/m4fstack/sign.c +++ b/crypto_sign/dilithium3/m4fstack/sign.c @@ -90,10 +90,10 @@ int crypto_sign_signature(uint8_t *sig, uint16_t nonce = 0; unsigned int n; polyvecl y, z; - polyveck w1, w0; + uint8_t wcomp[K][768]; poly cp; uint8_t ccomp[68]; - poly matel; + poly tmp0, tmp1; shake256incctx state; smallpoly s1_prime[L]; @@ -133,22 +133,27 @@ int crypto_sign_signature(uint8_t *sig, z = y; polyvecl_ntt(&z); - for (size_t k_idx = 0; k_idx < K; k_idx++) { - poly_uniform(&matel, rho, (k_idx << 8) + 0); - poly_pointwise_montgomery(&w1.vec[k_idx], &matel, &z.vec[0]); - for (size_t l_idx = 1; l_idx < L; l_idx++) { - poly_uniform(&matel, rho, (k_idx << 8) + l_idx); - poly_pointwise_acc_montgomery(&w1.vec[k_idx], &matel, &z.vec[l_idx]); + for (size_t k_idx = 0; k_idx < K; k_idx++) { + for(size_t i=0;i<768;i++){ + wcomp[k_idx][i] = 0; } - } - polyveck_reduce(&w1); - polyveck_invntt_tomont(&w1); - /* Decompose w and call the random oracle */ - polyveck_caddq(&w1); - polyveck_decompose(&w1, &w0, &w1); - polyveck_pack_w1(sig, &w1); + for (size_t l_idx = 0; l_idx < L; l_idx++) { + poly_uniform(&tmp0, rho, (k_idx << 8) 
+ l_idx); + poly_pointwise_montgomery(&tmp0, &tmp0, &z.vec[l_idx]); + polyw_add(wcomp[k_idx], &tmp0); + } + + polyw_unpack(&tmp0, wcomp[k_idx]); + poly_invntt_tomont(&tmp0); + poly_caddq(&tmp0); + + polyw_pack(wcomp[k_idx], &tmp0); + + poly_decompose_w1(&tmp0, &tmp0); + polyw1_pack(&sig[k_idx*POLYW1_PACKEDBYTES], &tmp0); + } shake256_inc_init(&state); shake256_inc_absorb(&state, mu, CRHBYTES); @@ -176,28 +181,38 @@ int crypto_sign_signature(uint8_t *sig, unsigned int hints_written = 0; /* Check that subtracting cs2 does not change high bits of w and low bits * do not reveal secret information */ - for(unsigned int i = 0; i < K; ++i) { - poly *tmp = &z.vec[0]; - poly_small_basemul_invntt(tmp, &cp_small, &cp_small_prime, &s2_prime[i]); + + for(unsigned int k_idx = 0; k_idx < K; ++k_idx) { + polyw_unpack(&tmp0, wcomp[k_idx]); + poly_decompose(&tmp1, &tmp0, &tmp0); - poly_sub(&w0.vec[i], &w0.vec[i], tmp); - poly_reduce(&w0.vec[i]); - if(poly_chknorm(&w0.vec[i], GAMMA2 - BETA)) + poly_small_basemul_invntt(&tmp1, &cp_small, &cp_small_prime, &s2_prime[k_idx]); + + poly_sub(&tmp0, &tmp0, &tmp1); + poly_reduce(&tmp0); + if(poly_chknorm(&tmp0, GAMMA2 - BETA)) goto rej; - poly_schoolbook(tmp, ccomp, sk + SEEDBYTES + TRBYTES + SEEDBYTES + - L*POLYETA_PACKEDBYTES + K*POLYETA_PACKEDBYTES + i*POLYT0_PACKEDBYTES); + poly_schoolbook(&tmp1, ccomp, sk + SEEDBYTES + TRBYTES + SEEDBYTES + + L*POLYETA_PACKEDBYTES + K*POLYETA_PACKEDBYTES + k_idx*POLYT0_PACKEDBYTES); /* Compute hints for w1 */ - if(poly_chknorm(tmp, GAMMA2)) + if(poly_chknorm(&tmp1, GAMMA2)) goto rej; - poly_add(&w0.vec[i], &w0.vec[i], tmp); - hint_n += poly_make_hint(tmp, &w0.vec[i], &w1.vec[i]); + + poly_add(&tmp0, &tmp0, &tmp1); + + + polyw_unpack(&tmp1, wcomp[k_idx]); + poly_decompose_w1(&tmp1, &tmp1); + + + hint_n += poly_make_hint(&tmp1, &tmp0, &tmp1); if (hint_n > OMEGA) { goto rej; } - pack_sig_h(sig, tmp, i, &hints_written); + pack_sig_h(sig, &tmp1, k_idx, &hints_written); } pack_sig_h_zero(sig, &hints_written); 
*siglen = CRYPTO_BYTES; diff --git a/crypto_sign/dilithium3/m4fstack/stack.c b/crypto_sign/dilithium3/m4fstack/stack.c index 2beb0f46..d7d59f1c 100644 --- a/crypto_sign/dilithium3/m4fstack/stack.c +++ b/crypto_sign/dilithium3/m4fstack/stack.c @@ -402,3 +402,25 @@ void unpack_sk_stack(uint8_t rho[SEEDBYTES], small_polyeta_unpack(&s2[i], sk + i*POLYETA_PACKEDBYTES); sk += K*POLYETA_PACKEDBYTES; } + +static int32_t decompose_w1(int32_t a){ + int32_t a1; + + a1 = (a + 127) >> 7; +#if GAMMA2 == (Q-1)/32 + a1 = (a1*1025 + (1 << 21)) >> 22; + a1 &= 15; +#elif GAMMA2 == (Q-1)/88 + a1 = (a1*11275 + (1 << 23)) >> 24; + a1 ^= ((43 - a1) >> 31) & a1; +#endif + + return a1; +} + +void poly_decompose_w1(poly *a1, const poly *a) { + unsigned int i; + + for(i = 0; i < N; ++i) + a1->coeffs[i] = decompose_w1(a->coeffs[i]); +} diff --git a/crypto_sign/dilithium3/m4fstack/stack.h b/crypto_sign/dilithium3/m4fstack/stack.h index 9d36b105..c9ddbe61 100644 --- a/crypto_sign/dilithium3/m4fstack/stack.h +++ b/crypto_sign/dilithium3/m4fstack/stack.h @@ -37,4 +37,5 @@ void unpack_sk_stack(uint8_t rho[SEEDBYTES], smallpoly s1[L], smallpoly s2[K], const uint8_t sk[CRYPTO_SECRETKEYBYTES]); +void poly_decompose_w1(poly *a1, const poly *a); #endif \ No newline at end of file From 926e957d2fe8e11635ea55650ab3c384d597f8b6 Mon Sep 17 00:00:00 2001 From: Amin Abdulrahman Date: Fri, 15 Mar 2024 16:50:42 +0100 Subject: [PATCH 04/32] Eliminate z, y --- crypto_sign/dilithium3/m4fstack/sign.c | 61 ++++++++++++++------------ 1 file changed, 32 insertions(+), 29 deletions(-) diff --git a/crypto_sign/dilithium3/m4fstack/sign.c b/crypto_sign/dilithium3/m4fstack/sign.c index edfc6a81..515e1419 100644 --- a/crypto_sign/dilithium3/m4fstack/sign.c +++ b/crypto_sign/dilithium3/m4fstack/sign.c @@ -89,7 +89,6 @@ int crypto_sign_signature(uint8_t *sig, uint8_t *rho, *tr, *key, *mu, *rhoprime, *rnd; uint16_t nonce = 0; unsigned int n; - polyvecl y, z; uint8_t wcomp[K][768]; poly cp; uint8_t ccomp[68]; @@ -125,35 
+124,33 @@ int crypto_sign_signature(uint8_t *sig, polyvecl_small_ntt(s1_prime); polyveck_small_ntt(s2_prime); -rej: - /* Sample intermediate vector y */ - polyvecl_uniform_gamma1(&y, rhoprime, nonce++); - - /* Matrix-vector multiplication */ - z = y; - polyvecl_ntt(&z); - +rej: for (size_t k_idx = 0; k_idx < K; k_idx++) { for(size_t i=0;i<768;i++){ wcomp[k_idx][i] = 0; } - + } for (size_t l_idx = 0; l_idx < L; l_idx++) { - poly_uniform(&tmp0, rho, (k_idx << 8) + l_idx); - poly_pointwise_montgomery(&tmp0, &tmp0, &z.vec[l_idx]); - polyw_add(wcomp[k_idx], &tmp0); + /* Sample intermediate vector y */ + poly_uniform_gamma1(&tmp1, rhoprime, L*nonce + l_idx); + poly_ntt(&tmp1); + for (size_t k_idx = 0; k_idx < K; k_idx++) { + poly_uniform(&tmp0, rho, (k_idx << 8) + l_idx); + poly_pointwise_montgomery(&tmp0, &tmp0, &tmp1); + polyw_add(wcomp[k_idx], &tmp0); + } + } + nonce++; + for (size_t k_idx = 0; k_idx < K; k_idx++) { + polyw_unpack(&tmp0, wcomp[k_idx]); + poly_invntt_tomont(&tmp0); + poly_caddq(&tmp0); + + polyw_pack(wcomp[k_idx], &tmp0); + poly_decompose_w1(&tmp0, &tmp0); + polyw1_pack(&sig[k_idx*POLYW1_PACKEDBYTES], &tmp0); } - - polyw_unpack(&tmp0, wcomp[k_idx]); - poly_invntt_tomont(&tmp0); - poly_caddq(&tmp0); - - polyw_pack(wcomp[k_idx], &tmp0); - - poly_decompose_w1(&tmp0, &tmp0); - polyw1_pack(&sig[k_idx*POLYW1_PACKEDBYTES], &tmp0); - } shake256_inc_init(&state); shake256_inc_absorb(&state, mu, CRHBYTES); @@ -167,16 +164,22 @@ int crypto_sign_signature(uint8_t *sig, poly_small_ntt_precomp(&cp_small, &cp_small_prime, &cp); /* Compute z, reject if it reveals secret */ - polyvecl_small_basemul_invntt(&z, &cp_small, &cp_small_prime, s1_prime); + for(size_t l_idx=0;l_idx < L; l_idx++){ + poly_small_basemul_invntt(&tmp0, &cp_small, &cp_small_prime, &s1_prime[l_idx]); + poly_uniform_gamma1(&tmp1, rhoprime, L*(nonce-1) + l_idx); - polyvecl_add(&z, &z, &y); - polyvecl_reduce(&z); - if(polyvecl_chknorm(&z, GAMMA1 - BETA)) - goto rej; + poly_add(&tmp0, &tmp0, &tmp1); + + 
poly_reduce(&tmp0); + + if(poly_chknorm(&tmp0, GAMMA1 - BETA)) + goto rej; + + polyz_pack(sig + CTILDEBYTES + l_idx*POLYZ_PACKEDBYTES, &tmp0); + } /* Write signature */ - pack_sig_z(sig, &z); unsigned int hint_n = 0; unsigned int hints_written = 0; /* Check that subtracting cs2 does not change high bits of w and low bits From 302f7f203865513d183bc664f1dccc7542cf9f26 Mon Sep 17 00:00:00 2001 From: Amin Abdulrahman Date: Fri, 15 Mar 2024 16:53:37 +0100 Subject: [PATCH 05/32] Eliminate cp --- crypto_sign/dilithium3/m4fstack/sign.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/crypto_sign/dilithium3/m4fstack/sign.c b/crypto_sign/dilithium3/m4fstack/sign.c index 515e1419..bfcd824a 100644 --- a/crypto_sign/dilithium3/m4fstack/sign.c +++ b/crypto_sign/dilithium3/m4fstack/sign.c @@ -90,7 +90,6 @@ int crypto_sign_signature(uint8_t *sig, uint16_t nonce = 0; unsigned int n; uint8_t wcomp[K][768]; - poly cp; uint8_t ccomp[68]; poly tmp0, tmp1; shake256incctx state; @@ -157,11 +156,11 @@ int crypto_sign_signature(uint8_t *sig, shake256_inc_absorb(&state, sig, K*POLYW1_PACKEDBYTES); shake256_inc_finalize(&state); shake256_inc_squeeze(sig, CTILDEBYTES, &state); - poly_challenge(&cp, sig); + poly_challenge(&tmp0, sig); - poly_challenge_compress(ccomp, &cp); + poly_challenge_compress(ccomp, &tmp0); - poly_small_ntt_precomp(&cp_small, &cp_small_prime, &cp); + poly_small_ntt_precomp(&cp_small, &cp_small_prime, &tmp0); /* Compute z, reject if it reveals secret */ for(size_t l_idx=0;l_idx < L; l_idx++){ From 3c36dbea3aa8d328d7689f45e593af16dc7b7558 Mon Sep 17 00:00:00 2001 From: Amin Abdulrahman Date: Fri, 15 Mar 2024 17:07:25 +0100 Subject: [PATCH 06/32] Eliminate s1, s2 --- crypto_sign/dilithium3/m4fstack/sign.c | 20 +++++++++++--------- crypto_sign/dilithium3/m4fstack/stack.c | 14 ++------------ crypto_sign/dilithium3/m4fstack/stack.h | 2 -- 3 files changed, 13 insertions(+), 23 deletions(-) diff --git a/crypto_sign/dilithium3/m4fstack/sign.c 
b/crypto_sign/dilithium3/m4fstack/sign.c index bfcd824a..8f001f2a 100644 --- a/crypto_sign/dilithium3/m4fstack/sign.c +++ b/crypto_sign/dilithium3/m4fstack/sign.c @@ -9,6 +9,8 @@ #include "smallpoly.h" #include "stack.h" +#include "smallntt.h" + /************************************************* * Name: crypto_sign_keypair * @@ -94,8 +96,7 @@ int crypto_sign_signature(uint8_t *sig, poly tmp0, tmp1; shake256incctx state; - smallpoly s1_prime[L]; - smallpoly s2_prime[K]; + smallpoly stmp0, stmp1; smallpoly cp_small; smallhalfpoly cp_small_prime; @@ -105,7 +106,7 @@ int crypto_sign_signature(uint8_t *sig, rnd = key + SEEDBYTES; mu = rnd + RNDBYTES; rhoprime = mu + CRHBYTES; - unpack_sk_stack(rho, tr, key, s1_prime, s2_prime, sk); + unpack_sk_stack(rho, tr, key, sk); /* Compute mu = CRH(tr, msg) */ shake256_inc_init(&state); @@ -119,10 +120,6 @@ int crypto_sign_signature(uint8_t *sig, } shake256(rhoprime, CRHBYTES, key, SEEDBYTES + RNDBYTES + CRHBYTES); - /* Transform vectors */ - polyvecl_small_ntt(s1_prime); - polyveck_small_ntt(s2_prime); - rej: for (size_t k_idx = 0; k_idx < K; k_idx++) { for(size_t i=0;i<768;i++){ @@ -164,7 +161,10 @@ int crypto_sign_signature(uint8_t *sig, /* Compute z, reject if it reveals secret */ for(size_t l_idx=0;l_idx < L; l_idx++){ - poly_small_basemul_invntt(&tmp0, &cp_small, &cp_small_prime, &s1_prime[l_idx]); + unpack_sk_s1(&stmp0, sk, l_idx); + small_ntt(&stmp0); + poly_small_basemul_invntt(&tmp0, &cp_small, &cp_small_prime, &stmp0); + poly_uniform_gamma1(&tmp1, rhoprime, L*(nonce-1) + l_idx); poly_add(&tmp0, &tmp0, &tmp1); @@ -188,7 +188,9 @@ int crypto_sign_signature(uint8_t *sig, polyw_unpack(&tmp0, wcomp[k_idx]); poly_decompose(&tmp1, &tmp0, &tmp0); - poly_small_basemul_invntt(&tmp1, &cp_small, &cp_small_prime, &s2_prime[k_idx]); + unpack_sk_s2(&stmp0, sk, k_idx); + small_ntt(&stmp0); + poly_small_basemul_invntt(&tmp1, &cp_small, &cp_small_prime, &stmp0); poly_sub(&tmp0, &tmp0, &tmp1); poly_reduce(&tmp0); diff --git 
a/crypto_sign/dilithium3/m4fstack/stack.c b/crypto_sign/dilithium3/m4fstack/stack.c index d7d59f1c..b1e89bb5 100644 --- a/crypto_sign/dilithium3/m4fstack/stack.c +++ b/crypto_sign/dilithium3/m4fstack/stack.c @@ -222,10 +222,10 @@ void poly_lowbits(poly *a0, const poly *a){ } void unpack_sk_s1(smallpoly *a, const uint8_t *sk, size_t idx) { - small_polyeta_unpack(a, sk + 3*SEEDBYTES + idx*POLYETA_PACKEDBYTES); + small_polyeta_unpack(a, sk + 2*SEEDBYTES + TRBYTES + idx*POLYETA_PACKEDBYTES); } void unpack_sk_s2(smallpoly *a, const uint8_t *sk, size_t idx) { - small_polyeta_unpack(a, sk + 3*SEEDBYTES + L*POLYETA_PACKEDBYTES + idx*POLYETA_PACKEDBYTES); + small_polyeta_unpack(a, sk + 2*SEEDBYTES + TRBYTES + L*POLYETA_PACKEDBYTES + idx*POLYETA_PACKEDBYTES); } @@ -376,8 +376,6 @@ size_t poly_make_hint_stack(poly *a, poly *t, uint8_t w[768]){ void unpack_sk_stack(uint8_t rho[SEEDBYTES], uint8_t tr[TRBYTES], uint8_t key[SEEDBYTES], - smallpoly s1[L], - smallpoly s2[K], const uint8_t sk[CRYPTO_SECRETKEYBYTES]) { unsigned int i; @@ -393,14 +391,6 @@ void unpack_sk_stack(uint8_t rho[SEEDBYTES], for(i = 0; i < TRBYTES; ++i) tr[i] = sk[i]; sk += TRBYTES; - - for(i=0; i < L; ++i) - small_polyeta_unpack(&s1[i], sk + i*POLYETA_PACKEDBYTES); - sk += L*POLYETA_PACKEDBYTES; - - for(i=0; i < K; ++i) - small_polyeta_unpack(&s2[i], sk + i*POLYETA_PACKEDBYTES); - sk += K*POLYETA_PACKEDBYTES; } static int32_t decompose_w1(int32_t a){ diff --git a/crypto_sign/dilithium3/m4fstack/stack.h b/crypto_sign/dilithium3/m4fstack/stack.h index c9ddbe61..5998cfd8 100644 --- a/crypto_sign/dilithium3/m4fstack/stack.h +++ b/crypto_sign/dilithium3/m4fstack/stack.h @@ -34,8 +34,6 @@ size_t poly_make_hint_stack(poly *a, poly *t, uint8_t w[768]); void unpack_sk_stack(uint8_t rho[SEEDBYTES], uint8_t tr[TRBYTES], uint8_t key[SEEDBYTES], - smallpoly s1[L], - smallpoly s2[K], const uint8_t sk[CRYPTO_SECRETKEYBYTES]); void poly_decompose_w1(poly *a1, const poly *a); #endif \ No newline at end of file From 
f71e025311417516911d5d090377ff6933893c5f Mon Sep 17 00:00:00 2001 From: Amin Abdulrahman Date: Fri, 15 Mar 2024 17:12:39 +0100 Subject: [PATCH 07/32] Eliminate second poly needed for A*y * Note: Reverts poly_uniform_pointwise_montgomery_polywadd_stack to prior state --- crypto_sign/dilithium3/m4fstack/sign.c | 8 +++++--- crypto_sign/dilithium3/m4fstack/stack.c | 8 +++++--- crypto_sign/dilithium3/m4fstack/stack.h | 2 +- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/crypto_sign/dilithium3/m4fstack/sign.c b/crypto_sign/dilithium3/m4fstack/sign.c index 8f001f2a..2fe1cdf7 100644 --- a/crypto_sign/dilithium3/m4fstack/sign.c +++ b/crypto_sign/dilithium3/m4fstack/sign.c @@ -131,10 +131,11 @@ int crypto_sign_signature(uint8_t *sig, /* Sample intermediate vector y */ poly_uniform_gamma1(&tmp1, rhoprime, L*nonce + l_idx); poly_ntt(&tmp1); + + /* Matrix-vector multiplication */ for (size_t k_idx = 0; k_idx < K; k_idx++) { - poly_uniform(&tmp0, rho, (k_idx << 8) + l_idx); - poly_pointwise_montgomery(&tmp0, &tmp0, &tmp1); - polyw_add(wcomp[k_idx], &tmp0); + // sampling of y and packing into wcomp inlined into the basemul + poly_uniform_pointwise_montgomery_polywadd_stack(&wcomp[k_idx], &tmp1, rho, (k_idx << 8) + l_idx); } } nonce++; @@ -165,6 +166,7 @@ int crypto_sign_signature(uint8_t *sig, small_ntt(&stmp0); poly_small_basemul_invntt(&tmp0, &cp_small, &cp_small_prime, &stmp0); + // TODO: eliminate tmp1 poly_uniform_gamma1(&tmp1, rhoprime, L*(nonce-1) + l_idx); poly_add(&tmp0, &tmp0, &tmp1); diff --git a/crypto_sign/dilithium3/m4fstack/stack.c b/crypto_sign/dilithium3/m4fstack/stack.c index b1e89bb5..536ce472 100644 --- a/crypto_sign/dilithium3/m4fstack/stack.c +++ b/crypto_sign/dilithium3/m4fstack/stack.c @@ -231,15 +231,17 @@ void unpack_sk_s2(smallpoly *a, const uint8_t *sk, size_t idx) { // TODO: in the end increase this buffer size as far as possible #define POLY_UNIFORM_BUFFERSIZE 3 -void poly_uniform_pointwise_montgomery_polywadd_stack(uint8_t 
wcomp[3*N], poly *b, uint8_t seed[SEEDBYTES], uint16_t nonce, shake128incctx *state){ +void poly_uniform_pointwise_montgomery_polywadd_stack(uint8_t wcomp[3*N], poly *b, uint8_t seed[SEEDBYTES], uint16_t nonce){ + //externalize the Keccak state + shake128incctx state; int32_t t; uint8_t buf[POLY_UNIFORM_BUFFERSIZE*3]; { size_t ctr = 0; - stream128_init(state, seed, nonce); + stream128_init(&state, seed, nonce); do { - shake128_inc_squeeze(buf, sizeof buf, state); + shake128_inc_squeeze(buf, sizeof buf, &state); for(size_t pos=0; pos < sizeof buf && ctr < N; pos += 3){ t = buf[pos]; diff --git a/crypto_sign/dilithium3/m4fstack/stack.h b/crypto_sign/dilithium3/m4fstack/stack.h index 5998cfd8..64726a80 100644 --- a/crypto_sign/dilithium3/m4fstack/stack.h +++ b/crypto_sign/dilithium3/m4fstack/stack.h @@ -25,7 +25,7 @@ void unpack_sk_s1(smallpoly *a, const uint8_t *sk, size_t idx); void unpack_sk_s2(smallpoly *a, const uint8_t *sk, size_t idx); -void poly_uniform_pointwise_montgomery_polywadd_stack(uint8_t wcomp[3*N], poly *b, uint8_t seed[SEEDBYTES], uint16_t nonce, shake128incctx *state); +void poly_uniform_pointwise_montgomery_polywadd_stack(uint8_t wcomp[3*N], poly *b, uint8_t seed[SEEDBYTES], uint16_t nonce); void poly_uniform_gamma1_add_stack(poly *a, poly *b, const uint8_t seed[CRHBYTES], uint16_t nonce, shake256incctx *state); size_t poly_make_hint_stack(poly *a, poly *t, uint8_t w[768]); From deeababc49c3f4605b470543190772dbc9e3ff1b Mon Sep 17 00:00:00 2001 From: Amin Abdulrahman Date: Mon, 18 Mar 2024 13:27:20 +0100 Subject: [PATCH 08/32] Inline sampling uniform and uniform_gamma1 --- crypto_sign/dilithium3/m4fstack/sign.c | 15 ++++++--------- crypto_sign/dilithium3/m4fstack/stack.c | 15 +++++++++------ crypto_sign/dilithium3/m4fstack/stack.h | 2 +- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/crypto_sign/dilithium3/m4fstack/sign.c b/crypto_sign/dilithium3/m4fstack/sign.c index 2fe1cdf7..a0d43790 100644 --- 
a/crypto_sign/dilithium3/m4fstack/sign.c +++ b/crypto_sign/dilithium3/m4fstack/sign.c @@ -129,13 +129,13 @@ int crypto_sign_signature(uint8_t *sig, for (size_t l_idx = 0; l_idx < L; l_idx++) { /* Sample intermediate vector y */ - poly_uniform_gamma1(&tmp1, rhoprime, L*nonce + l_idx); - poly_ntt(&tmp1); + poly_uniform_gamma1(&tmp0, rhoprime, L*nonce + l_idx); + poly_ntt(&tmp0); /* Matrix-vector multiplication */ for (size_t k_idx = 0; k_idx < K; k_idx++) { // sampling of y and packing into wcomp inlined into the basemul - poly_uniform_pointwise_montgomery_polywadd_stack(&wcomp[k_idx], &tmp1, rho, (k_idx << 8) + l_idx); + poly_uniform_pointwise_montgomery_polywadd_stack(&wcomp[k_idx], &tmp0, rho, (k_idx << 8) + l_idx); } } nonce++; @@ -166,10 +166,7 @@ int crypto_sign_signature(uint8_t *sig, small_ntt(&stmp0); poly_small_basemul_invntt(&tmp0, &cp_small, &cp_small_prime, &stmp0); - // TODO: eliminate tmp1 - poly_uniform_gamma1(&tmp1, rhoprime, L*(nonce-1) + l_idx); - - poly_add(&tmp0, &tmp0, &tmp1); + poly_uniform_gamma1_add_stack(&tmp0, &tmp0, rhoprime, L*(nonce-1) + l_idx); poly_reduce(&tmp0); @@ -296,9 +293,9 @@ int crypto_sign_verify(const uint8_t *sig, return -1; /* Compute CRH(h(rho, t1), msg) */ - shake256(mu, CRHBYTES, pk, CRYPTO_PUBLICKEYBYTES); + shake256(mu, TRBYTES, pk, CRYPTO_PUBLICKEYBYTES); shake256_inc_init(&state); - shake256_inc_absorb(&state, mu, CRHBYTES); + shake256_inc_absorb(&state, mu, TRBYTES); shake256_inc_absorb(&state, m, mlen); shake256_inc_finalize(&state); shake256_inc_squeeze(mu, CRHBYTES, &state); diff --git a/crypto_sign/dilithium3/m4fstack/stack.c b/crypto_sign/dilithium3/m4fstack/stack.c index 536ce472..83fd1ac2 100644 --- a/crypto_sign/dilithium3/m4fstack/stack.c +++ b/crypto_sign/dilithium3/m4fstack/stack.c @@ -232,7 +232,7 @@ void unpack_sk_s2(smallpoly *a, const uint8_t *sk, size_t idx) { // TODO: in the end increase this buffer size as far as possible #define POLY_UNIFORM_BUFFERSIZE 3 void 
poly_uniform_pointwise_montgomery_polywadd_stack(uint8_t wcomp[3*N], poly *b, uint8_t seed[SEEDBYTES], uint16_t nonce){ - //externalize the Keccak state + // TODO: externalize the Keccak state shake128incctx state; int32_t t; uint8_t buf[POLY_UNIFORM_BUFFERSIZE*3]; @@ -321,19 +321,22 @@ static void polyz_unpack_inplace(int32_t *r){ tmp1 |= (uint32_t)a[5*i+2] << 16; tmp1 &= 0xFFFFF; - r[2*i+0] = GAMMA1 - tmp0; - r[2*i+1] = GAMMA1 - tmp1; + r[2*i+0] = GAMMA1 - tmp1; + r[2*i+1] = GAMMA1 - tmp0; } #endif } -void poly_uniform_gamma1_add_stack(poly *a, poly *b, const uint8_t seed[CRHBYTES], uint16_t nonce, shake256incctx *state){ +#define POLY_UNIFORM_GAMMA1_NBLOCKS ((POLYZ_PACKEDBYTES + STREAM256_BLOCKBYTES - 1)/STREAM256_BLOCKBYTES) +void poly_uniform_gamma1_add_stack(poly *a, poly *b, const uint8_t seed[CRHBYTES], uint16_t nonce){ + // TODO: externalize the state + shake256incctx state; int32_t buf[POLY_UNIFORM_GAMMA1_BUFFERSIZE_COEFFS]; - stream256_init(state, seed, nonce); + stream256_init(&state, seed, nonce); for(size_t i = 0; i < N/POLY_UNIFORM_GAMMA1_BUFFERSIZE_COEFFS; i++){ - shake256_inc_squeeze((uint8_t *)buf, POLY_UNIFORM_GAMMA1_BUFFERSIZE_BYTES, state); + shake256_inc_squeeze((uint8_t *)buf, POLY_UNIFORM_GAMMA1_BUFFERSIZE_BYTES, &state); polyz_unpack_inplace(buf); for(size_t j = 0; j < POLY_UNIFORM_GAMMA1_BUFFERSIZE_COEFFS; j++){ diff --git a/crypto_sign/dilithium3/m4fstack/stack.h b/crypto_sign/dilithium3/m4fstack/stack.h index 64726a80..64504593 100644 --- a/crypto_sign/dilithium3/m4fstack/stack.h +++ b/crypto_sign/dilithium3/m4fstack/stack.h @@ -26,7 +26,7 @@ void unpack_sk_s2(smallpoly *a, const uint8_t *sk, size_t idx); void poly_uniform_pointwise_montgomery_polywadd_stack(uint8_t wcomp[3*N], poly *b, uint8_t seed[SEEDBYTES], uint16_t nonce); -void poly_uniform_gamma1_add_stack(poly *a, poly *b, const uint8_t seed[CRHBYTES], uint16_t nonce, shake256incctx *state); +void poly_uniform_gamma1_add_stack(poly *a, poly *b, const uint8_t seed[CRHBYTES], 
uint16_t nonce); size_t poly_make_hint_stack(poly *a, poly *t, uint8_t w[768]); From cbc29cf4d86320929124c84041a6552857966340 Mon Sep 17 00:00:00 2001 From: Amin Abdulrahman Date: Mon, 18 Mar 2024 13:30:25 +0100 Subject: [PATCH 09/32] Inline hint generation --- crypto_sign/dilithium3/m4fstack/sign.c | 23 +++++++++++++++-------- crypto_sign/dilithium3/m4fstack/stack.c | 1 - 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/crypto_sign/dilithium3/m4fstack/sign.c b/crypto_sign/dilithium3/m4fstack/sign.c index a0d43790..89ffa765 100644 --- a/crypto_sign/dilithium3/m4fstack/sign.c +++ b/crypto_sign/dilithium3/m4fstack/sign.c @@ -184,25 +184,31 @@ int crypto_sign_signature(uint8_t *sig, * do not reveal secret information */ for(unsigned int k_idx = 0; k_idx < K; ++k_idx) { - polyw_unpack(&tmp0, wcomp[k_idx]); - poly_decompose(&tmp1, &tmp0, &tmp0); - unpack_sk_s2(&stmp0, sk, k_idx); small_ntt(&stmp0); poly_small_basemul_invntt(&tmp1, &cp_small, &cp_small_prime, &stmp0); + polyw_unpack(&tmp0, wcomp[k_idx]); + poly_sub(&tmp0, &tmp0, &tmp1); poly_reduce(&tmp0); - if(poly_chknorm(&tmp0, GAMMA2 - BETA)) + + polyw_pack(wcomp[k_idx], &tmp0); + + poly_decompose(&tmp1, &tmp0, &tmp0); + poly_reduce(&tmp0); + if(poly_chknorm(&tmp0, GAMMA2 - BETA)){ goto rej; + } - poly_schoolbook(&tmp1, ccomp, sk + SEEDBYTES + TRBYTES + SEEDBYTES + + poly_schoolbook(&tmp0, ccomp, sk + SEEDBYTES + TRBYTES + SEEDBYTES + L*POLYETA_PACKEDBYTES + K*POLYETA_PACKEDBYTES + k_idx*POLYT0_PACKEDBYTES); /* Compute hints for w1 */ - if(poly_chknorm(&tmp1, GAMMA2)) + if(poly_chknorm(&tmp0, GAMMA2)) { goto rej; + } poly_add(&tmp0, &tmp0, &tmp1); @@ -211,11 +217,12 @@ int crypto_sign_signature(uint8_t *sig, poly_decompose_w1(&tmp1, &tmp1); - hint_n += poly_make_hint(&tmp1, &tmp0, &tmp1); + hint_n += poly_make_hint_stack(&tmp0, &tmp0, wcomp[k_idx]); + if (hint_n > OMEGA) { goto rej; } - pack_sig_h(sig, &tmp1, k_idx, &hints_written); + pack_sig_h(sig, &tmp0, k_idx, &hints_written); } pack_sig_h_zero(sig, 
&hints_written); *siglen = CRYPTO_BYTES; diff --git a/crypto_sign/dilithium3/m4fstack/stack.c b/crypto_sign/dilithium3/m4fstack/stack.c index 83fd1ac2..04f8ffbc 100644 --- a/crypto_sign/dilithium3/m4fstack/stack.c +++ b/crypto_sign/dilithium3/m4fstack/stack.c @@ -328,7 +328,6 @@ static void polyz_unpack_inplace(int32_t *r){ } -#define POLY_UNIFORM_GAMMA1_NBLOCKS ((POLYZ_PACKEDBYTES + STREAM256_BLOCKBYTES - 1)/STREAM256_BLOCKBYTES) void poly_uniform_gamma1_add_stack(poly *a, poly *b, const uint8_t seed[CRHBYTES], uint16_t nonce){ // TODO: externalize the state shake256incctx state; From 8468d602f4d1abe345c6e8b20c8574ada287d1e8 Mon Sep 17 00:00:00 2001 From: Amin Abdulrahman Date: Mon, 18 Mar 2024 13:37:44 +0100 Subject: [PATCH 10/32] Inline polyw subtraction --- crypto_sign/dilithium3/m4fstack/sign.c | 17 ++++------------- crypto_sign/dilithium3/m4fstack/stack.c | 25 +++++++++++++++++++++++++ crypto_sign/dilithium3/m4fstack/stack.h | 1 + 3 files changed, 30 insertions(+), 13 deletions(-) diff --git a/crypto_sign/dilithium3/m4fstack/sign.c b/crypto_sign/dilithium3/m4fstack/sign.c index 89ffa765..edb40e8b 100644 --- a/crypto_sign/dilithium3/m4fstack/sign.c +++ b/crypto_sign/dilithium3/m4fstack/sign.c @@ -93,7 +93,7 @@ int crypto_sign_signature(uint8_t *sig, unsigned int n; uint8_t wcomp[K][768]; uint8_t ccomp[68]; - poly tmp0, tmp1; + poly tmp0; shake256incctx state; smallpoly stmp0, stmp1; @@ -186,16 +186,14 @@ int crypto_sign_signature(uint8_t *sig, for(unsigned int k_idx = 0; k_idx < K; ++k_idx) { unpack_sk_s2(&stmp0, sk, k_idx); small_ntt(&stmp0); - poly_small_basemul_invntt(&tmp1, &cp_small, &cp_small_prime, &stmp0); + poly_small_basemul_invntt(&tmp0, &cp_small, &cp_small_prime, &stmp0); - polyw_unpack(&tmp0, wcomp[k_idx]); - - poly_sub(&tmp0, &tmp0, &tmp1); + polyw_sub(&tmp0, wcomp[k_idx], &tmp0); poly_reduce(&tmp0); polyw_pack(wcomp[k_idx], &tmp0); - poly_decompose(&tmp1, &tmp0, &tmp0); + poly_decompose_w0(&tmp0, &tmp0); poly_reduce(&tmp0); 
if(poly_chknorm(&tmp0, GAMMA2 - BETA)){ goto rej; @@ -210,13 +208,6 @@ int crypto_sign_signature(uint8_t *sig, goto rej; } - poly_add(&tmp0, &tmp0, &tmp1); - - - polyw_unpack(&tmp1, wcomp[k_idx]); - poly_decompose_w1(&tmp1, &tmp1); - - hint_n += poly_make_hint_stack(&tmp0, &tmp0, wcomp[k_idx]); if (hint_n > OMEGA) { diff --git a/crypto_sign/dilithium3/m4fstack/stack.c b/crypto_sign/dilithium3/m4fstack/stack.c index 04f8ffbc..600e2a39 100644 --- a/crypto_sign/dilithium3/m4fstack/stack.c +++ b/crypto_sign/dilithium3/m4fstack/stack.c @@ -418,3 +418,28 @@ void poly_decompose_w1(poly *a1, const poly *a) { for(i = 0; i < N; ++i) a1->coeffs[i] = decompose_w1(a->coeffs[i]); } + +static int32_t decompose_w0(int32_t a){ + int32_t a1; + int32_t a0; + + a1 = (a + 127) >> 7; +#if GAMMA2 == (Q-1)/32 + a1 = (a1*1025 + (1 << 21)) >> 22; + a1 &= 15; +#elif GAMMA2 == (Q-1)/88 + a1 = (a1*11275 + (1 << 23)) >> 24; + a1 ^= ((43 - a1) >> 31) & a1; +#endif + + a0 = a - a1*2*GAMMA2; + a0 -= (((Q-1)/2 - a0) >> 31) & Q; + return a0; +} + +void poly_decompose_w0(poly *a0, const poly *a){ + unsigned int i; + + for(i = 0; i < N; ++i) + a0->coeffs[i] = decompose_w0(a->coeffs[i]); +} \ No newline at end of file diff --git a/crypto_sign/dilithium3/m4fstack/stack.h b/crypto_sign/dilithium3/m4fstack/stack.h index 64504593..591f8ea5 100644 --- a/crypto_sign/dilithium3/m4fstack/stack.h +++ b/crypto_sign/dilithium3/m4fstack/stack.h @@ -36,4 +36,5 @@ void unpack_sk_stack(uint8_t rho[SEEDBYTES], uint8_t key[SEEDBYTES], const uint8_t sk[CRYPTO_SECRETKEYBYTES]); void poly_decompose_w1(poly *a1, const poly *a); +void poly_decompose_w0(poly *a0, const poly *a); #endif \ No newline at end of file From b4505e734fe79c923e06080816fc2c94332493e4 Mon Sep 17 00:00:00 2001 From: Amin Abdulrahman Date: Mon, 18 Mar 2024 13:42:06 +0100 Subject: [PATCH 11/32] Refactor decompose to high/lowbits --- crypto_sign/dilithium3/m4fstack/sign.c | 12 +++---- crypto_sign/dilithium3/m4fstack/stack.c | 47 ------------------------- 
crypto_sign/dilithium3/m4fstack/stack.h | 2 -- 3 files changed, 6 insertions(+), 55 deletions(-) diff --git a/crypto_sign/dilithium3/m4fstack/sign.c b/crypto_sign/dilithium3/m4fstack/sign.c index edb40e8b..ee2fcab6 100644 --- a/crypto_sign/dilithium3/m4fstack/sign.c +++ b/crypto_sign/dilithium3/m4fstack/sign.c @@ -96,7 +96,7 @@ int crypto_sign_signature(uint8_t *sig, poly tmp0; shake256incctx state; - smallpoly stmp0, stmp1; + smallpoly stmp0; smallpoly cp_small; smallhalfpoly cp_small_prime; @@ -135,7 +135,7 @@ int crypto_sign_signature(uint8_t *sig, /* Matrix-vector multiplication */ for (size_t k_idx = 0; k_idx < K; k_idx++) { // sampling of y and packing into wcomp inlined into the basemul - poly_uniform_pointwise_montgomery_polywadd_stack(&wcomp[k_idx], &tmp0, rho, (k_idx << 8) + l_idx); + poly_uniform_pointwise_montgomery_polywadd_stack(wcomp[k_idx], &tmp0, rho, (k_idx << 8) + l_idx); } } nonce++; @@ -145,7 +145,7 @@ int crypto_sign_signature(uint8_t *sig, poly_caddq(&tmp0); polyw_pack(wcomp[k_idx], &tmp0); - poly_decompose_w1(&tmp0, &tmp0); + poly_highbits(&tmp0, &tmp0); polyw1_pack(&sig[k_idx*POLYW1_PACKEDBYTES], &tmp0); } @@ -163,7 +163,7 @@ int crypto_sign_signature(uint8_t *sig, /* Compute z, reject if it reveals secret */ for(size_t l_idx=0;l_idx < L; l_idx++){ unpack_sk_s1(&stmp0, sk, l_idx); - small_ntt(&stmp0); + small_ntt(stmp0.coeffs); poly_small_basemul_invntt(&tmp0, &cp_small, &cp_small_prime, &stmp0); poly_uniform_gamma1_add_stack(&tmp0, &tmp0, rhoprime, L*(nonce-1) + l_idx); @@ -185,7 +185,7 @@ int crypto_sign_signature(uint8_t *sig, for(unsigned int k_idx = 0; k_idx < K; ++k_idx) { unpack_sk_s2(&stmp0, sk, k_idx); - small_ntt(&stmp0); + small_ntt(stmp0.coeffs); poly_small_basemul_invntt(&tmp0, &cp_small, &cp_small_prime, &stmp0); polyw_sub(&tmp0, wcomp[k_idx], &tmp0); @@ -193,7 +193,7 @@ int crypto_sign_signature(uint8_t *sig, polyw_pack(wcomp[k_idx], &tmp0); - poly_decompose_w0(&tmp0, &tmp0); + poly_lowbits(&tmp0, &tmp0); poly_reduce(&tmp0); 
if(poly_chknorm(&tmp0, GAMMA2 - BETA)){ goto rej; diff --git a/crypto_sign/dilithium3/m4fstack/stack.c b/crypto_sign/dilithium3/m4fstack/stack.c index 600e2a39..2bf0b97e 100644 --- a/crypto_sign/dilithium3/m4fstack/stack.c +++ b/crypto_sign/dilithium3/m4fstack/stack.c @@ -395,51 +395,4 @@ void unpack_sk_stack(uint8_t rho[SEEDBYTES], for(i = 0; i < TRBYTES; ++i) tr[i] = sk[i]; sk += TRBYTES; -} - -static int32_t decompose_w1(int32_t a){ - int32_t a1; - - a1 = (a + 127) >> 7; -#if GAMMA2 == (Q-1)/32 - a1 = (a1*1025 + (1 << 21)) >> 22; - a1 &= 15; -#elif GAMMA2 == (Q-1)/88 - a1 = (a1*11275 + (1 << 23)) >> 24; - a1 ^= ((43 - a1) >> 31) & a1; -#endif - - return a1; -} - -void poly_decompose_w1(poly *a1, const poly *a) { - unsigned int i; - - for(i = 0; i < N; ++i) - a1->coeffs[i] = decompose_w1(a->coeffs[i]); -} - -static int32_t decompose_w0(int32_t a){ - int32_t a1; - int32_t a0; - - a1 = (a + 127) >> 7; -#if GAMMA2 == (Q-1)/32 - a1 = (a1*1025 + (1 << 21)) >> 22; - a1 &= 15; -#elif GAMMA2 == (Q-1)/88 - a1 = (a1*11275 + (1 << 23)) >> 24; - a1 ^= ((43 - a1) >> 31) & a1; -#endif - - a0 = a - a1*2*GAMMA2; - a0 -= (((Q-1)/2 - a0) >> 31) & Q; - return a0; -} - -void poly_decompose_w0(poly *a0, const poly *a){ - unsigned int i; - - for(i = 0; i < N; ++i) - a0->coeffs[i] = decompose_w0(a->coeffs[i]); } \ No newline at end of file diff --git a/crypto_sign/dilithium3/m4fstack/stack.h b/crypto_sign/dilithium3/m4fstack/stack.h index 591f8ea5..e64f73a4 100644 --- a/crypto_sign/dilithium3/m4fstack/stack.h +++ b/crypto_sign/dilithium3/m4fstack/stack.h @@ -35,6 +35,4 @@ void unpack_sk_stack(uint8_t rho[SEEDBYTES], uint8_t tr[TRBYTES], uint8_t key[SEEDBYTES], const uint8_t sk[CRYPTO_SECRETKEYBYTES]); -void poly_decompose_w1(poly *a1, const poly *a); -void poly_decompose_w0(poly *a0, const poly *a); #endif \ No newline at end of file From f5a8a6588a8e0b837445e80b14dd50eb127d4dfa Mon Sep 17 00:00:00 2001 From: Amin Abdulrahman Date: Mon, 18 Mar 2024 13:53:31 +0100 Subject: [PATCH 12/32] 
Inline Keccak state --- crypto_sign/dilithium3/m4fstack/sign.c | 29 ++++++++++++++----------- crypto_sign/dilithium3/m4fstack/stack.c | 16 +++++--------- crypto_sign/dilithium3/m4fstack/stack.h | 5 +++-- 3 files changed, 25 insertions(+), 25 deletions(-) diff --git a/crypto_sign/dilithium3/m4fstack/sign.c b/crypto_sign/dilithium3/m4fstack/sign.c index ee2fcab6..5398e0e1 100644 --- a/crypto_sign/dilithium3/m4fstack/sign.c +++ b/crypto_sign/dilithium3/m4fstack/sign.c @@ -94,7 +94,10 @@ int crypto_sign_signature(uint8_t *sig, uint8_t wcomp[K][768]; uint8_t ccomp[68]; poly tmp0; - shake256incctx state; + union { + shake128incctx s128; + shake256incctx s256; + } state; smallpoly stmp0; smallpoly cp_small; @@ -109,11 +112,11 @@ int crypto_sign_signature(uint8_t *sig, unpack_sk_stack(rho, tr, key, sk); /* Compute mu = CRH(tr, msg) */ - shake256_inc_init(&state); - shake256_inc_absorb(&state, tr, TRBYTES); - shake256_inc_absorb(&state, m, mlen); - shake256_inc_finalize(&state); - shake256_inc_squeeze(mu, CRHBYTES, &state); + shake256_inc_init(&state.s256); + shake256_inc_absorb(&state.s256, tr, TRBYTES); + shake256_inc_absorb(&state.s256, m, mlen); + shake256_inc_finalize(&state.s256); + shake256_inc_squeeze(mu, CRHBYTES, &state.s256); for (n = 0; n < RNDBYTES; n++) { rnd[n] = 0; @@ -135,7 +138,7 @@ int crypto_sign_signature(uint8_t *sig, /* Matrix-vector multiplication */ for (size_t k_idx = 0; k_idx < K; k_idx++) { // sampling of y and packing into wcomp inlined into the basemul - poly_uniform_pointwise_montgomery_polywadd_stack(wcomp[k_idx], &tmp0, rho, (k_idx << 8) + l_idx); + poly_uniform_pointwise_montgomery_polywadd_stack(wcomp[k_idx], &tmp0, rho, (k_idx << 8) + l_idx, &state.s128); } } nonce++; @@ -149,11 +152,11 @@ int crypto_sign_signature(uint8_t *sig, polyw1_pack(&sig[k_idx*POLYW1_PACKEDBYTES], &tmp0); } - shake256_inc_init(&state); - shake256_inc_absorb(&state, mu, CRHBYTES); - shake256_inc_absorb(&state, sig, K*POLYW1_PACKEDBYTES); - 
shake256_inc_finalize(&state); - shake256_inc_squeeze(sig, CTILDEBYTES, &state); + shake256_inc_init(&state.s256); + shake256_inc_absorb(&state.s256, mu, CRHBYTES); + shake256_inc_absorb(&state.s256, sig, K*POLYW1_PACKEDBYTES); + shake256_inc_finalize(&state.s256); + shake256_inc_squeeze(sig, CTILDEBYTES, &state.s256); poly_challenge(&tmp0, sig); poly_challenge_compress(ccomp, &tmp0); @@ -166,7 +169,7 @@ int crypto_sign_signature(uint8_t *sig, small_ntt(stmp0.coeffs); poly_small_basemul_invntt(&tmp0, &cp_small, &cp_small_prime, &stmp0); - poly_uniform_gamma1_add_stack(&tmp0, &tmp0, rhoprime, L*(nonce-1) + l_idx); + poly_uniform_gamma1_add_stack(&tmp0, &tmp0, rhoprime, L*(nonce-1) + l_idx, &state.s256); poly_reduce(&tmp0); diff --git a/crypto_sign/dilithium3/m4fstack/stack.c b/crypto_sign/dilithium3/m4fstack/stack.c index 2bf0b97e..2824bc77 100644 --- a/crypto_sign/dilithium3/m4fstack/stack.c +++ b/crypto_sign/dilithium3/m4fstack/stack.c @@ -231,17 +231,15 @@ void unpack_sk_s2(smallpoly *a, const uint8_t *sk, size_t idx) { // TODO: in the end increase this buffer size as far as possible #define POLY_UNIFORM_BUFFERSIZE 3 -void poly_uniform_pointwise_montgomery_polywadd_stack(uint8_t wcomp[3*N], poly *b, uint8_t seed[SEEDBYTES], uint16_t nonce){ - // TODO: externalize the Keccak state - shake128incctx state; +void poly_uniform_pointwise_montgomery_polywadd_stack(uint8_t wcomp[3*N], poly *b, uint8_t seed[SEEDBYTES], uint16_t nonce, shake128incctx *state){ int32_t t; uint8_t buf[POLY_UNIFORM_BUFFERSIZE*3]; { size_t ctr = 0; - stream128_init(&state, seed, nonce); + stream128_init(state, seed, nonce); do { - shake128_inc_squeeze(buf, sizeof buf, &state); + shake128_inc_squeeze(buf, sizeof buf, state); for(size_t pos=0; pos < sizeof buf && ctr < N; pos += 3){ t = buf[pos]; @@ -328,14 +326,12 @@ static void polyz_unpack_inplace(int32_t *r){ } -void poly_uniform_gamma1_add_stack(poly *a, poly *b, const uint8_t seed[CRHBYTES], uint16_t nonce){ - // TODO: externalize the state 
- shake256incctx state; +void poly_uniform_gamma1_add_stack(poly *a, poly *b, const uint8_t seed[CRHBYTES], uint16_t nonce, shake256incctx *state){ int32_t buf[POLY_UNIFORM_GAMMA1_BUFFERSIZE_COEFFS]; - stream256_init(&state, seed, nonce); + stream256_init(state, seed, nonce); for(size_t i = 0; i < N/POLY_UNIFORM_GAMMA1_BUFFERSIZE_COEFFS; i++){ - shake256_inc_squeeze((uint8_t *)buf, POLY_UNIFORM_GAMMA1_BUFFERSIZE_BYTES, &state); + shake256_inc_squeeze((uint8_t *)buf, POLY_UNIFORM_GAMMA1_BUFFERSIZE_BYTES, state); polyz_unpack_inplace(buf); for(size_t j = 0; j < POLY_UNIFORM_GAMMA1_BUFFERSIZE_COEFFS; j++){ diff --git a/crypto_sign/dilithium3/m4fstack/stack.h b/crypto_sign/dilithium3/m4fstack/stack.h index e64f73a4..38626a61 100644 --- a/crypto_sign/dilithium3/m4fstack/stack.h +++ b/crypto_sign/dilithium3/m4fstack/stack.h @@ -8,6 +8,7 @@ #include "fips202.h" void poly_challenge_compress(uint8_t c[68], const poly *cp); +// TODO: remove this one void poly_challenge_decompress(poly *cp, const uint8_t c[68]); @@ -25,8 +26,8 @@ void unpack_sk_s1(smallpoly *a, const uint8_t *sk, size_t idx); void unpack_sk_s2(smallpoly *a, const uint8_t *sk, size_t idx); -void poly_uniform_pointwise_montgomery_polywadd_stack(uint8_t wcomp[3*N], poly *b, uint8_t seed[SEEDBYTES], uint16_t nonce); -void poly_uniform_gamma1_add_stack(poly *a, poly *b, const uint8_t seed[CRHBYTES], uint16_t nonce); +void poly_uniform_pointwise_montgomery_polywadd_stack(uint8_t wcomp[3*N], poly *b, uint8_t seed[SEEDBYTES], uint16_t nonce, shake128incctx *state); +void poly_uniform_gamma1_add_stack(poly *a, poly *b, const uint8_t seed[CRHBYTES], uint16_t nonce, shake256incctx *state); size_t poly_make_hint_stack(poly *a, poly *t, uint8_t w[768]); From 10d4766ea59d07683989e1eb255f490e2892d8ac Mon Sep 17 00:00:00 2001 From: Amin Abdulrahman Date: Mon, 18 Mar 2024 14:02:48 +0100 Subject: [PATCH 13/32] Shared buffer for polynomials --- crypto_sign/dilithium3/m4fstack/sign.c | 81 +++++++++++++++----------- 1 file 
changed, 46 insertions(+), 35 deletions(-) diff --git a/crypto_sign/dilithium3/m4fstack/sign.c b/crypto_sign/dilithium3/m4fstack/sign.c index 5398e0e1..eff33f33 100644 --- a/crypto_sign/dilithium3/m4fstack/sign.c +++ b/crypto_sign/dilithium3/m4fstack/sign.c @@ -93,14 +93,25 @@ int crypto_sign_signature(uint8_t *sig, unsigned int n; uint8_t wcomp[K][768]; uint8_t ccomp[68]; - poly tmp0; + union { shake128incctx s128; shake256incctx s256; } state; - smallpoly stmp0; - smallpoly cp_small; + // TODO: change this to union + struct { + poly full; + struct { + smallpoly stmp0; + smallpoly stmp1; + } small; + } polybuffer; + + poly *tmp0 = &polybuffer.full; + smallpoly *stmp0 = &polybuffer.small.stmp0; + smallpoly *scp = &polybuffer.small.stmp1; + smallhalfpoly cp_small_prime; rho = seedbuf; @@ -132,24 +143,24 @@ int crypto_sign_signature(uint8_t *sig, for (size_t l_idx = 0; l_idx < L; l_idx++) { /* Sample intermediate vector y */ - poly_uniform_gamma1(&tmp0, rhoprime, L*nonce + l_idx); - poly_ntt(&tmp0); + poly_uniform_gamma1(tmp0, rhoprime, L*nonce + l_idx); + poly_ntt(tmp0); /* Matrix-vector multiplication */ for (size_t k_idx = 0; k_idx < K; k_idx++) { // sampling of y and packing into wcomp inlined into the basemul - poly_uniform_pointwise_montgomery_polywadd_stack(wcomp[k_idx], &tmp0, rho, (k_idx << 8) + l_idx, &state.s128); + poly_uniform_pointwise_montgomery_polywadd_stack(wcomp[k_idx], tmp0, rho, (k_idx << 8) + l_idx, &state.s128); } } nonce++; for (size_t k_idx = 0; k_idx < K; k_idx++) { - polyw_unpack(&tmp0, wcomp[k_idx]); - poly_invntt_tomont(&tmp0); - poly_caddq(&tmp0); + polyw_unpack(tmp0, wcomp[k_idx]); + poly_invntt_tomont(tmp0); + poly_caddq(tmp0); - polyw_pack(wcomp[k_idx], &tmp0); - poly_highbits(&tmp0, &tmp0); - polyw1_pack(&sig[k_idx*POLYW1_PACKEDBYTES], &tmp0); + polyw_pack(wcomp[k_idx], tmp0); + poly_highbits(tmp0, tmp0); + polyw1_pack(&sig[k_idx*POLYW1_PACKEDBYTES], tmp0); } shake256_inc_init(&state.s256); @@ -157,26 +168,26 @@ int 
crypto_sign_signature(uint8_t *sig, shake256_inc_absorb(&state.s256, sig, K*POLYW1_PACKEDBYTES); shake256_inc_finalize(&state.s256); shake256_inc_squeeze(sig, CTILDEBYTES, &state.s256); - poly_challenge(&tmp0, sig); + poly_challenge(tmp0, sig); - poly_challenge_compress(ccomp, &tmp0); + poly_challenge_compress(ccomp, tmp0); - poly_small_ntt_precomp(&cp_small, &cp_small_prime, &tmp0); + poly_small_ntt_precomp(scp, &cp_small_prime, tmp0); /* Compute z, reject if it reveals secret */ for(size_t l_idx=0;l_idx < L; l_idx++){ - unpack_sk_s1(&stmp0, sk, l_idx); - small_ntt(stmp0.coeffs); - poly_small_basemul_invntt(&tmp0, &cp_small, &cp_small_prime, &stmp0); + unpack_sk_s1(stmp0, sk, l_idx); + small_ntt(stmp0->coeffs); + poly_small_basemul_invntt(tmp0, scp, &cp_small_prime, stmp0); - poly_uniform_gamma1_add_stack(&tmp0, &tmp0, rhoprime, L*(nonce-1) + l_idx, &state.s256); + poly_uniform_gamma1_add_stack(tmp0, tmp0, rhoprime, L*(nonce-1) + l_idx, &state.s256); - poly_reduce(&tmp0); + poly_reduce(tmp0); - if(poly_chknorm(&tmp0, GAMMA1 - BETA)) + if(poly_chknorm(tmp0, GAMMA1 - BETA)) goto rej; - polyz_pack(sig + CTILDEBYTES + l_idx*POLYZ_PACKEDBYTES, &tmp0); + polyz_pack(sig + CTILDEBYTES + l_idx*POLYZ_PACKEDBYTES, tmp0); } @@ -187,36 +198,36 @@ int crypto_sign_signature(uint8_t *sig, * do not reveal secret information */ for(unsigned int k_idx = 0; k_idx < K; ++k_idx) { - unpack_sk_s2(&stmp0, sk, k_idx); - small_ntt(stmp0.coeffs); - poly_small_basemul_invntt(&tmp0, &cp_small, &cp_small_prime, &stmp0); + unpack_sk_s2(stmp0, sk, k_idx); + small_ntt(stmp0->coeffs); + poly_small_basemul_invntt(tmp0, scp, &cp_small_prime, stmp0); - polyw_sub(&tmp0, wcomp[k_idx], &tmp0); - poly_reduce(&tmp0); + polyw_sub(tmp0, wcomp[k_idx], tmp0); + poly_reduce(tmp0); - polyw_pack(wcomp[k_idx], &tmp0); + polyw_pack(wcomp[k_idx], tmp0); - poly_lowbits(&tmp0, &tmp0); - poly_reduce(&tmp0); - if(poly_chknorm(&tmp0, GAMMA2 - BETA)){ + poly_lowbits(tmp0, tmp0); + poly_reduce(tmp0); + 
if(poly_chknorm(tmp0, GAMMA2 - BETA)){ goto rej; } - poly_schoolbook(&tmp0, ccomp, sk + SEEDBYTES + TRBYTES + SEEDBYTES + + poly_schoolbook(tmp0, ccomp, sk + SEEDBYTES + TRBYTES + SEEDBYTES + L*POLYETA_PACKEDBYTES + K*POLYETA_PACKEDBYTES + k_idx*POLYT0_PACKEDBYTES); /* Compute hints for w1 */ - if(poly_chknorm(&tmp0, GAMMA2)) { + if(poly_chknorm(tmp0, GAMMA2)) { goto rej; } - hint_n += poly_make_hint_stack(&tmp0, &tmp0, wcomp[k_idx]); + hint_n += poly_make_hint_stack(tmp0, tmp0, wcomp[k_idx]); if (hint_n > OMEGA) { goto rej; } - pack_sig_h(sig, &tmp0, k_idx, &hints_written); + pack_sig_h(sig, tmp0, k_idx, &hints_written); } pack_sig_h_zero(sig, &hints_written); *siglen = CRYPTO_BYTES; From 280423741d94e335182059c122272508758a9edf Mon Sep 17 00:00:00 2001 From: Amin Abdulrahman Date: Mon, 18 Mar 2024 14:02:58 +0100 Subject: [PATCH 14/32] rm 257 FFT --- crypto_sign/dilithium3/m4fstack/smallpoly.h | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/crypto_sign/dilithium3/m4fstack/smallpoly.h b/crypto_sign/dilithium3/m4fstack/smallpoly.h index caa26261..f2cf843b 100644 --- a/crypto_sign/dilithium3/m4fstack/smallpoly.h +++ b/crypto_sign/dilithium3/m4fstack/smallpoly.h @@ -6,7 +6,6 @@ -#if DILITHIUM_MODE == 3 // use q=769 #define SMALL_POLY_16_BIT typedef struct { int16_t coeffs[N]; @@ -14,18 +13,6 @@ typedef struct { typedef smallpoly smallhalfpoly; -#else // use q=257 -#define SMALL_POLY_32_BIT -typedef struct { - int32_t coeffs[N]; -} smallpoly; - -typedef struct { - int16_t coeffs[N]; -} smallhalfpoly; -#endif - - void poly_small_ntt_precomp(smallpoly *out, smallhalfpoly *out2, poly *in); void polyvecl_small_ntt(smallpoly v[L]); void polyveck_small_ntt(smallpoly v[K]); From d30a7662f1fa6df0f09880436e3581ac95718431 Mon Sep 17 00:00:00 2001 From: Amin Abdulrahman Date: Mon, 18 Mar 2024 14:05:25 +0100 Subject: [PATCH 15/32] Union for small and big poly --- crypto_sign/dilithium3/m4fstack/sign.c | 12 ++++++++---- crypto_sign/dilithium3/m4fstack/stack.h | 1 - 
2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/crypto_sign/dilithium3/m4fstack/sign.c b/crypto_sign/dilithium3/m4fstack/sign.c index eff33f33..9a3346c5 100644 --- a/crypto_sign/dilithium3/m4fstack/sign.c +++ b/crypto_sign/dilithium3/m4fstack/sign.c @@ -99,8 +99,7 @@ int crypto_sign_signature(uint8_t *sig, shake256incctx s256; } state; - // TODO: change this to union - struct { + union { poly full; struct { smallpoly stmp0; @@ -172,10 +171,12 @@ int crypto_sign_signature(uint8_t *sig, poly_challenge_compress(ccomp, tmp0); - poly_small_ntt_precomp(scp, &cp_small_prime, tmp0); - /* Compute z, reject if it reveals secret */ for(size_t l_idx=0;l_idx < L; l_idx++){ + if(l_idx != 0){ + poly_challenge_decompress(tmp0, ccomp); + } + poly_small_ntt_precomp(scp, &cp_small_prime, tmp0); unpack_sk_s1(stmp0, sk, l_idx); small_ntt(stmp0->coeffs); poly_small_basemul_invntt(tmp0, scp, &cp_small_prime, stmp0); @@ -198,6 +199,9 @@ int crypto_sign_signature(uint8_t *sig, * do not reveal secret information */ for(unsigned int k_idx = 0; k_idx < K; ++k_idx) { + poly_challenge_decompress(tmp0, ccomp); + poly_small_ntt_precomp(scp, &cp_small_prime, tmp0); + unpack_sk_s2(stmp0, sk, k_idx); small_ntt(stmp0->coeffs); poly_small_basemul_invntt(tmp0, scp, &cp_small_prime, stmp0); diff --git a/crypto_sign/dilithium3/m4fstack/stack.h b/crypto_sign/dilithium3/m4fstack/stack.h index 38626a61..6597b78e 100644 --- a/crypto_sign/dilithium3/m4fstack/stack.h +++ b/crypto_sign/dilithium3/m4fstack/stack.h @@ -8,7 +8,6 @@ #include "fips202.h" void poly_challenge_compress(uint8_t c[68], const poly *cp); -// TODO: remove this one void poly_challenge_decompress(poly *cp, const uint8_t c[68]); From a37b5a627a966a247fe994a7e089b557e7e7b47b Mon Sep 17 00:00:00 2001 From: Amin Abdulrahman Date: Mon, 18 Mar 2024 15:23:42 +0100 Subject: [PATCH 16/32] Eliminate some smaller buffers --- crypto_sign/dilithium3/m4fstack/sign.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 
deletions(-) diff --git a/crypto_sign/dilithium3/m4fstack/sign.c b/crypto_sign/dilithium3/m4fstack/sign.c index 9a3346c5..2e0a66f9 100644 --- a/crypto_sign/dilithium3/m4fstack/sign.c +++ b/crypto_sign/dilithium3/m4fstack/sign.c @@ -87,8 +87,9 @@ int crypto_sign_signature(uint8_t *sig, size_t mlen, const uint8_t *sk) { - uint8_t seedbuf[2 * SEEDBYTES + TRBYTES + RNDBYTES + 2 * CRHBYTES]; - uint8_t *rho, *tr, *key, *mu, *rhoprime, *rnd; + uint8_t buf[2 * CRHBYTES]; + uint8_t *mu, *rhoprime, *rnd; + const uint8_t *rho, *tr, *key; uint16_t nonce = 0; unsigned int n; uint8_t wcomp[K][768]; @@ -113,11 +114,12 @@ int crypto_sign_signature(uint8_t *sig, smallhalfpoly cp_small_prime; - rho = seedbuf; - tr = rho + SEEDBYTES; - key = tr + TRBYTES; - rnd = key + SEEDBYTES; - mu = rnd + RNDBYTES; + rho = sk; + tr = sk + SEEDBYTES*2; + key = sk + SEEDBYTES; + + mu = buf; + rnd = mu + CRHBYTES; rhoprime = mu + CRHBYTES; unpack_sk_stack(rho, tr, key, sk); @@ -128,10 +130,18 @@ int crypto_sign_signature(uint8_t *sig, shake256_inc_finalize(&state.s256); shake256_inc_squeeze(mu, CRHBYTES, &state.s256); + // Note: RNDBYTES < CRHBYTES, so buffer has proper size for (n = 0; n < RNDBYTES; n++) { rnd[n] = 0; } - shake256(rhoprime, CRHBYTES, key, SEEDBYTES + RNDBYTES + CRHBYTES); + + shake256_inc_init(&state.s256); + shake256_inc_absorb(&state.s256, key, SEEDBYTES); + shake256_inc_absorb(&state.s256, rnd, RNDBYTES); + shake256_inc_absorb(&state.s256, mu, CRHBYTES); + shake256_inc_finalize(&state.s256); + // rnd can be overwritten here + shake256_inc_squeeze(rhoprime, CRHBYTES, &state.s256); rej: for (size_t k_idx = 0; k_idx < K; k_idx++) { From 2bd00ad4dafb09c9abbd4f98f3ff278ed4624d55 Mon Sep 17 00:00:00 2001 From: Amin Abdulrahman Date: Mon, 18 Mar 2024 16:31:36 +0100 Subject: [PATCH 17/32] Remove asym small mul --- crypto_sign/dilithium3/m4fstack/sign.c | 10 +- crypto_sign/dilithium3/m4fstack/smallntt.S | 195 +++++++++----------- crypto_sign/dilithium3/m4fstack/smallntt.h | 6 +- 
crypto_sign/dilithium3/m4fstack/smallpoly.c | 13 +- crypto_sign/dilithium3/m4fstack/smallpoly.h | 7 +- 5 files changed, 99 insertions(+), 132 deletions(-) diff --git a/crypto_sign/dilithium3/m4fstack/sign.c b/crypto_sign/dilithium3/m4fstack/sign.c index 2e0a66f9..ff4096e1 100644 --- a/crypto_sign/dilithium3/m4fstack/sign.c +++ b/crypto_sign/dilithium3/m4fstack/sign.c @@ -112,8 +112,6 @@ int crypto_sign_signature(uint8_t *sig, smallpoly *stmp0 = &polybuffer.small.stmp0; smallpoly *scp = &polybuffer.small.stmp1; - smallhalfpoly cp_small_prime; - rho = sk; tr = sk + SEEDBYTES*2; key = sk + SEEDBYTES; @@ -186,10 +184,10 @@ int crypto_sign_signature(uint8_t *sig, if(l_idx != 0){ poly_challenge_decompress(tmp0, ccomp); } - poly_small_ntt_precomp(scp, &cp_small_prime, tmp0); + poly_small_ntt_copy(scp, tmp0); unpack_sk_s1(stmp0, sk, l_idx); small_ntt(stmp0->coeffs); - poly_small_basemul_invntt(tmp0, scp, &cp_small_prime, stmp0); + poly_small_basemul_invntt(tmp0, scp, stmp0); poly_uniform_gamma1_add_stack(tmp0, tmp0, rhoprime, L*(nonce-1) + l_idx, &state.s256); @@ -210,11 +208,11 @@ int crypto_sign_signature(uint8_t *sig, for(unsigned int k_idx = 0; k_idx < K; ++k_idx) { poly_challenge_decompress(tmp0, ccomp); - poly_small_ntt_precomp(scp, &cp_small_prime, tmp0); + poly_small_ntt_copy(scp, tmp0); unpack_sk_s2(stmp0, sk, k_idx); small_ntt(stmp0->coeffs); - poly_small_basemul_invntt(tmp0, scp, &cp_small_prime, stmp0); + poly_small_basemul_invntt(tmp0, scp, stmp0); polyw_sub(tmp0, wcomp[k_idx], tmp0); poly_reduce(tmp0); diff --git a/crypto_sign/dilithium3/m4fstack/smallntt.S b/crypto_sign/dilithium3/m4fstack/smallntt.S index 747c111c..a9a4a576 100644 --- a/crypto_sign/dilithium3/m4fstack/smallntt.S +++ b/crypto_sign/dilithium3/m4fstack/smallntt.S @@ -111,7 +111,6 @@ .align 2 small_ntt_asm: push {r4-r11, r14} - vpush.w {s16} poly .req r0 twiddle_ptr .req r1 @@ -137,33 +136,33 @@ small_ntt_asm: .equ offset, 32 .equ strincr, 4 // pre-load twiddle factors to FPU registers - vldm 
twiddle_ptr!, {s8-s15} + vldm twiddle_ptr!, {s20-s27} add tmp, poly, #strincr*8 - vmov s16, tmp + vmov s12, tmp 1: // load a1, a3, ..., a15 load poly, poly0, poly1, poly2, poly3, #offset, #distance/4+offset, #2*distance/4+offset, #3*distance/4+offset load poly, poly4, poly5, poly6, poly7, #distance+offset, #5*distance/4+offset, #6*distance/4+offset, #7*distance/4+offset // 8-NTT on a1, a3, ..., a15 - _3_layer_double_CT_16_fp poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s8, s9, s10, s11, twiddle, qinv, q, tmp, tmp2 + _3_layer_double_CT_16_fp poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s20, s21, s22, s23, twiddle, qinv, q, tmp, tmp2 // multiply coeffs by layer 4 twiddles for later use - vmov twiddle, s12 + vmov twiddle, s24 mul_twiddle b, poly0, twiddle, tmp, tmp2, q, qinv mul_twiddle t, poly1, twiddle, tmp, tmp2, q, qinv - vmov twiddle, s13 + vmov twiddle, s25 mul_twiddle b, poly2, twiddle, tmp, tmp2, q, qinv mul_twiddle t, poly3, twiddle, tmp, tmp2, q, qinv - vmov twiddle, s14 + vmov twiddle, s26 mul_twiddle b, poly4, twiddle, tmp, tmp2, q, qinv mul_twiddle t, poly5, twiddle, tmp, tmp2, q, qinv - vmov twiddle, s15 + vmov twiddle, s27 mul_twiddle b, poly6, twiddle, tmp, tmp2, q, qinv mul_twiddle t, poly7, twiddle, tmp, tmp2, q, qinv @@ -183,7 +182,7 @@ small_ntt_asm: load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 // 8-NTT on a0, a2, ..., a14 - _3_layer_double_CT_16_fp poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s8, s9, s10, s11, twiddle, qinv, q, tmp, tmp2 + _3_layer_double_CT_16_fp poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s20, s21, s22, s23, twiddle, qinv, q, tmp, tmp2 // layer 4 - 1 // addsub: (a2, a6, a10, a14), (a3, a7, a11, a15) @@ -247,7 +246,7 @@ small_ntt_asm: str.w poly7, [poly, #6*distance/4+offset] str.w poly0, [poly], #4 - vmov tmp, s16 + vmov tmp, s12 cmp.w poly, tmp bne.w 1b @@ -277,7 +276,6 @@ small_ntt_asm: cmp.w poly, tmp bne.w 2b - vpop.w {s16} pop 
{r4-r11, pc} @@ -495,32 +493,32 @@ small_invntt_tomont_asm: .equ strincr, 64 // pre-load twiddle factors to FPU registers - vldm twiddle_ptr!, {s8-s15} + vldm twiddle_ptr!, {s20-s27} add.w tmp, poly, #8*strincr - vmov s8, tmp + vmov s12, tmp 1: // load a1, a3, ..., a15 load poly, poly0, poly1, poly2, poly3, #offset, #distance/4+offset, #2*distance/4+offset, #3*distance/4+offset load poly, poly4, poly5, poly6, poly7, #distance+offset, #5*distance/4+offset, #6*distance/4+offset, #7*distance/4+offset // NTT on a1, a3, ..., a15 - _3_layer_double_inv_CT_16_light poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s8, s9, s10, s11, twiddle, q, qinv, tmp, tmp2 + _3_layer_double_inv_CT_16_light poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s20, s21, s22, s23, twiddle, q, qinv, tmp, tmp2 // multiply coeffs by layer 4 twiddles for later use - vmov twiddle, s12 + vmov twiddle, s24 mul_twiddle b, poly0, twiddle, tmp, tmp2, q, qinv // could be omitted but kept for reduction only mul_twiddle t, poly1, twiddle, tmp, tmp2, q, qinv - vmov twiddle, s13 + vmov twiddle, s25 mul_twiddle b, poly2, twiddle, tmp, tmp2, q, qinv mul_twiddle t, poly3, twiddle, tmp, tmp2, q, qinv - vmov twiddle, s14 + vmov twiddle, s26 mul_twiddle b, poly4, twiddle, tmp, tmp2, q, qinv mul_twiddle t, poly5, twiddle, tmp, tmp2, q, qinv - vmov twiddle, s15 + vmov twiddle, s27 mul_twiddle b, poly6, twiddle, tmp, tmp2, q, qinv mul_twiddle t, poly7, twiddle, tmp, tmp2, q, qinv @@ -540,7 +538,7 @@ small_invntt_tomont_asm: load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 // NTT on a0, a2, ..., a14 - _3_layer_double_inv_CT_16_light poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s8, s9, s10, s11, twiddle, q, qinv, tmp, tmp2 + _3_layer_double_inv_CT_16_light poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s20, s21, s22, s23, twiddle, q, qinv, tmp, tmp2 // layer 4 - 1 // addsub: (a2, a6, a10, a14), (a3, a7, a11, a15) @@ -604,7 +602,7 @@ 
small_invntt_tomont_asm: str.w poly7, [poly, #6*distance/4+offset] str.w poly0, [poly], #strincr // increase 2*8*4 = 64 (2 * 8 loads of 4 bytes each) - vmov tmp, s8 + vmov tmp, s12 cmp.w poly, tmp bne.w 1b @@ -618,9 +616,9 @@ small_invntt_tomont_asm: load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4 load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 - vldm twiddle_ptr!, {s5-s7} + vldm twiddle_ptr!, {s21-s23} - _3_layer_double_inv_CT_16_light_reduce poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s5, s5, s6, s7, twiddle, q, qinv, tmp, tmp2 + _3_layer_double_inv_CT_16_light_reduce poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s20, s21, s22, s23, twiddle, q, qinv, tmp, tmp2 vmov.w s2, poly movw poly, #:lower16:5585133 @@ -742,96 +740,69 @@ small_invntt_tomont_asm: .unreq tmp .unreq tmp2 -.align 2 -.global small_pointmul_asm -.type small_pointmul_asm, %function -small_pointmul_asm: - push.w {r4-r11, lr} - - movw r14, #769 - movt r14, #767 - - .equ width, 4 - - add.w r12, r2, #64*2 - _point_mul_16_loop: - - ldr.w r7, [r1, #2*width] - ldr.w r8, [r1, #3*width] - ldrsh.w r9, [r2, #1*2] - ldr.w r5, [r1, #1*width] - ldr.w r4, [r1], #4*width - ldrsh.w r6, [r2], #2*2 - - smultb r10, r4, r6 - montgomery r14, r14, r10, r11 - pkhbt r4, r4, r11 - - - neg.w r6, r6 - - smultb r10, r5, r6 - montgomery r14, r14, r10, r11 - pkhbt r5, r5, r11 - - str.w r5, [r0, #1*width] - str.w r4, [r0], #2*width - - smultb r10, r7, r9 - montgomery r14, r14, r10, r11 - pkhbt r7, r7, r11 - - neg.w r9, r9 - - smultb r10, r8, r9 - montgomery r14, r14, r10, r11 - pkhbt r8, r8, r11 +// BASEMUL - str.w r8, [r0, #1*width] - str.w r7, [r0], #2*width - cmp.w r2, r12 - bne.w _point_mul_16_loop - - pop.w {r4-r11, pc} - - .align 2 -.global small_asymmetric_mul_asm -.type small_asymmetric_mul_asm, %function -small_asymmetric_mul_asm: - push.w {r4-r11, lr} - - movw r14, #769 - movt r14, #767 - .equ width, 4 - add.w r12, 
r0, #256*2 - _asymmetric_mul_16_loop: - ldr.w r7, [r1, #width] - ldr.w r4, [r1], #2*width - ldr.w r8, [r2, #width] - ldr.w r5, [r2], #2*width - ldr.w r9, [r3, #width] - ldr.w r6, [r3], #2*width - - smuad r10, r4, r6 - montgomery r14, r14, r10, r6 - smuadx r11, r4, r5 - montgomery r14, r14, r11, r10 - - pkhtb r10, r10, r6, asr#16 - - str.w r10, [r0], #width - - smuad r10, r7, r9 - montgomery r14, r14, r10, r6 - smuadx r11, r7, r8 - montgomery r14, r14, r11, r10 - - pkhtb r10, r10, r6, asr#16 - str.w r10, [r0], #width - - - cmp.w r0, r12 - bne.w _asymmetric_mul_16_loop +.global small_basemul_asm +.type small_basemul_asm, %function +.align 2 +small_basemul_asm: + push {r4-r11, lr} + + rptr .req r0 + aptr .req r1 + bptr .req r2 + zeta_ptr .req r3 + poly0 .req r4 + poly1 .req r6 + poly2 .req r5 + poly3 .req r7 // TODO: remove poly3 + q .req r8 + qinv .req r8 + tmp .req r9 + tmp2 .req r10 + tmp3 .req r11 + zeta .req r12 + ctr .req r14 + + movw q, #769 + movt qinv, #767 + add ctr, rptr, #64*2*4 + 1: - pop.w {r4-r11, pc} \ No newline at end of file + ldr poly2, [aptr, #4] + ldr poly3, [bptr, #4] + ldrh.w zeta, [zeta_ptr], #2 + ldr poly0, [aptr], #8 + ldr poly1, [bptr], #8 + + //basemul(r->coeffs + 4 * i, a->coeffs + 4 * i, b->coeffs + 4 * i, zetas[64 + i]); + smultt tmp, poly0, poly1 + montgomery q, qinv, tmp, tmp2 + smultb tmp2, tmp2, zeta + smlabb tmp2, poly0, poly1, tmp2 + montgomery q, qinv, tmp2, tmp + + smuadx tmp2, poly0, poly1 + montgomery q, qinv, tmp2, tmp3 + pkhtb tmp, tmp3, tmp, asr#16 + str tmp, [rptr], #4 + + neg zeta, zeta + + //basemul(r->coeffs + 4 * i + 2, a->coeffs + 4 * i + 2, b->coeffs + 4 * i + 2, - zetas[64 + i]); + smultt tmp, poly2, poly3 + montgomery q, qinv, tmp, tmp2 + smultb tmp2, tmp2, zeta + smlabb tmp2, poly2, poly3, tmp2 + montgomery q, qinv, tmp2, tmp + + smuadx tmp2, poly2, poly3 + montgomery q, qinv, tmp2, tmp3 + pkhtb tmp, tmp3, tmp, asr#16 + + str tmp, [rptr], #4 + cmp.w rptr, ctr + bne.w 1b + + pop {r4-r11, pc} \ No newline at end of 
file diff --git a/crypto_sign/dilithium3/m4fstack/smallntt.h b/crypto_sign/dilithium3/m4fstack/smallntt.h index 0aa0ce9b..048d5df5 100644 --- a/crypto_sign/dilithium3/m4fstack/smallntt.h +++ b/crypto_sign/dilithium3/m4fstack/smallntt.h @@ -42,12 +42,10 @@ static const int16_t zetas_inv_CT_asm[256] = { void small_ntt_asm(int16_t a[N], const int16_t * zetas); void small_invntt_tomont_asm(int16_t a[N], const int16_t * zetas); -void small_pointmul_asm(int16_t out[N], const int16_t in[N], const int16_t *zetas); -void small_asymmetric_mul_asm(int16_t c[256], const int16_t a[256], const int16_t b[256], const int16_t b_prime[256]); +void small_basemul_asm(int16_t *c, const int16_t *a, const int16_t *b, const int16_t *zetas); #define small_ntt(a) small_ntt_asm(a, zetas_asm) #define small_invntt_tomont(a) small_invntt_tomont_asm(a, zetas_inv_CT_asm) -#define small_point_mul(out, in) small_pointmul_asm(out, in, zetas) -#define small_asymmetric_mul(c, a, b, b_prime) small_asymmetric_mul_asm(c, a, b, b_prime); +#define small_basemul(r,a,b) small_basemul_asm(r, a, b, zetas) #endif diff --git a/crypto_sign/dilithium3/m4fstack/smallpoly.c b/crypto_sign/dilithium3/m4fstack/smallpoly.c index 1f7fab17..433d98af 100644 --- a/crypto_sign/dilithium3/m4fstack/smallpoly.c +++ b/crypto_sign/dilithium3/m4fstack/smallpoly.c @@ -1,13 +1,12 @@ #include "smallpoly.h" #include "smallntt.h" -void poly_small_ntt_precomp(smallpoly *out, smallhalfpoly *out2, poly *in) { - for (int i = N; i >= 0; i--) +void poly_small_ntt_copy(smallpoly *out, poly *in) { + for (int i = N - 1; i >= 0; i--) { out->coeffs[i] = in->coeffs[i]; } small_ntt(out->coeffs); - small_point_mul(out2->coeffs, out->coeffs); } @@ -28,10 +27,10 @@ void polyveck_small_ntt(smallpoly v[K]) { -void poly_small_basemul_invntt(poly *r, const smallpoly *a, const smallhalfpoly *aprime, const smallpoly *b){ +void poly_small_basemul_invntt(poly *r, const smallpoly *a, const smallpoly *b){ // re-use the buffer smallpoly *tmp = (smallpoly *)r; - 
small_asymmetric_mul(tmp->coeffs, b->coeffs, a->coeffs, aprime->coeffs); + small_basemul(tmp->coeffs, a->coeffs, b->coeffs); small_invntt_tomont(tmp->coeffs); #ifdef SMALL_POLY_16_BIT @@ -43,10 +42,10 @@ void poly_small_basemul_invntt(poly *r, const smallpoly *a, const smallhalfpoly #endif } -void polyvecl_small_basemul_invntt(polyvecl *r, const smallpoly *a, const smallhalfpoly *aprime, const smallpoly b[L]){ +void polyvecl_small_basemul_invntt(polyvecl *r, const smallpoly *a, const smallpoly b[L]){ unsigned int i; for(i=0;ivec[i], a, aprime, &b[i]); + poly_small_basemul_invntt(&r->vec[i], a, &b[i]); } } diff --git a/crypto_sign/dilithium3/m4fstack/smallpoly.h b/crypto_sign/dilithium3/m4fstack/smallpoly.h index f2cf843b..1aac98fa 100644 --- a/crypto_sign/dilithium3/m4fstack/smallpoly.h +++ b/crypto_sign/dilithium3/m4fstack/smallpoly.h @@ -13,13 +13,14 @@ typedef struct { typedef smallpoly smallhalfpoly; -void poly_small_ntt_precomp(smallpoly *out, smallhalfpoly *out2, poly *in); +void poly_small_ntt_copy(smallpoly*, poly*); + void polyvecl_small_ntt(smallpoly v[L]); void polyveck_small_ntt(smallpoly v[K]); -void polyvecl_small_basemul_invntt(polyvecl *r, const smallpoly *a, const smallhalfpoly *aprime, const smallpoly b[L]); -void poly_small_basemul_invntt(poly *r, const smallpoly *a, const smallhalfpoly *aprime, const smallpoly *b); +void polyvecl_small_basemul_invntt(polyvecl *r, const smallpoly *a, const smallpoly b[L]); +void poly_small_basemul_invntt(poly *r, const smallpoly *a, const smallpoly *b); void small_polyeta_unpack(smallpoly *r, const uint8_t *a); From 77a75728bca65be08f897f5dbeb2704cd34e8d6e Mon Sep 17 00:00:00 2001 From: Amin Abdulrahman Date: Mon, 18 Mar 2024 16:45:16 +0100 Subject: [PATCH 18/32] Stack friendly uniform_gamma1 w/o add --- crypto_sign/dilithium3/m4fstack/sign.c | 2 +- crypto_sign/dilithium3/m4fstack/stack.c | 13 +++++++++++++ crypto_sign/dilithium3/m4fstack/stack.h | 1 + 3 files changed, 15 insertions(+), 1 deletion(-) diff --git 
a/crypto_sign/dilithium3/m4fstack/sign.c b/crypto_sign/dilithium3/m4fstack/sign.c index ff4096e1..ab1426ce 100644 --- a/crypto_sign/dilithium3/m4fstack/sign.c +++ b/crypto_sign/dilithium3/m4fstack/sign.c @@ -150,7 +150,7 @@ int crypto_sign_signature(uint8_t *sig, for (size_t l_idx = 0; l_idx < L; l_idx++) { /* Sample intermediate vector y */ - poly_uniform_gamma1(tmp0, rhoprime, L*nonce + l_idx); + poly_uniform_gamma1_stack(tmp0, rhoprime, L*nonce + l_idx, &state.s256); poly_ntt(tmp0); /* Matrix-vector multiplication */ diff --git a/crypto_sign/dilithium3/m4fstack/stack.c b/crypto_sign/dilithium3/m4fstack/stack.c index 2824bc77..d3256c8b 100644 --- a/crypto_sign/dilithium3/m4fstack/stack.c +++ b/crypto_sign/dilithium3/m4fstack/stack.c @@ -325,6 +325,19 @@ static void polyz_unpack_inplace(int32_t *r){ #endif } +void poly_uniform_gamma1_stack(poly *a, const uint8_t seed[CRHBYTES], uint16_t nonce, shake256incctx *state){ + int32_t buf[POLY_UNIFORM_GAMMA1_BUFFERSIZE_COEFFS]; + + stream256_init(state, seed, nonce); + for(size_t i = 0; i < N/POLY_UNIFORM_GAMMA1_BUFFERSIZE_COEFFS; i++){ + shake256_inc_squeeze((uint8_t *)buf, POLY_UNIFORM_GAMMA1_BUFFERSIZE_BYTES, state); + polyz_unpack_inplace(buf); + + for(size_t j = 0; j < POLY_UNIFORM_GAMMA1_BUFFERSIZE_COEFFS; j++){ + a->coeffs[i*POLY_UNIFORM_GAMMA1_BUFFERSIZE_COEFFS + j] = buf[j]; + } + } +} void poly_uniform_gamma1_add_stack(poly *a, poly *b, const uint8_t seed[CRHBYTES], uint16_t nonce, shake256incctx *state){ int32_t buf[POLY_UNIFORM_GAMMA1_BUFFERSIZE_COEFFS]; diff --git a/crypto_sign/dilithium3/m4fstack/stack.h b/crypto_sign/dilithium3/m4fstack/stack.h index 6597b78e..c21714c7 100644 --- a/crypto_sign/dilithium3/m4fstack/stack.h +++ b/crypto_sign/dilithium3/m4fstack/stack.h @@ -26,6 +26,7 @@ void unpack_sk_s2(smallpoly *a, const uint8_t *sk, size_t idx); void poly_uniform_pointwise_montgomery_polywadd_stack(uint8_t wcomp[3*N], poly *b, uint8_t seed[SEEDBYTES], uint16_t nonce, shake128incctx *state); +void 
poly_uniform_gamma1_stack(poly *a, const uint8_t seed[CRHBYTES], uint16_t nonce, shake256incctx *state); void poly_uniform_gamma1_add_stack(poly *a, poly *b, const uint8_t seed[CRHBYTES], uint16_t nonce, shake256incctx *state); size_t poly_make_hint_stack(poly *a, poly *t, uint8_t w[768]); From 6609f829d81fc7b556944218a274f4e7e6524ce4 Mon Sep 17 00:00:00 2001 From: Amin Abdulrahman Date: Mon, 18 Mar 2024 17:55:15 +0100 Subject: [PATCH 19/32] Stack optimized Dilithium{2,5} --- crypto_sign/dilithium2/m4fstack/api.h | 1 + crypto_sign/dilithium2/m4fstack/config.h | 1 + crypto_sign/dilithium2/m4fstack/macros.i | 1 + crypto_sign/dilithium2/m4fstack/ntt.S | 1 + crypto_sign/dilithium2/m4fstack/ntt.h | 1 + crypto_sign/dilithium2/m4fstack/packing.c | 1 + crypto_sign/dilithium2/m4fstack/packing.h | 1 + crypto_sign/dilithium2/m4fstack/params.h | 1 + crypto_sign/dilithium2/m4fstack/pointwise_mont.h | 1 + crypto_sign/dilithium2/m4fstack/pointwise_mont.s | 1 + crypto_sign/dilithium2/m4fstack/poly.c | 1 + crypto_sign/dilithium2/m4fstack/poly.h | 1 + crypto_sign/dilithium2/m4fstack/polyvec.c | 1 + crypto_sign/dilithium2/m4fstack/polyvec.h | 1 + crypto_sign/dilithium2/m4fstack/reduce.h | 1 + crypto_sign/dilithium2/m4fstack/rounding.c | 1 + crypto_sign/dilithium2/m4fstack/rounding.h | 1 + crypto_sign/dilithium2/m4fstack/sign.c | 1 + crypto_sign/dilithium2/m4fstack/sign.h | 1 + crypto_sign/dilithium2/m4fstack/smallntt.S | 1 + crypto_sign/dilithium2/m4fstack/smallntt.h | 1 + crypto_sign/dilithium2/m4fstack/smallpoly.c | 1 + crypto_sign/dilithium2/m4fstack/smallpoly.h | 1 + crypto_sign/dilithium2/m4fstack/stack.c | 1 + crypto_sign/dilithium2/m4fstack/stack.h | 1 + crypto_sign/dilithium2/m4fstack/symmetric-shake.c | 1 + crypto_sign/dilithium2/m4fstack/symmetric.h | 1 + crypto_sign/dilithium2/m4fstack/vector.h | 1 + crypto_sign/dilithium2/m4fstack/vector.s | 1 + crypto_sign/dilithium5/m4fstack/api.h | 1 + crypto_sign/dilithium5/m4fstack/config.h | 1 + 
crypto_sign/dilithium5/m4fstack/macros.i | 1 + crypto_sign/dilithium5/m4fstack/ntt.S | 1 + crypto_sign/dilithium5/m4fstack/ntt.h | 1 + crypto_sign/dilithium5/m4fstack/packing.c | 1 + crypto_sign/dilithium5/m4fstack/packing.h | 1 + crypto_sign/dilithium5/m4fstack/params.h | 1 + crypto_sign/dilithium5/m4fstack/pointwise_mont.h | 1 + crypto_sign/dilithium5/m4fstack/pointwise_mont.s | 1 + crypto_sign/dilithium5/m4fstack/poly.c | 1 + crypto_sign/dilithium5/m4fstack/poly.h | 1 + crypto_sign/dilithium5/m4fstack/polyvec.c | 1 + crypto_sign/dilithium5/m4fstack/polyvec.h | 1 + crypto_sign/dilithium5/m4fstack/reduce.h | 1 + crypto_sign/dilithium5/m4fstack/rounding.c | 1 + crypto_sign/dilithium5/m4fstack/rounding.h | 1 + crypto_sign/dilithium5/m4fstack/sign.c | 1 + crypto_sign/dilithium5/m4fstack/sign.h | 1 + crypto_sign/dilithium5/m4fstack/smallntt.S | 1 + crypto_sign/dilithium5/m4fstack/smallntt.h | 1 + crypto_sign/dilithium5/m4fstack/smallpoly.c | 1 + crypto_sign/dilithium5/m4fstack/smallpoly.h | 1 + crypto_sign/dilithium5/m4fstack/stack.c | 1 + crypto_sign/dilithium5/m4fstack/stack.h | 1 + crypto_sign/dilithium5/m4fstack/symmetric-shake.c | 1 + crypto_sign/dilithium5/m4fstack/symmetric.h | 1 + crypto_sign/dilithium5/m4fstack/vector.h | 1 + crypto_sign/dilithium5/m4fstack/vector.s | 1 + 58 files changed, 58 insertions(+) create mode 120000 crypto_sign/dilithium2/m4fstack/api.h create mode 120000 crypto_sign/dilithium2/m4fstack/config.h create mode 120000 crypto_sign/dilithium2/m4fstack/macros.i create mode 120000 crypto_sign/dilithium2/m4fstack/ntt.S create mode 120000 crypto_sign/dilithium2/m4fstack/ntt.h create mode 120000 crypto_sign/dilithium2/m4fstack/packing.c create mode 120000 crypto_sign/dilithium2/m4fstack/packing.h create mode 120000 crypto_sign/dilithium2/m4fstack/params.h create mode 120000 crypto_sign/dilithium2/m4fstack/pointwise_mont.h create mode 120000 crypto_sign/dilithium2/m4fstack/pointwise_mont.s create mode 120000 
crypto_sign/dilithium2/m4fstack/poly.c create mode 120000 crypto_sign/dilithium2/m4fstack/poly.h create mode 120000 crypto_sign/dilithium2/m4fstack/polyvec.c create mode 120000 crypto_sign/dilithium2/m4fstack/polyvec.h create mode 120000 crypto_sign/dilithium2/m4fstack/reduce.h create mode 120000 crypto_sign/dilithium2/m4fstack/rounding.c create mode 120000 crypto_sign/dilithium2/m4fstack/rounding.h create mode 120000 crypto_sign/dilithium2/m4fstack/sign.c create mode 120000 crypto_sign/dilithium2/m4fstack/sign.h create mode 120000 crypto_sign/dilithium2/m4fstack/smallntt.S create mode 120000 crypto_sign/dilithium2/m4fstack/smallntt.h create mode 120000 crypto_sign/dilithium2/m4fstack/smallpoly.c create mode 120000 crypto_sign/dilithium2/m4fstack/smallpoly.h create mode 120000 crypto_sign/dilithium2/m4fstack/stack.c create mode 120000 crypto_sign/dilithium2/m4fstack/stack.h create mode 120000 crypto_sign/dilithium2/m4fstack/symmetric-shake.c create mode 120000 crypto_sign/dilithium2/m4fstack/symmetric.h create mode 120000 crypto_sign/dilithium2/m4fstack/vector.h create mode 120000 crypto_sign/dilithium2/m4fstack/vector.s create mode 120000 crypto_sign/dilithium5/m4fstack/api.h create mode 120000 crypto_sign/dilithium5/m4fstack/config.h create mode 120000 crypto_sign/dilithium5/m4fstack/macros.i create mode 120000 crypto_sign/dilithium5/m4fstack/ntt.S create mode 120000 crypto_sign/dilithium5/m4fstack/ntt.h create mode 120000 crypto_sign/dilithium5/m4fstack/packing.c create mode 120000 crypto_sign/dilithium5/m4fstack/packing.h create mode 120000 crypto_sign/dilithium5/m4fstack/params.h create mode 120000 crypto_sign/dilithium5/m4fstack/pointwise_mont.h create mode 120000 crypto_sign/dilithium5/m4fstack/pointwise_mont.s create mode 120000 crypto_sign/dilithium5/m4fstack/poly.c create mode 120000 crypto_sign/dilithium5/m4fstack/poly.h create mode 120000 crypto_sign/dilithium5/m4fstack/polyvec.c create mode 120000 crypto_sign/dilithium5/m4fstack/polyvec.h create mode 
120000 crypto_sign/dilithium5/m4fstack/reduce.h create mode 120000 crypto_sign/dilithium5/m4fstack/rounding.c create mode 120000 crypto_sign/dilithium5/m4fstack/rounding.h create mode 120000 crypto_sign/dilithium5/m4fstack/sign.c create mode 120000 crypto_sign/dilithium5/m4fstack/sign.h create mode 120000 crypto_sign/dilithium5/m4fstack/smallntt.S create mode 120000 crypto_sign/dilithium5/m4fstack/smallntt.h create mode 120000 crypto_sign/dilithium5/m4fstack/smallpoly.c create mode 120000 crypto_sign/dilithium5/m4fstack/smallpoly.h create mode 120000 crypto_sign/dilithium5/m4fstack/stack.c create mode 120000 crypto_sign/dilithium5/m4fstack/stack.h create mode 120000 crypto_sign/dilithium5/m4fstack/symmetric-shake.c create mode 120000 crypto_sign/dilithium5/m4fstack/symmetric.h create mode 120000 crypto_sign/dilithium5/m4fstack/vector.h create mode 120000 crypto_sign/dilithium5/m4fstack/vector.s diff --git a/crypto_sign/dilithium2/m4fstack/api.h b/crypto_sign/dilithium2/m4fstack/api.h new file mode 120000 index 00000000..d29362d1 --- /dev/null +++ b/crypto_sign/dilithium2/m4fstack/api.h @@ -0,0 +1 @@ +../m4f/api.h \ No newline at end of file diff --git a/crypto_sign/dilithium2/m4fstack/config.h b/crypto_sign/dilithium2/m4fstack/config.h new file mode 120000 index 00000000..f3892d90 --- /dev/null +++ b/crypto_sign/dilithium2/m4fstack/config.h @@ -0,0 +1 @@ +../m4f/config.h \ No newline at end of file diff --git a/crypto_sign/dilithium2/m4fstack/macros.i b/crypto_sign/dilithium2/m4fstack/macros.i new file mode 120000 index 00000000..d615b854 --- /dev/null +++ b/crypto_sign/dilithium2/m4fstack/macros.i @@ -0,0 +1 @@ +../m4f/macros.i \ No newline at end of file diff --git a/crypto_sign/dilithium2/m4fstack/ntt.S b/crypto_sign/dilithium2/m4fstack/ntt.S new file mode 120000 index 00000000..40cd5d40 --- /dev/null +++ b/crypto_sign/dilithium2/m4fstack/ntt.S @@ -0,0 +1 @@ +../m4f/ntt.S \ No newline at end of file diff --git a/crypto_sign/dilithium2/m4fstack/ntt.h 
b/crypto_sign/dilithium2/m4fstack/ntt.h new file mode 120000 index 00000000..8e99caeb --- /dev/null +++ b/crypto_sign/dilithium2/m4fstack/ntt.h @@ -0,0 +1 @@ +../m4f/ntt.h \ No newline at end of file diff --git a/crypto_sign/dilithium2/m4fstack/packing.c b/crypto_sign/dilithium2/m4fstack/packing.c new file mode 120000 index 00000000..1052fe26 --- /dev/null +++ b/crypto_sign/dilithium2/m4fstack/packing.c @@ -0,0 +1 @@ +../m4f/packing.c \ No newline at end of file diff --git a/crypto_sign/dilithium2/m4fstack/packing.h b/crypto_sign/dilithium2/m4fstack/packing.h new file mode 120000 index 00000000..643cc32a --- /dev/null +++ b/crypto_sign/dilithium2/m4fstack/packing.h @@ -0,0 +1 @@ +../m4f/packing.h \ No newline at end of file diff --git a/crypto_sign/dilithium2/m4fstack/params.h b/crypto_sign/dilithium2/m4fstack/params.h new file mode 120000 index 00000000..1f91a364 --- /dev/null +++ b/crypto_sign/dilithium2/m4fstack/params.h @@ -0,0 +1 @@ +../m4f/params.h \ No newline at end of file diff --git a/crypto_sign/dilithium2/m4fstack/pointwise_mont.h b/crypto_sign/dilithium2/m4fstack/pointwise_mont.h new file mode 120000 index 00000000..32558852 --- /dev/null +++ b/crypto_sign/dilithium2/m4fstack/pointwise_mont.h @@ -0,0 +1 @@ +../m4f/pointwise_mont.h \ No newline at end of file diff --git a/crypto_sign/dilithium2/m4fstack/pointwise_mont.s b/crypto_sign/dilithium2/m4fstack/pointwise_mont.s new file mode 120000 index 00000000..3597ffdc --- /dev/null +++ b/crypto_sign/dilithium2/m4fstack/pointwise_mont.s @@ -0,0 +1 @@ +../m4f/pointwise_mont.s \ No newline at end of file diff --git a/crypto_sign/dilithium2/m4fstack/poly.c b/crypto_sign/dilithium2/m4fstack/poly.c new file mode 120000 index 00000000..2544e75b --- /dev/null +++ b/crypto_sign/dilithium2/m4fstack/poly.c @@ -0,0 +1 @@ +../../dilithium2/m4f/poly.c \ No newline at end of file diff --git a/crypto_sign/dilithium2/m4fstack/poly.h b/crypto_sign/dilithium2/m4fstack/poly.h new file mode 120000 index 00000000..7ef70e53 --- 
/dev/null +++ b/crypto_sign/dilithium2/m4fstack/poly.h @@ -0,0 +1 @@ +../../dilithium2/m4f/poly.h \ No newline at end of file diff --git a/crypto_sign/dilithium2/m4fstack/polyvec.c b/crypto_sign/dilithium2/m4fstack/polyvec.c new file mode 120000 index 00000000..569a9a1b --- /dev/null +++ b/crypto_sign/dilithium2/m4fstack/polyvec.c @@ -0,0 +1 @@ +../m4f/polyvec.c \ No newline at end of file diff --git a/crypto_sign/dilithium2/m4fstack/polyvec.h b/crypto_sign/dilithium2/m4fstack/polyvec.h new file mode 120000 index 00000000..d02c99c3 --- /dev/null +++ b/crypto_sign/dilithium2/m4fstack/polyvec.h @@ -0,0 +1 @@ +../m4f/polyvec.h \ No newline at end of file diff --git a/crypto_sign/dilithium2/m4fstack/reduce.h b/crypto_sign/dilithium2/m4fstack/reduce.h new file mode 120000 index 00000000..45fbf228 --- /dev/null +++ b/crypto_sign/dilithium2/m4fstack/reduce.h @@ -0,0 +1 @@ +../../dilithium3/m4fstack/reduce.h \ No newline at end of file diff --git a/crypto_sign/dilithium2/m4fstack/rounding.c b/crypto_sign/dilithium2/m4fstack/rounding.c new file mode 120000 index 00000000..ec780689 --- /dev/null +++ b/crypto_sign/dilithium2/m4fstack/rounding.c @@ -0,0 +1 @@ +../m4f/rounding.c \ No newline at end of file diff --git a/crypto_sign/dilithium2/m4fstack/rounding.h b/crypto_sign/dilithium2/m4fstack/rounding.h new file mode 120000 index 00000000..e64114bc --- /dev/null +++ b/crypto_sign/dilithium2/m4fstack/rounding.h @@ -0,0 +1 @@ +../m4f/rounding.h \ No newline at end of file diff --git a/crypto_sign/dilithium2/m4fstack/sign.c b/crypto_sign/dilithium2/m4fstack/sign.c new file mode 120000 index 00000000..ae3b84fa --- /dev/null +++ b/crypto_sign/dilithium2/m4fstack/sign.c @@ -0,0 +1 @@ +../../dilithium3/m4fstack/sign.c \ No newline at end of file diff --git a/crypto_sign/dilithium2/m4fstack/sign.h b/crypto_sign/dilithium2/m4fstack/sign.h new file mode 120000 index 00000000..551f979a --- /dev/null +++ b/crypto_sign/dilithium2/m4fstack/sign.h @@ -0,0 +1 @@ +../m4f/sign.h \ No newline 
at end of file diff --git a/crypto_sign/dilithium2/m4fstack/smallntt.S b/crypto_sign/dilithium2/m4fstack/smallntt.S new file mode 120000 index 00000000..7e2174f9 --- /dev/null +++ b/crypto_sign/dilithium2/m4fstack/smallntt.S @@ -0,0 +1 @@ +../../dilithium3/m4fstack/smallntt.S \ No newline at end of file diff --git a/crypto_sign/dilithium2/m4fstack/smallntt.h b/crypto_sign/dilithium2/m4fstack/smallntt.h new file mode 120000 index 00000000..cfd626b9 --- /dev/null +++ b/crypto_sign/dilithium2/m4fstack/smallntt.h @@ -0,0 +1 @@ +../../dilithium3/m4fstack/smallntt.h \ No newline at end of file diff --git a/crypto_sign/dilithium2/m4fstack/smallpoly.c b/crypto_sign/dilithium2/m4fstack/smallpoly.c new file mode 120000 index 00000000..7dbf4992 --- /dev/null +++ b/crypto_sign/dilithium2/m4fstack/smallpoly.c @@ -0,0 +1 @@ +../../dilithium3/m4fstack/smallpoly.c \ No newline at end of file diff --git a/crypto_sign/dilithium2/m4fstack/smallpoly.h b/crypto_sign/dilithium2/m4fstack/smallpoly.h new file mode 120000 index 00000000..366391d9 --- /dev/null +++ b/crypto_sign/dilithium2/m4fstack/smallpoly.h @@ -0,0 +1 @@ +../../dilithium3/m4fstack/smallpoly.h \ No newline at end of file diff --git a/crypto_sign/dilithium2/m4fstack/stack.c b/crypto_sign/dilithium2/m4fstack/stack.c new file mode 120000 index 00000000..c89dc5a0 --- /dev/null +++ b/crypto_sign/dilithium2/m4fstack/stack.c @@ -0,0 +1 @@ +../../dilithium3/m4fstack/stack.c \ No newline at end of file diff --git a/crypto_sign/dilithium2/m4fstack/stack.h b/crypto_sign/dilithium2/m4fstack/stack.h new file mode 120000 index 00000000..c9aed5d7 --- /dev/null +++ b/crypto_sign/dilithium2/m4fstack/stack.h @@ -0,0 +1 @@ +../../dilithium3/m4fstack/stack.h \ No newline at end of file diff --git a/crypto_sign/dilithium2/m4fstack/symmetric-shake.c b/crypto_sign/dilithium2/m4fstack/symmetric-shake.c new file mode 120000 index 00000000..b95855bb --- /dev/null +++ b/crypto_sign/dilithium2/m4fstack/symmetric-shake.c @@ -0,0 +1 @@ 
+../m4f/symmetric-shake.c \ No newline at end of file diff --git a/crypto_sign/dilithium2/m4fstack/symmetric.h b/crypto_sign/dilithium2/m4fstack/symmetric.h new file mode 120000 index 00000000..e89ae955 --- /dev/null +++ b/crypto_sign/dilithium2/m4fstack/symmetric.h @@ -0,0 +1 @@ +../m4f/symmetric.h \ No newline at end of file diff --git a/crypto_sign/dilithium2/m4fstack/vector.h b/crypto_sign/dilithium2/m4fstack/vector.h new file mode 120000 index 00000000..0793594b --- /dev/null +++ b/crypto_sign/dilithium2/m4fstack/vector.h @@ -0,0 +1 @@ +../m4f/vector.h \ No newline at end of file diff --git a/crypto_sign/dilithium2/m4fstack/vector.s b/crypto_sign/dilithium2/m4fstack/vector.s new file mode 120000 index 00000000..1a496055 --- /dev/null +++ b/crypto_sign/dilithium2/m4fstack/vector.s @@ -0,0 +1 @@ +../m4f/vector.s \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4fstack/api.h b/crypto_sign/dilithium5/m4fstack/api.h new file mode 120000 index 00000000..d29362d1 --- /dev/null +++ b/crypto_sign/dilithium5/m4fstack/api.h @@ -0,0 +1 @@ +../m4f/api.h \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4fstack/config.h b/crypto_sign/dilithium5/m4fstack/config.h new file mode 120000 index 00000000..f3892d90 --- /dev/null +++ b/crypto_sign/dilithium5/m4fstack/config.h @@ -0,0 +1 @@ +../m4f/config.h \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4fstack/macros.i b/crypto_sign/dilithium5/m4fstack/macros.i new file mode 120000 index 00000000..d615b854 --- /dev/null +++ b/crypto_sign/dilithium5/m4fstack/macros.i @@ -0,0 +1 @@ +../m4f/macros.i \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4fstack/ntt.S b/crypto_sign/dilithium5/m4fstack/ntt.S new file mode 120000 index 00000000..40cd5d40 --- /dev/null +++ b/crypto_sign/dilithium5/m4fstack/ntt.S @@ -0,0 +1 @@ +../m4f/ntt.S \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4fstack/ntt.h b/crypto_sign/dilithium5/m4fstack/ntt.h new file mode 120000 
index 00000000..8e99caeb --- /dev/null +++ b/crypto_sign/dilithium5/m4fstack/ntt.h @@ -0,0 +1 @@ +../m4f/ntt.h \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4fstack/packing.c b/crypto_sign/dilithium5/m4fstack/packing.c new file mode 120000 index 00000000..1052fe26 --- /dev/null +++ b/crypto_sign/dilithium5/m4fstack/packing.c @@ -0,0 +1 @@ +../m4f/packing.c \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4fstack/packing.h b/crypto_sign/dilithium5/m4fstack/packing.h new file mode 120000 index 00000000..643cc32a --- /dev/null +++ b/crypto_sign/dilithium5/m4fstack/packing.h @@ -0,0 +1 @@ +../m4f/packing.h \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4fstack/params.h b/crypto_sign/dilithium5/m4fstack/params.h new file mode 120000 index 00000000..1f91a364 --- /dev/null +++ b/crypto_sign/dilithium5/m4fstack/params.h @@ -0,0 +1 @@ +../m4f/params.h \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4fstack/pointwise_mont.h b/crypto_sign/dilithium5/m4fstack/pointwise_mont.h new file mode 120000 index 00000000..32558852 --- /dev/null +++ b/crypto_sign/dilithium5/m4fstack/pointwise_mont.h @@ -0,0 +1 @@ +../m4f/pointwise_mont.h \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4fstack/pointwise_mont.s b/crypto_sign/dilithium5/m4fstack/pointwise_mont.s new file mode 120000 index 00000000..3597ffdc --- /dev/null +++ b/crypto_sign/dilithium5/m4fstack/pointwise_mont.s @@ -0,0 +1 @@ +../m4f/pointwise_mont.s \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4fstack/poly.c b/crypto_sign/dilithium5/m4fstack/poly.c new file mode 120000 index 00000000..b5bdaa81 --- /dev/null +++ b/crypto_sign/dilithium5/m4fstack/poly.c @@ -0,0 +1 @@ +../m4f/poly.c \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4fstack/poly.h b/crypto_sign/dilithium5/m4fstack/poly.h new file mode 120000 index 00000000..bd94e469 --- /dev/null +++ b/crypto_sign/dilithium5/m4fstack/poly.h @@ -0,0 +1 @@ 
+../m4f/poly.h \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4fstack/polyvec.c b/crypto_sign/dilithium5/m4fstack/polyvec.c new file mode 120000 index 00000000..569a9a1b --- /dev/null +++ b/crypto_sign/dilithium5/m4fstack/polyvec.c @@ -0,0 +1 @@ +../m4f/polyvec.c \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4fstack/polyvec.h b/crypto_sign/dilithium5/m4fstack/polyvec.h new file mode 120000 index 00000000..d02c99c3 --- /dev/null +++ b/crypto_sign/dilithium5/m4fstack/polyvec.h @@ -0,0 +1 @@ +../m4f/polyvec.h \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4fstack/reduce.h b/crypto_sign/dilithium5/m4fstack/reduce.h new file mode 120000 index 00000000..45fbf228 --- /dev/null +++ b/crypto_sign/dilithium5/m4fstack/reduce.h @@ -0,0 +1 @@ +../../dilithium3/m4fstack/reduce.h \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4fstack/rounding.c b/crypto_sign/dilithium5/m4fstack/rounding.c new file mode 120000 index 00000000..ec780689 --- /dev/null +++ b/crypto_sign/dilithium5/m4fstack/rounding.c @@ -0,0 +1 @@ +../m4f/rounding.c \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4fstack/rounding.h b/crypto_sign/dilithium5/m4fstack/rounding.h new file mode 120000 index 00000000..e64114bc --- /dev/null +++ b/crypto_sign/dilithium5/m4fstack/rounding.h @@ -0,0 +1 @@ +../m4f/rounding.h \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4fstack/sign.c b/crypto_sign/dilithium5/m4fstack/sign.c new file mode 120000 index 00000000..ae3b84fa --- /dev/null +++ b/crypto_sign/dilithium5/m4fstack/sign.c @@ -0,0 +1 @@ +../../dilithium3/m4fstack/sign.c \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4fstack/sign.h b/crypto_sign/dilithium5/m4fstack/sign.h new file mode 120000 index 00000000..551f979a --- /dev/null +++ b/crypto_sign/dilithium5/m4fstack/sign.h @@ -0,0 +1 @@ +../m4f/sign.h \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4fstack/smallntt.S 
b/crypto_sign/dilithium5/m4fstack/smallntt.S new file mode 120000 index 00000000..7e2174f9 --- /dev/null +++ b/crypto_sign/dilithium5/m4fstack/smallntt.S @@ -0,0 +1 @@ +../../dilithium3/m4fstack/smallntt.S \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4fstack/smallntt.h b/crypto_sign/dilithium5/m4fstack/smallntt.h new file mode 120000 index 00000000..cfd626b9 --- /dev/null +++ b/crypto_sign/dilithium5/m4fstack/smallntt.h @@ -0,0 +1 @@ +../../dilithium3/m4fstack/smallntt.h \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4fstack/smallpoly.c b/crypto_sign/dilithium5/m4fstack/smallpoly.c new file mode 120000 index 00000000..7dbf4992 --- /dev/null +++ b/crypto_sign/dilithium5/m4fstack/smallpoly.c @@ -0,0 +1 @@ +../../dilithium3/m4fstack/smallpoly.c \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4fstack/smallpoly.h b/crypto_sign/dilithium5/m4fstack/smallpoly.h new file mode 120000 index 00000000..366391d9 --- /dev/null +++ b/crypto_sign/dilithium5/m4fstack/smallpoly.h @@ -0,0 +1 @@ +../../dilithium3/m4fstack/smallpoly.h \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4fstack/stack.c b/crypto_sign/dilithium5/m4fstack/stack.c new file mode 120000 index 00000000..c89dc5a0 --- /dev/null +++ b/crypto_sign/dilithium5/m4fstack/stack.c @@ -0,0 +1 @@ +../../dilithium3/m4fstack/stack.c \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4fstack/stack.h b/crypto_sign/dilithium5/m4fstack/stack.h new file mode 120000 index 00000000..c9aed5d7 --- /dev/null +++ b/crypto_sign/dilithium5/m4fstack/stack.h @@ -0,0 +1 @@ +../../dilithium3/m4fstack/stack.h \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4fstack/symmetric-shake.c b/crypto_sign/dilithium5/m4fstack/symmetric-shake.c new file mode 120000 index 00000000..b95855bb --- /dev/null +++ b/crypto_sign/dilithium5/m4fstack/symmetric-shake.c @@ -0,0 +1 @@ +../m4f/symmetric-shake.c \ No newline at end of file diff --git 
a/crypto_sign/dilithium5/m4fstack/symmetric.h b/crypto_sign/dilithium5/m4fstack/symmetric.h new file mode 120000 index 00000000..e89ae955 --- /dev/null +++ b/crypto_sign/dilithium5/m4fstack/symmetric.h @@ -0,0 +1 @@ +../m4f/symmetric.h \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4fstack/vector.h b/crypto_sign/dilithium5/m4fstack/vector.h new file mode 120000 index 00000000..0793594b --- /dev/null +++ b/crypto_sign/dilithium5/m4fstack/vector.h @@ -0,0 +1 @@ +../m4f/vector.h \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4fstack/vector.s b/crypto_sign/dilithium5/m4fstack/vector.s new file mode 120000 index 00000000..1a496055 --- /dev/null +++ b/crypto_sign/dilithium5/m4fstack/vector.s @@ -0,0 +1 @@ +../m4f/vector.s \ No newline at end of file From 59724a7b309ec9acdfae2f0dca65f06541d4ffa3 Mon Sep 17 00:00:00 2001 From: Amin Abdulrahman Date: Tue, 19 Mar 2024 16:39:48 +0100 Subject: [PATCH 20/32] Switch to Plantard-based 769 NTT --- .../dilithium2/m4fstack/macros_smallntt.i | 1 + .../dilithium3/m4fstack/macros_smallntt.i | 77 + crypto_sign/dilithium3/m4fstack/smallntt.S | 1341 ++++++++--------- crypto_sign/dilithium3/m4fstack/smallntt.h | 55 +- .../dilithium5/m4fstack/macros_smallntt.i | 1 + 5 files changed, 701 insertions(+), 774 deletions(-) create mode 120000 crypto_sign/dilithium2/m4fstack/macros_smallntt.i create mode 100644 crypto_sign/dilithium3/m4fstack/macros_smallntt.i create mode 120000 crypto_sign/dilithium5/m4fstack/macros_smallntt.i diff --git a/crypto_sign/dilithium2/m4fstack/macros_smallntt.i b/crypto_sign/dilithium2/m4fstack/macros_smallntt.i new file mode 120000 index 00000000..fc731f12 --- /dev/null +++ b/crypto_sign/dilithium2/m4fstack/macros_smallntt.i @@ -0,0 +1 @@ +../../dilithium3/m4fstack/macros_smallntt.i \ No newline at end of file diff --git a/crypto_sign/dilithium3/m4fstack/macros_smallntt.i b/crypto_sign/dilithium3/m4fstack/macros_smallntt.i new file mode 100644 index 00000000..b97f4d52 --- /dev/null 
+++ b/crypto_sign/dilithium3/m4fstack/macros_smallntt.i @@ -0,0 +1,77 @@ +/* +* NTT and inverse NTT code from: +* Huang, J. et al. 2024. Revisiting Keccak and Dilithium Implementations on ARMv7-M. +* IACR Transactions on Cryptographic Hardware and Embedded Systems. 2024, 2 (Mar. 2024), 1–24. +* DOI:https://doi.org/10.46586/tches.v2024.i2.1-24. +* https://github.com/UIC-ESLAS/Dilithium-Multi-Moduli/blob/332a32cc02d407020e48a4f9b3a0dc78d4c8b0bc/M4/crypto_sign/dilithium3/m4plant/smallntt_769.S +*/ + +#ifndef MACROS_SMALLNTT_I +#define MACROS_SMALLNTT_I + +// general macros +.macro load a, a0, a1, a2, a3, mem0, mem1, mem2, mem3 + ldr.w \a0, [\a, \mem0] + ldr.w \a1, [\a, \mem1] + ldr.w \a2, [\a, \mem2] + ldr.w \a3, [\a, \mem3] +.endm + +.macro store a, a0, a1, a2, a3, mem0, mem1, mem2, mem3 + str.w \a0, [\a, \mem0] + str.w \a1, [\a, \mem1] + str.w \a2, [\a, \mem2] + str.w \a3, [\a, \mem3] +.endm + +.macro doubleplant a, tmp, q, qa, plantconst + smulwb \tmp, \plantconst, \a + smulwt \a, \plantconst, \a + smlabt \tmp, \tmp, \q, \qa + smlabt \a, \a, \q, \qa + pkhtb \a, \a, \tmp, asr#16 +.endm + +.macro doublebarrett a, tmp, tmp2, q, barrettconst + smulbb \tmp, \a, \barrettconst + smultb \tmp2, \a, \barrettconst + asr \tmp, \tmp, #26 + asr \tmp2, \tmp2, #26 + smulbb \tmp, \tmp, \q + smulbb \tmp2, \tmp2, \q + pkhbt \tmp, \tmp, \tmp2, lsl#16 + usub16 \a, \a, \tmp +.endm + +// q is located in the top half of the register +.macro plant_red q, qa, qinv, tmp + mul \tmp, \tmp, \qinv + // (tmp*qinv mod 2^(2n)) / 2^n is now in the high half of tmp + smlatt \tmp, \tmp, \q, \qa + // result in high half +.endm + +.macro mul_twiddle_plant a, twiddle, tmp, q, qa + smulwb \tmp, \twiddle, \a + smulwt \a, \twiddle, \a + smlabt \tmp, \tmp, \q, \qa + smlabt \a, \a, \q, \qa + pkhtb \a, \a, \tmp, asr#16 +.endm + +.macro doublebutterfly_plant a0, a1, twiddle, tmp, q, qa + smulwb \tmp, \twiddle, \a1 + smulwt \a1, \twiddle, \a1 + smlabt \tmp, \tmp, \q, \qa + smlabt \a1, \a1, \q, \qa + pkhtb \tmp, \a1, \tmp, asr#16 + usub16 
\a1, \a0, \tmp + uadd16 \a0, \a0, \tmp +.endm + +.macro two_doublebutterfly_plant a0, a1, a2, a3, twiddle0, twiddle1, tmp, q, qa + doublebutterfly_plant \a0, \a1, \twiddle0, \tmp, \q, \qa + doublebutterfly_plant \a2, \a3, \twiddle1, \tmp, \q, \qa +.endm + +#endif \ No newline at end of file diff --git a/crypto_sign/dilithium3/m4fstack/smallntt.S b/crypto_sign/dilithium3/m4fstack/smallntt.S index a9a4a576..9f048042 100644 --- a/crypto_sign/dilithium3/m4fstack/smallntt.S +++ b/crypto_sign/dilithium3/m4fstack/smallntt.S @@ -1,283 +1,247 @@ +/* +* NTT and inverse NTT code from: +* Huang, J. et al. 2024. Revisiting Keccak and Dilithium Implementations on ARMv7-M. +* IACR Transactions on Cryptographic Hardware and Embedded Systems. 2024, 2 (Mar. 2024), 1–24. +* DOI:https://doi.org/10.46586/tches.v2024.i2.1-24. +* https://github.com/UIC-ESLAS/Dilithium-Multi-Moduli/blob/332a32cc02d407020e48a4f9b3a0dc78d4c8b0bc/M4/crypto_sign/dilithium3/m4plant/smallntt_769.S +*/ + #include "macros.i" .syntax unified .cpu cortex-m4 .thumb -// general macros -.macro load a, a0, a1, a2, a3, mem0, mem1, mem2, mem3 - ldr.w \a0, [\a, \mem0] - ldr.w \a1, [\a, \mem1] - ldr.w \a2, [\a, \mem2] - ldr.w \a3, [\a, \mem3] -.endm - -.macro store a, a0, a1, a2, a3, mem0, mem1, mem2, mem3 - str.w \a0, [\a, \mem0] - str.w \a1, [\a, \mem1] - str.w \a2, [\a, \mem2] - str.w \a3, [\a, \mem3] -.endm - -.macro montgomery q, qinv, a, tmp - smulbt \tmp, \a, \qinv - smlabb \tmp, \q, \tmp, \a -.endm - -.macro montgomery_inplace q, qinv, a, tmp - smulbt \tmp, \a, \qinv - smlabb \a, \q, \tmp, \a -.endm - -.macro doublemontgomery a, tmp, tmp2, q, qinv, montconst - smulbb \tmp2, \a, \montconst - montgomery \q, \qinv, \tmp2, \tmp - smultb \a, \a, \montconst - montgomery \q, \qinv, \a, \tmp2 - pkhtb \a, \tmp2, \tmp, asr#16 -.endm - +#include "macros_smallntt.i" // ####### // ####### // # NTT # // ####### // ####### -.macro mul_twiddle tb, a, twiddle, tmp, tmp2, q, qinv - smulb\tb \tmp, \a, \twiddle - smult\tb \a, \a, 
\twiddle - montgomery \q, \qinv, \tmp, \tmp2 // reduce -> result in tmp2 - montgomery \q, \qinv, \a, \tmp // reduce -> result in tmp2 - pkhtb \a, \tmp, \tmp2, asr#16 // combine results from above in one register as 16bit halves -.endm +.macro _3_layer_double_CT_16_plant c0, c1, c2, c3, c4, c5, c6, c7, twiddle1, twiddle2, twiddle_ptr, q, qa, tmp + // layer 3 + ldr.w \twiddle1, [\twiddle_ptr], #4 + two_doublebutterfly_plant \c0, \c4, \c1, \c5, \twiddle1, \twiddle1, \tmp, \q, \qa + two_doublebutterfly_plant \c2, \c6, \c3, \c7, \twiddle1, \twiddle1, \tmp, \q, \qa -.macro doublebutterfly tb, a0, a1, twiddle, tmp, tmp2, q, qinv - smulb\tb \tmp, \a1, \twiddle // a1_b * twiddle_tb - smult\tb \a1, \a1, \twiddle // a1_t * twiddle_tb - montgomery \q, \qinv, \tmp, \tmp2 // reduce -> result in tmp2 - montgomery \q, \qinv, \a1, \tmp // reduce -> result in tmp - pkhtb \tmp2, \tmp, \tmp2, asr#16 // combine results from above in one register as 16bit halves - usub16 \a1, \a0, \tmp2 // a0 - a1 * twiddle (a0, a1 contain 2 coeffs) - uadd16 \a0, \a0, \tmp2 // a0 + a1 * twiddle (a0, a1 contain 2 coeffs) -.endm - -.macro two_doublebutterfly tb1, tb2, a0, a1, a2, a3, twiddle, tmp, tmp2, q, qinv - doublebutterfly \tb1, \a0, \a1, \twiddle, \tmp, \tmp2, \q, \qinv - doublebutterfly \tb2, \a2, \a3, \twiddle, \tmp, \tmp2, \q, \qinv -.endm + // layer 2 + ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 + two_doublebutterfly_plant \c0, \c2, \c1, \c3, \twiddle1, \twiddle1, \tmp, \q, \qa -.macro _3_layer_double_CT_16 c0, c1, c2, c3, c4, c5, c6, c7, twiddle, twiddle_ptr, Qprime, Q, tmp, tmp2 - // layer 3 - ldrh.w \twiddle, [\twiddle_ptr], #2 - two_doublebutterfly b, b, \c0, \c4, \c1, \c5, \twiddle, \tmp, \tmp2, \Q, \Qprime - two_doublebutterfly b, b, \c2, \c6, \c3, \c7, \twiddle, \tmp, \tmp2, \Q, \Qprime + two_doublebutterfly_plant \c4, \c6, \c5, \c7, \twiddle2, \twiddle2, \tmp, \q, \qa - // layer 2 - ldr.w \twiddle, [\twiddle_ptr], #4 - two_doublebutterfly b, b, \c0, \c2, \c1, \c3, \twiddle, \tmp, 
\tmp2, \Q, \Qprime + // layer 1 + ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 + two_doublebutterfly_plant \c0, \c1, \c2, \c3, \twiddle1, \twiddle2, \tmp, \q, \qa - two_doublebutterfly t, t, \c4, \c6, \c5, \c7, \twiddle, \tmp, \tmp2, \Q, \Qprime - - // layer 1 - ldr.w \twiddle, [\twiddle_ptr], #4 - two_doublebutterfly b, t, \c0, \c1, \c2, \c3, \twiddle, \tmp, \tmp2, \Q, \Qprime - - ldr.w \twiddle, [\twiddle_ptr], #4 - two_doublebutterfly b, t, \c4, \c5, \c6, \c7, \twiddle, \tmp, \tmp2, \Q, \Qprime + ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 + two_doublebutterfly_plant \c4, \c5, \c6, \c7, \twiddle1, \twiddle2, \tmp, \q, \qa .endm -.macro _3_layer_double_CT_16_fp c0, c1, c2, c3, c4, c5, c6, c7, xi01, xi23, xi45, xi67, twiddle, Qprime, Q, tmp, tmp2 - // layer 3 - vmov \twiddle, \xi01 - two_doublebutterfly t, t, \c0, \c4, \c1, \c5, \twiddle, \tmp, \tmp2, \Q, \Qprime - two_doublebutterfly t, t, \c2, \c6, \c3, \c7, \twiddle, \tmp, \tmp2, \Q, \Qprime +.macro _3_layer_double_CT_16_plant_fp c0, c1, c2, c3, c4, c5, c6, c7, xi0, xi1, xi2, xi3, xi4, xi5, xi6, twiddle1, twiddle2, q, qa, tmp + // layer 3 + vmov \twiddle1, \xi0 + two_doublebutterfly_plant \c0, \c4, \c1, \c5, \twiddle1, \twiddle1, \tmp, \q, \qa + two_doublebutterfly_plant \c2, \c6, \c3, \c7, \twiddle1, \twiddle1, \tmp, \q, \qa - // layer 2 - vmov \twiddle, \xi23 - two_doublebutterfly b, b, \c0, \c2, \c1, \c3, \twiddle, \tmp, \tmp2, \Q, \Qprime + // layer 2 + vmov \twiddle1, \xi1 + vmov \twiddle2, \xi2 + two_doublebutterfly_plant \c0, \c2, \c1, \c3, \twiddle1, \twiddle1, \tmp, \q, \qa - two_doublebutterfly t, t, \c4, \c6, \c5, \c7, \twiddle, \tmp, \tmp2, \Q, \Qprime + two_doublebutterfly_plant \c4, \c6, \c5, \c7, \twiddle2, \twiddle2, \tmp, \q, \qa - // layer 1 - vmov \twiddle, \xi45 - two_doublebutterfly b, t, \c0, \c1, \c2, \c3, \twiddle, \tmp, \tmp2, \Q, \Qprime + // layer 1 + vmov \twiddle1, \xi3 + vmov \twiddle2, \xi4 + two_doublebutterfly_plant \c0, \c1, \c2, \c3, \twiddle1, \twiddle2, \tmp, \q, \qa - 
vmov \twiddle, \xi67 - two_doublebutterfly b, t, \c4, \c5, \c6, \c7, \twiddle, \tmp, \tmp2, \Q, \Qprime + vmov \twiddle1, \xi5 + vmov \twiddle2, \xi6 + two_doublebutterfly_plant \c4, \c5, \c6, \c7, \twiddle1, \twiddle2, \tmp, \q, \qa .endm -.global small_ntt_asm -.type small_ntt_asm, %function +.global small_ntt_asm_769 +.type small_ntt_asm_769, %function .align 2 -small_ntt_asm: - push {r4-r11, r14} - - poly .req r0 - twiddle_ptr .req r1 - poly0 .req r2 - poly1 .req r3 - poly2 .req r4 - poly3 .req r5 - poly4 .req r6 - poly5 .req r7 - poly6 .req r8 - poly7 .req r9 - twiddle .req r10 - qinv .req r11 - q .req r11 - tmp .req r12 - tmp2 .req r14 - - movw q, #769 - movt qinv, #767 - - ### LAYER 7+6+5+4 - .equ distance, 256 - .equ offset, 32 - .equ strincr, 4 - // pre-load twiddle factors to FPU registers - vldm twiddle_ptr!, {s20-s27} - - - add tmp, poly, #strincr*8 - vmov s12, tmp - 1: - // load a1, a3, ..., a15 - load poly, poly0, poly1, poly2, poly3, #offset, #distance/4+offset, #2*distance/4+offset, #3*distance/4+offset - load poly, poly4, poly5, poly6, poly7, #distance+offset, #5*distance/4+offset, #6*distance/4+offset, #7*distance/4+offset - - // 8-NTT on a1, a3, ..., a15 - _3_layer_double_CT_16_fp poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s20, s21, s22, s23, twiddle, qinv, q, tmp, tmp2 - - // multiply coeffs by layer 4 twiddles for later use - vmov twiddle, s24 - mul_twiddle b, poly0, twiddle, tmp, tmp2, q, qinv - mul_twiddle t, poly1, twiddle, tmp, tmp2, q, qinv - - vmov twiddle, s25 - mul_twiddle b, poly2, twiddle, tmp, tmp2, q, qinv - mul_twiddle t, poly3, twiddle, tmp, tmp2, q, qinv - - vmov twiddle, s26 - mul_twiddle b, poly4, twiddle, tmp, tmp2, q, qinv - mul_twiddle t, poly5, twiddle, tmp, tmp2, q, qinv - - vmov twiddle, s27 - mul_twiddle b, poly6, twiddle, tmp, tmp2, q, qinv - mul_twiddle t, poly7, twiddle, tmp, tmp2, q, qinv - - vmov s0, poly0 // a1 - vmov s1, poly1 // a3 - vmov s2, poly2 // a5 - vmov s3, poly3 // a7 - vmov s4, poly4 // a9 
- vmov s5, poly5 // a11 - vmov s6, poly6 // a13 - vmov s7, poly7 // a15 - - // ---------- - - // load a0, a2, ..., a14 - load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4 - load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 - - // 8-NTT on a0, a2, ..., a14 - _3_layer_double_CT_16_fp poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s20, s21, s22, s23, twiddle, qinv, q, tmp, tmp2 - - // layer 4 - 1 - // addsub: (a2, a6, a10, a14), (a3, a7, a11, a15) - vmov tmp2, s1 // load a3 - vmov s1, poly0 // preserve a0 - uadd16 poly0, poly1, tmp2 - usub16 poly1, poly1, tmp2 - - vmov tmp2, s3 // load a7 - vmov s3, poly2 // preserve a4 - uadd16 poly2, poly3, tmp2 - usub16 poly3, poly3, tmp2 - - vmov tmp2, s5 // load a11 - vmov s5, poly4 // preserve a8 - uadd16 poly4, poly5, tmp2 - usub16 poly5, poly5, tmp2 - - vmov tmp2, s7 // load a15 - vmov s7, poly6 // preserve a12 - uadd16 poly6, poly7, tmp2 - usub16 poly7, poly7, tmp2 - - str.w poly0, [poly, #1*distance/4] - str.w poly1, [poly, #1*distance/4+offset] - str.w poly2, [poly, #3*distance/4] - str.w poly3, [poly, #3*distance/4+offset] - str.w poly4, [poly, #5*distance/4] - str.w poly5, [poly, #5*distance/4+offset] - str.w poly6, [poly, #7*distance/4] - str.w poly7, [poly, #7*distance/4+offset] - - // layer 4 - 2 - // addsub: (a0, a4, a8, a12), (a1, a5, a9, a13) - vmov tmp2, s1 // load a0 - vmov poly1, s0 // load a1 - uadd16 poly0, tmp2, poly1 - usub16 poly1, tmp2, poly1 - - vmov tmp2, s3 // load a4 - vmov poly3, s2 // load a5 - uadd16 poly2, tmp2, poly3 - usub16 poly3, tmp2, poly3 - - vmov tmp2, s5 // load a8 - vmov poly5, s4 // load a9 - uadd16 poly4, tmp2, poly5 - usub16 poly5, tmp2, poly5 - - vmov tmp2, s7 // load a12 - vmov poly7, s6 // load a13 - uadd16 poly6, tmp2, poly7 - usub16 poly7, tmp2, poly7 - - str.w poly1, [poly, #offset] - str.w poly2, [poly, #2*distance/4] - str.w poly3, [poly, #2*distance/4+offset] - str.w poly4, [poly, 
#4*distance/4] - str.w poly5, [poly, #4*distance/4+offset] - str.w poly6, [poly, #6*distance/4] - str.w poly7, [poly, #6*distance/4+offset] - str.w poly0, [poly], #4 - - vmov tmp, s12 - cmp.w poly, tmp - bne.w 1b - - sub.w poly, #8*strincr - - ### LAYER 3+2+1 - - .equ distance, distance/16 - .equ strincr, 32 - - add.w tmp, poly, #strincr*16 - vmov s13, tmp - - 2: - load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4 - load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 - - _3_layer_double_CT_16 poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle, twiddle_ptr, qinv, q, tmp, tmp2 - - store poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 - str.w poly1, [poly, #distance/4] - str.w poly2, [poly, #2*distance/4] - str.w poly3, [poly, #3*distance/4] - str.w poly0, [poly], #strincr - - vmov tmp, s13 - cmp.w poly, tmp - bne.w 2b - - pop {r4-r11, pc} - +small_ntt_asm_769: + push {r4-r11, r14} + vpush.w {s16-s24} + poly .req r0 + twiddle_ptr .req r1 + poly0 .req r2 + poly1 .req r3 + poly2 .req r4 + poly3 .req r5 + poly4 .req r6 + poly5 .req r7 + poly6 .req r8 + poly7 .req r9 + twiddle1 .req r10 + twiddle2 .req r11 + ### qinv .req r11 ### q^-1 mod 2^2n; n=16 + q .req r12 + ### at the top of r12 + qa .req r0 + ### qa = 2^a * q = 24608 = 2^5 * 769, i.e. a = 5; kept in r0 (which aliases poly), not in r12 + tmp .req r14 + + // movw qa, #24608 + // Why movt? q is read from the top half of r12; qa was initially placed at the bottom of the same register, but is now r0. + movt q, #769 + + ### LAYER 7+6+5+4 + .equ distance, 256 + .equ offset, 32 + .equ strincr, 4 + // pre-load 15 twiddle factors to 15 FPU registers + // s0-s7 are used to temporarily store 16 16-bit coefficients. 
+ vldm twiddle_ptr!, {s8-s22} + + add tmp, poly, #strincr*8 + // s23: poly addr + // s24: tmp + vmov s24, tmp + 1: + // load a1, a3, ..., a15 + vmov s23, poly + load poly, poly0, poly1, poly2, poly3, #offset, #distance/4+offset, #2*distance/4+offset, #3*distance/4+offset + load poly, poly4, poly5, poly6, poly7, #distance+offset, #5*distance/4+offset, #6*distance/4+offset, #7*distance/4+offset + + movw qa, #24608 + + // 8-NTT on a1, a3, ..., a15 + _3_layer_double_CT_16_plant_fp poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s8, s9, s10, s11, s12, s13, s14, twiddle1, twiddle2, q, qa, tmp + + // s15, s16, s17, s18, s19, s20, s21, s22 left + // multiply coeffs by layer 8 twiddles for later use + vmov twiddle1, s15 + vmov twiddle2, s16 + mul_twiddle_plant poly0, twiddle1, tmp, q, qa + mul_twiddle_plant poly1, twiddle2, tmp, q, qa + + vmov twiddle1, s17 + vmov twiddle2, s18 + mul_twiddle_plant poly2, twiddle1, tmp, q, qa + mul_twiddle_plant poly3, twiddle2, tmp, q, qa + + vmov twiddle1, s19 + vmov twiddle2, s20 + mul_twiddle_plant poly4, twiddle1, tmp, q, qa + mul_twiddle_plant poly5, twiddle2, tmp, q, qa + + vmov twiddle1, s21 + vmov twiddle2, s22 + mul_twiddle_plant poly6, twiddle1, tmp, q, qa + mul_twiddle_plant poly7, twiddle2, tmp, q, qa + + vmov s0, poly0 // a1 + vmov s1, poly1 // a3 + vmov s2, poly2 // a5 + vmov s3, poly3 // a7 + vmov s4, poly4 // a9 + vmov s5, poly5 // a11 + vmov s6, poly6 // a13 + vmov s7, poly7 // a15 + + vmov poly, s23 + + // load a0, a2, ..., a14 + load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4 + load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 + + movw qa, #24608 + // 8-NTT on a0, a2, ..., a14 + _3_layer_double_CT_16_plant_fp poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s8, s9, s10, s11, s12, s13, s14, twiddle1, twiddle2, q, qa, tmp + + + // layer 4 - 1 + // addsub: (a2, a6, a10, a14), (a3, a7, a11, a15) + vmov poly, s23 + vmov twiddle1, s1 
// load a3 + uadd16 tmp, poly1, twiddle1 + usub16 poly1, poly1, twiddle1 + str.w tmp, [poly, #1*distance/4] + str.w poly1, [poly, #1*distance/4+offset] + + vmov twiddle1, s3 // load a7 + uadd16 tmp, poly3, twiddle1 + usub16 poly3, poly3, twiddle1 + str.w tmp, [poly, #3*distance/4] + str.w poly3, [poly, #3*distance/4+offset] + + vmov twiddle1, s5 // load a11 + uadd16 tmp, poly5, twiddle1 + usub16 poly5, poly5, twiddle1 + str.w tmp, [poly, #5*distance/4] + str.w poly5, [poly, #5*distance/4+offset] + + vmov twiddle1, s7 // load a15 + uadd16 tmp, poly7, twiddle1 + usub16 poly7, poly7, twiddle1 + str.w tmp, [poly, #7*distance/4] + str.w poly7, [poly, #7*distance/4+offset] + + // layer 4 - 2 + // addsub: (a0, a4, a8, a12), (a1, a5, a9, a13) + vmov poly3, s2 // load a5 + uadd16 tmp, poly2, poly3 + usub16 twiddle1, poly2, poly3 + str.w tmp, [poly, #2*distance/4] + str.w twiddle1, [poly, #2*distance/4+offset] + + vmov poly5, s4 // load a9 + uadd16 tmp, poly4, poly5 + usub16 twiddle1, poly4, poly5 + str.w tmp, [poly, #4*distance/4] + str.w twiddle1, [poly, #4*distance/4+offset] + + vmov poly7, s6 // load a13 + uadd16 tmp, poly6, poly7 + usub16 twiddle1, poly6, poly7 + str.w tmp, [poly, #6*distance/4] + str.w twiddle1, [poly, #6*distance/4+offset] + + vmov poly1, s0 // load a1 + uadd16 tmp, poly0, poly1 + usub16 twiddle1, poly0, poly1 + str.w twiddle1, [poly, #offset] + str.w tmp, [poly], #4 + + vmov tmp, s24 + cmp.w poly, tmp + bne.w 1b + + sub.w poly, #8*strincr + + ### LAYER 3+2+1 + + .equ distance, distance/16 + .equ strincr, 32 + + add.w tmp, poly, #strincr*16 + vmov s13, tmp + 2: + vmov s23, poly + load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4 + load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 + + movw qa, #24608 + _3_layer_double_CT_16_plant poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle1, twiddle2, twiddle_ptr, q, qa, tmp + + vmov poly, s23 + store poly, poly4, poly5, 
poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 + str.w poly1, [poly, #distance/4] + str.w poly2, [poly, #2*distance/4] + str.w poly3, [poly, #3*distance/4] + str.w poly0, [poly], #strincr + + vmov tmp, s13 + cmp.w poly, tmp + bne.w 2b + vpop.w {s16-s24} + pop {r4-r11, pc} .unreq poly .unreq twiddle_ptr @@ -289,11 +253,12 @@ small_ntt_asm: .unreq poly5 .unreq poly6 .unreq poly7 -.unreq twiddle -.unreq qinv +.unreq twiddle1 +.unreq twiddle2 .unreq q +.unreq qa .unreq tmp -.unreq tmp2 + // ######## // ######## @@ -301,428 +266,296 @@ small_ntt_asm: // ######## // ######## -.macro doublebutterfly_light a0, a1, tmp, tmp2, q, qinv - uadd16 \tmp, \a0, \a1 - usub16 \a1, \a0, \a1 - mov.w \a0, \tmp -.endm - -.macro two_doublebutterfly_light a0, a1, a2, a3, tmp, tmp2, q, qinv - doublebutterfly_light \a0, \a1, \tmp, \tmp2, \q, \qinv - doublebutterfly_light \a2, \a3, \tmp, \tmp2, \q, \qinv -.endm - -.macro _3_layer_double_inv_CT_16_light c0, c1, c2, c3, c4, c5, c6, c7, xi0, xi12, xi34, xi56, twiddle, q, qinv, tmp, tmp2 - - // layer 1 - sadd16.w \tmp, \c0, \c1 // c0, c1 - ssub16.w \c1, \c0, \c1 - sadd16.w \tmp2, \c2, \c3 // c2, c3 - ssub16.w \c3, \c2, \c3 - - sadd16.w \c0, \c4, \c5 // c4, c5 - ssub16.w \c5, \c4, \c5 - sadd16.w \c2, \c6, \c7 // c6, c7 - ssub16.w \c7, \c6, \c7 - // c4, c6 are free at this point - - // layer 2 - sadd16.w \c6, \tmp, \tmp2 // c0, c2 - ssub16.w \tmp2, \tmp, \tmp2 - sadd16.w \c4, \c0, \c2 // c4, c6 - ssub16.w \c2, \c0, \c2 - - vmov.w \twiddle, \xi12 - doublebutterfly t, \c1, \c3, \twiddle, \tmp, \c0, \q, \qinv // c0 has been used and c6 still free - doublebutterfly t, \c5, \c7, \twiddle, \tmp, \c0, \q, \qinv - // c0, c6 are free at this point - - // layer 3 - sadd16.w \c0, \c6, \c4 // c0, c4 - ssub16.w \c4, \c6, \c4 - - vmov.w \twiddle, \xi34 - doublebutterfly t, \c1, \c5, \twiddle, \tmp, \c6, \q, \qinv - - vmov.w \twiddle, \xi56 - // this block is one doublebutterfly - smulbb \tmp, \c2, \twiddle // c2, c6 - smultb \c2, \c2, 
\twiddle - montgomery_inplace \q, \qinv, \tmp, \c6 - montgomery_inplace \q, \qinv, \c2, \c6 - pkhtb \tmp, \c2, \tmp, asr #16 - ssub16.w \c6, \tmp2, \tmp - sadd16.w \c2, \tmp2, \tmp - - doublebutterfly t, \c3, \c7, \twiddle, \tmp, \tmp2, \q, \qinv - +// input: 0.5/1q +.macro _3_layer_double_inv_CT_16_plant_light c0, c1, c2, c3, c4, c5, c6, c7, xi2, xi4, xi5, xi6, twiddle1, tmp2, q, qa, tmp + + // layer 1 + sadd16.w \tmp, \c0, \c1 // c0, c1 + ssub16.w \c1, \c0, \c1 + sadd16.w \tmp2, \c2, \c3 // c2, c3 + ssub16.w \c3, \c2, \c3 + // tmp, c1, tmp2, c3: 1q maximum + sadd16.w \c0, \c4, \c5 // c4, c5 + ssub16.w \c5, \c4, \c5 + sadd16.w \c2, \c6, \c7 // c6, c7 + ssub16.w \c7, \c6, \c7 + // c4, c6 are free at this point + // c0,c5,c2,c7 1q maximum + + // layer 2 + sadd16.w \c6, \tmp, \tmp2 // c0, c2 + ssub16.w \tmp2, \tmp, \tmp2 + sadd16.w \c4, \c0, \c2 // c4, c6 + ssub16.w \c2, \c0, \c2 + // c6, tmp2, c4, c2: 2q maximum + + vmov.w \twiddle1, \xi2 + doublebutterfly_plant \c1, \c3, \twiddle1, \tmp, \q, \qa + doublebutterfly_plant \c5, \c7, \twiddle1, \tmp, \q, \qa + // c1, c3, c7, c5: 1.5q maximum; + + // tmp and c0 are free at this point + // layer 3 + sadd16.w \c0, \c6, \c4 // c0, c4 + ssub16.w \c4, \c6, \c4 + // c0, c4: 4q + // c6 are free at this point + vmov.w \twiddle1, \xi4 + doublebutterfly_plant \c1, \c5, \twiddle1, \tmp, \q, \qa + // c1, c5: 2q maximum + + vmov.w \twiddle1, \xi5 + // this block is one doublebutterfly + smulwb \tmp, \twiddle1, \c2 // c2, c6 + smulwt \c2, \twiddle1, \c2 + smlabt \tmp, \tmp, \q, \qa + smlabt \c2, \c2, \q, \qa + pkhtb \tmp, \c2, \tmp, asr#16 + ssub16.w \c6, \tmp2, \tmp + sadd16.w \c2, \tmp2, \tmp + //c6, c2: 4.5q + vmov.w \twiddle1, \xi6 + doublebutterfly_plant \c3, \c7, \twiddle1, \tmp, \q, \qa + //c3, c7: 2.5q maximum .endm +.macro _3_layer_double_inv_CT_16_plant c0, c1, c2, c3, c4, c5, c6, c7, twiddle1, twiddle2, twiddle_ptr, q, qa, tmp + // layer 3 + ldr.w \twiddle1, [\twiddle_ptr], #4 + two_doublebutterfly_plant \c0, \c1, \c2, \c3, 
\twiddle1, \twiddle1, \tmp, \q, \qa + two_doublebutterfly_plant \c4, \c5, \c6, \c7, \twiddle1, \twiddle1, \tmp, \q, \qa -.macro _3_layer_double_inv_CT_16_light_reduce c0, c1, c2, c3, c4, c5, c6, c7, xi0, xi12, xi34, xi56, twiddle, q, qinv, tmp, tmp2 - - // layer 1 - sadd16.w \tmp, \c0, \c1 // c0, c1 - ssub16.w \c1, \c0, \c1 - sadd16.w \tmp2, \c2, \c3 // c2, c3 - ssub16.w \c3, \c2, \c3 - - sadd16.w \c0, \c4, \c5 // c4, c5 - ssub16.w \c5, \c4, \c5 - sadd16.w \c2, \c6, \c7 // c6, c7 - ssub16.w \c7, \c6, \c7 - // c4, c6 are free at this point - - mov.w \c6, \tmp - mov.w \c4, \c0 - - // layer 2 - vmov.w \twiddle, \xi12 - doublebutterfly b, \c6, \tmp2, \twiddle, \tmp, \c0, \q, \qinv - doublebutterfly b, \c4, \c2, \twiddle, \tmp, \c0, \q, \qinv - doublebutterfly t, \c1, \c3, \twiddle, \tmp, \c0, \q, \qinv // c0 has been used and c6 still free - doublebutterfly t, \c5, \c7, \twiddle, \tmp, \c0, \q, \qinv - // c0, c6 are free at this point - - // layer 3 - sadd16.w \c0, \c6, \c4 // c0, c4 - ssub16.w \c4, \c6, \c4 - - vmov.w \twiddle, \xi34 - doublebutterfly t, \c1, \c5, \twiddle, \tmp, \c6, \q, \qinv - - vmov.w \twiddle, \xi56 - // this block is one doublebutterfly - smulbb \tmp, \c2, \twiddle // c2, c6 - smultb \c2, \c2, \twiddle - montgomery_inplace \q, \qinv, \tmp, \c6 - montgomery_inplace \q, \qinv, \c2, \c6 - pkhtb \tmp, \c2, \tmp, asr #16 - ssub16.w \c6, \tmp2, \tmp - sadd16.w \c2, \tmp2, \tmp - - doublebutterfly t, \c3, \c7, \twiddle, \tmp, \tmp2, \q, \qinv - -.endm + // layer 2 + ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 + two_doublebutterfly_plant \c0, \c2, \c1, \c3, \twiddle1, \twiddle2, \tmp, \q, \qa -.macro _3_layer_double_inv_CT_16 c0, c1, c2, c3, c4, c5, c6, c7, twiddle, twiddle_ptr, Qprime, Q, tmp, tmp2 - // layer 3 - ldrh.w twiddle, [twiddle_ptr], #2 - two_doublebutterfly b, b, \c0, \c1, \c2, \c3, \twiddle, \tmp, \tmp2, \Q, \Qprime - two_doublebutterfly b, b, \c4, \c5, \c6, \c7, \twiddle, \tmp, \tmp2, \Q, \Qprime + two_doublebutterfly_plant \c4, \c6, 
\c5, \c7, \twiddle1, \twiddle2, \tmp, \q, \qa - // layer 2 - ldr.w twiddle, [twiddle_ptr], #4 - two_doublebutterfly b, t, \c0, \c2, \c1, \c3, \twiddle, \tmp, \tmp2, \Q, \Qprime + // layer 1 + ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 + two_doublebutterfly_plant \c0, \c4, \c1, \c5, \twiddle1, \twiddle2, \tmp, \q, \qa - two_doublebutterfly b, t, \c4, \c6, \c5, \c7, \twiddle, \tmp, \tmp2, \Q, \Qprime - - // layer 1 - ldr.w twiddle, [twiddle_ptr], #4 - two_doublebutterfly b, t, \c0, \c4, \c1, \c5, \twiddle, \tmp, \tmp2, \Q, \Qprime - - ldr.w twiddle, [twiddle_ptr], #4 - two_doublebutterfly b, t, \c2, \c6, \c3, \c7, \twiddle, \tmp, \tmp2, \Q, \Qprime + ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 + two_doublebutterfly_plant \c2, \c6, \c3, \c7, \twiddle1, \twiddle2, \tmp, \q, \qa .endm -.macro mul_twiddle_barrett_32 tb a, twiddle, Qbar, Q, tmp, tmp2 - smulb\tb \tmp, \a, \twiddle - smmulr.w \tmp2, \tmp, \Qbar - mls.w \tmp, \tmp2, \Q, \tmp - smult\tb \a, \a, \twiddle - smmulr.w \tmp2, \a, \Qbar - mls.w \a, \tmp2, \Q, \a - pkhbt \a, \tmp, \a, lsl #16 +.macro _3_layer_double_inv_twist_16_plant c0, c1, c2, c3, c4, c5, c6, c7, twiddle1, twiddle2, twiddle_ptr, q, qa, tmp + ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 + mul_twiddle_plant \c0, \twiddle1, \tmp, \q, \qa + mul_twiddle_plant \c1, \twiddle2, \tmp, \q, \qa + ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 + mul_twiddle_plant \c2, \twiddle1, \tmp, \q, \qa + mul_twiddle_plant \c3, \twiddle2, \tmp, \q, \qa + ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 + mul_twiddle_plant \c4, \twiddle1, \tmp, \q, \qa + mul_twiddle_plant \c5, \twiddle2, \tmp, \q, \qa + ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 + mul_twiddle_plant \c6, \twiddle1, \tmp, \q, \qa + mul_twiddle_plant \c7, \twiddle2, \tmp, \q, \qa .endm - -.macro _3_layer_double_inv_twist_16 c0, c1, c2, c3, c4, c5, c6, c7, twiddle, twiddle_ptr, Qbar, Q, tmp, tmp2 - - movt \Q, #0 - - ldr.w \twiddle, [\twiddle_ptr], #4 - - mul_twiddle_barrett_32 b, \c0, \twiddle, \Qbar, 
\Q, \tmp, \tmp2 - mul_twiddle_barrett_32 t, \c1, \twiddle, \Qbar, \Q, \tmp, \tmp2 - - ldr.w \twiddle, [\twiddle_ptr], #4 - - mul_twiddle_barrett_32 b, \c2, \twiddle, \Qbar, \Q, \tmp, \tmp2 - mul_twiddle_barrett_32 t, \c3, \twiddle, \Qbar, \Q, \tmp, \tmp2 - - ldr.w \twiddle, [\twiddle_ptr], #4 - - mul_twiddle_barrett_32 b, \c4, \twiddle, \Qbar, \Q, \tmp, \tmp2 - mul_twiddle_barrett_32 t, \c5, \twiddle, \Qbar, \Q, \tmp, \tmp2 - - ldr.w \twiddle, [\twiddle_ptr], #4 - - mul_twiddle_barrett_32 b, \c6, \twiddle, \Qbar, \Q, \tmp, \tmp2 - mul_twiddle_barrett_32 t, \c7, \twiddle, \Qbar, \Q, \tmp, \tmp2 - - movt \Q, #767 - -.endm - -.global small_invntt_tomont_asm -.type small_invntt_tomont_asm, %function +# input coefficients < 0.5q +.global small_invntt_asm_769 +.type small_invntt_asm_769, %function .align 2 -small_invntt_tomont_asm: - push {r4-r11, r14} - - poly .req r0 - twiddle_ptr .req r1 - poly0 .req r2 - poly1 .req r3 - poly2 .req r4 - poly3 .req r5 - poly4 .req r6 - poly5 .req r7 - poly6 .req r8 - poly7 .req r9 - twiddle .req r10 - qinv .req r11 - q .req r11 - tmp .req r12 - tmp2 .req r14 - - movw q, #769 - movt qinv, #767 - - ### LAYER 7+6+5+4 - .equ distance, 16 - .equ offset, 32 - .equ strincr, 64 - - // pre-load twiddle factors to FPU registers - vldm twiddle_ptr!, {s20-s27} - - add.w tmp, poly, #8*strincr - vmov s12, tmp - 1: - // load a1, a3, ..., a15 - load poly, poly0, poly1, poly2, poly3, #offset, #distance/4+offset, #2*distance/4+offset, #3*distance/4+offset - load poly, poly4, poly5, poly6, poly7, #distance+offset, #5*distance/4+offset, #6*distance/4+offset, #7*distance/4+offset - - // NTT on a1, a3, ..., a15 - _3_layer_double_inv_CT_16_light poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s20, s21, s22, s23, twiddle, q, qinv, tmp, tmp2 - - // multiply coeffs by layer 4 twiddles for later use - vmov twiddle, s24 - mul_twiddle b, poly0, twiddle, tmp, tmp2, q, qinv // could be omitted but kept for reduction only - mul_twiddle t, poly1, twiddle, tmp, 
tmp2, q, qinv - - vmov twiddle, s25 - mul_twiddle b, poly2, twiddle, tmp, tmp2, q, qinv - mul_twiddle t, poly3, twiddle, tmp, tmp2, q, qinv - - vmov twiddle, s26 - mul_twiddle b, poly4, twiddle, tmp, tmp2, q, qinv - mul_twiddle t, poly5, twiddle, tmp, tmp2, q, qinv - - vmov twiddle, s27 - mul_twiddle b, poly6, twiddle, tmp, tmp2, q, qinv - mul_twiddle t, poly7, twiddle, tmp, tmp2, q, qinv - - vmov s0, poly0 // a1 - vmov s1, poly1 // a3 - vmov s2, poly2 // a5 - vmov s3, poly3 // a7 - vmov s4, poly4 // a9 - vmov s5, poly5 // a11 - vmov s6, poly6 // a13 - vmov s7, poly7 // a15 - - // ---------- - - // load a0, a2, ..., a14 - load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4 - load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 - - // NTT on a0, a2, ..., a14 - _3_layer_double_inv_CT_16_light poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s20, s21, s22, s23, twiddle, q, qinv, tmp, tmp2 - - // layer 4 - 1 - // addsub: (a2, a6, a10, a14), (a3, a7, a11, a15) - vmov tmp2, s1 // load a3 - vmov s1, poly0 // preserve a0 - uadd16 poly0, poly1, tmp2 - usub16 poly1, poly1, tmp2 - - vmov tmp2, s3 // load a7 - vmov s3, poly2 // preserve a4 - uadd16 poly2, poly3, tmp2 - usub16 poly3, poly3, tmp2 - - vmov tmp2, s5 // load a11 - vmov s5, poly4 // preserve a8 - uadd16 poly4, poly5, tmp2 - usub16 poly5, poly5, tmp2 - - vmov tmp2, s7 // load a15 - vmov s7, poly6 // preserve a12 - uadd16 poly6, poly7, tmp2 - usub16 poly7, poly7, tmp2 - - str.w poly0, [poly, #1*distance/4] - str.w poly1, [poly, #1*distance/4+offset] - str.w poly2, [poly, #3*distance/4] - str.w poly3, [poly, #3*distance/4+offset] - str.w poly4, [poly, #5*distance/4] - str.w poly5, [poly, #5*distance/4+offset] - str.w poly6, [poly, #7*distance/4] - str.w poly7, [poly, #7*distance/4+offset] - - // layer 4 - 2 - // addsub: (a0, a4, a8, a12), (a1, a5, a9, a13) - vmov tmp2, s1 // load a0 - vmov poly1, s0 // load a1 - uadd16 poly0, tmp2, poly1 - 
usub16 poly1, tmp2, poly1 - - vmov tmp2, s3 // load a4 - vmov poly3, s2 // load a5 - uadd16 poly2, tmp2, poly3 - usub16 poly3, tmp2, poly3 - - vmov tmp2, s5 // load a8 - vmov poly5, s4 // load a9 - uadd16 poly4, tmp2, poly5 - usub16 poly5, tmp2, poly5 - - vmov tmp2, s7 // load a12 - vmov poly7, s6 // load a13 - uadd16 poly6, tmp2, poly7 - usub16 poly7, tmp2, poly7 - - str.w poly1, [poly, #offset] - str.w poly2, [poly, #2*distance/4] - str.w poly3, [poly, #2*distance/4+offset] - str.w poly4, [poly, #4*distance/4] - str.w poly5, [poly, #4*distance/4+offset] - str.w poly6, [poly, #6*distance/4] - str.w poly7, [poly, #6*distance/4+offset] - str.w poly0, [poly], #strincr // increase 2*8*4 = 64 (2 * 8 loads of 4 bytes each) - - vmov tmp, s12 - cmp.w poly, tmp - bne.w 1b - - sub.w poly, #8*strincr - - ### LAYER 3+2+1 - .equ distance, distance*16 - .equ strincr, 4 - - // ITER 0 - load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4 - load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 - - vldm twiddle_ptr!, {s21-s23} - - _3_layer_double_inv_CT_16_light_reduce poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s20, s21, s22, s23, twiddle, q, qinv, tmp, tmp2 - - vmov.w s2, poly - movw poly, #:lower16:5585133 - movt poly, #:upper16:5585133 - - // twisting - _3_layer_double_inv_twist_16 poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle, twiddle_ptr, poly, q, tmp, tmp2 - - vmov.w poly, s2 - - store poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 - str.w poly1, [poly, #distance/4] - str.w poly2, [poly, #2*distance/4] - str.w poly3, [poly, #3*distance/4] - str.w poly0, [poly], #4 - - // ITER 1-12 - add.w tmp, poly, #strincr*3*(3+1) - vmov s14, tmp - 3: - add.w tmp, poly, #strincr*3 - vmov s13, tmp - 2: - // polys upto 6q - load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4 - load poly, poly4, poly5, poly6, poly7, 
#distance, #5*distance/4, #6*distance/4, #7*distance/4 - - - _3_layer_double_inv_CT_16 poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle, twiddle_ptr, qinv, q, tmp, tmp2 - - vmov.w s2, poly - movw poly, #:lower16:5585133 - movt poly, #:upper16:5585133 - - // twisting - _3_layer_double_inv_twist_16 poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle, twiddle_ptr, poly, q, tmp, tmp2 - - vmov.w poly, s2 - - store poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 - str.w poly1, [poly, #distance/4] - str.w poly2, [poly, #2*distance/4] - str.w poly3, [poly, #3*distance/4] - str.w poly0, [poly], #4 - - vmov tmp, s13 - cmp.w poly, tmp - bne.w 2b - - // polys upto 9q - load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4 - load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 - - _3_layer_double_inv_CT_16 poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle, twiddle_ptr, qinv, q, tmp, tmp2 - - vmov.w s2, poly - movw poly, #:lower16:5585133 - movt poly, #:upper16:5585133 - - // twisting - _3_layer_double_inv_twist_16 poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle, twiddle_ptr, poly, q, tmp, tmp2 - - vmov.w poly, s2 - - store poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 - str.w poly1, [poly, #distance/4] - str.w poly2, [poly, #2*distance/4] - str.w poly3, [poly, #3*distance/4] - str.w poly0, [poly], #4 - - vmov tmp, s14 - cmp.w poly, tmp - bne.w 3b - - // ITER 13-15 - add tmp, poly, #3*strincr - vmov s13, tmp - 2: - // polys upto 6q - load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4 - load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 - - _3_layer_double_inv_CT_16 poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle, twiddle_ptr, qinv, q, tmp, tmp2 - - vmov.w s2, poly - movw poly, 
#:lower16:5585133 - movt poly, #:upper16:5585133 - - // twisting - _3_layer_double_inv_twist_16 poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle, twiddle_ptr, poly, q, tmp, tmp2 - - vmov.w poly, s2 - - store poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 - str.w poly1, [poly, #distance/4] - str.w poly2, [poly, #2*distance/4] - str.w poly3, [poly, #3*distance/4] - str.w poly0, [poly], #strincr - - vmov tmp, s13 - cmp.w poly, tmp - bne.w 2b - - pop {r4-r11, pc} +small_invntt_asm_769: + push {r4-r11, r14} + vpush.w {s16-s23} + poly .req r0 + twiddle_ptr .req r1 + poly0 .req r2 + poly1 .req r3 + poly2 .req r4 + poly3 .req r5 + poly4 .req r6 + poly5 .req r7 + poly6 .req r8 + poly7 .req r9 + twiddle1 .req r10 + twiddle2 .req r11 + q .req r12 + // at the top of r12 + qa .req r0 + // qa=2^a q;a=3; at the bottom of r12 + tmp .req r14 + + movt q, #769 + + ### LAYER 7+6+5+4 + .equ distance, 16 + .equ offset, 32 + .equ strincr, 64 + + // pre-load twiddle factors to FPU registers + vldm twiddle_ptr!, {s8-s22} + + add.w tmp, poly, #8*strincr + vmov s8, tmp + 1: + vmov s23, poly + // load a1, a3, ..., a15 + load poly, poly0, poly1, poly2, poly3, #offset, #distance/4+offset, #2*distance/4+offset, #3*distance/4+offset + load poly, poly4, poly5, poly6, poly7, #distance+offset, #5*distance/4+offset, #6*distance/4+offset, #7*distance/4+offset + + movw qa, #24608 + + // NTT on a1, a3, ..., a15 + // twiddle2 is used as tmp2 + _3_layer_double_inv_CT_16_plant_light poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s10, s12, s13, s14, twiddle1, twiddle2, q, qa, tmp + + // multiply coeffs by layer 4 twiddles for later use + // vmov twiddle1, s15 + vmov twiddle2, s16 + // mul_twiddle_plant poly0, twiddle1, tmp, q, qa // could be omitted but kept for reduction only + mul_twiddle_plant poly1, twiddle2, tmp, q, qa + + vmov twiddle1, s17 + vmov twiddle2, s18 + mul_twiddle_plant poly2, twiddle1, tmp, q, qa + mul_twiddle_plant poly3, 
twiddle2, tmp, q, qa + + vmov twiddle1, s19 + vmov twiddle2, s20 + mul_twiddle_plant poly4, twiddle1, tmp, q, qa + mul_twiddle_plant poly5, twiddle2, tmp, q, qa + + vmov twiddle1, s21 + vmov twiddle2, s22 + mul_twiddle_plant poly6, twiddle1, tmp, q, qa + mul_twiddle_plant poly7, twiddle2, tmp, q, qa + + vmov s0, poly0 // a1 + vmov s1, poly1 // a3 + vmov s2, poly2 // a5 + vmov s3, poly3 // a7 + vmov s4, poly4 // a9 + vmov s5, poly5 // a11 + vmov s6, poly6 // a13 + vmov s7, poly7 // a15 + // 0.5q + // ---------- + + vmov poly, s23 + // load a0, a2, ..., a14 + load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4 + load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 + + movw qa, #24608 + // NTT on a0, a2, ..., a14 + // twiddle2 is used as tmp2 + _3_layer_double_inv_CT_16_plant_light poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s10, s12, s13, s14, twiddle1, twiddle2, q, qa, tmp + // 1,3,5,7: <5q; 0,2,4,6:<1q + // layer 4 - 1 + // addsub: (a2, a6, a10, a14), (a3, a7, a11, a15) + vmov poly, s23 + vmov twiddle2, s1 // load a3 + uadd16 tmp, poly1, twiddle2 + usub16 poly1, poly1, twiddle2 + str.w tmp, [poly, #1*distance/4] + str.w poly1, [poly, #1*distance/4+offset] + + vmov twiddle2, s3 // load a7 + uadd16 tmp, poly3, twiddle2 + usub16 poly3, poly3, twiddle2 + str.w tmp, [poly, #3*distance/4] + str.w poly3, [poly, #3*distance/4+offset] + + vmov twiddle2, s5 // load a11 + uadd16 tmp, poly5, twiddle2 + usub16 poly5, poly5, twiddle2 + str.w tmp, [poly, #5*distance/4] + str.w poly5, [poly, #5*distance/4+offset] + + vmov twiddle2, s7 // load a15 + uadd16 tmp, poly7, twiddle2 + usub16 poly7, poly7, twiddle2 + str.w tmp, [poly, #7*distance/4] + str.w poly7, [poly, #7*distance/4+offset] + //1,3,5,7: < 5.5q + + // layer 4 - 2 + // addsub: (a0, a4, a8, a12), (a1, a5, a9, a13) + vmov poly3, s2 // load a5 + uadd16 tmp, poly2, poly3 + usub16 twiddle2, poly2, poly3 + str.w tmp, [poly, #2*distance/4] + 
str.w twiddle2, [poly, #2*distance/4+offset] + + vmov poly5, s4 // load a9 + uadd16 tmp, poly4, poly5 + usub16 twiddle2, poly4, poly5 + str.w tmp, [poly, #4*distance/4] + str.w twiddle2, [poly, #4*distance/4+offset] + + vmov poly7, s6 // load a13 + uadd16 tmp, poly6, poly7 + usub16 twiddle2, poly6, poly7 + str.w tmp, [poly, #6*distance/4] + str.w twiddle2, [poly, #6*distance/4+offset] + + vmov poly1, s0 // load a1 + uadd16 tmp, poly0, poly1 + usub16 twiddle2, poly0, poly1 + str.w twiddle2, [poly, #offset] + str.w tmp, [poly], #strincr // increase 2*8*4 = 64 (2 * 8 loads of 4 bytes each) + //0,2,4,6: < 1.5q + vmov tmp, s8 + cmp.w poly, tmp + bne.w 1b + + sub.w poly, #8*strincr + + ### LAYER 3+2+1 + + .equ distance, distance*16 + .equ strincr, 4 + + // ITER 0 + vmov s6, poly + load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4 + load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 + + vldm twiddle_ptr!, {s0-s5} + movw qa, #24608 + // twiddle2 is used as tmp2 + _3_layer_double_inv_CT_16_plant_light poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s1, s3, s4, s5, twiddle1, twiddle2, q, qa, tmp + + // twisting + _3_layer_double_inv_twist_16_plant poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle1, twiddle2, twiddle_ptr, q, qa, tmp + + vmov poly, s6 + store poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 + str.w poly1, [poly, #distance/4] + str.w poly2, [poly, #2*distance/4] + str.w poly3, [poly, #3*distance/4] + str.w poly0, [poly], #4 + + // ITER 1-15 + add.w tmp, poly, #strincr*3*(5) + vmov s14, tmp + 2: + vmov s6, poly + // polys upto 5.5q + load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4 + load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 + + movw qa, #24608 + _3_layer_double_inv_CT_16_plant poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, 
twiddle1, twiddle2, twiddle_ptr, q, qa, tmp + + // twisting + _3_layer_double_inv_twist_16_plant poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle1, twiddle2, twiddle_ptr, q, qa, tmp + + vmov poly, s6 + store poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 + str.w poly1, [poly, #distance/4] + str.w poly2, [poly, #2*distance/4] + str.w poly3, [poly, #3*distance/4] + str.w poly0, [poly], #4 + + vmov tmp, s14 + cmp.w poly, tmp + bne.w 2b + + vpop.w {s16-s23} + pop {r4-r11, pc} .unreq poly .unreq twiddle_ptr @@ -734,75 +567,111 @@ small_invntt_tomont_asm: .unreq poly5 .unreq poly6 .unreq poly7 -.unreq twiddle -.unreq qinv +.unreq twiddle1 +.unreq twiddle2 .unreq q +.unreq qa .unreq tmp -.unreq tmp2 // BASEMUL +/* +* Basemul code (adapted to q=769) from: +* Huang, J. et al. 2022. Improved Plantard Arithmetic for Lattice-based Cryptography. +* IACR Transactions on Cryptographic Hardware and Embedded Systems. 2022, 4 (Aug. 2022), 614–636. +* DOI:https://doi.org/10.46586/tches.v2022.i4.614-636. 
+* https://github.com/UIC-ESLAS/ImprovedPlantardArithmetic/blob/f3482cfd09dda8f1f55b95e13616147e3b6dd008/crypto_kem/kyber768/m4fstack/fastbasemul.S +*/ -.global small_basemul_asm -.type small_basemul_asm, %function +.global small_basemul_asm_769 +.type small_basemul_asm_769, %function .align 2 -small_basemul_asm: - push {r4-r11, lr} - - rptr .req r0 - aptr .req r1 - bptr .req r2 - zeta_ptr .req r3 - poly0 .req r4 - poly1 .req r6 - poly2 .req r5 - poly3 .req r7 // TODO: remove poly3 - q .req r8 - qinv .req r8 - tmp .req r9 - tmp2 .req r10 - tmp3 .req r11 - zeta .req r12 - ctr .req r14 - - movw q, #769 - movt qinv, #767 - add ctr, rptr, #64*2*4 - 1: - - ldr poly2, [aptr, #4] - ldr poly3, [bptr, #4] - ldrh.w zeta, [zeta_ptr], #2 - ldr poly0, [aptr], #8 - ldr poly1, [bptr], #8 - - //basemul(r->coeffs + 4 * i, a->coeffs + 4 * i, b->coeffs + 4 * i, zetas[64 + i]); - smultt tmp, poly0, poly1 - montgomery q, qinv, tmp, tmp2 - smultb tmp2, tmp2, zeta - smlabb tmp2, poly0, poly1, tmp2 - montgomery q, qinv, tmp2, tmp - - smuadx tmp2, poly0, poly1 - montgomery q, qinv, tmp2, tmp3 - pkhtb tmp, tmp3, tmp, asr#16 - str tmp, [rptr], #4 - - neg zeta, zeta - - //basemul(r->coeffs + 4 * i + 2, a->coeffs + 4 * i + 2, b->coeffs + 4 * i + 2, - zetas[64 + i]); - smultt tmp, poly2, poly3 - montgomery q, qinv, tmp, tmp2 - smultb tmp2, tmp2, zeta - smlabb tmp2, poly2, poly3, tmp2 - montgomery q, qinv, tmp2, tmp - - smuadx tmp2, poly2, poly3 - montgomery q, qinv, tmp2, tmp3 - pkhtb tmp, tmp3, tmp, asr#16 - - str tmp, [rptr], #4 - cmp.w rptr, ctr - bne.w 1b - - pop {r4-r11, pc} \ No newline at end of file +small_basemul_asm_769: + push {r4-r11, lr} + + rptr .req r0 + aptr .req r1 + bptr .req r2 + zetaptr .req r3 + poly0 .req r4 + poly1 .req r6 + poly2 .req r5 + poly3 .req r7 + q .req r8 + qa .req r14 + qinv .req r9 + tmp .req r10 + tmp2 .req r11 + zeta .req r12 + loop .req r14 + + movt q, #769 + movw qinv, #64769 + movt qinv, #58632 + + movw loop, #64 + 1: + vmov.w s0,loop + movw qa, #24608 + 
+ ldrd poly0, poly2, [aptr], #8 + ldrd poly1, poly3, [bptr], #8 + // ldr poly0, [aptr], #4 + // ldr poly1, [bptr], #4 + // ldr poly2, [aptr], #4 + // ldr poly3, [bptr], #4 + + ldr.w zeta, [zetaptr], #4 + + // basemul(r->coeffs + 4 * i, a->coeffs + 4 * i, b->coeffs + 4 * i, zetas[64 + i]); + smulwt tmp, zeta, poly1 + smlabt tmp, tmp, q, qa + smultt tmp, poly0, tmp + smlabb tmp, poly0, poly1, tmp + plant_red q, qa, qinv, tmp + // r[0] in upper half of tmp + + smuadx tmp2, poly0, poly1 + plant_red q, qa, qinv, tmp2 + // r[1] in upper half of tmp2 + pkhtb tmp, tmp2, tmp, asr#16 + str tmp, [rptr], #4 + + neg zeta, zeta + + // basemul(r->coeffs + 4 * i + 2, a->coeffs + 4 * i + 2, b->coeffs + 4 * i + 2, - zetas[64 + i]); + smulwt tmp, zeta, poly3 + smlabt tmp, tmp, q, qa + smultt tmp, poly2, tmp + smlabb tmp, poly2, poly3, tmp + plant_red q, qa, qinv, tmp + // r[0] in upper half of tmp + + smuadx tmp2, poly2, poly3 + plant_red q, qa, qinv, tmp2 + // r[1] in upper half of tmp2 + pkhtb tmp, tmp2, tmp, asr#16 + str tmp, [rptr], #4 + + vmov.w loop,s0 + subs.w loop, #1 + bne.w 1b + + .unreq rptr + .unreq aptr + .unreq bptr + .unreq zetaptr + .unreq poly0 + .unreq poly1 + .unreq poly2 + .unreq poly3 + .unreq q + .unreq qa + .unreq qinv + .unreq tmp + .unreq tmp2 + .unreq zeta + .unreq loop + + pop {r4-r11, pc} +//-0.5p~0.5p \ No newline at end of file diff --git a/crypto_sign/dilithium3/m4fstack/smallntt.h b/crypto_sign/dilithium3/m4fstack/smallntt.h index 048d5df5..c3fd065f 100644 --- a/crypto_sign/dilithium3/m4fstack/smallntt.h +++ b/crypto_sign/dilithium3/m4fstack/smallntt.h @@ -4,48 +4,27 @@ #include #include "params.h" -static const int16_t zetas[64] = { --23, 112, -151, -134, -52, -148, 227, 232, --71, 212, 236, 21, 341, 379, -202, -220, -352, 292, 238, 145, 194, -276, 70, -274, -117, 333, 66, 247, -237, -83, -252, -244, -331, -241, 167, 357, -355, 291, -358, 105, -115, -209, 14, 99, -260, 29, 366, -378, -318, 278, 353, 354, -184, 127, 330, -303, 222, -78, -348, -44, 201, 
158, 350, 168 -}; - -static const int16_t zetas_asm[128] = { -0, -164, -81, 361, 186, -3, -250, -120, -308, 129, -16, -223, -362, -143, 131, -337, --76, 147, -114, -23, 112, -151, -134, --98, -272, 54, -52, -148, 227, 232, -36, -2, -124, -71, 212, 236, 21, --75, -80, -346, 341, 379, -202, -220, --339, 86, -51, 352, 292, 238, 145, --255, 364, 267, 194, -276, 70, -274, -282, 161, -15, 117, 333, 66, 247, --203, 288, 169, -237, -83, -252, -244, --34, 191, 307, 331, -241, 167, 357, -199, -50, -24, -355, 291, -358, 105, -178, -170, 226, -115, -209, 14, 99, -270, 121, -188, -260, 29, 366, -378, --10, -380, 279, -318, 278, 353, 354, -149, 180, -375, -184, 127, 330, -303, -369, -157, 263, 222, -78, -348, -44, --192, -128, -246, 201, 158, 350, 168 -}; - -static const int16_t zetas_inv_CT_asm[256] = { -0, 171, 171, 164, 171, -361, 164, 81, 171, 120, -361, 3, 164, 250, 81, -186, -171, 164, 171, -361, 164, 81, -257, 49, -141, -18, -215, 38, 283, 347, 337, 192, -369, 246, -263, 128, 157, 239, -264, 179, 301, -207, 219, -332, -206, 120, 337, -131, 192, -149, -369, 10, 62, 57, 40, 136, 1, 311, -173, 27, 223, 203, -282, -169, 15, -288, -161, 74, -56, 271, -309, 26, -373, 116, -67, -361, 120, 250, 337, 143, -131, 362, -383, 82, 125, -344, -93, 299, -60, -204, 143, -270, -178, 188, -226, -121, 170, 39, -175, 174, 284, -111, 84, -22, 79, 3, 223, 16, 203, 255, -282, 339, 245, 64, -90, -306, 190, -123, 197, -253, -129, 75, -36, 346, 124, 80, 2, 218, 126, -33, -266, 326, -122, -261, 343, 164, -361, 81, 120, 3, 250, -186, 285, 200, -89, 5, 17, -96, 135, -310, -131, -149, 10, 375, -279, -180, 380, -280, -183, -7, 130, -327, -189, -335, -370, 250, 143, 362, -270, -199, -178, 34, -359, -144, -182, 304, -43, -300, -251, 377, 16, 255, 339, -267, 51, -364, -86, -106, 101, -118, 214, -349, -110, -374, -195, 81, 3, -186, 223, -129, 16, 308, 320, 319, 8, 181, 154, 216, 273, 313, 362, -199, 34, 24, -307, 50, -191, -139, -165, 208, 92, 159, 233, 177, -321, -186, -129, 308, 75, 98, -36, 76, 231, 324, 
25, 85, 289, -94, -12, 113, 308, 98, 76, -54, 114, 272, -147, -146, -35, -119, -97, -176, -137, -312, -138, -}; +static const int32_t zetas_769[64] = { + 3138844760, 1334846793, 999738812, 1854264165, 1681125041, 1150537404, 2820492178, 3071823164, 726067294, 2066499220, 3272887953, 1055590142, 4255871365, 1871019564, 2731130050, 1826338500, 513832239, 1792827701, 3373420347, 2993631302, 1161707670, 3306398751, 3518633806, 3406931146, 1586177780, 3853741788, 3317569017, 3825816122, 971813147, 122872927, 217820188, 619949766, 3753209393, 770748358, 4099487641, 765163225, 3630336467, 1742561504, 3479537875, 982983413, 2809321912, 2379266669, 703726762, 681386230, 4110657907, 1457719720, 1217559000, 2474213930, 1195218468, 1089100940, 564098436, 614364633, 3635921600, 2088839752, 3702943196, 1949211426, 2569161192, 374203913, 3982199847, 2083254619, 1513571050, 3647091866, 413299844, 4149753838}; + +static const int32_t zetas_asm_769[128] = { + 346278248, 223405321, 966228013, 759578091, -150798592, 318352582, -1736976371, 1697880440, -2105595150, -804259156, 1675539907, -1016494210, 1401868389, -2005062756, 240160720, 474736307, -1200803600, -1435379187, -1156122536, 1334846793, 999738811, 1854264164, -631120032, -787503756, -1580592646, 1681125040, 1150537403, -1474475119, -1223144132, 1809583100, -100532394, -1938041160, 726067293, 2066499219, -1022079344, 1055590142, 525002504, 273671518, -212235055, -39095931, 1871019563, -1563837247, 1826338499, 139628326, 27925665, 1731391238, 513832238, 1792827701, -921546949, -1301335995, 67021596, 1117026605, 536172770, 1161707669, -988568545, -776333490, -888036151, 1290165729, -497076839, -753992958, 1586177779, -441225509, -977398279, -469151174, -1614103444, 1591762912, -94947261, 971813146, 122872927, 217820188, 619949766, -1709050706, 1010909077, -1748146637, -541757903, 770748357, -195479656, 765163224, 1413038655, 1781657435, -1206388733, -664630830, 1742561504, -815429422, 982983412, 357448514, 44681064, 
-1524741316, -1485645385, -1915700627, 703726761, 681386229, 686971362, 1787242568, -860110486, -184309390, 1457719719, 1217558999, -1820753366, -502661972, -1921285760, 1139367137, 1195218467, 1089100940, 564098435, 614364633, -1100271206, 457980908, -1669954774, -659045697, 2088839751, -592024101, 1949211426, 1368357591, 698141628, 335107981, -1725806105, 374203913, -312767449, 2083254618, -1061175275, -2139105948, 519417371, 1513571050, -647875431, 413299844, -145213459, 0}; + +// INTT with CT butterfly +static const int32_t zetas_inv_asm_769[256] = { + 5585134, 5585134, -346278248, 5585134, -966228013, -346278248, -223405321, 5585134, 1736976371, -966228013, 150798592, -346278248, -318352582, -223405321, -759578091, + // removed first "2285" + LAYER 3+2+1 - 1 - butterfly + 5585134, -346278248, 5585134, -966228013, -346278248, -223405321, 636705165, 446810642, 1519156183, 11170266, -821014555, -1932456027, 301597183, -692556495, -240160720, 1061175275, -1368357591, -519417371, -335107981, 2139105948, -698141628, -625534899, -1267825197, 843355087, 290426917, 128458060, 1295750862, -748407825, -826599688, 1736976371, -240160720, 2005062756, 1061175275, 1100271206, -1368357591, 502661972, 915961816, 1396283256, 452395775, -1038834743, -955057747, -670215963, 2016233022, -16755399, -1675539907, 1614103444, -1290165729, 94947261, 753992958, -1591762912, 497076839, -1954796559, 1943626293, -1122611738, -1239899531, 938302348, -245745853, 882451018, -435640376, -966228013, 1736976371, -318352582, -240160720, -1401868389, 2005062756, 1016494210, 714897027, -1005323944, 876865885, 2122350549, -1373942724, -2094424884, 1468889985, 1558252114, -1401868389, -686971362, -357448514, 860110486, 1524741316, -1787242568, -44681064, 1407453522, -368618780, 1323676527, -653460564, -1362772458, 1379527857, -463566041, 1859849297, 150798592, -1675539907, 804259156, 1614103444, -67021596, -1290165729, -139628326, -2060914086, -994153678, 55851330, 189894523, -1072345541, 1507985917, 
832184821, 1111441472, 2105595150, -525002504, -1809583100, 212235055, 1938041160, -273671518, 100532394, -2044158687, -78191862, 1452134586, 642290298, -2111180283, 552928169, 161968858, -1167292802, -346278248, -966228013, -223405321, 1736976371, 150798592, -318352582, -759578091, -1608518311, -2032988421, -899206417, -480321440, 943887481, 1491230518, -83776995, -284841784, 2005062756, 1100271206, 502661972, 1669954774, -1139367137, -457980908, 1921285760, 1128196871, -1318091394, -1904530361, 396544445, -1228729265, 117287794, 2116765416, 1184048201, -318352582, -1401868389, 1016494210, -686971362, -1413038655, -357448514, 1709050706, -731652426, 89362128, 2021818155, 1720220972, -1882189829, -1245484665, -798674023, 720482160, 804259156, -67021596, -139628326, -536172770, -1731391238, -1117026605, -27925665, -1843093898, -1971551958, 1027664477, 1776072302, -1692295306, 1977137091, 709311894, 1552666981, -223405321, 150798592, -759578091, -1675539907, 2105595150, 804259156, -1697880440, -675801096, 279256651, 949472614, -1066760408, -1050005009, -134043193, 1262240064, 1714635839, 1016494210, -1413038655, 1709050706, 1206388733, 1748146637, -1781657435, -1010909077, -390959312, -1329261660, -1083515807, -1965966825, -1530326449, 809844289, -1541496715, 1630858843, -759578091, 2105595150, -1697880440, -525002504, 631120032, -1809583100, -474736307, -1575007513, -201064789, 1893360095, 424470110, -1133782004, -418884977, -1424208921, -547343036, -1697880440, 631120032, -474736307, 1580592646, 1435379187, 787503756, 1200803600, 1999477623, -932717215, 1982722224, -1848679031, 586438968, 1993892490, 1625273710, -1346017059, 0}; #define SMALL_Q 769 -void small_ntt_asm(int16_t a[N], const int16_t * zetas); -void small_invntt_tomont_asm(int16_t a[N], const int16_t * zetas); -void small_basemul_asm(int16_t *c, const int16_t *a, const int16_t *b, const int16_t *zetas); +void small_ntt_asm_769(int16_t a[N], const int32_t * zetas); +void small_invntt_asm_769(int16_t 
a[N], const int32_t * zetas); +void small_basemul_asm_769(int16_t *c, const int16_t *a, const int16_t *b, const int32_t *zetas); -#define small_ntt(a) small_ntt_asm(a, zetas_asm) -#define small_invntt_tomont(a) small_invntt_tomont_asm(a, zetas_inv_CT_asm) -#define small_basemul(r,a,b) small_basemul_asm(r, a, b, zetas) +#define small_ntt(a) small_ntt_asm_769(a, zetas_asm_769) +#define small_invntt_tomont(a) small_invntt_asm_769(a, zetas_inv_asm_769) +#define small_basemul(r,a,b) small_basemul_asm_769(r, a, b, zetas_769) #endif diff --git a/crypto_sign/dilithium5/m4fstack/macros_smallntt.i b/crypto_sign/dilithium5/m4fstack/macros_smallntt.i new file mode 120000 index 00000000..fc731f12 --- /dev/null +++ b/crypto_sign/dilithium5/m4fstack/macros_smallntt.i @@ -0,0 +1 @@ +../../dilithium3/m4fstack/macros_smallntt.i \ No newline at end of file From 0dd789b5fe2138f40ff741bf1641bc3c683e7090 Mon Sep 17 00:00:00 2001 From: Amin Abdulrahman Date: Wed, 20 Mar 2024 16:07:22 +0100 Subject: [PATCH 21/32] First batch of stack opt for Verify * On-the-fly matrix generation * Schoolbook for ct1 * Challenge compression --- crypto_sign/dilithium3/m4fstack/sign.c | 59 +++++++++++++++---------- crypto_sign/dilithium3/m4fstack/stack.c | 52 ++++++++++++++++++++++ crypto_sign/dilithium3/m4fstack/stack.h | 1 + 3 files changed, 89 insertions(+), 23 deletions(-) diff --git a/crypto_sign/dilithium3/m4fstack/sign.c b/crypto_sign/dilithium3/m4fstack/sign.c index ab1426ce..2876a9a2 100644 --- a/crypto_sign/dilithium3/m4fstack/sign.c +++ b/crypto_sign/dilithium3/m4fstack/sign.c @@ -297,16 +297,19 @@ int crypto_sign_verify(const uint8_t *sig, const uint8_t *pk) { unsigned int i; - uint8_t buf[K*POLYW1_PACKEDBYTES]; + uint8_t w1_packed[POLYW1_PACKEDBYTES]; uint8_t rho[SEEDBYTES]; uint8_t mu[CRHBYTES]; uint8_t c[CTILDEBYTES]; uint8_t c2[CTILDEBYTES]; - poly cp; - polyvecl mat[K], z; - polyveck t1, w1, h; + polyvecl z; + polyveck h, t1; + poly w1, cp, tmp0; shake256incctx state; + uint8_t wcomp[768]; + 
uint8_t ccomp[68]; + if(siglen != CRYPTO_BYTES) return -1; @@ -325,30 +328,40 @@ int crypto_sign_verify(const uint8_t *sig, shake256_inc_squeeze(mu, CRHBYTES, &state); /* Matrix-vector multiplication; compute Az - c2^dt1 */ - poly_challenge(&cp, c); - polyvec_matrix_expand(mat, rho); - - polyvecl_ntt(&z); - polyvec_matrix_pointwise_montgomery(&w1, mat, &z); - + poly_challenge(&cp, sig); + poly_challenge_compress(ccomp, &cp); poly_ntt(&cp); - polyveck_shiftl(&t1); - polyveck_ntt(&t1); - polyveck_pointwise_poly_montgomery(&t1, &cp, &t1); - - polyveck_sub(&w1, &w1, &t1); - polyveck_reduce(&w1); - polyveck_invntt_tomont(&w1); - /* Reconstruct w1 */ - polyveck_caddq(&w1); - polyveck_use_hint(&w1, &w1, &h); - polyveck_pack_w1(buf, &w1); + polyvecl_ntt(&z); - /* Call random oracle and verify challenge */ shake256_inc_init(&state); shake256_inc_absorb(&state, mu, CRHBYTES); - shake256_inc_absorb(&state, buf, K*POLYW1_PACKEDBYTES); + + for (size_t k_idx = 0; k_idx < K; k_idx++) { + poly_uniform(&tmp0, rho, (k_idx << 8) + 0); + poly_pointwise_montgomery(&w1, &tmp0, &z.vec[0]); + for (size_t l_idx = 1; l_idx < L; l_idx++) { + poly_uniform(&tmp0, rho, (k_idx << 8) + l_idx); + poly_pointwise_acc_montgomery(&w1, &tmp0, &z.vec[l_idx]); + } + + poly_reduce(&w1); + poly_invntt_tomont(&w1); + + poly_schoolbook_t1(&tmp0, ccomp, pk + SEEDBYTES + k_idx*POLYT1_PACKEDBYTES); + + // TODO invNTT before sub because of schoolbook + poly_sub(&w1, &w1, &tmp0); + poly_reduce(&w1); + + /* Reconstruct w1 */ + poly_caddq(&w1); + poly_use_hint(&w1, &w1, &h.vec[k_idx]); + polyw1_pack(w1_packed, &w1); + + shake256_inc_absorb(&state, w1_packed, POLYW1_PACKEDBYTES); + } + /* Call random oracle and verify challenge */ shake256_inc_finalize(&state); shake256_inc_squeeze(c2, CTILDEBYTES, &state); for(i = 0; i < CTILDEBYTES; ++i) diff --git a/crypto_sign/dilithium3/m4fstack/stack.c b/crypto_sign/dilithium3/m4fstack/stack.c index d3256c8b..d7469d93 100644 --- a/crypto_sign/dilithium3/m4fstack/stack.c +++ 
b/crypto_sign/dilithium3/m4fstack/stack.c @@ -88,6 +88,28 @@ static inline int32_t polyt0_unpack_idx(const uint8_t *t0, unsigned idx){ return (1 << (D-1)) - coeff; } +static inline int32_t polyt1_unpack_idx(const uint8_t *t1, unsigned idx){ + int32_t coeff; + // 4 coefficients are packed in 5 bytes + t1 += 5*(idx >> 2); + + if(idx % 4 == 0){ + coeff = (t1[0] >> 0); + coeff |= ((uint32_t)t1[1] << 8); + } else if(idx % 4 == 1){ + coeff = (t1[1] >> 2); + coeff |= ((uint32_t)t1[2] << 6); + } else if(idx % 4 == 2){ + coeff = (t1[2] >> 4); + coeff |= ((uint32_t)t1[3] << 4); + } else if(idx % 4 == 3){ + coeff = (t1[3] >> 6); + coeff |= ((uint32_t)t1[4] << 2); + } + coeff &= 0x3FF; + return coeff; +} + void poly_schoolbook(poly *c, const uint8_t ccomp[68], const uint8_t *t0){ unsigned i,j,idx; uint64_t signs = 0; @@ -118,6 +140,36 @@ void poly_schoolbook(poly *c, const uint8_t ccomp[68], const uint8_t *t0){ } } +void poly_schoolbook_t1(poly *c, const uint8_t ccomp[68], const uint8_t *t1){ + unsigned i,j,idx; + uint64_t signs = 0; + for(i = 0; i < N; i++) c->coeffs[i] = 0; + for(i = 0; i < 8; i++) { + signs |= ((uint64_t)ccomp[60+i]) << (8*i); + } + + for(idx = 0; idx < TAU; idx++){ + i = ccomp[idx]; + if(!(signs & 1)){ + for(j = 0; i+j < N; j++){ + c->coeffs[i+j] += (polyt1_unpack_idx(t1, j) << D); + } + for(j = N-i; j<N; j++){ + c->coeffs[i+j-N] -= (polyt1_unpack_idx(t1, j) << D); + } + } else { + for(j = 0; i+j < N; j++){ + c->coeffs[i+j] -= (polyt1_unpack_idx(t1, j) << D); + } + for(j = N-i; j<N; j++){ + c->coeffs[i+j-N] += (polyt1_unpack_idx(t1, j) << D); + } + } + + signs >>= 1; + } +} + void polyw_pack(uint8_t buf[3*256], poly *w){ poly_reduce(w); diff --git a/crypto_sign/dilithium3/m4fstack/stack.h b/crypto_sign/dilithium3/m4fstack/stack.h index c21714c7..37c659bc 100644 --- a/crypto_sign/dilithium3/m4fstack/stack.h +++ b/crypto_sign/dilithium3/m4fstack/stack.h @@ -12,6 +12,7 @@ void poly_challenge_decompress(poly *cp, const uint8_t c[68]); void poly_schoolbook(poly *c, const uint8_t ccomp[68], 
const uint8_t *t0); +void poly_schoolbook_t1(poly *c, const uint8_t ccomp[68], const uint8_t *t1); void polyw_pack(uint8_t buf[3*256], poly *w); void polyw_unpack(poly *w, const uint8_t buf[3*256]); From a8c993fc8f7038de5fd757c505eb43eb6e10d010 Mon Sep 17 00:00:00 2001 From: Amin Abdulrahman Date: Wed, 20 Mar 2024 16:38:20 +0100 Subject: [PATCH 22/32] On-the-fly unpacking for z, h --- crypto_sign/dilithium3/m4fstack/sign.c | 40 +++++++++++--------- crypto_sign/dilithium3/m4fstack/stack.c | 49 +++++++++++++++++++++++++ crypto_sign/dilithium3/m4fstack/stack.h | 2 +- 3 files changed, 72 insertions(+), 19 deletions(-) diff --git a/crypto_sign/dilithium3/m4fstack/sign.c b/crypto_sign/dilithium3/m4fstack/sign.c index 2876a9a2..e81d0f44 100644 --- a/crypto_sign/dilithium3/m4fstack/sign.c +++ b/crypto_sign/dilithium3/m4fstack/sign.c @@ -302,9 +302,7 @@ int crypto_sign_verify(const uint8_t *sig, uint8_t mu[CRHBYTES]; uint8_t c[CTILDEBYTES]; uint8_t c2[CTILDEBYTES]; - polyvecl z; - polyveck h, t1; - poly w1, cp, tmp0; + poly w1, tmp0, tmp1; shake256incctx state; uint8_t wcomp[768]; @@ -313,11 +311,8 @@ int crypto_sign_verify(const uint8_t *sig, if(siglen != CRYPTO_BYTES) return -1; - unpack_pk(rho, &t1, pk); - if(unpack_sig(c, &z, &h, sig)) - return -1; - if(polyvecl_chknorm(&z, GAMMA1 - BETA)) - return -1; + for(i = 0; i < SEEDBYTES; ++i) + rho[i] = pk[i]; /* Compute CRH(h(rho, t1), msg) */ shake256(mu, TRBYTES, pk, CRYPTO_PUBLICKEYBYTES); @@ -328,21 +323,27 @@ int crypto_sign_verify(const uint8_t *sig, shake256_inc_squeeze(mu, CRHBYTES, &state); /* Matrix-vector multiplication; compute Az - c2^dt1 */ - poly_challenge(&cp, sig); - poly_challenge_compress(ccomp, &cp); - poly_ntt(&cp); - - polyvecl_ntt(&z); + poly_challenge(&tmp0, sig); + poly_challenge_compress(ccomp, &tmp0); shake256_inc_init(&state); shake256_inc_absorb(&state, mu, CRHBYTES); for (size_t k_idx = 0; k_idx < K; k_idx++) { + polyz_unpack(&tmp1, sig + CTILDEBYTES); + if(poly_chknorm(&tmp1, GAMMA1 - BETA)) + 
return -1; + poly_ntt(&tmp1); + poly_uniform(&tmp0, rho, (k_idx << 8) + 0); - poly_pointwise_montgomery(&w1, &tmp0, &z.vec[0]); + poly_pointwise_montgomery(&w1, &tmp0, &tmp1); for (size_t l_idx = 1; l_idx < L; l_idx++) { + polyz_unpack(&tmp1, sig + CTILDEBYTES + l_idx*POLYZ_PACKEDBYTES); + if(poly_chknorm(&tmp1, GAMMA1 - BETA)) + return -1; + poly_ntt(&tmp1); poly_uniform(&tmp0, rho, (k_idx << 8) + l_idx); - poly_pointwise_acc_montgomery(&w1, &tmp0, &z.vec[l_idx]); + poly_pointwise_acc_montgomery(&w1, &tmp0, &tmp1); } poly_reduce(&w1); @@ -350,13 +351,16 @@ int crypto_sign_verify(const uint8_t *sig, poly_schoolbook_t1(&tmp0, ccomp, pk + SEEDBYTES + k_idx*POLYT1_PACKEDBYTES); - // TODO invNTT before sub because of schoolbook poly_sub(&w1, &w1, &tmp0); poly_reduce(&w1); /* Reconstruct w1 */ poly_caddq(&w1); - poly_use_hint(&w1, &w1, &h.vec[k_idx]); + + if (unpack_sig_h(&tmp0, k_idx, sig) != 0) { + return -1; + }; + poly_use_hint(&w1, &w1, &tmp0); polyw1_pack(w1_packed, &w1); shake256_inc_absorb(&state, w1_packed, POLYW1_PACKEDBYTES); @@ -365,7 +369,7 @@ int crypto_sign_verify(const uint8_t *sig, shake256_inc_finalize(&state); shake256_inc_squeeze(c2, CTILDEBYTES, &state); for(i = 0; i < CTILDEBYTES; ++i) - if(c[i] != c2[i]) + if(sig[i] != c2[i]) return -1; return 0; diff --git a/crypto_sign/dilithium3/m4fstack/stack.c b/crypto_sign/dilithium3/m4fstack/stack.c index d7469d93..0c7d2b41 100644 --- a/crypto_sign/dilithium3/m4fstack/stack.c +++ b/crypto_sign/dilithium3/m4fstack/stack.c @@ -456,4 +456,53 @@ void unpack_sk_stack(uint8_t rho[SEEDBYTES], for(i = 0; i < TRBYTES; ++i) tr[i] = sk[i]; sk += TRBYTES; +} + +/************************************************* +* Name: unpack_sig_h +* +* Description: Unpack only h from signature sig = (c, z, h). +* +* Arguments: - polyveck *h: pointer to output hint vector h +* - const unsigned char sig[]: byte array containing +* bit-packed signature +* +* Returns 1 in case of malformed signature; otherwise 0. 
+**************************************************/ +int unpack_sig_h(poly *h, unsigned int idx, const unsigned char sig[CRYPTO_BYTES]) { + sig += L * POLYZ_PACKEDBYTES; + sig += CTILDEBYTES; + /* Decode h */ + unsigned int k = 0; + for (unsigned int i = 0; i < K; ++i) { + for (unsigned int j = 0; j < N; ++j) { + if (i == idx) { + h->coeffs[j] = 0; + } + } + + if (sig[OMEGA + i] < k || sig[OMEGA + i] > OMEGA) { + return 1; + } + + for (unsigned int j = k; j < sig[OMEGA + i]; ++j) { + /* Coefficients are ordered for strong unforgeability */ + if (j > k && sig[j] <= sig[j - 1]) { + return 1; + } + if (i == idx) { + h->coeffs[sig[j]] = 1; + } + } + + k = sig[OMEGA + i]; + } + + /* Extra indices are zero for strong unforgeability */ + for (unsigned int j = k; j < OMEGA; ++j) { + if (sig[j]) { + return 1; + } + } + return 0; } \ No newline at end of file diff --git a/crypto_sign/dilithium3/m4fstack/stack.h b/crypto_sign/dilithium3/m4fstack/stack.h index 37c659bc..2893b2b5 100644 --- a/crypto_sign/dilithium3/m4fstack/stack.h +++ b/crypto_sign/dilithium3/m4fstack/stack.h @@ -24,7 +24,7 @@ void poly_lowbits(poly *a0, const poly *a); void unpack_sk_s1(smallpoly *a, const uint8_t *sk, size_t idx); void unpack_sk_s2(smallpoly *a, const uint8_t *sk, size_t idx); - +unpack_sig_h(poly *h, unsigned int idx, const unsigned char sig[CRYPTO_BYTES]); void poly_uniform_pointwise_montgomery_polywadd_stack(uint8_t wcomp[3*N], poly *b, uint8_t seed[SEEDBYTES], uint16_t nonce, shake128incctx *state); void poly_uniform_gamma1_stack(poly *a, const uint8_t seed[CRHBYTES], uint16_t nonce, shake256incctx *state); From b7ded849ff5133bb33314cf7d565ce90d22ad7d0 Mon Sep 17 00:00:00 2001 From: Amin Abdulrahman Date: Wed, 20 Mar 2024 17:00:11 +0100 Subject: [PATCH 23/32] Compress w --- crypto_sign/dilithium3/m4fstack/sign.c | 37 +++++++++++++++----------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/crypto_sign/dilithium3/m4fstack/sign.c b/crypto_sign/dilithium3/m4fstack/sign.c 
index e81d0f44..9709f7fc 100644 --- a/crypto_sign/dilithium3/m4fstack/sign.c +++ b/crypto_sign/dilithium3/m4fstack/sign.c @@ -303,11 +303,13 @@ int crypto_sign_verify(const uint8_t *sig, uint8_t c[CTILDEBYTES]; uint8_t c2[CTILDEBYTES]; poly w1, tmp0, tmp1; - shake256incctx state; uint8_t wcomp[768]; uint8_t ccomp[68]; + shake128incctx s128; + shake256incctx s256; + if(siglen != CRYPTO_BYTES) return -1; @@ -316,36 +318,39 @@ int crypto_sign_verify(const uint8_t *sig, /* Compute CRH(h(rho, t1), msg) */ shake256(mu, TRBYTES, pk, CRYPTO_PUBLICKEYBYTES); - shake256_inc_init(&state); - shake256_inc_absorb(&state, mu, TRBYTES); - shake256_inc_absorb(&state, m, mlen); - shake256_inc_finalize(&state); - shake256_inc_squeeze(mu, CRHBYTES, &state); + shake256_inc_init(&s256); + shake256_inc_absorb(&s256, mu, TRBYTES); + shake256_inc_absorb(&s256, m, mlen); + shake256_inc_finalize(&s256); + shake256_inc_squeeze(mu, CRHBYTES, &s256); /* Matrix-vector multiplication; compute Az - c2^dt1 */ poly_challenge(&tmp0, sig); poly_challenge_compress(ccomp, &tmp0); - shake256_inc_init(&state); - shake256_inc_absorb(&state, mu, CRHBYTES); + shake256_inc_init(&s256); + shake256_inc_absorb(&s256, mu, CRHBYTES); for (size_t k_idx = 0; k_idx < K; k_idx++) { + for(size_t i=0;i<768;i++){ + wcomp[i] = 0; + } + polyz_unpack(&tmp1, sig + CTILDEBYTES); if(poly_chknorm(&tmp1, GAMMA1 - BETA)) return -1; poly_ntt(&tmp1); - poly_uniform(&tmp0, rho, (k_idx << 8) + 0); - poly_pointwise_montgomery(&w1, &tmp0, &tmp1); + poly_uniform_pointwise_montgomery_polywadd_stack(wcomp, &tmp1, rho, (k_idx << 8) + 0, &s128); + for (size_t l_idx = 1; l_idx < L; l_idx++) { polyz_unpack(&tmp1, sig + CTILDEBYTES + l_idx*POLYZ_PACKEDBYTES); if(poly_chknorm(&tmp1, GAMMA1 - BETA)) return -1; poly_ntt(&tmp1); - poly_uniform(&tmp0, rho, (k_idx << 8) + l_idx); - poly_pointwise_acc_montgomery(&w1, &tmp0, &tmp1); + poly_uniform_pointwise_montgomery_polywadd_stack(wcomp, &tmp1, rho, (k_idx << 8) + l_idx, &s128); } - + 
polyw_unpack(&w1, wcomp); poly_reduce(&w1); poly_invntt_tomont(&w1); @@ -363,11 +368,11 @@ int crypto_sign_verify(const uint8_t *sig, poly_use_hint(&w1, &w1, &tmp0); polyw1_pack(w1_packed, &w1); - shake256_inc_absorb(&state, w1_packed, POLYW1_PACKEDBYTES); + shake256_inc_absorb(&s256, w1_packed, POLYW1_PACKEDBYTES); } /* Call random oracle and verify challenge */ - shake256_inc_finalize(&state); - shake256_inc_squeeze(c2, CTILDEBYTES, &state); + shake256_inc_finalize(&s256); + shake256_inc_squeeze(c2, CTILDEBYTES, &s256); for(i = 0; i < CTILDEBYTES; ++i) if(sig[i] != c2[i]) return -1; From e6e164bcedac6dd669fa0a7dc54e1430bf129349 Mon Sep 17 00:00:00 2001 From: Amin Abdulrahman Date: Wed, 20 Mar 2024 17:34:40 +0100 Subject: [PATCH 24/32] rm tmp poly, subtract on wcomp --- crypto_sign/dilithium3/m4fstack/sign.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/crypto_sign/dilithium3/m4fstack/sign.c b/crypto_sign/dilithium3/m4fstack/sign.c index 9709f7fc..cbc332cb 100644 --- a/crypto_sign/dilithium3/m4fstack/sign.c +++ b/crypto_sign/dilithium3/m4fstack/sign.c @@ -300,9 +300,8 @@ int crypto_sign_verify(const uint8_t *sig, uint8_t w1_packed[POLYW1_PACKEDBYTES]; uint8_t rho[SEEDBYTES]; uint8_t mu[CRHBYTES]; - uint8_t c[CTILDEBYTES]; uint8_t c2[CTILDEBYTES]; - poly w1, tmp0, tmp1; + poly w1, tmp0; uint8_t wcomp[768]; uint8_t ccomp[68]; @@ -336,27 +335,28 @@ int crypto_sign_verify(const uint8_t *sig, wcomp[i] = 0; } - polyz_unpack(&tmp1, sig + CTILDEBYTES); - if(poly_chknorm(&tmp1, GAMMA1 - BETA)) + polyz_unpack(&tmp0, sig + CTILDEBYTES); + if(poly_chknorm(&tmp0, GAMMA1 - BETA)) return -1; - poly_ntt(&tmp1); + poly_ntt(&tmp0); - poly_uniform_pointwise_montgomery_polywadd_stack(wcomp, &tmp1, rho, (k_idx << 8) + 0, &s128); + poly_uniform_pointwise_montgomery_polywadd_stack(wcomp, &tmp0, rho, (k_idx << 8) + 0, &s128); for (size_t l_idx = 1; l_idx < L; l_idx++) { - polyz_unpack(&tmp1, sig + CTILDEBYTES + l_idx*POLYZ_PACKEDBYTES); - 
if(poly_chknorm(&tmp1, GAMMA1 - BETA)) + polyz_unpack(&tmp0, sig + CTILDEBYTES + l_idx*POLYZ_PACKEDBYTES); + if(poly_chknorm(&tmp0, GAMMA1 - BETA)) return -1; - poly_ntt(&tmp1); - poly_uniform_pointwise_montgomery_polywadd_stack(wcomp, &tmp1, rho, (k_idx << 8) + l_idx, &s128); + poly_ntt(&tmp0); + poly_uniform_pointwise_montgomery_polywadd_stack(wcomp, &tmp0, rho, (k_idx << 8) + l_idx, &s128); } polyw_unpack(&w1, wcomp); poly_reduce(&w1); poly_invntt_tomont(&w1); + polyw_pack(wcomp, &w1); poly_schoolbook_t1(&tmp0, ccomp, pk + SEEDBYTES + k_idx*POLYT1_PACKEDBYTES); - poly_sub(&w1, &w1, &tmp0); + polyw_sub(&w1, wcomp, &tmp0); poly_reduce(&w1); /* Reconstruct w1 */ From 6ef4fbc30c4cc92f734c54a97bcbb8806a6ab254 Mon Sep 17 00:00:00 2001 From: Amin Abdulrahman Date: Sat, 30 Mar 2024 18:27:22 -0400 Subject: [PATCH 25/32] Verify Stack Optimizations * Stack friendly hint decoding * Eliminate second full poly * Remove K-loop from hint unpacking --- crypto_sign/dilithium3/m4fstack/sign.c | 60 +++++++----- crypto_sign/dilithium3/m4fstack/stack.c | 122 ++++++++++++++++++++---- crypto_sign/dilithium3/m4fstack/stack.h | 4 +- 3 files changed, 141 insertions(+), 45 deletions(-) diff --git a/crypto_sign/dilithium3/m4fstack/sign.c b/crypto_sign/dilithium3/m4fstack/sign.c index cbc332cb..c754b286 100644 --- a/crypto_sign/dilithium3/m4fstack/sign.c +++ b/crypto_sign/dilithium3/m4fstack/sign.c @@ -297,11 +297,13 @@ int crypto_sign_verify(const uint8_t *sig, const uint8_t *pk) { unsigned int i; + unsigned int number_of_hints; uint8_t w1_packed[POLYW1_PACKEDBYTES]; uint8_t rho[SEEDBYTES]; uint8_t mu[CRHBYTES]; uint8_t c2[CTILDEBYTES]; - poly w1, tmp0; + uint8_t hint_ones[OMEGA]; + poly p; uint8_t wcomp[768]; uint8_t ccomp[68]; @@ -316,7 +318,11 @@ int crypto_sign_verify(const uint8_t *sig, rho[i] = pk[i]; /* Compute CRH(h(rho, t1), msg) */ - shake256(mu, TRBYTES, pk, CRYPTO_PUBLICKEYBYTES); + shake256_inc_init(&s256); + shake256_inc_absorb(&s256, pk, CRYPTO_PUBLICKEYBYTES); + 
shake256_inc_finalize(&s256); + shake256_inc_squeeze(mu, CRHBYTES, &s256); + shake256_inc_init(&s256); shake256_inc_absorb(&s256, mu, TRBYTES); shake256_inc_absorb(&s256, m, mlen); @@ -324,49 +330,51 @@ int crypto_sign_verify(const uint8_t *sig, shake256_inc_squeeze(mu, CRHBYTES, &s256); /* Matrix-vector multiplication; compute Az - c2^dt1 */ - poly_challenge(&tmp0, sig); - poly_challenge_compress(ccomp, &tmp0); + poly_challenge(&p, sig); + poly_challenge_compress(ccomp, &p); shake256_inc_init(&s256); shake256_inc_absorb(&s256, mu, CRHBYTES); for (size_t k_idx = 0; k_idx < K; k_idx++) { - for(size_t i=0;i<768;i++){ - wcomp[i] = 0; + for(size_t widx=0;widx<768;widx++){ + wcomp[widx] = 0; } - polyz_unpack(&tmp0, sig + CTILDEBYTES); - if(poly_chknorm(&tmp0, GAMMA1 - BETA)) + polyz_unpack(&p, sig + CTILDEBYTES); + if(poly_chknorm(&p, GAMMA1 - BETA)) return -1; - poly_ntt(&tmp0); + poly_ntt(&p); - poly_uniform_pointwise_montgomery_polywadd_stack(wcomp, &tmp0, rho, (k_idx << 8) + 0, &s128); + poly_uniform_pointwise_montgomery_polywadd_stack(wcomp, &p, rho, (k_idx << 8) + 0, &s128); for (size_t l_idx = 1; l_idx < L; l_idx++) { - polyz_unpack(&tmp0, sig + CTILDEBYTES + l_idx*POLYZ_PACKEDBYTES); - if(poly_chknorm(&tmp0, GAMMA1 - BETA)) + polyz_unpack(&p, sig + CTILDEBYTES + l_idx*POLYZ_PACKEDBYTES); + if(poly_chknorm(&p, GAMMA1 - BETA)) return -1; - poly_ntt(&tmp0); - poly_uniform_pointwise_montgomery_polywadd_stack(wcomp, &tmp0, rho, (k_idx << 8) + l_idx, &s128); + poly_ntt(&p); + poly_uniform_pointwise_montgomery_polywadd_stack(wcomp, &p, rho, (k_idx << 8) + l_idx, &s128); } - polyw_unpack(&w1, wcomp); - poly_reduce(&w1); - poly_invntt_tomont(&w1); - polyw_pack(wcomp, &w1); + polyw_unpack(&p, wcomp); + poly_reduce(&p); + poly_invntt_tomont(&p); + polyw_pack(wcomp, &p); - poly_schoolbook_t1(&tmp0, ccomp, pk + SEEDBYTES + k_idx*POLYT1_PACKEDBYTES); + poly_schoolbook_t1(&p, ccomp, pk + SEEDBYTES + k_idx*POLYT1_PACKEDBYTES); - polyw_sub(&w1, wcomp, &tmp0); - poly_reduce(&w1); 
+ polyw_sub(&p, wcomp, &p); + poly_reduce(&p); /* Reconstruct w1 */ - poly_caddq(&w1); + poly_caddq(&p); - if (unpack_sig_h(&tmp0, k_idx, sig) != 0) { + if (unpack_sig_h_indices(&hint_ones, &number_of_hints, k_idx, sig) != 0) + { return -1; - }; - poly_use_hint(&w1, &w1, &tmp0); - polyw1_pack(w1_packed, &w1); + } + poly_use_hint_stack(&p, &p, &hint_ones, number_of_hints); + + polyw1_pack(w1_packed, &p); shake256_inc_absorb(&s256, w1_packed, POLYW1_PACKEDBYTES); } diff --git a/crypto_sign/dilithium3/m4fstack/stack.c b/crypto_sign/dilithium3/m4fstack/stack.c index 0c7d2b41..716eccf6 100644 --- a/crypto_sign/dilithium3/m4fstack/stack.c +++ b/crypto_sign/dilithium3/m4fstack/stack.c @@ -3,6 +3,7 @@ #include "symmetric.h" #include "vector.h" #include "reduce.h" +#include "rounding.h" void poly_challenge_compress(uint8_t c[68], const poly *cp){ unsigned int i, pos; @@ -406,7 +407,7 @@ void poly_uniform_gamma1_add_stack(poly *a, poly *b, const uint8_t seed[CRHBYTES } -static inline int32_t make_hint(int32_t z, int32_t r){ +static inline int32_t make_hint_stack(int32_t z, int32_t r){ int32_t r1, v1; r1 = highbits(r); @@ -429,7 +430,7 @@ size_t poly_make_hint_stack(poly *a, poly *t, uint8_t w[768]){ // compute w - cs2 + c*t0 coeff = coeff + t->coeffs[i]; - a->coeffs[i] = make_hint(-t->coeffs[i], coeff); + a->coeffs[i] = make_hint_stack(-t->coeffs[i], coeff); if(a->coeffs[i] == 1){ hints_n++; } @@ -458,6 +459,7 @@ void unpack_sk_stack(uint8_t rho[SEEDBYTES], sk += TRBYTES; } +/* TODO: remove this function */ /************************************************* * Name: unpack_sig_h * @@ -474,30 +476,78 @@ int unpack_sig_h(poly *h, unsigned int idx, const unsigned char sig[CRYPTO_BYTES sig += CTILDEBYTES; /* Decode h */ unsigned int k = 0; - for (unsigned int i = 0; i < K; ++i) { - for (unsigned int j = 0; j < N; ++j) { - if (i == idx) { - h->coeffs[j] = 0; - } - } - if (sig[OMEGA + i] < k || sig[OMEGA + i] > OMEGA) { + if (idx > 0) + { + k = sig[OMEGA + (idx - 1)]; + } + + for 
(unsigned int j = 0; j < N; ++j) { + h->coeffs[j] = 0; + } + + if (sig[OMEGA + idx] < k || sig[OMEGA + idx] > OMEGA) { + return 1; + } + + for (unsigned int j = k; j < sig[OMEGA + idx]; ++j) { + /* Coefficients are ordered for strong unforgeability */ + if (j > k && sig[j] <= sig[j - 1]) { return 1; } + h->coeffs[sig[j]] = 1; + } - for (unsigned int j = k; j < sig[OMEGA + i]; ++j) { - /* Coefficients are ordered for strong unforgeability */ - if (j > k && sig[j] <= sig[j - 1]) { - return 1; - } - if (i == idx) { - h->coeffs[sig[j]] = 1; - } + /* TODO: extract this check, redundant here */ + k = sig[OMEGA + (K - 1)]; + /* Extra indices are zero for strong unforgeability */ + for (unsigned int j = k; j < OMEGA; ++j) { + if (sig[j]) { + return 1; } + } + return 0; +} - k = sig[OMEGA + i]; +/************************************************* +* Name: unpack_sig_h_indices +* +* Description: Unpack only h from signature sig = (c, z, h). +* +* Arguments: - polyveck *h: pointer to output hint vector h +* - const unsigned char sig[]: byte array containing +* bit-packed signature +* +* Returns 1 in case of malformed signature; otherwise 0. 
+**************************************************/ +int unpack_sig_h_indices(uint8_t h_i[OMEGA], unsigned int * number_of_hints, unsigned int idx, const unsigned char sig[CRYPTO_BYTES]) { + sig += L * POLYZ_PACKEDBYTES; + sig += CTILDEBYTES; + /* Decode h */ + unsigned int k = 0; + unsigned int hidx = 0; + + if (idx > 0) + { + k = sig[OMEGA + (idx - 1)]; } + if (sig[OMEGA + idx] < k || sig[OMEGA + idx] > OMEGA) { + return 1; + } + + for (unsigned int j = k; j < sig[OMEGA + idx]; ++j) { + /* Coefficients are ordered for strong unforgeability */ + if (j > k && sig[j] <= sig[j - 1]) { + return 1; + } + h_i[hidx++] = sig[j]; + } + + *number_of_hints = hidx; + + /* TODO: extract this check, redundant here */ + k = sig[OMEGA + (K - 1)]; /* Extra indices are zero for strong unforgeability */ for (unsigned int j = k; j < OMEGA; ++j) { if (sig[j]) { @@ -505,4 +555,40 @@ int unpack_sig_h(poly *h, unsigned int idx, const unsigned char sig[CRYPTO_BYTES } } return 0; +} + +/************************************************* +* Name: poly_use_hint_stack +* +* Description: Use hint polynomial to correct the high bits of a polynomial. 
+* +* Arguments: - poly *b: pointer to output polynomial with corrected high bits +* - const poly *a: pointer to input polynomial +* - const poly *h: pointer to input hint polynomial +**************************************************/ +void poly_use_hint_stack(poly *b, const poly *a, uint8_t h_i[OMEGA], unsigned int number_of_hints) { + unsigned int i; + unsigned int in_list; + + for(i = 0; i < N; ++i) + { + in_list = 0; + for (size_t hidx = 0; hidx < number_of_hints; hidx++) + { + if (i == h_i[hidx]) + { + in_list = 1; + break; + } + } + if (in_list) + { + b->coeffs[i] = use_hint(a->coeffs[i], 1); + } + else + { + b->coeffs[i] = use_hint(a->coeffs[i], 0); + } + + } } \ No newline at end of file diff --git a/crypto_sign/dilithium3/m4fstack/stack.h b/crypto_sign/dilithium3/m4fstack/stack.h index 2893b2b5..e07d8716 100644 --- a/crypto_sign/dilithium3/m4fstack/stack.h +++ b/crypto_sign/dilithium3/m4fstack/stack.h @@ -24,13 +24,15 @@ void poly_lowbits(poly *a0, const poly *a); void unpack_sk_s1(smallpoly *a, const uint8_t *sk, size_t idx); void unpack_sk_s2(smallpoly *a, const uint8_t *sk, size_t idx); -unpack_sig_h(poly *h, unsigned int idx, const unsigned char sig[CRYPTO_BYTES]); +int unpack_sig_h(poly *h, unsigned int idx, const unsigned char sig[CRYPTO_BYTES]); void poly_uniform_pointwise_montgomery_polywadd_stack(uint8_t wcomp[3*N], poly *b, uint8_t seed[SEEDBYTES], uint16_t nonce, shake128incctx *state); void poly_uniform_gamma1_stack(poly *a, const uint8_t seed[CRHBYTES], uint16_t nonce, shake256incctx *state); void poly_uniform_gamma1_add_stack(poly *a, poly *b, const uint8_t seed[CRHBYTES], uint16_t nonce, shake256incctx *state); size_t poly_make_hint_stack(poly *a, poly *t, uint8_t w[768]); +int unpack_sig_h_indices(uint8_t h_i[OMEGA], unsigned int * number_of_hints, unsigned int idx, const unsigned char sig[CRYPTO_BYTES]); +void poly_use_hint_stack(poly *b, const poly *a, uint8_t h_i[OMEGA], unsigned int number_of_hints); // TODO: replace this with 
individual functions later void unpack_sk_stack(uint8_t rho[SEEDBYTES], From 9870bec37c0846fb5ec8c8608f225a62137fb081 Mon Sep 17 00:00:00 2001 From: Amin Abdulrahman Date: Sun, 31 Mar 2024 14:53:50 -0400 Subject: [PATCH 26/32] rm buffers/unionize in Verify --- crypto_sign/dilithium3/m4fstack/sign.c | 50 ++++++++++++++++---------- 1 file changed, 31 insertions(+), 19 deletions(-) diff --git a/crypto_sign/dilithium3/m4fstack/sign.c b/crypto_sign/dilithium3/m4fstack/sign.c index c754b286..a509bf76 100644 --- a/crypto_sign/dilithium3/m4fstack/sign.c +++ b/crypto_sign/dilithium3/m4fstack/sign.c @@ -297,26 +297,38 @@ int crypto_sign_verify(const uint8_t *sig, const uint8_t *pk) { unsigned int i; - unsigned int number_of_hints; - uint8_t w1_packed[POLYW1_PACKEDBYTES]; - uint8_t rho[SEEDBYTES]; - uint8_t mu[CRHBYTES]; - uint8_t c2[CTILDEBYTES]; - uint8_t hint_ones[OMEGA]; + poly p; - uint8_t wcomp[768]; - uint8_t ccomp[68]; + union { + uint8_t w1_packed[POLYW1_PACKEDBYTES]; + uint8_t wcomp[768]; + } w1_packed_comp; + uint8_t *w1_packed = &w1_packed_comp.w1_packed; + uint8_t *wcomp = &w1_packed_comp.wcomp; + + union { + uint8_t ccomp[68]; + uint8_t mu[CRHBYTES]; + } ccomp_mu; + uint8_t *ccomp = &ccomp_mu.ccomp; + uint8_t *mu = &ccomp_mu.mu; - shake128incctx s128; shake256incctx s256; + union { + uint8_t hint_ones[OMEGA]; + shake128incctx s128; + uint8_t c2[CTILDEBYTES]; + } shake_hint; + + uint8_t *hint_ones = &shake_hint.hint_ones; + shake128incctx *s128 = &shake_hint.s128; + uint8_t *c2 = &shake_hint.c2; + if(siglen != CRYPTO_BYTES) return -1; - for(i = 0; i < SEEDBYTES; ++i) - rho[i] = pk[i]; - /* Compute CRH(h(rho, t1), msg) */ shake256_inc_init(&s256); shake256_inc_absorb(&s256, pk, CRYPTO_PUBLICKEYBYTES); @@ -329,13 +341,13 @@ int crypto_sign_verify(const uint8_t *sig, shake256_inc_finalize(&s256); shake256_inc_squeeze(mu, CRHBYTES, &s256); + shake256_inc_init(&s256); + shake256_inc_absorb(&s256, mu, CRHBYTES); + /* Matrix-vector multiplication; compute Az - c2^dt1 */ 
poly_challenge(&p, sig); poly_challenge_compress(ccomp, &p); - shake256_inc_init(&s256); - shake256_inc_absorb(&s256, mu, CRHBYTES); - for (size_t k_idx = 0; k_idx < K; k_idx++) { for(size_t widx=0;widx<768;widx++){ wcomp[widx] = 0; @@ -346,14 +358,14 @@ int crypto_sign_verify(const uint8_t *sig, return -1; poly_ntt(&p); - poly_uniform_pointwise_montgomery_polywadd_stack(wcomp, &p, rho, (k_idx << 8) + 0, &s128); + poly_uniform_pointwise_montgomery_polywadd_stack(wcomp, &p, pk, (k_idx << 8) + 0, s128); for (size_t l_idx = 1; l_idx < L; l_idx++) { polyz_unpack(&p, sig + CTILDEBYTES + l_idx*POLYZ_PACKEDBYTES); if(poly_chknorm(&p, GAMMA1 - BETA)) return -1; poly_ntt(&p); - poly_uniform_pointwise_montgomery_polywadd_stack(wcomp, &p, rho, (k_idx << 8) + l_idx, &s128); + poly_uniform_pointwise_montgomery_polywadd_stack(wcomp, &p, pk, (k_idx << 8) + l_idx, s128); } polyw_unpack(&p, wcomp); poly_reduce(&p); @@ -368,11 +380,11 @@ int crypto_sign_verify(const uint8_t *sig, /* Reconstruct w1 */ poly_caddq(&p); - if (unpack_sig_h_indices(&hint_ones, &number_of_hints, k_idx, sig) != 0) + if (unpack_sig_h_indices(hint_ones, &i, k_idx, sig) != 0) { return -1; } - poly_use_hint_stack(&p, &p, &hint_ones, number_of_hints); + poly_use_hint_stack(&p, &p, hint_ones, i); polyw1_pack(w1_packed, &p); From 1d21996b1edea0a80f03baa1790ae6c16265a3f1 Mon Sep 17 00:00:00 2001 From: Amin Abdulrahman Date: Mon, 8 Apr 2024 15:42:28 +0200 Subject: [PATCH 27/32] Stack opt key pair * Minor clean up --- crypto_sign/dilithium3/m4fstack/sign.c | 68 ++++++--- crypto_sign/dilithium3/m4fstack/stack.c | 179 +++++++++++++++++------- crypto_sign/dilithium3/m4fstack/stack.h | 32 ++++- 3 files changed, 203 insertions(+), 76 deletions(-) diff --git a/crypto_sign/dilithium3/m4fstack/sign.c b/crypto_sign/dilithium3/m4fstack/sign.c index a509bf76..edb4eaa7 100644 --- a/crypto_sign/dilithium3/m4fstack/sign.c +++ b/crypto_sign/dilithium3/m4fstack/sign.c @@ -24,12 +24,12 @@ * Returns 0 (success) 
**************************************************/ int crypto_sign_keypair(uint8_t *pk, uint8_t *sk) { + unsigned int i, j; uint8_t seedbuf[2*SEEDBYTES + CRHBYTES]; uint8_t tr[TRBYTES]; const uint8_t *rho, *rhoprime, *key; - polyvecl mat[K]; - polyvecl s1, s1hat; - polyveck s2, t1, t0; + + poly tA, tB, tC; /* Get randomness for rho, rhoprime and key */ randombytes(seedbuf, SEEDBYTES); @@ -38,31 +38,57 @@ int crypto_sign_keypair(uint8_t *pk, uint8_t *sk) { rhoprime = rho + SEEDBYTES; key = rhoprime + CRHBYTES; - /* Expand matrix */ - polyvec_matrix_expand(mat, rho); - - /* Sample short vectors s1 and s2 */ - polyvecl_uniform_eta(&s1, rhoprime, 0); - polyveck_uniform_eta(&s2, rhoprime, L); + pack_sk_rho(sk, rho); + pack_sk_key(sk, key); + pack_pk_rho(pk, rho); /* Matrix-vector multiplication */ - s1hat = s1; - polyvecl_ntt(&s1hat); - polyvec_matrix_pointwise_montgomery(&t1, mat, &s1hat); - polyveck_reduce(&t1); - polyveck_invntt_tomont(&t1); + for (i = 0; i < K; i++) + { + /* Expand part of s1 */ + poly_uniform_eta(&tC, rhoprime, 0); + if (i == 0) + { + pack_sk_s1(sk, &tC, 0); + } + poly_ntt(&tC); + /* expand part of the matrix */ + poly_uniform(&tB, rho, (i << 8) + 0); + /* partial matrix-vector multiplication */ + poly_pointwise_montgomery(&tA, &tB, &tC); + for(j = 1; j < L; j++) + { + /* Expand part of s1 */ + poly_uniform_eta(&tC, rhoprime, j); + if (i == 0) + { + pack_sk_s1(sk, &tC, j); + } + poly_ntt(&tC); + poly_uniform(&tB, rho, (i << 8) + j); + poly_pointwise_acc_montgomery(&tA, &tB, &tC); + } + + poly_reduce(&tA); + poly_invntt_tomont(&tA); - /* Add error vector s2 */ - polyveck_add(&t1, &t1, &s2); + /* Add error vector s2 */ + /* Sample short vector s2 */ + poly_uniform_eta(&tB, rhoprime, L + i); + pack_sk_s2(sk, &tB, i); + poly_add(&tA, &tA, &tB); - /* Extract t1 and write public key */ - polyveck_caddq(&t1); - polyveck_power2round(&t1, &t0, &t1); - pack_pk(pk, rho, &t1); + /* Compute t{0,1} */ + poly_caddq(&tA); + poly_power2round(&tC, &tB, &tA); + 
pack_sk_t0(sk, &tB, i); + pack_pk_t1(pk, &tC, i); + + } /* Compute H(rho, t1) and write secret key */ shake256(tr, TRBYTES, pk, CRYPTO_PUBLICKEYBYTES); - pack_sk(sk, rho, tr, key, &t0, &s1, &s2); + pack_sk_tr(sk, tr); return 0; } diff --git a/crypto_sign/dilithium3/m4fstack/stack.c b/crypto_sign/dilithium3/m4fstack/stack.c index 716eccf6..b1e09325 100644 --- a/crypto_sign/dilithium3/m4fstack/stack.c +++ b/crypto_sign/dilithium3/m4fstack/stack.c @@ -284,7 +284,7 @@ void unpack_sk_s2(smallpoly *a, const uint8_t *sk, size_t idx) { // TODO: in the end increase this buffer size as far as possible #define POLY_UNIFORM_BUFFERSIZE 3 -void poly_uniform_pointwise_montgomery_polywadd_stack(uint8_t wcomp[3*N], poly *b, uint8_t seed[SEEDBYTES], uint16_t nonce, shake128incctx *state){ +void poly_uniform_pointwise_montgomery_polywadd_stack(uint8_t wcomp[3*N], poly *b, const uint8_t seed[SEEDBYTES], uint16_t nonce, shake128incctx *state){ int32_t t; uint8_t buf[POLY_UNIFORM_BUFFERSIZE*3]; { @@ -438,7 +438,6 @@ size_t poly_make_hint_stack(poly *a, poly *t, uint8_t w[768]){ return hints_n; } -// TODO: remove this later void unpack_sk_stack(uint8_t rho[SEEDBYTES], uint8_t tr[TRBYTES], uint8_t key[SEEDBYTES], @@ -459,56 +458,6 @@ void unpack_sk_stack(uint8_t rho[SEEDBYTES], sk += TRBYTES; } -/* TODO: remove this function */ -/************************************************* -* Name: unpack_sig_h -* -* Description: Unpack only h from signature sig = (c, z, h). -* -* Arguments: - polyveck *h: pointer to output hint vector h -* - const unsigned char sig[]: byte array containing -* bit-packed signature -* -* Returns 1 in case of malformed signature; otherwise 0. 
-**************************************************/ -int unpack_sig_h(poly *h, unsigned int idx, const unsigned char sig[CRYPTO_BYTES]) { - sig += L * POLYZ_PACKEDBYTES; - sig += CTILDEBYTES; - /* Decode h */ - unsigned int k = 0; - - if (idx > 0) - { - k = sig[OMEGA + (idx - 1)]; - } - - for (unsigned int j = 0; j < N; ++j) { - h->coeffs[j] = 0; - } - - if (sig[OMEGA + idx] < k || sig[OMEGA + idx] > OMEGA) { - return 1; - } - - for (unsigned int j = k; j < sig[OMEGA + idx]; ++j) { - /* Coefficients are ordered for strong unforgeability */ - if (j > k && sig[j] <= sig[j - 1]) { - return 1; - } - h->coeffs[sig[j]] = 1; - } - - /* TODO: extract this check, redundant here */ - k = sig[OMEGA + (K - 1)]; - /* Extra indices are zero for strong unforgeability */ - for (unsigned int j = k; j < OMEGA; ++j) { - if (sig[j]) { - return 1; - } - } - return 0; -} - /************************************************* * Name: unpack_sig_h_indices * @@ -591,4 +540,130 @@ void poly_use_hint_stack(poly *b, const poly *a, uint8_t h_i[OMEGA], unsigned in } } +} + +/************************************************* +* Name: pack_pk_rho +* +* Description: Bit-pack only rho in public key pk = (rho, t1). +* +* Arguments: - unsigned char pk[]: output byte array +* - const unsigned char rho[]: byte array containing rho +**************************************************/ +void pack_pk_rho(unsigned char pk[CRYPTO_PUBLICKEYBYTES], + const unsigned char rho[SEEDBYTES]) { + for (unsigned int i = 0; i < SEEDBYTES; ++i) { + pk[i] = rho[i]; + } +} + +/************************************************* +* Name: pack_pk_t1 +* +* Description: Bit-pack only the t1 elem at idx in public key pk = (rho, t1). 
+* +* Arguments: - unsigned char pk[]: output byte array +* - const poly *t1: pointer to the element of t1 at index idx +* - const unsigned int idx: index to the elem to pack +**************************************************/ +void pack_pk_t1(unsigned char pk[CRYPTO_PUBLICKEYBYTES], + const poly *t1, + const unsigned int idx) { + pk += SEEDBYTES; + polyt1_pack(pk + idx * POLYT1_PACKEDBYTES, t1); +} + +/************************************************* +* Name: pack_sk_s1 +* +* Description: Bit-pack only some element of s1 in secret key sk = (rho, key, tr, s1, s2, t0). +* +* Arguments: - unsigned char sk[]: output byte array +* - const poly *s1_elem: pointer to vector element idx in s1 +* - const unsigned int idx: index to the element of s1 that should be packed +**************************************************/ +void pack_sk_s1(unsigned char sk[CRYPTO_SECRETKEYBYTES], + const poly *s1_elem, + const unsigned int idx) { + sk += 2 * SEEDBYTES + TRBYTES; + polyeta_pack(sk + idx * POLYETA_PACKEDBYTES, s1_elem); +} + +/************************************************* +* Name: pack_sk_s2 +* +* Description: Bit-pack only some element of s2 in secret key sk = (rho, key, tr, s1, s2, t0). +* +* Arguments: - unsigned char sk[]: output byte array +* - const poly *s2_elem: pointer to vector element idx in s2 +* - const unsigned int idx: index to the element of s2 that should be packed +**************************************************/ +void pack_sk_s2(unsigned char sk[CRYPTO_SECRETKEYBYTES], + const poly *s2_elem, + const unsigned int idx) { + sk += 2 * SEEDBYTES + TRBYTES + L * POLYETA_PACKEDBYTES; + polyeta_pack(sk + idx * POLYETA_PACKEDBYTES, s2_elem); +} + +/************************************************* +* Name: pack_sk_t0 +* +* Description: Bit-pack only some element of t0 in secret key sk = (rho, key, tr, s1, s2, t0).
+* +* Arguments: - unsigned char sk[]: output byte array +* - const poly *t0_elem: pointer to vector element idx in t0 +* - const unsigned int idx: index to the element of t0 that should be packed +**************************************************/ +void pack_sk_t0(unsigned char sk[CRYPTO_SECRETKEYBYTES], + const poly *t0_elem, + const unsigned int idx) { + sk += 2 * SEEDBYTES + TRBYTES + L * POLYETA_PACKEDBYTES + K * POLYETA_PACKEDBYTES; + polyt0_pack(sk + idx * POLYT0_PACKEDBYTES, t0_elem); +} + +/************************************************* +* Name: pack_sk_rho +* +* Description: Bit-pack only rho in secret key sk = (rho, key, tr, s1, s2, t0). +* +* Arguments: - unsigned char sk[]: output byte array +* - const unsigned char rho[]: byte array containing rho +**************************************************/ +void pack_sk_rho(unsigned char sk[CRYPTO_SECRETKEYBYTES], + const unsigned char rho[SEEDBYTES]) { + for (unsigned int i = 0; i < SEEDBYTES; ++i) { + sk[i] = rho[i]; + } +} + +/************************************************* +* Name: pack_sk_key +* +* Description: Bit-pack only key in secret key sk = (rho, key, tr, s1, s2, t0). +* +* Arguments: - unsigned char sk[]: output byte array +* - const unsigned char key[]: byte array containing key +**************************************************/ +void pack_sk_key(unsigned char sk[CRYPTO_SECRETKEYBYTES], + const unsigned char key[SEEDBYTES]) { + sk += SEEDBYTES; + for (unsigned int i = 0; i < SEEDBYTES; ++i) { + sk[i] = key[i]; + } +} + +/************************************************* +* Name: pack_sk_tr +* +* Description: Bit-pack only tr in secret key sk = (rho, key, tr, s1, s2, t0).
+* +* Arguments: - unsigned char sk[]: output byte array +* - const unsigned char tr[]: byte array containing tr +**************************************************/ +void pack_sk_tr(unsigned char sk[CRYPTO_SECRETKEYBYTES], + const unsigned char tr[TRBYTES]) { + sk += 2*SEEDBYTES; + for (unsigned int i = 0; i < TRBYTES; ++i) { + sk[i] = tr[i]; + } } \ No newline at end of file diff --git a/crypto_sign/dilithium3/m4fstack/stack.h b/crypto_sign/dilithium3/m4fstack/stack.h index e07d8716..47dbe50b 100644 --- a/crypto_sign/dilithium3/m4fstack/stack.h +++ b/crypto_sign/dilithium3/m4fstack/stack.h @@ -24,9 +24,8 @@ void poly_lowbits(poly *a0, const poly *a); void unpack_sk_s1(smallpoly *a, const uint8_t *sk, size_t idx); void unpack_sk_s2(smallpoly *a, const uint8_t *sk, size_t idx); -int unpack_sig_h(poly *h, unsigned int idx, const unsigned char sig[CRYPTO_BYTES]); -void poly_uniform_pointwise_montgomery_polywadd_stack(uint8_t wcomp[3*N], poly *b, uint8_t seed[SEEDBYTES], uint16_t nonce, shake128incctx *state); +void poly_uniform_pointwise_montgomery_polywadd_stack(uint8_t wcomp[3*N], poly *b, const uint8_t seed[SEEDBYTES], uint16_t nonce, shake128incctx *state); void poly_uniform_gamma1_stack(poly *a, const uint8_t seed[CRHBYTES], uint16_t nonce, shake256incctx *state); void poly_uniform_gamma1_add_stack(poly *a, poly *b, const uint8_t seed[CRHBYTES], uint16_t nonce, shake256incctx *state); @@ -34,9 +33,36 @@ size_t poly_make_hint_stack(poly *a, poly *t, uint8_t w[768]); int unpack_sig_h_indices(uint8_t h_i[OMEGA], unsigned int * number_of_hints, unsigned int idx, const unsigned char sig[CRYPTO_BYTES]); void poly_use_hint_stack(poly *b, const poly *a, uint8_t h_i[OMEGA], unsigned int number_of_hints); -// TODO: replace this with individual functions later void unpack_sk_stack(uint8_t rho[SEEDBYTES], uint8_t tr[TRBYTES], uint8_t key[SEEDBYTES], const uint8_t sk[CRYPTO_SECRETKEYBYTES]); + +void pack_pk_rho(unsigned char pk[CRYPTO_PUBLICKEYBYTES], + const unsigned char 
rho[SEEDBYTES]); + +void pack_pk_t1(unsigned char pk[CRYPTO_PUBLICKEYBYTES], + const poly *t1, + const unsigned int idx); + +void pack_sk_s1(unsigned char sk[CRYPTO_SECRETKEYBYTES], + const poly *s1_elem, + const unsigned int idx); + +void pack_sk_s2(unsigned char sk[CRYPTO_SECRETKEYBYTES], + const poly *s2_elem, + const unsigned int idx); + +void pack_sk_t0(unsigned char sk[CRYPTO_SECRETKEYBYTES], + const poly *t0_elem, + const unsigned int idx); + +void pack_sk_rho(unsigned char sk[CRYPTO_SECRETKEYBYTES], + const unsigned char rho[SEEDBYTES]); + +void pack_sk_key(unsigned char sk[CRYPTO_SECRETKEYBYTES], + const unsigned char key[SEEDBYTES]); + +void pack_sk_tr(unsigned char sk[CRYPTO_SECRETKEYBYTES], + const unsigned char tr[TRBYTES]); #endif \ No newline at end of file From 76b16c1bb0a8513757c1999158ab99bb252e13da Mon Sep 17 00:00:00 2001 From: Amin Abdulrahman Date: Mon, 8 Apr 2024 15:57:45 +0200 Subject: [PATCH 28/32] Overlap buffers --- crypto_sign/dilithium3/m4fstack/sign.c | 39 +++++++++++++++++--------- 1 file changed, 26 insertions(+), 13 deletions(-) diff --git a/crypto_sign/dilithium3/m4fstack/sign.c b/crypto_sign/dilithium3/m4fstack/sign.c index edb4eaa7..33df06fe 100644 --- a/crypto_sign/dilithium3/m4fstack/sign.c +++ b/crypto_sign/dilithium3/m4fstack/sign.c @@ -26,14 +26,27 @@ int crypto_sign_keypair(uint8_t *pk, uint8_t *sk) { unsigned int i, j; uint8_t seedbuf[2*SEEDBYTES + CRHBYTES]; - uint8_t tr[TRBYTES]; const uint8_t *rho, *rhoprime, *key; - poly tA, tB, tC; + poly tA, tB; + + union { + uint8_t tr[TRBYTES]; + shake256incctx s256; + poly tC; + } data; + + shake256incctx *s256 = &data.s256; + uint8_t *tr = &data.tr; + poly *tC = &data.tC; /* Get randomness for rho, rhoprime and key */ randombytes(seedbuf, SEEDBYTES); - shake256(seedbuf, 2*SEEDBYTES + CRHBYTES, seedbuf, SEEDBYTES); + shake256_inc_init(s256); + shake256_inc_absorb(s256, seedbuf, SEEDBYTES); + shake256_inc_finalize(s256); + shake256_inc_squeeze(seedbuf, 2*SEEDBYTES + CRHBYTES, 
s256); + rho = seedbuf; rhoprime = rho + SEEDBYTES; key = rhoprime + CRHBYTES; @@ -46,27 +59,27 @@ int crypto_sign_keypair(uint8_t *pk, uint8_t *sk) { for (i = 0; i < K; i++) { /* Expand part of s1 */ - poly_uniform_eta(&tC, rhoprime, 0); + poly_uniform_eta(tC, rhoprime, 0); if (i == 0) { - pack_sk_s1(sk, &tC, 0); + pack_sk_s1(sk, tC, 0); } - poly_ntt(&tC); + poly_ntt(tC); /* expand part of the matrix */ poly_uniform(&tB, rho, (i << 8) + 0); /* partial matrix-vector multiplication */ - poly_pointwise_montgomery(&tA, &tB, &tC); + poly_pointwise_montgomery(&tA, &tB, tC); for(j = 1; j < L; j++) { /* Expand part of s1 */ - poly_uniform_eta(&tC, rhoprime, j); + poly_uniform_eta(tC, rhoprime, j); if (i == 0) { - pack_sk_s1(sk, &tC, j); + pack_sk_s1(sk, tC, j); } - poly_ntt(&tC); + poly_ntt(tC); poly_uniform(&tB, rho, (i << 8) + j); - poly_pointwise_acc_montgomery(&tA, &tB, &tC); + poly_pointwise_acc_montgomery(&tA, &tB, tC); } poly_reduce(&tA); @@ -80,9 +93,9 @@ int crypto_sign_keypair(uint8_t *pk, uint8_t *sk) { /* Compute t{0,1} */ poly_caddq(&tA); - poly_power2round(&tC, &tB, &tA); + poly_power2round(tC, &tB, &tA); pack_sk_t0(sk, &tB, i); - pack_pk_t1(pk, &tC, i); + pack_pk_t1(pk, tC, i); } From e718f2eb3d4728e246ea3b5ecd9c848e9f017124 Mon Sep 17 00:00:00 2001 From: Amin Abdulrahman Date: Mon, 8 Apr 2024 17:05:53 +0200 Subject: [PATCH 29/32] Stack optimized challenge generation --- crypto_sign/dilithium3/m4fstack/sign.c | 2 +- crypto_sign/dilithium3/m4fstack/stack.c | 46 +++++++++++++++++++++++++ crypto_sign/dilithium3/m4fstack/stack.h | 1 + 3 files changed, 48 insertions(+), 1 deletion(-) diff --git a/crypto_sign/dilithium3/m4fstack/sign.c b/crypto_sign/dilithium3/m4fstack/sign.c index 33df06fe..71cff9bb 100644 --- a/crypto_sign/dilithium3/m4fstack/sign.c +++ b/crypto_sign/dilithium3/m4fstack/sign.c @@ -384,7 +384,7 @@ int crypto_sign_verify(const uint8_t *sig, shake256_inc_absorb(&s256, mu, CRHBYTES); /* Matrix-vector multiplication; compute Az - c2^dt1 */ - 
poly_challenge(&p, sig); + poly_challenge_stack(&p, sig); poly_challenge_compress(ccomp, &p); for (size_t k_idx = 0; k_idx < K; k_idx++) { diff --git a/crypto_sign/dilithium3/m4fstack/stack.c b/crypto_sign/dilithium3/m4fstack/stack.c index b1e09325..b45f7021 100644 --- a/crypto_sign/dilithium3/m4fstack/stack.c +++ b/crypto_sign/dilithium3/m4fstack/stack.c @@ -666,4 +666,50 @@ void pack_sk_tr(unsigned char sk[CRYPTO_SECRETKEYBYTES], for (unsigned int i = 0; i < TRBYTES; ++i) { sk[i] = tr[i]; } +} + +/************************************************* +* Name: poly_challenge_stack +* +* Description: Implementation of H. Samples polynomial with TAU nonzero +* coefficients in {-1,1} using the output stream of +* SHAKE256(seed). Stack optimized. +* +* Arguments: - poly *c: pointer to output polynomial +* - const uint8_t seed[]: byte array containing seed of length SEEDBYTES +**************************************************/ +#define CHALLENGE_STACK_BUF_SIZE 8 +void poly_challenge_stack(poly *c, const uint8_t seed[SEEDBYTES]) { + unsigned int i, b, pos; + uint64_t signs; + uint8_t buf[CHALLENGE_STACK_BUF_SIZE]; + shake256incctx state; + + shake256_inc_init(&state); + shake256_inc_absorb(&state, seed, SEEDBYTES); + shake256_inc_finalize(&state); + shake256_inc_squeeze(buf, CHALLENGE_STACK_BUF_SIZE, &state); + signs = 0; + for(i = 0; i < 8; ++i) + { + signs |= (uint64_t)buf[i] << 8*i; + } + pos = 8; + + for(i = 0; i < N; ++i) + c->coeffs[i] = 0; + for(i = N-TAU; i < N; ++i) { + do { + if(pos >= CHALLENGE_STACK_BUF_SIZE) { + shake256_inc_squeeze(buf, CHALLENGE_STACK_BUF_SIZE, &state); + pos = 0; + } + + b = buf[pos++]; + } while(b > i); + + c->coeffs[i] = c->coeffs[b]; + c->coeffs[b] = 1 - 2*(signs & 1); + signs >>= 1; + } } \ No newline at end of file diff --git a/crypto_sign/dilithium3/m4fstack/stack.h b/crypto_sign/dilithium3/m4fstack/stack.h index 47dbe50b..06c8c576 100644 --- a/crypto_sign/dilithium3/m4fstack/stack.h +++ b/crypto_sign/dilithium3/m4fstack/stack.h @@ -28,6 +28,7 @@
void unpack_sk_s2(smallpoly *a, const uint8_t *sk, size_t idx); void poly_uniform_pointwise_montgomery_polywadd_stack(uint8_t wcomp[3*N], poly *b, const uint8_t seed[SEEDBYTES], uint16_t nonce, shake128incctx *state); void poly_uniform_gamma1_stack(poly *a, const uint8_t seed[CRHBYTES], uint16_t nonce, shake256incctx *state); void poly_uniform_gamma1_add_stack(poly *a, poly *b, const uint8_t seed[CRHBYTES], uint16_t nonce, shake256incctx *state); +void poly_challenge_stack(poly *c, const uint8_t seed[SEEDBYTES]); size_t poly_make_hint_stack(poly *a, poly *t, uint8_t w[768]); int unpack_sig_h_indices(uint8_t h_i[OMEGA], unsigned int * number_of_hints, unsigned int idx, const unsigned char sig[CRYPTO_BYTES]); From a37b31186f7d7702a50c4816cc9eea5982faee19 Mon Sep 17 00:00:00 2001 From: Amin Abdulrahman Date: Tue, 9 Apr 2024 16:11:37 +0200 Subject: [PATCH 30/32] Match 769 Plantard to m4f code --- crypto_sign/dilithium2/m4fstack/smallntt.S | 1 - .../dilithium2/m4fstack/smallntt_769.S | 1 + crypto_sign/dilithium3/m4fstack/macros_fnt.i | 158 ------------------ .../dilithium3/m4fstack/macros_smallntt.i | 24 ++- crypto_sign/dilithium3/m4fstack/smallntt.h | 23 ++- .../m4fstack/{smallntt.S => smallntt_769.S} | 24 ++- crypto_sign/dilithium5/m4fstack/smallntt.S | 1 - .../dilithium5/m4fstack/smallntt_769.S | 1 + 8 files changed, 60 insertions(+), 173 deletions(-) delete mode 120000 crypto_sign/dilithium2/m4fstack/smallntt.S create mode 120000 crypto_sign/dilithium2/m4fstack/smallntt_769.S delete mode 100644 crypto_sign/dilithium3/m4fstack/macros_fnt.i rename crypto_sign/dilithium3/m4fstack/{smallntt.S => smallntt_769.S} (94%) delete mode 120000 crypto_sign/dilithium5/m4fstack/smallntt.S create mode 120000 crypto_sign/dilithium5/m4fstack/smallntt_769.S diff --git a/crypto_sign/dilithium2/m4fstack/smallntt.S b/crypto_sign/dilithium2/m4fstack/smallntt.S deleted file mode 120000 index 7e2174f9..00000000 --- a/crypto_sign/dilithium2/m4fstack/smallntt.S +++ /dev/null @@ -1 +0,0 @@ 
-../../dilithium3/m4fstack/smallntt.S \ No newline at end of file diff --git a/crypto_sign/dilithium2/m4fstack/smallntt_769.S b/crypto_sign/dilithium2/m4fstack/smallntt_769.S new file mode 120000 index 00000000..6300683f --- /dev/null +++ b/crypto_sign/dilithium2/m4fstack/smallntt_769.S @@ -0,0 +1 @@ +../../dilithium3/m4fstack/smallntt_769.S \ No newline at end of file diff --git a/crypto_sign/dilithium3/m4fstack/macros_fnt.i b/crypto_sign/dilithium3/m4fstack/macros_fnt.i deleted file mode 100644 index 25903e41..00000000 --- a/crypto_sign/dilithium3/m4fstack/macros_fnt.i +++ /dev/null @@ -1,158 +0,0 @@ -// 2 -.macro ldrstr2 ldrstr, target, c0, c1, mem0, mem1 - \ldrstr \c0, [\target, \mem0] - \ldrstr \c1, [\target, \mem1] -.endm - -// 2 -.macro ldrstr2jump ldrstr, target, c0, c1, mem1, jump - \ldrstr \c1, [\target, \mem1] - \ldrstr \c0, [\target], \jump -.endm - -// 4 -.macro ldrstr4 ldrstr, target, c0, c1, c2, c3, mem0, mem1, mem2, mem3 - \ldrstr \c0, [\target, \mem0] - \ldrstr \c1, [\target, \mem1] - \ldrstr \c2, [\target, \mem2] - \ldrstr \c3, [\target, \mem3] -.endm - -// 4 -.macro ldrstr4jump ldrstr, target, c0, c1, c2, c3, mem1, mem2, mem3, jump - \ldrstr \c1, [\target, \mem1] - \ldrstr \c2, [\target, \mem2] - \ldrstr \c3, [\target, \mem3] - \ldrstr \c0, [\target], \jump -.endm - -// 8 -.macro ldrstrvec ldrstr, target, c0, c1, c2, c3, c4, c5, c6, c7, mem0, mem1, mem2, mem3, mem4, mem5, mem6, mem7 - ldrstr4 \ldrstr, \target, \c0, \c1, \c2, \c3, \mem0, \mem1, \mem2, \mem3 - ldrstr4 \ldrstr, \target, \c4, \c5, \c6, \c7, \mem4, \mem5, \mem6, \mem7 -.endm - -// 8 -.macro ldrstrvecjump ldrstr, target, c0, c1, c2, c3, c4, c5, c6, c7, mem1, mem2, mem3, mem4, mem5, mem6, mem7, jump - ldrstr4 \ldrstr, \target, \c4, \c5, \c6, \c7, \mem4, \mem5, \mem6, \mem7 - ldrstr4jump \ldrstr, \target, \c0, \c1, \c2, \c3, \mem1, \mem2, \mem3, \jump -.endm - - - -.macro addSub1 c0, c1 - add.w \c0, \c1 - sub.w \c1, \c0, \c1, lsl #1 -.endm - -.macro addSub2 c0, c1, c2, c3 - add \c0, \c1 
- add \c2, \c3 - sub.w \c1, \c0, \c1, lsl #1 - sub.w \c3, \c2, \c3, lsl #1 -.endm - -.macro addSub4 c0, c1, c2, c3, c4, c5, c6, c7 - add \c0, \c1 - add \c2, \c3 - add \c4, \c5 - add \c6, \c7 - sub.w \c1, \c0, \c1, lsl #1 - sub.w \c3, \c2, \c3, lsl #1 - sub.w \c5, \c4, \c5, lsl #1 - sub.w \c7, \c6, \c7, lsl #1 -.endm - -// 2 -.macro barrett_32 a, Qbar, Q, tmp - smmulr.w \tmp, \a, \Qbar - mls.w \a, \tmp, \Q, \a -.endm - -.macro FNT_CT_butterfly c0, c1, logW - add.w \c0, \c0, \c1, lsl #\logW - sub.w \c1, \c0, \c1, lsl #(\logW+1) -.endm - -.macro shift_subAdd c0, c1, shlv - sub.w \c0, \c0, \c1, lsl #(\shlv) - add.w \c1, \c0, \c1, lsl #(\shlv+1) -.endm - -.macro FNT_CT_ibutterfly c0, c1, shlv - shift_subAdd \c0, \c1, \shlv -.endm - -// 46 -.macro _3_layer_CT_32_FNT c0, c1, c2, c3, c4, c5, c6, c7, xi0, xi1, xi2, xi3, xi4, xi5, xi6, twiddle, Qprime, Q, tmp, tmp2 - vmov.w \twiddle, \xi0 - - // c0, c1, c2, c3, c4, c5, c6, c7, c8 - // 0,4 - mla \tmp, \c4, \twiddle, \c0 - mls \c4, \c4, \twiddle, \c0 - - // 1,5 - mla \c0, \c5, \twiddle, \c1 - mls \c5, \c5, \twiddle, \c1 - - // 2,6 - mla \c1, \c6, \twiddle, \c2 - mls \c6, \c6, \twiddle, \c2 - - // 3,7 - mla \c2, \c7, \twiddle, \c3 - mls \c7, \c7, \twiddle, \c3 - - // tmp, c0, c1, c2, c4, c5, c6, c7 - - barrett_32 \tmp, \Qprime, \Q, \c3 - barrett_32 \c0, \Qprime, \Q, \c3 - barrett_32 \c1, \Qprime, \Q, \c3 - barrett_32 \c2, \Qprime, \Q, \c3 - barrett_32 \c4, \Qprime, \Q, \c3 - barrett_32 \c5, \Qprime, \Q, \c3 - barrett_32 \c6, \Qprime, \Q, \c3 - barrett_32 \c7, \Qprime, \Q, \c3 - - vmov.w \twiddle, \xi1 - // 0,2 - mla \tmp2, \c1, \twiddle, \tmp - mls \c3, \c1, \twiddle, \tmp - - // 1,3 - mla \tmp, \c2, \twiddle, \c0 - mls \c0, \c2, \twiddle, \c0 - - vmov.w \twiddle, \xi2 - - // 4,6 - mla \c2, \c6, \twiddle, \c4 - mls \c1, \c6, \twiddle, \c4 - - // 5,7 - mla \c6, \c7, \twiddle, \c5 - mls \c7, \c7, \twiddle, \c5 - - // tmp2, tmp, c3, c0 | c2, c6, c1, c7 - - // 4,5 - vmov.w \twiddle, \xi5 - mla \c4, \c6, \twiddle, \c2 - mls \c5, 
\c6, \twiddle, \c2 - - // 6,7 - vmov.w \twiddle, \xi6 - mla \c6, \c7, \twiddle, \c1 - mls \c7, \c7, \twiddle, \c1 - - // 2,3 - vmov.w \twiddle, \xi4 - mla \c2, \c0, \twiddle, \c3 - mls \c3, \c0, \twiddle, \c3 - - // 0,1 - vmov.w \twiddle, \xi3 - mla \c0, \tmp, \twiddle, \tmp2 - mls \c1, \tmp, \twiddle, \tmp2 -.endm \ No newline at end of file diff --git a/crypto_sign/dilithium3/m4fstack/macros_smallntt.i b/crypto_sign/dilithium3/m4fstack/macros_smallntt.i index b97f4d52..7c9a387c 100644 --- a/crypto_sign/dilithium3/m4fstack/macros_smallntt.i +++ b/crypto_sign/dilithium3/m4fstack/macros_smallntt.i @@ -1,9 +1,23 @@ /* -* NTT and inverse NTT code from: -* Huang, J. et al. 2024. Revisiting Keccak and Dilithium Implementations on ARMv7-M. -* IACR Transactions on Cryptographic Hardware and Embedded Systems. 2024, 2 (Mar. 2024), 1–24. -* DOI:https://doi.org/10.46586/tches.v2024.i2.1-24. -* https://github.com/UIC-ESLAS/Dilithium-Multi-Moduli/blob/332a32cc02d407020e48a4f9b3a0dc78d4c8b0bc/M4/crypto_sign/dilithium3/m4plant/smallntt_769.S + * Copyright (c) 2023 Junhao Huang (jhhuang_nuaa@126.com) + * + * Licensed under the Apache License, Version 2.0(the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * NTT and inverse NTT code from: + * Huang, J. et al. 2024. Revisiting Keccak and Dilithium Implementations on ARMv7-M. + * IACR Transactions on Cryptographic Hardware and Embedded Systems. 2024, 2 (Mar. 2024), 1–24. + * DOI:https://doi.org/10.46586/tches.v2024.i2.1-24. 
+ * https://github.com/UIC-ESLAS/Dilithium-Multi-Moduli/blob/332a32cc02d407020e48a4f9b3a0dc78d4c8b0bc/M4/crypto_sign/dilithium3/m4plant/smallntt_769.S */ #ifndef MACROS_SMALLNTT_I diff --git a/crypto_sign/dilithium3/m4fstack/smallntt.h b/crypto_sign/dilithium3/m4fstack/smallntt.h index c3fd065f..244fad24 100644 --- a/crypto_sign/dilithium3/m4fstack/smallntt.h +++ b/crypto_sign/dilithium3/m4fstack/smallntt.h @@ -1,9 +1,27 @@ +/** + * Copyright (c) 2023 Junhao Huang (jhhuang_nuaa@126.com) + * + * Licensed under the Apache License, Version 2.0(the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + #ifndef SMALLNTT_H #define SMALLNTT_H #include #include "params.h" +#define SMALL_Q 769 + static const int32_t zetas_769[64] = { 3138844760, 1334846793, 999738812, 1854264165, 1681125041, 1150537404, 2820492178, 3071823164, 726067294, 2066499220, 3272887953, 1055590142, 4255871365, 1871019564, 2731130050, 1826338500, 513832239, 1792827701, 3373420347, 2993631302, 1161707670, 3306398751, 3518633806, 3406931146, 1586177780, 3853741788, 3317569017, 3825816122, 971813147, 122872927, 217820188, 619949766, 3753209393, 770748358, 4099487641, 765163225, 3630336467, 1742561504, 3479537875, 982983413, 2809321912, 2379266669, 703726762, 681386230, 4110657907, 1457719720, 1217559000, 2474213930, 1195218468, 1089100940, 564098436, 614364633, 3635921600, 2088839752, 3702943196, 1949211426, 2569161192, 374203913, 3982199847, 2083254619, 1513571050, 3647091866, 413299844, 4149753838}; @@ -16,13 +34,12 @@ static const int32_t zetas_inv_asm_769[256] = { // removed first "2285" + LAYER 3+2+1 - 1 - butterfly 5585134, -346278248, 5585134, -966228013, -346278248, -223405321, 636705165, 446810642, 1519156183, 11170266, -821014555, -1932456027, 301597183, -692556495, -240160720, 1061175275, -1368357591, -519417371, -335107981, 2139105948, -698141628, -625534899, -1267825197, 843355087, 290426917, 128458060, 1295750862, -748407825, -826599688, 1736976371, -240160720, 2005062756, 1061175275, 1100271206, -1368357591, 502661972, 915961816, 1396283256, 452395775, -1038834743, -955057747, -670215963, 2016233022, -16755399, -1675539907, 1614103444, -1290165729, 94947261, 753992958, -1591762912, 497076839, -1954796559, 1943626293, -1122611738, -1239899531, 938302348, -245745853, 882451018, -435640376, -966228013, 1736976371, -318352582, -240160720, -1401868389, 2005062756, 1016494210, 714897027, -1005323944, 876865885, 2122350549, -1373942724, -2094424884, 1468889985, 1558252114, -1401868389, -686971362, -357448514, 860110486, 1524741316, -1787242568, -44681064, 1407453522, -368618780, 
1323676527, -653460564, -1362772458, 1379527857, -463566041, 1859849297, 150798592, -1675539907, 804259156, 1614103444, -67021596, -1290165729, -139628326, -2060914086, -994153678, 55851330, 189894523, -1072345541, 1507985917, 832184821, 1111441472, 2105595150, -525002504, -1809583100, 212235055, 1938041160, -273671518, 100532394, -2044158687, -78191862, 1452134586, 642290298, -2111180283, 552928169, 161968858, -1167292802, -346278248, -966228013, -223405321, 1736976371, 150798592, -318352582, -759578091, -1608518311, -2032988421, -899206417, -480321440, 943887481, 1491230518, -83776995, -284841784, 2005062756, 1100271206, 502661972, 1669954774, -1139367137, -457980908, 1921285760, 1128196871, -1318091394, -1904530361, 396544445, -1228729265, 117287794, 2116765416, 1184048201, -318352582, -1401868389, 1016494210, -686971362, -1413038655, -357448514, 1709050706, -731652426, 89362128, 2021818155, 1720220972, -1882189829, -1245484665, -798674023, 720482160, 804259156, -67021596, -139628326, -536172770, -1731391238, -1117026605, -27925665, -1843093898, -1971551958, 1027664477, 1776072302, -1692295306, 1977137091, 709311894, 1552666981, -223405321, 150798592, -759578091, -1675539907, 2105595150, 804259156, -1697880440, -675801096, 279256651, 949472614, -1066760408, -1050005009, -134043193, 1262240064, 1714635839, 1016494210, -1413038655, 1709050706, 1206388733, 1748146637, -1781657435, -1010909077, -390959312, -1329261660, -1083515807, -1965966825, -1530326449, 809844289, -1541496715, 1630858843, -759578091, 2105595150, -1697880440, -525002504, 631120032, -1809583100, -474736307, -1575007513, -201064789, 1893360095, 424470110, -1133782004, -418884977, -1424208921, -547343036, -1697880440, 631120032, -474736307, 1580592646, 1435379187, 787503756, 1200803600, 1999477623, -932717215, 1982722224, -1848679031, 586438968, 1993892490, 1625273710, -1346017059, 0}; - -#define SMALL_Q 769 - +// Q1=769 void small_ntt_asm_769(int16_t a[N], const int32_t * zetas); void 
small_invntt_asm_769(int16_t a[N], const int32_t * zetas); void small_basemul_asm_769(int16_t *c, const int16_t *a, const int16_t *b, const int32_t *zetas); +// small NTT for computing cs0 and cs1 #define small_ntt(a) small_ntt_asm_769(a, zetas_asm_769) #define small_invntt_tomont(a) small_invntt_asm_769(a, zetas_inv_asm_769) #define small_basemul(r,a,b) small_basemul_asm_769(r, a, b, zetas_769) diff --git a/crypto_sign/dilithium3/m4fstack/smallntt.S b/crypto_sign/dilithium3/m4fstack/smallntt_769.S similarity index 94% rename from crypto_sign/dilithium3/m4fstack/smallntt.S rename to crypto_sign/dilithium3/m4fstack/smallntt_769.S index 9f048042..1c3c9a88 100644 --- a/crypto_sign/dilithium3/m4fstack/smallntt.S +++ b/crypto_sign/dilithium3/m4fstack/smallntt_769.S @@ -1,9 +1,23 @@ /* -* NTT and inverse NTT code from: -* Huang, J. et al. 2024. Revisiting Keccak and Dilithium Implementations on ARMv7-M. -* IACR Transactions on Cryptographic Hardware and Embedded Systems. 2024, 2 (Mar. 2024), 1–24. -* DOI:https://doi.org/10.46586/tches.v2024.i2.1-24. -* https://github.com/UIC-ESLAS/Dilithium-Multi-Moduli/blob/332a32cc02d407020e48a4f9b3a0dc78d4c8b0bc/M4/crypto_sign/dilithium3/m4plant/smallntt_769.S + * Copyright (c) 2023 Junhao Huang (jhhuang_nuaa@126.com) + * + * Licensed under the Apache License, Version 2.0(the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * NTT and inverse NTT code from: + * Huang, J. et al. 2024. Revisiting Keccak and Dilithium Implementations on ARMv7-M. 
+ * IACR Transactions on Cryptographic Hardware and Embedded Systems. 2024, 2 (Mar. 2024), 1–24. + * DOI:https://doi.org/10.46586/tches.v2024.i2.1-24. + * https://github.com/UIC-ESLAS/Dilithium-Multi-Moduli/blob/332a32cc02d407020e48a4f9b3a0dc78d4c8b0bc/M4/crypto_sign/dilithium3/m4plant/smallntt_769.S */ #include "macros.i" diff --git a/crypto_sign/dilithium5/m4fstack/smallntt.S b/crypto_sign/dilithium5/m4fstack/smallntt.S deleted file mode 120000 index 7e2174f9..00000000 --- a/crypto_sign/dilithium5/m4fstack/smallntt.S +++ /dev/null @@ -1 +0,0 @@ -../../dilithium3/m4fstack/smallntt.S \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4fstack/smallntt_769.S b/crypto_sign/dilithium5/m4fstack/smallntt_769.S new file mode 120000 index 00000000..6300683f --- /dev/null +++ b/crypto_sign/dilithium5/m4fstack/smallntt_769.S @@ -0,0 +1 @@ +../../dilithium3/m4fstack/smallntt_769.S \ No newline at end of file From d401a156c7a725674c06cfab9ab9e23163054367 Mon Sep 17 00:00:00 2001 From: "Matthias J. Kannwischer" Date: Mon, 15 Apr 2024 15:32:51 +0800 Subject: [PATCH 31/32] update skiplist --- skiplist.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/skiplist.py b/skiplist.py index 47192e42..b97c1b84 100644 --- a/skiplist.py +++ b/skiplist.py @@ -237,4 +237,7 @@ {'scheme': 'tuov_is_pkc_skc', 'implementation': 'ref', 'estmemory': 1275904}, {'scheme': 'tuov_v_pkc', 'implementation': 'ref', 'estmemory': 7083008}, {'scheme': 'tuov_v_pkc_skc', 'implementation': 'ref', 'estmemory': 4639744}, + {'scheme': 'dilithium2', 'implementation': 'm4fstack', 'estmemory': 12288}, + {'scheme': 'dilithium5', 'implementation': 'm4fstack', 'estmemory': 21504}, + {'scheme': 'dilithium3', 'implementation': 'm4fstack', 'estmemory': 17408}, ] From c013920b8028db39fc0ef52f62dd62088cf54d9f Mon Sep 17 00:00:00 2001 From: "Matthias J. 
Kannwischer" Date: Tue, 16 Apr 2024 06:55:18 +0800 Subject: [PATCH 32/32] update benchmarks --- benchmarks.csv | 32 ++++++++++++++++++++++---------- benchmarks.md | 34 +++++++++++++++++++++++----------- 2 files changed, 45 insertions(+), 21 deletions(-) diff --git a/benchmarks.csv b/benchmarks.csv index 981accb6..e3bb1bbb 100644 --- a/benchmarks.csv +++ b/benchmarks.csv @@ -42,12 +42,15 @@ cross-sha3-r-sdpg-1-fast (10 executions),ref,290136,287742,297758,29963868,29960 cross-sha3-r-sdpg-1-small (10 executions),ref,290135,287741,297757,102853622,102847774,102861948,75137510,75126803,75159685 cross-sha3-r-sdpg-3-fast (10 executions),ref,627948,625525,637639,43573841,43565461,43582933,27513830,27493024,27525746 cross-sha3-r-sdpg-5-fast (10 executions),ref,1146280,1142409,1153794,93557878,93547167,93566329,59948216,59857434,60043852 -dilithium2 (90 executions),clean,1873447,1838554,1903845,7846622,3321671,28761609,2062804,2062332,2063181 -dilithium2 (100 executions),m4f,1427684,1390524,1466437,4219137,1813668,12587382,1417706,1417251,1418128 +dilithium2 (1000 executions),clean,1874167,1827645,1914566,7493877,3321630,40762756,2062795,2062255,2063222 +dilithium2 (1000 executions),m4f,1426036,1379636,1466394,3807970,1813656,18528070,1417745,1417203,1418192 +dilithium2 (1000 executions),m4fstack,1801523,1684895,1902114,12170976,3900911,86281518,3241353,3194028,3281144 dilithium3 (1000 executions),clean,3205551,3204090,3207411,12696585,5097364,74392293,3376992,3376581,3377393 dilithium3 (1000 executions),m4f,2515969,2514498,2517634,5884832,2917322,25268693,2411257,2410858,2411717 -dilithium5 (90 executions),clean,5346066,5287239,5395626,15205929,7953360,49173429,5609664,5609137,5610119 -dilithium5 (100 executions),m4f,4273211,4210308,4329697,8062110,4882708,18398575,4185407,4184878,4185954 +dilithium3 (1000 executions),m4fstack,3412759,3406659,3419247,23673016,6733971,145803146,5733307,5688893,5778120 +dilithium5 (1000 
executions),clean,5341477,5286872,5395822,15710371,7953367,75940093,5609679,5609217,5610183 +dilithium5 (1000 executions),m4f,4275029,4210286,4329519,7977781,4882524,25936176,4185417,4184925,4185896 +dilithium5 (1000 executions),m4fstack,5816287,5474236,6115061,33452872,11170780,185259803,9912851,9845789,9981834 falcon-1024 (10 executions),m4-ct,354880005,284902033,635131652,87741288,87506676,87922628,991320,982548,997219 falcon-1024 (10 executions),opt-ct,555202324,284912829,1157528581,87710190,87606677,87841235,993584,983066,997523 falcon-1024 (10 executions),opt-leaktime,438412062,334858742,625013074,80139483,79891200,80551967,994127,984891,997390 @@ -190,11 +193,14 @@ cross-sha3-r-sdpg-1-small,ref,2328,466400,245512,,,,,, cross-sha3-r-sdpg-3-fast,ref,4032,205080,108236,,,,,, cross-sha3-r-sdpg-5-fast,ref,6824,398600,213436,,,,,, dilithium2,clean,38304,51968,36192,,,,,, -dilithium2,m4f,38296,49416,36184,,,,,, +dilithium2,m4f,38296,49416,36220,,,,,, +dilithium2,m4fstack,4408,5072,2704,,,,,, dilithium3,clean,60832,79616,57728,,,,,, dilithium3,m4f,60824,68864,57720,,,,,, +dilithium3,m4fstack,4408,6608,2704,,,,,, dilithium5,clean,97696,122724,92940,,,,,, -dilithium5,m4f,97688,116076,92824,,,,,, +dilithium5,m4f,97688,116076,92932,,,,,, +dilithium5,m4fstack,4408,8136,2712,,,,,, falcon-1024,clean,34988,84604,8784,,,,,, falcon-1024,m4-ct,1156,2508,376,,,,,, falcon-1024,opt-ct,1156,2508,376,,,,,, @@ -339,12 +345,15 @@ cross-sha3-r-sdpg-1-fast,ref,71.8,74.8,77.1,,,,,, cross-sha3-r-sdpg-1-small,ref,71.8,74.7,78.4,,,,,, cross-sha3-r-sdpg-3-fast,ref,71.7,68.2,68.7,,,,,, cross-sha3-r-sdpg-5-fast,ref,71.1,66.1,66.8,,,,,, -dilithium2,clean,60.9,30.2,52.9,,,,,, -dilithium2,m4f,79.9,62.2,76.8,,,,,, +dilithium2,clean,61.0,30.9,52.9,,,,,, +dilithium2,m4f,79.9,60.6,76.8,,,,,, +dilithium2,m4fstack,74.8,55.2,40.8,,,,,, dilithium3,clean,64.7,31.3,56.8,,,,,, dilithium3,m4f,82.3,60.3,79.4,,,,,, -dilithium5,clean,67.0,38.4,61.1,,,,,, -dilithium5,m4f,83.4,63.5,81.7,,,,,, 
+dilithium3,m4fstack,77.1,54.6,41.0,,,,,, +dilithium5,clean,67.0,35.7,61.1,,,,,, +dilithium5,m4f,83.5,65.0,81.7,,,,,, +dilithium5,m4fstack,76.1,54.5,42.6,,,,,, falcon-1024,clean,6.5,0.3,23.7,,,,,, falcon-1024,m4-ct,7.4,0.4,32.4,,,,,, falcon-1024,opt-ct,11.7,0.4,32.2,,,,,, @@ -490,10 +499,13 @@ cross-sha3-r-sdpg-3-fast,ref,19689,0,208,19897,,,,, cross-sha3-r-sdpg-5-fast,ref,18593,0,208,18801,,,,, dilithium2,clean,8064,0,0,8064,,,,, dilithium2,m4f,18596,0,0,18596,,,,, +dilithium2,m4fstack,24184,0,0,24184,,,,, dilithium3,clean,7580,0,0,7580,,,,, dilithium3,m4f,18588,0,0,18588,,,,, +dilithium3,m4fstack,23448,0,0,23448,,,,, dilithium5,clean,7808,0,0,7808,,,,, dilithium5,m4f,18468,0,0,18468,,,,, +dilithium5,m4fstack,23820,0,0,23820,,,,, falcon-1024,clean,82647,0,0,82647,,,,, falcon-1024,m4-ct,81825,0,79872,161697,,,,, falcon-1024,opt-ct,81825,0,79872,161697,,,,, diff --git a/benchmarks.md b/benchmarks.md index 5574fe2c..5aef4137 100644 --- a/benchmarks.md +++ b/benchmarks.md @@ -44,12 +44,15 @@ | cross-sha3-r-sdpg-1-small (10 executions) | ref | AVG: 290,135
MIN: 287,741
MAX: 297,757 | AVG: 102,853,622
MIN: 102,847,774
MAX: 102,861,948 | AVG: 75,137,510
MIN: 75,126,803
MAX: 75,159,685 | | cross-sha3-r-sdpg-3-fast (10 executions) | ref | AVG: 627,948
MIN: 625,525
MAX: 637,639 | AVG: 43,573,841
MIN: 43,565,461
MAX: 43,582,933 | AVG: 27,513,830
MIN: 27,493,024
MAX: 27,525,746 | | cross-sha3-r-sdpg-5-fast (10 executions) | ref | AVG: 1,146,280
MIN: 1,142,409
MAX: 1,153,794 | AVG: 93,557,878
MIN: 93,547,167
MAX: 93,566,329 | AVG: 59,948,216
MIN: 59,857,434
MAX: 60,043,852 | -| dilithium2 (90 executions) | clean | AVG: 1,873,447
MIN: 1,838,554
MAX: 1,903,845 | AVG: 7,846,622
MIN: 3,321,671
MAX: 28,761,609 | AVG: 2,062,804
MIN: 2,062,332
MAX: 2,063,181 | -| dilithium2 (100 executions) | m4f | AVG: 1,427,684
MIN: 1,390,524
MAX: 1,466,437 | AVG: 4,219,137
MIN: 1,813,668
MAX: 12,587,382 | AVG: 1,417,706
MIN: 1,417,251
MAX: 1,418,128 | +| dilithium2 (1000 executions) | clean | AVG: 1,874,167
MIN: 1,827,645
MAX: 1,914,566 | AVG: 7,493,877
MIN: 3,321,630
MAX: 40,762,756 | AVG: 2,062,795
MIN: 2,062,255
MAX: 2,063,222 | +| dilithium2 (1000 executions) | m4f | AVG: 1,426,036
MIN: 1,379,636
MAX: 1,466,394 | AVG: 3,807,970
MIN: 1,813,656
MAX: 18,528,070 | AVG: 1,417,745
MIN: 1,417,203
MAX: 1,418,192 | +| dilithium2 (1000 executions) | m4fstack | AVG: 1,801,523
MIN: 1,684,895
MAX: 1,902,114 | AVG: 12,170,976
MIN: 3,900,911
MAX: 86,281,518 | AVG: 3,241,353
MIN: 3,194,028
MAX: 3,281,144 | | dilithium3 (1000 executions) | clean | AVG: 3,205,551
MIN: 3,204,090
MAX: 3,207,411 | AVG: 12,696,585
MIN: 5,097,364
MAX: 74,392,293 | AVG: 3,376,992
MIN: 3,376,581
MAX: 3,377,393 | | dilithium3 (1000 executions) | m4f | AVG: 2,515,969
MIN: 2,514,498
MAX: 2,517,634 | AVG: 5,884,832
MIN: 2,917,322
MAX: 25,268,693 | AVG: 2,411,257
MIN: 2,410,858
MAX: 2,411,717 | -| dilithium5 (90 executions) | clean | AVG: 5,346,066
MIN: 5,287,239
MAX: 5,395,626 | AVG: 15,205,929
MIN: 7,953,360
MAX: 49,173,429 | AVG: 5,609,664
MIN: 5,609,137
MAX: 5,610,119 | -| dilithium5 (100 executions) | m4f | AVG: 4,273,211
MIN: 4,210,308
MAX: 4,329,697 | AVG: 8,062,110
MIN: 4,882,708
MAX: 18,398,575 | AVG: 4,185,407
MIN: 4,184,878
MAX: 4,185,954 | +| dilithium3 (1000 executions) | m4fstack | AVG: 3,412,759
MIN: 3,406,659
MAX: 3,419,247 | AVG: 23,673,016
MIN: 6,733,971
MAX: 145,803,146 | AVG: 5,733,307
MIN: 5,688,893
MAX: 5,778,120 | +| dilithium5 (1000 executions) | clean | AVG: 5,341,477
MIN: 5,286,872
MAX: 5,395,822 | AVG: 15,710,371
MIN: 7,953,367
MAX: 75,940,093 | AVG: 5,609,679
MIN: 5,609,217
MAX: 5,610,183 | +| dilithium5 (1000 executions) | m4f | AVG: 4,275,029
MIN: 4,210,286
MAX: 4,329,519 | AVG: 7,977,781
MIN: 4,882,524
MAX: 25,936,176 | AVG: 4,185,417
MIN: 4,184,925
MAX: 4,185,896 | +| dilithium5 (1000 executions) | m4fstack | AVG: 5,816,287
MIN: 5,474,236
MAX: 6,115,061 | AVG: 33,452,872
MIN: 11,170,780
MAX: 185,259,803 | AVG: 9,912,851
MIN: 9,845,789
MAX: 9,981,834 | | falcon-1024 (10 executions) | m4-ct | AVG: 354,880,005
MIN: 284,902,033
MAX: 635,131,652 | AVG: 87,741,288
MIN: 87,506,676
MAX: 87,922,628 | AVG: 991,320
MIN: 982,548
MAX: 997,219 | | falcon-1024 (10 executions) | opt-ct | AVG: 555,202,324
MIN: 284,912,829
MAX: 1,157,528,581 | AVG: 87,710,190
MIN: 87,606,677
MAX: 87,841,235 | AVG: 993,584
MIN: 983,066
MAX: 997,523 | | falcon-1024 (10 executions) | opt-leaktime | AVG: 438,412,062
MIN: 334,858,742
MAX: 625,013,074 | AVG: 80,139,483
MIN: 79,891,200
MAX: 80,551,967 | AVG: 994,127
MIN: 984,891
MAX: 997,390 | @@ -194,11 +197,14 @@ | cross-sha3-r-sdpg-3-fast | ref | 4,032 | 205,080 | 108,236 | | cross-sha3-r-sdpg-5-fast | ref | 6,824 | 398,600 | 213,436 | | dilithium2 | clean | 38,304 | 51,968 | 36,192 | -| dilithium2 | m4f | 38,296 | 49,416 | 36,184 | +| dilithium2 | m4f | 38,296 | 49,416 | 36,220 | +| dilithium2 | m4fstack | 4,408 | 5,072 | 2,704 | | dilithium3 | clean | 60,832 | 79,616 | 57,728 | | dilithium3 | m4f | 60,824 | 68,864 | 57,720 | +| dilithium3 | m4fstack | 4,408 | 6,608 | 2,704 | | dilithium5 | clean | 97,696 | 122,724 | 92,940 | -| dilithium5 | m4f | 97,688 | 116,076 | 92,824 | +| dilithium5 | m4f | 97,688 | 116,076 | 92,932 | +| dilithium5 | m4fstack | 4,408 | 8,136 | 2,712 | | falcon-1024 | clean | 34,988 | 84,604 | 8,784 | | falcon-1024 | m4-ct | 1,156 | 2,508 | 376 | | falcon-1024 | opt-ct | 1,156 | 2,508 | 376 | @@ -345,12 +351,15 @@ | cross-sha3-r-sdpg-1-small | ref | 71.8% | 74.7% | 78.4% | | cross-sha3-r-sdpg-3-fast | ref | 71.7% | 68.2% | 68.7% | | cross-sha3-r-sdpg-5-fast | ref | 71.1% | 66.1% | 66.8% | -| dilithium2 | clean | 60.9% | 30.2% | 52.9% | -| dilithium2 | m4f | 79.9% | 62.2% | 76.8% | +| dilithium2 | clean | 61.0% | 30.9% | 52.9% | +| dilithium2 | m4f | 79.9% | 60.6% | 76.8% | +| dilithium2 | m4fstack | 74.8% | 55.2% | 40.8% | | dilithium3 | clean | 64.7% | 31.3% | 56.8% | -| dilithium3 | m4f | 82.3% | 60.3% | 79.4% | -| dilithium5 | clean | 67.0% | 38.4% | 61.1% | -| dilithium5 | m4f | 83.4% | 63.5% | 81.7% | +| dilithium3 | m4f | 82.3% | 61.4% | 79.4% | +| dilithium3 | m4fstack | 77.1% | 54.6% | 41.0% | +| dilithium5 | clean | 67.0% | 35.7% | 61.1% | +| dilithium5 | m4f | 83.5% | 65.0% | 81.7% | +| dilithium5 | m4fstack | 76.1% | 54.5% | 42.6% | | falcon-1024 | clean | 6.5% | 0.3% | 23.7% | | falcon-1024 | m4-ct | 7.4% | 0.4% | 32.4% | | falcon-1024 | opt-ct | 11.7% | 0.4% | 32.2% | @@ -498,10 +507,13 @@ | cross-sha3-r-sdpg-5-fast | ref | 18,593 | 0 | 208 | 18,801 | | dilithium2 | clean | 8,064 | 0 | 0 | 8,064 | | 
dilithium2 | m4f | 18,596 | 0 | 0 | 18,596 | +| dilithium2 | m4fstack | 24,184 | 0 | 0 | 24,184 | | dilithium3 | clean | 7,580 | 0 | 0 | 7,580 | | dilithium3 | m4f | 18,588 | 0 | 0 | 18,588 | +| dilithium3 | m4fstack | 23,448 | 0 | 0 | 23,448 | | dilithium5 | clean | 7,808 | 0 | 0 | 7,808 | | dilithium5 | m4f | 18,468 | 0 | 0 | 18,468 | +| dilithium5 | m4fstack | 23,820 | 0 | 0 | 23,820 | | falcon-1024 | clean | 82,647 | 0 | 0 | 82,647 | | falcon-1024 | m4-ct | 81,825 | 0 | 79,872 | 161,697 | | falcon-1024 | opt-ct | 81,825 | 0 | 79,872 | 161,697 |