From 9c2bc412b0b2537437cd56ff3dc5a29c7a048480 Mon Sep 17 00:00:00 2001 From: Huang Junhao <1561868283@qq.com> Date: Mon, 15 Apr 2024 15:30:22 +0800 Subject: [PATCH] Revisiting Keccak and Dilithium Implementations on ARMv7-M (#338) * Use Plantard arithmetic for NTT_769 in Dilithium * rm old smallntt.S * update benchmarks --------- Co-authored-by: Matthias J. Kannwischer --- benchmarks.csv | 10 +- benchmarks.md | 10 +- crypto_sign/dilithium3/m4f/macros_fnt.i | 1 - crypto_sign/dilithium3/m4f/macros_smallntt.i | 98 +++ crypto_sign/dilithium3/m4f/smallntt.S | 837 ------------------- crypto_sign/dilithium3/m4f/smallntt.h | 85 +- crypto_sign/dilithium3/m4f/smallntt_769.S | 681 +++++++++++++++ 7 files changed, 829 insertions(+), 893 deletions(-) delete mode 120000 crypto_sign/dilithium3/m4f/macros_fnt.i create mode 100644 crypto_sign/dilithium3/m4f/macros_smallntt.i delete mode 100644 crypto_sign/dilithium3/m4f/smallntt.S create mode 100644 crypto_sign/dilithium3/m4f/smallntt_769.S diff --git a/benchmarks.csv b/benchmarks.csv index 9270fca8..981accb6 100644 --- a/benchmarks.csv +++ b/benchmarks.csv @@ -44,8 +44,8 @@ cross-sha3-r-sdpg-3-fast (10 executions),ref,627948,625525,637639,43573841,43565 cross-sha3-r-sdpg-5-fast (10 executions),ref,1146280,1142409,1153794,93557878,93547167,93566329,59948216,59857434,60043852 dilithium2 (90 executions),clean,1873447,1838554,1903845,7846622,3321671,28761609,2062804,2062332,2063181 dilithium2 (100 executions),m4f,1427684,1390524,1466437,4219137,1813668,12587382,1417706,1417251,1418128 -dilithium3 (90 executions),clean,3205542,3204354,3206592,12108503,5097440,50759276,3377010,3376729,3377395 -dilithium3 (100 executions),m4f,2515970,2514894,2516922,5896583,2935265,23718896,2411234,2410948,2411551 +dilithium3 (1000 executions),clean,3205551,3204090,3207411,12696585,5097364,74392293,3376992,3376581,3377393 +dilithium3 (1000 executions),m4f,2515969,2514498,2517634,5884832,2917322,25268693,2411257,2410858,2411717 dilithium5 (90 executions),clean,5346066,5287239,5395626,15205929,7953360,49173429,5609664,5609137,5610119 dilithium5 (100 executions),m4f,4273211,4210308,4329697,8062110,4882708,18398575,4185407,4184878,4185954 falcon-1024 (10 executions),m4-ct,354880005,284902033,635131652,87741288,87506676,87922628,991320,982548,997219 @@ -341,8 +341,8 @@ cross-sha3-r-sdpg-3-fast,ref,71.7,68.2,68.7,,,,,, cross-sha3-r-sdpg-5-fast,ref,71.1,66.1,66.8,,,,,, dilithium2,clean,60.9,30.2,52.9,,,,,, dilithium2,m4f,79.9,62.2,76.8,,,,,, -dilithium3,clean,64.7,33.8,56.8,,,,,, -dilithium3,m4f,82.3,57.9,79.4,,,,,, +dilithium3,clean,64.7,31.3,56.8,,,,,, +dilithium3,m4f,82.3,60.3,79.4,,,,,, dilithium5,clean,67.0,38.4,61.1,,,,,, dilithium5,m4f,83.4,63.5,81.7,,,,,, falcon-1024,clean,6.5,0.3,23.7,,,,,, @@ -491,7 +491,7 @@ cross-sha3-r-sdpg-5-fast,ref,18593,0,208,18801,,,,, dilithium2,clean,8064,0,0,8064,,,,, dilithium2,m4f,18596,0,0,18596,,,,, dilithium3,clean,7580,0,0,7580,,,,, -dilithium3,m4f,20108,0,0,20108,,,,, +dilithium3,m4f,18588,0,0,18588,,,,, dilithium5,clean,7808,0,0,7808,,,,, dilithium5,m4f,18468,0,0,18468,,,,, falcon-1024,clean,82647,0,0,82647,,,,, diff --git a/benchmarks.md b/benchmarks.md index e2b4a4e3..5574fe2c 100644 --- a/benchmarks.md +++ b/benchmarks.md @@ -46,8 +46,8 @@ | cross-sha3-r-sdpg-5-fast (10 executions) | ref | AVG: 1,146,280
MIN: 1,142,409
MAX: 1,153,794 | AVG: 93,557,878
MIN: 93,547,167
MAX: 93,566,329 | AVG: 59,948,216
MIN: 59,857,434
MAX: 60,043,852 | | dilithium2 (90 executions) | clean | AVG: 1,873,447
MIN: 1,838,554
MAX: 1,903,845 | AVG: 7,846,622
MIN: 3,321,671
MAX: 28,761,609 | AVG: 2,062,804
MIN: 2,062,332
MAX: 2,063,181 | | dilithium2 (100 executions) | m4f | AVG: 1,427,684
MIN: 1,390,524
MAX: 1,466,437 | AVG: 4,219,137
MIN: 1,813,668
MAX: 12,587,382 | AVG: 1,417,706
MIN: 1,417,251
MAX: 1,418,128 | -| dilithium3 (90 executions) | clean | AVG: 3,205,542
MIN: 3,204,354
MAX: 3,206,592 | AVG: 12,108,503
MIN: 5,097,440
MAX: 50,759,276 | AVG: 3,377,010
MIN: 3,376,729
MAX: 3,377,395 | -| dilithium3 (100 executions) | m4f | AVG: 2,515,970
MIN: 2,514,894
MAX: 2,516,922 | AVG: 5,896,583
MIN: 2,935,265
MAX: 23,718,896 | AVG: 2,411,234
MIN: 2,410,948
MAX: 2,411,551 | +| dilithium3 (1000 executions) | clean | AVG: 3,205,551
MIN: 3,204,090
MAX: 3,207,411 | AVG: 12,696,585
MIN: 5,097,364
MAX: 74,392,293 | AVG: 3,376,992
MIN: 3,376,581
MAX: 3,377,393 | +| dilithium3 (1000 executions) | m4f | AVG: 2,515,969
MIN: 2,514,498
MAX: 2,517,634 | AVG: 5,884,832
MIN: 2,917,322
MAX: 25,268,693 | AVG: 2,411,257
MIN: 2,410,858
MAX: 2,411,717 | | dilithium5 (90 executions) | clean | AVG: 5,346,066
MIN: 5,287,239
MAX: 5,395,626 | AVG: 15,205,929
MIN: 7,953,360
MAX: 49,173,429 | AVG: 5,609,664
MIN: 5,609,137
MAX: 5,610,119 | | dilithium5 (100 executions) | m4f | AVG: 4,273,211
MIN: 4,210,308
MAX: 4,329,697 | AVG: 8,062,110
MIN: 4,882,708
MAX: 18,398,575 | AVG: 4,185,407
MIN: 4,184,878
MAX: 4,185,954 | | falcon-1024 (10 executions) | m4-ct | AVG: 354,880,005
MIN: 284,902,033
MAX: 635,131,652 | AVG: 87,741,288
MIN: 87,506,676
MAX: 87,922,628 | AVG: 991,320
MIN: 982,548
MAX: 997,219 | @@ -347,8 +347,8 @@ | cross-sha3-r-sdpg-5-fast | ref | 71.1% | 66.1% | 66.8% | | dilithium2 | clean | 60.9% | 30.2% | 52.9% | | dilithium2 | m4f | 79.9% | 62.2% | 76.8% | -| dilithium3 | clean | 64.7% | 33.8% | 56.8% | -| dilithium3 | m4f | 82.3% | 57.9% | 79.4% | +| dilithium3 | clean | 64.7% | 31.3% | 56.8% | +| dilithium3 | m4f | 82.3% | 60.3% | 79.4% | | dilithium5 | clean | 67.0% | 38.4% | 61.1% | | dilithium5 | m4f | 83.4% | 63.5% | 81.7% | | falcon-1024 | clean | 6.5% | 0.3% | 23.7% | @@ -499,7 +499,7 @@ | dilithium2 | clean | 8,064 | 0 | 0 | 8,064 | | dilithium2 | m4f | 18,596 | 0 | 0 | 18,596 | | dilithium3 | clean | 7,580 | 0 | 0 | 7,580 | -| dilithium3 | m4f | 20,108 | 0 | 0 | 20,108 | +| dilithium3 | m4f | 18,588 | 0 | 0 | 18,588 | | dilithium5 | clean | 7,808 | 0 | 0 | 7,808 | | dilithium5 | m4f | 18,468 | 0 | 0 | 18,468 | | falcon-1024 | clean | 82,647 | 0 | 0 | 82,647 | diff --git a/crypto_sign/dilithium3/m4f/macros_fnt.i b/crypto_sign/dilithium3/m4f/macros_fnt.i deleted file mode 120000 index 1abff093..00000000 --- a/crypto_sign/dilithium3/m4f/macros_fnt.i +++ /dev/null @@ -1 +0,0 @@ -../../dilithium2/m4f/macros_fnt.i \ No newline at end of file diff --git a/crypto_sign/dilithium3/m4f/macros_smallntt.i b/crypto_sign/dilithium3/m4f/macros_smallntt.i new file mode 100644 index 00000000..61b63241 --- /dev/null +++ b/crypto_sign/dilithium3/m4f/macros_smallntt.i @@ -0,0 +1,98 @@ +/** + * Copyright (c) 2023 Junhao Huang (jhhuang_nuaa@126.com) + * + * Licensed under the Apache License, Version 2.0(the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef MACROS_SMALLNTT_I +#define MACROS_SMALLNTT_I + +// general macros +.macro load a, a0, a1, a2, a3, mem0, mem1, mem2, mem3 + ldr.w \a0, [\a, \mem0] + ldr.w \a1, [\a, \mem1] + ldr.w \a2, [\a, \mem2] + ldr.w \a3, [\a, \mem3] +.endm + +.macro store a, a0, a1, a2, a3, mem0, mem1, mem2, mem3 + str.w \a0, [\a, \mem0] + str.w \a1, [\a, \mem1] + str.w \a2, [\a, \mem2] + str.w \a3, [\a, \mem3] +.endm + +.macro doubleplant a, tmp, q, qa, plantconst + smulwb \tmp, \plantconst, \a + smulwt \a, \plantconst, \a + smlabt \tmp, \tmp, \q, \qa + smlabt \a, \a, \q, \qa + pkhtb \a, \a, \tmp, asr#16 +.endm + +.macro doublebarrett a, tmp, tmp2, q, barrettconst + smulbb \tmp, \a, \barrettconst + smultb \tmp2, \a, \barrettconst + asr \tmp, \tmp, #26 + asr \tmp2, \tmp2, #26 + smulbb \tmp, \tmp, \q + smulbb \tmp2, \tmp2, \q + pkhbt \tmp, \tmp, \tmp2, lsl#16 + usub16 \a, \a, \tmp +.endm + +// q locate in the top half of the register +.macro plant_red q, qa, qinv, tmp + mul \tmp, \tmp, \qinv + //tmp*qinv mod 2^2n/ 2^n; in high half + smlatt \tmp, \tmp, \q, \qa + // result in high half +.endm + +.macro mul_twiddle_plant a, twiddle, tmp, q, qa + smulwb \tmp, \twiddle, \a + smulwt \a, \twiddle, \a + smlabt \tmp, \tmp, \q, \qa + smlabt \a, \a, \q, \qa + pkhtb \a, \a, \tmp, asr#16 +.endm + +.macro doublebutterfly_plant a0, a1, twiddle, tmp, q, qa + smulwb \tmp, \twiddle, \a1 + smulwt \a1, \twiddle, \a1 + smlabt \tmp, \tmp, \q, \qa + smlabt \a1, \a1, \q, \qa + pkhtb \tmp, \a1, \tmp, asr#16 + usub16 \a1, \a0, \tmp + uadd16 \a0, \a0, \tmp +.endm + +.macro two_doublebutterfly_plant a0, a1, a2, a3, twiddle0, twiddle1, tmp, q, qa + doublebutterfly_plant \a0, \a1, \twiddle0, \tmp, \q, \qa + doublebutterfly_plant \a2, \a3, \twiddle1, \tmp, \q, \qa +.endm + +//For 3329 +.macro fullplant a0, a1, a2, a3, a4, a5, a6, a7, tmp, q, qa, plantconst + movw \plantconst, #44984 + movt \plantconst, #19 + doubleplant \a0, \tmp, \q, \qa, \plantconst + doubleplant \a1, \tmp, \q, \qa, \plantconst + doubleplant \a2, \tmp, \q, \qa, \plantconst + doubleplant \a3, \tmp, \q, \qa, \plantconst + doubleplant \a4, \tmp, \q, \qa, \plantconst + doubleplant \a5, \tmp, \q, \qa, \plantconst + doubleplant \a6, \tmp, \q, \qa, \plantconst + doubleplant \a7, \tmp, \q, \qa, \plantconst +.endm + +#endif \ No newline at end of file diff --git a/crypto_sign/dilithium3/m4f/smallntt.S b/crypto_sign/dilithium3/m4f/smallntt.S deleted file mode 100644 index 747c111c..00000000 --- a/crypto_sign/dilithium3/m4f/smallntt.S +++ /dev/null @@ -1,837 +0,0 @@ -#include "macros.i" - -.syntax unified -.cpu cortex-m4 -.thumb - -// general macros -.macro load a, a0, a1, a2, a3, mem0, mem1, mem2, mem3 - ldr.w \a0, [\a, \mem0] - ldr.w \a1, [\a, \mem1] - ldr.w \a2, [\a, \mem2] - ldr.w \a3, [\a, \mem3] -.endm - -.macro store a, a0, a1, a2, a3, mem0, mem1, mem2, mem3 - str.w \a0, [\a, \mem0] - str.w \a1, [\a, \mem1] - str.w \a2, [\a, \mem2] - str.w \a3, [\a, \mem3] -.endm - -.macro montgomery q, qinv, a, tmp - smulbt \tmp, \a, \qinv - smlabb \tmp, \q, \tmp, \a -.endm - -.macro montgomery_inplace q, qinv, a, tmp - smulbt \tmp, \a, \qinv - smlabb \a, \q, \tmp, \a -.endm - -.macro doublemontgomery a, tmp, tmp2, q, qinv, montconst - smulbb \tmp2, \a, \montconst - montgomery \q, \qinv, \tmp2, \tmp - smultb \a, \a, \montconst - montgomery \q, \qinv, \a, \tmp2 - pkhtb \a, \tmp2, \tmp, asr#16 -.endm - -// ####### -// ####### -// # NTT # -// ####### -// ####### - -.macro mul_twiddle tb, a, twiddle, tmp, tmp2, q, qinv - smulb\tb \tmp, \a, \twiddle - smult\tb \a, \a, \twiddle - montgomery 
\q, \qinv, \tmp, \tmp2 // reduce -> result in tmp2 - montgomery \q, \qinv, \a, \tmp // reduce -> result in tmp2 - pkhtb \a, \tmp, \tmp2, asr#16 // combine results from above in one register as 16bit halves -.endm - -.macro doublebutterfly tb, a0, a1, twiddle, tmp, tmp2, q, qinv - smulb\tb \tmp, \a1, \twiddle // a1_b * twiddle_tb - smult\tb \a1, \a1, \twiddle // a1_t * twiddle_tb - montgomery \q, \qinv, \tmp, \tmp2 // reduce -> result in tmp2 - montgomery \q, \qinv, \a1, \tmp // reduce -> result in tmp - pkhtb \tmp2, \tmp, \tmp2, asr#16 // combine results from above in one register as 16bit halves - usub16 \a1, \a0, \tmp2 // a0 - a1 * twiddle (a0, a1 contain 2 coeffs) - uadd16 \a0, \a0, \tmp2 // a0 + a1 * twiddle (a0, a1 contain 2 coeffs) -.endm - -.macro two_doublebutterfly tb1, tb2, a0, a1, a2, a3, twiddle, tmp, tmp2, q, qinv - doublebutterfly \tb1, \a0, \a1, \twiddle, \tmp, \tmp2, \q, \qinv - doublebutterfly \tb2, \a2, \a3, \twiddle, \tmp, \tmp2, \q, \qinv -.endm - -.macro _3_layer_double_CT_16 c0, c1, c2, c3, c4, c5, c6, c7, twiddle, twiddle_ptr, Qprime, Q, tmp, tmp2 - // layer 3 - ldrh.w \twiddle, [\twiddle_ptr], #2 - two_doublebutterfly b, b, \c0, \c4, \c1, \c5, \twiddle, \tmp, \tmp2, \Q, \Qprime - two_doublebutterfly b, b, \c2, \c6, \c3, \c7, \twiddle, \tmp, \tmp2, \Q, \Qprime - - // layer 2 - ldr.w \twiddle, [\twiddle_ptr], #4 - two_doublebutterfly b, b, \c0, \c2, \c1, \c3, \twiddle, \tmp, \tmp2, \Q, \Qprime - - two_doublebutterfly t, t, \c4, \c6, \c5, \c7, \twiddle, \tmp, \tmp2, \Q, \Qprime - - // layer 1 - ldr.w \twiddle, [\twiddle_ptr], #4 - two_doublebutterfly b, t, \c0, \c1, \c2, \c3, \twiddle, \tmp, \tmp2, \Q, \Qprime - - ldr.w \twiddle, [\twiddle_ptr], #4 - two_doublebutterfly b, t, \c4, \c5, \c6, \c7, \twiddle, \tmp, \tmp2, \Q, \Qprime -.endm - -.macro _3_layer_double_CT_16_fp c0, c1, c2, c3, c4, c5, c6, c7, xi01, xi23, xi45, xi67, twiddle, Qprime, Q, tmp, tmp2 - // layer 3 - vmov \twiddle, \xi01 - two_doublebutterfly t, t, \c0, \c4, \c1, \c5, \twiddle, \tmp, \tmp2, \Q, \Qprime - two_doublebutterfly t, t, \c2, \c6, \c3, \c7, \twiddle, \tmp, \tmp2, \Q, \Qprime - - // layer 2 - vmov \twiddle, \xi23 - two_doublebutterfly b, b, \c0, \c2, \c1, \c3, \twiddle, \tmp, \tmp2, \Q, \Qprime - - two_doublebutterfly t, t, \c4, \c6, \c5, \c7, \twiddle, \tmp, \tmp2, \Q, \Qprime - - // layer 1 - vmov \twiddle, \xi45 - two_doublebutterfly b, t, \c0, \c1, \c2, \c3, \twiddle, \tmp, \tmp2, \Q, \Qprime - - vmov \twiddle, \xi67 - two_doublebutterfly b, t, \c4, \c5, \c6, \c7, \twiddle, \tmp, \tmp2, \Q, \Qprime -.endm - -.global small_ntt_asm -.type small_ntt_asm, %function -.align 2 -small_ntt_asm: - push {r4-r11, r14} - vpush.w {s16} - - poly .req r0 - twiddle_ptr .req r1 - poly0 .req r2 - poly1 .req r3 - poly2 .req r4 - poly3 .req r5 - poly4 .req r6 - poly5 .req r7 - poly6 .req r8 - poly7 .req r9 - twiddle .req r10 - qinv .req r11 - q .req r11 - tmp .req r12 - tmp2 .req r14 - - movw q, #769 - movt qinv, #767 - - ### LAYER 7+6+5+4 - .equ distance, 256 - .equ offset, 32 - .equ strincr, 4 - // pre-load twiddle factors to FPU registers - vldm twiddle_ptr!, {s8-s15} - - - add tmp, poly, #strincr*8 - vmov s16, tmp - 1: - // load a1, a3, ..., a15 - load poly, poly0, poly1, poly2, poly3, #offset, #distance/4+offset, #2*distance/4+offset, #3*distance/4+offset - load poly, poly4, poly5, poly6, poly7, #distance+offset, #5*distance/4+offset, #6*distance/4+offset, #7*distance/4+offset - - // 8-NTT on a1, a3, ..., a15 - _3_layer_double_CT_16_fp poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s8, s9, 
s10, s11, twiddle, qinv, q, tmp, tmp2 - - // multiply coeffs by layer 4 twiddles for later use - vmov twiddle, s12 - mul_twiddle b, poly0, twiddle, tmp, tmp2, q, qinv - mul_twiddle t, poly1, twiddle, tmp, tmp2, q, qinv - - vmov twiddle, s13 - mul_twiddle b, poly2, twiddle, tmp, tmp2, q, qinv - mul_twiddle t, poly3, twiddle, tmp, tmp2, q, qinv - - vmov twiddle, s14 - mul_twiddle b, poly4, twiddle, tmp, tmp2, q, qinv - mul_twiddle t, poly5, twiddle, tmp, tmp2, q, qinv - - vmov twiddle, s15 - mul_twiddle b, poly6, twiddle, tmp, tmp2, q, qinv - mul_twiddle t, poly7, twiddle, tmp, tmp2, q, qinv - - vmov s0, poly0 // a1 - vmov s1, poly1 // a3 - vmov s2, poly2 // a5 - vmov s3, poly3 // a7 - vmov s4, poly4 // a9 - vmov s5, poly5 // a11 - vmov s6, poly6 // a13 - vmov s7, poly7 // a15 - - // ---------- - - // load a0, a2, ..., a14 - load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4 - load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 - - // 8-NTT on a0, a2, ..., a14 - _3_layer_double_CT_16_fp poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s8, s9, s10, s11, twiddle, qinv, q, tmp, tmp2 - - // layer 4 - 1 - // addsub: (a2, a6, a10, a14), (a3, a7, a11, a15) - vmov tmp2, s1 // load a3 - vmov s1, poly0 // preserve a0 - uadd16 poly0, poly1, tmp2 - usub16 poly1, poly1, tmp2 - - vmov tmp2, s3 // load a7 - vmov s3, poly2 // preserve a4 - uadd16 poly2, poly3, tmp2 - usub16 poly3, poly3, tmp2 - - vmov tmp2, s5 // load a11 - vmov s5, poly4 // preserve a8 - uadd16 poly4, poly5, tmp2 - usub16 poly5, poly5, tmp2 - - vmov tmp2, s7 // load a15 - vmov s7, poly6 // preserve a12 - uadd16 poly6, poly7, tmp2 - usub16 poly7, poly7, tmp2 - - str.w poly0, [poly, #1*distance/4] - str.w poly1, [poly, #1*distance/4+offset] - str.w poly2, [poly, #3*distance/4] - str.w poly3, [poly, #3*distance/4+offset] - str.w poly4, [poly, #5*distance/4] - str.w poly5, [poly, #5*distance/4+offset] - str.w poly6, [poly, #7*distance/4] - str.w poly7, [poly, #7*distance/4+offset] - - // layer 4 - 2 - // addsub: (a0, a4, a8, a12), (a1, a5, a9, a13) - vmov tmp2, s1 // load a0 - vmov poly1, s0 // load a1 - uadd16 poly0, tmp2, poly1 - usub16 poly1, tmp2, poly1 - - vmov tmp2, s3 // load a4 - vmov poly3, s2 // load a5 - uadd16 poly2, tmp2, poly3 - usub16 poly3, tmp2, poly3 - - vmov tmp2, s5 // load a8 - vmov poly5, s4 // load a9 - uadd16 poly4, tmp2, poly5 - usub16 poly5, tmp2, poly5 - - vmov tmp2, s7 // load a12 - vmov poly7, s6 // load a13 - uadd16 poly6, tmp2, poly7 - usub16 poly7, tmp2, poly7 - - str.w poly1, [poly, #offset] - str.w poly2, [poly, #2*distance/4] - str.w poly3, [poly, #2*distance/4+offset] - str.w poly4, [poly, #4*distance/4] - str.w poly5, [poly, #4*distance/4+offset] - str.w poly6, [poly, #6*distance/4] - str.w poly7, [poly, #6*distance/4+offset] - str.w poly0, [poly], #4 - - vmov tmp, s16 - cmp.w poly, tmp - bne.w 1b - - sub.w poly, #8*strincr - - ### LAYER 3+2+1 - - .equ distance, distance/16 - .equ strincr, 32 - - add.w tmp, poly, #strincr*16 - vmov s13, tmp - - 2: - load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4 - load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 - - _3_layer_double_CT_16 poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle, twiddle_ptr, qinv, q, tmp, tmp2 - - store poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 - str.w poly1, [poly, #distance/4] - str.w poly2, [poly, #2*distance/4] - str.w 
poly3, [poly, #3*distance/4] - str.w poly0, [poly], #strincr - - vmov tmp, s13 - cmp.w poly, tmp - bne.w 2b - - vpop.w {s16} - pop {r4-r11, pc} - - -.unreq poly -.unreq twiddle_ptr -.unreq poly0 -.unreq poly1 -.unreq poly2 -.unreq poly3 -.unreq poly4 -.unreq poly5 -.unreq poly6 -.unreq poly7 -.unreq twiddle -.unreq qinv -.unreq q -.unreq tmp -.unreq tmp2 - -// ######## -// ######## -// # INTT # -// ######## -// ######## - -.macro doublebutterfly_light a0, a1, tmp, tmp2, q, qinv - uadd16 \tmp, \a0, \a1 - usub16 \a1, \a0, \a1 - mov.w \a0, \tmp -.endm - -.macro two_doublebutterfly_light a0, a1, a2, a3, tmp, tmp2, q, qinv - doublebutterfly_light \a0, \a1, \tmp, \tmp2, \q, \qinv - doublebutterfly_light \a2, \a3, \tmp, \tmp2, \q, \qinv -.endm - -.macro _3_layer_double_inv_CT_16_light c0, c1, c2, c3, c4, c5, c6, c7, xi0, xi12, xi34, xi56, twiddle, q, qinv, tmp, tmp2 - - // layer 1 - sadd16.w \tmp, \c0, \c1 // c0, c1 - ssub16.w \c1, \c0, \c1 - sadd16.w \tmp2, \c2, \c3 // c2, c3 - ssub16.w \c3, \c2, \c3 - - sadd16.w \c0, \c4, \c5 // c4, c5 - ssub16.w \c5, \c4, \c5 - sadd16.w \c2, \c6, \c7 // c6, c7 - ssub16.w \c7, \c6, \c7 - // c4, c6 are free at this point - - // layer 2 - sadd16.w \c6, \tmp, \tmp2 // c0, c2 - ssub16.w \tmp2, \tmp, \tmp2 - sadd16.w \c4, \c0, \c2 // c4, c6 - ssub16.w \c2, \c0, \c2 - - vmov.w \twiddle, \xi12 - doublebutterfly t, \c1, \c3, \twiddle, \tmp, \c0, \q, \qinv // c0 has been used and c6 still free - doublebutterfly t, \c5, \c7, \twiddle, \tmp, \c0, \q, \qinv - // c0, c6 are free at this point - - // layer 3 - sadd16.w \c0, \c6, \c4 // c0, c4 - ssub16.w \c4, \c6, \c4 - - vmov.w \twiddle, \xi34 - doublebutterfly t, \c1, \c5, \twiddle, \tmp, \c6, \q, \qinv - - vmov.w \twiddle, \xi56 - // this block is one doublebutterfly - smulbb \tmp, \c2, \twiddle // c2, c6 - smultb \c2, \c2, \twiddle - montgomery_inplace \q, \qinv, \tmp, \c6 - montgomery_inplace \q, \qinv, \c2, \c6 - pkhtb \tmp, \c2, \tmp, asr #16 - ssub16.w \c6, \tmp2, \tmp - sadd16.w \c2, \tmp2, \tmp - - doublebutterfly t, \c3, \c7, \twiddle, \tmp, \tmp2, \q, \qinv - -.endm - -.macro _3_layer_double_inv_CT_16_light_reduce c0, c1, c2, c3, c4, c5, c6, c7, xi0, xi12, xi34, xi56, twiddle, q, qinv, tmp, tmp2 - - // layer 1 - sadd16.w \tmp, \c0, \c1 // c0, c1 - ssub16.w \c1, \c0, \c1 - sadd16.w \tmp2, \c2, \c3 // c2, c3 - ssub16.w \c3, \c2, \c3 - - sadd16.w \c0, \c4, \c5 // c4, c5 - ssub16.w \c5, \c4, \c5 - sadd16.w \c2, \c6, \c7 // c6, c7 - ssub16.w \c7, \c6, \c7 - // c4, c6 are free at this point - - mov.w \c6, \tmp - mov.w \c4, \c0 - - // layer 2 - vmov.w \twiddle, \xi12 - doublebutterfly b, \c6, \tmp2, \twiddle, \tmp, \c0, \q, \qinv - doublebutterfly b, \c4, \c2, \twiddle, \tmp, \c0, \q, \qinv - doublebutterfly t, \c1, \c3, \twiddle, \tmp, \c0, \q, \qinv // c0 has been used and c6 still free - doublebutterfly t, \c5, \c7, \twiddle, \tmp, \c0, \q, \qinv - // c0, c6 are free at this point - - // layer 3 - sadd16.w \c0, \c6, \c4 // c0, c4 - ssub16.w \c4, \c6, \c4 - - vmov.w \twiddle, \xi34 - doublebutterfly t, \c1, \c5, \twiddle, \tmp, \c6, \q, \qinv - - vmov.w \twiddle, \xi56 - // this block is one doublebutterfly - smulbb \tmp, \c2, \twiddle // c2, c6 - smultb \c2, \c2, \twiddle - montgomery_inplace \q, \qinv, \tmp, \c6 - montgomery_inplace \q, \qinv, \c2, \c6 - pkhtb \tmp, \c2, \tmp, asr #16 - ssub16.w \c6, \tmp2, \tmp - sadd16.w \c2, \tmp2, \tmp - - doublebutterfly t, \c3, \c7, \twiddle, \tmp, \tmp2, \q, \qinv - -.endm - -.macro _3_layer_double_inv_CT_16 c0, c1, c2, c3, c4, c5, c6, c7, twiddle, twiddle_ptr, Qprime, Q, tmp, 
tmp2 - // layer 3 - ldrh.w twiddle, [twiddle_ptr], #2 - two_doublebutterfly b, b, \c0, \c1, \c2, \c3, \twiddle, \tmp, \tmp2, \Q, \Qprime - two_doublebutterfly b, b, \c4, \c5, \c6, \c7, \twiddle, \tmp, \tmp2, \Q, \Qprime - - // layer 2 - ldr.w twiddle, [twiddle_ptr], #4 - two_doublebutterfly b, t, \c0, \c2, \c1, \c3, \twiddle, \tmp, \tmp2, \Q, \Qprime - - two_doublebutterfly b, t, \c4, \c6, \c5, \c7, \twiddle, \tmp, \tmp2, \Q, \Qprime - - // layer 1 - ldr.w twiddle, [twiddle_ptr], #4 - two_doublebutterfly b, t, \c0, \c4, \c1, \c5, \twiddle, \tmp, \tmp2, \Q, \Qprime - - ldr.w twiddle, [twiddle_ptr], #4 - two_doublebutterfly b, t, \c2, \c6, \c3, \c7, \twiddle, \tmp, \tmp2, \Q, \Qprime -.endm - -.macro mul_twiddle_barrett_32 tb a, twiddle, Qbar, Q, tmp, tmp2 - smulb\tb \tmp, \a, \twiddle - smmulr.w \tmp2, \tmp, \Qbar - mls.w \tmp, \tmp2, \Q, \tmp - smult\tb \a, \a, \twiddle - smmulr.w \tmp2, \a, \Qbar - mls.w \a, \tmp2, \Q, \a - pkhbt \a, \tmp, \a, lsl #16 -.endm - -.macro _3_layer_double_inv_twist_16 c0, c1, c2, c3, c4, c5, c6, c7, twiddle, twiddle_ptr, Qbar, Q, tmp, tmp2 - - movt \Q, #0 - - ldr.w \twiddle, [\twiddle_ptr], #4 - - mul_twiddle_barrett_32 b, \c0, \twiddle, \Qbar, \Q, \tmp, \tmp2 - mul_twiddle_barrett_32 t, \c1, \twiddle, \Qbar, \Q, \tmp, \tmp2 - - ldr.w \twiddle, [\twiddle_ptr], #4 - - mul_twiddle_barrett_32 b, \c2, \twiddle, \Qbar, \Q, \tmp, \tmp2 - mul_twiddle_barrett_32 t, \c3, \twiddle, \Qbar, \Q, \tmp, \tmp2 - - ldr.w \twiddle, [\twiddle_ptr], #4 - - mul_twiddle_barrett_32 b, \c4, \twiddle, \Qbar, \Q, \tmp, \tmp2 - mul_twiddle_barrett_32 t, \c5, \twiddle, \Qbar, \Q, \tmp, \tmp2 - - ldr.w \twiddle, [\twiddle_ptr], #4 - - mul_twiddle_barrett_32 b, \c6, \twiddle, \Qbar, \Q, \tmp, \tmp2 - mul_twiddle_barrett_32 t, \c7, \twiddle, \Qbar, \Q, \tmp, \tmp2 - - movt \Q, #767 - -.endm - -.global small_invntt_tomont_asm -.type small_invntt_tomont_asm, %function -.align 2 -small_invntt_tomont_asm: - push {r4-r11, r14} - - poly .req r0 - twiddle_ptr .req r1 - poly0 .req r2 - poly1 .req r3 - poly2 .req r4 - poly3 .req r5 - poly4 .req r6 - poly5 .req r7 - poly6 .req r8 - poly7 .req r9 - twiddle .req r10 - qinv .req r11 - q .req r11 - tmp .req r12 - tmp2 .req r14 - - movw q, #769 - movt qinv, #767 - - ### LAYER 7+6+5+4 - .equ distance, 16 - .equ offset, 32 - .equ strincr, 64 - - // pre-load twiddle factors to FPU registers - vldm twiddle_ptr!, {s8-s15} - - add.w tmp, poly, #8*strincr - vmov s8, tmp - 1: - // load a1, a3, ..., a15 - load poly, poly0, poly1, poly2, poly3, #offset, #distance/4+offset, #2*distance/4+offset, #3*distance/4+offset - load poly, poly4, poly5, poly6, poly7, #distance+offset, #5*distance/4+offset, #6*distance/4+offset, #7*distance/4+offset - - // NTT on a1, a3, ..., a15 - _3_layer_double_inv_CT_16_light poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s8, s9, s10, s11, twiddle, q, qinv, tmp, tmp2 - - // multiply coeffs by layer 4 twiddles for later use - vmov twiddle, s12 - mul_twiddle b, poly0, twiddle, tmp, tmp2, q, qinv // could be omitted but kept for reduction only - mul_twiddle t, poly1, twiddle, tmp, tmp2, q, qinv - - vmov twiddle, s13 - mul_twiddle b, poly2, twiddle, tmp, tmp2, q, qinv - mul_twiddle t, poly3, twiddle, tmp, tmp2, q, qinv - - vmov twiddle, s14 - mul_twiddle b, poly4, twiddle, tmp, tmp2, q, qinv - mul_twiddle t, poly5, twiddle, tmp, tmp2, q, qinv - - vmov twiddle, s15 - mul_twiddle b, poly6, twiddle, tmp, tmp2, q, qinv - mul_twiddle t, poly7, twiddle, tmp, tmp2, q, qinv - - vmov s0, poly0 // a1 - vmov s1, poly1 // a3 - vmov s2, poly2 // a5 
- vmov s3, poly3 // a7 - vmov s4, poly4 // a9 - vmov s5, poly5 // a11 - vmov s6, poly6 // a13 - vmov s7, poly7 // a15 - - // ---------- - - // load a0, a2, ..., a14 - load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4 - load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 - - // NTT on a0, a2, ..., a14 - _3_layer_double_inv_CT_16_light poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s8, s9, s10, s11, twiddle, q, qinv, tmp, tmp2 - - // layer 4 - 1 - // addsub: (a2, a6, a10, a14), (a3, a7, a11, a15) - vmov tmp2, s1 // load a3 - vmov s1, poly0 // preserve a0 - uadd16 poly0, poly1, tmp2 - usub16 poly1, poly1, tmp2 - - vmov tmp2, s3 // load a7 - vmov s3, poly2 // preserve a4 - uadd16 poly2, poly3, tmp2 - usub16 poly3, poly3, tmp2 - - vmov tmp2, s5 // load a11 - vmov s5, poly4 // preserve a8 - uadd16 poly4, poly5, tmp2 - usub16 poly5, poly5, tmp2 - - vmov tmp2, s7 // load a15 - vmov s7, poly6 // preserve a12 - uadd16 poly6, poly7, tmp2 - usub16 poly7, poly7, tmp2 - - str.w poly0, [poly, #1*distance/4] - str.w poly1, [poly, #1*distance/4+offset] - str.w poly2, [poly, #3*distance/4] - str.w poly3, [poly, #3*distance/4+offset] - str.w poly4, [poly, #5*distance/4] - str.w poly5, [poly, #5*distance/4+offset] - str.w poly6, [poly, #7*distance/4] - str.w poly7, [poly, #7*distance/4+offset] - - // layer 4 - 2 - // addsub: (a0, a4, a8, a12), (a1, a5, a9, a13) - vmov tmp2, s1 // load a0 - vmov poly1, s0 // load a1 - uadd16 poly0, tmp2, poly1 - usub16 poly1, tmp2, poly1 - - vmov tmp2, s3 // load a4 - vmov poly3, s2 // load a5 - uadd16 poly2, tmp2, poly3 - usub16 poly3, tmp2, poly3 - - vmov tmp2, s5 // load a8 - vmov poly5, s4 // load a9 - uadd16 poly4, tmp2, poly5 - usub16 poly5, tmp2, poly5 - - vmov tmp2, s7 // load a12 - vmov poly7, s6 // load a13 - uadd16 poly6, tmp2, poly7 - usub16 poly7, tmp2, poly7 - - str.w poly1, [poly, #offset] - str.w poly2, [poly, #2*distance/4] - str.w poly3, [poly, #2*distance/4+offset] - str.w poly4, [poly, #4*distance/4] - str.w poly5, [poly, #4*distance/4+offset] - str.w poly6, [poly, #6*distance/4] - str.w poly7, [poly, #6*distance/4+offset] - str.w poly0, [poly], #strincr // increase 2*8*4 = 64 (2 * 8 loads of 4 bytes each) - - vmov tmp, s8 - cmp.w poly, tmp - bne.w 1b - - sub.w poly, #8*strincr - - ### LAYER 3+2+1 - .equ distance, distance*16 - .equ strincr, 4 - - // ITER 0 - load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4 - load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 - - vldm twiddle_ptr!, {s5-s7} - - _3_layer_double_inv_CT_16_light_reduce poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s5, s5, s6, s7, twiddle, q, qinv, tmp, tmp2 - - vmov.w s2, poly - movw poly, #:lower16:5585133 - movt poly, #:upper16:5585133 - - // twisting - _3_layer_double_inv_twist_16 poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle, twiddle_ptr, poly, q, tmp, tmp2 - - vmov.w poly, s2 - - store poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 - str.w poly1, [poly, #distance/4] - str.w poly2, [poly, #2*distance/4] - str.w poly3, [poly, #3*distance/4] - str.w poly0, [poly], #4 - - // ITER 1-12 - add.w tmp, poly, #strincr*3*(3+1) - vmov s14, tmp - 3: - add.w tmp, poly, #strincr*3 - vmov s13, tmp - 2: - // polys upto 6q - load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4 - load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, 
#6*distance/4, #7*distance/4 - - - _3_layer_double_inv_CT_16 poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle, twiddle_ptr, qinv, q, tmp, tmp2 - - vmov.w s2, poly - movw poly, #:lower16:5585133 - movt poly, #:upper16:5585133 - - // twisting - _3_layer_double_inv_twist_16 poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle, twiddle_ptr, poly, q, tmp, tmp2 - - vmov.w poly, s2 - - store poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 - str.w poly1, [poly, #distance/4] - str.w poly2, [poly, #2*distance/4] - str.w poly3, [poly, #3*distance/4] - str.w poly0, [poly], #4 - - vmov tmp, s13 - cmp.w poly, tmp - bne.w 2b - - // polys upto 9q - load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4 - load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 - - _3_layer_double_inv_CT_16 poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle, twiddle_ptr, qinv, q, tmp, tmp2 - - vmov.w s2, poly - movw poly, #:lower16:5585133 - movt poly, #:upper16:5585133 - - // twisting - _3_layer_double_inv_twist_16 poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle, twiddle_ptr, poly, q, tmp, tmp2 - - vmov.w poly, s2 - - store poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 - str.w poly1, [poly, #distance/4] - str.w poly2, [poly, #2*distance/4] - str.w poly3, [poly, #3*distance/4] - str.w poly0, [poly], #4 - - vmov tmp, s14 - cmp.w poly, tmp - bne.w 3b - - // ITER 13-15 - add tmp, poly, #3*strincr - vmov s13, tmp - 2: - // polys upto 6q - load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4 - load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 - - _3_layer_double_inv_CT_16 poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle, twiddle_ptr, qinv, q, tmp, tmp2 - - vmov.w s2, poly - movw poly, #:lower16:5585133 - movt poly, #:upper16:5585133 - - // twisting - _3_layer_double_inv_twist_16 poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle, twiddle_ptr, poly, q, tmp, tmp2 - - vmov.w poly, s2 - - store poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 - str.w poly1, [poly, #distance/4] - str.w poly2, [poly, #2*distance/4] - str.w poly3, [poly, #3*distance/4] - str.w poly0, [poly], #strincr - - vmov tmp, s13 - cmp.w poly, tmp - bne.w 2b - - pop {r4-r11, pc} - -.unreq poly -.unreq twiddle_ptr -.unreq poly0 -.unreq poly1 -.unreq poly2 -.unreq poly3 -.unreq poly4 -.unreq poly5 -.unreq poly6 -.unreq poly7 -.unreq twiddle -.unreq qinv -.unreq q -.unreq tmp -.unreq tmp2 - -.align 2 -.global small_pointmul_asm -.type small_pointmul_asm, %function -small_pointmul_asm: - push.w {r4-r11, lr} - - movw r14, #769 - movt r14, #767 - - .equ width, 4 - - add.w r12, r2, #64*2 - _point_mul_16_loop: - - ldr.w r7, [r1, #2*width] - ldr.w r8, [r1, #3*width] - ldrsh.w r9, [r2, #1*2] - ldr.w r5, [r1, #1*width] - ldr.w r4, [r1], #4*width - ldrsh.w r6, [r2], #2*2 - - smultb r10, r4, r6 - montgomery r14, r14, r10, r11 - pkhbt r4, r4, r11 - - - neg.w r6, r6 - - smultb r10, r5, r6 - montgomery r14, r14, r10, r11 - pkhbt r5, r5, r11 - - str.w r5, [r0, #1*width] - str.w r4, [r0], #2*width - - smultb r10, r7, r9 - montgomery r14, r14, r10, r11 - pkhbt r7, r7, r11 - - neg.w r9, r9 - - smultb r10, r8, r9 - montgomery r14, r14, r10, r11 - pkhbt r8, r8, r11 - - str.w r8, [r0, #1*width] - str.w r7, [r0], #2*width - - cmp.w r2, r12 - 
bne.w _point_mul_16_loop - - pop.w {r4-r11, pc} - - .align 2 -.global small_asymmetric_mul_asm -.type small_asymmetric_mul_asm, %function -small_asymmetric_mul_asm: - push.w {r4-r11, lr} - - movw r14, #769 - movt r14, #767 - .equ width, 4 - add.w r12, r0, #256*2 - _asymmetric_mul_16_loop: - ldr.w r7, [r1, #width] - ldr.w r4, [r1], #2*width - ldr.w r8, [r2, #width] - ldr.w r5, [r2], #2*width - ldr.w r9, [r3, #width] - ldr.w r6, [r3], #2*width - - smuad r10, r4, r6 - montgomery r14, r14, r10, r6 - smuadx r11, r4, r5 - montgomery r14, r14, r11, r10 - - pkhtb r10, r10, r6, asr#16 - - str.w r10, [r0], #width - - smuad r10, r7, r9 - montgomery r14, r14, r10, r6 - smuadx r11, r7, r8 - montgomery r14, r14, r11, r10 - - pkhtb r10, r10, r6, asr#16 - str.w r10, [r0], #width - - - cmp.w r0, r12 - bne.w _asymmetric_mul_16_loop - - pop.w {r4-r11, pc} \ No newline at end of file diff --git a/crypto_sign/dilithium3/m4f/smallntt.h b/crypto_sign/dilithium3/m4f/smallntt.h index 0aa0ce9b..2927ff4d 100644 --- a/crypto_sign/dilithium3/m4f/smallntt.h +++ b/crypto_sign/dilithium3/m4f/smallntt.h @@ -1,53 +1,48 @@ +/** + * Copyright (c) 2023 Junhao Huang (jhhuang_nuaa@126.com) + * + * Licensed under the Apache License, Version 2.0(the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ #ifndef SMALLNTT_H #define SMALLNTT_H #include #include "params.h" -static const int16_t zetas[64] = { --23, 112, -151, -134, -52, -148, 227, 232, --71, 212, 236, 21, 341, 379, -202, -220, -352, 292, 238, 145, 194, -276, 70, -274, -117, 333, 66, 247, -237, -83, -252, -244, -331, -241, 167, 357, -355, 291, -358, 105, -115, -209, 14, 99, -260, 29, 366, -378, -318, 278, 353, 354, -184, 127, 330, -303, 222, -78, -348, -44, 201, 158, 350, 168 -}; - -static const int16_t zetas_asm[128] = { -0, -164, -81, 361, 186, -3, -250, -120, -308, 129, -16, -223, -362, -143, 131, -337, --76, 147, -114, -23, 112, -151, -134, --98, -272, 54, -52, -148, 227, 232, -36, -2, -124, -71, 212, 236, 21, --75, -80, -346, 341, 379, -202, -220, --339, 86, -51, 352, 292, 238, 145, --255, 364, 267, 194, -276, 70, -274, -282, 161, -15, 117, 333, 66, 247, --203, 288, 169, -237, -83, -252, -244, --34, 191, 307, 331, -241, 167, 357, -199, -50, -24, -355, 291, -358, 105, -178, -170, 226, -115, -209, 14, 99, -270, 121, -188, -260, 29, 366, -378, --10, -380, 279, -318, 278, 353, 354, -149, 180, -375, -184, 127, 330, -303, -369, -157, 263, 222, -78, -348, -44, --192, -128, -246, 201, 158, 350, 168 -}; - -static const int16_t zetas_inv_CT_asm[256] = { -0, 171, 171, 164, 171, -361, 164, 81, 171, 120, -361, 3, 164, 250, 81, -186, -171, 164, 171, -361, 164, 81, -257, 49, -141, -18, -215, 38, 283, 347, 337, 192, -369, 246, -263, 128, 157, 239, -264, 179, 301, -207, 219, -332, -206, 120, 337, -131, 192, -149, -369, 10, 62, 57, 40, 136, 1, 311, -173, 27, 223, 203, -282, -169, 15, -288, -161, 74, -56, 271, -309, 26, -373, 116, -67, -361, 120, 250, 337, 143, -131, 362, -383, 82, 125, -344, -93, 299, -60, -204, 143, -270, -178, 188, -226, -121, 170, 39, -175, 174, 284, -111, 84, -22, 79, 3, 223, 16, 203, 255, -282, 339, 245, 64, -90, 
-306, 190, -123, 197, -253, -129, 75, -36, 346, 124, 80, 2, 218, 126, -33, -266, 326, -122, -261, 343, 164, -361, 81, 120, 3, 250, -186, 285, 200, -89, 5, 17, -96, 135, -310, -131, -149, 10, 375, -279, -180, 380, -280, -183, -7, 130, -327, -189, -335, -370, 250, 143, 362, -270, -199, -178, 34, -359, -144, -182, 304, -43, -300, -251, 377, 16, 255, 339, -267, 51, -364, -86, -106, 101, -118, 214, -349, -110, -374, -195, 81, 3, -186, 223, -129, 16, 308, 320, 319, 8, 181, 154, 216, 273, 313, 362, -199, 34, 24, -307, 50, -191, -139, -165, 208, 92, 159, 233, 177, -321, -186, -129, 308, 75, 98, -36, 76, 231, 324, 25, 85, 289, -94, -12, 113, 308, 98, 76, -54, 114, 272, -147, -146, -35, -119, -97, -176, -137, -312, -138, -}; - - -#define SMALL_Q 769 - -void small_ntt_asm(int16_t a[N], const int16_t * zetas); -void small_invntt_tomont_asm(int16_t a[N], const int16_t * zetas); -void small_pointmul_asm(int16_t out[N], const int16_t in[N], const int16_t *zetas); -void small_asymmetric_mul_asm(int16_t c[256], const int16_t a[256], const int16_t b[256], const int16_t b_prime[256]); - -#define small_ntt(a) small_ntt_asm(a, zetas_asm) -#define small_invntt_tomont(a) small_invntt_tomont_asm(a, zetas_inv_CT_asm) -#define small_point_mul(out, in) small_pointmul_asm(out, in, zetas) -#define small_asymmetric_mul(c, a, b, b_prime) small_asymmetric_mul_asm(c, a, b, b_prime); +#define SMALL_Q 769 + +static const int32_t zetas_769[64] = { + 3138844760, 1334846793, 999738812, 1854264165, 1681125041, 1150537404, 2820492178, 3071823164, 726067294, 2066499220, 3272887953, 1055590142, 4255871365, 1871019564, 2731130050, 1826338500, 513832239, 1792827701, 3373420347, 2993631302, 1161707670, 3306398751, 3518633806, 3406931146, 1586177780, 3853741788, 3317569017, 3825816122, 971813147, 122872927, 217820188, 619949766, 3753209393, 770748358, 4099487641, 765163225, 3630336467, 1742561504, 3479537875, 982983413, 2809321912, 2379266669, 703726762, 681386230, 4110657907, 1457719720, 1217559000, 2474213930, 1195218468, 1089100940, 564098436, 614364633, 3635921600, 2088839752, 3702943196, 1949211426, 2569161192, 374203913, 3982199847, 2083254619, 1513571050, 3647091866, 413299844, 4149753838}; + +static const int32_t zetas_asm_769[128] = { + 346278248, 223405321, 966228013, 759578091, -150798592, 318352582, -1736976371, 1697880440, -2105595150, -804259156, 1675539907, -1016494210, 1401868389, -2005062756, 240160720, 474736307, -1200803600, -1435379187, -1156122536, 1334846793, 999738811, 1854264164, -631120032, -787503756, -1580592646, 1681125040, 1150537403, -1474475119, -1223144132, 1809583100, -100532394, -1938041160, 726067293, 2066499219, -1022079344, 1055590142, 525002504, 273671518, -212235055, -39095931, 1871019563, -1563837247, 1826338499, 139628326, 27925665, 1731391238, 513832238, 1792827701, -921546949, -1301335995, 67021596, 1117026605, 536172770, 1161707669, -988568545, -776333490, -888036151, 1290165729, -497076839, -753992958, 1586177779, -441225509, -977398279, -469151174, -1614103444, 1591762912, -94947261, 971813146, 122872927, 217820188, 619949766, -1709050706, 1010909077, -1748146637, -541757903, 770748357, -195479656, 765163224, 1413038655, 1781657435, -1206388733, -664630830, 1742561504, -815429422, 982983412, 357448514, 44681064, -1524741316, -1485645385, -1915700627, 703726761, 681386229, 686971362, 1787242568, -860110486, -184309390, 1457719719, 1217558999, -1820753366, -502661972, -1921285760, 1139367137, 1195218467, 1089100940, 564098435, 614364633, -1100271206, 457980908, -1669954774, -659045697, 
2088839751, -592024101, 1949211426, 1368357591, 698141628, 335107981, -1725806105, 374203913, -312767449, 2083254618, -1061175275, -2139105948, 519417371, 1513571050, -647875431, 413299844, -145213459, 0}; + +// INTT with CT butterfly +static const int32_t zetas_inv_asm_769[256] = { + 5585134, 5585134, -346278248, 5585134, -966228013, -346278248, -223405321, 5585134, 1736976371, -966228013, 150798592, -346278248, -318352582, -223405321, -759578091, + // removed first "2285" + LAYER 3+2+1 - 1 - butterfly + 5585134, -346278248, 5585134, -966228013, -346278248, -223405321, 636705165, 446810642, 1519156183, 11170266, -821014555, -1932456027, 301597183, -692556495, -240160720, 1061175275, -1368357591, -519417371, -335107981, 2139105948, -698141628, -625534899, -1267825197, 843355087, 290426917, 128458060, 1295750862, -748407825, -826599688, 1736976371, -240160720, 2005062756, 1061175275, 1100271206, -1368357591, 502661972, 915961816, 1396283256, 452395775, -1038834743, -955057747, -670215963, 2016233022, -16755399, -1675539907, 1614103444, -1290165729, 94947261, 753992958, -1591762912, 497076839, -1954796559, 1943626293, -1122611738, -1239899531, 938302348, -245745853, 882451018, -435640376, -966228013, 1736976371, -318352582, -240160720, -1401868389, 2005062756, 1016494210, 714897027, -1005323944, 876865885, 2122350549, -1373942724, -2094424884, 1468889985, 1558252114, -1401868389, -686971362, -357448514, 860110486, 1524741316, -1787242568, -44681064, 1407453522, -368618780, 1323676527, -653460564, -1362772458, 1379527857, -463566041, 1859849297, 150798592, -1675539907, 804259156, 1614103444, -67021596, -1290165729, -139628326, -2060914086, -994153678, 55851330, 189894523, -1072345541, 1507985917, 832184821, 1111441472, 2105595150, -525002504, -1809583100, 212235055, 1938041160, -273671518, 100532394, -2044158687, -78191862, 1452134586, 642290298, -2111180283, 552928169, 161968858, -1167292802, -346278248, -966228013, -223405321, 1736976371, 150798592, -318352582, -759578091, -1608518311, -2032988421, -899206417, -480321440, 943887481, 1491230518, -83776995, -284841784, 2005062756, 1100271206, 502661972, 1669954774, -1139367137, -457980908, 1921285760, 1128196871, -1318091394, -1904530361, 396544445, -1228729265, 117287794, 2116765416, 1184048201, -318352582, -1401868389, 1016494210, -686971362, -1413038655, -357448514, 1709050706, -731652426, 89362128, 2021818155, 1720220972, -1882189829, -1245484665, -798674023, 720482160, 804259156, -67021596, -139628326, -536172770, -1731391238, -1117026605, -27925665, -1843093898, -1971551958, 1027664477, 1776072302, -1692295306, 1977137091, 709311894, 1552666981, -223405321, 150798592, -759578091, -1675539907, 2105595150, 804259156, -1697880440, -675801096, 279256651, 949472614, -1066760408, -1050005009, -134043193, 1262240064, 1714635839, 1016494210, -1413038655, 1709050706, 1206388733, 1748146637, -1781657435, -1010909077, -390959312, -1329261660, -1083515807, -1965966825, -1530326449, 809844289, -1541496715, 1630858843, -759578091, 2105595150, -1697880440, -525002504, 631120032, -1809583100, -474736307, -1575007513, -201064789, 1893360095, 424470110, -1133782004, -418884977, -1424208921, -547343036, -1697880440, 631120032, -474736307, 1580592646, 1435379187, 787503756, 1200803600, 1999477623, -932717215, 1982722224, -1848679031, 586438968, 1993892490, 1625273710, -1346017059, 0}; + +// Q1=769 +void small_ntt_asm_769(int16_t a[N], const int32_t *zetas); +void small_invntt_asm_769(int16_t a[N], const int32_t *zetas); +void small_pointmul_asm_769(int16_t 
out[N], const int16_t in[N], const int32_t *zetas); +void small_asymmetric_mul_asm_769(int16_t c[N], const int16_t a[N], const int16_t b[N], const int16_t b_prime[N]); + +// small NTT for computing cs0 and cs1; default use 769 as modulus. +#define small_ntt(a) small_ntt_asm_769(a, zetas_asm_769) +#define small_invntt_tomont(a) small_invntt_asm_769(a, zetas_inv_asm_769) +#define small_point_mul(out, in) small_pointmul_asm_769(out, in, zetas_769) +#define small_asymmetric_mul(c, a, b, b_prime) small_asymmetric_mul_asm_769(c, a, b, b_prime); #endif diff --git a/crypto_sign/dilithium3/m4f/smallntt_769.S b/crypto_sign/dilithium3/m4f/smallntt_769.S new file mode 100644 index 00000000..97c60f03 --- /dev/null +++ b/crypto_sign/dilithium3/m4f/smallntt_769.S @@ -0,0 +1,681 @@ +/** + * Copyright (c) 2023 Junhao Huang (jhhuang_nuaa@126.com) + * + * Licensed under the Apache License, Version 2.0(the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "macros.i" + +.syntax unified +.cpu cortex-m4 +.thumb + +#include "macros_smallntt.i" +// ####### +// ####### +// # NTT # +// ####### +// ####### + +.macro _3_layer_double_CT_16_plant c0, c1, c2, c3, c4, c5, c6, c7, twiddle1, twiddle2, twiddle_ptr, q, qa, tmp + // layer 3 + ldr.w \twiddle1, [\twiddle_ptr], #4 + two_doublebutterfly_plant \c0, \c4, \c1, \c5, \twiddle1, \twiddle1, \tmp, \q, \qa + two_doublebutterfly_plant \c2, \c6, \c3, \c7, \twiddle1, \twiddle1, \tmp, \q, \qa + + // layer 2 + ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 + two_doublebutterfly_plant \c0, \c2, \c1, \c3, \twiddle1, \twiddle1, \tmp, \q, \qa + + two_doublebutterfly_plant \c4, \c6, \c5, \c7, \twiddle2, \twiddle2, \tmp, \q, \qa + + // layer 1 + ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 + two_doublebutterfly_plant \c0, \c1, \c2, \c3, \twiddle1, \twiddle2, \tmp, \q, \qa + + ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 + two_doublebutterfly_plant \c4, \c5, \c6, \c7, \twiddle1, \twiddle2, \tmp, \q, \qa +.endm + +.macro _3_layer_double_CT_16_plant_fp c0, c1, c2, c3, c4, c5, c6, c7, xi0, xi1, xi2, xi3, xi4, xi5, xi6, twiddle1, twiddle2, q, qa, tmp + // layer 3 + vmov \twiddle1, \xi0 + two_doublebutterfly_plant \c0, \c4, \c1, \c5, \twiddle1, \twiddle1, \tmp, \q, \qa + two_doublebutterfly_plant \c2, \c6, \c3, \c7, \twiddle1, \twiddle1, \tmp, \q, \qa + + // layer 2 + vmov \twiddle1, \xi1 + vmov \twiddle2, \xi2 + two_doublebutterfly_plant \c0, \c2, \c1, \c3, \twiddle1, \twiddle1, \tmp, \q, \qa + + two_doublebutterfly_plant \c4, \c6, \c5, \c7, \twiddle2, \twiddle2, \tmp, \q, \qa + + // layer 1 + vmov \twiddle1, \xi3 + vmov \twiddle2, \xi4 + two_doublebutterfly_plant \c0, \c1, \c2, \c3, \twiddle1, \twiddle2, \tmp, \q, \qa + + vmov \twiddle1, \xi5 + vmov \twiddle2, \xi6 + two_doublebutterfly_plant \c4, \c5, \c6, \c7, \twiddle1, \twiddle2, \tmp, \q, \qa +.endm + +.global small_ntt_asm_769 +.type small_ntt_asm_769, %function +.align 2 +small_ntt_asm_769: + push {r4-r11, r14} + vpush.w {s16-s24} + poly .req r0 + twiddle_ptr .req r1 + poly0 .req r2 + poly1 .req r3 + poly2 .req r4 + poly3 .req r5 + poly4 .req r6 + poly5 .req r7 + 
poly6 .req r8 + poly7 .req r9 + twiddle1 .req r10 + twiddle2 .req r11 + ### qinv .req r11 ### q^-1 mod 2^2n; n=16 + q .req r12 + ### at the top of r12 + qa .req r0 + ### qa=2^a q;a=3; at the bottom of r12 + tmp .req r14 + + // movw qa, #24608 + // Why movt? Because we initially placed qa at the bottom of the same register as q; + movt q, #769 + + ### LAYER 7+6+5+4 + .equ distance, 256 + .equ offset, 32 + .equ strincr, 4 + // pre-load 15 twiddle factors to 15 FPU registers + // s0-s7 used to temporary store 16 16-bit polys. + vldm twiddle_ptr!, {s8-s22} + + add tmp, poly, #strincr*8 + // s23: poly addr + // s24: tmp + vmov s24, tmp + 1: + // load a1, a3, ..., a15 + vmov s23, poly + load poly, poly0, poly1, poly2, poly3, #offset, #distance/4+offset, #2*distance/4+offset, #3*distance/4+offset + load poly, poly4, poly5, poly6, poly7, #distance+offset, #5*distance/4+offset, #6*distance/4+offset, #7*distance/4+offset + + movw qa, #24608 + + // 8-NTT on a1, a3, ..., a15 + _3_layer_double_CT_16_plant_fp poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s8, s9, s10, s11, s12, s13, s14, twiddle1, twiddle2, q, qa, tmp + + // s15, s16, s17, s18, s19, s20, s21, s22 left + // multiply coeffs by layer 8 twiddles for later use + vmov twiddle1, s15 + vmov twiddle2, s16 + mul_twiddle_plant poly0, twiddle1, tmp, q, qa + mul_twiddle_plant poly1, twiddle2, tmp, q, qa + + vmov twiddle1, s17 + vmov twiddle2, s18 + mul_twiddle_plant poly2, twiddle1, tmp, q, qa + mul_twiddle_plant poly3, twiddle2, tmp, q, qa + + vmov twiddle1, s19 + vmov twiddle2, s20 + mul_twiddle_plant poly4, twiddle1, tmp, q, qa + mul_twiddle_plant poly5, twiddle2, tmp, q, qa + + vmov twiddle1, s21 + vmov twiddle2, s22 + mul_twiddle_plant poly6, twiddle1, tmp, q, qa + mul_twiddle_plant poly7, twiddle2, tmp, q, qa + + vmov s0, poly0 // a1 + vmov s1, poly1 // a3 + vmov s2, poly2 // a5 + vmov s3, poly3 // a7 + vmov s4, poly4 // a9 + vmov s5, poly5 // a11 + vmov s6, poly6 // a13 + vmov s7, poly7 // a15 + + vmov poly, s23 + + // load a0, a2, ..., a14 + load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4 + load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 + + movw qa, #24608 + // 8-NTT on a0, a2, ..., a14 + _3_layer_double_CT_16_plant_fp poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s8, s9, s10, s11, s12, s13, s14, twiddle1, twiddle2, q, qa, tmp + + + // layer 4 - 1 + // addsub: (a2, a6, a10, a14), (a3, a7, a11, a15) + vmov poly, s23 + vmov twiddle1, s1 // load a3 + uadd16 tmp, poly1, twiddle1 + usub16 poly1, poly1, twiddle1 + str.w tmp, [poly, #1*distance/4] + str.w poly1, [poly, #1*distance/4+offset] + + vmov twiddle1, s3 // load a7 + uadd16 tmp, poly3, twiddle1 + usub16 poly3, poly3, twiddle1 + str.w tmp, [poly, #3*distance/4] + str.w poly3, [poly, #3*distance/4+offset] + + vmov twiddle1, s5 // load a11 + uadd16 tmp, poly5, twiddle1 + usub16 poly5, poly5, twiddle1 + str.w tmp, [poly, #5*distance/4] + str.w poly5, [poly, #5*distance/4+offset] + + vmov twiddle1, s7 // load a15 + uadd16 tmp, poly7, twiddle1 + usub16 poly7, poly7, twiddle1 + str.w tmp, [poly, #7*distance/4] + str.w poly7, [poly, #7*distance/4+offset] + + // layer 4 - 2 + // addsub: (a0, a4, a8, a12), (a1, a5, a9, a13) + vmov poly3, s2 // load a5 + uadd16 tmp, poly2, poly3 + usub16 twiddle1, poly2, poly3 + str.w tmp, [poly, #2*distance/4] + str.w twiddle1, [poly, #2*distance/4+offset] + + vmov poly5, s4 // load a9 + uadd16 tmp, poly4, poly5 + usub16 twiddle1, poly4, poly5 + str.w tmp, [poly, 
#4*distance/4] + str.w twiddle1, [poly, #4*distance/4+offset] + + vmov poly7, s6 // load a13 + uadd16 tmp, poly6, poly7 + usub16 twiddle1, poly6, poly7 + str.w tmp, [poly, #6*distance/4] + str.w twiddle1, [poly, #6*distance/4+offset] + + vmov poly1, s0 // load a1 + uadd16 tmp, poly0, poly1 + usub16 twiddle1, poly0, poly1 + str.w twiddle1, [poly, #offset] + str.w tmp, [poly], #4 + + vmov tmp, s24 + cmp.w poly, tmp + bne.w 1b + + sub.w poly, #8*strincr + + ### LAYER 3+2+1 + + .equ distance, distance/16 + .equ strincr, 32 + + add.w tmp, poly, #strincr*16 + vmov s13, tmp + 2: + vmov s23, poly + load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4 + load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 + + movw qa, #24608 + _3_layer_double_CT_16_plant poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle1, twiddle2, twiddle_ptr, q, qa, tmp + + vmov poly, s23 + store poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 + str.w poly1, [poly, #distance/4] + str.w poly2, [poly, #2*distance/4] + str.w poly3, [poly, #3*distance/4] + str.w poly0, [poly], #strincr + + vmov tmp, s13 + cmp.w poly, tmp + bne.w 2b + vpop.w {s16-s24} + pop {r4-r11, pc} + +.unreq poly +.unreq twiddle_ptr +.unreq poly0 +.unreq poly1 +.unreq poly2 +.unreq poly3 +.unreq poly4 +.unreq poly5 +.unreq poly6 +.unreq poly7 +.unreq twiddle1 +.unreq twiddle2 +.unreq q +.unreq qa +.unreq tmp + + +// ######## +// ######## +// # INTT # +// ######## +// ######## + +// input: 0.5/1q +.macro _3_layer_double_inv_CT_16_plant_light c0, c1, c2, c3, c4, c5, c6, c7, xi2, xi4, xi5, xi6, twiddle1, tmp2, q, qa, tmp + + // layer 1 + sadd16.w \tmp, \c0, \c1 // c0, c1 + ssub16.w \c1, \c0, \c1 + sadd16.w \tmp2, \c2, \c3 // c2, c3 + ssub16.w \c3, \c2, \c3 + // tmp, c1, tmp2, c3: 1q maximum + sadd16.w \c0, \c4, \c5 // c4, c5 + ssub16.w \c5, \c4, \c5 + sadd16.w \c2, \c6, \c7 // c6, c7 + ssub16.w \c7, \c6, \c7 + // c4, c6 are free at this point + // c0,c5,c2,c7 1q maximum + + // layer 2 + sadd16.w \c6, \tmp, \tmp2 // c0, c2 + ssub16.w \tmp2, \tmp, \tmp2 + sadd16.w \c4, \c0, \c2 // c4, c6 + ssub16.w \c2, \c0, \c2 + // c6, tmp2, c4, c2: 2q maximum + + vmov.w \twiddle1, \xi2 + doublebutterfly_plant \c1, \c3, \twiddle1, \tmp, \q, \qa + doublebutterfly_plant \c5, \c7, \twiddle1, \tmp, \q, \qa + // c1, c3, c7, c5: 1.5q maximum; + + // tmp and c0 are free at this point + // layer 3 + sadd16.w \c0, \c6, \c4 // c0, c4 + ssub16.w \c4, \c6, \c4 + // c0, c4: 4q + // c6 are free at this point + vmov.w \twiddle1, \xi4 + doublebutterfly_plant \c1, \c5, \twiddle1, \tmp, \q, \qa + // c1, c5: 2q maximum + + vmov.w \twiddle1, \xi5 + // this block is one doublebutterfly + smulwb \tmp, \twiddle1, \c2 // c2, c6 + smulwt \c2, \twiddle1, \c2 + smlabt \tmp, \tmp, \q, \qa + smlabt \c2, \c2, \q, \qa + pkhtb \tmp, \c2, \tmp, asr#16 + ssub16.w \c6, \tmp2, \tmp + sadd16.w \c2, \tmp2, \tmp + //c6, c2: 4.5q + vmov.w \twiddle1, \xi6 + doublebutterfly_plant \c3, \c7, \twiddle1, \tmp, \q, \qa + //c3, c7: 2.5q maximum +.endm +.macro _3_layer_double_inv_CT_16_plant c0, c1, c2, c3, c4, c5, c6, c7, twiddle1, twiddle2, twiddle_ptr, q, qa, tmp + // layer 3 + ldr.w \twiddle1, [\twiddle_ptr], #4 + two_doublebutterfly_plant \c0, \c1, \c2, \c3, \twiddle1, \twiddle1, \tmp, \q, \qa + two_doublebutterfly_plant \c4, \c5, \c6, \c7, \twiddle1, \twiddle1, \tmp, \q, \qa + + // layer 2 + ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 + two_doublebutterfly_plant \c0, \c2, \c1, \c3, \twiddle1, 
\twiddle2, \tmp, \q, \qa + + two_doublebutterfly_plant \c4, \c6, \c5, \c7, \twiddle1, \twiddle2, \tmp, \q, \qa + + // layer 1 + ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 + two_doublebutterfly_plant \c0, \c4, \c1, \c5, \twiddle1, \twiddle2, \tmp, \q, \qa + + ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 + two_doublebutterfly_plant \c2, \c6, \c3, \c7, \twiddle1, \twiddle2, \tmp, \q, \qa +.endm + +.macro _3_layer_double_inv_twist_16_plant c0, c1, c2, c3, c4, c5, c6, c7, twiddle1, twiddle2, twiddle_ptr, q, qa, tmp + ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 + mul_twiddle_plant \c0, \twiddle1, \tmp, \q, \qa + mul_twiddle_plant \c1, \twiddle2, \tmp, \q, \qa + ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 + mul_twiddle_plant \c2, \twiddle1, \tmp, \q, \qa + mul_twiddle_plant \c3, \twiddle2, \tmp, \q, \qa + ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 + mul_twiddle_plant \c4, \twiddle1, \tmp, \q, \qa + mul_twiddle_plant \c5, \twiddle2, \tmp, \q, \qa + ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 + mul_twiddle_plant \c6, \twiddle1, \tmp, \q, \qa + mul_twiddle_plant \c7, \twiddle2, \tmp, \q, \qa +.endm +# input coefficients < 0.5q +.global small_invntt_asm_769 +.type small_invntt_asm_769, %function +.align 2 +small_invntt_asm_769: + push {r4-r11, r14} + vpush.w {s16-s23} + poly .req r0 + twiddle_ptr .req r1 + poly0 .req r2 + poly1 .req r3 + poly2 .req r4 + poly3 .req r5 + poly4 .req r6 + poly5 .req r7 + poly6 .req r8 + poly7 .req r9 + twiddle1 .req r10 + twiddle2 .req r11 + q .req r12 + // at the top of r12 + qa .req r0 + // qa=2^a q;a=3; at the bottom of r12 + tmp .req r14 + + movt q, #769 + + ### LAYER 7+6+5+4 + .equ distance, 16 + .equ offset, 32 + .equ strincr, 64 + + // pre-load twiddle factors to FPU registers + vldm twiddle_ptr!, {s8-s22} + + add.w tmp, poly, #8*strincr + vmov s8, tmp + 1: + vmov s23, poly + // load a1, a3, ..., a15 + load poly, poly0, poly1, poly2, poly3, #offset, #distance/4+offset, #2*distance/4+offset, #3*distance/4+offset + load poly, poly4, poly5, poly6, poly7, #distance+offset, #5*distance/4+offset, #6*distance/4+offset, #7*distance/4+offset + + movw qa, #24608 + + // NTT on a1, a3, ..., a15 + // twiddle2 is used as tmp2 + _3_layer_double_inv_CT_16_plant_light poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s10, s12, s13, s14, twiddle1, twiddle2, q, qa, tmp + + // multiply coeffs by layer 4 twiddles for later use + // vmov twiddle1, s15 + vmov twiddle2, s16 + // mul_twiddle_plant poly0, twiddle1, tmp, q, qa // could be omitted but kept for reduction only + mul_twiddle_plant poly1, twiddle2, tmp, q, qa + + vmov twiddle1, s17 + vmov twiddle2, s18 + mul_twiddle_plant poly2, twiddle1, tmp, q, qa + mul_twiddle_plant poly3, twiddle2, tmp, q, qa + + vmov twiddle1, s19 + vmov twiddle2, s20 + mul_twiddle_plant poly4, twiddle1, tmp, q, qa + mul_twiddle_plant poly5, twiddle2, tmp, q, qa + + vmov twiddle1, s21 + vmov twiddle2, s22 + mul_twiddle_plant poly6, twiddle1, tmp, q, qa + mul_twiddle_plant poly7, twiddle2, tmp, q, qa + + vmov s0, poly0 // a1 + vmov s1, poly1 // a3 + vmov s2, poly2 // a5 + vmov s3, poly3 // a7 + vmov s4, poly4 // a9 + vmov s5, poly5 // a11 + vmov s6, poly6 // a13 + vmov s7, poly7 // a15 + // 0.5q + // ---------- + + vmov poly, s23 + // load a0, a2, ..., a14 + load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4 + load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 + + movw qa, #24608 + // NTT on a0, a2, ..., a14 + // twiddle2 is used as tmp2 + 
_3_layer_double_inv_CT_16_plant_light poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s10, s12, s13, s14, twiddle1, twiddle2, q, qa, tmp + // 1,3,5,7: <5q; 0,2,4,6:<1q + // layer 4 - 1 + // addsub: (a2, a6, a10, a14), (a3, a7, a11, a15) + vmov poly, s23 + vmov twiddle2, s1 // load a3 + uadd16 tmp, poly1, twiddle2 + usub16 poly1, poly1, twiddle2 + str.w tmp, [poly, #1*distance/4] + str.w poly1, [poly, #1*distance/4+offset] + + vmov twiddle2, s3 // load a7 + uadd16 tmp, poly3, twiddle2 + usub16 poly3, poly3, twiddle2 + str.w tmp, [poly, #3*distance/4] + str.w poly3, [poly, #3*distance/4+offset] + + vmov twiddle2, s5 // load a11 + uadd16 tmp, poly5, twiddle2 + usub16 poly5, poly5, twiddle2 + str.w tmp, [poly, #5*distance/4] + str.w poly5, [poly, #5*distance/4+offset] + + vmov twiddle2, s7 // load a15 + uadd16 tmp, poly7, twiddle2 + usub16 poly7, poly7, twiddle2 + str.w tmp, [poly, #7*distance/4] + str.w poly7, [poly, #7*distance/4+offset] + //1,3,5,7: < 5.5q + + // layer 4 - 2 + // addsub: (a0, a4, a8, a12), (a1, a5, a9, a13) + vmov poly3, s2 // load a5 + uadd16 tmp, poly2, poly3 + usub16 twiddle2, poly2, poly3 + str.w tmp, [poly, #2*distance/4] + str.w twiddle2, [poly, #2*distance/4+offset] + + vmov poly5, s4 // load a9 + uadd16 tmp, poly4, poly5 + usub16 twiddle2, poly4, poly5 + str.w tmp, [poly, #4*distance/4] + str.w twiddle2, [poly, #4*distance/4+offset] + + vmov poly7, s6 // load a13 + uadd16 tmp, poly6, poly7 + usub16 twiddle2, poly6, poly7 + str.w tmp, [poly, #6*distance/4] + str.w twiddle2, [poly, #6*distance/4+offset] + + vmov poly1, s0 // load a1 + uadd16 tmp, poly0, poly1 + usub16 twiddle2, poly0, poly1 + str.w twiddle2, [poly, #offset] + str.w tmp, [poly], #strincr // increase 2*8*4 = 64 (2 * 8 loads of 4 bytes each) + //0,2,4,6: < 1.5q + vmov tmp, s8 + cmp.w poly, tmp + bne.w 1b + + sub.w poly, #8*strincr + + ### LAYER 3+2+1 + + .equ distance, distance*16 + .equ strincr, 4 + + // ITER 0 + vmov s6, poly + load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4 + load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 + + vldm twiddle_ptr!, {s0-s5} + movw qa, #24608 + // twiddle2 is used as tmp2 + _3_layer_double_inv_CT_16_plant_light poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s1, s3, s4, s5, twiddle1, twiddle2, q, qa, tmp + + // twisting + _3_layer_double_inv_twist_16_plant poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle1, twiddle2, twiddle_ptr, q, qa, tmp + + vmov poly, s6 + store poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 + str.w poly1, [poly, #distance/4] + str.w poly2, [poly, #2*distance/4] + str.w poly3, [poly, #3*distance/4] + str.w poly0, [poly], #4 + + // ITER 1-15 + add.w tmp, poly, #strincr*3*(5) + vmov s14, tmp + 2: + vmov s6, poly + // polys upto 5.5q + load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4 + load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 + + movw qa, #24608 + _3_layer_double_inv_CT_16_plant poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle1, twiddle2, twiddle_ptr, q, qa, tmp + + // twisting + _3_layer_double_inv_twist_16_plant poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle1, twiddle2, twiddle_ptr, q, qa, tmp + + vmov poly, s6 + store poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 + str.w poly1, [poly, #distance/4] + str.w poly2, [poly, #2*distance/4] + 
str.w poly3, [poly, #3*distance/4] + str.w poly0, [poly], #4 + + vmov tmp, s14 + cmp.w poly, tmp + bne.w 2b + + vpop.w {s16-s23} + pop {r4-r11, pc} + +.unreq poly +.unreq twiddle_ptr +.unreq poly0 +.unreq poly1 +.unreq poly2 +.unreq poly3 +.unreq poly4 +.unreq poly5 +.unreq poly6 +.unreq poly7 +.unreq twiddle1 +.unreq twiddle2 +.unreq q +.unreq qa +.unreq tmp + + +################################### +#### small point-multiplication#### +#### r0: out; r1: in; r2: zetas#### +################################### +.align 2 +.global small_pointmul_asm_769 +.type small_pointmul_asm_769, %function +small_pointmul_asm_769: + push.w {r4-r11, lr} + + movw r14, #24608 // qa + movt r12, #769 // q + .equ width, 4 + + + add.w r3, r2, #64*width + _point_mul_16_loop: + + ldr.w r7, [r1, #2*width] + ldr.w r8, [r1, #3*width] + ldr.w r9, [r2, #1*width] + ldr.w r5, [r1, #1*width] + ldr.w r4, [r1], #4*width + ldr.w r6, [r2], #2*width + + smulwt r10, r6, r4 + smlabt r10, r10, r12, r14 + pkhbt r4, r4, r10 + + neg.w r6, r6 + + smulwt r10, r6, r5 + smlabt r10, r10, r12, r14 + pkhbt r5, r5, r10 + + str.w r5, [r0, #1*width] + str.w r4, [r0], #2*width + + smulwt r10, r9, r7 + smlabt r10, r10, r12, r14 + pkhbt r7, r7, r10 + + neg.w r9, r9 + + smulwt r10, r9, r8 + smlabt r10, r10, r12, r14 + pkhbt r8, r8, r10 + + str.w r8, [r0, #1*width] + str.w r7, [r0], #2*width + + cmp.w r2, r3 + bne.w _point_mul_16_loop + + pop.w {r4-r11, pc} + + +#### r0: out; r1: a; r2: b; r3: bprime + .align 2 +.global small_asymmetric_mul_asm_769 +.type small_asymmetric_mul_asm_769, %function +small_asymmetric_mul_asm_769: + push.w {r4-r11, lr} + + movw r14, #24608 // qa + movt r12, #769 // q + movw r11, #64769 + movt r11, #58632 // qinv + .equ width, 4 + add.w r10, r0, #256*2 + _asymmetric_mul_16_loop: + ldr.w r7, [r1, #width] + ldr.w r4, [r1], #2*width + ldr.w r8, [r2, #width] + ldr.w r5, [r2], #2*width + ldr.w r9, [r3, #width] + ldr.w r6, [r3], #2*width + + smuad r6, r4, r6 + plant_red r12, r14, r11, r6 + smuadx r5, r4, r5 + plant_red r12, r14, r11, r5 + + pkhtb r5, r5, r6, asr#16 + str.w r5, [r0], #width + + smuad r6, r7, r9 + plant_red r12, r14, r11, r6 + smuadx r8, r7, r8 + plant_red r12, r14, r11, r8 + + pkhtb r8, r8, r6, asr#16 + str.w r8, [r0], #width + + cmp.w r0, r10 + bne.w _asymmetric_mul_16_loop + + pop.w {r4-r11, pc} \ No newline at end of file
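
Reviewer note: the patch replaces the Montgomery-based smallntt.S with Plantard arithmetic throughout. Below is a minimal C model, for cross-checking only and not part of the patch, of one 16-bit lane of the mul_twiddle_plant / doublebutterfly_plant sequence (smulwb, smlabt, pkhtb) and of the plant_red macro used by small_asymmetric_mul_asm_769. The names plant_mul/plant_red and the tested input ranges are my own; QINV = 0xE508FD01 matches the movw #64769 / movt #58632 pair loaded into r11 above, and QA = 24608 = 769*2^5 matches the movw qa, #24608 immediates (i.e. alpha = 5 for q = 769).

#include <stdint.h>
#include <stdio.h>

#define Q     769
#define ALPHA 5                 /* QA = Q << ALPHA = 24608, the immediate in the .S file */
#define QA    (Q << ALPHA)
#define QINV  0xE508FD01u       /* Q^-1 mod 2^32, as loaded into r11 by the asm          */

/* One lane of mul_twiddle_plant: bp is a precomputed twiddle b*QINV mod 2^32
 * (the values stored in zetas_asm_769); the result is congruent to
 * -a*b*2^(-32) mod Q, so the tables bake a factor -2^32 mod Q into b. */
static int16_t plant_mul(int16_t a, uint32_t bp)
{
    uint32_t t  = bp * (uint32_t)a;     /* a*b*Q^-1 mod 2^32                       */
    int16_t  hi = (int16_t)(t >> 16);   /* smlabt only uses the signed bits [31:16] */
    int32_t  r  = (int32_t)hi * Q + QA; /* smlabt: hi*Q + 2^ALPHA*Q                 */
    return (int16_t)(r >> 16);          /* result sits in the top half (pkhtb)      */
}

/* plant_red from macros_smallntt.i: reduces a 32-bit value v (e.g. an smuad
 * sum of two products) to a residue congruent to -v*2^(-32) mod Q. */
static int16_t plant_red(int32_t v)
{
    uint32_t t  = (uint32_t)v * QINV;   /* mul: v*Q^-1 mod 2^32      */
    int16_t  hi = (int16_t)(t >> 16);   /* smlatt reads the top half */
    return (int16_t)(((int32_t)hi * Q + QA) >> 16);
}

int main(void)
{
    /* r == -v*2^(-32) mod Q is equivalent to (r << 32) + v == 0 mod Q. */
    for (int b = -Q / 2; b <= Q / 2; b++) {
        uint32_t bp = (uint32_t)b * QINV;            /* precomputed per twiddle */
        for (int a = -32768; a < 32768; a += 251) {
            int16_t r = plant_mul((int16_t)a, bp);
            if ((((int64_t)r << 32) + (int64_t)a * b) % Q != 0) {
                printf("plant_mul FAIL a=%d b=%d\n", a, b);
                return 1;
            }
        }
    }
    /* test range chosen to cover the coefficient bounds annotated in the INTT */
    for (int64_t v = -(1 << 25); v <= (1 << 25); v += 4099) {
        int16_t r = plant_red((int32_t)v);
        if ((((int64_t)r << 32) + v) % Q != 0) {
            printf("plant_red FAIL v=%lld\n", (long long)v);
            return 1;
        }
    }
    puts("Plantard congruence checks passed");
    return 0;
}

For example, plant_mul(1, 1u * QINV) returns -81, and indeed 81 * 2^32 == 1 (mod 769). Note also that plant_mul(a, bp) and plant_red((int32_t)((uint32_t)a * (uint32_t)b)) compute the same value, since a*bp == (a*b)*QINV mod 2^32; this is why the butterflies and the asymmetric multiplication can share one reduction pattern.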