From 9c2bc412b0b2537437cd56ff3dc5a29c7a048480 Mon Sep 17 00:00:00 2001 From: Huang Junhao <1561868283@qq.com> Date: Mon, 15 Apr 2024 15:30:22 +0800 Subject: [PATCH] Revisiting Keccak and Dilithium Implementations on ARMv7-M (#338) * Use Plantard arithmetic for NTT_769 in Dilithium * rm old smallntt.S * update benchmarks --------- Co-authored-by: Matthias J. Kannwischer --- benchmarks.csv | 10 +- benchmarks.md | 10 +- crypto_sign/dilithium3/m4f/macros_fnt.i | 1 - crypto_sign/dilithium3/m4f/macros_smallntt.i | 98 +++ crypto_sign/dilithium3/m4f/smallntt.S | 837 ------------------- crypto_sign/dilithium3/m4f/smallntt.h | 85 +- crypto_sign/dilithium3/m4f/smallntt_769.S | 681 +++++++++++++++ 7 files changed, 829 insertions(+), 893 deletions(-) delete mode 120000 crypto_sign/dilithium3/m4f/macros_fnt.i create mode 100644 crypto_sign/dilithium3/m4f/macros_smallntt.i delete mode 100644 crypto_sign/dilithium3/m4f/smallntt.S create mode 100644 crypto_sign/dilithium3/m4f/smallntt_769.S diff --git a/benchmarks.csv b/benchmarks.csv index 9270fca8..981accb6 100644 --- a/benchmarks.csv +++ b/benchmarks.csv @@ -44,8 +44,8 @@ cross-sha3-r-sdpg-3-fast (10 executions),ref,627948,625525,637639,43573841,43565 cross-sha3-r-sdpg-5-fast (10 executions),ref,1146280,1142409,1153794,93557878,93547167,93566329,59948216,59857434,60043852 dilithium2 (90 executions),clean,1873447,1838554,1903845,7846622,3321671,28761609,2062804,2062332,2063181 dilithium2 (100 executions),m4f,1427684,1390524,1466437,4219137,1813668,12587382,1417706,1417251,1418128 -dilithium3 (90 executions),clean,3205542,3204354,3206592,12108503,5097440,50759276,3377010,3376729,3377395 -dilithium3 (100 executions),m4f,2515970,2514894,2516922,5896583,2935265,23718896,2411234,2410948,2411551 +dilithium3 (1000 executions),clean,3205551,3204090,3207411,12696585,5097364,74392293,3376992,3376581,3377393 +dilithium3 (1000 executions),m4f,2515969,2514498,2517634,5884832,2917322,25268693,2411257,2410858,2411717 dilithium5 (90 executions),clean,5346066,5287239,5395626,15205929,7953360,49173429,5609664,5609137,5610119 dilithium5 (100 executions),m4f,4273211,4210308,4329697,8062110,4882708,18398575,4185407,4184878,4185954 falcon-1024 (10 executions),m4-ct,354880005,284902033,635131652,87741288,87506676,87922628,991320,982548,997219 @@ -341,8 +341,8 @@ cross-sha3-r-sdpg-3-fast,ref,71.7,68.2,68.7,,,,,, cross-sha3-r-sdpg-5-fast,ref,71.1,66.1,66.8,,,,,, dilithium2,clean,60.9,30.2,52.9,,,,,, dilithium2,m4f,79.9,62.2,76.8,,,,,, -dilithium3,clean,64.7,33.8,56.8,,,,,, -dilithium3,m4f,82.3,57.9,79.4,,,,,, +dilithium3,clean,64.7,31.3,56.8,,,,,, +dilithium3,m4f,82.3,60.3,79.4,,,,,, dilithium5,clean,67.0,38.4,61.1,,,,,, dilithium5,m4f,83.4,63.5,81.7,,,,,, falcon-1024,clean,6.5,0.3,23.7,,,,,, @@ -491,7 +491,7 @@ cross-sha3-r-sdpg-5-fast,ref,18593,0,208,18801,,,,, dilithium2,clean,8064,0,0,8064,,,,, dilithium2,m4f,18596,0,0,18596,,,,, dilithium3,clean,7580,0,0,7580,,,,, -dilithium3,m4f,20108,0,0,20108,,,,, +dilithium3,m4f,18588,0,0,18588,,,,, dilithium5,clean,7808,0,0,7808,,,,, dilithium5,m4f,18468,0,0,18468,,,,, falcon-1024,clean,82647,0,0,82647,,,,, diff --git a/benchmarks.md b/benchmarks.md index e2b4a4e3..5574fe2c 100644 --- a/benchmarks.md +++ b/benchmarks.md @@ -46,8 +46,8 @@ | cross-sha3-r-sdpg-5-fast (10 executions) | ref | AVG: 1,146,280
MIN: 1,142,409
MAX: 1,153,794 | AVG: 93,557,878
MIN: 93,547,167
MAX: 93,566,329 | AVG: 59,948,216
MIN: 59,857,434
MAX: 60,043,852 | | dilithium2 (90 executions) | clean | AVG: 1,873,447
MIN: 1,838,554
MAX: 1,903,845 | AVG: 7,846,622
MIN: 3,321,671
MAX: 28,761,609 | AVG: 2,062,804
MIN: 2,062,332
MAX: 2,063,181 | | dilithium2 (100 executions) | m4f | AVG: 1,427,684
MIN: 1,390,524
MAX: 1,466,437 | AVG: 4,219,137
MIN: 1,813,668
MAX: 12,587,382 | AVG: 1,417,706
MIN: 1,417,251
MAX: 1,418,128 | -| dilithium3 (90 executions) | clean | AVG: 3,205,542
MIN: 3,204,354
MAX: 3,206,592 | AVG: 12,108,503
MIN: 5,097,440
MAX: 50,759,276 | AVG: 3,377,010
MIN: 3,376,729
MAX: 3,377,395 | -| dilithium3 (100 executions) | m4f | AVG: 2,515,970
MIN: 2,514,894
MAX: 2,516,922 | AVG: 5,896,583
MIN: 2,935,265
MAX: 23,718,896 | AVG: 2,411,234
MIN: 2,410,948
MAX: 2,411,551 | +| dilithium3 (1000 executions) | clean | AVG: 3,205,551
MIN: 3,204,090
MAX: 3,207,411 | AVG: 12,696,585
MIN: 5,097,364
MAX: 74,392,293 | AVG: 3,376,992
MIN: 3,376,581
MAX: 3,377,393 | +| dilithium3 (1000 executions) | m4f | AVG: 2,515,969
MIN: 2,514,498
MAX: 2,517,634 | AVG: 5,884,832
MIN: 2,917,322
MAX: 25,268,693 | AVG: 2,411,257
MIN: 2,410,858
MAX: 2,411,717 | | dilithium5 (90 executions) | clean | AVG: 5,346,066
MIN: 5,287,239
MAX: 5,395,626 | AVG: 15,205,929
MIN: 7,953,360
MAX: 49,173,429 | AVG: 5,609,664
MIN: 5,609,137
MAX: 5,610,119 | | dilithium5 (100 executions) | m4f | AVG: 4,273,211
MIN: 4,210,308
MAX: 4,329,697 | AVG: 8,062,110
MIN: 4,882,708
MAX: 18,398,575 | AVG: 4,185,407
MIN: 4,184,878
MAX: 4,185,954 | | falcon-1024 (10 executions) | m4-ct | AVG: 354,880,005
MIN: 284,902,033
MAX: 635,131,652 | AVG: 87,741,288
MIN: 87,506,676
MAX: 87,922,628 | AVG: 991,320
MIN: 982,548
MAX: 997,219 | @@ -347,8 +347,8 @@ | cross-sha3-r-sdpg-5-fast | ref | 71.1% | 66.1% | 66.8% | | dilithium2 | clean | 60.9% | 30.2% | 52.9% | | dilithium2 | m4f | 79.9% | 62.2% | 76.8% | -| dilithium3 | clean | 64.7% | 33.8% | 56.8% | -| dilithium3 | m4f | 82.3% | 57.9% | 79.4% | +| dilithium3 | clean | 64.7% | 31.3% | 56.8% | +| dilithium3 | m4f | 82.3% | 60.3% | 79.4% | | dilithium5 | clean | 67.0% | 38.4% | 61.1% | | dilithium5 | m4f | 83.4% | 63.5% | 81.7% | | falcon-1024 | clean | 6.5% | 0.3% | 23.7% | @@ -499,7 +499,7 @@ | dilithium2 | clean | 8,064 | 0 | 0 | 8,064 | | dilithium2 | m4f | 18,596 | 0 | 0 | 18,596 | | dilithium3 | clean | 7,580 | 0 | 0 | 7,580 | -| dilithium3 | m4f | 20,108 | 0 | 0 | 20,108 | +| dilithium3 | m4f | 18,588 | 0 | 0 | 18,588 | | dilithium5 | clean | 7,808 | 0 | 0 | 7,808 | | dilithium5 | m4f | 18,468 | 0 | 0 | 18,468 | | falcon-1024 | clean | 82,647 | 0 | 0 | 82,647 | diff --git a/crypto_sign/dilithium3/m4f/macros_fnt.i b/crypto_sign/dilithium3/m4f/macros_fnt.i deleted file mode 120000 index 1abff093..00000000 --- a/crypto_sign/dilithium3/m4f/macros_fnt.i +++ /dev/null @@ -1 +0,0 @@ -../../dilithium2/m4f/macros_fnt.i \ No newline at end of file diff --git a/crypto_sign/dilithium3/m4f/macros_smallntt.i b/crypto_sign/dilithium3/m4f/macros_smallntt.i new file mode 100644 index 00000000..61b63241 --- /dev/null +++ b/crypto_sign/dilithium3/m4f/macros_smallntt.i @@ -0,0 +1,98 @@ +/** + * Copyright (c) 2023 Junhao Huang (jhhuang_nuaa@126.com) + * + * Licensed under the Apache License, Version 2.0(the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef MACROS_SMALLNTT_I +#define MACROS_SMALLNTT_I + +// general macros +.macro load a, a0, a1, a2, a3, mem0, mem1, mem2, mem3 + ldr.w \a0, [\a, \mem0] + ldr.w \a1, [\a, \mem1] + ldr.w \a2, [\a, \mem2] + ldr.w \a3, [\a, \mem3] +.endm + +.macro store a, a0, a1, a2, a3, mem0, mem1, mem2, mem3 + str.w \a0, [\a, \mem0] + str.w \a1, [\a, \mem1] + str.w \a2, [\a, \mem2] + str.w \a3, [\a, \mem3] +.endm + +.macro doubleplant a, tmp, q, qa, plantconst + smulwb \tmp, \plantconst, \a + smulwt \a, \plantconst, \a + smlabt \tmp, \tmp, \q, \qa + smlabt \a, \a, \q, \qa + pkhtb \a, \a, \tmp, asr#16 +.endm + +.macro doublebarrett a, tmp, tmp2, q, barrettconst + smulbb \tmp, \a, \barrettconst + smultb \tmp2, \a, \barrettconst + asr \tmp, \tmp, #26 + asr \tmp2, \tmp2, #26 + smulbb \tmp, \tmp, \q + smulbb \tmp2, \tmp2, \q + pkhbt \tmp, \tmp, \tmp2, lsl#16 + usub16 \a, \a, \tmp +.endm + +// q locate in the top half of the register +.macro plant_red q, qa, qinv, tmp + mul \tmp, \tmp, \qinv + //tmp*qinv mod 2^2n/ 2^n; in high half + smlatt \tmp, \tmp, \q, \qa + // result in high half +.endm + +.macro mul_twiddle_plant a, twiddle, tmp, q, qa + smulwb \tmp, \twiddle, \a + smulwt \a, \twiddle, \a + smlabt \tmp, \tmp, \q, \qa + smlabt \a, \a, \q, \qa + pkhtb \a, \a, \tmp, asr#16 +.endm + +.macro doublebutterfly_plant a0, a1, twiddle, tmp, q, qa + smulwb \tmp, \twiddle, \a1 + smulwt \a1, \twiddle, \a1 + smlabt \tmp, \tmp, \q, \qa + smlabt \a1, \a1, \q, \qa + pkhtb \tmp, \a1, \tmp, asr#16 + usub16 \a1, \a0, \tmp + uadd16 \a0, \a0, \tmp +.endm + +.macro two_doublebutterfly_plant a0, a1, a2, a3, twiddle0, twiddle1, tmp, q, qa + doublebutterfly_plant \a0, \a1, \twiddle0, \tmp, \q, \qa + doublebutterfly_plant \a2, \a3, \twiddle1, \tmp, \q, \qa +.endm + +//For 3329 +.macro fullplant a0, a1, a2, a3, a4, a5, a6, a7, tmp, q, qa, plantconst + movw \plantconst, #44984 + movt \plantconst, #19 + doubleplant \a0, \tmp, \q, \qa, \plantconst + doubleplant \a1, \tmp, \q, \qa, \plantconst + doubleplant \a2, \tmp, \q, \qa, \plantconst + doubleplant \a3, \tmp, \q, \qa, \plantconst + doubleplant \a4, \tmp, \q, \qa, \plantconst + doubleplant \a5, \tmp, \q, \qa, \plantconst + doubleplant \a6, \tmp, \q, \qa, \plantconst + doubleplant \a7, \tmp, \q, \qa, \plantconst +.endm + +#endif \ No newline at end of file diff --git a/crypto_sign/dilithium3/m4f/smallntt.S b/crypto_sign/dilithium3/m4f/smallntt.S deleted file mode 100644 index 747c111c..00000000 --- a/crypto_sign/dilithium3/m4f/smallntt.S +++ /dev/null @@ -1,837 +0,0 @@ -#include "macros.i" - -.syntax unified -.cpu cortex-m4 -.thumb - -// general macros -.macro load a, a0, a1, a2, a3, mem0, mem1, mem2, mem3 - ldr.w \a0, [\a, \mem0] - ldr.w \a1, [\a, \mem1] - ldr.w \a2, [\a, \mem2] - ldr.w \a3, [\a, \mem3] -.endm - -.macro store a, a0, a1, a2, a3, mem0, mem1, mem2, mem3 - str.w \a0, [\a, \mem0] - str.w \a1, [\a, \mem1] - str.w \a2, [\a, \mem2] - str.w \a3, [\a, \mem3] -.endm - -.macro montgomery q, qinv, a, tmp - smulbt \tmp, \a, \qinv - smlabb \tmp, \q, \tmp, \a -.endm - -.macro montgomery_inplace q, qinv, a, tmp - smulbt \tmp, \a, \qinv - smlabb \a, \q, \tmp, \a -.endm - -.macro doublemontgomery a, tmp, tmp2, q, qinv, montconst - smulbb \tmp2, \a, \montconst - montgomery \q, \qinv, \tmp2, \tmp - smultb \a, \a, \montconst - montgomery \q, \qinv, \a, \tmp2 - pkhtb \a, \tmp2, \tmp, asr#16 -.endm - -// ####### -// ####### -// # NTT # -// ####### -// ####### - -.macro mul_twiddle tb, a, twiddle, tmp, tmp2, q, qinv - smulb\tb \tmp, \a, \twiddle - smult\tb \a, \a, \twiddle - montgomery 
\q, \qinv, \tmp, \tmp2 // reduce -> result in tmp2 - montgomery \q, \qinv, \a, \tmp // reduce -> result in tmp2 - pkhtb \a, \tmp, \tmp2, asr#16 // combine results from above in one register as 16bit halves -.endm - -.macro doublebutterfly tb, a0, a1, twiddle, tmp, tmp2, q, qinv - smulb\tb \tmp, \a1, \twiddle // a1_b * twiddle_tb - smult\tb \a1, \a1, \twiddle // a1_t * twiddle_tb - montgomery \q, \qinv, \tmp, \tmp2 // reduce -> result in tmp2 - montgomery \q, \qinv, \a1, \tmp // reduce -> result in tmp - pkhtb \tmp2, \tmp, \tmp2, asr#16 // combine results from above in one register as 16bit halves - usub16 \a1, \a0, \tmp2 // a0 - a1 * twiddle (a0, a1 contain 2 coeffs) - uadd16 \a0, \a0, \tmp2 // a0 + a1 * twiddle (a0, a1 contain 2 coeffs) -.endm - -.macro two_doublebutterfly tb1, tb2, a0, a1, a2, a3, twiddle, tmp, tmp2, q, qinv - doublebutterfly \tb1, \a0, \a1, \twiddle, \tmp, \tmp2, \q, \qinv - doublebutterfly \tb2, \a2, \a3, \twiddle, \tmp, \tmp2, \q, \qinv -.endm - -.macro _3_layer_double_CT_16 c0, c1, c2, c3, c4, c5, c6, c7, twiddle, twiddle_ptr, Qprime, Q, tmp, tmp2 - // layer 3 - ldrh.w \twiddle, [\twiddle_ptr], #2 - two_doublebutterfly b, b, \c0, \c4, \c1, \c5, \twiddle, \tmp, \tmp2, \Q, \Qprime - two_doublebutterfly b, b, \c2, \c6, \c3, \c7, \twiddle, \tmp, \tmp2, \Q, \Qprime - - // layer 2 - ldr.w \twiddle, [\twiddle_ptr], #4 - two_doublebutterfly b, b, \c0, \c2, \c1, \c3, \twiddle, \tmp, \tmp2, \Q, \Qprime - - two_doublebutterfly t, t, \c4, \c6, \c5, \c7, \twiddle, \tmp, \tmp2, \Q, \Qprime - - // layer 1 - ldr.w \twiddle, [\twiddle_ptr], #4 - two_doublebutterfly b, t, \c0, \c1, \c2, \c3, \twiddle, \tmp, \tmp2, \Q, \Qprime - - ldr.w \twiddle, [\twiddle_ptr], #4 - two_doublebutterfly b, t, \c4, \c5, \c6, \c7, \twiddle, \tmp, \tmp2, \Q, \Qprime -.endm - -.macro _3_layer_double_CT_16_fp c0, c1, c2, c3, c4, c5, c6, c7, xi01, xi23, xi45, xi67, twiddle, Qprime, Q, tmp, tmp2 - // layer 3 - vmov \twiddle, \xi01 - two_doublebutterfly t, t, \c0, \c4, \c1, \c5, \twiddle, \tmp, \tmp2, \Q, \Qprime - two_doublebutterfly t, t, \c2, \c6, \c3, \c7, \twiddle, \tmp, \tmp2, \Q, \Qprime - - // layer 2 - vmov \twiddle, \xi23 - two_doublebutterfly b, b, \c0, \c2, \c1, \c3, \twiddle, \tmp, \tmp2, \Q, \Qprime - - two_doublebutterfly t, t, \c4, \c6, \c5, \c7, \twiddle, \tmp, \tmp2, \Q, \Qprime - - // layer 1 - vmov \twiddle, \xi45 - two_doublebutterfly b, t, \c0, \c1, \c2, \c3, \twiddle, \tmp, \tmp2, \Q, \Qprime - - vmov \twiddle, \xi67 - two_doublebutterfly b, t, \c4, \c5, \c6, \c7, \twiddle, \tmp, \tmp2, \Q, \Qprime -.endm - -.global small_ntt_asm -.type small_ntt_asm, %function -.align 2 -small_ntt_asm: - push {r4-r11, r14} - vpush.w {s16} - - poly .req r0 - twiddle_ptr .req r1 - poly0 .req r2 - poly1 .req r3 - poly2 .req r4 - poly3 .req r5 - poly4 .req r6 - poly5 .req r7 - poly6 .req r8 - poly7 .req r9 - twiddle .req r10 - qinv .req r11 - q .req r11 - tmp .req r12 - tmp2 .req r14 - - movw q, #769 - movt qinv, #767 - - ### LAYER 7+6+5+4 - .equ distance, 256 - .equ offset, 32 - .equ strincr, 4 - // pre-load twiddle factors to FPU registers - vldm twiddle_ptr!, {s8-s15} - - - add tmp, poly, #strincr*8 - vmov s16, tmp - 1: - // load a1, a3, ..., a15 - load poly, poly0, poly1, poly2, poly3, #offset, #distance/4+offset, #2*distance/4+offset, #3*distance/4+offset - load poly, poly4, poly5, poly6, poly7, #distance+offset, #5*distance/4+offset, #6*distance/4+offset, #7*distance/4+offset - - // 8-NTT on a1, a3, ..., a15 - _3_layer_double_CT_16_fp poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s8, s9, 
s10, s11, twiddle, qinv, q, tmp, tmp2 - - // multiply coeffs by layer 4 twiddles for later use - vmov twiddle, s12 - mul_twiddle b, poly0, twiddle, tmp, tmp2, q, qinv - mul_twiddle t, poly1, twiddle, tmp, tmp2, q, qinv - - vmov twiddle, s13 - mul_twiddle b, poly2, twiddle, tmp, tmp2, q, qinv - mul_twiddle t, poly3, twiddle, tmp, tmp2, q, qinv - - vmov twiddle, s14 - mul_twiddle b, poly4, twiddle, tmp, tmp2, q, qinv - mul_twiddle t, poly5, twiddle, tmp, tmp2, q, qinv - - vmov twiddle, s15 - mul_twiddle b, poly6, twiddle, tmp, tmp2, q, qinv - mul_twiddle t, poly7, twiddle, tmp, tmp2, q, qinv - - vmov s0, poly0 // a1 - vmov s1, poly1 // a3 - vmov s2, poly2 // a5 - vmov s3, poly3 // a7 - vmov s4, poly4 // a9 - vmov s5, poly5 // a11 - vmov s6, poly6 // a13 - vmov s7, poly7 // a15 - - // ---------- - - // load a0, a2, ..., a14 - load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4 - load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 - - // 8-NTT on a0, a2, ..., a14 - _3_layer_double_CT_16_fp poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s8, s9, s10, s11, twiddle, qinv, q, tmp, tmp2 - - // layer 4 - 1 - // addsub: (a2, a6, a10, a14), (a3, a7, a11, a15) - vmov tmp2, s1 // load a3 - vmov s1, poly0 // preserve a0 - uadd16 poly0, poly1, tmp2 - usub16 poly1, poly1, tmp2 - - vmov tmp2, s3 // load a7 - vmov s3, poly2 // preserve a4 - uadd16 poly2, poly3, tmp2 - usub16 poly3, poly3, tmp2 - - vmov tmp2, s5 // load a11 - vmov s5, poly4 // preserve a8 - uadd16 poly4, poly5, tmp2 - usub16 poly5, poly5, tmp2 - - vmov tmp2, s7 // load a15 - vmov s7, poly6 // preserve a12 - uadd16 poly6, poly7, tmp2 - usub16 poly7, poly7, tmp2 - - str.w poly0, [poly, #1*distance/4] - str.w poly1, [poly, #1*distance/4+offset] - str.w poly2, [poly, #3*distance/4] - str.w poly3, [poly, #3*distance/4+offset] - str.w poly4, [poly, #5*distance/4] - str.w poly5, [poly, #5*distance/4+offset] - str.w poly6, [poly, #7*distance/4] - str.w poly7, [poly, #7*distance/4+offset] - - // layer 4 - 2 - // addsub: (a0, a4, a8, a12), (a1, a5, a9, a13) - vmov tmp2, s1 // load a0 - vmov poly1, s0 // load a1 - uadd16 poly0, tmp2, poly1 - usub16 poly1, tmp2, poly1 - - vmov tmp2, s3 // load a4 - vmov poly3, s2 // load a5 - uadd16 poly2, tmp2, poly3 - usub16 poly3, tmp2, poly3 - - vmov tmp2, s5 // load a8 - vmov poly5, s4 // load a9 - uadd16 poly4, tmp2, poly5 - usub16 poly5, tmp2, poly5 - - vmov tmp2, s7 // load a12 - vmov poly7, s6 // load a13 - uadd16 poly6, tmp2, poly7 - usub16 poly7, tmp2, poly7 - - str.w poly1, [poly, #offset] - str.w poly2, [poly, #2*distance/4] - str.w poly3, [poly, #2*distance/4+offset] - str.w poly4, [poly, #4*distance/4] - str.w poly5, [poly, #4*distance/4+offset] - str.w poly6, [poly, #6*distance/4] - str.w poly7, [poly, #6*distance/4+offset] - str.w poly0, [poly], #4 - - vmov tmp, s16 - cmp.w poly, tmp - bne.w 1b - - sub.w poly, #8*strincr - - ### LAYER 3+2+1 - - .equ distance, distance/16 - .equ strincr, 32 - - add.w tmp, poly, #strincr*16 - vmov s13, tmp - - 2: - load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4 - load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 - - _3_layer_double_CT_16 poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle, twiddle_ptr, qinv, q, tmp, tmp2 - - store poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 - str.w poly1, [poly, #distance/4] - str.w poly2, [poly, #2*distance/4] - str.w 
poly3, [poly, #3*distance/4] - str.w poly0, [poly], #strincr - - vmov tmp, s13 - cmp.w poly, tmp - bne.w 2b - - vpop.w {s16} - pop {r4-r11, pc} - - -.unreq poly -.unreq twiddle_ptr -.unreq poly0 -.unreq poly1 -.unreq poly2 -.unreq poly3 -.unreq poly4 -.unreq poly5 -.unreq poly6 -.unreq poly7 -.unreq twiddle -.unreq qinv -.unreq q -.unreq tmp -.unreq tmp2 - -// ######## -// ######## -// # INTT # -// ######## -// ######## - -.macro doublebutterfly_light a0, a1, tmp, tmp2, q, qinv - uadd16 \tmp, \a0, \a1 - usub16 \a1, \a0, \a1 - mov.w \a0, \tmp -.endm - -.macro two_doublebutterfly_light a0, a1, a2, a3, tmp, tmp2, q, qinv - doublebutterfly_light \a0, \a1, \tmp, \tmp2, \q, \qinv - doublebutterfly_light \a2, \a3, \tmp, \tmp2, \q, \qinv -.endm - -.macro _3_layer_double_inv_CT_16_light c0, c1, c2, c3, c4, c5, c6, c7, xi0, xi12, xi34, xi56, twiddle, q, qinv, tmp, tmp2 - - // layer 1 - sadd16.w \tmp, \c0, \c1 // c0, c1 - ssub16.w \c1, \c0, \c1 - sadd16.w \tmp2, \c2, \c3 // c2, c3 - ssub16.w \c3, \c2, \c3 - - sadd16.w \c0, \c4, \c5 // c4, c5 - ssub16.w \c5, \c4, \c5 - sadd16.w \c2, \c6, \c7 // c6, c7 - ssub16.w \c7, \c6, \c7 - // c4, c6 are free at this point - - // layer 2 - sadd16.w \c6, \tmp, \tmp2 // c0, c2 - ssub16.w \tmp2, \tmp, \tmp2 - sadd16.w \c4, \c0, \c2 // c4, c6 - ssub16.w \c2, \c0, \c2 - - vmov.w \twiddle, \xi12 - doublebutterfly t, \c1, \c3, \twiddle, \tmp, \c0, \q, \qinv // c0 has been used and c6 still free - doublebutterfly t, \c5, \c7, \twiddle, \tmp, \c0, \q, \qinv - // c0, c6 are free at this point - - // layer 3 - sadd16.w \c0, \c6, \c4 // c0, c4 - ssub16.w \c4, \c6, \c4 - - vmov.w \twiddle, \xi34 - doublebutterfly t, \c1, \c5, \twiddle, \tmp, \c6, \q, \qinv - - vmov.w \twiddle, \xi56 - // this block is one doublebutterfly - smulbb \tmp, \c2, \twiddle // c2, c6 - smultb \c2, \c2, \twiddle - montgomery_inplace \q, \qinv, \tmp, \c6 - montgomery_inplace \q, \qinv, \c2, \c6 - pkhtb \tmp, \c2, \tmp, asr #16 - ssub16.w \c6, \tmp2, \tmp - sadd16.w \c2, \tmp2, \tmp - - doublebutterfly t, \c3, \c7, \twiddle, \tmp, \tmp2, \q, \qinv - -.endm - -.macro _3_layer_double_inv_CT_16_light_reduce c0, c1, c2, c3, c4, c5, c6, c7, xi0, xi12, xi34, xi56, twiddle, q, qinv, tmp, tmp2 - - // layer 1 - sadd16.w \tmp, \c0, \c1 // c0, c1 - ssub16.w \c1, \c0, \c1 - sadd16.w \tmp2, \c2, \c3 // c2, c3 - ssub16.w \c3, \c2, \c3 - - sadd16.w \c0, \c4, \c5 // c4, c5 - ssub16.w \c5, \c4, \c5 - sadd16.w \c2, \c6, \c7 // c6, c7 - ssub16.w \c7, \c6, \c7 - // c4, c6 are free at this point - - mov.w \c6, \tmp - mov.w \c4, \c0 - - // layer 2 - vmov.w \twiddle, \xi12 - doublebutterfly b, \c6, \tmp2, \twiddle, \tmp, \c0, \q, \qinv - doublebutterfly b, \c4, \c2, \twiddle, \tmp, \c0, \q, \qinv - doublebutterfly t, \c1, \c3, \twiddle, \tmp, \c0, \q, \qinv // c0 has been used and c6 still free - doublebutterfly t, \c5, \c7, \twiddle, \tmp, \c0, \q, \qinv - // c0, c6 are free at this point - - // layer 3 - sadd16.w \c0, \c6, \c4 // c0, c4 - ssub16.w \c4, \c6, \c4 - - vmov.w \twiddle, \xi34 - doublebutterfly t, \c1, \c5, \twiddle, \tmp, \c6, \q, \qinv - - vmov.w \twiddle, \xi56 - // this block is one doublebutterfly - smulbb \tmp, \c2, \twiddle // c2, c6 - smultb \c2, \c2, \twiddle - montgomery_inplace \q, \qinv, \tmp, \c6 - montgomery_inplace \q, \qinv, \c2, \c6 - pkhtb \tmp, \c2, \tmp, asr #16 - ssub16.w \c6, \tmp2, \tmp - sadd16.w \c2, \tmp2, \tmp - - doublebutterfly t, \c3, \c7, \twiddle, \tmp, \tmp2, \q, \qinv - -.endm - -.macro _3_layer_double_inv_CT_16 c0, c1, c2, c3, c4, c5, c6, c7, twiddle, twiddle_ptr, Qprime, Q, tmp, 
tmp2 - // layer 3 - ldrh.w twiddle, [twiddle_ptr], #2 - two_doublebutterfly b, b, \c0, \c1, \c2, \c3, \twiddle, \tmp, \tmp2, \Q, \Qprime - two_doublebutterfly b, b, \c4, \c5, \c6, \c7, \twiddle, \tmp, \tmp2, \Q, \Qprime - - // layer 2 - ldr.w twiddle, [twiddle_ptr], #4 - two_doublebutterfly b, t, \c0, \c2, \c1, \c3, \twiddle, \tmp, \tmp2, \Q, \Qprime - - two_doublebutterfly b, t, \c4, \c6, \c5, \c7, \twiddle, \tmp, \tmp2, \Q, \Qprime - - // layer 1 - ldr.w twiddle, [twiddle_ptr], #4 - two_doublebutterfly b, t, \c0, \c4, \c1, \c5, \twiddle, \tmp, \tmp2, \Q, \Qprime - - ldr.w twiddle, [twiddle_ptr], #4 - two_doublebutterfly b, t, \c2, \c6, \c3, \c7, \twiddle, \tmp, \tmp2, \Q, \Qprime -.endm - -.macro mul_twiddle_barrett_32 tb a, twiddle, Qbar, Q, tmp, tmp2 - smulb\tb \tmp, \a, \twiddle - smmulr.w \tmp2, \tmp, \Qbar - mls.w \tmp, \tmp2, \Q, \tmp - smult\tb \a, \a, \twiddle - smmulr.w \tmp2, \a, \Qbar - mls.w \a, \tmp2, \Q, \a - pkhbt \a, \tmp, \a, lsl #16 -.endm - -.macro _3_layer_double_inv_twist_16 c0, c1, c2, c3, c4, c5, c6, c7, twiddle, twiddle_ptr, Qbar, Q, tmp, tmp2 - - movt \Q, #0 - - ldr.w \twiddle, [\twiddle_ptr], #4 - - mul_twiddle_barrett_32 b, \c0, \twiddle, \Qbar, \Q, \tmp, \tmp2 - mul_twiddle_barrett_32 t, \c1, \twiddle, \Qbar, \Q, \tmp, \tmp2 - - ldr.w \twiddle, [\twiddle_ptr], #4 - - mul_twiddle_barrett_32 b, \c2, \twiddle, \Qbar, \Q, \tmp, \tmp2 - mul_twiddle_barrett_32 t, \c3, \twiddle, \Qbar, \Q, \tmp, \tmp2 - - ldr.w \twiddle, [\twiddle_ptr], #4 - - mul_twiddle_barrett_32 b, \c4, \twiddle, \Qbar, \Q, \tmp, \tmp2 - mul_twiddle_barrett_32 t, \c5, \twiddle, \Qbar, \Q, \tmp, \tmp2 - - ldr.w \twiddle, [\twiddle_ptr], #4 - - mul_twiddle_barrett_32 b, \c6, \twiddle, \Qbar, \Q, \tmp, \tmp2 - mul_twiddle_barrett_32 t, \c7, \twiddle, \Qbar, \Q, \tmp, \tmp2 - - movt \Q, #767 - -.endm - -.global small_invntt_tomont_asm -.type small_invntt_tomont_asm, %function -.align 2 -small_invntt_tomont_asm: - push {r4-r11, r14} - - poly .req r0 - twiddle_ptr .req r1 - poly0 .req r2 - poly1 .req r3 - poly2 .req r4 - poly3 .req r5 - poly4 .req r6 - poly5 .req r7 - poly6 .req r8 - poly7 .req r9 - twiddle .req r10 - qinv .req r11 - q .req r11 - tmp .req r12 - tmp2 .req r14 - - movw q, #769 - movt qinv, #767 - - ### LAYER 7+6+5+4 - .equ distance, 16 - .equ offset, 32 - .equ strincr, 64 - - // pre-load twiddle factors to FPU registers - vldm twiddle_ptr!, {s8-s15} - - add.w tmp, poly, #8*strincr - vmov s8, tmp - 1: - // load a1, a3, ..., a15 - load poly, poly0, poly1, poly2, poly3, #offset, #distance/4+offset, #2*distance/4+offset, #3*distance/4+offset - load poly, poly4, poly5, poly6, poly7, #distance+offset, #5*distance/4+offset, #6*distance/4+offset, #7*distance/4+offset - - // NTT on a1, a3, ..., a15 - _3_layer_double_inv_CT_16_light poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s8, s9, s10, s11, twiddle, q, qinv, tmp, tmp2 - - // multiply coeffs by layer 4 twiddles for later use - vmov twiddle, s12 - mul_twiddle b, poly0, twiddle, tmp, tmp2, q, qinv // could be omitted but kept for reduction only - mul_twiddle t, poly1, twiddle, tmp, tmp2, q, qinv - - vmov twiddle, s13 - mul_twiddle b, poly2, twiddle, tmp, tmp2, q, qinv - mul_twiddle t, poly3, twiddle, tmp, tmp2, q, qinv - - vmov twiddle, s14 - mul_twiddle b, poly4, twiddle, tmp, tmp2, q, qinv - mul_twiddle t, poly5, twiddle, tmp, tmp2, q, qinv - - vmov twiddle, s15 - mul_twiddle b, poly6, twiddle, tmp, tmp2, q, qinv - mul_twiddle t, poly7, twiddle, tmp, tmp2, q, qinv - - vmov s0, poly0 // a1 - vmov s1, poly1 // a3 - vmov s2, poly2 // a5 
- vmov s3, poly3 // a7 - vmov s4, poly4 // a9 - vmov s5, poly5 // a11 - vmov s6, poly6 // a13 - vmov s7, poly7 // a15 - - // ---------- - - // load a0, a2, ..., a14 - load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4 - load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 - - // NTT on a0, a2, ..., a14 - _3_layer_double_inv_CT_16_light poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s8, s9, s10, s11, twiddle, q, qinv, tmp, tmp2 - - // layer 4 - 1 - // addsub: (a2, a6, a10, a14), (a3, a7, a11, a15) - vmov tmp2, s1 // load a3 - vmov s1, poly0 // preserve a0 - uadd16 poly0, poly1, tmp2 - usub16 poly1, poly1, tmp2 - - vmov tmp2, s3 // load a7 - vmov s3, poly2 // preserve a4 - uadd16 poly2, poly3, tmp2 - usub16 poly3, poly3, tmp2 - - vmov tmp2, s5 // load a11 - vmov s5, poly4 // preserve a8 - uadd16 poly4, poly5, tmp2 - usub16 poly5, poly5, tmp2 - - vmov tmp2, s7 // load a15 - vmov s7, poly6 // preserve a12 - uadd16 poly6, poly7, tmp2 - usub16 poly7, poly7, tmp2 - - str.w poly0, [poly, #1*distance/4] - str.w poly1, [poly, #1*distance/4+offset] - str.w poly2, [poly, #3*distance/4] - str.w poly3, [poly, #3*distance/4+offset] - str.w poly4, [poly, #5*distance/4] - str.w poly5, [poly, #5*distance/4+offset] - str.w poly6, [poly, #7*distance/4] - str.w poly7, [poly, #7*distance/4+offset] - - // layer 4 - 2 - // addsub: (a0, a4, a8, a12), (a1, a5, a9, a13) - vmov tmp2, s1 // load a0 - vmov poly1, s0 // load a1 - uadd16 poly0, tmp2, poly1 - usub16 poly1, tmp2, poly1 - - vmov tmp2, s3 // load a4 - vmov poly3, s2 // load a5 - uadd16 poly2, tmp2, poly3 - usub16 poly3, tmp2, poly3 - - vmov tmp2, s5 // load a8 - vmov poly5, s4 // load a9 - uadd16 poly4, tmp2, poly5 - usub16 poly5, tmp2, poly5 - - vmov tmp2, s7 // load a12 - vmov poly7, s6 // load a13 - uadd16 poly6, tmp2, poly7 - usub16 poly7, tmp2, poly7 - - str.w poly1, [poly, #offset] - str.w poly2, [poly, #2*distance/4] - str.w poly3, [poly, #2*distance/4+offset] - str.w poly4, [poly, #4*distance/4] - str.w poly5, [poly, #4*distance/4+offset] - str.w poly6, [poly, #6*distance/4] - str.w poly7, [poly, #6*distance/4+offset] - str.w poly0, [poly], #strincr // increase 2*8*4 = 64 (2 * 8 loads of 4 bytes each) - - vmov tmp, s8 - cmp.w poly, tmp - bne.w 1b - - sub.w poly, #8*strincr - - ### LAYER 3+2+1 - .equ distance, distance*16 - .equ strincr, 4 - - // ITER 0 - load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4 - load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 - - vldm twiddle_ptr!, {s5-s7} - - _3_layer_double_inv_CT_16_light_reduce poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s5, s5, s6, s7, twiddle, q, qinv, tmp, tmp2 - - vmov.w s2, poly - movw poly, #:lower16:5585133 - movt poly, #:upper16:5585133 - - // twisting - _3_layer_double_inv_twist_16 poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle, twiddle_ptr, poly, q, tmp, tmp2 - - vmov.w poly, s2 - - store poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 - str.w poly1, [poly, #distance/4] - str.w poly2, [poly, #2*distance/4] - str.w poly3, [poly, #3*distance/4] - str.w poly0, [poly], #4 - - // ITER 1-12 - add.w tmp, poly, #strincr*3*(3+1) - vmov s14, tmp - 3: - add.w tmp, poly, #strincr*3 - vmov s13, tmp - 2: - // polys upto 6q - load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4 - load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, 
#6*distance/4, #7*distance/4 - - - _3_layer_double_inv_CT_16 poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle, twiddle_ptr, qinv, q, tmp, tmp2 - - vmov.w s2, poly - movw poly, #:lower16:5585133 - movt poly, #:upper16:5585133 - - // twisting - _3_layer_double_inv_twist_16 poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle, twiddle_ptr, poly, q, tmp, tmp2 - - vmov.w poly, s2 - - store poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 - str.w poly1, [poly, #distance/4] - str.w poly2, [poly, #2*distance/4] - str.w poly3, [poly, #3*distance/4] - str.w poly0, [poly], #4 - - vmov tmp, s13 - cmp.w poly, tmp - bne.w 2b - - // polys upto 9q - load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4 - load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 - - _3_layer_double_inv_CT_16 poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle, twiddle_ptr, qinv, q, tmp, tmp2 - - vmov.w s2, poly - movw poly, #:lower16:5585133 - movt poly, #:upper16:5585133 - - // twisting - _3_layer_double_inv_twist_16 poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle, twiddle_ptr, poly, q, tmp, tmp2 - - vmov.w poly, s2 - - store poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 - str.w poly1, [poly, #distance/4] - str.w poly2, [poly, #2*distance/4] - str.w poly3, [poly, #3*distance/4] - str.w poly0, [poly], #4 - - vmov tmp, s14 - cmp.w poly, tmp - bne.w 3b - - // ITER 13-15 - add tmp, poly, #3*strincr - vmov s13, tmp - 2: - // polys upto 6q - load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4 - load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 - - _3_layer_double_inv_CT_16 poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle, twiddle_ptr, qinv, q, tmp, tmp2 - - vmov.w s2, poly - movw poly, #:lower16:5585133 - movt poly, #:upper16:5585133 - - // twisting - _3_layer_double_inv_twist_16 poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle, twiddle_ptr, poly, q, tmp, tmp2 - - vmov.w poly, s2 - - store poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 - str.w poly1, [poly, #distance/4] - str.w poly2, [poly, #2*distance/4] - str.w poly3, [poly, #3*distance/4] - str.w poly0, [poly], #strincr - - vmov tmp, s13 - cmp.w poly, tmp - bne.w 2b - - pop {r4-r11, pc} - -.unreq poly -.unreq twiddle_ptr -.unreq poly0 -.unreq poly1 -.unreq poly2 -.unreq poly3 -.unreq poly4 -.unreq poly5 -.unreq poly6 -.unreq poly7 -.unreq twiddle -.unreq qinv -.unreq q -.unreq tmp -.unreq tmp2 - -.align 2 -.global small_pointmul_asm -.type small_pointmul_asm, %function -small_pointmul_asm: - push.w {r4-r11, lr} - - movw r14, #769 - movt r14, #767 - - .equ width, 4 - - add.w r12, r2, #64*2 - _point_mul_16_loop: - - ldr.w r7, [r1, #2*width] - ldr.w r8, [r1, #3*width] - ldrsh.w r9, [r2, #1*2] - ldr.w r5, [r1, #1*width] - ldr.w r4, [r1], #4*width - ldrsh.w r6, [r2], #2*2 - - smultb r10, r4, r6 - montgomery r14, r14, r10, r11 - pkhbt r4, r4, r11 - - - neg.w r6, r6 - - smultb r10, r5, r6 - montgomery r14, r14, r10, r11 - pkhbt r5, r5, r11 - - str.w r5, [r0, #1*width] - str.w r4, [r0], #2*width - - smultb r10, r7, r9 - montgomery r14, r14, r10, r11 - pkhbt r7, r7, r11 - - neg.w r9, r9 - - smultb r10, r8, r9 - montgomery r14, r14, r10, r11 - pkhbt r8, r8, r11 - - str.w r8, [r0, #1*width] - str.w r7, [r0], #2*width - - cmp.w r2, r12 - 
bne.w _point_mul_16_loop - - pop.w {r4-r11, pc} - - .align 2 -.global small_asymmetric_mul_asm -.type small_asymmetric_mul_asm, %function -small_asymmetric_mul_asm: - push.w {r4-r11, lr} - - movw r14, #769 - movt r14, #767 - .equ width, 4 - add.w r12, r0, #256*2 - _asymmetric_mul_16_loop: - ldr.w r7, [r1, #width] - ldr.w r4, [r1], #2*width - ldr.w r8, [r2, #width] - ldr.w r5, [r2], #2*width - ldr.w r9, [r3, #width] - ldr.w r6, [r3], #2*width - - smuad r10, r4, r6 - montgomery r14, r14, r10, r6 - smuadx r11, r4, r5 - montgomery r14, r14, r11, r10 - - pkhtb r10, r10, r6, asr#16 - - str.w r10, [r0], #width - - smuad r10, r7, r9 - montgomery r14, r14, r10, r6 - smuadx r11, r7, r8 - montgomery r14, r14, r11, r10 - - pkhtb r10, r10, r6, asr#16 - str.w r10, [r0], #width - - - cmp.w r0, r12 - bne.w _asymmetric_mul_16_loop - - pop.w {r4-r11, pc} \ No newline at end of file diff --git a/crypto_sign/dilithium3/m4f/smallntt.h b/crypto_sign/dilithium3/m4f/smallntt.h index 0aa0ce9b..2927ff4d 100644 --- a/crypto_sign/dilithium3/m4f/smallntt.h +++ b/crypto_sign/dilithium3/m4f/smallntt.h @@ -1,53 +1,48 @@ +/** + * Copyright (c) 2023 Junhao Huang (jhhuang_nuaa@126.com) + * + * Licensed under the Apache License, Version 2.0(the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ #ifndef SMALLNTT_H #define SMALLNTT_H #include #include "params.h" -static const int16_t zetas[64] = { --23, 112, -151, -134, -52, -148, 227, 232, --71, 212, 236, 21, 341, 379, -202, -220, -352, 292, 238, 145, 194, -276, 70, -274, -117, 333, 66, 247, -237, -83, -252, -244, -331, -241, 167, 357, -355, 291, -358, 105, -115, -209, 14, 99, -260, 29, 366, -378, -318, 278, 353, 354, -184, 127, 330, -303, 222, -78, -348, -44, 201, 158, 350, 168 -}; - -static const int16_t zetas_asm[128] = { -0, -164, -81, 361, 186, -3, -250, -120, -308, 129, -16, -223, -362, -143, 131, -337, --76, 147, -114, -23, 112, -151, -134, --98, -272, 54, -52, -148, 227, 232, -36, -2, -124, -71, 212, 236, 21, --75, -80, -346, 341, 379, -202, -220, --339, 86, -51, 352, 292, 238, 145, --255, 364, 267, 194, -276, 70, -274, -282, 161, -15, 117, 333, 66, 247, --203, 288, 169, -237, -83, -252, -244, --34, 191, 307, 331, -241, 167, 357, -199, -50, -24, -355, 291, -358, 105, -178, -170, 226, -115, -209, 14, 99, -270, 121, -188, -260, 29, 366, -378, --10, -380, 279, -318, 278, 353, 354, -149, 180, -375, -184, 127, 330, -303, -369, -157, 263, 222, -78, -348, -44, --192, -128, -246, 201, 158, 350, 168 -}; - -static const int16_t zetas_inv_CT_asm[256] = { -0, 171, 171, 164, 171, -361, 164, 81, 171, 120, -361, 3, 164, 250, 81, -186, -171, 164, 171, -361, 164, 81, -257, 49, -141, -18, -215, 38, 283, 347, 337, 192, -369, 246, -263, 128, 157, 239, -264, 179, 301, -207, 219, -332, -206, 120, 337, -131, 192, -149, -369, 10, 62, 57, 40, 136, 1, 311, -173, 27, 223, 203, -282, -169, 15, -288, -161, 74, -56, 271, -309, 26, -373, 116, -67, -361, 120, 250, 337, 143, -131, 362, -383, 82, 125, -344, -93, 299, -60, -204, 143, -270, -178, 188, -226, -121, 170, 39, -175, 174, 284, -111, 84, -22, 79, 3, 223, 16, 203, 255, -282, 339, 245, 64, -90, 
-306, 190, -123, 197, -253, -129, 75, -36, 346, 124, 80, 2, 218, 126, -33, -266, 326, -122, -261, 343, 164, -361, 81, 120, 3, 250, -186, 285, 200, -89, 5, 17, -96, 135, -310, -131, -149, 10, 375, -279, -180, 380, -280, -183, -7, 130, -327, -189, -335, -370, 250, 143, 362, -270, -199, -178, 34, -359, -144, -182, 304, -43, -300, -251, 377, 16, 255, 339, -267, 51, -364, -86, -106, 101, -118, 214, -349, -110, -374, -195, 81, 3, -186, 223, -129, 16, 308, 320, 319, 8, 181, 154, 216, 273, 313, 362, -199, 34, 24, -307, 50, -191, -139, -165, 208, 92, 159, 233, 177, -321, -186, -129, 308, 75, 98, -36, 76, 231, 324, 25, 85, 289, -94, -12, 113, 308, 98, 76, -54, 114, 272, -147, -146, -35, -119, -97, -176, -137, -312, -138, -}; - - -#define SMALL_Q 769 - -void small_ntt_asm(int16_t a[N], const int16_t * zetas); -void small_invntt_tomont_asm(int16_t a[N], const int16_t * zetas); -void small_pointmul_asm(int16_t out[N], const int16_t in[N], const int16_t *zetas); -void small_asymmetric_mul_asm(int16_t c[256], const int16_t a[256], const int16_t b[256], const int16_t b_prime[256]); - -#define small_ntt(a) small_ntt_asm(a, zetas_asm) -#define small_invntt_tomont(a) small_invntt_tomont_asm(a, zetas_inv_CT_asm) -#define small_point_mul(out, in) small_pointmul_asm(out, in, zetas) -#define small_asymmetric_mul(c, a, b, b_prime) small_asymmetric_mul_asm(c, a, b, b_prime); +#define SMALL_Q 769 + +static const int32_t zetas_769[64] = { + 3138844760, 1334846793, 999738812, 1854264165, 1681125041, 1150537404, 2820492178, 3071823164, 726067294, 2066499220, 3272887953, 1055590142, 4255871365, 1871019564, 2731130050, 1826338500, 513832239, 1792827701, 3373420347, 2993631302, 1161707670, 3306398751, 3518633806, 3406931146, 1586177780, 3853741788, 3317569017, 3825816122, 971813147, 122872927, 217820188, 619949766, 3753209393, 770748358, 4099487641, 765163225, 3630336467, 1742561504, 3479537875, 982983413, 2809321912, 2379266669, 703726762, 681386230, 4110657907, 1457719720, 1217559000, 2474213930, 1195218468, 1089100940, 564098436, 614364633, 3635921600, 2088839752, 3702943196, 1949211426, 2569161192, 374203913, 3982199847, 2083254619, 1513571050, 3647091866, 413299844, 4149753838}; + +static const int32_t zetas_asm_769[128] = { + 346278248, 223405321, 966228013, 759578091, -150798592, 318352582, -1736976371, 1697880440, -2105595150, -804259156, 1675539907, -1016494210, 1401868389, -2005062756, 240160720, 474736307, -1200803600, -1435379187, -1156122536, 1334846793, 999738811, 1854264164, -631120032, -787503756, -1580592646, 1681125040, 1150537403, -1474475119, -1223144132, 1809583100, -100532394, -1938041160, 726067293, 2066499219, -1022079344, 1055590142, 525002504, 273671518, -212235055, -39095931, 1871019563, -1563837247, 1826338499, 139628326, 27925665, 1731391238, 513832238, 1792827701, -921546949, -1301335995, 67021596, 1117026605, 536172770, 1161707669, -988568545, -776333490, -888036151, 1290165729, -497076839, -753992958, 1586177779, -441225509, -977398279, -469151174, -1614103444, 1591762912, -94947261, 971813146, 122872927, 217820188, 619949766, -1709050706, 1010909077, -1748146637, -541757903, 770748357, -195479656, 765163224, 1413038655, 1781657435, -1206388733, -664630830, 1742561504, -815429422, 982983412, 357448514, 44681064, -1524741316, -1485645385, -1915700627, 703726761, 681386229, 686971362, 1787242568, -860110486, -184309390, 1457719719, 1217558999, -1820753366, -502661972, -1921285760, 1139367137, 1195218467, 1089100940, 564098435, 614364633, -1100271206, 457980908, -1669954774, -659045697, 
2088839751, -592024101, 1949211426, 1368357591, 698141628, 335107981, -1725806105, 374203913, -312767449, 2083254618, -1061175275, -2139105948, 519417371, 1513571050, -647875431, 413299844, -145213459, 0}; + +// INTT with CT butterfly +static const int32_t zetas_inv_asm_769[256] = { + 5585134, 5585134, -346278248, 5585134, -966228013, -346278248, -223405321, 5585134, 1736976371, -966228013, 150798592, -346278248, -318352582, -223405321, -759578091, + // removed first "2285" + LAYER 3+2+1 - 1 - butterfly + 5585134, -346278248, 5585134, -966228013, -346278248, -223405321, 636705165, 446810642, 1519156183, 11170266, -821014555, -1932456027, 301597183, -692556495, -240160720, 1061175275, -1368357591, -519417371, -335107981, 2139105948, -698141628, -625534899, -1267825197, 843355087, 290426917, 128458060, 1295750862, -748407825, -826599688, 1736976371, -240160720, 2005062756, 1061175275, 1100271206, -1368357591, 502661972, 915961816, 1396283256, 452395775, -1038834743, -955057747, -670215963, 2016233022, -16755399, -1675539907, 1614103444, -1290165729, 94947261, 753992958, -1591762912, 497076839, -1954796559, 1943626293, -1122611738, -1239899531, 938302348, -245745853, 882451018, -435640376, -966228013, 1736976371, -318352582, -240160720, -1401868389, 2005062756, 1016494210, 714897027, -1005323944, 876865885, 2122350549, -1373942724, -2094424884, 1468889985, 1558252114, -1401868389, -686971362, -357448514, 860110486, 1524741316, -1787242568, -44681064, 1407453522, -368618780, 1323676527, -653460564, -1362772458, 1379527857, -463566041, 1859849297, 150798592, -1675539907, 804259156, 1614103444, -67021596, -1290165729, -139628326, -2060914086, -994153678, 55851330, 189894523, -1072345541, 1507985917, 832184821, 1111441472, 2105595150, -525002504, -1809583100, 212235055, 1938041160, -273671518, 100532394, -2044158687, -78191862, 1452134586, 642290298, -2111180283, 552928169, 161968858, -1167292802, -346278248, -966228013, -223405321, 1736976371, 150798592, -318352582, -759578091, -1608518311, -2032988421, -899206417, -480321440, 943887481, 1491230518, -83776995, -284841784, 2005062756, 1100271206, 502661972, 1669954774, -1139367137, -457980908, 1921285760, 1128196871, -1318091394, -1904530361, 396544445, -1228729265, 117287794, 2116765416, 1184048201, -318352582, -1401868389, 1016494210, -686971362, -1413038655, -357448514, 1709050706, -731652426, 89362128, 2021818155, 1720220972, -1882189829, -1245484665, -798674023, 720482160, 804259156, -67021596, -139628326, -536172770, -1731391238, -1117026605, -27925665, -1843093898, -1971551958, 1027664477, 1776072302, -1692295306, 1977137091, 709311894, 1552666981, -223405321, 150798592, -759578091, -1675539907, 2105595150, 804259156, -1697880440, -675801096, 279256651, 949472614, -1066760408, -1050005009, -134043193, 1262240064, 1714635839, 1016494210, -1413038655, 1709050706, 1206388733, 1748146637, -1781657435, -1010909077, -390959312, -1329261660, -1083515807, -1965966825, -1530326449, 809844289, -1541496715, 1630858843, -759578091, 2105595150, -1697880440, -525002504, 631120032, -1809583100, -474736307, -1575007513, -201064789, 1893360095, 424470110, -1133782004, -418884977, -1424208921, -547343036, -1697880440, 631120032, -474736307, 1580592646, 1435379187, 787503756, 1200803600, 1999477623, -932717215, 1982722224, -1848679031, 586438968, 1993892490, 1625273710, -1346017059, 0}; + +// Q1=769 +void small_ntt_asm_769(int16_t a[N], const int32_t *zetas); +void small_invntt_asm_769(int16_t a[N], const int32_t *zetas); +void small_pointmul_asm_769(int16_t 
out[N], const int16_t in[N], const int32_t *zetas); +void small_asymmetric_mul_asm_769(int16_t c[N], const int16_t a[N], const int16_t b[N], const int16_t b_prime[N]); + +// small NTT for computing cs0 and cs1; default use 769 as modulus. +#define small_ntt(a) small_ntt_asm_769(a, zetas_asm_769) +#define small_invntt_tomont(a) small_invntt_asm_769(a, zetas_inv_asm_769) +#define small_point_mul(out, in) small_pointmul_asm_769(out, in, zetas_769) +#define small_asymmetric_mul(c, a, b, b_prime) small_asymmetric_mul_asm_769(c, a, b, b_prime); #endif diff --git a/crypto_sign/dilithium3/m4f/smallntt_769.S b/crypto_sign/dilithium3/m4f/smallntt_769.S new file mode 100644 index 00000000..97c60f03 --- /dev/null +++ b/crypto_sign/dilithium3/m4f/smallntt_769.S @@ -0,0 +1,681 @@ +/** + * Copyright (c) 2023 Junhao Huang (jhhuang_nuaa@126.com) + * + * Licensed under the Apache License, Version 2.0(the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "macros.i" + +.syntax unified +.cpu cortex-m4 +.thumb + +#include "macros_smallntt.i" +// ####### +// ####### +// # NTT # +// ####### +// ####### + +.macro _3_layer_double_CT_16_plant c0, c1, c2, c3, c4, c5, c6, c7, twiddle1, twiddle2, twiddle_ptr, q, qa, tmp + // layer 3 + ldr.w \twiddle1, [\twiddle_ptr], #4 + two_doublebutterfly_plant \c0, \c4, \c1, \c5, \twiddle1, \twiddle1, \tmp, \q, \qa + two_doublebutterfly_plant \c2, \c6, \c3, \c7, \twiddle1, \twiddle1, \tmp, \q, \qa + + // layer 2 + ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 + two_doublebutterfly_plant \c0, \c2, \c1, \c3, \twiddle1, \twiddle1, \tmp, \q, \qa + + two_doublebutterfly_plant \c4, \c6, \c5, \c7, \twiddle2, \twiddle2, \tmp, \q, \qa + + // layer 1 + ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 + two_doublebutterfly_plant \c0, \c1, \c2, \c3, \twiddle1, \twiddle2, \tmp, \q, \qa + + ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 + two_doublebutterfly_plant \c4, \c5, \c6, \c7, \twiddle1, \twiddle2, \tmp, \q, \qa +.endm + +.macro _3_layer_double_CT_16_plant_fp c0, c1, c2, c3, c4, c5, c6, c7, xi0, xi1, xi2, xi3, xi4, xi5, xi6, twiddle1, twiddle2, q, qa, tmp + // layer 3 + vmov \twiddle1, \xi0 + two_doublebutterfly_plant \c0, \c4, \c1, \c5, \twiddle1, \twiddle1, \tmp, \q, \qa + two_doublebutterfly_plant \c2, \c6, \c3, \c7, \twiddle1, \twiddle1, \tmp, \q, \qa + + // layer 2 + vmov \twiddle1, \xi1 + vmov \twiddle2, \xi2 + two_doublebutterfly_plant \c0, \c2, \c1, \c3, \twiddle1, \twiddle1, \tmp, \q, \qa + + two_doublebutterfly_plant \c4, \c6, \c5, \c7, \twiddle2, \twiddle2, \tmp, \q, \qa + + // layer 1 + vmov \twiddle1, \xi3 + vmov \twiddle2, \xi4 + two_doublebutterfly_plant \c0, \c1, \c2, \c3, \twiddle1, \twiddle2, \tmp, \q, \qa + + vmov \twiddle1, \xi5 + vmov \twiddle2, \xi6 + two_doublebutterfly_plant \c4, \c5, \c6, \c7, \twiddle1, \twiddle2, \tmp, \q, \qa +.endm + +.global small_ntt_asm_769 +.type small_ntt_asm_769, %function +.align 2 +small_ntt_asm_769: + push {r4-r11, r14} + vpush.w {s16-s24} + poly .req r0 + twiddle_ptr .req r1 + poly0 .req r2 + poly1 .req r3 + poly2 .req r4 + poly3 .req r5 + poly4 .req r6 + poly5 .req r7 + 
poly6 .req r8 + poly7 .req r9 + twiddle1 .req r10 + twiddle2 .req r11 + ### qinv .req r11 ### q^-1 mod 2^2n; n=16 + q .req r12 + ### at the top of r12 + qa .req r0 + ### qa=2^a q;a=3; at the bottom of r12 + tmp .req r14 + + // movw qa, #24608 + // Why movt? Because we initially placed qa at the bottom of the same register as q; + movt q, #769 + + ### LAYER 7+6+5+4 + .equ distance, 256 + .equ offset, 32 + .equ strincr, 4 + // pre-load 15 twiddle factors to 15 FPU registers + // s0-s7 used to temporary store 16 16-bit polys. + vldm twiddle_ptr!, {s8-s22} + + add tmp, poly, #strincr*8 + // s23: poly addr + // s24: tmp + vmov s24, tmp + 1: + // load a1, a3, ..., a15 + vmov s23, poly + load poly, poly0, poly1, poly2, poly3, #offset, #distance/4+offset, #2*distance/4+offset, #3*distance/4+offset + load poly, poly4, poly5, poly6, poly7, #distance+offset, #5*distance/4+offset, #6*distance/4+offset, #7*distance/4+offset + + movw qa, #24608 + + // 8-NTT on a1, a3, ..., a15 + _3_layer_double_CT_16_plant_fp poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s8, s9, s10, s11, s12, s13, s14, twiddle1, twiddle2, q, qa, tmp + + // s15, s16, s17, s18, s19, s20, s21, s22 left + // multiply coeffs by layer 8 twiddles for later use + vmov twiddle1, s15 + vmov twiddle2, s16 + mul_twiddle_plant poly0, twiddle1, tmp, q, qa + mul_twiddle_plant poly1, twiddle2, tmp, q, qa + + vmov twiddle1, s17 + vmov twiddle2, s18 + mul_twiddle_plant poly2, twiddle1, tmp, q, qa + mul_twiddle_plant poly3, twiddle2, tmp, q, qa + + vmov twiddle1, s19 + vmov twiddle2, s20 + mul_twiddle_plant poly4, twiddle1, tmp, q, qa + mul_twiddle_plant poly5, twiddle2, tmp, q, qa + + vmov twiddle1, s21 + vmov twiddle2, s22 + mul_twiddle_plant poly6, twiddle1, tmp, q, qa + mul_twiddle_plant poly7, twiddle2, tmp, q, qa + + vmov s0, poly0 // a1 + vmov s1, poly1 // a3 + vmov s2, poly2 // a5 + vmov s3, poly3 // a7 + vmov s4, poly4 // a9 + vmov s5, poly5 // a11 + vmov s6, poly6 // a13 + vmov s7, poly7 // a15 + + vmov poly, s23 + + // load a0, a2, ..., a14 + load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4 + load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 + + movw qa, #24608 + // 8-NTT on a0, a2, ..., a14 + _3_layer_double_CT_16_plant_fp poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s8, s9, s10, s11, s12, s13, s14, twiddle1, twiddle2, q, qa, tmp + + + // layer 4 - 1 + // addsub: (a2, a6, a10, a14), (a3, a7, a11, a15) + vmov poly, s23 + vmov twiddle1, s1 // load a3 + uadd16 tmp, poly1, twiddle1 + usub16 poly1, poly1, twiddle1 + str.w tmp, [poly, #1*distance/4] + str.w poly1, [poly, #1*distance/4+offset] + + vmov twiddle1, s3 // load a7 + uadd16 tmp, poly3, twiddle1 + usub16 poly3, poly3, twiddle1 + str.w tmp, [poly, #3*distance/4] + str.w poly3, [poly, #3*distance/4+offset] + + vmov twiddle1, s5 // load a11 + uadd16 tmp, poly5, twiddle1 + usub16 poly5, poly5, twiddle1 + str.w tmp, [poly, #5*distance/4] + str.w poly5, [poly, #5*distance/4+offset] + + vmov twiddle1, s7 // load a15 + uadd16 tmp, poly7, twiddle1 + usub16 poly7, poly7, twiddle1 + str.w tmp, [poly, #7*distance/4] + str.w poly7, [poly, #7*distance/4+offset] + + // layer 4 - 2 + // addsub: (a0, a4, a8, a12), (a1, a5, a9, a13) + vmov poly3, s2 // load a5 + uadd16 tmp, poly2, poly3 + usub16 twiddle1, poly2, poly3 + str.w tmp, [poly, #2*distance/4] + str.w twiddle1, [poly, #2*distance/4+offset] + + vmov poly5, s4 // load a9 + uadd16 tmp, poly4, poly5 + usub16 twiddle1, poly4, poly5 + str.w tmp, [poly, 
#4*distance/4] + str.w twiddle1, [poly, #4*distance/4+offset] + + vmov poly7, s6 // load a13 + uadd16 tmp, poly6, poly7 + usub16 twiddle1, poly6, poly7 + str.w tmp, [poly, #6*distance/4] + str.w twiddle1, [poly, #6*distance/4+offset] + + vmov poly1, s0 // load a1 + uadd16 tmp, poly0, poly1 + usub16 twiddle1, poly0, poly1 + str.w twiddle1, [poly, #offset] + str.w tmp, [poly], #4 + + vmov tmp, s24 + cmp.w poly, tmp + bne.w 1b + + sub.w poly, #8*strincr + + ### LAYER 3+2+1 + + .equ distance, distance/16 + .equ strincr, 32 + + add.w tmp, poly, #strincr*16 + vmov s13, tmp + 2: + vmov s23, poly + load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4 + load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 + + movw qa, #24608 + _3_layer_double_CT_16_plant poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle1, twiddle2, twiddle_ptr, q, qa, tmp + + vmov poly, s23 + store poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 + str.w poly1, [poly, #distance/4] + str.w poly2, [poly, #2*distance/4] + str.w poly3, [poly, #3*distance/4] + str.w poly0, [poly], #strincr + + vmov tmp, s13 + cmp.w poly, tmp + bne.w 2b + vpop.w {s16-s24} + pop {r4-r11, pc} + +.unreq poly +.unreq twiddle_ptr +.unreq poly0 +.unreq poly1 +.unreq poly2 +.unreq poly3 +.unreq poly4 +.unreq poly5 +.unreq poly6 +.unreq poly7 +.unreq twiddle1 +.unreq twiddle2 +.unreq q +.unreq qa +.unreq tmp + + +// ######## +// ######## +// # INTT # +// ######## +// ######## + +// input: 0.5/1q +.macro _3_layer_double_inv_CT_16_plant_light c0, c1, c2, c3, c4, c5, c6, c7, xi2, xi4, xi5, xi6, twiddle1, tmp2, q, qa, tmp + + // layer 1 + sadd16.w \tmp, \c0, \c1 // c0, c1 + ssub16.w \c1, \c0, \c1 + sadd16.w \tmp2, \c2, \c3 // c2, c3 + ssub16.w \c3, \c2, \c3 + // tmp, c1, tmp2, c3: 1q maximum + sadd16.w \c0, \c4, \c5 // c4, c5 + ssub16.w \c5, \c4, \c5 + sadd16.w \c2, \c6, \c7 // c6, c7 + ssub16.w \c7, \c6, \c7 + // c4, c6 are free at this point + // c0,c5,c2,c7 1q maximum + + // layer 2 + sadd16.w \c6, \tmp, \tmp2 // c0, c2 + ssub16.w \tmp2, \tmp, \tmp2 + sadd16.w \c4, \c0, \c2 // c4, c6 + ssub16.w \c2, \c0, \c2 + // c6, tmp2, c4, c2: 2q maximum + + vmov.w \twiddle1, \xi2 + doublebutterfly_plant \c1, \c3, \twiddle1, \tmp, \q, \qa + doublebutterfly_plant \c5, \c7, \twiddle1, \tmp, \q, \qa + // c1, c3, c7, c5: 1.5q maximum; + + // tmp and c0 are free at this point + // layer 3 + sadd16.w \c0, \c6, \c4 // c0, c4 + ssub16.w \c4, \c6, \c4 + // c0, c4: 4q + // c6 are free at this point + vmov.w \twiddle1, \xi4 + doublebutterfly_plant \c1, \c5, \twiddle1, \tmp, \q, \qa + // c1, c5: 2q maximum + + vmov.w \twiddle1, \xi5 + // this block is one doublebutterfly + smulwb \tmp, \twiddle1, \c2 // c2, c6 + smulwt \c2, \twiddle1, \c2 + smlabt \tmp, \tmp, \q, \qa + smlabt \c2, \c2, \q, \qa + pkhtb \tmp, \c2, \tmp, asr#16 + ssub16.w \c6, \tmp2, \tmp + sadd16.w \c2, \tmp2, \tmp + //c6, c2: 4.5q + vmov.w \twiddle1, \xi6 + doublebutterfly_plant \c3, \c7, \twiddle1, \tmp, \q, \qa + //c3, c7: 2.5q maximum +.endm +.macro _3_layer_double_inv_CT_16_plant c0, c1, c2, c3, c4, c5, c6, c7, twiddle1, twiddle2, twiddle_ptr, q, qa, tmp + // layer 3 + ldr.w \twiddle1, [\twiddle_ptr], #4 + two_doublebutterfly_plant \c0, \c1, \c2, \c3, \twiddle1, \twiddle1, \tmp, \q, \qa + two_doublebutterfly_plant \c4, \c5, \c6, \c7, \twiddle1, \twiddle1, \tmp, \q, \qa + + // layer 2 + ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 + two_doublebutterfly_plant \c0, \c2, \c1, \c3, \twiddle1, 
\twiddle2, \tmp, \q, \qa + + two_doublebutterfly_plant \c4, \c6, \c5, \c7, \twiddle1, \twiddle2, \tmp, \q, \qa + + // layer 1 + ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 + two_doublebutterfly_plant \c0, \c4, \c1, \c5, \twiddle1, \twiddle2, \tmp, \q, \qa + + ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 + two_doublebutterfly_plant \c2, \c6, \c3, \c7, \twiddle1, \twiddle2, \tmp, \q, \qa +.endm + +.macro _3_layer_double_inv_twist_16_plant c0, c1, c2, c3, c4, c5, c6, c7, twiddle1, twiddle2, twiddle_ptr, q, qa, tmp + ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 + mul_twiddle_plant \c0, \twiddle1, \tmp, \q, \qa + mul_twiddle_plant \c1, \twiddle2, \tmp, \q, \qa + ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 + mul_twiddle_plant \c2, \twiddle1, \tmp, \q, \qa + mul_twiddle_plant \c3, \twiddle2, \tmp, \q, \qa + ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 + mul_twiddle_plant \c4, \twiddle1, \tmp, \q, \qa + mul_twiddle_plant \c5, \twiddle2, \tmp, \q, \qa + ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 + mul_twiddle_plant \c6, \twiddle1, \tmp, \q, \qa + mul_twiddle_plant \c7, \twiddle2, \tmp, \q, \qa +.endm +# input coefficients < 0.5q +.global small_invntt_asm_769 +.type small_invntt_asm_769, %function +.align 2 +small_invntt_asm_769: + push {r4-r11, r14} + vpush.w {s16-s23} + poly .req r0 + twiddle_ptr .req r1 + poly0 .req r2 + poly1 .req r3 + poly2 .req r4 + poly3 .req r5 + poly4 .req r6 + poly5 .req r7 + poly6 .req r8 + poly7 .req r9 + twiddle1 .req r10 + twiddle2 .req r11 + q .req r12 + // at the top of r12 + qa .req r0 + // qa=2^a q;a=3; at the bottom of r12 + tmp .req r14 + + movt q, #769 + + ### LAYER 7+6+5+4 + .equ distance, 16 + .equ offset, 32 + .equ strincr, 64 + + // pre-load twiddle factors to FPU registers + vldm twiddle_ptr!, {s8-s22} + + add.w tmp, poly, #8*strincr + vmov s8, tmp + 1: + vmov s23, poly + // load a1, a3, ..., a15 + load poly, poly0, poly1, poly2, poly3, #offset, #distance/4+offset, #2*distance/4+offset, #3*distance/4+offset + load poly, poly4, poly5, poly6, poly7, #distance+offset, #5*distance/4+offset, #6*distance/4+offset, #7*distance/4+offset + + movw qa, #24608 + + // NTT on a1, a3, ..., a15 + // twiddle2 is used as tmp2 + _3_layer_double_inv_CT_16_plant_light poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s10, s12, s13, s14, twiddle1, twiddle2, q, qa, tmp + + // multiply coeffs by layer 4 twiddles for later use + // vmov twiddle1, s15 + vmov twiddle2, s16 + // mul_twiddle_plant poly0, twiddle1, tmp, q, qa // could be omitted but kept for reduction only + mul_twiddle_plant poly1, twiddle2, tmp, q, qa + + vmov twiddle1, s17 + vmov twiddle2, s18 + mul_twiddle_plant poly2, twiddle1, tmp, q, qa + mul_twiddle_plant poly3, twiddle2, tmp, q, qa + + vmov twiddle1, s19 + vmov twiddle2, s20 + mul_twiddle_plant poly4, twiddle1, tmp, q, qa + mul_twiddle_plant poly5, twiddle2, tmp, q, qa + + vmov twiddle1, s21 + vmov twiddle2, s22 + mul_twiddle_plant poly6, twiddle1, tmp, q, qa + mul_twiddle_plant poly7, twiddle2, tmp, q, qa + + vmov s0, poly0 // a1 + vmov s1, poly1 // a3 + vmov s2, poly2 // a5 + vmov s3, poly3 // a7 + vmov s4, poly4 // a9 + vmov s5, poly5 // a11 + vmov s6, poly6 // a13 + vmov s7, poly7 // a15 + // 0.5q + // ---------- + + vmov poly, s23 + // load a0, a2, ..., a14 + load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4 + load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 + + movw qa, #24608 + // NTT on a0, a2, ..., a14 + // twiddle2 is used as tmp2 + 
_3_layer_double_inv_CT_16_plant_light poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s10, s12, s13, s14, twiddle1, twiddle2, q, qa, tmp + // 1,3,5,7: <5q; 0,2,4,6:<1q + // layer 4 - 1 + // addsub: (a2, a6, a10, a14), (a3, a7, a11, a15) + vmov poly, s23 + vmov twiddle2, s1 // load a3 + uadd16 tmp, poly1, twiddle2 + usub16 poly1, poly1, twiddle2 + str.w tmp, [poly, #1*distance/4] + str.w poly1, [poly, #1*distance/4+offset] + + vmov twiddle2, s3 // load a7 + uadd16 tmp, poly3, twiddle2 + usub16 poly3, poly3, twiddle2 + str.w tmp, [poly, #3*distance/4] + str.w poly3, [poly, #3*distance/4+offset] + + vmov twiddle2, s5 // load a11 + uadd16 tmp, poly5, twiddle2 + usub16 poly5, poly5, twiddle2 + str.w tmp, [poly, #5*distance/4] + str.w poly5, [poly, #5*distance/4+offset] + + vmov twiddle2, s7 // load a15 + uadd16 tmp, poly7, twiddle2 + usub16 poly7, poly7, twiddle2 + str.w tmp, [poly, #7*distance/4] + str.w poly7, [poly, #7*distance/4+offset] + //1,3,5,7: < 5.5q + + // layer 4 - 2 + // addsub: (a0, a4, a8, a12), (a1, a5, a9, a13) + vmov poly3, s2 // load a5 + uadd16 tmp, poly2, poly3 + usub16 twiddle2, poly2, poly3 + str.w tmp, [poly, #2*distance/4] + str.w twiddle2, [poly, #2*distance/4+offset] + + vmov poly5, s4 // load a9 + uadd16 tmp, poly4, poly5 + usub16 twiddle2, poly4, poly5 + str.w tmp, [poly, #4*distance/4] + str.w twiddle2, [poly, #4*distance/4+offset] + + vmov poly7, s6 // load a13 + uadd16 tmp, poly6, poly7 + usub16 twiddle2, poly6, poly7 + str.w tmp, [poly, #6*distance/4] + str.w twiddle2, [poly, #6*distance/4+offset] + + vmov poly1, s0 // load a1 + uadd16 tmp, poly0, poly1 + usub16 twiddle2, poly0, poly1 + str.w twiddle2, [poly, #offset] + str.w tmp, [poly], #strincr // increase 2*8*4 = 64 (2 * 8 loads of 4 bytes each) + //0,2,4,6: < 1.5q + vmov tmp, s8 + cmp.w poly, tmp + bne.w 1b + + sub.w poly, #8*strincr + + ### LAYER 3+2+1 + + .equ distance, distance*16 + .equ strincr, 4 + + // ITER 0 + vmov s6, poly + load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4 + load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 + + vldm twiddle_ptr!, {s0-s5} + movw qa, #24608 + // twiddle2 is used as tmp2 + _3_layer_double_inv_CT_16_plant_light poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s1, s3, s4, s5, twiddle1, twiddle2, q, qa, tmp + + // twisting + _3_layer_double_inv_twist_16_plant poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle1, twiddle2, twiddle_ptr, q, qa, tmp + + vmov poly, s6 + store poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 + str.w poly1, [poly, #distance/4] + str.w poly2, [poly, #2*distance/4] + str.w poly3, [poly, #3*distance/4] + str.w poly0, [poly], #4 + + // ITER 1-15 + add.w tmp, poly, #strincr*3*(5) + vmov s14, tmp + 2: + vmov s6, poly + // polys upto 5.5q + load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4 + load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 + + movw qa, #24608 + _3_layer_double_inv_CT_16_plant poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle1, twiddle2, twiddle_ptr, q, qa, tmp + + // twisting + _3_layer_double_inv_twist_16_plant poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle1, twiddle2, twiddle_ptr, q, qa, tmp + + vmov poly, s6 + store poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 + str.w poly1, [poly, #distance/4] + str.w poly2, [poly, #2*distance/4] + 
str.w poly3, [poly, #3*distance/4] + str.w poly0, [poly], #4 + + vmov tmp, s14 + cmp.w poly, tmp + bne.w 2b + + vpop.w {s16-s23} + pop {r4-r11, pc} + +.unreq poly +.unreq twiddle_ptr +.unreq poly0 +.unreq poly1 +.unreq poly2 +.unreq poly3 +.unreq poly4 +.unreq poly5 +.unreq poly6 +.unreq poly7 +.unreq twiddle1 +.unreq twiddle2 +.unreq q +.unreq qa +.unreq tmp + + +################################### +#### small point-multiplication#### +#### r0: out; r1: in; r2: zetas#### +################################### +.align 2 +.global small_pointmul_asm_769 +.type small_pointmul_asm_769, %function +small_pointmul_asm_769: + push.w {r4-r11, lr} + + movw r14, #24608 // qa + movt r12, #769 // q + .equ width, 4 + + + add.w r3, r2, #64*width + _point_mul_16_loop: + + ldr.w r7, [r1, #2*width] + ldr.w r8, [r1, #3*width] + ldr.w r9, [r2, #1*width] + ldr.w r5, [r1, #1*width] + ldr.w r4, [r1], #4*width + ldr.w r6, [r2], #2*width + + smulwt r10, r6, r4 + smlabt r10, r10, r12, r14 + pkhbt r4, r4, r10 + + neg.w r6, r6 + + smulwt r10, r6, r5 + smlabt r10, r10, r12, r14 + pkhbt r5, r5, r10 + + str.w r5, [r0, #1*width] + str.w r4, [r0], #2*width + + smulwt r10, r9, r7 + smlabt r10, r10, r12, r14 + pkhbt r7, r7, r10 + + neg.w r9, r9 + + smulwt r10, r9, r8 + smlabt r10, r10, r12, r14 + pkhbt r8, r8, r10 + + str.w r8, [r0, #1*width] + str.w r7, [r0], #2*width + + cmp.w r2, r3 + bne.w _point_mul_16_loop + + pop.w {r4-r11, pc} + + +#### r0: out; r1: a; r2: b; r3: bprime + .align 2 +.global small_asymmetric_mul_asm_769 +.type small_asymmetric_mul_asm_769, %function +small_asymmetric_mul_asm_769: + push.w {r4-r11, lr} + + movw r14, #24608 // qa + movt r12, #769 // q + movw r11, #64769 + movt r11, #58632 // qinv + .equ width, 4 + add.w r10, r0, #256*2 + _asymmetric_mul_16_loop: + ldr.w r7, [r1, #width] + ldr.w r4, [r1], #2*width + ldr.w r8, [r2, #width] + ldr.w r5, [r2], #2*width + ldr.w r9, [r3, #width] + ldr.w r6, [r3], #2*width + + smuad r6, r4, r6 + plant_red r12, r14, r11, r6 + smuadx r5, r4, r5 + plant_red r12, r14, r11, r5 + + pkhtb r5, r5, r6, asr#16 + str.w r5, [r0], #width + + smuad r6, r7, r9 + plant_red r12, r14, r11, r6 + smuadx r8, r7, r8 + plant_red r12, r14, r11, r8 + + pkhtb r8, r8, r6, asr#16 + str.w r8, [r0], #width + + cmp.w r0, r10 + bne.w _asymmetric_mul_16_loop + + pop.w {r4-r11, pc} \ No newline at end of file
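
Reviewer note: the patch replaces the Montgomery-based smallntt.S with Plantard arithmetic throughout. Below is a minimal C model, for cross-checking only and not part of the patch, of one 16-bit lane of the mul_twiddle_plant / doublebutterfly_plant sequence (smulwb, smlabt, pkhtb) and of the plant_red macro used by small_asymmetric_mul_asm_769. The names plant_mul/plant_red and the tested input ranges are my own; QINV = 0xE508FD01 matches the movw #64769 / movt #58632 pair loaded into r11 above, and QA = 24608 = 769*2^5 matches the movw qa, #24608 immediates (i.e. alpha = 5 for q = 769).

#include <stdint.h>
#include <stdio.h>

#define Q     769
#define ALPHA 5                 /* QA = Q << ALPHA = 24608, the immediate in the .S file */
#define QA    (Q << ALPHA)
#define QINV  0xE508FD01u       /* Q^-1 mod 2^32, as loaded into r11 by the asm          */

/* One lane of mul_twiddle_plant: bp is a precomputed twiddle b*QINV mod 2^32
 * (the values stored in zetas_asm_769); the result is congruent to
 * -a*b*2^(-32) mod Q, so the tables bake a factor -2^32 mod Q into b. */
static int16_t plant_mul(int16_t a, uint32_t bp)
{
    uint32_t t  = bp * (uint32_t)a;     /* a*b*Q^-1 mod 2^32                       */
    int16_t  hi = (int16_t)(t >> 16);   /* smlabt only uses the signed bits [31:16] */
    int32_t  r  = (int32_t)hi * Q + QA; /* smlabt: hi*Q + 2^ALPHA*Q                 */
    return (int16_t)(r >> 16);          /* result sits in the top half (pkhtb)      */
}

/* plant_red from macros_smallntt.i: reduces a 32-bit value v (e.g. an smuad
 * sum of two products) to a residue congruent to -v*2^(-32) mod Q. */
static int16_t plant_red(int32_t v)
{
    uint32_t t  = (uint32_t)v * QINV;   /* mul: v*Q^-1 mod 2^32      */
    int16_t  hi = (int16_t)(t >> 16);   /* smlatt reads the top half */
    return (int16_t)(((int32_t)hi * Q + QA) >> 16);
}

int main(void)
{
    /* r == -v*2^(-32) mod Q is equivalent to (r << 32) + v == 0 mod Q. */
    for (int b = -Q / 2; b <= Q / 2; b++) {
        uint32_t bp = (uint32_t)b * QINV;            /* precomputed per twiddle */
        for (int a = -32768; a < 32768; a += 251) {
            int16_t r = plant_mul((int16_t)a, bp);
            if ((((int64_t)r << 32) + (int64_t)a * b) % Q != 0) {
                printf("plant_mul FAIL a=%d b=%d\n", a, b);
                return 1;
            }
        }
    }
    /* test range chosen to cover the coefficient bounds annotated in the INTT */
    for (int64_t v = -(1 << 25); v <= (1 << 25); v += 4099) {
        int16_t r = plant_red((int32_t)v);
        if ((((int64_t)r << 32) + v) % Q != 0) {
            printf("plant_red FAIL v=%lld\n", (long long)v);
            return 1;
        }
    }
    puts("Plantard congruence checks passed");
    return 0;
}

For example, plant_mul(1, 1u * QINV) returns -81, and indeed 81 * 2^32 == 1 (mod 769). Note also that plant_mul(a, bp) and plant_red((int32_t)((uint32_t)a * (uint32_t)b)) compute the same value, since a*bp == (a*b)*QINV mod 2^32; this is why the butterflies and the asymmetric multiplication can share one reduction pattern.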