diff --git a/mk/config.mk b/mk/config.mk
index 407049cb6..841e74ea8 100644
--- a/mk/config.mk
+++ b/mk/config.mk
@@ -69,7 +69,8 @@ endif
 RNG ?=
 BENCH :=
 CYCLES ?=
-RETAINED_VARS := RNG BENCH CYCLES
+OPT ?= 1
+RETAINED_VARS := RNG BENCH CYCLES OPT
 
 BUILD_DIR := test/build
 LIB_DIR := $(BUILD_DIR)/lib
diff --git a/mk/schemes.mk b/mk/schemes.mk
index dd4d25321..68fabd772 100644
--- a/mk/schemes.mk
+++ b/mk/schemes.mk
@@ -1,5 +1,9 @@
 # SPDX-License-Identifier: Apache-2.0
 SOURCES = $(wildcard mlkem/*.c)
+ifeq ($(OPT),1)
+        SOURCES += $(wildcard mlkem/asm/aarch64/*.S)
+        CPPFLAGS += -DMLKEM_USE_ASM
+endif
 CPPFLAGS += -Imlkem -Imlkem/sys
 
 TESTS = test_kyber bench_kyber gen_NISTKAT gen_KAT
diff --git a/mlkem/asm/aarch64/common.i b/mlkem/asm/aarch64/common.i
new file mode 100644
index 000000000..55e0ed803
--- /dev/null
+++ b/mlkem/asm/aarch64/common.i
@@ -0,0 +1,12 @@
+// SPDX-License-Identifier: MIT
+
+// ASM_LOAD(dst, symbol): materialise the address of symbol in dst with an adrp/add pair.
+#if __APPLE__
+#define ASM_LOAD(dst, symbol) \
+        adrp dst, symbol @PAGE %% add dst, dst, symbol @PAGEOFF
+#else
+#define ASM_LOAD(dst, symbol) \
+        adrp dst, symbol;     \
+        add dst, dst, : lo12 : symbol;
+
+#endif
diff --git a/mlkem/asm/aarch64/intt_123_4567.S b/mlkem/asm/aarch64/intt_123_4567.S
new file mode 100644
index 000000000..25638533e
--- /dev/null
+++ b/mlkem/asm/aarch64/intt_123_4567.S
@@ -0,0 +1,361 @@
+///
+/// Copyright (c) 2022 Arm Limited
+/// Copyright (c) 2022 Hanno Becker
+/// Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer
+/// SPDX-License-Identifier: MIT
+///
+/// Permission is hereby granted, free of charge, to any person obtaining a copy
+/// of this software and associated documentation files (the "Software"), to deal
+/// in the Software without restriction, including without limitation the rights
+/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+/// copies of the Software, and to permit persons to whom the Software is
+/// furnished to do so, subject to the following conditions:
+///
+/// The above copyright
notice and this permission notice shall be included in all
+/// copies or substantial portions of the Software.
+///
+/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+/// SOFTWARE.
+///
+
+#include "config.h"
+#if defined(MLKEM_USE_AARCH64_ASM)
+
+// Needed to provide ASM_LOAD directive
+#include "common.i"
+// mulmodq: dst = src * const.h[idx0] - sqrdmulh(src, const.h[idx1]) * consts.h[0]. Clobbers t2.
+.macro mulmodq dst, src, const, idx0, idx1
+        sqrdmulh t2.8h, \src\().8h, \const\().h[\idx1\()]
+        mul \dst\().8h, \src\().8h, \const\().h[\idx0\()]
+        mls \dst\().8h, t2.8h, consts.h[0]
+.endm
+// mulmod: same computation as mulmodq with full 8h vectors for const and const_twisted. Clobbers t2.
+.macro mulmod dst, src, const, const_twisted
+        sqrdmulh t2.8h, \src\().8h, \const_twisted\().8h
+        mul \dst\().8h, \src\().8h, \const\().8h
+        mls \dst\().8h, t2.8h, consts.h[0]
+.endm
+// gs_butterfly: (a, b) <- (a + b, mulmodq(a - b, root.h[idx0], root.h[idx1])). Clobbers tmp, t2.
+.macro gs_butterfly a, b, root, idx0, idx1
+        sub tmp.8h, \a\().8h, \b\().8h
+        add \a\().8h, \a\().8h, \b\().8h
+        mulmodq \b, tmp, \root, \idx0, \idx1
+.endm
+// gs_butterfly_v: gs_butterfly with per-lane roots held in root / root_twisted. Clobbers tmp, t2.
+.macro gs_butterfly_v a, b, root, root_twisted
+        sub tmp.8h, \a\().8h, \b\().8h
+        add \a\().8h, \a\().8h, \b\().8h
+        mulmod \b, tmp, \root, \root_twisted
+.endm
+// mul_ninv: dstN = mulmod(srcN, ninv, ninv_tw) for N = 0..3. Clobbers t2.
+.macro mul_ninv dst0, dst1, dst2, dst3, src0, src1, src2, src3
+        mulmod \dst0, \src0, ninv, ninv_tw
+        mulmod \dst1, \src1, ninv, ninv_tw
+        mulmod \dst2, \src2, ninv, ninv_tw
+        mulmod \dst3, \src3, ninv, ninv_tw
+.endm
+// barrett_reduce: a <- a - srshr(sqdmulh(a, consts.h[1]), 11) * consts.h[0]. Clobbers t0.
+.macro barrett_reduce a
+        sqdmulh t0.8h, \a\().8h, consts.h[1]
+        srshr t0.8h, t0.8h, #11
+        mls \a\().8h, t0.8h, consts.h[0]
+.endm
+// load_roots_123: load q_root0 and q_root1 from r_ptr0 and advance r_ptr0 by 32 bytes.
+.macro load_roots_123
+        ldr q_root0, [r_ptr0], #32
+        ldr q_root1, [r_ptr0, #-16]
+.endm
+// load_next_roots_45: load q_root0 from r_ptr0 and advance r_ptr0 by 16 bytes.
+.macro load_next_roots_45
+        ldr q_root0, [r_ptr0], #16
+.endm
+// load_next_roots_67: load root0..root2 and their _tw partners from r_ptr1, advancing it by 96 bytes.
+.macro load_next_roots_67
+        ldr q_root0, [r_ptr1], #(6*16)
+        ldr q_root0_tw, [r_ptr1, #(-6*16 + 1*16)]
+        ldr q_root1, [r_ptr1, #(-6*16 + 2*16)]
+        ldr q_root1_tw, [r_ptr1, #(-6*16 + 3*16)]
+        ldr q_root2, [r_ptr1, #(-6*16 + 4*16)]
+        ldr q_root2_tw, [r_ptr1, #(-6*16 + 5*16)]
+.endm
+// transpose4: 4x4 transpose of the 32-bit lanes of <data>0..<data>3 (argument is the name prefix). Clobbers t0-t3.
+.macro transpose4 data
+        trn1 t0.4s, \data\()0.4s, \data\()1.4s
+        trn2 t1.4s, \data\()0.4s, \data\()1.4s
+        trn1 t2.4s, \data\()2.4s, \data\()3.4s
+        trn2 t3.4s, \data\()2.4s, \data\()3.4s
+
+        trn2 \data\()2.2d, t0.2d, t2.2d
+        trn2 \data\()3.2d, t1.2d, t3.2d
+        trn1 \data\()0.2d, t0.2d, t2.2d
+        trn1 \data\()1.2d, t1.2d, t3.2d
+.endm
+// transpose_single: only the .4s interleave stage of transpose4, written to <data_out>0..3.
+.macro transpose_single data_out, data_in
+        trn1 \data_out\()0.4s, \data_in\()0.4s, \data_in\()1.4s
+        trn2 \data_out\()1.4s, \data_in\()0.4s, \data_in\()1.4s
+        trn1 \data_out\()2.4s, \data_in\()2.4s, \data_in\()3.4s
+        trn2 \data_out\()3.4s, \data_in\()2.4s, \data_in\()3.4s
+.endm
+// save_gprs: spill x19-x29 into a fresh 96-byte stack frame.
+.macro save_gprs
+        // NOTE(review): only x0-x5 and sp are written in this file, so this spill looks removable - confirm.
+        sub sp, sp, #(16*6)
+        stp x19, x20, [sp, #16*0]
+        stp x21, x22, [sp, #16*1]
+        stp x23, x24, [sp, #16*2]
+        stp x25, x26, [sp, #16*3]
+        stp x27, x28, [sp, #16*4]
+        str x29, [sp, #16*5]
+.endm
+// restore_gprs: reload x19-x29 and release the frame made by save_gprs.
+.macro restore_gprs
+        ldp x19, x20, [sp, #16*0]
+        ldp x21, x22, [sp, #16*1]
+        ldp x23, x24, [sp, #16*2]
+        ldp x25, x26, [sp, #16*3]
+        ldp x27, x28, [sp, #16*4]
+        ldr x29, [sp, #16*5]
+        add sp, sp, #(16*6)
+.endm
+// save_vregs: spill d8-d15 (the low halves of v8-v15, used below as data0-data7) into a 64-byte frame.
+.macro save_vregs
+        sub sp, sp, #(16*4)
+        stp d8, d9, [sp, #16*0]
+        stp d10, d11, [sp, #16*1]
+        stp d12, d13, [sp, #16*2]
+        stp d14, d15, [sp, #16*3]
+.endm
+// restore_vregs: reload d8-d15 and release the frame made by save_vregs.
+.macro restore_vregs
+        ldp d8, d9, [sp, #16*0]
+        ldp d10, d11, [sp, #16*1]
+        ldp d12, d13, [sp, #16*2]
+        ldp d14, d15, [sp, #16*3]
+        add sp, sp, #(16*4)
+.endm
+// push_stack / pop_stack: function prologue and epilogue; pop order mirrors push order.
+.macro push_stack
+        save_gprs
+        save_vregs
+.endm
+
+.macro pop_stack
+        restore_vregs
+        restore_gprs
+.endm
+
+// For comparability reasons, the output range for the coefficients of this
+// invNTT code is supposed to match the implementation from PQClean on commit
+// ee71d2c823982bfcf54686f3cf1d666f396dc9aa. After the invNTT, the coefficients
+// are NOT canonically reduced. The ordering of the coefficients is canonical,
+// also matching PQClean.
+
+.data
+.p2align 4
+roots:
+#include "intt_kyber_123_45_67_twiddles.S"
+.text
+
+        .global intt_kyber_123_4567
+        .global _intt_kyber_123_4567
+
+.p2align 4
+const_addr:     .short 3329
+                .short 20159
+                .short 0
+                .short 0
+                .short 0
+                .short 0
+                .short 0
+                .short 0
+ninv_addr:      .short 512
+                .short 512
+                .short 512
+                .short 512
+                .short 512
+                .short 512
+                .short 512
+                .short 512
+ninv_tw_addr:   .short 5040
+                .short 5040
+                .short 5040
+                .short 5040
+                .short 5040
+                .short 5040
+                .short 5040
+                .short 5040
+// void intt_kyber_123_4567(int16_t *): in-place invNTT over the 256 coefficients (512 bytes) at x0.
+intt_kyber_123_4567:
+_intt_kyber_123_4567:
+        push_stack
+
+        in      .req x0
+        inp     .req x1
+        count   .req x2
+        r_ptr0  .req x3
+        r_ptr1  .req x4
+        xtmp    .req x5
+
+        data0  .req v8
+        data1  .req v9
+        data2  .req v10
+        data3  .req v11
+        data4  .req v12
+        data5  .req v13
+        data6  .req v14
+        data7  .req v15
+
+        q_data0  .req q8
+        q_data1  .req q9
+        q_data2  .req q10
+        q_data3  .req q11
+        q_data4  .req q12
+        q_data5  .req q13
+        q_data6  .req q14
+        q_data7  .req q15
+
+        root0    .req v0
+        root1    .req v1
+        root2    .req v2
+        root0_tw .req v4
+        root1_tw .req v5
+        root2_tw .req v6
+
+        consts   .req v7
+        q_consts .req q7
+
+        q_root0    .req q0
+        q_root1    .req q1
+        q_root2    .req q2
+        q_root0_tw .req q4
+        q_root1_tw .req q5
+        q_root2_tw .req q6
+
+        tmp .req v24
+        t0  .req v25
+        t1  .req v26
+        t2  .req v27
+        t3  .req v28
+
+        ASM_LOAD(r_ptr0, roots_l34)
+        ASM_LOAD(r_ptr1, roots_l56)
+
+        ASM_LOAD(xtmp, const_addr)
+        ld1 {consts.8h}, [xtmp]
+
+        mov inp, in
+        mov count, #8           // 8 iterations x 64 bytes cover the whole 512-byte buffer
+
+        .p2align 2
+layer4567_start:
+        ldr q_data0, [inp, #(16*0)]
+        ldr q_data1, [inp, #(16*1)]
+        ldr q_data2, [inp, #(16*2)]
+        ldr q_data3, [inp, #(16*3)]
+
+        transpose4 data // manual ld4
+
+        load_next_roots_67
+
+        // Layer 7
+        gs_butterfly_v data0, data1, root1, root1_tw
+        gs_butterfly_v data2, data3, root2, root2_tw
+        // Layer 6
+        gs_butterfly_v data0, data2, root0, root0_tw
+        gs_butterfly_v data1, data3, root0, root0_tw
+
+        transpose4 data
+
+        load_next_roots_45
+
+        // Layer 5
+        gs_butterfly data0, data1, root0, 2, 3
+        gs_butterfly data2, data3, root0, 4, 5
+
+        barrett_reduce data0
+        barrett_reduce data2
+        barrett_reduce data1
+        barrett_reduce data3
+
+        // Layer 4
+        gs_butterfly data0, data2, root0, 0, 1
+        gs_butterfly data1, data3, root0, 0, 1
+
+        str q_data0, [inp], #(64)
+        str q_data1, [inp, #(-64 + 16*1)]
+        str q_data2, [inp, #(-64 + 16*2)]
+        str q_data3, [inp, #(-64 + 16*3)]
+
+        subs count, count, #1
+        cbnz count, layer4567_start
+
+        // ---------------------------------------------------------------------
+
+        ninv    .req v29
+        ninv_tw .req v30
+
+        ASM_LOAD(xtmp, ninv_addr)
+        ld1r {ninv.8h}, [xtmp]
+        ASM_LOAD(xtmp, ninv_tw_addr)
+        ld1r {ninv_tw.8h}, [xtmp]
+
+        mov count, #4           // 4 iterations x 16 bytes cover one 64-byte stride
+        ASM_LOAD(r_ptr0, roots_l012)
+        load_roots_123
+
+        .p2align 2
+
+layer123_start:
+
+        ldr q_data0, [in, #0]
+        ldr q_data1, [in, #(1*(512/8))]
+        ldr q_data2, [in, #(2*(512/8))]
+        ldr q_data3, [in, #(3*(512/8))]
+        ldr q_data4, [in, #(4*(512/8))]
+        ldr q_data5, [in, #(5*(512/8))]
+        ldr q_data6, [in, #(6*(512/8))]
+        ldr q_data7, [in, #(7*(512/8))]
+
+        gs_butterfly data0, data1, root0, 6, 7
+        gs_butterfly data2, data3, root1, 0, 1
+        gs_butterfly data4, data5, root1, 2, 3
+        gs_butterfly data6, data7, root1, 4, 5
+
+        gs_butterfly data0, data2, root0, 2, 3
+        gs_butterfly data1, data3, root0, 2, 3
+        gs_butterfly data4, data6, root0, 4, 5
+        gs_butterfly data5, data7, root0, 4, 5
+
+        gs_butterfly data0, data4, root0, 0, 1
+        gs_butterfly data1, data5, root0, 0, 1
+        gs_butterfly data2, data6, root0, 0, 1
+        gs_butterfly data3, data7, root0, 0, 1
+
+        str q_data4, [in, #(4*(512/8))]
+        str q_data5, [in, #(5*(512/8))]
+        str q_data6, [in, #(6*(512/8))]
+        str q_data7, [in, #(7*(512/8))]
+
+        // Scale half the coeffs by 1/n; for the other half, the scaling has
+        // been merged into the multiplication with the twiddle factor on the
+        // last layer.
+        mul_ninv data0, data1, data2, data3, data0, data1, data2, data3
+
+        str q_data0, [in], #(16)
+        str q_data1, [in, #(-16 + 1*(512/8))]
+        str q_data2, [in, #(-16 + 2*(512/8))]
+        str q_data3, [in, #(-16 + 3*(512/8))]
+
+        subs count, count, #1
+        cbnz count, layer123_start
+
+        pop_stack
+        ret
+
+#endif /* MLKEM_USE_AARCH64_ASM */
diff --git a/mlkem/asm/aarch64/intt_kyber_123_45_67_twiddles.S b/mlkem/asm/aarch64/intt_kyber_123_45_67_twiddles.S
new file mode 100644
index 000000000..01f56568d
--- /dev/null
+++ b/mlkem/asm/aarch64/intt_kyber_123_45_67_twiddles.S
@@ -0,0 +1,498 @@
+///
+/// Copyright (c) 2022 Arm Limited
+/// Copyright (c) 2022 Hanno Becker
+/// Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer
+/// SPDX-License-Identifier: MIT
+///
+/// Permission is hereby granted, free of charge, to any person obtaining a copy
+/// of this software and associated documentation files (the "Software"), to deal
+/// in the Software without restriction, including without limitation the rights
+/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+/// copies of the Software, and to permit persons to whom the Software is
+/// furnished to do so, subject to the following conditions:
+///
+/// The above copyright notice and this permission notice shall be included in all
+/// copies or substantial portions of the Software.
+///
+/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+/// SOFTWARE.
+/// + +#include "config.h" +#if defined(MLKEM_USE_AARCH64_ASM) + +roots_l56: +.short -910 +.short -910 +.short -1227 +.short -1227 +.short 219 +.short 219 +.short 855 +.short 855 +.short -8957 +.short -8957 +.short -12078 +.short -12078 +.short 2156 +.short 2156 +.short 8416 +.short 8416 +.short 1175 +.short 1175 +.short 394 +.short 394 +.short -1029 +.short -1029 +.short -1212 +.short -1212 +.short 11566 +.short 11566 +.short 3878 +.short 3878 +.short -10129 +.short -10129 +.short -11930 +.short -11930 +.short -885 +.short -885 +.short 1219 +.short 1219 +.short 1455 +.short 1455 +.short 1607 +.short 1607 +.short -8711 +.short -8711 +.short 11999 +.short 11999 +.short 14322 +.short 14322 +.short 15818 +.short 15818 +.short -648 +.short -648 +.short -1481 +.short -1481 +.short 712 +.short 712 +.short 682 +.short 682 +.short -6378 +.short -6378 +.short -14578 +.short -14578 +.short 7008 +.short 7008 +.short 6713 +.short 6713 +.short -886 +.short -886 +.short 1179 +.short 1179 +.short -1026 +.short -1026 +.short -1092 +.short -1092 +.short -8721 +.short -8721 +.short 11605 +.short 11605 +.short -10099 +.short -10099 +.short -10749 +.short -10749 +.short 554 +.short 554 +.short -1143 +.short -1143 +.short -403 +.short -403 +.short 525 +.short 525 +.short 5453 +.short 5453 +.short -11251 +.short -11251 +.short -3967 +.short -3967 +.short 5168 +.short 5168 +.short 927 +.short 927 +.short -1534 +.short -1534 +.short 461 +.short 461 +.short -1438 +.short -1438 +.short 9125 +.short 9125 +.short -15099 +.short -15099 +.short 4538 +.short 4538 +.short -14155 +.short -14155 +.short 735 +.short 735 +.short -561 +.short -561 +.short -757 +.short -757 +.short -319 +.short -319 +.short 7235 +.short 7235 +.short -5522 +.short -5522 +.short -7451 +.short -7451 +.short -3140 +.short -3140 +.short 863 +.short 863 +.short 1230 +.short 1230 +.short 556 +.short 556 +.short -1063 +.short -1063 +.short 8495 +.short 8495 +.short 12107 +.short 12107 +.short 5473 +.short 5473 +.short -10463 
+.short -10463 +.short -452 +.short -452 +.short -807 +.short -807 +.short -1435 +.short -1435 +.short 1010 +.short 1010 +.short -4449 +.short -4449 +.short -7943 +.short -7943 +.short -14125 +.short -14125 +.short 9942 +.short 9942 +.short -1645 +.short -1645 +.short 780 +.short 780 +.short 109 +.short 109 +.short 1031 +.short 1031 +.short -16192 +.short -16192 +.short 7678 +.short 7678 +.short 1073 +.short 1073 +.short 10148 +.short 10148 +.short 1239 +.short 1239 +.short -375 +.short -375 +.short 1292 +.short 1292 +.short -1584 +.short -1584 +.short 12196 +.short 12196 +.short -3691 +.short -3691 +.short 12717 +.short 12717 +.short -15592 +.short -15592 +.short 1414 +.short 1414 +.short -1320 +.short -1320 +.short -33 +.short -33 +.short 464 +.short 464 +.short 13918 +.short 13918 +.short -12993 +.short -12993 +.short -325 +.short -325 +.short 4567 +.short 4567 +.short -641 +.short -641 +.short 992 +.short 992 +.short 941 +.short 941 +.short 1021 +.short 1021 +.short -6309 +.short -6309 +.short 9764 +.short 9764 +.short 9262 +.short 9262 +.short 10050 +.short 10050 +.short -268 +.short -268 +.short -733 +.short -733 +.short 892 +.short 892 +.short -939 +.short -939 +.short -2638 +.short -2638 +.short -7215 +.short -7215 +.short 8780 +.short 8780 +.short -9243 +.short -9243 +.short -632 +.short -632 +.short 816 +.short 816 +.short 1352 +.short 1352 +.short -650 +.short -650 +.short -6221 +.short -6221 +.short 8032 +.short 8032 +.short 13308 +.short 13308 +.short -6398 +.short -6398 +.short 642 +.short 642 +.short -952 +.short -952 +.short 1540 +.short 1540 +.short -1651 +.short -1651 +.short 6319 +.short 6319 +.short -9371 +.short -9371 +.short 15159 +.short 15159 +.short -16251 +.short -16251 +.short -1461 +.short -1461 +.short 1482 +.short 1482 +.short 540 +.short 540 +.short 1626 +.short 1626 +.short -14381 +.short -14381 +.short 14588 +.short 14588 +.short 5315 +.short 5315 +.short 16005 +.short 16005 +.short 1274 +.short 1274 +.short 1052 +.short 1052 
+.short 1025 +.short 1025 +.short -1197 +.short -1197 +.short 12540 +.short 12540 +.short 10355 +.short 10355 +.short 10089 +.short 10089 +.short -11782 +.short -11782 +.short 279 +.short 279 +.short 1173 +.short 1173 +.short -233 +.short -233 +.short 667 +.short 667 +.short 2746 +.short 2746 +.short 11546 +.short 11546 +.short -2293 +.short -2293 +.short 6565 +.short 6565 +.short 314 +.short 314 +.short -756 +.short -756 +.short 48 +.short 48 +.short -1409 +.short -1409 +.short 3091 +.short 3091 +.short -7441 +.short -7441 +.short 472 +.short 472 +.short -13869 +.short -13869 +.short 1573 +.short 1573 +.short 76 +.short 76 +.short -331 +.short -331 +.short -289 +.short -289 +.short 15483 +.short 15483 +.short 748 +.short 748 +.short -3258 +.short -3258 +.short -2845 +.short -2845 +.short -1100 +.short -1100 +.short -723 +.short -723 +.short 680 +.short 680 +.short 568 +.short 568 +.short -10828 +.short -10828 +.short -7117 +.short -7117 +.short 6693 +.short 6693 +.short 5591 +.short 5591 +.short 1041 +.short 1041 +.short -1637 +.short -1637 +.short -583 +.short -583 +.short -17 +.short -17 +.short 10247 +.short 10247 +.short -16113 +.short -16113 +.short -5739 +.short -5739 +.short -167 +.short -167 +roots_l34: +.short 1583 +.short 15582 +.short -821 +.short -8081 +.short 1355 +.short 13338 +.short 0 +.short 0 +.short -569 +.short -5601 +.short 450 +.short 4429 +.short 936 +.short 9213 +.short 0 +.short 0 +.short 69 +.short 679 +.short 447 +.short 4400 +.short -535 +.short -5266 +.short 0 +.short 0 +.short 543 +.short 5345 +.short 1235 +.short 12156 +.short -1426 +.short -14036 +.short 0 +.short 0 +.short -797 +.short -7845 +.short -1333 +.short -13121 +.short 1089 +.short 10719 +.short 0 +.short 0 +.short -193 +.short -1900 +.short -56 +.short -551 +.short 283 +.short 2786 +.short 0 +.short 0 +.short 1410 +.short 13879 +.short -1476 +.short -14529 +.short -1339 +.short -13180 +.short 0 +.short 0 +.short -1062 +.short -10453 +.short 882 +.short 8682 +.short -296 
+.short -2914 +.short 0 +.short 0 +roots_l012: +// layer 0 root modified to include ninv +.short 266 // originally: 1600 +.short 2618 // originally: 15749 +.short 40 +.short 394 +.short 749 +.short 7373 +.short -848 +.short -8347 +.short 1432 +.short 14095 +.short -630 +.short -6201 +.short 687 +.short 6762 +.short 0 +.short 0 + +#endif /* MLKEM_USE_AARCH64_ASM */ diff --git a/mlkem/asm/aarch64/ntt_123_4567.S b/mlkem/asm/aarch64/ntt_123_4567.S new file mode 100644 index 000000000..dc547bc24 --- /dev/null +++ b/mlkem/asm/aarch64/ntt_123_4567.S @@ -0,0 +1,318 @@ +/// +/// Copyright (c) 2022 Arm Limited +/// Copyright (c) 2022 Hanno Becker +/// Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE. 
+///
+
+#include "config.h"
+#if defined(MLKEM_USE_AARCH64_ASM)
+
+// Needed to provide ASM_LOAD directive
+#include "common.i"
+// mulmodq: dst = src * const.h[idx0] - sqrdmulh(src, const.h[idx1]) * consts.h[0]. Clobbers t2.
+.macro mulmodq dst, src, const, idx0, idx1
+        sqrdmulh t2.8h, \src\().8h, \const\().h[\idx1]
+        mul \dst\().8h, \src\().8h, \const\().h[\idx0]
+        mls \dst\().8h, t2.8h, consts.h[0]
+.endm
+// mulmod: same computation as mulmodq with full 8h vectors for const and const_twisted. Clobbers t2.
+.macro mulmod dst, src, const, const_twisted
+        sqrdmulh t2.8h, \src\().8h, \const_twisted\().8h
+        mul \dst\().8h, \src\().8h, \const\().8h
+        mls \dst\().8h, t2.8h, consts.h[0]
+.endm
+// ct_butterfly: t = mulmodq(b, root.h[idx0], root.h[idx1]); (a, b) <- (a + t, a - t). Clobbers tmp, t2.
+.macro ct_butterfly a, b, root, idx0, idx1
+        mulmodq tmp, \b, \root, \idx0, \idx1
+        sub \b\().8h, \a\().8h, tmp.8h
+        add \a\().8h, \a\().8h, tmp.8h
+.endm
+// ct_butterfly_v: ct_butterfly with per-lane roots held in root / root_twisted. Clobbers tmp, t2.
+.macro ct_butterfly_v a, b, root, root_twisted
+        mulmod tmp, \b, \root, \root_twisted
+        sub \b\().8h, \a\().8h, tmp.8h
+        add \a\().8h, \a\().8h, tmp.8h
+.endm
+// barrett_reduce: a <- a - srshr(sqdmulh(a, consts.h[1]), 11) * consts.h[0]. Clobbers t0.
+.macro barrett_reduce a
+        sqdmulh t0.8h, \a\().8h, consts.h[1]
+        srshr t0.8h, t0.8h, #11
+        mls \a\().8h, t0.8h, consts.h[0]
+.endm
+// load_roots_123: load q_root0 and q_root1 from r_ptr0 and advance r_ptr0 by 32 bytes.
+.macro load_roots_123
+        ldr q_root0, [r_ptr0], #32
+        ldr q_root1, [r_ptr0, #-16]
+.endm
+// load_next_roots_45: load q_root0 from r_ptr0 and advance r_ptr0 by 16 bytes.
+.macro load_next_roots_45
+        ldr q_root0, [r_ptr0], #16
+.endm
+// load_next_roots_67: load root0..root2 and their _tw partners from r_ptr1, advancing it by 96 bytes.
+.macro load_next_roots_67
+        ldr q_root0, [r_ptr1], #(6*16)
+        ldr q_root0_tw, [r_ptr1, #(-6*16 + 1*16)]
+        ldr q_root1, [r_ptr1, #(-6*16 + 2*16)]
+        ldr q_root1_tw, [r_ptr1, #(-6*16 + 3*16)]
+        ldr q_root2, [r_ptr1, #(-6*16 + 4*16)]
+        ldr q_root2_tw, [r_ptr1, #(-6*16 + 5*16)]
+.endm
+// transpose4: 4x4 transpose of the 32-bit lanes of <data>0..<data>3 (argument is the name prefix). Clobbers t0-t3.
+.macro transpose4 data
+        trn1 t0.4s, \data\()0.4s, \data\()1.4s
+        trn2 t1.4s, \data\()0.4s, \data\()1.4s
+        trn1 t2.4s, \data\()2.4s, \data\()3.4s
+        trn2 t3.4s, \data\()2.4s, \data\()3.4s
+
+        trn2 \data\()2.2d, t0.2d, t2.2d
+        trn2 \data\()3.2d, t1.2d, t3.2d
+        trn1 \data\()0.2d, t0.2d, t2.2d
+        trn1 \data\()1.2d, t1.2d, t3.2d
+.endm
+// transpose_single: only the .4s interleave stage of transpose4, written to <data_out>0..3.
+.macro transpose_single data_out, data_in
+        trn1 \data_out\()0.4s, \data_in\()0.4s, \data_in\()1.4s
+        trn2 \data_out\()1.4s, \data_in\()0.4s, \data_in\()1.4s
+        trn1 \data_out\()2.4s, \data_in\()2.4s, \data_in\()3.4s
+        trn2 \data_out\()3.4s, \data_in\()2.4s, \data_in\()3.4s
+.endm
+// save_gprs: spill x19-x29 into a fresh 96-byte stack frame.
+.macro save_gprs
+        // NOTE(review): only x0-x5 and sp are written in this file, so this spill looks removable - confirm.
+        sub sp, sp, #(16*6)
+        stp x19, x20, [sp, #16*0]
+        stp x21, x22, [sp, #16*1]
+        stp x23, x24, [sp, #16*2]
+        stp x25, x26, [sp, #16*3]
+        stp x27, x28, [sp, #16*4]
+        str x29, [sp, #16*5]
+.endm
+// restore_gprs: reload x19-x29 and release the frame made by save_gprs.
+.macro restore_gprs
+        ldp x19, x20, [sp, #16*0]
+        ldp x21, x22, [sp, #16*1]
+        ldp x23, x24, [sp, #16*2]
+        ldp x25, x26, [sp, #16*3]
+        ldp x27, x28, [sp, #16*4]
+        ldr x29, [sp, #16*5]
+        add sp, sp, #(16*6)
+.endm
+// save_vregs: spill d8-d15 (the low halves of v8-v15, used below as data0-data7) into a 64-byte frame.
+.macro save_vregs
+        sub sp, sp, #(16*4)
+        stp d8, d9, [sp, #16*0]
+        stp d10, d11, [sp, #16*1]
+        stp d12, d13, [sp, #16*2]
+        stp d14, d15, [sp, #16*3]
+.endm
+// restore_vregs: reload d8-d15 and release the frame made by save_vregs.
+.macro restore_vregs
+        ldp d8, d9, [sp, #16*0]
+        ldp d10, d11, [sp, #16*1]
+        ldp d12, d13, [sp, #16*2]
+        ldp d14, d15, [sp, #16*3]
+        add sp, sp, #(16*4)
+.endm
+
+#define STACK_SIZE 16
+#define STACK0 0
+// restore / save: load / store register a at byte offset loc from sp (scratch area of STACK_SIZE bytes).
+.macro restore a, loc
+        ldr \a, [sp, #\loc]
+.endm
+.macro save loc, a
+        str \a, [sp, #\loc]
+.endm
+.macro push_stack
+        save_gprs
+        save_vregs
+        sub sp, sp, #STACK_SIZE
+.endm
+// pop_stack: undo push_stack in reverse order.
+.macro pop_stack
+        add sp, sp, #STACK_SIZE
+        restore_vregs
+        restore_gprs
+.endm
+
+.data
+.p2align 4
+roots:
+        #include "ntt_kyber_123_45_67_twiddles.S"
+
+        in      .req x0
+        inp     .req x1
+        count   .req x2
+        r_ptr0  .req x3
+        r_ptr1  .req x4
+        xtmp    .req x5
+
+        data0  .req v8
+        data1  .req v9
+        data2  .req v10
+        data3  .req v11
+        data4  .req v12
+        data5  .req v13
+        data6  .req v14
+        data7  .req v15
+
+        q_data0  .req q8
+        q_data1  .req q9
+        q_data2  .req q10
+        q_data3  .req q11
+        q_data4  .req q12
+        q_data5  .req q13
+        q_data6  .req q14
+        q_data7  .req q15
+
+        root0    .req v0
+        root1    .req v1
+        root2    .req v2
+        root0_tw .req v4
+        root1_tw .req v5
+        root2_tw .req v6
+
+        q_root0    .req q0
+        q_root1    .req q1
+        q_root2    .req q2
+        q_root0_tw .req q4
+        q_root1_tw .req q5
+        q_root2_tw .req q6
+
+        consts   .req v7
+        q_consts .req q7
+
+        tmp .req v24
+        t0  .req v25
+        t1  .req v26
+        t2  .req v27
+        t3  .req v28
+
+        .text
+        .global ntt_kyber_123_4567
+        .global _ntt_kyber_123_4567
+
+.p2align 4
+const_addr:
+        .short 3329
+        .short 20159
+        .short 0
+        .short 0
+        .short 0
+        .short 0
+        .short 0
+        .short 0
+// void ntt_kyber_123_4567(int16_t *): in-place forward transform of the 256 coefficients (512 bytes) at x0.
+ntt_kyber_123_4567:
+_ntt_kyber_123_4567:
+        push_stack
+
+        ASM_LOAD(r_ptr0, roots)
+        ASM_LOAD(r_ptr1, roots_l56)
+        ASM_LOAD(xtmp, const_addr)
+
+        ld1 {consts.8h}, [xtmp]
+
+        str in, [sp, #STACK0] // @slothy:writes=STACK0
+        mov count, #4           // 4 iterations x 16 bytes cover one 64-byte stride
+
+        load_roots_123
+
+        .p2align 2
+layer123_start:
+
+        ldr q_data0, [in, #0]
+        ldr q_data1, [in, #(1*(512/8))]
+        ldr q_data2, [in, #(2*(512/8))]
+        ldr q_data3, [in, #(3*(512/8))]
+        ldr q_data4, [in, #(4*(512/8))]
+        ldr q_data5, [in, #(5*(512/8))]
+        ldr q_data6, [in, #(6*(512/8))]
+        ldr q_data7, [in, #(7*(512/8))]
+
+        ct_butterfly data0, data4, root0, 0, 1
+        ct_butterfly data1, data5, root0, 0, 1
+        ct_butterfly data2, data6, root0, 0, 1
+        ct_butterfly data3, data7, root0, 0, 1
+
+        ct_butterfly data0, data2, root0, 2, 3
+        ct_butterfly data1, data3, root0, 2, 3
+        ct_butterfly data4, data6, root0, 4, 5
+        ct_butterfly data5, data7, root0, 4, 5
+
+        ct_butterfly data0, data1, root0, 6, 7
+        ct_butterfly data2, data3, root1, 0, 1
+        ct_butterfly data4, data5, root1, 2, 3
+        ct_butterfly data6, data7, root1, 4, 5
+
+        str q_data0, [in], #(16)
+        str q_data1, [in, #(-16 + 1*(512/8))]
+        str q_data2, [in, #(-16 + 2*(512/8))]
+        str q_data3, [in, #(-16 + 3*(512/8))]
+        str q_data4, [in, #(-16 + 4*(512/8))]
+        str q_data5, [in, #(-16 + 5*(512/8))]
+        str q_data6, [in, #(-16 + 6*(512/8))]
+        str q_data7, [in, #(-16 + 7*(512/8))]
+
+        subs count, count, #1
+        cbnz count, layer123_start
+
+        ldr inp, [sp, #STACK0] // @slothy:reads=STACK0
+        mov count, #8           // 8 iterations x 64 bytes cover the whole 512-byte buffer
+
+        .p2align 2
+layer4567_start:
+
+        ldr q_data0, [inp, #(16*0)]
+        ldr q_data1, [inp, #(16*1)]
+        ldr q_data2, [inp, #(16*2)]
+        ldr q_data3, [inp, #(16*3)]
+
+        load_next_roots_45
+
+        ct_butterfly data0, data2, root0, 0, 1
+        ct_butterfly data1, data3, root0, 0, 1
+        ct_butterfly data0, data1, root0, 2, 3
+        ct_butterfly data2, data3, root0, 4, 5
+
+        transpose4 data
+        load_next_roots_67
+
+        ct_butterfly_v data0, data2, root0, root0_tw
+        ct_butterfly_v data1, data3, root0, root0_tw
+        ct_butterfly_v data0, data1, root1, root1_tw
+        ct_butterfly_v data2, data3, root2, root2_tw
+
+        barrett_reduce data0
+        barrett_reduce data1
+        barrett_reduce data2
+        barrett_reduce data3
+
+        st4 {data0.4S, data1.4S, data2.4S, data3.4S}, [inp], #64
+
+        subs count, count, #1
+        cbnz count, layer4567_start
+
+        pop_stack
+        ret
+
+#endif /* MLKEM_USE_AARCH64_ASM */
diff --git a/mlkem/asm/aarch64/ntt_kyber_123_45_67_twiddles.S b/mlkem/asm/aarch64/ntt_kyber_123_45_67_twiddles.S
new file mode 100644
index 000000000..fb56912b0
--- /dev/null
+++ b/mlkem/asm/aarch64/ntt_kyber_123_45_67_twiddles.S
@@ -0,0 +1,498 @@
+///
+/// Copyright (c) 2022 Arm Limited
+/// Copyright (c) 2022 Hanno Becker
+/// Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer
+/// SPDX-License-Identifier: MIT
+///
+/// Permission is hereby granted, free of charge, to any person obtaining a copy
+/// of this software and associated documentation files (the "Software"), to deal
+/// in the Software without restriction, including without limitation the rights
+/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+/// copies of the Software, and to permit persons to whom the Software is
+/// furnished to do so, subject to the following conditions:
+///
+/// The above copyright notice and this permission notice shall be included in all
+/// copies or substantial portions of the Software.
+///
+/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+/// SOFTWARE.
+/// + +#include "config.h" +#if defined(MLKEM_USE_AARCH64_ASM) + +.p2align 2 +roots_l012: +.short -1600 +.short -15749 +.short -749 +.short -7373 +.short -40 +.short -394 +.short -687 +.short -6762 +.short 630 +.short 6201 +.short -1432 +.short -14095 +.short 848 +.short 8347 +.short 0 +.short 0 +roots_l34: +.short 1062 +.short 10453 +.short 296 +.short 2914 +.short -882 +.short -8682 +.short 0 +.short 0 +.short -1410 +.short -13879 +.short 1339 +.short 13180 +.short 1476 +.short 14529 +.short 0 +.short 0 +.short 193 +.short 1900 +.short -283 +.short -2786 +.short 56 +.short 551 +.short 0 +.short 0 +.short 797 +.short 7845 +.short -1089 +.short -10719 +.short 1333 +.short 13121 +.short 0 +.short 0 +.short -543 +.short -5345 +.short 1426 +.short 14036 +.short -1235 +.short -12156 +.short 0 +.short 0 +.short -69 +.short -679 +.short 535 +.short 5266 +.short -447 +.short -4400 +.short 0 +.short 0 +.short 569 +.short 5601 +.short -936 +.short -9213 +.short -450 +.short -4429 +.short 0 +.short 0 +.short -1583 +.short -15582 +.short -1355 +.short -13338 +.short 821 +.short 8081 +.short 0 +.short 0 +roots_l56: +.short 289 +.short 289 +.short 331 +.short 331 +.short -76 +.short -76 +.short -1573 +.short -1573 +.short 2845 +.short 2845 +.short 3258 +.short 3258 +.short -748 +.short -748 +.short -15483 +.short -15483 +.short 17 +.short 17 +.short 583 +.short 583 +.short 1637 +.short 1637 +.short -1041 +.short -1041 +.short 167 +.short 167 +.short 5739 +.short 5739 +.short 16113 +.short 16113 +.short -10247 +.short -10247 +.short -568 +.short -568 +.short -680 +.short -680 +.short 723 +.short 723 +.short 1100 +.short 1100 +.short -5591 +.short -5591 +.short -6693 +.short -6693 +.short 7117 +.short 7117 +.short 10828 +.short 10828 +.short 1197 +.short 1197 +.short -1025 +.short -1025 +.short -1052 +.short -1052 +.short -1274 +.short -1274 +.short 11782 +.short 11782 +.short -10089 +.short -10089 +.short -10355 +.short -10355 +.short -12540 +.short -12540 +.short 1409 +.short 
1409 +.short -48 +.short -48 +.short 756 +.short 756 +.short -314 +.short -314 +.short 13869 +.short 13869 +.short -472 +.short -472 +.short 7441 +.short 7441 +.short -3091 +.short -3091 +.short -667 +.short -667 +.short 233 +.short 233 +.short -1173 +.short -1173 +.short -279 +.short -279 +.short -6565 +.short -6565 +.short 2293 +.short 2293 +.short -11546 +.short -11546 +.short -2746 +.short -2746 +.short 650 +.short 650 +.short -1352 +.short -1352 +.short -816 +.short -816 +.short 632 +.short 632 +.short 6398 +.short 6398 +.short -13308 +.short -13308 +.short -8032 +.short -8032 +.short 6221 +.short 6221 +.short -1626 +.short -1626 +.short -540 +.short -540 +.short -1482 +.short -1482 +.short 1461 +.short 1461 +.short -16005 +.short -16005 +.short -5315 +.short -5315 +.short -14588 +.short -14588 +.short 14381 +.short 14381 +.short 1651 +.short 1651 +.short -1540 +.short -1540 +.short 952 +.short 952 +.short -642 +.short -642 +.short 16251 +.short 16251 +.short -15159 +.short -15159 +.short 9371 +.short 9371 +.short -6319 +.short -6319 +.short -464 +.short -464 +.short 33 +.short 33 +.short 1320 +.short 1320 +.short -1414 +.short -1414 +.short -4567 +.short -4567 +.short 325 +.short 325 +.short 12993 +.short 12993 +.short -13918 +.short -13918 +.short 939 +.short 939 +.short -892 +.short -892 +.short 733 +.short 733 +.short 268 +.short 268 +.short 9243 +.short 9243 +.short -8780 +.short -8780 +.short 7215 +.short 7215 +.short 2638 +.short 2638 +.short -1021 +.short -1021 +.short -941 +.short -941 +.short -992 +.short -992 +.short 641 +.short 641 +.short -10050 +.short -10050 +.short -9262 +.short -9262 +.short -9764 +.short -9764 +.short 6309 +.short 6309 +.short -1010 +.short -1010 +.short 1435 +.short 1435 +.short 807 +.short 807 +.short 452 +.short 452 +.short -9942 +.short -9942 +.short 14125 +.short 14125 +.short 7943 +.short 7943 +.short 4449 +.short 4449 +.short 1584 +.short 1584 +.short -1292 +.short -1292 +.short 375 +.short 375 +.short -1239 +.short 
-1239 +.short 15592 +.short 15592 +.short -12717 +.short -12717 +.short 3691 +.short 3691 +.short -12196 +.short -12196 +.short -1031 +.short -1031 +.short -109 +.short -109 +.short -780 +.short -780 +.short 1645 +.short 1645 +.short -10148 +.short -10148 +.short -1073 +.short -1073 +.short -7678 +.short -7678 +.short 16192 +.short 16192 +.short 1438 +.short 1438 +.short -461 +.short -461 +.short 1534 +.short 1534 +.short -927 +.short -927 +.short 14155 +.short 14155 +.short -4538 +.short -4538 +.short 15099 +.short 15099 +.short -9125 +.short -9125 +.short 1063 +.short 1063 +.short -556 +.short -556 +.short -1230 +.short -1230 +.short -863 +.short -863 +.short 10463 +.short 10463 +.short -5473 +.short -5473 +.short -12107 +.short -12107 +.short -8495 +.short -8495 +.short 319 +.short 319 +.short 757 +.short 757 +.short 561 +.short 561 +.short -735 +.short -735 +.short 3140 +.short 3140 +.short 7451 +.short 7451 +.short 5522 +.short 5522 +.short -7235 +.short -7235 +.short -682 +.short -682 +.short -712 +.short -712 +.short 1481 +.short 1481 +.short 648 +.short 648 +.short -6713 +.short -6713 +.short -7008 +.short -7008 +.short 14578 +.short 14578 +.short 6378 +.short 6378 +.short -525 +.short -525 +.short 403 +.short 403 +.short 1143 +.short 1143 +.short -554 +.short -554 +.short -5168 +.short -5168 +.short 3967 +.short 3967 +.short 11251 +.short 11251 +.short -5453 +.short -5453 +.short 1092 +.short 1092 +.short 1026 +.short 1026 +.short -1179 +.short -1179 +.short 886 +.short 886 +.short 10749 +.short 10749 +.short 10099 +.short 10099 +.short -11605 +.short -11605 +.short 8721 +.short 8721 +.short -855 +.short -855 +.short -219 +.short -219 +.short 1227 +.short 1227 +.short 910 +.short 910 +.short -8416 +.short -8416 +.short -2156 +.short -2156 +.short 12078 +.short 12078 +.short 8957 +.short 8957 +.short -1607 +.short -1607 +.short -1455 +.short -1455 +.short -1219 +.short -1219 +.short 885 +.short 885 +.short -15818 +.short -15818 +.short -14322 +.short -14322 
+.short -11999 +.short -11999 +.short 8711 +.short 8711 +.short 1212 +.short 1212 +.short 1029 +.short 1029 +.short -394 +.short -394 +.short -1175 +.short -1175 +.short 11930 +.short 11930 +.short 10129 +.short 10129 +.short -3878 +.short -3878 +.short -11566 +.short -11566 + +#endif /* MLKEM_USE_AARCH64_ASM */ diff --git a/mlkem/asm/asm.h b/mlkem/asm/asm.h new file mode 100644 index 000000000..083db0634 --- /dev/null +++ b/mlkem/asm/asm.h @@ -0,0 +1,14 @@ +// SPDX-License-Identifier: Apache-2.0 +#ifndef ASM_H +#define ASM_H + +#include <stdint.h> +#include "params.h" +#include "config.h" + +#ifdef MLKEM_USE_AARCH64_ASM +void ntt_kyber_123_4567(int16_t *); +void intt_kyber_123_4567(int16_t *); +#endif /* MLKEM_USE_AARCH64_ASM */ + +#endif diff --git a/mlkem/ntt.c b/mlkem/ntt.c index 10cdb0e9d..3a6496f5b 100644 --- a/mlkem/ntt.c +++ b/mlkem/ntt.c @@ -1,8 +1,10 @@ // SPDX-License-Identifier: Apache-2.0 -#include <stdint.h> -#include "params.h" #include "ntt.h" +#include "params.h" #include "reduce.h" +#include <stdint.h> + +#include "asm/asm.h" /* Code to generate zetas and zetas_inv used in the number-theoretic transform: @@ -39,49 +41,49 @@ void init_ntt() { const int16_t zetas[128] = { - -1044, -758, -359, -1517, 1493, 1422, 287, 202, - -171, 622, 1577, 182, 962, -1202, -1474, 1468, - 573, -1325, 264, 383, -829, 1458, -1602, -130, - -681, 1017, 732, 608, -1542, 411, -205, -1571, - 1223, 652, -552, 1015, -1293, 1491, -282, -1544, - 516, -8, -320, -666, -1618, -1162, 126, 1469, - -853, -90, -271, 830, 107, -1421, -247, -951, - -398, 961, -1508, -725, 448, -1065, 677, -1275, - -1103, 430, 555, 843, -1251, 871, 1550, 105, - 422, 587, 177, -235, -291, -460, 1574, 1653, - -246, 778, 1159, -147, -777, 1483, -602, 1119, - -1590, 644, -872, 349, 418, 329, -156, -75, - 817, 1097, 603, 610, 1322, -1285, -1465, 384, - -1215, -136, 1218, -1335, -874, 220, -1187, -1659, - -1185, -1530, -1278, 794, -1510, -854, -870, 478, - -108, -308, 996, 991, 958, -1460, 1522, 1628 + -1044, -758, -359, -1517, 1493, 1422, 
287, 202, -171, 622, 1577, + 182, 962, -1202, -1474, 1468, 573, -1325, 264, 383, -829, 1458, + -1602, -130, -681, 1017, 732, 608, -1542, 411, -205, -1571, 1223, + 652, -552, 1015, -1293, 1491, -282, -1544, 516, -8, -320, -666, + -1618, -1162, 126, 1469, -853, -90, -271, 830, 107, -1421, -247, + -951, -398, 961, -1508, -725, 448, -1065, 677, -1275, -1103, 430, + 555, 843, -1251, 871, 1550, 105, 422, 587, 177, -235, -291, + -460, 1574, 1653, -246, 778, 1159, -147, -777, 1483, -602, 1119, + -1590, 644, -872, 349, 418, 329, -156, -75, 817, 1097, 603, + 610, 1322, -1285, -1465, 384, -1215, -136, 1218, -1335, -874, 220, + -1187, -1659, -1185, -1530, -1278, 794, -1510, -854, -870, 478, -108, + -308, 996, 991, 958, -1460, 1522, 1628 }; /************************************************* -* Name: fqmul -* -* Description: Multiplication followed by Montgomery reduction -* -* Arguments: - int16_t a: first factor -* - int16_t b: second factor -* -* Returns 16-bit integer congruent to a*b*R^{-1} mod q -**************************************************/ + * Name: fqmul + * + * Description: Multiplication followed by Montgomery reduction + * + * Arguments: - int16_t a: first factor + * - int16_t b: second factor + * + * Returns 16-bit integer congruent to a*b*R^{-1} mod q + **************************************************/ static int16_t fqmul(int16_t a, int16_t b) { return montgomery_reduce((int32_t)a * b); } /************************************************* -* Name: ntt -* -* Description: Inplace number-theoretic transform (NTT) in Rq. -* input is in standard order, output is in bitreversed order -* -* Arguments: - int16_t r[256]: pointer to input/output vector of elements of Zq -**************************************************/ + * Name: ntt + * + * Description: Inplace number-theoretic transform (NTT) in Rq. 
+ * input is in standard order, output is in bitreversed order + * + * Arguments: - int16_t r[256]: pointer to input/output vector of + * elements of Zq + **************************************************/ void ntt(int16_t r[256]) { + #ifdef MLKEM_USE_AARCH64_ASM + ntt_kyber_123_4567(r); + #else /* MLKEM_USE_AARCH64_ASM */ unsigned int len, start, j, k; int16_t t, zeta; @@ -99,19 +101,24 @@ void ntt(int16_t r[256]) } } } + #endif /* MLKEM_USE_AARCH64_ASM */ } /************************************************* -* Name: invntt_tomont -* -* Description: Inplace inverse number-theoretic transform in Rq and -* multiplication by Montgomery factor 2^16. -* Input is in bitreversed order, output is in standard order -* -* Arguments: - int16_t r[256]: pointer to input/output vector of elements of Zq -**************************************************/ + * Name: invntt + * + * Description: Inplace inverse number-theoretic transform in Rq and + * multiplication by Montgomery factor 2^16. + * Input is in bitreversed order, output is in standard order + * + * Arguments: - int16_t r[256]: pointer to input/output vector of + * elements of Zq + **************************************************/ void invntt(int16_t r[256]) { + #ifdef MLKEM_USE_AARCH64_ASM + intt_kyber_123_4567(r); + #else /* MLKEM_USE_AARCH64_ASM */ unsigned int start, len, j, k; int16_t t, zeta; const int16_t f = 1441; // mont^2/128 @@ -136,24 +143,26 @@ void invntt(int16_t r[256]) { r[j] = fqmul(r[j], f); } + #endif /* MLKEM_USE_AARCH64_ASM */ } /************************************************* -* Name: basemul -* -* Description: Multiplication of polynomials in Zq[X]/(X^2-zeta) -* used for multiplication of elements in Rq in NTT domain -* -* Arguments: - int16_t r[2]: pointer to the output polynomial -* - const int16_t a[2]: pointer to the first factor -* - const int16_t b[2]: pointer to the second factor -* - int16_t zeta: integer defining the reduction polynomial 
-**************************************************/ -void basemul(int16_t r[2], const int16_t a[2], const int16_t b[2], int16_t zeta) + * Name: basemul + * + * Description: Multiplication of polynomials in Zq[X]/(X^2-zeta) + * used for multiplication of elements in Rq in NTT domain + * + * Arguments: - int16_t r[2]: pointer to the output polynomial + * - const int16_t a[2]: pointer to the first factor + * - const int16_t b[2]: pointer to the second factor + * - int16_t zeta: integer defining the reduction polynomial + **************************************************/ +void basemul(int16_t r[2], const int16_t a[2], const int16_t b[2], + int16_t zeta) { - r[0] = fqmul(a[1], b[1]); - r[0] = fqmul(r[0], zeta); + r[0] = fqmul(a[1], b[1]); + r[0] = fqmul(r[0], zeta); r[0] += fqmul(a[0], b[0]); - r[1] = fqmul(a[0], b[1]); + r[1] = fqmul(a[0], b[1]); r[1] += fqmul(a[1], b[0]); } diff --git a/mlkem/sys/config.h b/mlkem/sys/config.h new file mode 100644 index 000000000..0d17cc3b1 --- /dev/null +++ b/mlkem/sys/config.h @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: Apache-2.0 + +#ifndef CONFIG_H +#define CONFIG_H + +#include "cpucap.h" + +#if defined(MLKEM_USE_ASM) + +#if defined(SYS_AARCH64) +#define MLKEM_USE_AARCH64_ASM +#else /* SYS_AARCH64 */ +/* Check x86_64 at some point */ +#warning "Selected optimized build, but no platform-specific assembly present" +#endif /* SYS_AARCH64 */ + +#endif /* MLKEM_USE_ASM */ +#endif /* CONFIG_H */ diff --git a/scripts/tests b/scripts/tests index d63131013..4d0ef842e 100755 --- a/scripts/tests +++ b/scripts/tests @@ -290,6 +290,13 @@ _shared_options = [ nargs=1, help="Extra arch flags to passed in (e.g. 
'-march=armv8')", ), + click.option( + "--opt", + type=bool, + show_default=True, + default=True, + help="Choose whether to enable assembly optimizations (if present)", + ), ] @@ -302,7 +309,7 @@ def add_options(options): context_settings={"show_default": True}, ) @add_options(_shared_options) -def func(force_qemu, verbose, cross_prefix, cflags, arch_flags): +def func(force_qemu, verbose, cross_prefix, cflags, arch_flags, opt): config_logger(verbose) def expect(scheme): @@ -325,6 +332,7 @@ def func(force_qemu, verbose, cross_prefix, cflags, arch_flags): verbose, cross_prefix, extra_make_envs=process_make_envs(cflags, arch_flags), + extra_make_args=[f"OPT={int(opt)}"], ) @@ -333,7 +341,7 @@ def func(force_qemu, verbose, cross_prefix, cflags, arch_flags): context_settings={"show_default": True}, ) @add_options(_shared_options) -def nistkat(force_qemu, verbose, cross_prefix, cflags, arch_flags): +def nistkat(force_qemu, verbose, cross_prefix, cflags, arch_flags, opt): config_logger(verbose) test_schemes( @@ -345,6 +353,7 @@ def nistkat(force_qemu, verbose, cross_prefix, cflags, arch_flags): verbose, cross_prefix, extra_make_envs=process_make_envs(cflags, arch_flags), + extra_make_args=[f"OPT={int(opt)}"], ) @@ -353,7 +362,7 @@ def nistkat(force_qemu, verbose, cross_prefix, cflags, arch_flags): context_settings={"show_default": True}, ) @add_options(_shared_options) -def kat(force_qemu, verbose, cross_prefix, cflags, arch_flags): +def kat(force_qemu, verbose, cross_prefix, cflags, arch_flags, opt): config_logger(verbose) test_schemes( @@ -365,6 +374,7 @@ def kat(force_qemu, verbose, cross_prefix, cflags, arch_flags): verbose, cross_prefix, extra_make_envs=process_make_envs(cflags, arch_flags), + extra_make_args=[f"OPT={int(opt)}"], ) @@ -419,6 +429,7 @@ def bench( cross_prefix, cflags, arch_flags, + opt, output, run_as_root, exec_wrapper, @@ -446,6 +457,7 @@ def bench( extra_make_envs=process_make_envs(cflags, arch_flags), extra_make_args=[ f"CYCLES={cycles}", + 
f"OPT={int(opt)}", ], )