From 9744672e954804f1432783464d29512a0e49a009 Mon Sep 17 00:00:00 2001 From: Hanno Becker Date: Mon, 18 Mar 2024 05:22:10 +0000 Subject: [PATCH] Kyber Neon NTT: Save some GPR stashing --- paper/clean/neon/ntt_kyber_1234_567.s | 133 ++++++------------ paper/clean/neon/ntt_kyber_123_4567.s | 94 ++++--------- .../neon/ntt_kyber_123_4567_scalar_load.s | 105 ++++---------- .../clean/neon/ntt_kyber_123_45_67_twiddles.s | 1 - paper/scripts/slothy_kyber_ntt_a55.sh | 4 +- paper/scripts/slothy_kyber_ntt_a72.sh | 8 +- 6 files changed, 110 insertions(+), 235 deletions(-) diff --git a/paper/clean/neon/ntt_kyber_1234_567.s b/paper/clean/neon/ntt_kyber_1234_567.s index 9b43d6fe..77bb34db 100644 --- a/paper/clean/neon/ntt_kyber_1234_567.s +++ b/paper/clean/neon/ntt_kyber_1234_567.s @@ -108,36 +108,6 @@ vmlaq \a, t0, consts, 0 .endm -.macro load_roots_123 - ldr_vi root0, r_ptr0, 32 - ldr_vo root1, r_ptr0, -16 -.endm - -.macro load_next_roots_45 root0, r_ptr0 - ldr_vi \root0, \r_ptr0, 16 -.endm - -.macro load_next_roots_67 root0, root0_tw, root1, root1_tw, root2, root2_tw, r_ptr1 - ldr_vi \root0, \r_ptr1, (6*16) - ldr_vo \root0_tw, \r_ptr1, (-6*16 + 1*16) - ldr_vo \root1, \r_ptr1, (-6*16 + 2*16) - ldr_vo \root1_tw, \r_ptr1, (-6*16 + 3*16) - ldr_vo \root2, \r_ptr1, (-6*16 + 4*16) - ldr_vo \root2_tw, \r_ptr1, (-6*16 + 5*16) -.endm - -.macro transpose4 data - trn1 t0.4s, \data\()0.4s, \data\()1.4s - trn2 t1.4s, \data\()0.4s, \data\()1.4s - trn1 t2.4s, \data\()2.4s, \data\()3.4s - trn2 t3.4s, \data\()2.4s, \data\()3.4s - - trn2 \data\()2.2d, t0.2d, t2.2d - trn2 \data\()3.2d, t1.2d, t3.2d - trn1 \data\()0.2d, t0.2d, t2.2d - trn1 \data\()1.2d, t1.2d, t3.2d -.endm - .macro save_gprs // @slothy:no-unfold sub sp, sp, #(16*6) stp x19, x20, [sp, #16*0] @@ -196,29 +166,6 @@ restore_gprs .endm -.data -.p2align 4 -roots: -#include "ntt_kyber_1234_567_twiddles.s" -.text - - .global ntt_kyber_1234_567 - .global _ntt_kyber_1234_567 - -.p2align 4 -const_addr: .short -3329 - .short 20159 - .short 0 - .short 0 - .short 0 - .short 0 - .short 0 - .short 0 - -ntt_kyber_1234_567: -_ntt_kyber_1234_567: - push_stack - in .req x0 inp .req x1 count .req x2 @@ -228,20 +175,6 @@ _ntt_kyber_1234_567: src0 .req x6 src1 .req x7 - src2 .req x8 - src3 .req x9 - src4 .req x10 - src5 .req x11 - src6 .req x12 - src7 .req x13 - src8 .req x14 - src9 .req x15 - src10 .req x16 - src11 .req x17 - src12 .req x18 - src13 .req x19 - src14 .req x20 - src15 .req x21 qform_v0 .req q0 qform_v1 .req q1 @@ -336,17 +269,43 @@ _ntt_kyber_1234_567: consts .req v8 - ASM_LOAD(r_ptr0, roots) +.data +.p2align 4 +roots: +#include "ntt_kyber_1234_567_twiddles.s" +.text + .global ntt_kyber_1234_567 + .global _ntt_kyber_1234_567 + +.p2align 4 +const_addr: .short -3329 + .short 20159 + .short 0 + .short 0 + .short 0 + .short 0 + .short 0 + .short 0 + +ntt_kyber_1234_567: +_ntt_kyber_1234_567: + push_stack + + ASM_LOAD(r_ptr0, roots) + ASM_LOAD(r_ptr1, roots_l456) ASM_LOAD(xtmp, const_addr) ld1 {consts.8h}, [xtmp] save STACK0, in add src0, x0, #32*0 - add src8, x0, #32*8 + add src1, x0, #32*8 - ld1 { root0.8h, root1.8h, root2.8h, root3.8h}, [r_ptr0], #64 + ldr_vo root0, r_ptr0, 0 + ldr_vo root1, r_ptr0, 16 + ldr_vo root2, r_ptr0, 32 + ldr_vo root3, r_ptr0, 48 mov count, #2 @@ -362,14 +321,14 @@ layer1234_start: ldr_vo data6, src0, 6*32 ldr_vo data7, src0, 7*32 - ldr_vo data8, src8, 0 - ldr_vo data9, src8, 1*32 - ldr_vo data10, src8, 2*32 - ldr_vo data11, src8, 3*32 - ldr_vo data12, src8, 4*32 - ldr_vo data13, src8, 5*32 - ldr_vo data14, src8, 6*32 - ldr_vo data15, src8, 7*32 + ldr_vo data8, src1, 0 + ldr_vo data9, src1, 1*32 + ldr_vo data10, src1, 2*32 + ldr_vo data11, src1, 3*32 + ldr_vo data12, src1, 4*32 + ldr_vo data13, src1, 5*32 + ldr_vo data14, src1, 6*32 + ldr_vo data15, src1, 7*32 ct_butterfly data0, data8, root0, 0, 1 ct_butterfly data1, data9, root0, 0, 1 @@ -416,14 +375,14 @@ layer1234_start: str_vo data6, src0, -16+6*32 str_vo data7, src0, -16+7*32 - str_vi data8, src8, 16 - str_vo data9, src8, -16+1*32 - str_vo data10, src8, -16+2*32 - str_vo data11, src8, -16+3*32 - str_vo data12, src8, -16+4*32 - str_vo data13, src8, -16+5*32 - str_vo data14, src8, -16+6*32 - str_vo data15, src8, -16+7*32 + str_vi data8, src1, 16 + str_vo data9, src1, -16+1*32 + str_vo data10, src1, -16+2*32 + str_vo data11, src1, -16+3*32 + str_vo data12, src1, -16+4*32 + str_vo data13, src1, -16+5*32 + str_vo data14, src1, -16+6*32 + str_vo data15, src1, -16+7*32 subs count, count, #1 cbnz count, layer1234_start @@ -431,8 +390,6 @@ layer1234_start: restore inp, STACK0 mov count, #4 - ASM_LOAD(r_ptr1, roots_l456) - add src0, inp, #256*0 add src1, inp, #256*1 diff --git a/paper/clean/neon/ntt_kyber_123_4567.s b/paper/clean/neon/ntt_kyber_123_4567.s index 778841da..52d7a56d 100644 --- a/paper/clean/neon/ntt_kyber_123_4567.s +++ b/paper/clean/neon/ntt_kyber_123_4567.s @@ -139,27 +139,6 @@ trn2 \data_out\()3.4s, \data_in\()2.4s, \data_in\()3.4s .endm -.macro save_gprs // @slothy:no-unfold - sub sp, sp, #(16*6) - stp x19, x20, [sp, #16*0] - stp x19, x20, [sp, #16*0] - stp x21, x22, [sp, #16*1] - stp x23, x24, [sp, #16*2] - stp x25, x26, [sp, #16*3] - stp x27, x28, [sp, #16*4] - str x29, [sp, #16*5] -.endm - -.macro restore_gprs // @slothy:no-unfold - ldp x19, x20, [sp, #16*0] - ldp x21, x22, [sp, #16*1] - ldp x23, x24, [sp, #16*2] - ldp x25, x26, [sp, #16*3] - ldp x27, x28, [sp, #16*4] - ldr x29, [sp, #16*5] - add sp, sp, #(16*6) -.endm - .macro save_vregs // @slothy:no-unfold sub sp, sp, #(16*4) stp d8, d9, [sp, #16*0] @@ -176,51 +155,16 @@ add sp, sp, #(16*4) .endm -#define STACK_SIZE 16 -#define STACK0 0 - -.macro restore a, loc // @slothy:no-unfold - ldr \a, [sp, #\loc\()] -.endm -.macro save loc, a // @slothy:no-unfold - str \a, [sp, #\loc\()] -.endm .macro push_stack // @slothy:no-unfold - save_gprs save_vregs - sub sp, sp, #STACK_SIZE .endm .macro pop_stack // @slothy:no-unfold - add sp, sp, #STACK_SIZE restore_vregs - restore_gprs .endm -.data -.p2align 4 -roots: -#include "ntt_kyber_123_45_67_twiddles.s" -.text - - .global ntt_kyber_123_4567 - .global _ntt_kyber_123_4567 - -.p2align 4 -const_addr: .short 3329 - .short 20159 - .short 0 - .short 0 - .short 0 - .short 0 - .short 0 - .short 0 -ntt_kyber_123_4567: -_ntt_kyber_123_4567: - push_stack - in .req x0 - inp .req x1 + in_orig .req x1 count .req x2 r_ptr0 .req x3 r_ptr1 .req x4 @@ -318,13 +262,35 @@ _ntt_kyber_123_4567: t2 .req v27 t3 .req v28 +.data +.p2align 4 +roots: +#include "ntt_kyber_123_45_67_twiddles.s" +.text + + .global ntt_kyber_123_4567 + .global _ntt_kyber_123_4567 + +.p2align 4 +const_addr: .short 3329 + .short 20159 + .short 0 + .short 0 + .short 0 + .short 0 + .short 0 + .short 0 +ntt_kyber_123_4567: +_ntt_kyber_123_4567: + push_stack + ASM_LOAD(r_ptr0, roots) ASM_LOAD(r_ptr1, roots_l56) ASM_LOAD(xtmp, const_addr) ld1 {consts.8h}, [xtmp] - save STACK0, in + mov in_orig, in mov count, #4 load_roots_123 @@ -368,15 +334,15 @@ layer123_start: subs count, count, #1 cbnz count, layer123_start - restore inp, STACK0 + mov in, in_orig mov count, #8 .p2align 2 layer4567_start: - ldr_vo data0, inp, (16*0) - ldr_vo data1, inp, (16*1) - ldr_vo data2, inp, (16*2) - ldr_vo data3, inp, (16*3) + ldr_vo data0, in, (16*0) + ldr_vo data1, in, (16*1) + ldr_vo data2, in, (16*2) + ldr_vo data3, in, (16*3) load_next_roots_45 @@ -397,7 +363,7 @@ layer4567_start: barrett_reduce data1 barrett_reduce data2 barrett_reduce data3 - st4 {data0.4S, data1.4S, data2.4S, data3.4S}, [inp], #64 + st4 {data0.4S, data1.4S, data2.4S, data3.4S}, [in], #64 subs count, count, #1 cbnz count, layer4567_start diff --git a/paper/clean/neon/ntt_kyber_123_4567_scalar_load.s b/paper/clean/neon/ntt_kyber_123_4567_scalar_load.s index 781b0494..992e8763 100644 --- a/paper/clean/neon/ntt_kyber_123_4567_scalar_load.s +++ b/paper/clean/neon/ntt_kyber_123_4567_scalar_load.s @@ -151,27 +151,6 @@ xtmp1 .req x11 trn2 \data_out\()3.4s, \data_in\()2.4s, \data_in\()3.4s .endm -.macro save_gprs // @slothy:no-unfold - sub sp, sp, #(16*6) - stp x19, x20, [sp, #16*0] - stp x19, x20, [sp, #16*0] - stp x21, x22, [sp, #16*1] - stp x23, x24, [sp, #16*2] - stp x25, x26, [sp, #16*3] - stp x27, x28, [sp, #16*4] - str x29, [sp, #16*5] -.endm - -.macro restore_gprs // @slothy:no-unfold - ldp x19, x20, [sp, #16*0] - ldp x21, x22, [sp, #16*1] - ldp x23, x24, [sp, #16*2] - ldp x25, x26, [sp, #16*3] - ldp x27, x28, [sp, #16*4] - ldr x29, [sp, #16*5] - add sp, sp, #(16*6) -.endm - .macro save_vregs // @slothy:no-unfold sub sp, sp, #(16*4) stp d8, d9, [sp, #16*0] @@ -188,9 +167,6 @@ xtmp1 .req x11 add sp, sp, #(16*4) .endm -#define STACK_SIZE 16 -#define STACK0 0 - .macro restore a, loc // @slothy:no-unfold ldr \a, [sp, #\loc\()] .endm @@ -198,41 +174,15 @@ xtmp1 .req x11 str \a, [sp, #\loc\()] .endm .macro push_stack // @slothy:no-unfold - save_gprs save_vregs - sub sp, sp, #STACK_SIZE .endm .macro pop_stack // @slothy:no-unfold - add sp, sp, #STACK_SIZE restore_vregs - restore_gprs .endm -.data -.p2align 4 -roots: -#include "ntt_kyber_123_45_67_twiddles.s" -.text - - .global ntt_kyber_123_4567_scalar_load - .global _ntt_kyber_123_4567_scalar_load - -.p2align 4 -const_addr: .short 3329 - .short 20159 - .short 0 - .short 0 - .short 0 - .short 0 - .short 0 - .short 0 -ntt_kyber_123_4567_scalar_load: -_ntt_kyber_123_4567_scalar_load: - push_stack - in .req x0 - inp .req x1 + in_orig .req x1 count .req x2 r_ptr0 .req x3 r_ptr1 .req x4 @@ -280,24 +230,6 @@ _ntt_kyber_123_4567_scalar_load: data6 .req v14 data7 .req v15 - x_00 .req x10 - x_01 .req x11 - x_10 .req x12 - x_11 .req x13 - x_20 .req x14 - x_21 .req x15 - x_30 .req x16 - x_31 .req x17 - - xt_00 .req x_00 - xt_01 .req x_20 - xt_10 .req x_10 - xt_11 .req x_30 - xt_20 .req x_01 - xt_21 .req x_21 - xt_30 .req x_11 - xt_31 .req x_31 - qform_data0 .req q8 qform_data1 .req q9 qform_data2 .req q10 @@ -330,13 +262,34 @@ _ntt_kyber_123_4567_scalar_load: t2 .req v27 t3 .req v28 +.data +.p2align 4 +roots: +#include "ntt_kyber_123_45_67_twiddles.s" +.text + + .global ntt_kyber_123_4567_scalar_load + .global _ntt_kyber_123_4567_scalar_load + +.p2align 4 +const_addr: .short 3329 + .short 20159 + .short 0 + .short 0 + .short 0 + .short 0 + .short 0 + .short 0 +ntt_kyber_123_4567_scalar_load: +_ntt_kyber_123_4567_scalar_load: + push_stack ASM_LOAD(r_ptr0, roots) ASM_LOAD(r_ptr1, roots_l56) ASM_LOAD(xtmp, const_addr) ld1 {consts.8h}, [xtmp] - save STACK0, in + mov in_orig, in mov count, #4 load_roots_123 @@ -380,15 +333,15 @@ layer123_start: subs count, count, #1 cbnz count, layer123_start - restore inp, STACK0 + mov in, in_orig mov count, #8 .p2align 2 layer4567_start: - ldr_vo data0, inp, (16*0) - ldr_vo data1, inp, (16*1) - ldr_vo data2, inp, (16*2) - ldr_vo data3, inp, (16*3) + ldr_vo data0, in, (16*0) + ldr_vo data1, in, (16*1) + ldr_vo data2, in, (16*2) + ldr_vo data3, in, (16*3) load_next_roots_45 @@ -409,7 +362,7 @@ layer4567_start: barrett_reduce data1 barrett_reduce data2 barrett_reduce data3 - st4 {data0.4S, data1.4S, data2.4S, data3.4S}, [inp], #64 + st4 {data0.4S, data1.4S, data2.4S, data3.4S}, [in], #64 subs count, count, #1 cbnz count, layer4567_start diff --git a/paper/clean/neon/ntt_kyber_123_45_67_twiddles.s b/paper/clean/neon/ntt_kyber_123_45_67_twiddles.s index 6015bf48..9fa7dab0 100644 --- a/paper/clean/neon/ntt_kyber_123_45_67_twiddles.s +++ b/paper/clean/neon/ntt_kyber_123_45_67_twiddles.s @@ -1,4 +1,3 @@ - /// /// Copyright (c) 2022 Arm Limited /// Copyright (c) 2022 Hanno Becker diff --git a/paper/scripts/slothy_kyber_ntt_a55.sh b/paper/scripts/slothy_kyber_ntt_a55.sh index 31f604b2..f14338bb 100755 --- a/paper/scripts/slothy_kyber_ntt_a55.sh +++ b/paper/scripts/slothy_kyber_ntt_a55.sh @@ -33,7 +33,7 @@ time ${SLOTHY_DIR}/slothy-cli Arm_AArch64 Arm_Cortex_A55 \ -c sw_pipelining.enabled=true \ -o ${OPT_DIR}/neon/ntt_kyber_123_4567_opt_a55.s \ -r ntt_kyber_123_4567,ntt_kyber_123_4567_opt_a55 \ - -c reserved_regs="[x0,x1,x2,x3,x4,x5,x6,x30,sp]" \ + -c reserved_regs="[x0--x30,sp]" \ -c inputs_are_outputs \ -c sw_pipelining.minimize_overlapping=False \ -c constraints.stalls_first_attempt=64 -c variable_size \ @@ -48,7 +48,7 @@ time ${SLOTHY_DIR}/slothy-cli Arm_AArch64 Arm_Cortex_A55 -c sw_pipelining.enabled=true \ -o ${OPT_DIR}/neon/ntt_kyber_123_4567_scalar_load_opt_a55.s \ -r ntt_kyber_123_4567_scalar_load,ntt_kyber_123_4567_scalar_load_opt_a55 \ - -c reserved_regs="[x0,x1,x2,x3,x4,x5,x6,x30,sp]" \ + -c reserved_regs="[x0--x5,x18--x30,sp]" \ -c inputs_are_outputs \ -c sw_pipelining.minimize_overlapping=False \ -c constraints.stalls_first_attempt=64 \ diff --git a/paper/scripts/slothy_kyber_ntt_a72.sh b/paper/scripts/slothy_kyber_ntt_a72.sh index 178e547c..2b4ccb7a 100755 --- a/paper/scripts/slothy_kyber_ntt_a72.sh +++ b/paper/scripts/slothy_kyber_ntt_a72.sh @@ -33,7 +33,7 @@ time ${SLOTHY_DIR}/slothy-cli Arm_AArch64 Arm_Cortex_A72_frontend \ -l layer123_start \ -l layer4567_start \ -c sw_pipelining.enabled=true \ - -c reserved_regs="[x0,x1,x2,x3,x4,x5,x6,x30,sp]" \ + -c reserved_regs="[x0--x30,sp]" \ -c inputs_are_outputs \ -c sw_pipelining.minimize_overlapping=False \ -c constraints.stalls_first_attempt=64 -c variable_size \ @@ -48,7 +48,7 @@ time ${SLOTHY_DIR}/slothy-cli Arm_AArch64 Arm_Cortex_A72_frontend -l layer123_start \ -l layer4567_start \ -c sw_pipelining.enabled=true \ - -c reserved_regs="[x0,x1,x2,x3,x4,x5,x6,x30,sp]" \ + -c reserved_regs="[x0--x5,x18--x30,sp]" \ -c inputs_are_outputs \ -c sw_pipelining.minimize_overlapping=False \ -c constraints.stalls_first_attempt=64 -c variable_size \ @@ -115,7 +115,7 @@ time ${SLOTHY_DIR}/slothy-cli Arm_AArch64 Arm_Cortex_A72_frontend \ -c split_heuristic_stepsize=0.1 \ -c split_heuristic_repeat=4 \ -c max_solutions=64 \ - -c reserved_regs="[x0,x1,x2,x3,x4,x5,x30,sp]" \ + -c reserved_regs="[x0--x30,sp]" \ -c inputs_are_outputs \ -c sw_pipelining.minimize_overlapping=False \ -c variable_size \ @@ -130,7 +130,7 @@ time ${SLOTHY_DIR}/slothy-cli Arm_AArch64 Arm_Cortex_A72_frontend \ -c sw_pipelining.enabled=true \ -c constraints.stalls_first_attempt=40 \ -c max_solutions=64 \ - -c reserved_regs="[x0,x1,x2,x3,x4,x5,x30,sp]" \ + -c reserved_regs="[x0--x30,sp]" \ -c inputs_are_outputs \ -c sw_pipelining.minimize_overlapping=False \ -c variable_size \