diff --git a/mlkem/asm/aarch64/intt_kyber_123_45_67_twiddles.S b/mlkem/asm/aarch64/intt_123_45_67_twiddles.S similarity index 100% rename from mlkem/asm/aarch64/intt_kyber_123_45_67_twiddles.S rename to mlkem/asm/aarch64/intt_123_45_67_twiddles.S diff --git a/mlkem/asm/aarch64/intt_123_4567.S b/mlkem/asm/aarch64/intt_clean.S similarity index 98% rename from mlkem/asm/aarch64/intt_123_4567.S rename to mlkem/asm/aarch64/intt_clean.S index 25638533e..4be73e298 100644 --- a/mlkem/asm/aarch64/intt_123_4567.S +++ b/mlkem/asm/aarch64/intt_clean.S @@ -159,11 +159,11 @@ .data .p2align 4 roots: -#include "intt_kyber_123_45_67_twiddles.S" +#include "intt_123_45_67_twiddles.S" .text - .global intt_kyber_123_4567 - .global _intt_kyber_123_4567 + .global intt_asm_clean + .global _intt_asm_clean .p2align 4 const_addr: .short 3329 @@ -191,8 +191,8 @@ ninv_tw_addr: .short 5040 .short 5040 .short 5040 -intt_kyber_123_4567: -_intt_kyber_123_4567: +intt_asm_clean: +_intt_asm_clean: push_stack in .req x0 diff --git a/mlkem/asm/aarch64/intt_opt.S b/mlkem/asm/aarch64/intt_opt.S new file mode 100644 index 000000000..2d6a7701c --- /dev/null +++ b/mlkem/asm/aarch64/intt_opt.S @@ -0,0 +1,1077 @@ +/// +/// Copyright (c) 2022 Arm Limited +/// Copyright (c) 2022 Hanno Becker +/// Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. 
+///
+/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+/// SOFTWARE.
+///
+
+#include "config.h"
+#if defined(MLKEM_USE_AARCH64_ASM)
+
+// Needed to provide ASM_LOAD directive
+#include "common.i"
+
+// The macros below refer to register aliases (consts, tmp, t0-t3, r_ptr0,
+// r_ptr1, q_root*) that are introduced with .req inside the function body,
+// so they can only be expanded after those aliases exist.
+
+// mulmodq: lane-wise multiply of \src by a scalar taken from \const.
+//   dst = src * const.h[idx0] - sqrdmulh(src, const.h[idx1]) * consts.h[0]
+// const.h[idx0] is the multiplier and const.h[idx1] its companion ("twisted")
+// constant; consts.h[0] is the first entry of the const_addr table (3329).
+// Clobbers t2.
+.macro mulmodq dst, src, const, idx0, idx1
+        sqrdmulh t2.8h,        \src\().8h, \const\().h[\idx1\()]
+        mul      \dst\().8h,   \src\().8h, \const\().h[\idx0\()]
+        mls      \dst\().8h,   t2.8h,      consts.h[0]
+.endm
+
+// mulmod: same computation as mulmodq, but the multiplier (\const) and its
+// companion (\const_twisted) are full vectors, so every lane can use a
+// different constant. Clobbers t2.
+.macro mulmod dst, src, const, const_twisted
+        sqrdmulh t2.8h,        \src\().8h, \const_twisted\().8h
+        mul      \dst\().8h,   \src\().8h, \const\().8h
+        mls      \dst\().8h,   t2.8h,      consts.h[0]
+.endm
+
+// gs_butterfly: in-place butterfly with a scalar root.
+//   a <- a + b
+//   b <- mulmodq(a_old - b_old, root.h[idx0], root.h[idx1])
+// Clobbers tmp and t2.
+.macro gs_butterfly a, b, root, idx0, idx1
+        sub      tmp.8h,       \a\().8h,   \b\().8h
+        add      \a\().8h,     \a\().8h,   \b\().8h
+        mulmodq  \b, tmp, \root, \idx0, \idx1
+.endm
+
+// gs_butterfly_v: as gs_butterfly, with per-lane roots held in the vectors
+// \root and \root_twisted. Clobbers tmp and t2.
+.macro gs_butterfly_v a, b, root, root_twisted
+        sub      tmp.8h,       \a\().8h,   \b\().8h
+        add      \a\().8h,     \a\().8h,   \b\().8h
+        mulmod   \b, tmp, \root, \root_twisted
+.endm
+
+// mul_ninv: applies mulmod with the ninv / ninv_tw constants to four
+// source vectors, writing four destination vectors. Clobbers t2.
+// NOTE(review): the register aliases `ninv` and `ninv_tw` are not defined in
+// the part of the file visible here - confirm before expanding this macro.
+.macro mul_ninv dst0, dst1, dst2, dst3, src0, src1, src2, src3
+        mulmod   \dst0, \src0, ninv, ninv_tw
+        mulmod   \dst1, \src1, ninv, ninv_tw
+        mulmod   \dst2, \src2, ninv, ninv_tw
+        mulmod   \dst3, \src3, ninv, ninv_tw
+.endm
+
+// barrett_reduce: reduces \a in place.
+//   t0 = srshr(sqdmulh(a, consts.h[1]), 11)
+//   a  = a - t0 * consts.h[0]
+// consts.h[1] / consts.h[0] are the second / first entries of the const_addr
+// table (20159 / 3329). Clobbers t0.
+.macro barrett_reduce a
+        sqdmulh  t0.8h,        \a\().8h,   consts.h[1]
+        srshr    t0.8h,        t0.8h,      #11
+        mls      \a\().8h,     t0.8h,      consts.h[0]
+.endm
+
+// load_roots_123: loads two consecutive 16-byte vectors from r_ptr0 into
+// q_root0 and q_root1 and advances r_ptr0 by 32 bytes. The second load uses
+// a negative offset because the pointer has already been post-incremented.
+.macro load_roots_123
+        ldr      q_root0,      [r_ptr0], #32
+        ldr      q_root1,      [r_ptr0, #-16]
+.endm
+
+// load_next_roots_45: loads one 16-byte vector from r_ptr0 into q_root0 and
+// advances r_ptr0 by 16 bytes.
+.macro load_next_roots_45
+        ldr      q_root0,      [r_ptr0], #16
+.endm
+
+// load_next_roots_67: loads six consecutive 16-byte vectors from r_ptr1, in
+// memory order root0, root0_tw, root1, root1_tw, root2, root2_tw, and
+// advances r_ptr1 by 6*16 bytes. All offsets after the first load are
+// relative to the already-incremented pointer.
+.macro load_next_roots_67
+        ldr      q_root0,      [r_ptr1], #(6*16)
+        ldr      q_root0_tw,   [r_ptr1, #(-6*16 + 1*16)]
+        ldr      q_root1,      [r_ptr1, #(-6*16 + 2*16)]
+        ldr      q_root1_tw,   [r_ptr1, #(-6*16 + 3*16)]
+        ldr      q_root2,      [r_ptr1, #(-6*16 + 4*16)]
+        ldr      q_root2_tw,   [r_ptr1, #(-6*16 + 5*16)]
+.endm
+
+// transpose4: in-place 4x4 transpose of the 32-bit lanes of the four
+// registers \data0 .. \data3 (the argument is a name prefix; the digit is
+// appended by the macro). Clobbers t0-t3.
+.macro transpose4 data
+        trn1     t0.4s,        \data\()0.4s, \data\()1.4s
+        trn2     t1.4s,        \data\()0.4s, \data\()1.4s
+        trn1     t2.4s,        \data\()2.4s, \data\()3.4s
+        trn2     t3.4s,        \data\()2.4s, \data\()3.4s
+
+        trn2     \data\()2.2d, t0.2d,        t2.2d
+        trn2     \data\()3.2d, t1.2d,        t3.2d
+        trn1     \data\()0.2d, t0.2d,        t2.2d
+        trn1     \data\()1.2d, t1.2d,        t3.2d
+.endm
+
+// transpose_single: only the 32-bit interleave stage of transpose4, reading
+// \data_in0..3 and writing \data_out0..3 (both arguments are name prefixes).
+.macro transpose_single data_out, data_in
+        trn1     \data_out\()0.4s, \data_in\()0.4s, \data_in\()1.4s
+        trn2     \data_out\()1.4s, \data_in\()0.4s, \data_in\()1.4s
+        trn1     \data_out\()2.4s, \data_in\()2.4s, \data_in\()3.4s
+        trn2     \data_out\()3.4s, \data_in\()2.4s, \data_in\()3.4s
+.endm
+
+// save_gprs: reserves 6*16 bytes of stack and stores x19-x28 as pairs plus
+// x29 in the last slot. Each register pair is stored exactly once; the
+// layout must stay in sync with restore_gprs.
+.macro save_gprs
+        sub      sp, sp, #(16*6)
+        stp      x19, x20, [sp, #16*0]
+        stp      x21, x22, [sp, #16*1]
+        stp      x23, x24, [sp, #16*2]
+        stp      x25, x26, [sp, #16*3]
+        stp      x27, x28, [sp, #16*4]
+        str      x29,      [sp, #16*5]
+.endm
+
+// restore_gprs: reloads x19-x29 from the frame written by save_gprs and
+// releases the 6*16 bytes of stack.
+.macro restore_gprs
+        ldp      x19, x20, [sp, #16*0]
+        ldp      x21, x22, [sp, #16*1]
+        ldp      x23, x24, [sp, #16*2]
+        ldp      x25, x26, [sp, #16*3]
+        ldp      x27, x28, [sp, #16*4]
+        ldr      x29,      [sp, #16*5]
+        add      sp, sp, #(16*6)
+.endm
+
+// save_vregs: reserves 4*16 bytes of stack and stores d8-d15, i.e. the low
+// 64 bits of v8-v15.
+.macro save_vregs
+        sub      sp, sp, #(16*4)
+        stp      d8,  d9,  [sp, #16*0]
+        stp      d10, d11, [sp, #16*1]
+        stp      d12, d13, [sp, #16*2]
+        stp      d14, d15, [sp, #16*3]
+.endm
+
+// restore_vregs: reloads d8-d15 from the frame written by save_vregs and
+// releases the 4*16 bytes of stack.
+.macro restore_vregs
+        ldp      d8,  d9,  [sp, #16*0]
+        ldp      d10, d11, [sp, #16*1]
+        ldp      d12, d13, [sp, #16*2]
+        ldp      d14, d15, [sp, #16*3]
+        add      sp, sp, #(16*4)
+.endm
+
+// push_stack: function prologue - general-purpose registers first, then the
+// vector registers on top of them.
+.macro push_stack
+        save_gprs
+        save_vregs
+.endm
+
+// pop_stack: function epilogue - undoes push_stack in reverse order.
+.macro pop_stack
+        restore_vregs
+        restore_gprs
+.endm
+
+// For comparability reasons, the output range for the coefficients of this
+// invNTT code is supposed to match the implementation from PQClean on commit
+// ee71d2c823982bfcf54686f3cf1d666f396dc9aa. After the invNTT, the coefficients
+// are NOT canonically reduced. The ordering of the coefficients is canonical,
+// also matching PQClean.
+ +.data +.p2align 4 +roots: +#include "intt_123_45_67_twiddles.S" +.text + + .global intt_asm_opt + .global _intt_asm_opt + +.p2align 4 +const_addr: .short 3329 + .short 20159 + .short 0 + .short 0 + .short 0 + .short 0 + .short 0 + .short 0 +ninv_addr: .short 512 + .short 512 + .short 512 + .short 512 + .short 512 + .short 512 + .short 512 + .short 512 +ninv_tw_addr: .short 5040 + .short 5040 + .short 5040 + .short 5040 + .short 5040 + .short 5040 + .short 5040 + .short 5040 + +intt_asm_opt: +_intt_asm_opt: + push_stack + + in .req x0 + inp .req x1 + count .req x2 + r_ptr0 .req x3 + r_ptr1 .req x4 + xtmp .req x5 + + data0 .req v8 + data1 .req v9 + data2 .req v10 + data3 .req v11 + data4 .req v12 + data5 .req v13 + data6 .req v14 + data7 .req v15 + + q_data0 .req q8 + q_data1 .req q9 + q_data2 .req q10 + q_data3 .req q11 + q_data4 .req q12 + q_data5 .req q13 + q_data6 .req q14 + q_data7 .req q15 + + root0 .req v0 + root1 .req v1 + root2 .req v2 + root0_tw .req v4 + root1_tw .req v5 + root2_tw .req v6 + + consts .req v7 + q_consts .req q7 + + q_root0 .req q0 + q_root1 .req q1 + q_root2 .req q2 + q_root0_tw .req q4 + q_root1_tw .req q5 + q_root2_tw .req q6 + + tmp .req v24 + t0 .req v25 + t1 .req v26 + t2 .req v27 + t3 .req v28 + + ASM_LOAD(r_ptr0, roots_l34) + ASM_LOAD(r_ptr1, roots_l56) + + ASM_LOAD(xtmp, const_addr) + ld1 {consts.8h}, [xtmp] + + mov inp, in + mov count, #8 + + .p2align 2 + // Instructions: 54 + // Expected cycles: 57 + // Expected IPC: 0.95 + // + // Cycle bound: 57.0 + // IPC bound: 0.95 + // + // Wall time: 2.40s + // User time: 2.40s + // + // ------------------- cycle (expected) -------------------> + // 0 25 50 + // |------------------------|------------------------|------ + ldr q0, [x1, #32] // *........................................................ + ldr q30, [x1, #48] // *........................................................ + ldr q8, [x1, #16] // .*....................................................... 
+ ldr q21, [x1, #0] // .*....................................................... + ldr q2, [x3], #16 // ..*...................................................... + trn1 v18.4S, v0.4S, v30.4S // ....*.................................................... + trn2 v20.4S, v0.4S, v30.4S // ....*.................................................... + trn1 v27.4S, v21.4S, v8.4S // .....*................................................... + trn2 v25.4S, v21.4S, v8.4S // .....*................................................... + trn2 v9.2D, v27.2D, v18.2D // ........*................................................ + trn2 v0.2D, v25.2D, v20.2D // ........*................................................ + trn1 v31.2D, v27.2D, v18.2D // .........*............................................... + ldr q8, [x4, #80] // .........*............................................... + trn1 v22.2D, v25.2D, v20.2D // ..........*.............................................. + ldr q13, [x4, #32] // ..........*.............................................. + sub v1.8H, v9.8H, v0.8H // ...........*............................................. + ldr q18, [x4, #16] // ...........*............................................. + ldr q28, [x4, #48] // ............*............................................ + add v16.8H, v31.8H, v22.8H // .............*........................................... + sub v23.8H, v31.8H, v22.8H // .............*........................................... + ldr q19, [x4, #64] // .............*........................................... + add v0.8H, v9.8H, v0.8H // ..............*.......................................... + sqrdmulh v11.8H, v1.8H, v8.8H // ..............*.......................................... + sqrdmulh v27.8H, v23.8H, v28.8H // ................*........................................ + sub v21.8H, v16.8H, v0.8H // .................*....................................... 
+ mul v25.8H, v1.8H, v19.8H // ..................*...................................... + mul v10.8H, v23.8H, v13.8H // ....................*.................................... + mls v25.8H, v11.8H, v7.H[0] // ......................*.................................. + ldr q28, [x4], #(6*16) // .......................*................................. + mls v10.8H, v27.8H, v7.H[0] // ........................*................................ + sqrdmulh v30.8H, v21.8H, v18.8H // ..........................*.............................. + mul v4.8H, v21.8H, v28.8H // ............................*............................ + sub v22.8H, v10.8H, v25.8H // .............................*........................... + add v8.8H, v10.8H, v25.8H // ..............................*.......................... + sqrdmulh v25.8H, v22.8H, v18.8H // ................................*........................ + mls v4.8H, v30.8H, v7.H[0] // ..................................*...................... + mul v3.8H, v22.8H, v28.8H // ....................................*.................... + mls v3.8H, v25.8H, v7.H[0] // ......................................*.................. + add v25.8H, v16.8H, v0.8H // ......................................*.................. + trn1 v18.4S, v25.4S, v8.4S // .........................................*............... + trn2 v25.4S, v25.4S, v8.4S // ..........................................*.............. + trn1 v30.4S, v4.4S, v3.4S // ...........................................*............. + trn2 v27.4S, v4.4S, v3.4S // ...........................................*............. + trn1 v1.2D, v18.2D, v30.2D // ..............................................*.......... + trn1 v14.2D, v25.2D, v27.2D // ..............................................*.......... + trn2 v11.2D, v18.2D, v30.2D // ...............................................*......... + trn2 v27.2D, v25.2D, v27.2D // ...............................................*......... 
+ sub v15.8H, v1.8H, v14.8H // .................................................*....... + add v13.8H, v11.8H, v27.8H // ..................................................*...... + add v16.8H, v1.8H, v14.8H // ...................................................*..... + sub v27.8H, v11.8H, v27.8H // ....................................................*.... + sqrdmulh v19.8H, v15.8H, v2.H[3] // ....................................................*.... + mul v0.8H, v15.8H, v2.H[2] // ......................................................*.. + sqdmulh v25.8H, v13.8H, v7.H[1] // ........................................................* + + // ------------------- cycle (expected) -------------------> + // 0 25 50 + // |------------------------|------------------------|------ + // ldr q17, [x1, #48] // *........................................................ + // ldr q29, [x1, #16] // .*....................................................... + // ldr q24, [x1, #0] // .*....................................................... + // ldr q30, [x1, #32] // *........................................................ + // ldr q12, [x4], #(6*16) // .......................*................................. + // trn2 v6.4S, v30.4S, v17.4S // ....*.................................................... + // trn1 v27.4S, v24.4S, v29.4S // .....*................................................... + // trn2 v25.4S, v24.4S, v29.4S // .....*................................................... + // ldr q29, [x4, #-64] // ..........*.............................................. + // trn1 v23.4S, v30.4S, v17.4S // ....*.................................................... + // trn2 v28.2D, v25.2D, v6.2D // ........*................................................ + // trn1 v25.2D, v25.2D, v6.2D // ..........*.............................................. + // trn1 v6.2D, v27.2D, v23.2D // .........*............................................... 
+ // trn2 v17.2D, v27.2D, v23.2D // ........*................................................ + // sub v9.8H, v6.8H, v25.8H // .............*........................................... + // add v14.8H, v6.8H, v25.8H // .............*........................................... + // ldr q25, [x4, #-48] // ............*............................................ + // sub v27.8H, v17.8H, v28.8H // ...........*............................................. + // ldr q22, [x4, #-80] // ...........*............................................. + // add v3.8H, v17.8H, v28.8H // ..............*.......................................... + // mul v28.8H, v9.8H, v29.8H // ....................*.................................... + // ldr q10, [x4, #-16] // .........*............................................... + // ldr q26, [x4, #-32] // .............*........................................... + // sqrdmulh v15.8H, v9.8H, v25.8H // ................*........................................ + // sub v30.8H, v14.8H, v3.8H // .................*....................................... + // sqrdmulh v18.8H, v27.8H, v10.8H // ..............*.......................................... + // mul v25.8H, v27.8H, v26.8H // ..................*...................................... + // sqrdmulh v20.8H, v30.8H, v22.8H // ..........................*.............................. + // mls v25.8H, v18.8H, v7.H[0] // ......................*.................................. + // mls v28.8H, v15.8H, v7.H[0] // ........................*................................ + // mul v17.8H, v30.8H, v12.8H // ............................*............................ + // sub v6.8H, v28.8H, v25.8H // .............................*........................... + // mls v17.8H, v20.8H, v7.H[0] // ..................................*...................... + // add v21.8H, v28.8H, v25.8H // ..............................*.......................... 
+ // sqrdmulh v15.8H, v6.8H, v22.8H // ................................*........................ + // add v29.8H, v14.8H, v3.8H // ......................................*.................. + // mul v26.8H, v6.8H, v12.8H // ....................................*.................... + // mls v26.8H, v15.8H, v7.H[0] // ......................................*.................. + // trn2 v12.4S, v29.4S, v21.4S // ..........................................*.............. + // trn1 v11.4S, v29.4S, v21.4S // .........................................*............... + // trn1 v30.4S, v17.4S, v26.4S // ...........................................*............. + // trn2 v6.4S, v17.4S, v26.4S // ...........................................*............. + // ldr q2, [x3], #16 // ..*...................................................... + // trn1 v22.2D, v11.2D, v30.2D // ..............................................*.......... + // trn1 v25.2D, v12.2D, v6.2D // ..............................................*.......... + // trn2 v18.2D, v12.2D, v6.2D // ...............................................*......... + // trn2 v4.2D, v11.2D, v30.2D // ...............................................*......... + // sub v15.8H, v22.8H, v25.8H // .................................................*....... + // add v13.8H, v4.8H, v18.8H // ..................................................*...... + // add v16.8H, v22.8H, v25.8H // ...................................................*..... + // sqrdmulh v19.8H, v15.8H, v2.H[3] // ....................................................*.... + // sub v27.8H, v4.8H, v18.8H // ....................................................*.... + // mul v0.8H, v15.8H, v2.H[2] // ......................................................*.. 
+ // sqdmulh v25.8H, v13.8H, v7.H[1] // ........................................................* + + sub count, count, #1 +layer4567_start: + // Instructions: 83 + // Expected cycles: 65 + // Expected IPC: 1.28 + // + // Cycle bound: 65.0 + // IPC bound: 1.28 + // + // Wall time: 100.13s + // User time: 100.13s + // + // ----------------------- cycle (expected) -----------------------> + // 0 25 50 + // |------------------------|------------------------|-------------- + ldr q17, [x1, #112] // e................................................................ + mls v0.8H, v19.8H, v7.H[0] // *................................................................ + ldr q29, [x1, #80] // e................................................................ + ldr q24, [x1, #64] // .e............................................................... + ldr q30, [x1, #96] // .e............................................................... + sqrdmulh v31.8H, v27.8H, v2.H[5] // ..*.............................................................. + srshr v5.8H, v25.8H, #11 // ...*............................................................. + ldr q12, [x4], #(6*16) // ....e............................................................ + sqdmulh v21.8H, v16.8H, v7.H[1] // ....*............................................................ + trn2 v6.4S, v30.4S, v17.4S // .....e........................................................... + mul v8.8H, v27.8H, v2.H[4] // ......*.......................................................... + trn1 v27.4S, v24.4S, v29.4S // ......e.......................................................... + trn2 v25.4S, v24.4S, v29.4S // .......e......................................................... + ldr q29, [x4, #-64] // .......e......................................................... + trn1 v23.4S, v30.4S, v17.4S // ........e........................................................ 
+ sqdmulh v14.8H, v0.8H, v7.H[1] // ........*........................................................ + srshr v1.8H, v21.8H, #11 // .........*....................................................... + mls v8.8H, v31.8H, v7.H[0] // ..........*...................................................... + trn2 v28.2D, v25.2D, v6.2D // ..........e...................................................... + trn1 v25.2D, v25.2D, v6.2D // ...........e..................................................... + mls v13.8H, v5.8H, v7.H[0] // ............*.................................................... + trn1 v6.2D, v27.2D, v23.2D // ............e.................................................... + srshr v22.8H, v14.8H, #11 // .............*................................................... + trn2 v17.2D, v27.2D, v23.2D // ..............e.................................................. + mls v16.8H, v1.8H, v7.H[0] // ..............*.................................................. + sub v9.8H, v6.8H, v25.8H // ...............e................................................. + mls v0.8H, v22.8H, v7.H[0] // ................*................................................ + add v14.8H, v6.8H, v25.8H // ................e................................................ + ldr q25, [x4, #-48] // ................e................................................ + sub v27.8H, v17.8H, v28.8H // .................e............................................... + ldr q22, [x4, #-80] // .................e............................................... + add v3.8H, v17.8H, v28.8H // ..................e.............................................. + mul v28.8H, v9.8H, v29.8H // ..................e.............................................. + ldr q10, [x4, #-16] // ..................e.............................................. + ldr q26, [x4, #-32] // ...................e............................................. 
+ sqrdmulh v15.8H, v9.8H, v25.8H // ....................e............................................ + add v25.8H, v16.8H, v13.8H // ....................*............................................ + sub v30.8H, v14.8H, v3.8H // .....................e........................................... + sqrdmulh v18.8H, v27.8H, v10.8H // ......................e.......................................... + str q25, [x1], #(64) // .......................*......................................... + mul v25.8H, v27.8H, v26.8H // ........................e........................................ + sqrdmulh v20.8H, v30.8H, v22.8H // ..........................e...................................... + mls v25.8H, v18.8H, v7.H[0] // ............................e.................................... + mls v28.8H, v15.8H, v7.H[0] // ..............................e.................................. + sqdmulh v29.8H, v8.8H, v7.H[1] // ................................*................................ + mul v17.8H, v30.8H, v12.8H // ..................................e.............................. + sub v6.8H, v28.8H, v25.8H // ...................................e............................. + mls v17.8H, v20.8H, v7.H[0] // ....................................e............................ + add v21.8H, v28.8H, v25.8H // ....................................e............................ + srshr v25.8H, v29.8H, #11 // .....................................*........................... + sqrdmulh v15.8H, v6.8H, v22.8H // ......................................e.......................... + add v29.8H, v14.8H, v3.8H // ......................................e.......................... + mls v8.8H, v25.8H, v7.H[0] // ........................................*........................ + sub v25.8H, v16.8H, v13.8H // ........................................*........................ + mul v26.8H, v6.8H, v12.8H // ..........................................e...................... 
+ mls v26.8H, v15.8H, v7.H[0] // ............................................e.................... + trn2 v12.4S, v29.4S, v21.4S // .............................................e................... + trn1 v11.4S, v29.4S, v21.4S // ..............................................e.................. + sqrdmulh v29.8H, v25.8H, v2.H[1] // ..............................................*.................. + sub v24.8H, v0.8H, v8.8H // ...............................................*................. + mul v27.8H, v25.8H, v2.H[0] // ................................................*................ + add v25.8H, v0.8H, v8.8H // ................................................*................ + trn1 v30.4S, v17.4S, v26.4S // .................................................e............... + trn2 v6.4S, v17.4S, v26.4S // ..................................................e.............. + sqrdmulh v10.8H, v24.8H, v2.H[1] // ..................................................*.............. + str q25, [x1, #-48] // ...................................................*............. + mul v26.8H, v24.8H, v2.H[0] // ....................................................*............ + ldr q2, [x3], #16 // ....................................................e............ + trn1 v22.2D, v11.2D, v30.2D // ....................................................e............ + trn1 v25.2D, v12.2D, v6.2D // .....................................................e........... + trn2 v18.2D, v12.2D, v6.2D // ......................................................e.......... + mls v27.8H, v29.8H, v7.H[0] // ......................................................*.......... + trn2 v4.2D, v11.2D, v30.2D // .......................................................e......... + mls v26.8H, v10.8H, v7.H[0] // ........................................................*........ + sub v15.8H, v22.8H, v25.8H // ........................................................e........ 
+ add v13.8H, v4.8H, v18.8H // ..........................................................e...... + add v16.8H, v22.8H, v25.8H // ...........................................................e..... + str q27, [x1, #-32] // ...........................................................*..... + sqrdmulh v19.8H, v15.8H, v2.H[3] // ...........................................................e..... + sub v27.8H, v4.8H, v18.8H // ............................................................e.... + str q26, [x1, #-16] // .............................................................*... + mul v0.8H, v15.8H, v2.H[2] // .............................................................e... + sqdmulh v25.8H, v13.8H, v7.H[1] // ...............................................................e. + + // ------------------------------------------------------ cycle (expected) ------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|- + // ldr q8, [x1, #(16*0)] // .e...............................................................'~............................................................ + // ldr q9, [x1, #(16*1)] // e................................................................~............................................................. + // ldr q10, [x1, #(16*2)] // .e...............................................................'~............................................................ + // ldr q11, [x1, #(16*3)] // e................................................................~............................................................. + // trn1 v25.4s, v8.4s, v9.4s // ......e..........................................................'.....~....................................................... 
+ // trn2 v26.4s, v8.4s, v9.4s // .......e.........................................................'......~...................................................... + // trn1 v27.4s, v10.4s, v11.4s // ........e........................................................'.......~..................................................... + // trn2 v28.4s, v10.4s, v11.4s // .....e...........................................................'....~........................................................ + // trn2 v10.2d, v25.2d, v27.2d // ..............e..................................................'.............~............................................... + // trn2 v11.2d, v26.2d, v28.2d // ..........e......................................................'.........~................................................... + // trn1 v8.2d, v25.2d, v27.2d // ............e....................................................'...........~................................................. + // trn1 v9.2d, v26.2d, v28.2d // ...........e.....................................................'..........~.................................................. + // ldr q0, [x4], #(6*16) // ....e............................................................'...~......................................................... + // ldr q4, [x4, #(-6*16 + 1*16)] // .................e...............................................'................~............................................ + // ldr q1, [x4, #(-6*16 + 2*16)] // .......e.........................................................'......~...................................................... + // ldr q5, [x4, #(-6*16 + 3*16)] // ................e................................................'...............~............................................. + // ldr q2, [x4, #(-6*16 + 4*16)] // ...................e.............................................'..................~.......................................... 
+ // ldr q6, [x4, #(-6*16 + 5*16)] // ..................e..............................................'.................~........................................... + // sub v24.8h, v8.8h, v9.8h // ...............e.................................................'..............~.............................................. + // add v8.8h, v8.8h, v9.8h // ................e................................................'...............~............................................. + // sqrdmulh v27.8h, v24.8h, v5.8h // ....................e............................................'...................~......................................... + // mul v9.8h, v24.8h, v1.8h // ..................e..............................................'.................~........................................... + // mls v9.8h, v27.8h, v7.h[0] // ..............................e..................................'.............................~............................... + // sub v24.8h, v10.8h, v11.8h // .................e...............................................'................~............................................ + // add v10.8h, v10.8h, v11.8h // ..................e..............................................'.................~........................................... + // sqrdmulh v27.8h, v24.8h, v6.8h // ......................e..........................................'.....................~....................................... + // mul v11.8h, v24.8h, v2.8h // ........................e........................................'.......................~..................................... + // mls v11.8h, v27.8h, v7.h[0] // ............................e....................................'...........................~................................. + // sub v24.8h, v8.8h, v10.8h // .....................e...........................................'....................~........................................ 
+ // add v8.8h, v8.8h, v10.8h // ......................................e..........................'.....................................~....................... + // sqrdmulh v27.8h, v24.8h, v4.8h // ..........................e......................................'.........................~................................... + // mul v10.8h, v24.8h, v0.8h // ..................................e..............................'.................................~........................... + // mls v10.8h, v27.8h, v7.h[0] // ....................................e............................'...................................~......................... + // sub v24.8h, v9.8h, v11.8h // ...................................e.............................'..................................~.......................... + // add v9.8h, v9.8h, v11.8h // ....................................e............................'...................................~......................... + // sqrdmulh v27.8h, v24.8h, v4.8h // ......................................e..........................'.....................................~....................... + // mul v11.8h, v24.8h, v0.8h // ..........................................e......................'.........................................~................... + // mls v11.8h, v27.8h, v7.h[0] // ............................................e....................'...........................................~................. + // trn1 v25.4s, v8.4s, v9.4s // ..............................................e..................'.............................................~............... + // trn2 v26.4s, v8.4s, v9.4s // .............................................e...................'............................................~................ + // trn1 v27.4s, v10.4s, v11.4s // .................................................e...............'................................................~............ 
+ // trn2 v28.4s, v10.4s, v11.4s // ..................................................e..............'.................................................~........... + // trn2 v10.2d, v25.2d, v27.2d // .......................................................e.........'......................................................~...... + // trn2 v11.2d, v26.2d, v28.2d // ......................................................e..........'.....................................................~....... + // trn1 v8.2d, v25.2d, v27.2d // ....................................................e............'...................................................~......... + // trn1 v9.2d, v26.2d, v28.2d // .....................................................e...........'....................................................~........ + // ldr q0, [x3], #16 // ....................................................e............'...................................................~......... + // sub v24.8h, v8.8h, v9.8h // ........................................................e........'.......................................................~..... + // add v8.8h, v8.8h, v9.8h // ...........................................................e.....'..........................................................~.. + // sqrdmulh v27.8h, v24.8h, v0.h[3] // ...........................................................e.....'..........................................................~.. + // mul v9.8h, v24.8h, v0.h[2] // .............................................................e...'............................................................. + // mls v9.8h, v27.8h, v7.h[0] // ~................................................................*............................................................. + // sub v24.8h, v10.8h, v11.8h // ............................................................e....'...........................................................~. 
+ // add v10.8h, v10.8h, v11.8h // ..........................................................e......'.........................................................~... + // sqrdmulh v27.8h, v24.8h, v0.h[5] // ..~..............................................................'.*........................................................... + // mul v11.8h, v24.8h, v0.h[4] // ......~..........................................................'.....*....................................................... + // mls v11.8h, v27.8h, v7.h[0] // ..........~......................................................'.........*................................................... + // sqdmulh v25.8h, v8.8h, v7.h[1] // ....~............................................................'...*......................................................... + // srshr v25.8h, v25.8h, #11 // .........~.......................................................'........*.................................................... + // mls v8.8h, v25.8h, v7.h[0] // ..............~..................................................'.............*............................................... + // sqdmulh v25.8h, v10.8h, v7.h[1] // ...............................................................e.'............................................................. + // srshr v25.8h, v25.8h, #11 // ...~.............................................................'..*.......................................................... + // mls v10.8h, v25.8h, v7.h[0] // ............~....................................................'...........*................................................. + // sqdmulh v25.8h, v9.8h, v7.h[1] // ........~........................................................'.......*..................................................... + // srshr v25.8h, v25.8h, #11 // .............~...................................................'............*................................................ 
+ // mls v9.8h, v25.8h, v7.h[0] // ................~................................................'...............*............................................. + // sqdmulh v25.8h, v11.8h, v7.h[1] // ................................~................................'...............................*............................. + // srshr v25.8h, v25.8h, #11 // .....................................~...........................'....................................*........................ + // mls v11.8h, v25.8h, v7.h[0] // ........................................~........................'.......................................*..................... + // sub v24.8h, v8.8h, v10.8h // ........................................~........................'.......................................*..................... + // add v8.8h, v8.8h, v10.8h // ....................~............................................'...................*......................................... + // sqrdmulh v27.8h, v24.8h, v0.h[1] // ..............................................~..................'.............................................*............... + // mul v10.8h, v24.8h, v0.h[0] // ................................................~................'...............................................*............. + // mls v10.8h, v27.8h, v7.h[0] // ......................................................~..........'.....................................................*....... + // sub v24.8h, v9.8h, v11.8h // ...............................................~.................'..............................................*.............. + // add v9.8h, v9.8h, v11.8h // ................................................~................'...............................................*............. + // sqrdmulh v27.8h, v24.8h, v0.h[1] // ..................................................~..............'.................................................*........... 
+ // mul v11.8h, v24.8h, v0.h[0] // ....................................................~............'...................................................*......... + // mls v11.8h, v27.8h, v7.h[0] // ........................................................~........'.......................................................*..... + // str q8, [x1], #(64) // .......................~.........................................'......................*...................................... + // str q9, [x1, #(-64 + 16*1)] // ...................................................~.............'..................................................*.......... + // str q10, [x1, #(-64 + 16*2)] // ...........................................................~.....'..........................................................*.. + // str q11, [x1, #(-64 + 16*3)] // .............................................................~...'............................................................* + + sub count, count, #1 + cbnz count, layer4567_start + // Instructions: 29 + // Expected cycles: 40 + // Expected IPC: 0.72 + // + // Cycle bound: 40.0 + // IPC bound: 0.72 + // + // Wall time: 0.19s + // User time: 0.19s + // + // ---------- cycle (expected) -----------> + // 0 25 + // |------------------------|-------------- + srshr v1.8H, v25.8H, #11 // *....................................... + sqrdmulh v29.8H, v27.8H, v2.H[5] // *....................................... + mls v0.8H, v19.8H, v7.H[0] // ..*..................................... + mul v21.8H, v27.8H, v2.H[4] // ....*................................... + mls v21.8H, v29.8H, v7.H[0] // ......*................................. + sqdmulh v25.8H, v16.8H, v7.H[1] // ........*............................... + sqdmulh v10.8H, v0.8H, v7.H[1] // ..........*............................. + sqdmulh v17.8H, v21.8H, v7.H[1] // ............*........................... + srshr v22.8H, v25.8H, #11 // .............*.......................... 
+ mls v13.8H, v1.8H, v7.H[0] // ..............*......................... + srshr v5.8H, v10.8H, #11 // ...............*........................ + mls v16.8H, v22.8H, v7.H[0] // ................*....................... + srshr v6.8H, v17.8H, #11 // .................*...................... + mls v0.8H, v5.8H, v7.H[0] // ..................*..................... + mls v21.8H, v6.8H, v7.H[0] // ....................*................... + sub v3.8H, v16.8H, v13.8H // .....................*.................. + add v12.8H, v16.8H, v13.8H // ......................*................. + mul v19.8H, v3.8H, v2.H[0] // ........................*............... + str q12, [x1], #(64) // .........................*.............. + sub v17.8H, v0.8H, v21.8H // .........................*.............. + add v18.8H, v0.8H, v21.8H // ..........................*............. + sqrdmulh v21.8H, v3.8H, v2.H[1] // ..........................*............. + sqrdmulh v29.8H, v17.8H, v2.H[1] // ............................*........... + str q18, [x1, #-48] // .............................*.......... + mul v27.8H, v17.8H, v2.H[0] // ..............................*......... + mls v19.8H, v21.8H, v7.H[0] // ................................*....... + mls v27.8H, v29.8H, v7.H[0] // ..................................*..... + str q19, [x1, #-32] // .....................................*.. + str q27, [x1, #-16] // .......................................* + + // ---------- cycle (expected) -----------> + // 0 25 + // |------------------------|-------------- + // mls v0.8H, v19.8H, v7.H[0] // ..*..................................... + // sqrdmulh v31.8H, v27.8H, v2.H[5] // *....................................... + // srshr v5.8H, v25.8H, #11 // *....................................... + // sqdmulh v21.8H, v16.8H, v7.H[1] // ........*............................... + // mul v8.8H, v27.8H, v2.H[4] // ....*................................... 
+ // sqdmulh v14.8H, v0.8H, v7.H[1] // ..........*............................. + // srshr v1.8H, v21.8H, #11 // .............*.......................... + // mls v8.8H, v31.8H, v7.H[0] // ......*................................. + // mls v13.8H, v5.8H, v7.H[0] // ..............*......................... + // srshr v22.8H, v14.8H, #11 // ...............*........................ + // mls v16.8H, v1.8H, v7.H[0] // ................*....................... + // mls v0.8H, v22.8H, v7.H[0] // ..................*..................... + // add v25.8H, v16.8H, v13.8H // ......................*................. + // str q25, [x1], #(64) // .........................*.............. + // sqdmulh v29.8H, v8.8H, v7.H[1] // ............*........................... + // srshr v25.8H, v29.8H, #11 // .................*...................... + // mls v8.8H, v25.8H, v7.H[0] // ....................*................... + // sub v25.8H, v16.8H, v13.8H // .....................*.................. + // sqrdmulh v29.8H, v25.8H, v2.H[1] // ..........................*............. + // sub v24.8H, v0.8H, v8.8H // .........................*.............. + // mul v27.8H, v25.8H, v2.H[0] // ........................*............... + // add v25.8H, v0.8H, v8.8H // ..........................*............. + // sqrdmulh v10.8H, v24.8H, v2.H[1] // ............................*........... + // str q25, [x1, #-48] // .............................*.......... + // mul v26.8H, v24.8H, v2.H[0] // ..............................*......... + // mls v27.8H, v29.8H, v7.H[0] // ................................*....... + // mls v26.8H, v10.8H, v7.H[0] // ..................................*..... + // str q27, [x1, #-32] // .....................................*.. 
+ // str q26, [x1, #-16] // .......................................* + + + // --------------------------------------------------------------------- + + ninv .req v29 + ninv_tw .req v30 + + ASM_LOAD(xtmp, ninv_addr) + ld1r {ninv.8h}, [xtmp] + ASM_LOAD(xtmp, ninv_tw_addr) + ld1r {ninv_tw.8h}, [xtmp] + + mov count, #4 + ASM_LOAD(r_ptr0, roots_l012) + load_roots_123 + + .p2align 2 + + // Instructions: 35 + // Expected cycles: 38 + // Expected IPC: 0.92 + // + // Cycle bound: 38.0 + // IPC bound: 0.92 + // + // Wall time: 0.17s + // User time: 0.17s + // + // --------- cycle (expected) ----------> + // 0 25 + // |------------------------|------------ + ldr q25, [x0, #384] // *..................................... + ldr q2, [x0, #448] // *..................................... + ldr q6, [x0, #256] // ...*.................................. + add v31.8H, v25.8H, v2.8H // ....*................................. + sub v25.8H, v25.8H, v2.8H // ....*................................. + ldr q8, [x0, #320] // ....*................................. + sqrdmulh v2.8H, v25.8H, v1.H[5] // .......*.............................. + ldr q12, [x0, #0] // ........*............................. + ldr q17, [x0, #64] // ........*............................. + sub v14.8H, v6.8H, v8.8H // ........*............................. + add v11.8H, v6.8H, v8.8H // .........*............................ + mul v6.8H, v25.8H, v1.H[4] // .........*............................ + ldr q13, [x0, #192] // .........*............................ + sqrdmulh v28.8H, v14.8H, v1.H[3] // ...........*.......................... + sub v25.8H, v12.8H, v17.8H // ............*......................... + ldr q8, [x0, #128] // ............*......................... + mls v6.8H, v2.8H, v7.H[0] // .............*........................ + add v9.8H, v12.8H, v17.8H // .............*........................ + mul v23.8H, v25.8H, v0.H[6] // ...............*...................... 
+ sub v2.8H, v8.8H, v13.8H // ................*..................... + sqrdmulh v25.8H, v25.8H, v0.H[7] // .................*.................... + mul v22.8H, v2.8H, v1.H[0] // ...................*.................. + sqrdmulh v2.8H, v2.8H, v1.H[1] // .....................*................ + mls v23.8H, v25.8H, v7.H[0] // .......................*.............. + sub v25.8H, v11.8H, v31.8H // .......................*.............. + mul v14.8H, v14.8H, v1.H[2] // .........................*............ + mls v14.8H, v28.8H, v7.H[0] // ...........................*.......... + mls v22.8H, v2.8H, v7.H[0] // .............................*........ + sqrdmulh v21.8H, v25.8H, v0.H[5] // ...............................*...... + sub v15.8H, v14.8H, v6.8H // ................................*..... + add v17.8H, v14.8H, v6.8H // .................................*.... + mul v18.8H, v25.8H, v0.H[4] // ..................................*... + add v14.8H, v23.8H, v22.8H // ..................................*... + mul v10.8H, v15.8H, v0.H[4] // ....................................*. + sub v24.8H, v14.8H, v17.8H // .....................................* + + // --------- cycle (expected) ----------> + // 0 25 + // |------------------------|------------ + // ldr q13, [x0, #192] // .........*............................ + // ldr q27, [x0, #0] // ........*............................. + // ldr q12, [x0, #64] // ........*............................. + // ldr q23, [x0, #448] // *..................................... + // ldr q4, [x0, #384] // *..................................... + // ldr q8, [x0, #256] // ...*.................................. + // ldr q9, [x0, #320] // ....*................................. + // sub v28.8H, v4.8H, v23.8H // ....*................................. + // add v31.8H, v4.8H, v23.8H // ....*................................. + // sub v20.8H, v8.8H, v9.8H // ........*............................. 
+ // mul v5.8H, v20.8H, v1.H[2] // .........................*............ + // sqrdmulh v3.8H, v28.8H, v1.H[5] // .......*.............................. + // sqrdmulh v23.8H, v20.8H, v1.H[3] // ...........*.......................... + // mul v28.8H, v28.8H, v1.H[4] // .........*............................ + // mls v28.8H, v3.8H, v7.H[0] // .............*........................ + // add v11.8H, v8.8H, v9.8H // .........*............................ + // ldr q8, [x0, #128] // ............*......................... + // sub v2.8H, v8.8H, v13.8H // ................*..................... + // mls v5.8H, v23.8H, v7.H[0] // ...........................*.......... + // mul v22.8H, v2.8H, v1.H[0] // ...................*.................. + // sqrdmulh v14.8H, v2.8H, v1.H[1] // .....................*................ + // sub v15.8H, v27.8H, v12.8H // ............*......................... + // mls v22.8H, v14.8H, v7.H[0] // .............................*........ + // sqrdmulh v26.8H, v15.8H, v0.H[7] // .................*.................... + // mul v23.8H, v15.8H, v0.H[6] // ...............*...................... + // mls v23.8H, v26.8H, v7.H[0] // .......................*.............. + // add v17.8H, v5.8H, v28.8H // .................................*.... + // add v14.8H, v23.8H, v22.8H // ..................................*... + // add v9.8H, v27.8H, v12.8H // .............*........................ + // sub v15.8H, v5.8H, v28.8H // ................................*..... + // sub v16.8H, v11.8H, v31.8H // .......................*.............. + // sqrdmulh v21.8H, v16.8H, v0.H[5] // ...............................*...... + // mul v18.8H, v16.8H, v0.H[4] // ..................................*... + // sub v24.8H, v14.8H, v17.8H // .....................................* + // mul v10.8H, v15.8H, v0.H[4] // ....................................*. 
+ + sub count, count, #1 +layer123_start: + // Instructions: 88 + // Expected cycles: 96 + // Expected IPC: 0.92 + // + // Cycle bound: 96.0 + // IPC bound: 0.92 + // + // Wall time: 43.38s + // User time: 43.38s + // + // -------------------------------------- cycle (expected) ---------------------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------|-------------------- + add v2.8H, v8.8H, v13.8H // *............................................................................................... + ldr q13, [x0, #208] // e............................................................................................... + ldr q27, [x0, #16] // e............................................................................................... + add v3.8H, v14.8H, v17.8H // .*.............................................................................................. + mls v18.8H, v21.8H, v7.H[0] // .*.............................................................................................. + ldr q12, [x0, #80] // .e.............................................................................................. + sub v25.8H, v23.8H, v22.8H // ..*............................................................................................. + ldr q23, [x0, #464] // ..e............................................................................................. + ldr q4, [x0, #400] // ..e............................................................................................. + ldr q8, [x0, #272] // ...e............................................................................................ + add v11.8H, v11.8H, v31.8H // ...*............................................................................................ + mul v16.8H, v24.8H, v0.H[0] // ...*............................................................................................ 
+ sub v22.8H, v9.8H, v2.8H // ....*........................................................................................... + add v6.8H, v9.8H, v2.8H // .....*.......................................................................................... + ldr q9, [x0, #336] // .....e.......................................................................................... + mul v14.8H, v25.8H, v0.H[2] // .....*.......................................................................................... + sub v28.8H, v4.8H, v23.8H // ......e......................................................................................... + add v31.8H, v4.8H, v23.8H // .......e........................................................................................ + mul v26.8H, v3.8H, v29.8H // .......*........................................................................................ + sub v20.8H, v8.8H, v9.8H // .........e...................................................................................... + sqrdmulh v21.8H, v15.8H, v0.H[5] // .........*...................................................................................... + sqrdmulh v17.8H, v3.8H, v30.8H // ...........*.................................................................................... + sqrdmulh v4.8H, v22.8H, v0.H[3] // .............*.................................................................................. + mul v22.8H, v22.8H, v0.H[2] // ...............*................................................................................ + mul v5.8H, v20.8H, v1.H[2] // .................e.............................................................................. + mls v22.8H, v4.8H, v7.H[0] // ...................*............................................................................ + sqrdmulh v3.8H, v28.8H, v1.H[5] // .....................e.......................................................................... 
+ sqrdmulh v23.8H, v20.8H, v1.H[3] // .......................e........................................................................ + sub v20.8H, v22.8H, v18.8H // ........................*....................................................................... + add v19.8H, v22.8H, v18.8H // .........................*...................................................................... + sqrdmulh v18.8H, v25.8H, v0.H[3] // .........................*...................................................................... + add v25.8H, v6.8H, v11.8H // ..........................*..................................................................... + mls v26.8H, v17.8H, v7.H[0] // ...........................*.................................................................... + mul v28.8H, v28.8H, v1.H[4] // .............................e.................................................................. + mls v28.8H, v3.8H, v7.H[0] // ...............................e................................................................ + str q26, [x0, #64] // ................................*............................................................... + sqrdmulh v15.8H, v19.8H, v30.8H // .................................*.............................................................. + mul v4.8H, v19.8H, v29.8H // ...................................*............................................................ + sub v19.8H, v6.8H, v11.8H // ...................................*............................................................ + add v11.8H, v8.8H, v9.8H // ....................................e........................................................... + ldr q8, [x0, #144] // ....................................e........................................................... + mls v14.8H, v18.8H, v7.H[0] // .....................................*.......................................................... 
+ mls v10.8H, v21.8H, v7.H[0] // .......................................*........................................................ + sub v2.8H, v8.8H, v13.8H // ........................................e....................................................... + mls v5.8H, v23.8H, v7.H[0] // .........................................e...................................................... + mul v22.8H, v2.8H, v1.H[0] // ...........................................e.................................................... + sub v9.8H, v14.8H, v10.8H // ............................................*................................................... + add v18.8H, v14.8H, v10.8H // .............................................*.................................................. + sqrdmulh v14.8H, v2.8H, v1.H[1] // .............................................e.................................................. + mls v4.8H, v15.8H, v7.H[0] // ...............................................*................................................ + sub v15.8H, v27.8H, v12.8H // ...............................................e................................................ + sqrdmulh v6.8H, v24.8H, v0.H[1] // .................................................*.............................................. + mls v22.8H, v14.8H, v7.H[0] // ...................................................e............................................ + str q4, [x0, #128] // ....................................................*........................................... + sqrdmulh v26.8H, v15.8H, v0.H[7] // .....................................................e.......................................... + sqrdmulh v17.8H, v9.8H, v0.H[1] // .......................................................*........................................ + mul v23.8H, v15.8H, v0.H[6] // .........................................................e...................................... 
+ mls v23.8H, v26.8H, v7.H[0] // ...........................................................e.................................... + mul v24.8H, v9.8H, v0.H[0] // .............................................................*.................................. + mls v24.8H, v17.8H, v7.H[0] // ...............................................................*................................ + add v17.8H, v5.8H, v28.8H // ...............................................................e................................ + add v14.8H, v23.8H, v22.8H // ................................................................e............................... + add v9.8H, v27.8H, v12.8H // .................................................................e.............................. + mls v16.8H, v6.8H, v7.H[0] // .................................................................*.............................. + sqrdmulh v12.8H, v25.8H, v30.8H // ...................................................................*............................ + mul v15.8H, v20.8H, v0.H[0] // .....................................................................*.......................... + str q16, [x0, #320] // ......................................................................*......................... + sqrdmulh v2.8H, v20.8H, v0.H[1] // .......................................................................*........................ + str q24, [x0, #448] // .......................................................................*........................ + mul v4.8H, v25.8H, v29.8H // .........................................................................*...................... + mul v25.8H, v19.8H, v0.H[0] // ...........................................................................*.................... + sqrdmulh v16.8H, v19.8H, v0.H[1] // .............................................................................*.................. 
+ mls v15.8H, v2.8H, v7.H[0] // ...............................................................................*................ + mls v4.8H, v12.8H, v7.H[0] // .................................................................................*.............. + sqrdmulh v6.8H, v18.8H, v30.8H // ...................................................................................*............ + str q15, [x0, #384] // ....................................................................................*........... + sub v15.8H, v5.8H, v28.8H // ....................................................................................e........... + mls v25.8H, v16.8H, v7.H[0] // .....................................................................................*.......... + sub v16.8H, v11.8H, v31.8H // .....................................................................................e.......... + str q4, [x0], #(16) // ......................................................................................*......... + mul v27.8H, v18.8H, v29.8H // .......................................................................................*........ + mls v27.8H, v6.8H, v7.H[0] // .........................................................................................*...... + str q25, [x0, #240] // ..........................................................................................*..... + sqrdmulh v21.8H, v16.8H, v0.H[5] // ...........................................................................................e.... + mul v18.8H, v16.8H, v0.H[4] // .............................................................................................e.. + str q27, [x0, #176] // ..............................................................................................*. 
+ sub v24.8H, v14.8H, v17.8H // ...............................................................................................e + mul v10.8H, v15.8H, v0.H[4] // ...............................................................................................e + + // -------------------------------------------------------------------------------------- cycle (expected) --------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|--------------- + // ldr q8, [x0, #0] // e...............................................................................................~.............................................................................................. + // ldr q9, [x0, #(1*(512/8))] // .e..............................................................................................'~............................................................................................. + // ldr q10, [x0, #(2*(512/8))] // ....................................e...........................................................'...................................~.......................................................... + // ldr q11, [x0, #(3*(512/8))] // e...............................................................................................~.............................................................................................. + // ldr q12, [x0, #(4*(512/8))] // ...e............................................................................................'..~........................................................................................... 
+ // ldr q13, [x0, #(5*(512/8))] // .....e..........................................................................................'....~......................................................................................... + // ldr q14, [x0, #(6*(512/8))] // ..e.............................................................................................'.~............................................................................................ + // ldr q15, [x0, #(7*(512/8))] // ..e.............................................................................................'.~............................................................................................ + // sub v24.8h, v8.8h, v9.8h // ...............................................e................................................'..............................................~............................................... + // add v8.8h, v8.8h, v9.8h // .................................................................e..............................'................................................................~............................. + // sqrdmulh v27.8h, v24.8h, v0.h[7] // .....................................................e..........................................'....................................................~......................................... + // mul v9.8h, v24.8h, v0.h[6] // .........................................................e......................................'........................................................~..................................... + // mls v9.8h, v27.8h, v7.h[0] // ...........................................................e....................................'..........................................................~................................... 
+ // sub v24.8h, v10.8h, v11.8h // ........................................e.......................................................'.......................................~...................................................... + // add v10.8h, v10.8h, v11.8h // ~...............................................................................................*.............................................................................................. + // sqrdmulh v27.8h, v24.8h, v1.h[1] // .............................................e..................................................'............................................~................................................. + // mul v11.8h, v24.8h, v1.h[0] // ...........................................e....................................................'..........................................~................................................... + // mls v11.8h, v27.8h, v7.h[0] // ...................................................e............................................'..................................................~........................................... + // sub v24.8h, v12.8h, v13.8h // .........e......................................................................................'........~..................................................................................... + // add v12.8h, v12.8h, v13.8h // ....................................e...........................................................'...................................~.......................................................... + // sqrdmulh v27.8h, v24.8h, v1.h[3] // .......................e........................................................................'......................~....................................................................... 
+ // mul v13.8h, v24.8h, v1.h[2] // .................e..............................................................................'................~............................................................................. + // mls v13.8h, v27.8h, v7.h[0] // .........................................e......................................................'........................................~..................................................... + // sub v24.8h, v14.8h, v15.8h // ......e.........................................................................................'.....~........................................................................................ + // add v14.8h, v14.8h, v15.8h // .......e........................................................................................'......~....................................................................................... + // sqrdmulh v27.8h, v24.8h, v1.h[5] // .....................e..........................................................................'....................~......................................................................... + // mul v15.8h, v24.8h, v1.h[4] // .............................e..................................................................'............................~................................................................. + // mls v15.8h, v27.8h, v7.h[0] // ...............................e................................................................'..............................~............................................................... + // sub v24.8h, v8.8h, v10.8h // ....~...........................................................................................'...*.......................................................................................... 
+ // add v8.8h, v8.8h, v10.8h // .....~..........................................................................................'....*......................................................................................... + // sqrdmulh v27.8h, v24.8h, v0.h[3] // .............~..................................................................................'............*................................................................................. + // mul v10.8h, v24.8h, v0.h[2] // ...............~................................................................................'..............*............................................................................... + // mls v10.8h, v27.8h, v7.h[0] // ...................~............................................................................'..................*........................................................................... + // sub v24.8h, v9.8h, v11.8h // ..~.............................................................................................'.*............................................................................................ + // add v9.8h, v9.8h, v11.8h // ................................................................e...............................'...............................................................~.............................. + // sqrdmulh v27.8h, v24.8h, v0.h[3] // .........................~......................................................................'........................*..................................................................... + // mul v11.8h, v24.8h, v0.h[2] // .....~..........................................................................................'....*......................................................................................... 
+ // mls v11.8h, v27.8h, v7.h[0] // .....................................~..........................................................'....................................*......................................................... + // sub v24.8h, v12.8h, v14.8h // .....................................................................................e..........'....................................................................................~......... + // add v12.8h, v12.8h, v14.8h // ...~............................................................................................'..*........................................................................................... + // sqrdmulh v27.8h, v24.8h, v0.h[5] // ...........................................................................................e....'..........................................................................................~... + // mul v14.8h, v24.8h, v0.h[4] // .............................................................................................e..'............................................................................................~. + // mls v14.8h, v27.8h, v7.h[0] // .~..............................................................................................'*............................................................................................. + // sub v24.8h, v13.8h, v15.8h // ....................................................................................e...........'...................................................................................~.......... + // add v13.8h, v13.8h, v15.8h // ...............................................................e................................'..............................................................~............................... 
+ // sqrdmulh v27.8h, v24.8h, v0.h[5] // .........~......................................................................................'........*..................................................................................... + // mul v15.8h, v24.8h, v0.h[4] // ...............................................................................................e'.............................................................................................. + // mls v15.8h, v27.8h, v7.h[0] // .......................................~........................................................'......................................*....................................................... + // sub v24.8h, v8.8h, v12.8h // ...................................~............................................................'..................................*........................................................... + // add v8.8h, v8.8h, v12.8h // ..........................~.....................................................................'.........................*.................................................................... + // sqrdmulh v27.8h, v24.8h, v0.h[1] // .............................................................................~..................'............................................................................*................. + // mul v12.8h, v24.8h, v0.h[0] // ...........................................................................~....................'..........................................................................*................... + // mls v12.8h, v27.8h, v7.h[0] // .....................................................................................~..........'....................................................................................*......... 
+ // sub v24.8h, v9.8h, v13.8h // ...............................................................................................e'.............................................................................................. + // add v9.8h, v9.8h, v13.8h // .~..............................................................................................'*............................................................................................. + // sqrdmulh v27.8h, v24.8h, v0.h[1] // .................................................~..............................................'................................................*............................................. + // mul v13.8h, v24.8h, v0.h[0] // ...~............................................................................................'..*........................................................................................... + // mls v13.8h, v27.8h, v7.h[0] // .................................................................~..............................'................................................................*............................. + // sub v24.8h, v10.8h, v14.8h // ........................~.......................................................................'.......................*...................................................................... + // add v10.8h, v10.8h, v14.8h // .........................~......................................................................'........................*..................................................................... + // sqrdmulh v27.8h, v24.8h, v0.h[1] // .......................................................................~........................'......................................................................*....................... 
+ // mul v14.8h, v24.8h, v0.h[0] // .....................................................................~..........................'....................................................................*......................... + // mls v14.8h, v27.8h, v7.h[0] // ...............................................................................~................'..............................................................................*............... + // sub v24.8h, v11.8h, v15.8h // ............................................~...................................................'...........................................*.................................................. + // add v11.8h, v11.8h, v15.8h // .............................................~..................................................'............................................*................................................. + // sqrdmulh v27.8h, v24.8h, v0.h[1] // .......................................................~........................................'......................................................*....................................... + // mul v15.8h, v24.8h, v0.h[0] // .............................................................~..................................'............................................................*................................. + // mls v15.8h, v27.8h, v7.h[0] // ...............................................................~................................'..............................................................*............................... + // str q12, [x0, #(4*(512/8))] // ..........................................................................................~.....'.........................................................................................*.... 
+ // str q13, [x0, #(5*(512/8))] // ......................................................................~.........................'.....................................................................*........................ + // str q14, [x0, #(6*(512/8))] // ....................................................................................~...........'...................................................................................*.......... + // str q15, [x0, #(7*(512/8))] // .......................................................................~........................'......................................................................*....................... + // sqrdmulh v27.8h, v8.8h, v30.8h // ...................................................................~............................'..................................................................*........................... + // mul v8.8h, v8.8h, v29.8h // .........................................................................~......................'........................................................................*..................... + // mls v8.8h, v27.8h, v7.h[0] // .................................................................................~..............'................................................................................*............. + // sqrdmulh v27.8h, v9.8h, v30.8h // ...........~....................................................................................'..........*................................................................................... + // mul v9.8h, v9.8h, v29.8h // .......~........................................................................................'......*....................................................................................... 
+ // mls v9.8h, v27.8h, v7.h[0] // ...........................~....................................................................'..........................*................................................................... + // sqrdmulh v27.8h, v10.8h, v30.8h // .................................~..............................................................'................................*............................................................. + // mul v10.8h, v10.8h, v29.8h // ...................................~............................................................'..................................*........................................................... + // mls v10.8h, v27.8h, v7.h[0] // ...............................................~................................................'..............................................*............................................... + // sqrdmulh v27.8h, v11.8h, v30.8h // ...................................................................................~............'..................................................................................*........... + // mul v11.8h, v11.8h, v29.8h // .......................................................................................~........'......................................................................................*....... + // mls v11.8h, v27.8h, v7.h[0] // .........................................................................................~......'........................................................................................*..... + // str q8, [x0], #(16) // ......................................................................................~.........'.....................................................................................*........ 
+ // str q9, [x0, #(-16 + 1*(512/8))] // ................................~...............................................................'...............................*.............................................................. + // str q10, [x0, #(-16 + 2*(512/8))] // ....................................................~...........................................'...................................................*.......................................... + // str q11, [x0, #(-16 + 3*(512/8))] // ..............................................................................................~.'.............................................................................................* + + sub count, count, #1 + cbnz count, layer123_start + // Instructions: 53 + // Expected cycles: 70 + // Expected IPC: 0.76 + // + // Cycle bound: 70.0 + // IPC bound: 0.76 + // + // Wall time: 2.29s + // User time: 2.29s + // + // ------------------------- cycle (expected) --------------------------> + // 0 25 50 + // |------------------------|------------------------|------------------- + add v25.8H, v8.8H, v13.8H // *..................................................................... + mls v18.8H, v21.8H, v7.H[0] // *..................................................................... + add v2.8H, v14.8H, v17.8H // .*.................................................................... + sub v8.8H, v23.8H, v22.8H // ..*................................................................... + sqrdmulh v6.8H, v15.8H, v0.H[5] // ..*................................................................... + add v28.8H, v11.8H, v31.8H // ...*.................................................................. + sub v14.8H, v9.8H, v25.8H // ....*................................................................. + mul v17.8H, v24.8H, v0.H[0] // ....*................................................................. 
+ add v25.8H, v9.8H, v25.8H // .....*................................................................ + sqrdmulh v12.8H, v24.8H, v0.H[1] // ......*............................................................... + mls v10.8H, v6.8H, v7.H[0] // ........*............................................................. + add v6.8H, v25.8H, v28.8H // ........*............................................................. + sub v25.8H, v25.8H, v28.8H // .........*............................................................ + sqrdmulh v28.8H, v14.8H, v0.H[3] // ..........*........................................................... + mul v22.8H, v2.8H, v29.8H // ............*......................................................... + mul v14.8H, v14.8H, v0.H[2] // ..............*....................................................... + mls v14.8H, v28.8H, v7.H[0] // ................*..................................................... + sqrdmulh v2.8H, v2.8H, v30.8H // ..................*................................................... + mul v28.8H, v8.8H, v0.H[2] // ....................*................................................. + sub v15.8H, v14.8H, v18.8H // .....................*................................................ + add v14.8H, v14.8H, v18.8H // ......................*............................................... + sqrdmulh v8.8H, v8.8H, v0.H[3] // ......................*............................................... + mls v17.8H, v12.8H, v7.H[0] // ........................*............................................. + sqrdmulh v12.8H, v6.8H, v30.8H // ..........................*........................................... + mls v28.8H, v8.8H, v7.H[0] // ............................*......................................... + str q17, [x0, #320] // .............................*........................................ + mul v8.8H, v6.8H, v29.8H // ..............................*....................................... 
+ mul v6.8H, v25.8H, v0.H[0] // ................................*..................................... + sub v17.8H, v28.8H, v10.8H // .................................*.................................... + add v28.8H, v28.8H, v10.8H // ..................................*................................... + sqrdmulh v25.8H, v25.8H, v0.H[1] // ..................................*................................... + mls v22.8H, v2.8H, v7.H[0] // ....................................*................................. + sqrdmulh v2.8H, v14.8H, v30.8H // ......................................*............................... + mul v14.8H, v14.8H, v29.8H // ........................................*............................. + str q22, [x0, #64] // .........................................*............................ + sqrdmulh v22.8H, v17.8H, v0.H[1] // ..........................................*........................... + mls v14.8H, v2.8H, v7.H[0] // ............................................*......................... + mul v2.8H, v17.8H, v0.H[0] // ..............................................*....................... + mls v2.8H, v22.8H, v7.H[0] // ................................................*..................... + str q14, [x0, #128] // .................................................*.................... + mul v14.8H, v15.8H, v0.H[0] // ..................................................*................... + sqrdmulh v17.8H, v15.8H, v0.H[1] // ....................................................*................. + str q2, [x0, #448] // .....................................................*................ + mls v8.8H, v12.8H, v7.H[0] // ......................................................*............... + sqrdmulh v2.8H, v28.8H, v30.8H // ........................................................*............. + mls v14.8H, v17.8H, v7.H[0] // ..........................................................*........... 
+ str q8, [x0], #(16) // ...........................................................*.......... + mls v6.8H, v25.8H, v7.H[0] // ............................................................*......... + mul v25.8H, v28.8H, v29.8H // ..............................................................*....... + str q14, [x0, #368] // ...............................................................*...... + mls v25.8H, v2.8H, v7.H[0] // ................................................................*..... + str q6, [x0, #240] // .................................................................*.... + str q25, [x0, #176] // .....................................................................* + + // ------------------------- cycle (expected) --------------------------> + // 0 25 50 + // |------------------------|------------------------|------------------- + // add v2.8H, v8.8H, v13.8H // *..................................................................... + // add v3.8H, v14.8H, v17.8H // .*.................................................................... + // mls v18.8H, v21.8H, v7.H[0] // *..................................................................... + // sub v25.8H, v23.8H, v22.8H // ..*................................................................... + // add v11.8H, v11.8H, v31.8H // ...*.................................................................. + // mul v16.8H, v24.8H, v0.H[0] // ....*................................................................. + // sub v22.8H, v9.8H, v2.8H // ....*................................................................. + // add v6.8H, v9.8H, v2.8H // .....*................................................................ + // mul v14.8H, v25.8H, v0.H[2] // ....................*................................................. + // mul v26.8H, v3.8H, v29.8H // ............*......................................................... 
+ // sqrdmulh v21.8H, v15.8H, v0.H[5] // ..*................................................................... + // sqrdmulh v17.8H, v3.8H, v30.8H // ..................*................................................... + // sqrdmulh v4.8H, v22.8H, v0.H[3] // ..........*........................................................... + // mul v22.8H, v22.8H, v0.H[2] // ..............*....................................................... + // mls v22.8H, v4.8H, v7.H[0] // ................*..................................................... + // sub v20.8H, v22.8H, v18.8H // .....................*................................................ + // add v19.8H, v22.8H, v18.8H // ......................*............................................... + // sqrdmulh v18.8H, v25.8H, v0.H[3] // ......................*............................................... + // add v25.8H, v6.8H, v11.8H // ........*............................................................. + // mls v26.8H, v17.8H, v7.H[0] // ....................................*................................. + // str q26, [x0, #64] // .........................................*............................ + // sqrdmulh v15.8H, v19.8H, v30.8H // ......................................*............................... + // mul v4.8H, v19.8H, v29.8H // ........................................*............................. + // sub v19.8H, v6.8H, v11.8H // .........*............................................................ + // mls v14.8H, v18.8H, v7.H[0] // ............................*......................................... + // mls v10.8H, v21.8H, v7.H[0] // ........*............................................................. + // sub v9.8H, v14.8H, v10.8H // .................................*.................................... + // add v18.8H, v14.8H, v10.8H // ..................................*................................... 
+ // mls v4.8H, v15.8H, v7.H[0] // ............................................*......................... + // sqrdmulh v6.8H, v24.8H, v0.H[1] // ......*............................................................... + // str q4, [x0, #128] // .................................................*.................... + // sqrdmulh v17.8H, v9.8H, v0.H[1] // ..........................................*........................... + // mul v24.8H, v9.8H, v0.H[0] // ..............................................*....................... + // mls v24.8H, v17.8H, v7.H[0] // ................................................*..................... + // mls v16.8H, v6.8H, v7.H[0] // ........................*............................................. + // sqrdmulh v12.8H, v25.8H, v30.8H // ..........................*........................................... + // mul v15.8H, v20.8H, v0.H[0] // ..................................................*................... + // str q16, [x0, #320] // .............................*........................................ + // sqrdmulh v2.8H, v20.8H, v0.H[1] // ....................................................*................. + // str q24, [x0, #448] // .....................................................*................ + // mul v4.8H, v25.8H, v29.8H // ..............................*....................................... + // mul v25.8H, v19.8H, v0.H[0] // ................................*..................................... + // sqrdmulh v16.8H, v19.8H, v0.H[1] // ..................................*................................... + // mls v15.8H, v2.8H, v7.H[0] // ..........................................................*........... + // mls v4.8H, v12.8H, v7.H[0] // ......................................................*............... + // sqrdmulh v6.8H, v18.8H, v30.8H // ........................................................*............. 
+ // str q15, [x0, #384] // ...............................................................*...... + // mls v25.8H, v16.8H, v7.H[0] // ............................................................*......... + // str q4, [x0], #(16) // ...........................................................*.......... + // mul v27.8H, v18.8H, v29.8H // ..............................................................*....... + // mls v27.8H, v6.8H, v7.H[0] // ................................................................*..... + // str q25, [x0, #240] // .................................................................*.... + // str q27, [x0, #176] // .....................................................................* + + + pop_stack + ret + +#endif /* MLKEM_USE_AARCH64_ASM */ diff --git a/mlkem/asm/aarch64/ntt_kyber_123_45_67_twiddles.S b/mlkem/asm/aarch64/ntt_123_45_67_twiddles.S similarity index 100% rename from mlkem/asm/aarch64/ntt_kyber_123_45_67_twiddles.S rename to mlkem/asm/aarch64/ntt_123_45_67_twiddles.S diff --git a/mlkem/asm/aarch64/ntt_123_4567.S b/mlkem/asm/aarch64/ntt_clean.S similarity index 98% rename from mlkem/asm/aarch64/ntt_123_4567.S rename to mlkem/asm/aarch64/ntt_clean.S index dc547bc24..c3526b0f4 100644 --- a/mlkem/asm/aarch64/ntt_123_4567.S +++ b/mlkem/asm/aarch64/ntt_clean.S @@ -157,7 +157,7 @@ .data .p2align 4 roots: - #include "ntt_kyber_123_45_67_twiddles.S" + #include "ntt_123_45_67_twiddles.S" in .req x0 inp .req x1 @@ -208,8 +208,8 @@ roots: t3 .req v28 .text - .global ntt_kyber_123_4567 - .global _ntt_kyber_123_4567 + .global ntt_asm_clean + .global _ntt_asm_clean .p2align 4 const_addr: @@ -222,8 +222,8 @@ const_addr: .short 0 .short 0 -ntt_kyber_123_4567: -_ntt_kyber_123_4567: +ntt_asm_clean: +_ntt_asm_clean: push_stack ASM_LOAD(r_ptr0, roots) diff --git a/mlkem/asm/aarch64/ntt_opt.S b/mlkem/asm/aarch64/ntt_opt.S new file mode 100644 index 000000000..db497ed31 --- /dev/null +++ b/mlkem/asm/aarch64/ntt_opt.S @@ -0,0 +1,958 @@ +/// +/// 
Copyright (c) 2022 Arm Limited +/// Copyright (c) 2022 Hanno Becker +/// Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE. 
+/// + +#include "config.h" +#if defined(MLKEM_USE_AARCH64_ASM) + +// Needed to provide ASM_LOAD directive +#include "common.i" + +.macro mulmodq dst, src, const, idx0, idx1 + sqrdmulh t2.8h, \src\().8h, \const\().h[\idx1] + mul \dst\().8h, \src\().8h, \const\().h[\idx0] + mls \dst\().8h, t2.8h, consts.h[0] +.endm + +.macro mulmod dst, src, const, const_twisted + sqrdmulh t2.8h, \src\().8h, \const_twisted\().8h + mul \dst\().8h, \src\().8h, \const\().8h + mls \dst\().8h, t2.8h, consts.h[0] +.endm + +.macro ct_butterfly a, b, root, idx0, idx1 + mulmodq tmp, \b, \root, \idx0, \idx1 + sub \b\().8h, \a\().8h, tmp.8h + add \a\().8h, \a\().8h, tmp.8h +.endm + +.macro ct_butterfly_v a, b, root, root_twisted + mulmod tmp, \b, \root, \root_twisted + sub \b\().8h, \a\().8h, tmp.8h + add \a\().8h, \a\().8h, tmp.8h +.endm + +.macro barrett_reduce a + sqdmulh t0.8h, \a\().8h, consts.h[1] + srshr t0.8h, t0.8h, #11 + mls \a\().8h, t0.8h, consts.h[0] +.endm + +.macro load_roots_123 + ldr q_root0, [r_ptr0], #32 + ldr q_root1, [r_ptr0, #-16] +.endm + +.macro load_next_roots_45 + ldr q_root0, [r_ptr0], #16 +.endm + +.macro load_next_roots_67 + ldr q_root0, [r_ptr1], #(6*16) + ldr q_root0_tw, [r_ptr1, #(-6*16 + 1*16)] + ldr q_root1, [r_ptr1, #(-6*16 + 2*16)] + ldr q_root1_tw, [r_ptr1, #(-6*16 + 3*16)] + ldr q_root2, [r_ptr1, #(-6*16 + 4*16)] + ldr q_root2_tw, [r_ptr1, #(-6*16 + 5*16)] +.endm + +.macro transpose4 data + trn1 t0.4s, \data\()0.4s, \data\()1.4s + trn2 t1.4s, \data\()0.4s, \data\()1.4s + trn1 t2.4s, \data\()2.4s, \data\()3.4s + trn2 t3.4s, \data\()2.4s, \data\()3.4s + + trn2 \data\()2.2d, t0.2d, t2.2d + trn2 \data\()3.2d, t1.2d, t3.2d + trn1 \data\()0.2d, t0.2d, t2.2d + trn1 \data\()1.2d, t1.2d, t3.2d +.endm + +.macro transpose_single data_out, data_in + trn1 \data_out\()0.4s, \data_in\()0.4s, \data_in\()1.4s + trn2 \data_out\()1.4s, \data_in\()0.4s, \data_in\()1.4s + trn1 \data_out\()2.4s, \data_in\()2.4s, \data_in\()3.4s + trn2 \data_out\()3.4s, \data_in\()2.4s, 
\data_in\()3.4s
+.endm
+
+// save_gprs / restore_gprs
+//
+// Allocate a 6*16-byte area on the stack and spill x19-x28 (as five pairs)
+// and x29 into it; restore_gprs reloads them from the same slots and
+// releases the area. These are the general-purpose registers a callee must
+// preserve under AAPCS64. Each pair is stored exactly once.
+.macro save_gprs
+        sub sp, sp, #(16*6)
+        stp x19, x20, [sp, #16*0]
+        stp x21, x22, [sp, #16*1]
+        stp x23, x24, [sp, #16*2]
+        stp x25, x26, [sp, #16*3]
+        stp x27, x28, [sp, #16*4]
+        str x29, [sp, #16*5]
+.endm
+
+.macro restore_gprs
+        ldp x19, x20, [sp, #16*0]
+        ldp x21, x22, [sp, #16*1]
+        ldp x23, x24, [sp, #16*2]
+        ldp x25, x26, [sp, #16*3]
+        ldp x27, x28, [sp, #16*4]
+        ldr x29, [sp, #16*5]
+        add sp, sp, #(16*6)
+.endm
+
+// save_vregs / restore_vregs
+//
+// Allocate a 4*16-byte area and spill d8-d15, i.e. only the low 64 bits of
+// v8-v15 (the part AAPCS64 requires a callee to preserve); restore_vregs
+// reloads them and releases the area.
+.macro save_vregs
+        sub sp, sp, #(16*4)
+        stp d8, d9, [sp, #16*0]
+        stp d10, d11, [sp, #16*1]
+        stp d12, d13, [sp, #16*2]
+        stp d14, d15, [sp, #16*3]
+.endm
+
+.macro restore_vregs
+        ldp d8, d9, [sp, #16*0]
+        ldp d10, d11, [sp, #16*1]
+        ldp d12, d13, [sp, #16*2]
+        ldp d14, d15, [sp, #16*3]
+        add sp, sp, #(16*4)
+.endm
+
+// Size in bytes of the scratch area reserved below the saved registers by
+// push_stack, and the offset (from sp) of its first slot.
+#define STACK_SIZE 16
+#define STACK0 0
+
+// restore a, loc : load register \a from the scratch slot at sp + \loc.
+// save loc, a    : store register \a to the scratch slot at sp + \loc.
+// Note the argument order differs between the two macros.
+.macro restore a, loc
+        ldr \a, [sp, #\loc]
+.endm
+.macro save loc, a
+        str \a, [sp, #\loc]
+.endm
+
+// push_stack: spill GPRs, then the vector registers, then reserve
+// STACK_SIZE bytes of scratch space, leaving sp at the scratch area.
+// pop_stack undoes the three steps in reverse order.
+.macro push_stack
+        save_gprs
+        save_vregs
+        sub sp, sp, #STACK_SIZE
+.endm
+
+.macro pop_stack
+        add sp, sp, #STACK_SIZE
+        restore_vregs
+        restore_gprs
+.endm
+
+// Twiddle-factor table, 16-byte aligned; contents come from the included
+// file.
+.data
+.p2align 4
+roots:
+        #include "ntt_123_45_67_twiddles.S"
+
+        // General-purpose register aliases.
+        in .req x0
+        inp .req x1
+        count .req x2
+        r_ptr0 .req x3
+        r_ptr1 .req x4
+        xtmp .req x5
+
+        // Coefficient vectors (v8-v15) ...
+        data0 .req v8
+        data1 .req v9
+        data2 .req v10
+        data3 .req v11
+        data4 .req v12
+        data5 .req v13
+        data6 .req v14
+        data7 .req v15
+
+        // ... and the same registers viewed as 128-bit q registers for ldr/str.
+        q_data0 .req q8
+        q_data1 .req q9
+        q_data2 .req q10
+        q_data3 .req q11
+        q_data4 .req q12
+        q_data5 .req q13
+        q_data6 .req q14
+        q_data7 .req q15
+
+        // Twiddle vectors and their "_tw" companions (v0-v2, v4-v6).
+        root0 .req v0
+        root1 .req v1
+        root2 .req v2
+        root0_tw .req v4
+        root1_tw .req v5
+        root2_tw .req v6
+
+        q_root0 .req q0
+        q_root1 .req q1
+        q_root2 .req q2
+        q_root0_tw .req q4
+        q_root1_tw .req q5
+        q_root2_tw .req q6
+
+        // Constants vector, loaded from const_addr by ntt_asm_opt.
+        consts .req v7
+        q_consts .req q7
+
+        // Scratch vectors clobbered by the arithmetic and transpose macros.
+        tmp .req v24
+        t0 .req v25
+        t1 .req v26
+        t2 .req v27
+        t3 .req v28
+
+        .text
+        .global ntt_asm_opt
+        .global _ntt_asm_opt
+
+.p2align 4
+const_addr:
+        .short 3329
+        .short
20159 + .short 0 + .short 0 + .short 0 + .short 0 + .short 0 + .short 0 + +ntt_asm_opt: +_ntt_asm_opt: + push_stack + + ASM_LOAD(r_ptr0, roots) + ASM_LOAD(r_ptr1, roots_l56) + ASM_LOAD(xtmp, const_addr) + + ld1 {consts.8h}, [xtmp] + + str in, [sp, #STACK0] // @slothy:writes=STACK0 + mov count, #4 + + load_roots_123 + + .p2align 2 + // Instructions: 32 + // Expected cycles: 39 + // Expected IPC: 0.82 + // + // Cycle bound: 39.0 + // IPC bound: 0.82 + // + // Wall time: 0.19s + // User time: 0.19s + // + // ---------- cycle (expected) ----------> + // 0 25 + // |------------------------|------------- + ldr q22, [x0, #384] // *...................................... + ldr q5, [x0, #256] // *...................................... + ldr q14, [x0, #320] // .*..................................... + ldr q3, [x0, #192] // .*..................................... + ldr q29, [x0, #0] // ..*.................................... + ldr q25, [x0, #448] // ...*................................... + sqrdmulh v31.8H, v22.8H, v0.H[1] // ....*.................................. + sqrdmulh v13.8H, v5.8H, v0.H[1] // ......*................................ + mul v9.8H, v5.8H, v0.H[0] // ........*.............................. + ldr q5, [x0, #128] // ........*.............................. + mul v18.8H, v22.8H, v0.H[0] // ..........*............................ + mls v18.8H, v31.8H, v7.H[0] // ............*.......................... + sqrdmulh v6.8H, v14.8H, v0.H[1] // ..............*........................ + mul v22.8H, v25.8H, v0.H[0] // ................*...................... + sub v20.8H, v5.8H, v18.8H // .................*..................... + sqrdmulh v25.8H, v25.8H, v0.H[1] // ..................*.................... + add v23.8H, v5.8H, v18.8H // ..................*.................... + mul v28.8H, v20.8H, v0.H[4] // ....................*.................. + sqrdmulh v5.8H, v20.8H, v0.H[5] // ......................*................ 
+ mls v22.8H, v25.8H, v7.H[0] // ........................*.............. + mul v4.8H, v14.8H, v0.H[0] // ..........................*............ + mls v28.8H, v5.8H, v7.H[0] // ............................*.......... + add v5.8H, v3.8H, v22.8H // .............................*......... + mls v4.8H, v6.8H, v7.H[0] // ..............................*........ + sub v26.8H, v3.8H, v22.8H // ..............................*........ + ldr q22, [x0, #64] // ..............................*........ + mls v9.8H, v13.8H, v7.H[0] // ................................*...... + sqrdmulh v31.8H, v5.8H, v0.H[3] // ..................................*.... + add v13.8H, v22.8H, v4.8H // ...................................*... + sqrdmulh v21.8H, v23.8H, v0.H[3] // ....................................*.. + mul v5.8H, v5.8H, v0.H[2] // ......................................* + sub v16.8H, v29.8H, v9.8H // ......................................* + + // ---------- cycle (expected) ----------> + // 0 25 + // |------------------------|------------- + // ldr q30, [x0, #320] // .*..................................... + // ldr q8, [x0, #256] // *...................................... + // ldr q6, [x0, #384] // *...................................... + // ldr q15, [x0, #192] // .*..................................... + // ldr q22, [x0, #64] // ..............................*........ + // ldr q12, [x0, #128] // ........*.............................. + // ldr q27, [x0, #448] // ...*................................... + // sqrdmulh v3.8H, v8.8H, v0.H[1] // ......*................................ + // mul v9.8H, v8.8H, v0.H[0] // ........*.............................. + // sqrdmulh v4.8H, v27.8H, v0.H[1] // ..................*.................... + // mul v11.8H, v27.8H, v0.H[0] // ................*...................... + // sqrdmulh v23.8H, v6.8H, v0.H[1] // ....*.................................. + // mul v17.8H, v6.8H, v0.H[0] // ..........*............................ 
+ // sqrdmulh v25.8H, v30.8H, v0.H[1] // ..............*........................ + // mls v17.8H, v23.8H, v7.H[0] // ............*.......................... + // sub v18.8H, v12.8H, v17.8H // .................*..................... + // mls v11.8H, v4.8H, v7.H[0] // ........................*.............. + // add v27.8H, v15.8H, v11.8H // .............................*......... + // sqrdmulh v26.8H, v18.8H, v0.H[5] // ......................*................ + // ldr q29, [x0, #0] // ..*.................................... + // add v23.8H, v12.8H, v17.8H // ..................*.................... + // mul v28.8H, v18.8H, v0.H[4] // ....................*.................. + // mls v9.8H, v3.8H, v7.H[0] // ................................*...... + // sqrdmulh v31.8H, v27.8H, v0.H[3] // ..................................*.... + // mul v4.8H, v30.8H, v0.H[0] // ..........................*............ + // mls v4.8H, v25.8H, v7.H[0] // ..............................*........ + // mls v28.8H, v26.8H, v7.H[0] // ............................*.......... + // sqrdmulh v21.8H, v23.8H, v0.H[3] // ....................................*.. + // sub v16.8H, v29.8H, v9.8H // ......................................* + // add v13.8H, v22.8H, v4.8H // ...................................*... + // sub v26.8H, v15.8H, v11.8H // ..............................*........ + // mul v5.8H, v27.8H, v0.H[2] // ......................................* + + sub count, count, #1 +layer123_start: + // Instructions: 76 + // Expected cycles: 72 + // Expected IPC: 1.06 + // + // Cycle bound: 72.0 + // IPC bound: 1.06 + // + // Wall time: 23.42s + // User time: 23.42s + // + // -------------------------- cycle (expected) ---------------------------> + // 0 25 50 + // |------------------------|------------------------|--------------------- + mls v5.8H, v31.8H, v7.H[0] // *....................................................................... 
+ ldr q30, [x0, #336] // e....................................................................... + ldr q8, [x0, #272] // e....................................................................... + ldr q6, [x0, #400] // .e...................................................................... + sub v24.8H, v16.8H, v28.8H // .*...................................................................... + ldr q15, [x0, #208] // .e...................................................................... + mul v31.8H, v23.8H, v0.H[2] // ..*..................................................................... + sub v20.8H, v22.8H, v4.8H // ..*..................................................................... + ldr q22, [x0, #80] // ..e..................................................................... + ldr q12, [x0, #144] // ...e.................................................................... + ldr q27, [x0, #464] // ...e.................................................................... + add v10.8H, v29.8H, v9.8H // ...*.................................................................... + sqrdmulh v3.8H, v8.8H, v0.H[1] // ....e................................................................... + sub v2.8H, v13.8H, v5.8H // .....*.................................................................. + mul v9.8H, v8.8H, v0.H[0] // ......e................................................................. + sqrdmulh v4.8H, v27.8H, v0.H[1] // ........e............................................................... + mul v11.8H, v27.8H, v0.H[0] // ..........e............................................................. + mls v31.8H, v21.8H, v7.H[0] // ............*........................................................... + sqrdmulh v23.8H, v6.8H, v0.H[1] // ..............e......................................................... + mul v17.8H, v6.8H, v0.H[0] // ................e....................................................... 
+ add v6.8H, v13.8H, v5.8H // ................*....................................................... + sqrdmulh v25.8H, v30.8H, v0.H[1] // ..................e..................................................... + mls v17.8H, v23.8H, v7.H[0] // ....................e................................................... + sqrdmulh v19.8H, v2.8H, v1.H[1] // ......................*................................................. + mul v14.8H, v2.8H, v1.H[0] // ........................*............................................... + sub v18.8H, v12.8H, v17.8H // .........................e.............................................. + sub v21.8H, v10.8H, v31.8H // ..........................*............................................. + sqrdmulh v13.8H, v6.8H, v0.H[7] // ..........................*............................................. + mls v14.8H, v19.8H, v7.H[0] // ............................*........................................... + mls v11.8H, v4.8H, v7.H[0] // ..............................e......................................... + mul v23.8H, v6.8H, v0.H[6] // ................................*....................................... + add v4.8H, v21.8H, v14.8H // .................................*...................................... + sub v19.8H, v21.8H, v14.8H // ..................................*..................................... + mls v23.8H, v13.8H, v7.H[0] // ..................................*..................................... + add v27.8H, v15.8H, v11.8H // ...................................e.................................... + add v29.8H, v10.8H, v31.8H // ....................................*................................... + str q4, [x0, #128] // ....................................*................................... + sqrdmulh v13.8H, v26.8H, v0.H[5] // ....................................*................................... 
+ str q19, [x0, #192] // .....................................*.................................. + add v10.8H, v16.8H, v28.8H // .....................................*.................................. + mul v6.8H, v26.8H, v0.H[4] // ......................................*................................. + sub v5.8H, v29.8H, v23.8H // .......................................*................................ + add v21.8H, v29.8H, v23.8H // ........................................*............................... + sqrdmulh v26.8H, v18.8H, v0.H[5] // ........................................e............................... + ldr q29, [x0, #16] // ........................................e............................... + add v23.8H, v12.8H, v17.8H // .........................................e.............................. + mls v6.8H, v13.8H, v7.H[0] // ..........................................*............................. + str q5, [x0, #64] // ..........................................*............................. + str q21, [x0], #(16) // ...........................................*............................ + mul v28.8H, v18.8H, v0.H[4] // ............................................e........................... + mls v9.8H, v3.8H, v7.H[0] // ..............................................e......................... + add v5.8H, v20.8H, v6.8H // ...............................................*........................ + sub v14.8H, v20.8H, v6.8H // ................................................*....................... + sqrdmulh v31.8H, v27.8H, v0.H[3] // ................................................e....................... + mul v16.8H, v5.8H, v1.H[2] // ..................................................*..................... + sqrdmulh v8.8H, v14.8H, v1.H[5] // ....................................................*................... + sqrdmulh v4.8H, v5.8H, v1.H[3] // ......................................................*................. 
+ mul v2.8H, v14.8H, v1.H[4] // ........................................................*............... + mls v2.8H, v8.8H, v7.H[0] // ..........................................................*............. + mls v16.8H, v4.8H, v7.H[0] // ............................................................*........... + mul v4.8H, v30.8H, v0.H[0] // ..............................................................e......... + add v14.8H, v24.8H, v2.8H // ...............................................................*........ + mls v4.8H, v25.8H, v7.H[0] // ................................................................e....... + sub v20.8H, v10.8H, v16.8H // .................................................................*...... + mls v28.8H, v26.8H, v7.H[0] // ..................................................................e..... + sub v26.8H, v24.8H, v2.8H // ..................................................................*..... + str q14, [x0, #368] // ..................................................................*..... + add v6.8H, v10.8H, v16.8H // ...................................................................*.... + str q20, [x0, #304] // ....................................................................*... + sqrdmulh v21.8H, v23.8H, v0.H[3] // ....................................................................e... + sub v16.8H, v29.8H, v9.8H // ....................................................................e... + add v13.8H, v22.8H, v4.8H // .....................................................................e.. + str q26, [x0, #432] // .....................................................................*.. + sub v26.8H, v15.8H, v11.8H // ......................................................................e. + mul v5.8H, v27.8H, v0.H[2] // ......................................................................e. + str q6, [x0, #240] // ......................................................................*. 
+ + // -------------------------------------------------------------- cycle (expected) --------------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|----------------- + // ldr q8, [x0, #0] // ........................................e...............................'.......................................~.............................. + // ldr q9, [x0, #(1*(512/8))] // ..e.....................................................................'.~.................................................................... + // ldr q10, [x0, #(2*(512/8))] // ...e....................................................................'..~................................................................... + // ldr q11, [x0, #(3*(512/8))] // .e......................................................................'~..................................................................... + // ldr q12, [x0, #(4*(512/8))] // e.......................................................................~...................................................................... + // ldr q13, [x0, #(5*(512/8))] // e.......................................................................~...................................................................... + // ldr q14, [x0, #(6*(512/8))] // .e......................................................................'~..................................................................... + // ldr q15, [x0, #(7*(512/8))] // ...e....................................................................'..~................................................................... + // sqrdmulh v27.8h, v12.8h, v0.h[1] // ....e...................................................................'...~.................................................................. 
+ // mul v24.8h, v12.8h, v0.h[0] // ......e.................................................................'.....~................................................................ + // mls v24.8h, v27.8h, v7.h[0] // ..............................................e.........................'.............................................~........................ + // sub v12.8h, v8.8h, v24.8h // ....................................................................e...'...................................................................~.. + // add v8.8h, v8.8h, v24.8h // ...~....................................................................'..*................................................................... + // sqrdmulh v27.8h, v13.8h, v0.h[1] // ..................e.....................................................'.................~.................................................... + // mul v24.8h, v13.8h, v0.h[0] // ..............................................................e.........'.............................................................~........ + // mls v24.8h, v27.8h, v7.h[0] // ................................................................e.......'...............................................................~...... + // sub v13.8h, v9.8h, v24.8h // ..~.....................................................................'.*.................................................................... + // add v9.8h, v9.8h, v24.8h // .....................................................................e..'....................................................................~. + // sqrdmulh v27.8h, v14.8h, v0.h[1] // ..............e.........................................................'.............~........................................................ + // mul v24.8h, v14.8h, v0.h[0] // ................e.......................................................'...............~...................................................... 
+ // mls v24.8h, v27.8h, v7.h[0] // ....................e...................................................'...................~.................................................. + // sub v14.8h, v10.8h, v24.8h // .........................e..............................................'........................~............................................. + // add v10.8h, v10.8h, v24.8h // .........................................e..............................'........................................~............................. + // sqrdmulh v27.8h, v15.8h, v0.h[1] // ........e...............................................................'.......~.............................................................. + // mul v24.8h, v15.8h, v0.h[0] // ..........e.............................................................'.........~............................................................ + // mls v24.8h, v27.8h, v7.h[0] // ..............................e.........................................'.............................~........................................ + // sub v15.8h, v11.8h, v24.8h // ......................................................................e.'...................................................................... + // add v11.8h, v11.8h, v24.8h // ...................................e....................................'..................................~................................... + // sqrdmulh v27.8h, v10.8h, v0.h[3] // ....................................................................e...'...................................................................~.. + // mul v24.8h, v10.8h, v0.h[2] // ..~.....................................................................'.*.................................................................... + // mls v24.8h, v27.8h, v7.h[0] // ............~...........................................................'...........*.......................................................... 
+ // sub v10.8h, v8.8h, v24.8h // ..........................~.............................................'.........................*............................................ + // add v8.8h, v8.8h, v24.8h // ....................................~...................................'...................................*.................................. + // sqrdmulh v27.8h, v11.8h, v0.h[3] // ................................................e.......................'...............................................~...................... + // mul v24.8h, v11.8h, v0.h[2] // ......................................................................e.'...................................................................... + // mls v24.8h, v27.8h, v7.h[0] // ~.......................................................................*...................................................................... + // sub v11.8h, v9.8h, v24.8h // .....~..................................................................'....*................................................................. + // add v9.8h, v9.8h, v24.8h // ................~.......................................................'...............*...................................................... + // sqrdmulh v27.8h, v14.8h, v0.h[5] // ........................................e...............................'.......................................~.............................. + // mul v24.8h, v14.8h, v0.h[4] // ............................................e...........................'...........................................~.......................... + // mls v24.8h, v27.8h, v7.h[0] // ..................................................................e.....'.................................................................~.... + // sub v14.8h, v12.8h, v24.8h // .~......................................................................'*..................................................................... 
+ // add v12.8h, v12.8h, v24.8h // .....................................~..................................'....................................*................................. + // sqrdmulh v27.8h, v15.8h, v0.h[5] // ....................................~...................................'...................................*.................................. + // mul v24.8h, v15.8h, v0.h[4] // ......................................~.................................'.....................................*................................ + // mls v24.8h, v27.8h, v7.h[0] // ..........................................~.............................'.........................................*............................ + // sub v15.8h, v13.8h, v24.8h // ................................................~.......................'...............................................*...................... + // add v13.8h, v13.8h, v24.8h // ...............................................~........................'..............................................*....................... + // sqrdmulh v27.8h, v9.8h, v0.h[7] // ..........................~.............................................'.........................*............................................ + // mul v24.8h, v9.8h, v0.h[6] // ................................~.......................................'...............................*...................................... + // mls v24.8h, v27.8h, v7.h[0] // ..................................~.....................................'.................................*.................................... + // sub v9.8h, v8.8h, v24.8h // .......................................~................................'......................................*............................... + // add v8.8h, v8.8h, v24.8h // ........................................~...............................'.......................................*.............................. 
+ // sqrdmulh v27.8h, v11.8h, v1.h[1] // ......................~.................................................'.....................*................................................ + // mul v24.8h, v11.8h, v1.h[0] // ........................~...............................................'.......................*.............................................. + // mls v24.8h, v27.8h, v7.h[0] // ............................~...........................................'...........................*.......................................... + // sub v11.8h, v10.8h, v24.8h // ..................................~.....................................'.................................*.................................... + // add v10.8h, v10.8h, v24.8h // .................................~......................................'................................*..................................... + // sqrdmulh v27.8h, v13.8h, v1.h[3] // ......................................................~.................'.....................................................*................ + // mul v24.8h, v13.8h, v1.h[2] // ..................................................~.....................'.................................................*.................... + // mls v24.8h, v27.8h, v7.h[0] // ............................................................~...........'...........................................................*.......... + // sub v13.8h, v12.8h, v24.8h // .................................................................~......'................................................................*..... + // add v12.8h, v12.8h, v24.8h // ...................................................................~....'..................................................................*... + // sqrdmulh v27.8h, v15.8h, v1.h[5] // ....................................................~...................'...................................................*.................. 
+ // mul v24.8h, v15.8h, v1.h[4] // ........................................................~...............'.......................................................*.............. + // mls v24.8h, v27.8h, v7.h[0] // ..........................................................~.............'.........................................................*............ + // sub v15.8h, v14.8h, v24.8h // ..................................................................~.....'.................................................................*.... + // add v14.8h, v14.8h, v24.8h // ...............................................................~........'..............................................................*....... + // str q8, [x0], #(16) // ...........................................~............................'..........................................*........................... + // str q9, [x0, #(-16 + 1*(512/8))] // ..........................................~.............................'.........................................*............................ + // str q10, [x0, #(-16 + 2*(512/8))] // ....................................~...................................'...................................*.................................. + // str q11, [x0, #(-16 + 3*(512/8))] // .....................................~..................................'....................................*................................. + // str q12, [x0, #(-16 + 4*(512/8))] // ......................................................................~.'.....................................................................* + // str q13, [x0, #(-16 + 5*(512/8))] // ....................................................................~...'...................................................................*.. 
+ // str q14, [x0, #(-16 + 6*(512/8))] // ..................................................................~.....'.................................................................*.... + // str q15, [x0, #(-16 + 7*(512/8))] // .....................................................................~..'....................................................................*. + + sub count, count, #1 + cbnz count, layer123_start + // Instructions: 44 + // Expected cycles: 43 + // Expected IPC: 1.02 + // + // Cycle bound: 43.0 + // IPC bound: 1.02 + // + // Wall time: 0.92s + // User time: 0.92s + // + // ------------ cycle (expected) ------------> + // 0 25 + // |------------------------|----------------- + sub v19.8H, v22.8H, v4.8H // *.......................................... + sqrdmulh v30.8H, v26.8H, v0.H[5] // *.......................................... + add v3.8H, v29.8H, v9.8H // .*......................................... + mul v25.8H, v26.8H, v0.H[4] // ..*........................................ + mls v5.8H, v31.8H, v7.H[0] // ....*...................................... + mls v25.8H, v30.8H, v7.H[0] // ......*.................................... + mul v17.8H, v23.8H, v0.H[2] // ........*.................................. + add v6.8H, v13.8H, v5.8H // .........*................................. + mls v17.8H, v21.8H, v7.H[0] // ..........*................................ + add v22.8H, v19.8H, v25.8H // ...........*............................... + sub v14.8H, v19.8H, v25.8H // ............*.............................. + mul v15.8H, v6.8H, v0.H[6] // ............*.............................. + sub v5.8H, v13.8H, v5.8H // .............*............................. + sqrdmulh v24.8H, v22.8H, v1.H[3] // ..............*............................ + add v19.8H, v3.8H, v17.8H // ...............*........................... + mul v9.8H, v5.8H, v1.H[0] // ................*.......................... 
+ sqrdmulh v5.8H, v5.8H, v1.H[1] // ..................*........................ + mul v20.8H, v22.8H, v1.H[2] // ....................*...................... + mls v20.8H, v24.8H, v7.H[0] // ......................*.................... + mls v9.8H, v5.8H, v7.H[0] // ........................*.................. + add v5.8H, v16.8H, v28.8H // .........................*................. + sqrdmulh v11.8H, v14.8H, v1.H[5] // ..........................*................ + sub v31.8H, v5.8H, v20.8H // ............................*.............. + sqrdmulh v10.8H, v6.8H, v0.H[7] // ............................*.............. + add v18.8H, v5.8H, v20.8H // .............................*............. + sub v5.8H, v16.8H, v28.8H // ..............................*............ + mul v22.8H, v14.8H, v1.H[4] // ..............................*............ + str q31, [x0, #320] // ...............................*........... + mls v22.8H, v11.8H, v7.H[0] // ................................*.......... + str q18, [x0, #256] // ................................*.......... + sub v21.8H, v3.8H, v17.8H // .................................*......... + mls v15.8H, v10.8H, v7.H[0] // ..................................*........ + sub v4.8H, v21.8H, v9.8H // ....................................*...... + sub v28.8H, v5.8H, v22.8H // .....................................*..... + add v6.8H, v5.8H, v22.8H // .....................................*..... + add v5.8H, v21.8H, v9.8H // ......................................*.... + str q4, [x0, #192] // .......................................*... + sub v26.8H, v19.8H, v15.8H // .......................................*... + add v22.8H, v19.8H, v15.8H // .......................................*... + str q28, [x0, #448] // ........................................*.. + str q6, [x0, #384] // ........................................*.. + str q5, [x0, #128] // .........................................*. 
+ str q26, [x0, #64] // ..........................................* + str q22, [x0], #(16) // ..........................................* + + // ------------ cycle (expected) ------------> + // 0 25 + // |------------------------|----------------- + // mls v5.8H, v31.8H, v7.H[0] // ....*...................................... + // sub v24.8H, v16.8H, v28.8H // ..............................*............ + // mul v31.8H, v23.8H, v0.H[2] // ........*.................................. + // sub v20.8H, v22.8H, v4.8H // *.......................................... + // add v10.8H, v29.8H, v9.8H // .*......................................... + // sub v2.8H, v13.8H, v5.8H // .............*............................. + // mls v31.8H, v21.8H, v7.H[0] // ..........*................................ + // add v6.8H, v13.8H, v5.8H // .........*................................. + // sqrdmulh v19.8H, v2.8H, v1.H[1] // ..................*........................ + // mul v14.8H, v2.8H, v1.H[0] // ................*.......................... + // sub v21.8H, v10.8H, v31.8H // .................................*......... + // sqrdmulh v13.8H, v6.8H, v0.H[7] // ............................*.............. + // mls v14.8H, v19.8H, v7.H[0] // ........................*.................. + // mul v23.8H, v6.8H, v0.H[6] // ............*.............................. + // add v4.8H, v21.8H, v14.8H // ......................................*.... + // sub v19.8H, v21.8H, v14.8H // ....................................*...... + // mls v23.8H, v13.8H, v7.H[0] // ..................................*........ + // add v29.8H, v10.8H, v31.8H // ...............*........................... + // str q4, [x0, #128] // .........................................*. + // sqrdmulh v13.8H, v26.8H, v0.H[5] // *.......................................... + // str q19, [x0, #192] // .......................................*... + // add v10.8H, v16.8H, v28.8H // .........................*................. 
+ // mul v6.8H, v26.8H, v0.H[4] // ..*........................................ + // sub v5.8H, v29.8H, v23.8H // .......................................*... + // add v21.8H, v29.8H, v23.8H // .......................................*... + // mls v6.8H, v13.8H, v7.H[0] // ......*.................................... + // str q5, [x0, #64] // ..........................................* + // str q21, [x0], #(16) // ..........................................* + // add v5.8H, v20.8H, v6.8H // ...........*............................... + // sub v14.8H, v20.8H, v6.8H // ............*.............................. + // mul v16.8H, v5.8H, v1.H[2] // ....................*...................... + // sqrdmulh v8.8H, v14.8H, v1.H[5] // ..........................*................ + // sqrdmulh v4.8H, v5.8H, v1.H[3] // ..............*............................ + // mul v2.8H, v14.8H, v1.H[4] // ..............................*............ + // mls v2.8H, v8.8H, v7.H[0] // ................................*.......... + // mls v16.8H, v4.8H, v7.H[0] // ......................*.................... + // add v14.8H, v24.8H, v2.8H // .....................................*..... + // sub v20.8H, v10.8H, v16.8H // ............................*.............. + // sub v26.8H, v24.8H, v2.8H // .....................................*..... + // str q14, [x0, #368] // ........................................*.. + // add v6.8H, v10.8H, v16.8H // .............................*............. + // str q20, [x0, #304] // ...............................*........... + // str q26, [x0, #432] // ........................................*.. + // str q6, [x0, #240] // ................................*.......... 
+ + + ldr inp, [sp, #STACK0] // @slothy:reads=STACK0 + mov count, #8 + + .p2align 2 + // Instructions: 48 + // Expected cycles: 57 + // Expected IPC: 0.84 + // + // Cycle bound: 57.0 + // IPC bound: 0.84 + // + // Wall time: 1.07s + // User time: 1.07s + // + // ------------------- cycle (expected) -------------------> + // 0 25 50 + // |------------------------|------------------------|------ + ldr q2, [x3], #16 // *........................................................ + ldr q18, [x1, #48] // *........................................................ + ldr q1, [x4, #48] // .*....................................................... + ldr q10, [x4, #64] // ..*...................................................... + sqrdmulh v0.8H, v18.8H, v2.H[1] // ....*.................................................... + mul v22.8H, v18.8H, v2.H[0] // ......*.................................................. + ldr q20, [x1, #32] // ......*.................................................. + ldr q26, [x1, #0] // .......*................................................. + ldr q6, [x1, #16] // ........*................................................ + mls v22.8H, v0.8H, v7.H[0] // .........*............................................... + sqrdmulh v11.8H, v20.8H, v2.H[1] // ...........*............................................. + add v29.8H, v6.8H, v22.8H // ..............*.......................................... + mul v8.8H, v20.8H, v2.H[0] // ..............*.......................................... + sub v6.8H, v6.8H, v22.8H // ...............*......................................... + mls v8.8H, v11.8H, v7.H[0] // ................*........................................ + mul v28.8H, v6.8H, v2.H[4] // ..................*...................................... + sqrdmulh v18.8H, v29.8H, v2.H[3] // ....................*.................................... + sub v22.8H, v26.8H, v8.8H // .....................*................................... 
+ sqrdmulh v5.8H, v6.8H, v2.H[5] // ......................*.................................. + add v24.8H, v26.8H, v8.8H // ......................*.................................. + mul v19.8H, v29.8H, v2.H[2] // ........................*................................ + ldr q29, [x4, #80] // ........................*................................ + mls v19.8H, v18.8H, v7.H[0] // ..........................*.............................. + mls v28.8H, v5.8H, v7.H[0] // ............................*............................ + add v12.8H, v24.8H, v19.8H // ...............................*......................... + sub v8.8H, v24.8H, v19.8H // ...............................*......................... + sub v5.8H, v22.8H, v28.8H // .................................*....................... + add v19.8H, v22.8H, v28.8H // .................................*....................... + ldr q22, [x4, #32] // .................................*....................... + trn2 v6.4S, v12.4S, v8.4S // ..................................*...................... + ldr q14, [x4, #16] // ..................................*...................... + trn1 v15.4S, v12.4S, v8.4S // ..................................*...................... + ldr q30, [x4], #(6*16) // ...................................*..................... + trn1 v0.4S, v19.4S, v5.4S // ....................................*.................... + trn2 v12.4S, v19.4S, v5.4S // ....................................*.................... + trn2 v3.2D, v6.2D, v12.2D // .......................................*................. + trn1 v20.2D, v6.2D, v12.2D // ........................................*................ + trn2 v19.2D, v15.2D, v0.2D // .........................................*............... + trn1 v17.2D, v15.2D, v0.2D // ..........................................*.............. + sqrdmulh v5.8H, v3.8H, v14.8H // ..........................................*.............. 
+ mul v4.8H, v3.8H, v30.8H // ............................................*............ + mls v4.8H, v5.8H, v7.H[0] // ...............................................*......... + sqrdmulh v5.8H, v19.8H, v14.8H // .................................................*....... + mul v25.8H, v19.8H, v30.8H // ...................................................*..... + add v13.8H, v20.8H, v4.8H // ....................................................*.... + sub v28.8H, v20.8H, v4.8H // .....................................................*... + mls v25.8H, v5.8H, v7.H[0] // ......................................................*.. + mul v30.8H, v13.8H, v22.8H // ........................................................* + + // ------------------- cycle (expected) -------------------> + // 0 25 50 + // |------------------------|------------------------|------ + // ldr q18, [x1, #48] // *........................................................ + // ldr q0, [x3], #16 // *........................................................ + // ldr q14, [x4, #16] // ..................................*...................... + // ldr q29, [x4, #80] // ........................*................................ + // ldr q4, [x1, #32] // ......*.................................................. + // ldr q27, [x4], #(6*16) // ...................................*..................... + // ldr q23, [x1, #0] // .......*................................................. + // ldr q20, [x4, #-64] // .................................*....................... + // sqrdmulh v19.8H, v18.8H, v0.H[1] // ....*.................................................... + // ldr q1, [x4, #-48] // .*....................................................... + // sqrdmulh v3.8H, v4.8H, v0.H[1] // ...........*............................................. + // mul v13.8H, v18.8H, v0.H[0] // ......*.................................................. 
+ // mls v13.8H, v19.8H, v7.H[0] // .........*............................................... + // ldr q19, [x1, #16] // ........*................................................ + // mul v11.8H, v4.8H, v0.H[0] // ..............*.......................................... + // add v6.8H, v19.8H, v13.8H // ..............*.......................................... + // sub v18.8H, v19.8H, v13.8H // ...............*......................................... + // mls v11.8H, v3.8H, v7.H[0] // ................*........................................ + // sqrdmulh v17.8H, v6.8H, v0.H[3] // ....................*.................................... + // mul v30.8H, v18.8H, v0.H[4] // ..................*...................................... + // add v15.8H, v23.8H, v11.8H // ......................*.................................. + // sub v16.8H, v23.8H, v11.8H // .....................*................................... + // mul v6.8H, v6.8H, v0.H[2] // ........................*................................ + // mls v6.8H, v17.8H, v7.H[0] // ..........................*.............................. + // sqrdmulh v24.8H, v18.8H, v0.H[5] // ......................*.................................. + // ldr q10, [x4, #-32] // ..*...................................................... + // add v4.8H, v15.8H, v6.8H // ...............................*......................... + // sub v6.8H, v15.8H, v6.8H // ...............................*......................... + // mls v30.8H, v24.8H, v7.H[0] // ............................*............................ + // trn1 v15.4S, v4.4S, v6.4S // ..................................*...................... + // trn2 v18.4S, v4.4S, v6.4S // ..................................*...................... + // sub v22.8H, v16.8H, v30.8H // .................................*....................... + // add v6.8H, v16.8H, v30.8H // .................................*....................... 
+ // trn2 v25.4S, v6.4S, v22.4S // ....................................*.................... + // trn1 v24.4S, v6.4S, v22.4S // ....................................*.................... + // trn2 v23.2D, v18.2D, v25.2D // .......................................*................. + // trn1 v30.2D, v18.2D, v25.2D // ........................................*................ + // trn2 v26.2D, v15.2D, v24.2D // .........................................*............... + // trn1 v17.2D, v15.2D, v24.2D // ..........................................*.............. + // sqrdmulh v9.8H, v23.8H, v14.8H // ..........................................*.............. + // sqrdmulh v22.8H, v26.8H, v14.8H // .................................................*....... + // mul v21.8H, v23.8H, v27.8H // ............................................*............ + // mls v21.8H, v9.8H, v7.H[0] // ...............................................*......... + // mul v25.8H, v26.8H, v27.8H // ...................................................*..... + // add v13.8H, v30.8H, v21.8H // ....................................................*.... + // sub v28.8H, v30.8H, v21.8H // .....................................................*... + // mls v25.8H, v22.8H, v7.H[0] // ......................................................*.. + // mul v30.8H, v13.8H, v20.8H // ........................................................* + + sub count, count, #1 +layer4567_start: + // Instructions: 72 + // Expected cycles: 64 + // Expected IPC: 1.12 + // + // Cycle bound: 64.0 + // IPC bound: 1.12 + // + // Wall time: 41.55s + // User time: 41.55s + // + // ---------------------- cycle (expected) -----------------------> + // 0 25 50 + // |------------------------|------------------------|------------- + ldr q18, [x1, #112] // e............................................................... + ldr q0, [x3], #16 // e............................................................... 
+ sqrdmulh v26.8H, v28.8H, v29.8H // *............................................................... + ldr q14, [x4, #16] // .e.............................................................. + ldr q29, [x4, #80] // .e.............................................................. + sqrdmulh v6.8H, v13.8H, v1.8H // ..*............................................................. + ldr q4, [x1, #96] // ..e............................................................. + ldr q27, [x4], #(6*16) // ..e............................................................. + ldr q23, [x1, #64] // ...e............................................................ + ldr q20, [x4, #-64] // ...e............................................................ + sqrdmulh v19.8H, v18.8H, v0.H[1] // ....e........................................................... + ldr q1, [x4, #-48] // ....e........................................................... + add v5.8H, v17.8H, v25.8H // .....*.......................................................... + sub v25.8H, v17.8H, v25.8H // ......*......................................................... + sqrdmulh v3.8H, v4.8H, v0.H[1] // ......e......................................................... + mul v13.8H, v18.8H, v0.H[0] // ........e....................................................... + mls v13.8H, v19.8H, v7.H[0] // ..........e..................................................... + ldr q19, [x1, #80] // ...........e.................................................... + mls v30.8H, v6.8H, v7.H[0] // ............*................................................... + mul v11.8H, v4.8H, v0.H[0] // ..............e................................................. + add v6.8H, v19.8H, v13.8H // ...............e................................................ + sub v18.8H, v19.8H, v13.8H // ................e............................................... + mls v11.8H, v3.8H, v7.H[0] // ................e............................................... 
+ sub v3.8H, v5.8H, v30.8H // .................*.............................................. + add v2.8H, v5.8H, v30.8H // ..................*............................................. + sqrdmulh v17.8H, v6.8H, v0.H[3] // ..................e............................................. + mul v30.8H, v18.8H, v0.H[4] // ....................e........................................... + add v15.8H, v23.8H, v11.8H // .....................e.......................................... + sub v16.8H, v23.8H, v11.8H // ......................e......................................... + mul v6.8H, v6.8H, v0.H[2] // ......................e......................................... + mls v6.8H, v17.8H, v7.H[0] // ........................e....................................... + sqrdmulh v24.8H, v18.8H, v0.H[5] // ..........................e..................................... + mul v17.8H, v28.8H, v10.8H // ............................*................................... + ldr q10, [x4, #-32] // ............................e................................... + add v4.8H, v15.8H, v6.8H // .............................e.................................. + sub v6.8H, v15.8H, v6.8H // ..............................e................................. + mls v17.8H, v26.8H, v7.H[0] // ..............................*................................. + mls v30.8H, v24.8H, v7.H[0] // ................................e............................... + trn1 v15.4S, v4.4S, v6.4S // .................................e.............................. + trn2 v18.4S, v4.4S, v6.4S // ..................................e............................. + sqdmulh v13.8H, v2.8H, v7.H[1] // ..................................*............................. + sub v5.8H, v25.8H, v17.8H // ...................................*............................ + add v4.8H, v25.8H, v17.8H // ....................................*........................... 
+ sqdmulh v19.8H, v3.8H, v7.H[1] // ....................................*........................... + sub v22.8H, v16.8H, v30.8H // .....................................e.......................... + add v6.8H, v16.8H, v30.8H // ......................................e......................... + sqdmulh v21.8H, v5.8H, v7.H[1] // ......................................*......................... + srshr v31.8H, v13.8H, #11 // .......................................*........................ + sqdmulh v28.8H, v4.8H, v7.H[1] // ........................................*....................... + srshr v11.8H, v19.8H, #11 // .........................................*...................... + mls v2.8H, v31.8H, v7.H[0] // ..........................................*..................... + trn2 v25.4S, v6.4S, v22.4S // ..........................................e..................... + srshr v12.8H, v21.8H, #11 // ...........................................*.................... + trn1 v24.4S, v6.4S, v22.4S // ............................................e................... + mls v3.8H, v11.8H, v7.H[0] // ............................................*................... + trn2 v23.2D, v18.2D, v25.2D // .............................................e.................. + trn1 v30.2D, v18.2D, v25.2D // ..............................................e................. + mls v5.8H, v12.8H, v7.H[0] // ..............................................*................. + trn2 v26.2D, v15.2D, v24.2D // ...............................................e................ + trn1 v17.2D, v15.2D, v24.2D // ................................................e............... + sqrdmulh v9.8H, v23.8H, v14.8H // ................................................e............... + srshr v19.8H, v28.8H, #11 // .................................................*.............. + sqrdmulh v22.8H, v26.8H, v14.8H // ..................................................e............. 
+ mul v21.8H, v23.8H, v27.8H // ....................................................e........... + mls v21.8H, v9.8H, v7.H[0] // ......................................................e......... + mls v4.8H, v19.8H, v7.H[0] // ........................................................*....... + mul v25.8H, v26.8H, v27.8H // ..........................................................e..... + add v13.8H, v30.8H, v21.8H // ...........................................................e.... + sub v28.8H, v30.8H, v21.8H // ............................................................e... + mls v25.8H, v22.8H, v7.H[0] // ............................................................e... + st4 {v2.4S, v3.4S, v4.4S, v5.4S}, [x1], #64 // .............................................................*.. + mul v30.8H, v13.8H, v20.8H // ..............................................................e. + + // ----------------------------------------------------- cycle (expected) ------------------------------------------------------> + // 0 25 50 75 100 + // |------------------------|------------------------|------------------------|------------------------|------------------------- + // ldr q8, [x1, #(16*0)] // ...e............................................................'..~.......................................................... + // ldr q9, [x1, #(16*1)] // ...........e....................................................'..........~.................................................. + // ldr q10, [x1, #(16*2)] // ..e.............................................................'.~........................................................... + // ldr q11, [x1, #(16*3)] // e...............................................................~............................................................. + // ldr q0, [x3], #16 // e...............................................................~............................................................. 
+ // sqrdmulh v27.8h, v10.8h, v0.h[1] // ......e.........................................................'.....~....................................................... + // mul v24.8h, v10.8h, v0.h[0] // ..............e.................................................'.............~............................................... + // mls v24.8h, v27.8h, v7.h[0] // ................e...............................................'...............~............................................. + // sub v10.8h, v8.8h, v24.8h // ......................e.........................................'.....................~....................................... + // add v8.8h, v8.8h, v24.8h // .....................e..........................................'....................~........................................ + // sqrdmulh v27.8h, v11.8h, v0.h[1] // ....e...........................................................'...~......................................................... + // mul v24.8h, v11.8h, v0.h[0] // ........e.......................................................'.......~..................................................... + // mls v24.8h, v27.8h, v7.h[0] // ..........e.....................................................'.........~................................................... + // sub v11.8h, v9.8h, v24.8h // ................e...............................................'...............~............................................. + // add v9.8h, v9.8h, v24.8h // ...............e................................................'..............~.............................................. + // sqrdmulh v27.8h, v9.8h, v0.h[3] // ..................e.............................................'.................~........................................... + // mul v24.8h, v9.8h, v0.h[2] // ......................e.........................................'.....................~....................................... 
+ // mls v24.8h, v27.8h, v7.h[0] // ........................e.......................................'.......................~..................................... + // sub v9.8h, v8.8h, v24.8h // ..............................e.................................'.............................~............................... + // add v8.8h, v8.8h, v24.8h // .............................e..................................'............................~................................ + // sqrdmulh v27.8h, v11.8h, v0.h[5] // ..........................e.....................................'.........................~................................... + // mul v24.8h, v11.8h, v0.h[4] // ....................e...........................................'...................~......................................... + // mls v24.8h, v27.8h, v7.h[0] // ................................e...............................'...............................~............................. + // sub v11.8h, v10.8h, v24.8h // .....................................e..........................'....................................~........................ + // add v10.8h, v10.8h, v24.8h // ......................................e.........................'.....................................~....................... + // trn1 v25.4s, v8.4s, v9.4s // .................................e..............................'................................~............................ + // trn2 v26.4s, v8.4s, v9.4s // ..................................e.............................'.................................~........................... + // trn1 v27.4s, v10.4s, v11.4s // ............................................e...................'...........................................~................. + // trn2 v28.4s, v10.4s, v11.4s // ..........................................e.....................'.........................................~................... 
+ // trn2 v10.2d, v25.2d, v27.2d // ...............................................e................'..............................................~.............. + // trn2 v11.2d, v26.2d, v28.2d // .............................................e..................'............................................~................ + // trn1 v8.2d, v25.2d, v27.2d // ................................................e...............'...............................................~............. + // trn1 v9.2d, v26.2d, v28.2d // ..............................................e.................'.............................................~............... + // ldr q0, [x4], #(6*16) // ..e.............................................................'.~........................................................... + // ldr q4, [x4, #(-6*16 + 1*16)] // .e..............................................................'~............................................................ + // ldr q1, [x4, #(-6*16 + 2*16)] // ...e............................................................'..~.......................................................... + // ldr q5, [x4, #(-6*16 + 3*16)] // ....e...........................................................'...~......................................................... + // ldr q2, [x4, #(-6*16 + 4*16)] // ............................e...................................'...........................~................................. + // ldr q6, [x4, #(-6*16 + 5*16)] // .e..............................................................'~............................................................ + // sqrdmulh v27.8h, v10.8h, v4.8h // ..................................................e.............'.................................................~........... + // mul v24.8h, v10.8h, v0.8h // ..........................................................e.....'.........................................................~... 
+ // mls v24.8h, v27.8h, v7.h[0] // ............................................................e...'...........................................................~. + // sub v10.8h, v8.8h, v24.8h // ......~.........................................................'.....*....................................................... + // add v8.8h, v8.8h, v24.8h // .....~..........................................................'....*........................................................ + // sqrdmulh v27.8h, v11.8h, v4.8h // ................................................e...............'...............................................~............. + // mul v24.8h, v11.8h, v0.8h // ....................................................e...........'...................................................~......... + // mls v24.8h, v27.8h, v7.h[0] // ......................................................e.........'.....................................................~....... + // sub v11.8h, v9.8h, v24.8h // ............................................................e...'...........................................................~. + // add v9.8h, v9.8h, v24.8h // ...........................................................e....'..........................................................~.. + // sqrdmulh v27.8h, v9.8h, v5.8h // ..~.............................................................'.*........................................................... + // mul v24.8h, v9.8h, v1.8h // ..............................................................e.'............................................................. + // mls v24.8h, v27.8h, v7.h[0] // ............~...................................................'...........*................................................. + // sub v9.8h, v8.8h, v24.8h // .................~..............................................'................*............................................ 
+ // add v8.8h, v8.8h, v24.8h // ..................~.............................................'.................*........................................... + // sqrdmulh v27.8h, v11.8h, v6.8h // ~...............................................................*............................................................. + // mul v24.8h, v11.8h, v2.8h // ............................~...................................'...........................*................................. + // mls v24.8h, v27.8h, v7.h[0] // ..............................~.................................'.............................*............................... + // sub v11.8h, v10.8h, v24.8h // ...................................~............................'..................................*.......................... + // add v10.8h, v10.8h, v24.8h // ....................................~...........................'...................................*......................... + // sqdmulh v25.8h, v8.8h, v7.h[1] // ..................................~.............................'.................................*........................... + // srshr v25.8h, v25.8h, #11 // .......................................~........................'......................................*...................... + // mls v8.8h, v25.8h, v7.h[0] // ..........................................~.....................'.........................................*................... + // sqdmulh v25.8h, v9.8h, v7.h[1] // ....................................~...........................'...................................*......................... + // srshr v25.8h, v25.8h, #11 // .........................................~......................'........................................*.................... + // mls v9.8h, v25.8h, v7.h[0] // ............................................~...................'...........................................*................. 
+ // sqdmulh v25.8h, v10.8h, v7.h[1] // ........................................~.......................'.......................................*..................... + // srshr v25.8h, v25.8h, #11 // .................................................~..............'................................................*............ + // mls v10.8h, v25.8h, v7.h[0] // ........................................................~.......'.......................................................*..... + // sqdmulh v25.8h, v11.8h, v7.h[1] // ......................................~.........................'.....................................*....................... + // srshr v25.8h, v25.8h, #11 // ...........................................~....................'..........................................*.................. + // mls v11.8h, v25.8h, v7.h[0] // ..............................................~.................'.............................................*............... + // st4 {v8.4S, v9.4S, v10.4S, v11.4S}, [x1], #64 // .............................................................~..'............................................................* + + sub count, count, #1 + cbnz count, layer4567_start + // Instructions: 24 + // Expected cycles: 33 + // Expected IPC: 0.73 + // + // Cycle bound: 33.0 + // IPC bound: 0.73 + // + // Wall time: 0.16s + // User time: 0.16s + // + // ------- cycle (expected) -------> + // 0 25 + // |------------------------|------- + sqrdmulh v5.8H, v13.8H, v1.8H // *................................ + add v18.8H, v17.8H, v25.8H // *................................ + sub v6.8H, v17.8H, v25.8H // .*............................... + sqrdmulh v11.8H, v28.8H, v29.8H // ..*.............................. + mls v30.8H, v5.8H, v7.H[0] // .....*........................... + mul v5.8H, v28.8H, v10.8H // .......*......................... + mls v5.8H, v11.8H, v7.H[0] // .........*....................... 
+ add v12.8H, v18.8H, v30.8H // ..........*...................... + sub v13.8H, v18.8H, v30.8H // ...........*..................... + sqdmulh v18.8H, v12.8H, v7.H[1] // .............*................... + add v14.8H, v6.8H, v5.8H // ..............*.................. + sub v15.8H, v6.8H, v5.8H // ...............*................. + sqdmulh v5.8H, v13.8H, v7.H[1] // ...............*................. + sqdmulh v6.8H, v14.8H, v7.H[1] // .................*............... + srshr v18.8H, v18.8H, #11 // ..................*.............. + sqdmulh v1.8H, v15.8H, v7.H[1] // ...................*............. + srshr v5.8H, v5.8H, #11 // ....................*............ + mls v12.8H, v18.8H, v7.H[0] // .....................*........... + srshr v17.8H, v6.8H, #11 // ......................*.......... + mls v13.8H, v5.8H, v7.H[0] // .......................*......... + srshr v5.8H, v1.8H, #11 // ........................*........ + mls v14.8H, v17.8H, v7.H[0] // .........................*....... + mls v15.8H, v5.8H, v7.H[0] // ...........................*..... + st4 {v12.4S, v13.4S, v14.4S, v15.4S}, [x1], #64 // ................................* + + // ------- cycle (expected) -------> + // 0 25 + // |------------------------|------- + // sqrdmulh v26.8H, v28.8H, v29.8H // ..*.............................. + // sqrdmulh v6.8H, v13.8H, v1.8H // *................................ + // add v5.8H, v17.8H, v25.8H // *................................ + // sub v25.8H, v17.8H, v25.8H // .*............................... + // mls v30.8H, v6.8H, v7.H[0] // .....*........................... + // sub v3.8H, v5.8H, v30.8H // ...........*..................... + // add v2.8H, v5.8H, v30.8H // ..........*...................... + // mul v17.8H, v28.8H, v10.8H // .......*......................... + // mls v17.8H, v26.8H, v7.H[0] // .........*....................... + // sqdmulh v13.8H, v2.8H, v7.H[1] // .............*................... 
+        // sub     v5.8H, v25.8H, v17.8H                     // ...............*.................
+        // add     v4.8H, v25.8H, v17.8H                     // ..............*..................
+        // sqdmulh v19.8H, v3.8H, v7.H[1]                    // ...............*.................
+        // sqdmulh v21.8H, v5.8H, v7.H[1]                    // ...................*.............
+        // srshr   v31.8H, v13.8H, #11                       // ..................*..............
+        // sqdmulh v28.8H, v4.8H, v7.H[1]                    // .................*...............
+        // srshr   v11.8H, v19.8H, #11                       // ....................*............
+        // mls     v2.8H, v31.8H, v7.H[0]                    // .....................*...........
+        // srshr   v12.8H, v21.8H, #11                       // ........................*........
+        // mls     v3.8H, v11.8H, v7.H[0]                    // .......................*.........
+        // mls     v5.8H, v12.8H, v7.H[0]                    // ...........................*.....
+        // srshr   v19.8H, v28.8H, #11                       // ......................*..........
+        // mls     v4.8H, v19.8H, v7.H[0]                    // .........................*.......
+        // st4     {v2.4S, v3.4S, v4.4S, v5.4S}, [x1], #64  // ................................*
+
+
+        pop_stack
+        ret
+
+#endif /* MLKEM_USE_AARCH64_ASM */
diff --git a/mlkem/asm/aarch64/optimize.sh b/mlkem/asm/aarch64/optimize.sh
new file mode 100755
index 000000000..c154e9344
--- /dev/null
+++ b/mlkem/asm/aarch64/optimize.sh
@@ -0,0 +1,51 @@
+#!/usr/bin/env sh
+# SPDX-License-Identifier: Apache-2.0
+#
+# Regenerates the *_opt.S kernels from the corresponding *_clean.S sources
+# by running slothy-cli with the Arm_Cortex_A72_frontend target.
+
+set -e
+
+echo " * Polynomial reduction, Cortex-A72"
+
+# poly_reduce_asm_* has no prologue/epilogue, so the callee-saved v8-v15
+# (AAPCS64) stay reserved. v0-v7 plus v16/v17 leave the scheduler the same
+# ten vector registers it could use before.
+slothy-cli Arm_AArch64 Arm_Cortex_A72_frontend \
+    poly_clean.S -o poly_opt.S \
+    -r poly_reduce_asm_clean,poly_reduce_asm_opt \
+    -l loop_start \
+    -c sw_pipelining.enabled=true \
+    -c inputs_are_outputs \
+    -c reserved_regs="[x0--30,v8--v15,v18--v31,sp]" \
+    -c sw_pipelining.minimize_overlapping=False \
+    -c variable_size \
+    -c constraints.stalls_first_attempt=64
+
+echo " * Forward NTT, Cortex-A72"
+
+slothy-cli Arm_AArch64 Arm_Cortex_A72_frontend \
+    ntt_clean.S -o ntt_opt.S \
+    -r ntt_asm_clean,ntt_asm_opt \
+    -l layer123_start \
+    -l layer4567_start \
+    -c sw_pipelining.enabled=true \
+    -c inputs_are_outputs \
+    -c reserved_regs="[x18--30,sp]" \
+    -c sw_pipelining.minimize_overlapping=False \
+    -c variable_size \
+    -c constraints.stalls_first_attempt=64
+
+echo " * Inverse NTT, Cortex-A72"
+
+slothy-cli Arm_AArch64 Arm_Cortex_A72_frontend \
+    intt_clean.S -o intt_opt.S \
+    -r intt_asm_clean,intt_asm_opt \
+    -l layer123_start \
+    -l layer4567_start \
+    -c sw_pipelining.enabled=true \
+    -c inputs_are_outputs \
+    -c reserved_regs="[x18--30,sp]" \
+    -c sw_pipelining.minimize_overlapping=False \
+    -c variable_size \
+    -c constraints.stalls_first_attempt=64
diff --git a/mlkem/asm/aarch64/poly_clean.S b/mlkem/asm/aarch64/poly_clean.S
new file mode 100644
index 000000000..a33377ad8
--- /dev/null
+++ b/mlkem/asm/aarch64/poly_clean.S
@@ -0,0 +1,71 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#include "config.h"
+#if defined(MLKEM_USE_AARCH64_ASM)
+
+// Needed to provide ASM_LOAD directive
+#include "common.i"
+
+.macro barrett_reduce a
+        sqdmulh t0.8h, \a\().8h, consts.h[1]
+        srshr   t0.8h, t0.8h, #11
+        mls     \a\().8h, t0.8h, consts.h[0]
+.endm
+
+.global poly_reduce_asm_clean
+.global _poly_reduce_asm_clean
+
+.p2align 4
+const_addr:
+        .short 3329     // consts.h[0]: modulus subtracted by the final mls
+        .short 20159    // consts.h[1]: Barrett multiplier, round(2^26 / 3329)
+        .short 0
+        .short 0
+        .short 0
+        .short 0
+        .short 0
+        .short 0
+
+        ptr     .req x0
+        count   .req x1
+        xtmp    .req x2
+
+        q_data  .req q0
+        data    .req v0
+        t0      .req v1
+        consts  .req v2
+
+// void poly_reduce_asm_clean(int16_t *ptr)
+//
+// Barrett-reduces the coefficients at ptr (x0) in place, 64 bytes per loop
+// iteration for 8 iterations. Only x1, x2 and v0-v2 are used as scratch.
+poly_reduce_asm_clean:
+_poly_reduce_asm_clean:
+
+        ASM_LOAD(xtmp, const_addr)
+        ld1 {consts.8h}, [xtmp]
+
+        mov count, #8
+loop_start:
+        ldr q_data, [ptr], #64
+        barrett_reduce data
+        str q_data, [ptr, #-64]
+
+        ldr q_data, [ptr, #-48]
+        barrett_reduce data
+        str q_data, [ptr, #-48]
+
+        ldr q_data, [ptr, #-32]
+        barrett_reduce data
+        str q_data, [ptr, #-32]
+
+        ldr q_data, [ptr, #-16]
+        barrett_reduce data
+        str q_data, [ptr, #-16]
+
+        sub count, count, #1    // cbnz tests the register itself, so no flag-setting subs
+        cbnz count, loop_start
+
+        ret
+
+#endif /* MLKEM_USE_AARCH64_ASM */
diff --git a/mlkem/asm/aarch64/poly_opt.S b/mlkem/asm/aarch64/poly_opt.S
new file mode 100644
index 000000000..7afe40e48
---
/dev/null
+++ b/mlkem/asm/aarch64/poly_opt.S
@@ -0,0 +1,195 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#include "config.h"
+#if defined(MLKEM_USE_AARCH64_ASM)
+
+// Needed to provide ASM_LOAD directive
+#include "common.i"
+
+.macro barrett_reduce a
+        sqdmulh t0.8h, \a\().8h, consts.h[1]
+        srshr   t0.8h, t0.8h, #11
+        mls     \a\().8h, t0.8h, consts.h[0]
+.endm
+
+.global poly_reduce_asm_opt
+.global _poly_reduce_asm_opt
+
+.p2align 4
+const_addr:
+        .short 3329     // consts.h[0]: modulus subtracted by the final mls
+        .short 20159    // consts.h[1]: Barrett multiplier, round(2^26 / 3329)
+        .short 0
+        .short 0
+        .short 0
+        .short 0
+        .short 0
+        .short 0
+
+        ptr     .req x0
+        count   .req x1
+        xtmp    .req x2
+
+        q_data  .req q0
+        data    .req v0
+        t0      .req v1
+        consts  .req v2
+
+// void poly_reduce_asm_opt(int16_t *ptr)
+//
+// Barrett-reduces the coefficients at ptr (x0) in place, 64 bytes per
+// block for 8 blocks. The work is software-pipelined: the preamble starts
+// the first block, the loop body runs 7 times, the postamble finishes the
+// last block.
+//
+// Vector scratch is confined to v0-v7 and v16-v17: v8-v15 are callee-saved
+// under AAPCS64 and nothing here saves or restores them.
+poly_reduce_asm_opt:
+_poly_reduce_asm_opt:
+
+        ASM_LOAD(xtmp, const_addr)
+        ld1 {consts.8h}, [xtmp]
+
+        mov count, #8
+        // Instructions:    6
+        // Expected cycles: 7
+        // Expected IPC:    0.86
+        //
+        // Cycle bound:     7.0
+        // IPC bound:       0.86
+        //
+        // Wall time:     0.00s
+        // User time:     0.00s
+        //
+        //                                     ----- cycle (expected) ------>
+        //                                     0                        25
+        //                                     |------------------------|----
+        ldr     q3, [x0, #48]               // *.............................
+        ldr     q6, [x0], #64               // .*............................
+        ldr     q7, [x0, #-32]              // ..*...........................
+        ldr     q5, [x0, #-48]              // ...*..........................
+        sqdmulh v0.8H, v3.8H, v2.H[1]       // ....*.........................
+        sqdmulh v1.8H, v6.8H, v2.H[1]       // ......*.......................
+
+        //                                        ------ cycle (expected) ------>
+        //                                        0                        25
+        //                                        |------------------------|-----
+        // ldr     q3, [x0, #48]               // *..............................
+        // ldr     q6, [x0], #64               // .*.............................
+        // sqdmulh v0.8H, v3.8H, v2.H[1]       // ....*..........................
+        // ldr     q7, [x0, #-32]              // ..*............................
+        // sqdmulh v1.8H, v6.8H, v2.H[1]       // ......*........................
+        // ldr     q5, [x0, #-48]              // ...*...........................
+
+        sub count, count, #1
+loop_start:
+        // Instructions:    20
+        // Expected cycles: 17
+        // Expected IPC:    1.18
+        //
+        // Cycle bound:     17.0
+        // IPC bound:       1.18
+        //
+        // Wall time:     0.17s
+        // User time:     0.17s
+        //
+        //                                     ----- cycle (expected) ------>
+        //                                     0                        25
+        //                                     |------------------------|----
+        sqdmulh v17.8H, v7.8H, v2.H[1]      // .*............................
+        srshr   v4.8H, v0.8H, #11           // ..*...........................
+        sqdmulh v0.8H, v5.8H, v2.H[1]       // ...*..........................
+        srshr   v16.8H, v1.8H, #11          // ....*.........................
+        mls     v3.8H, v4.8H, v2.H[0]       // .....*........................
+        srshr   v17.8H, v17.8H, #11         // ......*.......................
+        mls     v6.8H, v16.8H, v2.H[0]      // .......*......................
+        srshr   v1.8H, v0.8H, #11           // ........*.....................
+        mls     v7.8H, v17.8H, v2.H[0]      // .........*....................
+        str     q3, [x0, #-16]              // ..........*...................
+        ldr     q3, [x0, #48]               // ..........e...................
+        mls     v5.8H, v1.8H, v2.H[0]       // ...........*..................
+        str     q6, [x0, #-64]              // ............*.................
+        ldr     q6, [x0], #64               // ............e.................
+        str     q7, [x0, #-96]              // ..............*...............
+        sqdmulh v0.8H, v3.8H, v2.H[1]       // ..............e...............
+        ldr     q7, [x0, #-32]              // ..............e...............
+        str     q5, [x0, #-112]             // ................*.............
+        sqdmulh v1.8H, v6.8H, v2.H[1]       // ................e.............
+        ldr     q5, [x0, #-48]              // ................e.............
+
+        //                                        ------ cycle (expected) ------>
+        //                                        0                        25
+        //                                        |------------------------|-----
+        // ldr     q0, [x0], #64               // ..e....'...........~....'......
+        // sqdmulh v1.8h, v0.8h, v2.h[1]       // ......e'...............~'......
+        // srshr   v1.8h, v1.8h, #11           // .......'...*............'...~..
+        // mls     v0.8h, v1.8h, v2.h[0]       // .......'......*.........'......
+        // str     q0, [x0, #-64]              // ..~....'...........*....'......
+        // ldr     q0, [x0, #-48]              // ......e'...............~'......
+        // sqdmulh v1.8h, v0.8h, v2.h[1]       // .......'..*.............'..~...
+        // srshr   v1.8h, v1.8h, #11           // .......'.......*........'......
+        // mls     v0.8h, v1.8h, v2.h[0]       // .~.....'..........*.....'......
+        // str     q0, [x0, #-48]              // ......~'...............*'......
+        // ldr     q0, [x0, #-32]              // ....e..'.............~..'......
+        // sqdmulh v1.8h, v0.8h, v2.h[1]       // .......'*...............'~.....
+        // srshr   v1.8h, v1.8h, #11           // .......'.....*..........'......
+        // mls     v0.8h, v1.8h, v2.h[0]       // .......'........*.......'......
+        // str     q0, [x0, #-32]              // ....~..'.............*..'......
+        // ldr     q0, [x0, #-16]              // e......'.........~......'......
+        // sqdmulh v1.8h, v0.8h, v2.h[1]       // ....e..'.............~..'......
+        // srshr   v1.8h, v1.8h, #11           // .......'.*..............'.~....
+        // mls     v0.8h, v1.8h, v2.h[0]       // .......'....*...........'....~.
+        // str     q0, [x0, #-16]              // ~......'.........*......'......
+
+        sub count, count, #1
+        cbnz count, loop_start
+        // Instructions:    14
+        // Expected cycles: 16
+        // Expected IPC:    0.88
+        //
+        // Cycle bound:     16.0
+        // IPC bound:       0.88
+        //
+        // Wall time:     0.01s
+        // User time:     0.01s
+        //
+        //                                     ----- cycle (expected) ------>
+        //                                     0                        25
+        //                                     |------------------------|----
+        srshr   v17.8H, v0.8H, #11          // *.............................
+        sqdmulh v4.8H, v7.8H, v2.H[1]       // *.............................
+        srshr   v0.8H, v1.8H, #11           // .*............................
+        sqdmulh v16.8H, v5.8H, v2.H[1]      // ..*...........................
+        mls     v6.8H, v0.8H, v2.H[0]       // ....*.........................
+        srshr   v0.8H, v4.8H, #11           // .....*........................
+        mls     v3.8H, v17.8H, v2.H[0]      // ......*.......................
+        srshr   v17.8H, v16.8H, #11         // .......*......................
+        mls     v7.8H, v0.8H, v2.H[0]       // ........*.....................
+        str     q6, [x0, #-64]              // .........*....................
+        mls     v5.8H, v17.8H, v2.H[0]      // ..........*...................
+        str     q3, [x0, #-16]              // ...........*..................
+        str     q7, [x0, #-32]              // .............*................
+        str     q5, [x0, #-48]              // ...............*..............
+
+        //                                        ------ cycle (expected) ------>
+        //                                        0                        25
+        //                                        |------------------------|-----
+        // sqdmulh v17.8H, v7.8H, v2.H[1]      // *..............................
+        // srshr   v4.8H, v0.8H, #11           // *..............................
+        // sqdmulh v0.8H, v5.8H, v2.H[1]       // ..*............................
+        // srshr   v16.8H, v1.8H, #11          // .*.............................
+        // mls     v3.8H, v4.8H, v2.H[0]       // ......*........................
+        // srshr   v17.8H, v17.8H, #11         // .....*.........................
+        // mls     v6.8H, v16.8H, v2.H[0]      // ....*..........................
+        // srshr   v1.8H, v0.8H, #11           // .......*.......................
+        // mls     v7.8H, v17.8H, v2.H[0]      // ........*......................
+        // str     q3, [x0, #-16]              // ...........*...................
+        // mls     v5.8H, v1.8H, v2.H[0]       // ..........*....................
+        // str     q6, [x0, #-64]              // .........*.....................
+        // str     q7, [x0, #-32]              // .............*.................
+        // str     q5, [x0, #-48]              // ...............*...............
+
+
+        ret
+
+#endif /* MLKEM_USE_AARCH64_ASM */
diff --git a/mlkem/asm/asm.h b/mlkem/asm/asm.h
index 083db0634..156e37ff4 100644
--- a/mlkem/asm/asm.h
+++ b/mlkem/asm/asm.h
@@ -1,14 +1,33 @@
 // SPDX-License-Identifier: Apache-2.0
-#ifndef ASM_H
-#define ASM_H
+#ifndef MLKEM_ASM_H
+#define MLKEM_ASM_H
 
 #include <stdint.h>
 #include "params.h"
 #include "config.h"
 
 #ifdef MLKEM_USE_AARCH64_ASM
-void ntt_kyber_123_4567(int16_t *);
-void intt_kyber_123_4567(int16_t *);
+void ntt_asm_clean(int16_t *);
+void ntt_asm_opt(int16_t *);
+void intt_asm_clean(int16_t *);
+void intt_asm_opt(int16_t *);
 #endif /* MLKEM_USE_AARCH64_ASM */
 
-#endif
+void poly_reduce_asm_clean(int16_t *);
+void poly_reduce_asm_opt(int16_t *);
+
+#if !defined(MLKEM_USE_NTT_ASM_FORCE)
+
+#if defined(MLKEM_USE_NTT_ASM_CLEAN)
+#define ntt_asm ntt_asm_clean
+#define intt_asm intt_asm_clean
+#define poly_reduce_asm poly_reduce_asm_clean
+#else /* MLKEM_USE_NTT_ASM_CLEAN */
+#define ntt_asm ntt_asm_opt
+#define intt_asm intt_asm_opt
+#define poly_reduce_asm poly_reduce_asm_opt
+#endif /* !MLKEM_USE_NTT_ASM_CLEAN */
+
+#endif /* !MLKEM_USE_NTT_ASM_FORCE */
+
+#endif /* MLKEM_ASM_H */
diff --git a/mlkem/ntt.c b/mlkem/ntt.c
index 3a6496f5b..aabd098ec 100644
--- a/mlkem/ntt.c
+++ b/mlkem/ntt.c
@@ -82,7 +82,7 @@ static int16_t fqmul(int16_t a, int16_t b)
 void ntt(int16_t r[256])
 {
 #ifdef MLKEM_USE_AARCH64_ASM
-  ntt_kyber_123_4567(r);
+  ntt_asm(r);
 #else /* MLKEM_USE_AARCH64_ASM */
   unsigned int len, start, j, k;
   int16_t t, zeta;
@@ -117,7 +117,7 @@ void ntt(int16_t r[256])
 void invntt(int16_t r[256])
 {
 #ifdef MLKEM_USE_AARCH64_ASM
-  intt_kyber_123_4567(r);
+  intt_asm(r);
 #else /* MLKEM_USE_AARCH64_ASM */
   unsigned int start, len, j, k;
   int16_t t, zeta;
diff --git a/mlkem/poly.c b/mlkem/poly.c
index 14fee1591..04213ba5f 100644
--- a/mlkem/poly.c
+++ b/mlkem/poly.c
@@ -11,6 +11,8 @@
 #include "verify.h"
 #include "fips202x4.h"
 
+#include "asm/asm.h"
+
 /************************************************************
  * Name:
scalar_compress_q_16
  *
@@ -612,6 +614,7 @@ void poly_tomont(poly *r)
  *
  * Arguments:   - poly *r: pointer to input/output polynomial
  **************************************************/
+#if !defined(MLKEM_USE_AARCH64_ASM)
 void poly_reduce(poly *r)
 {
   unsigned int i;
@@ -620,6 +623,14 @@ void poly_reduce(poly *r)
     r->coeffs[i] = barrett_reduce(r->coeffs[i]);
   }
 }
+#else
+void poly_reduce(poly *r)
+{
+  /* Hand over the coefficient array itself; unlike a cast of the struct
+   * pointer, this does not depend on where coeffs sits inside poly. */
+  poly_reduce_asm(r->coeffs);
+}
+#endif /* !MLKEM_USE_AARCH64_ASM */
 
 /*************************************************
  * Name:        poly_add