From 880d2a8ea59803cad55ae5ee069a1f7acb91378d Mon Sep 17 00:00:00 2001
From: "Matthias J. Kannwischer"
Date: Wed, 18 Dec 2024 17:12:08 +0800
Subject: [PATCH] M7: caddq_dilithium

---
 example.py                                   |  19 +++
 examples/naive/armv7m/caddq_dilithium.s      |  52 ++++++++
 examples/opt/armv7m/caddq_dilithium_opt_m7.s | 120 +++++++++++++++++++
 3 files changed, 191 insertions(+)
 create mode 100644 examples/naive/armv7m/caddq_dilithium.s
 create mode 100644 examples/opt/armv7m/caddq_dilithium_opt_m7.s

diff --git a/example.py b/example.py
index 461de5e5..a2ddb793 100644
--- a/example.py
+++ b/example.py
@@ -1701,6 +1701,24 @@ def core(self, slothy):
         slothy.config.sw_pipelining.enabled = True
         slothy.optimize_loop("1")
 
+class caddq_dilithium(Example):  # example wrapper for the caddq_dilithium assembly loop
+    def __init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7, timeout=None):
+        name = "caddq_dilithium"  # base name; suffixes are appended below
+        infile = name  # passed to Example as `infile`; presumably the source file stem - confirm against Example
+        funcname = "pqcrystals_dilithium_asm_caddq"  # same as the .global symbol in caddq_dilithium.s
+
+        if var != "":  # optional variant tag goes on both the example name and the input name
+            name += f"_{var}"
+            infile += f"_{var}"
+        name += f"_{target_label_dict[target]}"  # only `name` gets the per-target label, not `infile`
+
+        super().__init__(infile, name, rename=True, arch=arch, target=target, timeout=timeout, funcname=funcname)
+
+    def core(self, slothy):
+        slothy.config.outputs = ["r10"]  # r10 is the loop counter register in caddq_dilithium.s
+        slothy.config.inputs_are_outputs = True
+        slothy.config.sw_pipelining.enabled = True
+        slothy.optimize_loop("1")  # "1" is the local label that opens the loop in the assembly
 def main():
     examples = [ Example0(),
                  Example1(),
@@ -1864,6 +1882,7 @@ def main():
                  pointwise_769_dilithium(),
                  pointwise_769_asymmetric_dilithium(),
                  reduce32_dilithium(),
+                 caddq_dilithium(),
                  ]
 
     all_example_names = [e.name for e in examples]
diff --git a/examples/naive/armv7m/caddq_dilithium.s b/examples/naive/armv7m/caddq_dilithium.s
new file mode 100644
index 00000000..8908ffbb
--- /dev/null
+++ b/examples/naive/armv7m/caddq_dilithium.s
@@ -0,0 +1,52 @@
+.syntax unified
+.thumb
+
+.macro caddq a, tmp, q
+    and \tmp, \q, \a, asr #31  // tmp = q when a is negative (sign mask is all ones), otherwise 0
+    add \a, \a, \tmp           // a += tmp, so q is added only to negative values
+.endm
+
+// void asm_caddq(int32_t a[N]);
+.global pqcrystals_dilithium_asm_caddq
+.type pqcrystals_dilithium_asm_caddq, %function
+.align 2
+pqcrystals_dilithium_asm_caddq:  // r0 = base of the word array; it is read and written in place
+    push {r4-r11, r14}  // save r4-r11 and lr; the matching pop at the end returns
+
+    movw r12,#:lower16:8380417  // r12 = 8380417, built from two 16-bit halves
+    movt r12,#:upper16:8380417  // r12 is passed as q to every caddq below
+
+    movw r10, #32  // loop counter: 32 iterations x 8 words = 256 words
+    1:  // one iteration handles 8 consecutive 32-bit words
+    ldr.w r1, [r0]  // load words 0..7 of the current group into r1-r8
+    ldr.w r2, [r0, #1*4]
+    ldr.w r3, [r0, #2*4]
+    ldr.w r4, [r0, #3*4]
+    ldr.w r5, [r0, #4*4]
+    ldr.w r6, [r0, #5*4]
+    ldr.w r7, [r0, #6*4]
+    ldr.w r8, [r0, #7*4]
+
+    caddq r1, r9, r12  // r9 is the shared scratch register for all eight expansions
+    caddq r2, r9, r12
+    caddq r3, r9, r12
+    caddq r4, r9, r12
+    caddq r5, r9, r12
+    caddq r6, r9, r12
+    caddq r7, r9, r12
+    caddq r8, r9, r12
+
+    str.w r2, [r0, #1*4]  // write back words 1..7 first; word 0 goes last with the pointer bump
+    str.w r3, [r0, #2*4]
+    str.w r4, [r0, #3*4]
+    str.w r5, [r0, #4*4]
+    str.w r6, [r0, #5*4]
+    str.w r7, [r0, #6*4]
+    str.w r8, [r0, #7*4]
+    str r1, [r0], #8*4  // store word 0, then post-increment r0 by 32 bytes to the next group
+    subs r10, #1  // decrement the counter and set flags for the branch
+    bne.w 1b  // loop until r10 reaches zero
+
+    pop {r4-r11, pc}  // restore r4-r11 and return by loading the saved lr into pc
+
+.size pqcrystals_dilithium_asm_caddq, .-pqcrystals_dilithium_asm_caddq
\ No newline at end of file
diff --git a/examples/opt/armv7m/caddq_dilithium_opt_m7.s b/examples/opt/armv7m/caddq_dilithium_opt_m7.s
new file mode 100644
index 00000000..f259e07b
--- /dev/null
+++ b/examples/opt/armv7m/caddq_dilithium_opt_m7.s
@@ -0,0 +1,120 @@
+.syntax unified
+.thumb
+
+.macro caddq a, tmp, q
+    and \tmp, \q, \a, asr #31  // tmp = q when a is negative (sign mask is all ones), otherwise 0
+    add \a, \a, \tmp           // a += tmp, so q is added only to negative values
+.endm
+
+// void asm_caddq(int32_t a[N]);
+.global pqcrystals_dilithium_asm_caddq_opt_m7
+.type pqcrystals_dilithium_asm_caddq_opt_m7, %function
+.align 2
+pqcrystals_dilithium_asm_caddq_opt_m7:  // r0 = base of the word array; the loop below uses the and/add pairs of caddq written out inline
+        push {r4-r11, r14}  // save r4-r11 and lr (r14 is reused as a temporary in the loop)
+
+        movw r12,#:lower16:8380417  // r12 = 8380417, built from two 16-bit halves
+        movt r12,#:upper16:8380417  // r12 is the q operand of every and below
+
+        movw r10, #32  // loop counter: 32 iterations x 8 words = 256 words
+        // Instructions:    0
+        // Expected cycles: 0
+        // Expected IPC:    0.00
+        //
+        // Wall time:     0.00s
+        // User time:     0.00s
+        //
+1:
+        // Instructions:    33
+        // Expected cycles: 17
+        // Expected IPC:    1.94
+        //
+        // Wall time:     3.57s
+        // User time:     3.57s
+        //
+                                    // ----- cycle (expected) ------>
+                                    // 0                        25
+                                    // |------------------------|----
+        ldr.w r9, [r0]              // *.............................
+        ldr.w r2, [r0, #28]         // *.............................
+        ldr.w r11, [r0, #24]        // .*............................
+        ldr.w r8, [r0, #20]         // .*............................
+        subs r10, #1                // ..*...........................
+        ldr.w r7, [r0, #4]          // ..*...........................
+        and r5, r12, r2, asr #31    // ...*..........................
+        ldr.w r3, [r0, #16]         // ...*..........................
+        add r4, r2, r5              // ....*.........................
+        str.w r4, [r0, #28]         // ....*.........................
+        ldr.w r5, [r0, #12]         // .....*........................
+        and r14, r12, r11, asr #31  // .....*........................
+        add r11, r11, r14           // ......*.......................
+        and r14, r12, r3, asr #31   // ......*.......................
+        add r2, r3, r14             // .......*......................
+        and r3, r12, r9, asr #31    // .......*......................
+        add r3, r9, r3              // ........*.....................
+        str.w r2, [r0, #16]         // ........*.....................
+        ldr.w r14, [r0, #8]         // .........*....................
+        and r6, r12, r5, asr #31    // .........*....................
+        and r2, r12, r8, asr #31    // ..........*...................
+        add r4, r5, r6              // ..........*...................
+        add r6, r8, r2              // ...........*..................
+        str.w r6, [r0, #20]         // ...........*..................
+        and r9, r12, r14, asr #31   // ............*.................
+        str r3, [r0], #8*4          // ............*.................
+        add r6, r14, r9             // .............*................
+        str r11, [r0, #-8]          // .............*................
+        and r11, r12, r7, asr #31   // ..............*...............
+        str r6, [r0, #-24]          // ..............*...............
+        add r6, r7, r11             // ...............*..............
+        str r6, [r0, #-28]          // ...............*..............
+        str r4, [r0, #-20]          // ................*.............
+                                    // ------ cycle (expected) ------>
+                                    // 0                        25
+                                    // |------------------------|-----
+        // ldr.w r1, [r0]           // *................~.............
+        // ldr.w r2, [r0, #1*4]     // ..*..............'.~...........
+        // ldr.w r3, [r0, #2*4]     // .........*.......'........~....
+        // ldr.w r4, [r0, #3*4]     // .....*...........'....~........
+        // ldr.w r5, [r0, #4*4]     // ...*.............'..~..........
+        // ldr.w r6, [r0, #5*4]     // .*...............'~............
+        // ldr.w r7, [r0, #6*4]     // .*...............'~............
+        // ldr.w r8, [r0, #7*4]     // *................~.............
+        // and r9, r12, r1, asr #31 // .......*.........'......~......
+        // add r1, r1, r9           // ........*........'.......~.....
+        // and r9, r12, r2, asr #31 // ..............*..'.............
+        // add r2, r2, r9           // ...............*.'.............
+        // and r9, r12, r3, asr #31 // ............*....'...........~.
+        // add r3, r3, r9           // .............*...'.............
+        // and r9, r12, r4, asr #31 // .........*.......'........~....
+        // add r4, r4, r9           // ..........*......'.........~...
+        // and r9, r12, r5, asr #31 // ......*..........'.....~.......
+        // add r5, r5, r9           // .......*.........'......~......
+        // and r9, r12, r6, asr #31 // ..........*......'.........~...
+        // add r6, r6, r9           // ...........*.....'..........~..
+        // and r9, r12, r7, asr #31 // .....*...........'....~........
+        // add r7, r7, r9           // ......*..........'.....~.......
+        // and r9, r12, r8, asr #31 // ...*.............'..~..........
+        // add r8, r8, r9           // ....*............'...~.........
+        // str.w r2, [r0, #1*4]     // ...............*.'.............
+        // str.w r3, [r0, #2*4]     // ..............*..'.............
+        // str.w r4, [r0, #3*4]     // ................*'.............
+        // str.w r5, [r0, #4*4]     // ........*........'.......~.....
+        // str.w r6, [r0, #5*4]     // ...........*.....'..........~..
+        // str.w r7, [r0, #6*4]     // .............*...'.............
+        // str.w r8, [r0, #7*4]     // ....*............'...~.........
+        // str r1, [r0], #8*4       // ............*....'...........~.
+        // subs r10, #1             // ..*..............'.~...........
+
+        bne 1b  // branches on the flags from the earlier `subs r10, #1`; nothing after it in the body uses a flag-setting form
+        // Instructions:    0
+        // Expected cycles: 0
+        // Expected IPC:    0.00
+        //
+        // Wall time:     0.00s
+        // User time:     0.00s
+        //
+
+        pop {r4-r11, pc}  // restore r4-r11 and return by loading the saved lr into pc
+
+.size pqcrystals_dilithium_asm_caddq_opt_m7, .-pqcrystals_dilithium_asm_caddq_opt_m7
\ No newline at end of file