Skip to content

Commit

Permalink
M7: caddq_dilithium
Browse files Browse the repository at this point in the history
  • Loading branch information
mkannwischer committed Dec 18, 2024
1 parent cef0b76 commit 880d2a8
Show file tree
Hide file tree
Showing 3 changed files with 191 additions and 0 deletions.
19 changes: 19 additions & 0 deletions example.py
Original file line number Diff line number Diff line change
Expand Up @@ -1701,6 +1701,24 @@ def core(self, slothy):
slothy.config.sw_pipelining.enabled = True
slothy.optimize_loop("1")

class caddq_dilithium(Example):
    """Example entry for the `caddq_dilithium` Armv7-M assembly kernel.

    The input file is `caddq_dilithium` (plus `_<var>` when a variant is
    given); the example name additionally carries the label of the selected
    target. The assembly function handled is
    `pqcrystals_dilithium_asm_caddq`, and renaming is requested from the
    base class via `rename=True`.
    """

    def __init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7, timeout=None):
        base = "caddq_dilithium"
        # An empty variant contributes nothing to either name.
        variant_suffix = f"_{var}" if var != "" else ""

        infile = base + variant_suffix
        # The target label is appended to the example name only, not to the
        # input file name.
        name = base + variant_suffix + f"_{target_label_dict[target]}"

        super().__init__(
            infile,
            name,
            rename=True,
            arch=arch,
            target=target,
            timeout=timeout,
            funcname="pqcrystals_dilithium_asm_caddq",
        )

    def core(self, slothy):
        """Configure `slothy` for this kernel and optimize the loop labelled "1"."""
        # r10 is the loop counter register in the assembly source.
        slothy.config.outputs = ["r10"]
        slothy.config.inputs_are_outputs = True
        slothy.config.sw_pipelining.enabled = True
        slothy.optimize_loop("1")
def main():
examples = [ Example0(),
Example1(),
Expand Down Expand Up @@ -1864,6 +1882,7 @@ def main():
pointwise_769_dilithium(),
pointwise_769_asymmetric_dilithium(),
reduce32_dilithium(),
caddq_dilithium(),
]

all_example_names = [e.name for e in examples]
Expand Down
52 changes: 52 additions & 0 deletions examples/naive/armv7m/caddq_dilithium.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
.syntax unified
.thumb

// caddq a, tmp, q -- conditionally add q to a, in place.
//   a   : register holding the value; read, then overwritten with the result
//   tmp : scratch register; overwritten
//   q   : register holding the value to add; only read
// `\a, asr #31` replicates the sign bit of \a across all 32 bits, so the AND
// leaves \tmp = \q when \a is negative and \tmp = 0 otherwise. The ADD then
// produces \a + \q for negative \a and \a unchanged for non-negative \a.
// Neither instruction carries the `s` suffix, so the condition flags are
// left untouched.
.macro caddq a, tmp, q
and \tmp, \q, \a, asr #31
add \a, \a, \tmp
.endm

// void pqcrystals_dilithium_asm_caddq(int32_t a[N]);
//
// Applies caddq in place to the 32-bit words of the array addressed by r0:
// every negative word has 8380417 added to it, every other word is written
// back unchanged.
// The loop runs 32 times and handles 8 words per iteration, i.e. 256 words
// (1024 bytes) in total -- presumably N == 256; confirm against the caller.
//
// Register use:
//   r0     : pointer to the current group of 8 words (advanced by 32 bytes
//            at the end of each iteration)
//   r1-r8  : the 8 words of the current group
//   r9     : scratch register for caddq
//   r10    : remaining loop iterations
//   r12    : the constant 8380417, passed as q to caddq
// r4-r11 and r14 are saved on entry and restored on exit. r11 and r14 are
// not otherwise used in this version (NOTE(review): presumably kept in the
// save list so that a rescheduled variant may use them -- confirm).
// The function makes no calls.
.global pqcrystals_dilithium_asm_caddq
.type pqcrystals_dilithium_asm_caddq, %function
.align 2
pqcrystals_dilithium_asm_caddq:
push {r4-r11, r14}

// r12 = 8380417 (0x7FE001), assembled from its lower and upper 16-bit halves.
movw r12,#:lower16:8380417
movt r12,#:upper16:8380417

// r10 = number of loop iterations.
movw r10, #32
1:
// Load the 8 words of the current group.
ldr.w r1, [r0]
ldr.w r2, [r0, #1*4]
ldr.w r3, [r0, #2*4]
ldr.w r4, [r0, #3*4]
ldr.w r5, [r0, #4*4]
ldr.w r6, [r0, #5*4]
ldr.w r7, [r0, #6*4]
ldr.w r8, [r0, #7*4]

// Conditionally add r12 to each word; r9 is overwritten by every invocation.
caddq r1, r9, r12
caddq r2, r9, r12
caddq r3, r9, r12
caddq r4, r9, r12
caddq r5, r9, r12
caddq r6, r9, r12
caddq r7, r9, r12
caddq r8, r9, r12

// Write words 1..7 back at their original offsets.
str.w r2, [r0, #1*4]
str.w r3, [r0, #2*4]
str.w r4, [r0, #3*4]
str.w r5, [r0, #4*4]
str.w r6, [r0, #5*4]
str.w r7, [r0, #6*4]
str.w r8, [r0, #7*4]
// Word 0 is stored last with post-indexing: the store goes to [r0], then r0
// is advanced by 32 bytes to the next group of 8 words.
str r1, [r0], #8*4
// Decrement the iteration counter and loop while it is non-zero.
subs r10, #1
bne.w 1b

// Restore the saved registers; the saved r14 is loaded into pc, which returns.
pop {r4-r11, pc}

.size pqcrystals_dilithium_asm_caddq, .-pqcrystals_dilithium_asm_caddq
120 changes: 120 additions & 0 deletions examples/opt/armv7m/caddq_dilithium_opt_m7.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
.syntax unified
.thumb

// caddq a, tmp, q -- conditionally add q to a, in place.
//   a   : register holding the value; read, then overwritten with the result
//   tmp : scratch register; overwritten
//   q   : register holding the value to add; only read
// `\a, asr #31` replicates the sign bit of \a across all 32 bits, so the AND
// leaves \tmp = \q when \a is negative and \tmp = 0 otherwise. The ADD then
// produces \a + \q for negative \a and \a unchanged for non-negative \a.
// Within the code visible in this file the macro is not invoked: the loop
// below contains the and/add pairs written out directly.
.macro caddq a, tmp, q
and \tmp, \q, \a, asr #31
add \a, \a, \tmp
.endm

// void pqcrystals_dilithium_asm_caddq_opt_m7(int32_t a[N]);
//
// Rescheduled variant of the caddq loop. It applies caddq in place to the
// 32-bit words of the array addressed by r0: every negative word has 8380417
// added to it, every other word is written back unchanged.
// The loop runs 32 times and handles 8 words per iteration, i.e. 256 words
// (1024 bytes) in total -- presumably N == 256; confirm against the caller.
//
// Register use:
//   r0               : pointer to the current group of 8 words
//   r2-r9, r11, r14  : loaded words, and-masks and results (registers are
//                      reused for different words within one iteration)
//   r10              : remaining loop iterations
//   r12              : the constant 8380417
// r14 is used as an ordinary temporary inside the loop; the return address
// it held on entry is saved by the push and restored into pc by the pop.
// The function makes no calls.
.global pqcrystals_dilithium_asm_caddq_opt_m7
.type pqcrystals_dilithium_asm_caddq_opt_m7, %function
.align 2
pqcrystals_dilithium_asm_caddq_opt_m7:
push {r4-r11, r14}

// r12 = 8380417 (0x7FE001), assembled from its lower and upper 16-bit halves.
movw r12,#:lower16:8380417
movt r12,#:upper16:8380417

// r10 = number of loop iterations.
movw r10, #32
// NOTE(review): the all-zero statistics block below (and the matching one
// after the loop branch) presumably describes an empty section emitted by
// the scheduling tool before/after the loop -- confirm against the tool.
// Instructions: 0
// Expected cycles: 0
// Expected IPC: 0.00
//
// Wall time: 0.00s
// User time: 0.00s
//
1:
// Statistics for the loop body: the 33 instructions listed below, with an
// expected 17 cycles per iteration (33 / 17 = 1.94 instructions per cycle).
// Instructions: 33
// Expected cycles: 17
// Expected IPC: 1.94
//
// Wall time: 3.57s
// User time: 3.57s
//
// In the right-hand column of each instruction line, the position of `*`
// gives the expected cycle of that instruction, counted from 0.
// ----- cycle (expected) ------>
// 0 25
// |------------------------|----
ldr.w r9, [r0] // *.............................
ldr.w r2, [r0, #28] // *.............................
ldr.w r11, [r0, #24] // .*............................
ldr.w r8, [r0, #20] // .*............................
// The loop counter is decremented early. None of the and/add/ldr/str
// instructions that follow set the condition flags, so the flags written
// here are still intact when the `bne` at the end of the loop tests them.
subs r10, #1 // ..*...........................
ldr.w r7, [r0, #4] // ..*...........................
and r5, r12, r2, asr #31 // ...*..........................
ldr.w r3, [r0, #16] // ...*..........................
add r4, r2, r5 // ....*.........................
str.w r4, [r0, #28] // ....*.........................
ldr.w r5, [r0, #12] // .....*........................
and r14, r12, r11, asr #31 // .....*........................
add r11, r11, r14 // ......*.......................
and r14, r12, r3, asr #31 // ......*.......................
add r2, r3, r14 // .......*......................
and r3, r12, r9, asr #31 // .......*......................
add r3, r9, r3 // ........*.....................
str.w r2, [r0, #16] // ........*.....................
ldr.w r14, [r0, #8] // .........*....................
and r6, r12, r5, asr #31 // .........*....................
and r2, r12, r8, asr #31 // ..........*...................
add r4, r5, r6 // ..........*...................
add r6, r8, r2 // ...........*..................
str.w r6, [r0, #20] // ...........*..................
and r9, r12, r14, asr #31 // ............*.................
// Post-indexed store of word 0: after it, r0 already points 32 bytes ahead,
// at the next group. The remaining stores of this iteration therefore use
// negative offsets: #-8 is the old #24, #-24 the old #8, #-28 the old #4 and
// #-20 the old #12.
str r3, [r0], #8*4 // ............*.................
add r6, r14, r9 // .............*................
str r11, [r0, #-8] // .............*................
and r11, r12, r7, asr #31 // ..............*...............
str r6, [r0, #-24] // ..............*...............
add r6, r7, r11 // ...............*..............
str r6, [r0, #-28] // ...............*..............
str r4, [r0, #-20] // ................*.............

// Reference listing: the loop body in its original instruction order, with
// each caddq written out as its and/add pair, kept as comments only. The
// position of `*` matches the cycle of the corresponding instruction in the
// scheduled code above. NOTE(review): `'` sits at column 17 (the expected
// cycle count) and `~` sits 17 columns after `*`, so they appear to mark
// the iteration boundary and the same instruction in the following
// iteration -- confirm against the tool's documentation.
// ------ cycle (expected) ------>
// 0 25
// |------------------------|-----
// ldr.w r1, [r0] // *................~.............
// ldr.w r2, [r0, #1*4] // ..*..............'.~...........
// ldr.w r3, [r0, #2*4] // .........*.......'........~....
// ldr.w r4, [r0, #3*4] // .....*...........'....~........
// ldr.w r5, [r0, #4*4] // ...*.............'..~..........
// ldr.w r6, [r0, #5*4] // .*...............'~............
// ldr.w r7, [r0, #6*4] // .*...............'~............
// ldr.w r8, [r0, #7*4] // *................~.............
// and r9, r12, r1, asr #31 // .......*.........'......~......
// add r1, r1, r9 // ........*........'.......~.....
// and r9, r12, r2, asr #31 // ..............*..'.............
// add r2, r2, r9 // ...............*.'.............
// and r9, r12, r3, asr #31 // ............*....'...........~.
// add r3, r3, r9 // .............*...'.............
// and r9, r12, r4, asr #31 // .........*.......'........~....
// add r4, r4, r9 // ..........*......'.........~...
// and r9, r12, r5, asr #31 // ......*..........'.....~.......
// add r5, r5, r9 // .......*.........'......~......
// and r9, r12, r6, asr #31 // ..........*......'.........~...
// add r6, r6, r9 // ...........*.....'..........~..
// and r9, r12, r7, asr #31 // .....*...........'....~........
// add r7, r7, r9 // ......*..........'.....~.......
// and r9, r12, r8, asr #31 // ...*.............'..~..........
// add r8, r8, r9 // ....*............'...~.........
// str.w r2, [r0, #1*4] // ...............*.'.............
// str.w r3, [r0, #2*4] // ..............*..'.............
// str.w r4, [r0, #3*4] // ................*'.............
// str.w r5, [r0, #4*4] // ........*........'.......~.....
// str.w r6, [r0, #5*4] // ...........*.....'..........~..
// str.w r7, [r0, #6*4] // .............*...'.............
// str.w r8, [r0, #7*4] // ....*............'...~.........
// str r1, [r0], #8*4 // ............*....'...........~.
// subs r10, #1 // ..*..............'.~...........

// Loop while the counter decremented by the `subs` above is non-zero.
bne 1b
// Instructions: 0
// Expected cycles: 0
// Expected IPC: 0.00
//
// Wall time: 0.00s
// User time: 0.00s
//

// Restore the saved registers; the saved r14 is loaded into pc, which returns.
pop {r4-r11, pc}

.size pqcrystals_dilithium_asm_caddq_opt_m7, .-pqcrystals_dilithium_asm_caddq_opt_m7

0 comments on commit 880d2a8

Please sign in to comment.