Skip to content

Commit

Permalink
M7: reduce32_dilithium
Browse files Browse the repository at this point in the history
  • Loading branch information
mkannwischer committed Dec 18, 2024
1 parent 720c3d6 commit 75041ad
Show file tree
Hide file tree
Showing 3 changed files with 210 additions and 0 deletions.
22 changes: 22 additions & 0 deletions example.py
Original file line number Diff line number Diff line change
Expand Up @@ -1680,6 +1680,27 @@ def core(self, slothy):

slothy.config.sw_pipelining.enabled = True
slothy.optimize_loop("_asymmetric_mul_16_loop")

class reduce32_dilithium(Example):
    """Example entry for the `reduce32_dilithium` assembly source.

    The input file is `reduce32_dilithium` (plus an optional `_<var>` suffix);
    the example name additionally carries the label of the selected target.
    The assembly function handled by this example is
    `pqcrystals_dilithium_asm_reduce32`.
    """

    def __init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7, timeout=None):
        funcname = "pqcrystals_dilithium_asm_reduce32"

        # The variant tag (if any) goes into both the input file name and the
        # example name; only the example name also gets the target label.
        variant_suffix = f"_{var}" if var != "" else ""
        infile = "reduce32_dilithium" + variant_suffix
        name = f"{infile}_{target_label_dict[target]}"

        super().__init__(
            infile,
            name,
            rename=True,
            arch=arch,
            target=target,
            timeout=timeout,
            funcname=funcname,
        )

    def core(self, slothy):
        """Configure `slothy` and optimize the loop labelled `1`."""
        cfg = slothy.config

        # r10 is declared as an output register; inputs are treated as outputs too.
        cfg.outputs = ["r10"]
        cfg.inputs_are_outputs = True

        cfg.constraints.stalls_first_attempt = 4
        cfg.sw_pipelining.enabled = True

        slothy.optimize_loop("1")

def main():
examples = [ Example0(),
Example1(),
Expand Down Expand Up @@ -1842,6 +1863,7 @@ def main():
basemul_257_asymmetric_dilithium(),
pointwise_769_dilithium(),
pointwise_769_asymmetric_dilithium(),
reduce32_dilithium(),
]

all_example_names = [e.name for e in examples]
Expand Down
52 changes: 52 additions & 0 deletions examples/naive/armv7m/reduce32_dilithium.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
.syntax unified
.thumb
// redq val, scratch, modulus
//   scratch = (val + 4194304) >> 23     (4194304 = 2^22, arithmetic shift)
//   val     = val - scratch * modulus
// Clobbers: scratch, and the condition flags (asrs is the flag-setting form).
.macro redq val, scratch, modulus
    add     \scratch, \val, #4194304
    asrs    \scratch, \scratch, #23
    mls     \val, \scratch, \modulus, \val
.endm

// void pqcrystals_dilithium_asm_reduce32(int32_t a[N]);
//
// Applies the redq macro, in place, to the 32-bit words of the array at r0.
// One loop iteration handles 8 consecutive words and the loop runs 32 times,
// so 256 words are processed in total.
//
// Register roles:
//   r0     in: pointer to the array; advanced by 8*4 bytes per iteration
//   r1-r8  the eight words of the current batch
//   r9     scratch register passed to redq
//   r10    number of loop iterations still to run
//   r12    constant 8380417, passed to redq as its third argument
// r4-r11 and lr are pushed on entry and popped again before returning.

.equ REDUCE32_LOOP_ITERATIONS, 32

.global pqcrystals_dilithium_asm_reduce32
.type pqcrystals_dilithium_asm_reduce32, %function
.align 2
pqcrystals_dilithium_asm_reduce32:
    push    {r4-r11, lr}

    // r12 = 8380417, built from its two 16-bit halves
    movw    r12, #:lower16:8380417
    movt    r12, #:upper16:8380417
    movw    r10, #REDUCE32_LOOP_ITERATIONS
1:
    // Load the batch of 8 words
    ldr.w   r1, [r0]
    ldr.w   r2, [r0, #1*4]
    ldr.w   r3, [r0, #2*4]
    ldr.w   r4, [r0, #3*4]
    ldr.w   r5, [r0, #4*4]
    ldr.w   r6, [r0, #5*4]
    ldr.w   r7, [r0, #6*4]
    ldr.w   r8, [r0, #7*4]

    // Reduce each word, sharing r9 as the scratch register
    redq    r1, r9, r12
    redq    r2, r9, r12
    redq    r3, r9, r12
    redq    r4, r9, r12
    redq    r5, r9, r12
    redq    r6, r9, r12
    redq    r7, r9, r12
    redq    r8, r9, r12

    // Write words 1..7 back first; word 0 goes last because its store
    // post-increments r0 to the next batch.
    str.w   r2, [r0, #1*4]
    str.w   r3, [r0, #2*4]
    str.w   r4, [r0, #3*4]
    str.w   r5, [r0, #4*4]
    str.w   r6, [r0, #5*4]
    str.w   r7, [r0, #6*4]
    str.w   r8, [r0, #7*4]
    str     r1, [r0], #8*4
    subs    r10, #1
    bne.w   1b

    pop     {r4-r11, lr}
    bx      lr

.size pqcrystals_dilithium_asm_reduce32, .-pqcrystals_dilithium_asm_reduce32
136 changes: 136 additions & 0 deletions examples/opt/armv7m/reduce32_dilithium_opt_m7.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
.syntax unified
.thumb
// redq val, scratch, modulus
//   scratch = (val + 4194304) >> 23     (4194304 = 2^22, arithmetic shift)
//   val     = val - scratch * modulus
// Clobbers: scratch, and the condition flags (asrs is the flag-setting form).
.macro redq val, scratch, modulus
    add     \scratch, \val, #4194304
    asrs    \scratch, \scratch, #23
    mls     \val, \scratch, \modulus, \val
.endm

// void pqcrystals_dilithium_asm_reduce32_opt_m7(int32_t a[N]);
//
// Rescheduled version of the reduce32 loop. In place, every 32-bit word w of
// the array at r0 is replaced by  w - ((w + 4194304) >> 23) * r12,  where the
// shift is arithmetic and r12 holds 8380417. Each iteration handles 8 words
// (32 bytes) and the loop runs 32 times, i.e. 256 words in total.
//
// Register roles:
//   r0                in: array pointer; post-incremented by 8*4 per iteration
//   r10               remaining loop iterations
//   r12               constant 8380417
//   r1-r9, r11, r14   temporaries for the interleaved load/reduce/store chains
// r4-r11 and r14 are pushed on entry and popped again before returning.
//
// NOTE(review): the statistics and per-instruction schedule comments look
// tool-generated; the instruction order and register reuse in the loop are
// interdependent, so do not reorder the loop body by hand.
.global pqcrystals_dilithium_asm_reduce32_opt_m7
.type pqcrystals_dilithium_asm_reduce32_opt_m7, %function
.align 2
pqcrystals_dilithium_asm_reduce32_opt_m7:
push {r4-r11, r14}

// r12 = 8380417 (assembled from its 16-bit halves), r10 = 32 loop iterations
movw r12,#:lower16:8380417
movt r12,#:upper16:8380417
movw r10, #32
// Instructions: 0
// Expected cycles: 0
// Expected IPC: 0.00
//
// Wall time: 0.00s
// User time: 0.00s
//
1:
// Instructions: 41
// Expected cycles: 21
// Expected IPC: 1.95
//
// Wall time: 7.03s
// User time: 7.03s
//
// ----- cycle (expected) ------>
// 0 25
// |------------------------|----
// Loop body: eight independent chains (ldr -> add -> asrs -> mls -> str), one
// per word at offsets 0..28, interleaved with each other. The trailing dot
// diagram on each line marks the expected issue cycle of that instruction.
ldr.w r9, [r0, #16] // *.............................
ldr.w r7, [r0, #4] // *.............................
add r5, r7, #4194304 // .*............................
ldr.w r11, [r0, #20] // .*............................
add r3, r11, #4194304 // ..*...........................
ldr.w r8, [r0, #28] // ..*...........................
add r14, r8, #4194304 // ...*..........................
asrs r3, r3, #23 // ...*..........................
ldr.w r2, [r0, #12] // ....*.........................
mls r4, r3, r12, r11 // ....*.........................
str.w r4, [r0, #20] // .....*........................
asrs r3, r5, #23 // .....*........................
mls r3, r3, r12, r7 // ......*.......................
ldr.w r5, [r0, #8] // ......*.......................
add r6, r9, #4194304 // .......*......................
add r4, r2, #4194304 // .......*......................
str.w r3, [r0, #4] // ........*.....................
asrs r11, r4, #23 // ........*.....................
mls r4, r11, r12, r2 // .........*....................
asrs r6, r6, #23 // .........*....................
str.w r4, [r0, #12] // ..........*...................
asrs r11, r14, #23 // ..........*...................
mls r4, r11, r12, r8 // ...........*..................
add r14, r5, #4194304 // ...........*..................
mls r6, r6, r12, r9 // ............*.................
asrs r14, r14, #23 // ............*.................
str.w r6, [r0, #16] // .............*................
ldr.w r1, [r0, #24] // .............*................
str.w r4, [r0, #28] // ..............*...............
add r4, r1, #4194304 // ..............*...............
ldr.w r11, [r0] // ...............*..............
mls r14, r14, r12, r5 // ...............*..............
asrs r2, r4, #23 // ................*.............
str.w r14, [r0, #8] // ................*.............
mls r4, r2, r12, r1 // .................*............
add r2, r11, #4194304 // .................*............
str.w r4, [r0, #24] // ..................*...........
asrs r7, r2, #23 // ..................*...........
mls r8, r7, r12, r11 // ...................*..........
subs r10, #1 // ...................*..........
str r8, [r0], #8*4 // ....................*.........

// The listing below is the loop body in its original (unscheduled) order,
// annotated with where each instruction ended up in the schedule above.
// ------ cycle (expected) ------>
// 0 25
// |------------------------|-----
// ldr.w r1, [r0] // ...............*.....'.........
// ldr.w r2, [r0, #1*4] // *....................~.........
// ldr.w r3, [r0, #2*4] // ......*..............'.....~...
// ldr.w r4, [r0, #3*4] // ....*................'...~.....
// ldr.w r5, [r0, #4*4] // *....................~.........
// ldr.w r6, [r0, #5*4] // .*...................'~........
// ldr.w r7, [r0, #6*4] // .............*.......'.........
// ldr.w r8, [r0, #7*4] // ..*..................'.~.......
// add r9, r1, #4194304 // .................*...'.........
// asrs r9, r9, #23 // ..................*..'.........
// mls r1, r9, r12, r1 // ...................*.'.........
// add r9, r2, #4194304 // .*...................'~........
// asrs r9, r9, #23 // .....*...............'....~....
// mls r2, r9, r12, r2 // ......*..............'.....~...
// add r9, r3, #4194304 // ...........*.........'.........
// asrs r9, r9, #23 // ............*........'.........
// mls r3, r9, r12, r3 // ...............*.....'.........
// add r9, r4, #4194304 // .......*.............'......~..
// asrs r9, r9, #23 // ........*............'.......~.
// mls r4, r9, r12, r4 // .........*...........'.........
// add r9, r5, #4194304 // .......*.............'......~..
// asrs r9, r9, #23 // .........*...........'.........
// mls r5, r9, r12, r5 // ............*........'.........
// add r9, r6, #4194304 // ..*..................'.~.......
// asrs r9, r9, #23 // ...*.................'..~......
// mls r6, r9, r12, r6 // ....*................'...~.....
// add r9, r7, #4194304 // ..............*......'.........
// asrs r9, r9, #23 // ................*....'.........
// mls r7, r9, r12, r7 // .................*...'.........
// add r9, r8, #4194304 // ...*.................'..~......
// asrs r9, r9, #23 // ..........*..........'.........
// mls r8, r9, r12, r8 // ...........*.........'.........
// str.w r2, [r0, #1*4] // ........*............'.......~.
// str.w r3, [r0, #2*4] // ................*....'.........
// str.w r4, [r0, #3*4] // ..........*..........'.........
// str.w r5, [r0, #4*4] // .............*.......'.........
// str.w r6, [r0, #5*4] // .....*...............'....~....
// str.w r7, [r0, #6*4] // ..................*..'.........
// str.w r8, [r0, #7*4] // ..............*......'.........
// str r1, [r0], #8*4 // ....................*'.........
// subs r10, #1 // ...................*.'.........

// The branch consumes the flags set by "subs r10, #1": every flag-setting
// asrs comes before that subs, and the str between subs and bne leaves the
// flags untouched.
bne 1b
// Instructions: 0
// Expected cycles: 0
// Expected IPC: 0.00
//
// Wall time: 0.00s
// User time: 0.00s
//

pop {r4-r11, r14}
bx lr

.size pqcrystals_dilithium_asm_reduce32_opt_m7, .-pqcrystals_dilithium_asm_reduce32_opt_m7

0 comments on commit 75041ad

Please sign in to comment.