Skip to content

Commit

Permalink
Re-optimize Kyber subroutines for CM7
Browse files Browse the repository at this point in the history
  • Loading branch information
dop-amin committed Dec 2, 2024
1 parent 8c9991d commit 4bf09ed
Show file tree
Hide file tree
Showing 14 changed files with 2,396 additions and 2,473 deletions.
248 changes: 93 additions & 155 deletions examples/opt/armv7m/add_kyber_opt_m7.s

Large diffs are not rendered by default.

725 changes: 364 additions & 361 deletions examples/opt/armv7m/barrett_reduce_kyber_opt_m7.s

Large diffs are not rendered by default.

141 changes: 72 additions & 69 deletions examples/opt/armv7m/basemul_16_32_kyber_opt_m7.s
Original file line number Diff line number Diff line change
Expand Up @@ -44,106 +44,109 @@ basemul_asm_opt_16_32_opt_m7:
// ----- cycle (expected) ------>
// 0 25
// |------------------------|----
ldr r6, [r2], #4 // *.............................
ldr r5, [r3], #8 // *.............................
ldr r10, [r2], #4 // .*............................
ldr r11, [r2], #4 // *.............................
ldr r12, [r3], #8 // *.............................
ldr r7, [r2], #4 // .*............................

// ------ cycle (expected) ------>
// 0 25
// |------------------------|-----
// ldr r6, [r2], #4 // *..............................
// ldr r10, [r2], #4 // .*.............................
// ldr r5, [r3], #8 // *..............................
// ldr r11, [r2], #4 // *..............................
// ldr r7, [r2], #4 // .*.............................
// ldr r12, [r3], #8 // *..............................

sub loop, loop, #1
sub r14, r14, #1
1:
// Instructions: 14
// Expected cycles: 12
// Expected IPC: 1.17
// Instructions: 15
// Expected cycles: 8
// Expected IPC: 1.88
//
// Cycle bound: 10.0
// IPC bound: 1.40
// Cycle bound: 11.0
// IPC bound: 1.36
//
// Wall time: 0.40s
// User time: 0.40s
// Wall time: 0.22s
// User time: 0.22s
//
// ----- cycle (expected) ------>
// 0 25
// |------------------------|----
ldr r11, [r1], #4 // *.............................
ldr r8, [r1], #4 // .*............................
smuad r9, r5, r6 // .*............................
str r9, [r0], #4 // ..*...........................
smuadx r12, r8, r10 // ....*.........................
smuadx r7, r11, r6 // .....*........................
ldr.w r6, [r3, #-4] // ......*.......................
str r7, [r0], #4 // ......*.......................
smuad r4, r6, r10 // ........*.....................
ldr r6, [r2], #4 // ........e.....................
str r4, [r0], #4 // .........*....................
ldr r10, [r2], #4 // .........e....................
str r12, [r0], #4 // ...........*..................
ldr r5, [r3], #8 // ...........e..................
ldr r8, [r1], #4 // *.............................
smuad r9, r12, r11 // *.............................
str r9, [r0], #4 // .*............................
ldr.w r4, [r3, #-4] // .*............................
ldr r5, [r1], #4 // ..*...........................
smuadx r6, r8, r11 // ..*...........................
smuad r10, r4, r7 // ...*..........................
ldr r11, [r2], #4 // ....e.........................
smuadx r9, r5, r7 // ....*.........................
ldr r7, [r2], #4 // .....e........................
str r6, [r0], #4 // .....*........................
ldr r12, [r3], #8 // ......e.......................
str r10, [r0], #4 // ......*.......................
str r9, [r0], #4 // .......*......................
subs.w r14, #1 // .......*......................

// ------ cycle (expected) ------>
// 0 25
// |------------------------|-----
// ldr r4, [r1], #4 // ....*...........~...........~..
// ldr r6, [r2], #4 // e...'.......~...'.......~...'..
// ldr r5, [r1], #4 // ....'*..........'~..........'~.
// ldr r7, [r2], #4 // .e..'........~..'........~..'..
// ldr.w r11, [r3, #4] // ....'.....*.....'.....~.....'..
// ldr r12, [r3], #8 // ...e'..........~'..........~'..
// smuad r12, r12, r6 // ....'*..........'~..........'~.
// str r12, [r0], #4 // ....'.*.........'.~.........'..
// smuadx r12, r4, r6 // ....'....*......'....~......'..
// str r12, [r0], #4 // ....'.....*.....'.....~.....'..
// smuad r12, r11, r7 // ~...'.......*...'.......~...'..
// str r12, [r0], #4 // .~..'........*..'........~..'..
// smuadx r12, r5, r7 // ....'...*.......'...~.......'..
// str r12, [r0], #4 // ...~'..........*'..........~'..
// ldr r4, [r1], #4 // ....*.......~.......~.......~..
// ldr r6, [r2], #4 // e...'...~...'...~...'...~...'..
// ldr r5, [r1], #4 // ....'.*.....'.~.....'.~.....'..
// ldr r7, [r2], #4 // .e..'....~..'....~..'....~..'..
// ldr.w r11, [r3, #4] // ....'*......'~......'~......'~.
// ldr r12, [r3], #8 // ..e.'.....~.'.....~.'.....~.'..
// smuad r12, r12, r6 // ....*.......~.......~.......~..
// str r12, [r0], #4 // ....'*......'~......'~......'~.
// smuadx r12, r4, r6 // ....'.*.....'.~.....'.~.....'..
// str r12, [r0], #4 // .~..'....*..'....~..'....~..'..
// smuad r12, r11, r7 // ....'..*....'..~....'..~....'..
// str r12, [r0], #4 // ..~.'.....*.'.....~.'.....~.'..
// smuadx r12, r5, r7 // ~...'...*...'...~...'...~...'..
// str r12, [r0], #4 // ...~'......*'......~'......~'..
// subs.w r14, #1 // ...~'......*'......~'......~'..

subs loop, #1
bne 1b
// Instructions: 11
// Expected cycles: 11
// Expected IPC: 1.00
// Instructions: 12
// Expected cycles: 8
// Expected IPC: 1.50
//
// Cycle bound: 11.0
// IPC bound: 1.00
// Cycle bound: 8.0
// IPC bound: 1.50
//
// Wall time: 0.02s
// User time: 0.02s
//
// ----- cycle (expected) ------>
// 0 25
// |------------------------|----
smuad r8, r5, r6 // *.............................
ldr r12, [r1], #4 // *.............................
str r8, [r0], #4 // .*............................
ldr r8, [r1], #4 // .*............................
ldr.w r5, [r3, #-4] // ..*...........................
smuadx r6, r12, r6 // ...*..........................
smuadx r8, r8, r10 // ....*.........................
smuad r10, r5, r10 // .....*........................
str r6, [r0], #4 // ......*.......................
str r10, [r0], #4 // ........*.....................
str r8, [r0], #4 // ..........*...................
ldr r5, [r1], #4 // *.............................
smuad r4, r12, r11 // *.............................
subs.w r14, #1 // .*............................
str r4, [r0], #4 // .*............................
ldr.w r4, [r3, #-4] // ..*...........................
smuadx r11, r5, r11 // ..*...........................
str r11, [r0], #4 // ...*..........................
ldr r11, [r1], #4 // ...*..........................
smuad r5, r4, r7 // ....*.........................
smuadx r11, r11, r7 // .....*........................
str r5, [r0], #4 // ......*.......................
str r11, [r0], #4 // .......*......................

// ------ cycle (expected) ------>
// 0 25
// |------------------------|-----
// ldr r11, [r1], #4 // *..............................
// ldr r8, [r1], #4 // .*.............................
// smuad r9, r5, r6 // *..............................
// ldr r8, [r1], #4 // *..............................
// smuad r9, r12, r11 // *..............................
// str r9, [r0], #4 // .*.............................
// smuadx r12, r8, r10 // ....*..........................
// smuadx r7, r11, r6 // ...*...........................
// ldr.w r6, [r3, #-4] // ..*............................
// str r7, [r0], #4 // ......*........................
// smuad r4, r6, r10 // .....*.........................
// str r4, [r0], #4 // ........*......................
// str r12, [r0], #4 // ..........*....................
// ldr.w r4, [r3, #-4] // ..*............................
// ldr r5, [r1], #4 // ...*...........................
// smuadx r6, r8, r11 // ..*............................
// smuad r10, r4, r7 // ....*..........................
// smuadx r9, r5, r7 // .....*.........................
// str r6, [r0], #4 // ...*...........................
// str r10, [r0], #4 // ......*........................
// str r9, [r0], #4 // .......*.......................
// subs.w r14, #1 // .*.............................


pop {r4-r11, pc}
Loading

0 comments on commit 4bf09ed

Please sign in to comment.