Skip to content

Commit

Permalink
re-opt 769 pointwise
Browse files Browse the repository at this point in the history
  • Loading branch information
mkannwischer committed Dec 16, 2024
1 parent 8dd06e6 commit be75034
Showing 1 changed file with 130 additions and 133 deletions.
263 changes: 130 additions & 133 deletions examples/opt/armv7m/pointwise_769_dilithium_opt_m7.s
Original file line number Diff line number Diff line change
Expand Up @@ -33,162 +33,159 @@ small_pointmul_asm_769_opt_m7:


add.w r3, r2, #64*width
// Instructions: 2
// Expected cycles: 1
// Expected IPC: 2.00
//
// Cycle bound: 1.0
// IPC bound: 2.00
//
// Wall time: 0.00s
// User time: 0.00s
//
// ----- cycle (expected) ------>
// 0 25
// |------------------------|----
ldr.w r10, [r2, #4] // *.............................
ldr.w r8, [r1, #8] // *.............................

// ------ cycle (expected) ------>
// Instructions: 5
// Expected cycles: 3
// Expected IPC: 1.67
//
// Cycle bound: 3.0
// IPC bound: 1.67
//
// Wall time: 0.01s
// User time: 0.01s
//
// ----- cycle (expected) ------>
// 0 25
// |------------------------|-----
// ldr.w r10, [r2, #4] // *..............................
// ldr.w r8, [r1, #8] // *..............................
// |------------------------|----
ldr.w r10, [r1, #8] // *.............................
ldr.w r5, [r2, #4] // *.............................
ldr.w r9, [r1], #4*4 // .*............................
ldr.w r6, [r2], #2*4 // ..*...........................
smulwt r7, r5, r10 // ..*...........................

// ------ cycle (expected) ------>
// 0 25
// |------------------------|-----
// ldr.w r10, [r1, #8] // *..............................
// ldr.w r5, [r2, #4] // *..............................
// ldr.w r9, [r1], #4*4 // .*.............................
// smulwt r7, r5, r10 // ..*............................
// ldr.w r6, [r2], #2*4 // ..*............................

sub r2, r2, #0
1:
// Instructions: 25
// Expected cycles: 13
// Expected IPC: 1.92
// Instructions: 24
// Expected cycles: 12
// Expected IPC: 2.00
//
// Cycle bound: 15.0
// IPC bound: 1.67
// Cycle bound: 17.0
// IPC bound: 1.41
//
// Wall time: 1.41s
// User time: 1.41s
// Wall time: 0.96s
// User time: 0.96s
//
// ----- cycle (expected) ------>
// 0 25
// |------------------------|----
ldr.w r5, [r2], #2*4 // *.............................
ldr.w r4, [r1, #12] // *.............................
smulwt r7, r10, r8 // .*............................
neg r10, r10 // .*............................
ldr.w r11, [r1], #4*4 // ..*...........................
smulwt r6, r10, r4 // ..*...........................
ldr r4, [r1, #-4] // *.............................
smlabt r11, r7, r12, r14 // *.............................
neg r5, r5 // .*............................
smulwt r7, r6, r9 // .*............................
neg r6, r6 // ..*...........................
smulwt r5, r5, r4 // ..*...........................
ldr r8, [r1, #-12] // ...*..........................
smlabt r7, r7, r12, r14 // ...*..........................
ldr.w r9, [r1, #-12] // ...*..........................
smlabt r6, r6, r12, r14 // ....*.........................
cmp.w r3, r2 // ....*.........................
smulwt r10, r5, r11 // .....*........................
neg r5, r5 // .....*........................
pkhbt r7, r8, r7 // ......*.......................
smulwt r8, r5, r9 // ......*.......................
pkhbt r4, r4, r6 // .......*......................
smlabt r5, r10, r12, r14 // .......*......................
ldr.w r10, [r2, #4] // ........e.....................
smlabt r6, r8, r12, r14 // ........*.....................
ldr.w r8, [r1, #8] // .........e....................
str.w r4, [r0, #12] // .........*....................
pkhbt r4, r11, r5 // ..........*...................
str.w r4, [r0], #2*4 // ..........*...................
str.w r7, [r0], #2*4 // ...........*..................
pkhbt r11, r9, r6 // ............*.................
str.w r11, [r0, #-12] // ............*.................
pkhbt r11, r10, r11 // ....*.........................
smlabt r5, r5, r12, r14 // ....*.........................
ldr.w r10, [r1, #8] // .....e........................
smulwt r6, r6, r8 // .....*........................
pkhbt r7, r9, r7 // ......*.......................
str.w r7, [r0], #2*4 // ......*.......................
pkhbt r9, r4, r5 // .......*......................
smlabt r6, r6, r12, r14 // .......*......................
ldr.w r5, [r2, #4] // ........e.....................
str.w r9, [r0, #4] // ........*.....................
ldr.w r9, [r1], #4*4 // .........e....................
str.w r11, [r0], #2*4 // .........*....................
pkhbt r8, r8, r6 // ..........*...................
smulwt r7, r5, r10 // ..........e...................
str r8, [r0, #-12] // ...........*..................
ldr.w r6, [r2], #2*4 // ...........e..................

// ------ cycle (expected) ------>
// 0 25
// |------------------------|-----
// ldr.w r7, [r1, #2*4] // .e...'........~...'........~...
// ldr.w r8, [r1, #3*4] // .....*............~............
// ldr.w r9, [r2, #1*4] // e....'.......~....'.......~....
// ldr.w r5, [r1, #1*4] // .....'..*.........'..~.........
// ldr.w r4, [r1], #4*4 // .....'.*..........'.~..........
// ldr.w r6, [r2], #2*4 // .....*............~............
// smulwt r10, r6, r4 // .....'....*.......'....~.......
// smlabt r10, r10, r12, r14 // .....'......*.....'......~.....
// pkhbt r4, r4, r10 // ..~..'.........*..'.........~..
// neg r6, r6 // .....'....*.......'....~.......
// smulwt r10, r6, r5 // .....'.....*......'.....~......
// smlabt r10, r10, r12, r14 // ~....'.......*....'.......~....
// pkhbt r5, r5, r10 // ....~'...........*'............
// str.w r5, [r0, #1*4] // ....~'...........*'............
// str.w r4, [r0], #2*4 // ..~..'.........*..'.........~..
// smulwt r10, r9, r7 // .....'*...........'~...........
// smlabt r10, r10, r12, r14 // .....'..*.........'..~.........
// pkhbt r7, r7, r10 // .....'.....*......'.....~......
// neg r9, r9 // .....'*...........'~...........
// smulwt r10, r9, r8 // .....'.*..........'.~..........
// smlabt r10, r10, r12, r14 // .....'...*........'...~........
// pkhbt r8, r8, r10 // .....'......*.....'......~.....
// str.w r8, [r0, #1*4] // .~...'........*...'........~...
// str.w r7, [r0], #2*4 // ...~.'..........*.'..........~.
// cmp.w r3, r2 // .....'...*........'...~........
// ldr.w r7, [r1, #2*4] // e......'....~......'....~......
// ldr.w r8, [r1, #3*4] // .......*...........~...........
// ldr.w r9, [r2, #1*4] // ...e...'.......~...'.......~...
// ldr.w r5, [r1, #1*4] // .......'..*........'..~........
// ldr.w r4, [r1], #4*4 // ....e..'........~..'........~..
// ldr.w r6, [r2], #2*4 // ......e'..........~'...........
// smulwt r10, r6, r4 // .......'*..........'~..........
// smlabt r10, r10, r12, r14 // .......'..*........'..~........
// pkhbt r4, r4, r10 // .~.....'.....*.....'.....~.....
// neg r6, r6 // .......'.*.........'.~.........
// smulwt r10, r6, r5 // ~......'....*......'....~......
// smlabt r10, r10, r12, r14 // ..~....'......*....'......~....
// pkhbt r5, r5, r10 // .....~.'.........*.'.........~.
// str.w r5, [r0, #1*4] // ......~'..........*'...........
// str.w r4, [r0], #2*4 // .~.....'.....*.....'.....~.....
// smulwt r10, r9, r7 // .....e.'.........~.'.........~.
// smlabt r10, r10, r12, r14 // .......*...........~...........
// pkhbt r7, r7, r10 // .......'...*.......'...~.......
// neg r9, r9 // .......'*..........'~..........
// smulwt r10, r9, r8 // .......'.*.........'.~.........
// smlabt r10, r10, r12, r14 // .......'...*.......'...~.......
// pkhbt r8, r8, r10 // ..~....'......*....'......~....
// str.w r8, [r0, #1*4] // ...~...'.......*...'.......~...
// str.w r7, [r0], #2*4 // ....~..'........*..'........~..

cmp r3, r2
bne 1b
// Instructions: 23
// Expected cycles: 13
// Expected IPC: 1.77
//
// Cycle bound: 13.0
// IPC bound: 1.77
//
// Wall time: 0.05s
// User time: 0.05s
//
// ----- cycle (expected) ------>
// 0 25
// |------------------------|----
cmp.w r3, r2 // *.............................
ldr.w r5, [r1, #12] // *.............................
smulwt r4, r10, r8 // .*............................
neg r10, r10 // .*............................
ldr.w r11, [r2], #2*4 // ..*...........................
smulwt r9, r10, r5 // ..*...........................
smlabt r4, r4, r12, r14 // ...*..........................
ldr.w r6, [r1], #4*4 // ...*..........................
smlabt r9, r9, r12, r14 // ....*.........................
ldr.w r7, [r1, #-12] // ....*.........................
smulwt r10, r11, r6 // .....*........................
neg r11, r11 // .....*........................
pkhbt r8, r8, r4 // ......*.......................
smulwt r4, r11, r7 // ......*.......................
pkhbt r5, r5, r9 // .......*......................
smlabt r9, r10, r12, r14 // .......*......................
smlabt r4, r4, r12, r14 // ........*.....................
str.w r5, [r0, #12] // .........*....................
pkhbt r5, r6, r9 // ..........*...................
str.w r5, [r0], #2*4 // ..........*...................
pkhbt r5, r7, r4 // ...........*..................
str.w r8, [r0], #2*4 // ...........*..................
str.w r5, [r0, #-12] // ............*.................
// Instructions: 19
// Expected cycles: 11
// Expected IPC: 1.73
//
// Cycle bound: 11.0
// IPC bound: 1.73
//
// Wall time: 0.04s
// User time: 0.04s
//
// ----- cycle (expected) ------>
// 0 25
// |------------------------|----
ldr r4, [r1, #-12] // *.............................
smulwt r8, r6, r9 // *.............................
neg r6, r6 // .*............................
smlabt r7, r7, r12, r14 // .*............................
neg r5, r5 // ..*...........................
smulwt r6, r6, r4 // ..*...........................
ldr r11, [r1, #-4] // ...*..........................
smlabt r8, r8, r12, r14 // ...*..........................
pkhbt r10, r10, r7 // ....*.........................
smlabt r6, r6, r12, r14 // ....*.........................
smulwt r7, r5, r11 // .....*........................
pkhbt r8, r9, r8 // ......*.......................
str.w r8, [r0], #2*4 // ......*.......................
pkhbt r6, r4, r6 // .......*......................
smlabt r4, r7, r12, r14 // .......*......................
str.w r10, [r0], #2*4 // ........*.....................
str r6, [r0, #-12] // .........*....................
pkhbt r6, r11, r4 // ..........*...................
str r6, [r0, #-4] // ..........*...................

// ------ cycle (expected) ------>
// 0 25
// |------------------------|-----
// ldr.w r5, [r2], #2*4 // ..*............................
// ldr.w r4, [r1, #12] // *..............................
// smulwt r7, r10, r8 // .*.............................
// neg r10, r10 // .*.............................
// ldr.w r11, [r1], #4*4 // ...*...........................
// smulwt r6, r10, r4 // ..*............................
// ldr r4, [r1, #-4] // ...*...........................
// smlabt r11, r7, r12, r14 // .*.............................
// neg r5, r5 // ..*............................
// smulwt r7, r6, r9 // *..............................
// neg r6, r6 // .*.............................
// smulwt r5, r5, r4 // .....*.........................
// ldr r8, [r1, #-12] // *..............................
// smlabt r7, r7, r12, r14 // ...*...........................
// ldr.w r9, [r1, #-12] // ....*..........................
// pkhbt r11, r10, r11 // ....*..........................
// smlabt r5, r5, r12, r14 // .......*.......................
// smulwt r6, r6, r8 // ..*............................
// pkhbt r7, r9, r7 // ......*........................
// str.w r7, [r0], #2*4 // ......*........................
// pkhbt r9, r4, r5 // ..........*....................
// smlabt r6, r6, r12, r14 // ....*..........................
// cmp.w r3, r2 // *..............................
// smulwt r10, r5, r11 // .....*.........................
// neg r5, r5 // .....*.........................
// pkhbt r7, r8, r7 // ......*........................
// smulwt r8, r5, r9 // ......*........................
// pkhbt r4, r4, r6 // .......*.......................
// smlabt r5, r10, r12, r14 // .......*.......................
// smlabt r6, r8, r12, r14 // ........*......................
// str.w r4, [r0, #12] // .........*.....................
// pkhbt r4, r11, r5 // ..........*....................
// str.w r4, [r0], #2*4 // ..........*....................
// str.w r7, [r0], #2*4 // ...........*...................
// pkhbt r11, r9, r6 // ...........*...................
// str.w r11, [r0, #-12] // ............*..................
// str.w r9, [r0, #4] // ..........*....................
// str.w r11, [r0], #2*4 // ........*......................
// pkhbt r8, r8, r6 // .......*.......................
// str r8, [r0, #-12] // .........*.....................


pop.w {r4-r11, pc}

0 comments on commit be75034

Please sign in to comment.