-
Notifications
You must be signed in to change notification settings - Fork 12
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
720c3d6
commit 75041ad
Showing
3 changed files
with
210 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
// ---------------------------------------------------------------------------
// In-place reduction of an array of signed 32-bit words modulo q = 8380417.
// Target: ARM Thumb-2, GNU assembler, unified syntax.
// ---------------------------------------------------------------------------
.syntax unified
.thumb

// redq a, tmp, q
//   a   : register with the signed 32-bit input, overwritten with the result
//   tmp : scratch register, overwritten
//   q   : register holding the modulus, only read
//
// Computes  tmp = (a + 2^22) >> 23  with an arithmetic shift and then
// a = a - tmp * q, so the new a is congruent to the old a modulo q.
// 4194304 is 2^22. The asrs form updates the condition flags, so any
// flags set before the macro are lost.
// NOTE(review): the add wraps for inputs at or above 2^31 - 2^22, callers
// are presumably expected to stay below that bound - confirm.
.macro redq a, tmp, q
        add     \tmp, \a, #4194304
        asrs    \tmp, \tmp, #23
        mls     \a, \tmp, \q, \a
.endm

// void asm_reduce32(int32_t a[N]);
//
// Applies redq in place to 256 consecutive 32-bit words starting at a:
// 32 loop iterations, 8 words per iteration.
// NOTE(review): this hard-codes 256 words, presumably N == 256 - confirm
// against the C declaration.
//
// In:       r0 = a
// Out:      nothing returned, the array is updated in place
// Scratch:  r0 (advanced past the processed words), r1-r8 (coefficients),
//           r9 (redq temporary), r10 (loop counter), r12 (q), flags
// Saved:    r4-r11 and r14 are pushed on entry and restored on exit.
//           r11 and r14 are not written by this body.
.global pqcrystals_dilithium_asm_reduce32
.type pqcrystals_dilithium_asm_reduce32, %function
.align 2
pqcrystals_dilithium_asm_reduce32:
        push    {r4-r11, r14}

        // r12 = q = 8380417, built from its low and high 16-bit halves
        movw    r12, #:lower16:8380417
        movt    r12, #:upper16:8380417
        movw    r10, #32                // iterations remaining, 8 words each
1:
        // load the next group of 8 words
        ldr.w   r1, [r0]
        ldr.w   r2, [r0, #1*4]
        ldr.w   r3, [r0, #2*4]
        ldr.w   r4, [r0, #3*4]
        ldr.w   r5, [r0, #4*4]
        ldr.w   r6, [r0, #5*4]
        ldr.w   r7, [r0, #6*4]
        ldr.w   r8, [r0, #7*4]

        // reduce each word, r9 is the shared temporary
        redq    r1, r9, r12
        redq    r2, r9, r12
        redq    r3, r9, r12
        redq    r4, r9, r12
        redq    r5, r9, r12
        redq    r6, r9, r12
        redq    r7, r9, r12
        redq    r8, r9, r12

        // write back words 1..7 first, word 0 goes last so that its
        // post-indexed store also advances r0 to the next group
        str.w   r2, [r0, #1*4]
        str.w   r3, [r0, #2*4]
        str.w   r4, [r0, #3*4]
        str.w   r5, [r0, #4*4]
        str.w   r6, [r0, #5*4]
        str.w   r7, [r0, #6*4]
        str.w   r8, [r0, #7*4]
        str     r1, [r0], #8*4
        subs    r10, #1
        bne.w   1b

        pop     {r4-r11, r14}
        bx      lr

.size pqcrystals_dilithium_asm_reduce32, .-pqcrystals_dilithium_asm_reduce32
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,136 @@ | ||
// ---------------------------------------------------------------------------
// In-place reduction of an array of signed 32-bit words modulo q = 8380417,
// with the loop body emitted in an interleaved (scheduled) order.
// Target: ARM Thumb-2, GNU assembler, unified syntax.
// NOTE(review): the _opt_m7 suffix and the expected-cycle annotations suggest
// the schedule targets a dual-issue core such as Cortex-M7 - confirm.
// ---------------------------------------------------------------------------
.syntax unified
.thumb

// redq a, tmp, q
//   a   : register with the signed 32-bit input, overwritten with the result
//   tmp : scratch register, overwritten
//   q   : register holding the modulus, only read
//
// Computes  tmp = (a + 2^22) >> 23  with an arithmetic shift and then
// a = a - tmp * q. 4194304 is 2^22. The asrs form updates the flags.
// This macro is not invoked anywhere in this file: the loop below contains
// eight add/asrs/mls triples with the same operations, interleaved with
// different register assignments.
.macro redq a, tmp, q
        add     \tmp, \a, #4194304
        asrs    \tmp, \tmp, #23
        mls     \a, \tmp, \q, \a
.endm

// void asm_reduce32(int32_t a[N]);
// Exported under the symbol pqcrystals_dilithium_asm_reduce32_opt_m7.
//
// Reduces 256 consecutive 32-bit words starting at a, in place:
// 32 loop iterations, 8 words per iteration. Each word x becomes
// x - ((x + 2^22) >> 23) * q.
// NOTE(review): this hard-codes 256 words, presumably N == 256 - confirm
// against the C declaration.
//
// In:       r0 = a
// Out:      nothing returned, the array is updated in place
// Scratch:  r0 (advanced past the processed words), r1-r9, r11, r14
//           (values and temporaries), r10 (loop counter), r12 (q), flags
// Saved:    r4-r11 and r14 are pushed on entry and restored on exit.
//           r14 is used as a temporary inside the loop, so the saved copy
//           is what makes the final bx lr return to the caller.
.global pqcrystals_dilithium_asm_reduce32_opt_m7
.type pqcrystals_dilithium_asm_reduce32_opt_m7, %function
.align 2
pqcrystals_dilithium_asm_reduce32_opt_m7:
        push    {r4-r11, r14}

        // r12 = q = 8380417, built from its low and high 16-bit halves
        movw    r12, #:lower16:8380417
        movt    r12, #:upper16:8380417
        movw    r10, #32                // iterations remaining, 8 words each
        // Instructions:    0
        // Expected cycles: 0
        // Expected IPC:    0.00
        //
        // Wall time:     0.00s
        // User time:     0.00s
        //
1:
        // Instructions:    41
        // Expected cycles: 21
        // Expected IPC:    1.95
        //
        // Wall time:     7.03s
        // User time:     7.03s
        //
        // Do not reorder the instructions below: the interleaving is the
        // purpose of this variant. subs is issued before the final store,
        // which does not touch the flags, so bne still sees its result.
        //
                                        // ----- cycle (expected) ------>
                                        // 0                        25
                                        // |------------------------|----
        ldr.w   r9, [r0, #16]           // *.............................
        ldr.w   r7, [r0, #4]            // *.............................
        add     r5, r7, #4194304        // .*............................
        ldr.w   r11, [r0, #20]          // .*............................
        add     r3, r11, #4194304       // ..*...........................
        ldr.w   r8, [r0, #28]           // ..*...........................
        add     r14, r8, #4194304       // ...*..........................
        asrs    r3, r3, #23             // ...*..........................
        ldr.w   r2, [r0, #12]           // ....*.........................
        mls     r4, r3, r12, r11        // ....*.........................
        str.w   r4, [r0, #20]           // .....*........................
        asrs    r3, r5, #23             // .....*........................
        mls     r3, r3, r12, r7         // ......*.......................
        ldr.w   r5, [r0, #8]            // ......*.......................
        add     r6, r9, #4194304        // .......*......................
        add     r4, r2, #4194304        // .......*......................
        str.w   r3, [r0, #4]            // ........*.....................
        asrs    r11, r4, #23            // ........*.....................
        mls     r4, r11, r12, r2        // .........*....................
        asrs    r6, r6, #23             // .........*....................
        str.w   r4, [r0, #12]           // ..........*...................
        asrs    r11, r14, #23           // ..........*...................
        mls     r4, r11, r12, r8        // ...........*..................
        add     r14, r5, #4194304       // ...........*..................
        mls     r6, r6, r12, r9         // ............*.................
        asrs    r14, r14, #23           // ............*.................
        str.w   r6, [r0, #16]           // .............*................
        ldr.w   r1, [r0, #24]           // .............*................
        str.w   r4, [r0, #28]           // ..............*...............
        add     r4, r1, #4194304        // ..............*...............
        ldr.w   r11, [r0]               // ...............*..............
        mls     r14, r14, r12, r5       // ...............*..............
        asrs    r2, r4, #23             // ................*.............
        str.w   r14, [r0, #8]           // ................*.............
        mls     r4, r2, r12, r1         // .................*............
        add     r2, r11, #4194304       // .................*............
        str.w   r4, [r0, #24]           // ..................*...........
        asrs    r7, r2, #23             // ..................*...........
        mls     r8, r7, r12, r11        // ...................*..........
        subs    r10, #1                 // ...................*..........
        str     r8, [r0], #8*4          // ....................*.........

        // Reference listing: the same loop body in straightforward order,
        // kept as comments together with its position in the schedule.
                                        // ------ cycle (expected) ------>
                                        // 0                        25
                                        // |------------------------|-----
        // ldr.w r1, [r0]               // ...............*.....'.........
        // ldr.w r2, [r0, #1*4]         // *....................~.........
        // ldr.w r3, [r0, #2*4]         // ......*..............'.....~...
        // ldr.w r4, [r0, #3*4]         // ....*................'...~.....
        // ldr.w r5, [r0, #4*4]         // *....................~.........
        // ldr.w r6, [r0, #5*4]         // .*...................'~........
        // ldr.w r7, [r0, #6*4]         // .............*.......'.........
        // ldr.w r8, [r0, #7*4]         // ..*..................'.~.......
        // add r9, r1, #4194304         // .................*...'.........
        // asrs r9, r9, #23             // ..................*..'.........
        // mls r1, r9, r12, r1          // ...................*.'.........
        // add r9, r2, #4194304         // .*...................'~........
        // asrs r9, r9, #23             // .....*...............'....~....
        // mls r2, r9, r12, r2          // ......*..............'.....~...
        // add r9, r3, #4194304         // ...........*.........'.........
        // asrs r9, r9, #23             // ............*........'.........
        // mls r3, r9, r12, r3          // ...............*.....'.........
        // add r9, r4, #4194304         // .......*.............'......~..
        // asrs r9, r9, #23             // ........*............'.......~.
        // mls r4, r9, r12, r4          // .........*...........'.........
        // add r9, r5, #4194304         // .......*.............'......~..
        // asrs r9, r9, #23             // .........*...........'.........
        // mls r5, r9, r12, r5          // ............*........'.........
        // add r9, r6, #4194304         // ..*..................'.~.......
        // asrs r9, r9, #23             // ...*.................'..~......
        // mls r6, r9, r12, r6          // ....*................'...~.....
        // add r9, r7, #4194304         // ..............*......'.........
        // asrs r9, r9, #23             // ................*....'.........
        // mls r7, r9, r12, r7          // .................*...'.........
        // add r9, r8, #4194304         // ...*.................'..~......
        // asrs r9, r9, #23             // ..........*..........'.........
        // mls r8, r9, r12, r8          // ...........*.........'.........
        // str.w r2, [r0, #1*4]         // ........*............'.......~.
        // str.w r3, [r0, #2*4]         // ................*....'.........
        // str.w r4, [r0, #3*4]         // ..........*..........'.........
        // str.w r5, [r0, #4*4]         // .............*.......'.........
        // str.w r6, [r0, #5*4]         // .....*...............'....~....
        // str.w r7, [r0, #6*4]         // ..................*..'.........
        // str.w r8, [r0, #7*4]         // ..............*......'.........
        // str r1, [r0], #8*4           // ....................*'.........
        // subs r10, #1                 // ...................*.'.........

        bne     1b
        // Instructions:    0
        // Expected cycles: 0
        // Expected IPC:    0.00
        //
        // Wall time:     0.00s
        // User time:     0.00s
        //

        pop     {r4-r11, r14}
        bx      lr

.size pqcrystals_dilithium_asm_reduce32_opt_m7, .-pqcrystals_dilithium_asm_reduce32_opt_m7