From 8773678225ab03ba173ed8a9dd35fd9acb8ed9c4 Mon Sep 17 00:00:00 2001 From: "Matthias J. Kannwischer" Date: Mon, 16 Dec 2024 17:09:51 +0800 Subject: [PATCH] re-opt basemul_kyber_opt_m7 and frombytes_mul_16_32_kyber_opt_m7 --- examples/opt/armv7m/basemul_kyber_opt_m7.s | 279 ++++++++------- .../armv7m/frombytes_mul_16_32_kyber_opt_m7.s | 317 +++++++++--------- 2 files changed, 293 insertions(+), 303 deletions(-) diff --git a/examples/opt/armv7m/basemul_kyber_opt_m7.s b/examples/opt/armv7m/basemul_kyber_opt_m7.s index e60fa887..df96ad8c 100644 --- a/examples/opt/armv7m/basemul_kyber_opt_m7.s +++ b/examples/opt/armv7m/basemul_kyber_opt_m7.s @@ -46,186 +46,179 @@ basemul_asm_opt_m7: // Cycle bound: 2.0 // IPC bound: 1.50 // - // Wall time: 0.00s - // User time: 0.00s + // Wall time: 0.01s + // User time: 0.01s // // ----- cycle (expected) ------> // 0 25 // |------------------------|---- - ldr r10, [r1], #8 // *............................. - ldr.w r4, [r3], #4 // *............................. - ldr r12, [r2], #8 // .*............................ + ldr r6, [r1], #8 // *............................. + ldr.w r5, [r3], #4 // *............................. + ldr r7, [r2], #8 // .*............................ // ------ cycle (expected) ------> // 0 25 // |------------------------|----- - // ldr r10, [r1], #8 // *.............................. - // ldr.w r4, [r3], #4 // *.............................. - // ldr r12, [r2], #8 // .*............................. + // ldr r6, [r1], #8 // *.............................. + // ldr.w r5, [r3], #4 // *.............................. + // ldr r7, [r2], #8 // .*............................. - push {r14} - vmov r14, s25 - sub r14, r14, #1 - vmov s25, r14 - pop {r14} + sub loop, loop, #1 1: - // Instructions: 32 + // Instructions: 31 // Expected cycles: 20 - // Expected IPC: 1.60 + // Expected IPC: 1.55 // // Cycle bound: 19.0 - // IPC bound: 1.68 + // IPC bound: 1.63 // - // Wall time: 3.03s - // User time: 3.03s + // Wall time: 2.77s + // User time: 2.77s // // ----- cycle (expected) ------> // 0 25 // |------------------------|---- - ldr r11, [r1, #-4] // *............................. - smuadx r7, r10, r12 // *............................. - movw r6, #26632 // .*............................ - smulwt r5, r4, r12 // .*............................ - vmov s25, r14 // ..*........................... - mul r14, r7, r9 // ..*........................... - ldr r7, [r2, #-4] // ...*.......................... - smlabt r5, r5, r8, r6 // ...*.......................... - neg r4, r4 // ....*......................... - smlatt r14, r14, r8, r6 // ....*......................... - smulwt r4, r4, r7 // .....*........................ - smultt r5, r10, r5 // ......*....................... - smlabt r4, r4, r8, r6 // .......*...................... - smlabb r12, r10, r12, r5 // ........*..................... - ldr r10, [r1], #8 // .........e.................... - smultt r5, r11, r4 // .........*.................... - ldr.w r4, [r3], #4 // ..........e................... - mul r12, r12, r9 // ..........*................... - smlabb r5, r11, r7, r5 // ...........*.................. - smlatt r12, r12, r8, r6 // ............*................. - smuadx r11, r11, r7 // .............*................ - mul r5, r5, r9 // ..............*............... - pkhtb r7, r14, r12, asr #16 // ...............*.............. - mul r12, r11, r9 // ...............*.............. - vmov r14, s25 // ................*............. - smlatt r5, r5, r8, r6 // ................*............. - subs.w r14, #1 // .................*............ - smlatt r6, r12, r8, r6 // .................*............ - str r7, [r0], #4 // ..................*........... - ldr r12, [r2], #8 // ..................e........... - pkhtb r11, r6, r5, asr #16 // ...................*.......... - str r11, [r0], #4 // ...................*.......... + vmov s27, r14 // *............................. + smuadx r4, r6, r7 // *............................. + movw r12, #26632 // .*............................ + smulwt r10, r5, r7 // .*............................ + ldr r11, [r1, #-4] // ..*........................... + mul r4, r4, r9 // ..*........................... + ldr r14, [r2, #-4] // ...*.......................... + smlabt r10, r10, r8, r12 // ...*.......................... + neg r5, r5 // ....*......................... + smlatt r4, r4, r8, r12 // ....*......................... + smulwt r5, r5, r14 // .....*........................ + smultt r10, r6, r10 // ......*....................... + smlabb r7, r6, r7, r10 // .......*...................... + smlabt r6, r5, r8, r12 // ........*..................... + mul r10, r7, r9 // .........*.................... + smultt r5, r11, r6 // ..........*................... + smlabb r5, r11, r14, r5 // ...........*.................. + smlatt r10, r10, r8, r12 // ............*................. + smuadx r6, r11, r14 // .............*................ + vmov r14, s27 // ..............*............... + mul r11, r5, r9 // ..............*............... + pkhtb r10, r4, r10, asr #16 // ...............*.............. + mul r7, r6, r9 // ...............*.............. + ldr r6, [r1], #8 // ................e............. + smlatt r11, r11, r8, r12 // ................*............. + ldr.w r5, [r3], #4 // .................e............ + smlatt r4, r7, r8, r12 // .................*............ + ldr r7, [r2], #8 // ..................e........... + str r10, [r0], #4 // ..................*........... + pkhtb r10, r4, r11, asr #16 // ...................*.......... + str r10, [r0], #4 // ...................*.......... // ------ cycle (expected) ------> // 0 25 // |------------------------|----- - // vmov s0, r14 // ...........'.*................. - // movw r14, #26632 // ...........'*.................. - // ldr r5, [r1, #4] // ...........*................... - // ldr r4, [r1], #8 // e..........'........~.......... - // ldr r7, [r2, #4] // ...........'..*................ - // ldr r6, [r2], #8 // .........e.'.................~. - // ldr.w r12, [r3], #4 // .e.........'.........~......... - // smulwt r10, r12, r6 // ...........'*.................. - // smlabt r10, r10, r8, r14 // ...........'..*................ - // smultt r10, r4, r10 // ...........'.....*............. - // smlabb r10, r4, r6, r10 // ...........'.......*........... - // mul r10, r10, r9 // .~.........'.........*......... - // smlatt r10, r10, r8, r14 // ...~.......'...........*....... - // smuadx r11, r4, r6 // ...........*................... - // mul r11, r11, r9 // ...........'.*................. - // smlatt r11, r11, r8, r14 // ...........'...*............... - // pkhtb r10, r11, r10, asr #16 // ......~....'..............*.... - // str r10, [r0], #4 // .........~.'.................*. - // neg r12, r12 // ...........'...*............... - // smulwt r10, r12, r7 // ...........'....*.............. - // smlabt r10, r10, r8, r14 // ...........'......*............ - // smultt r10, r5, r10 // ~..........'........*.......... - // smlabb r10, r5, r7, r10 // ..~........'..........*........ - // mul r10, r10, r9 // .....~.....'.............*..... - // smlatt r10, r10, r8, r14 // .......~...'...............*... - // smuadx r11, r5, r7 // ....~......'............*...... - // mul r11, r11, r9 // ......~....'..............*.... - // smlatt r11, r11, r8, r14 // ........~..'................*.. - // pkhtb r10, r11, r10, asr #16 // ..........~'..................* - // str r10, [r0], #4 // ..........~'..................* - // vmov r14, s0 // .......~...'...............*... - // subs.w r14, #1 // ........~..'................*.. + // vmov s0, r14 // ....*...................~...... + // movw r14, #26632 // ....'*..................'~..... + // ldr r5, [r1, #4] // ....'.*.................'.~.... + // ldr r4, [r1], #8 // e...'...............~...'...... + // ldr r7, [r2, #4] // ....'..*................'..~... + // ldr r6, [r2], #8 // ..e.'.................~.'...... + // ldr.w r12, [r3], #4 // .e..'................~..'...... + // smulwt r10, r12, r6 // ....'*..................'~..... + // smlabt r10, r10, r8, r14 // ....'..*................'..~... + // smultt r10, r4, r10 // ....'.....*.............'...... + // smlabb r10, r4, r6, r10 // ....'......*............'...... + // mul r10, r10, r9 // ....'........*..........'...... + // smlatt r10, r10, r8, r14 // ....'...........*.......'...... + // smuadx r11, r4, r6 // ....*...................~...... + // mul r11, r11, r9 // ....'.*.................'.~.... + // smlatt r11, r11, r8, r14 // ....'...*...............'...~.. + // pkhtb r10, r11, r10, asr #16 // ....'..............*....'...... + // str r10, [r0], #4 // ..~.'.................*.'...... + // neg r12, r12 // ....'...*...............'...~.. + // smulwt r10, r12, r7 // ....'....*..............'....~. + // smlabt r10, r10, r8, r14 // ....'.......*...........'...... + // smultt r10, r5, r10 // ....'.........*.........'...... + // smlabb r10, r5, r7, r10 // ....'..........*........'...... + // mul r10, r10, r9 // ....'.............*.....'...... + // smlatt r10, r10, r8, r14 // ~...'...............*...'...... + // smuadx r11, r5, r7 // ....'............*......'...... + // mul r11, r11, r9 // ....'..............*....'...... + // smlatt r11, r11, r8, r14 // .~..'................*..'...... + // pkhtb r10, r11, r10, asr #16 // ...~'..................*'...... + // str r10, [r0], #4 // ...~'..................*'...... + // vmov r14, s0 // ....'.............*.....'...... + subs loop, #1 bne 1b - // Instructions: 29 + // Instructions: 28 // Expected cycles: 20 - // Expected IPC: 1.45 + // Expected IPC: 1.40 // // Cycle bound: 20.0 - // IPC bound: 1.45 + // IPC bound: 1.40 // - // Wall time: 0.13s - // User time: 0.13s + // Wall time: 0.08s + // User time: 0.08s // // ----- cycle (expected) ------> // 0 25 // |------------------------|---- - vmov s5, r14 // *............................. - smulwt r6, r4, r12 // *............................. - neg r4, r4 // .*............................ - smuadx r11, r10, r12 // .*............................ - movw r5, #26632 // ..*........................... - smlabt r6, r6, r8, r5 // ..*........................... - ldr r14, [r1, #-4] // ...*.......................... - mul r11, r11, r9 // ...*.......................... - ldr r7, [r2, #-4] // ....*......................... - smultt r6, r10, r6 // ....*......................... - smlabb r12, r10, r12, r6 // .....*........................ - smulwt r4, r4, r7 // ......*....................... - smlatt r6, r11, r8, r5 // .......*...................... - smlabt r4, r4, r8, r5 // ........*..................... - smuadx r11, r14, r7 // .........*.................... - smultt r4, r14, r4 // ..........*................... - smlabb r4, r14, r7, r4 // ...........*.................. - vmov r14, s5 // ............*................. - mul r12, r12, r9 // ............*................. - subs.w r14, #1 // .............*................ - mul r4, r4, r9 // .............*................ - smlatt r12, r12, r8, r5 // ..............*............... - mul r11, r11, r9 // ...............*.............. - smlatt r4, r4, r8, r5 // ................*............. - pkhtb r12, r6, r12, asr #16 // .................*............ - smlatt r6, r11, r8, r5 // .................*............ + vmov s31, r14 // *............................. + smulwt r14, r5, r7 // *............................. + neg r5, r5 // .*............................ + smuadx r12, r6, r7 // .*............................ + movw r11, #26632 // ..*........................... + smlabt r4, r14, r8, r11 // ..*........................... + vmov r14, s31 // ...*.......................... + mul r12, r12, r9 // ...*.......................... + ldr r10, [r2, #-4] // ....*......................... + smultt r4, r6, r4 // ....*......................... + smlabb r7, r6, r7, r4 // .....*........................ + ldr r4, [r1, #-4] // ......*....................... + smulwt r5, r5, r10 // ......*....................... + smlatt r12, r12, r8, r11 // .......*...................... + smlabt r5, r5, r8, r11 // ........*..................... + smuadx r6, r4, r10 // .........*.................... + smultt r5, r4, r5 // ..........*................... + smlabb r5, r4, r10, r5 // ...........*.................. + mul r7, r7, r9 // ............*................. + mul r5, r5, r9 // .............*................ + smlatt r7, r7, r8, r11 // ..............*............... + mul r4, r6, r9 // ...............*.............. + smlatt r5, r5, r8, r11 // ................*............. + pkhtb r12, r12, r7, asr #16 // .................*............ + smlatt r7, r4, r8, r11 // .................*............ str r12, [r0], #4 // ..................*........... - pkhtb r12, r6, r4, asr #16 // ...................*.......... + pkhtb r12, r7, r5, asr #16 // ...................*.......... str r12, [r0], #4 // ...................*.......... // ------ cycle (expected) ------> // 0 25 // |------------------------|----- - // ldr r11, [r1, #-4] // ...*........................... - // smuadx r7, r10, r12 // .*............................. - // movw r6, #26632 // ..*............................ - // smulwt r5, r4, r12 // *.............................. - // vmov s25, r14 // *.............................. - // mul r14, r7, r9 // ...*........................... - // ldr r7, [r2, #-4] // ....*.......................... - // smlabt r5, r5, r8, r6 // ..*............................ - // neg r4, r4 // .*............................. - // smlatt r14, r14, r8, r6 // .......*....................... - // smulwt r4, r4, r7 // ......*........................ - // smultt r5, r10, r5 // ....*.......................... - // smlabt r4, r4, r8, r6 // ........*...................... - // smlabb r12, r10, r12, r5 // .....*......................... - // smultt r5, r11, r4 // ..........*.................... - // mul r12, r12, r9 // ............*.................. - // smlabb r5, r11, r7, r5 // ...........*................... - // smlatt r12, r12, r8, r6 // ..............*................ - // smuadx r11, r11, r7 // .........*..................... - // mul r5, r5, r9 // .............*................. - // pkhtb r7, r14, r12, asr #16 // .................*............. - // mul r12, r11, r9 // ...............*............... - // vmov r14, s25 // ............*.................. - // smlatt r5, r5, r8, r6 // ................*.............. - // subs.w r14, #1 // .............*................. - // smlatt r6, r12, r8, r6 // .................*............. - // str r7, [r0], #4 // ..................*............ - // pkhtb r11, r6, r5, asr #16 // ...................*........... - // str r11, [r0], #4 // ...................*........... + // vmov s27, r14 // *.............................. + // smuadx r4, r6, r7 // .*............................. + // movw r12, #26632 // ..*............................ + // smulwt r10, r5, r7 // *.............................. + // ldr r11, [r1, #-4] // ......*........................ + // mul r4, r4, r9 // ...*........................... + // ldr r14, [r2, #-4] // ....*.......................... + // smlabt r10, r10, r8, r12 // ..*............................ + // neg r5, r5 // .*............................. + // smlatt r4, r4, r8, r12 // .......*....................... + // smulwt r5, r5, r14 // ......*........................ + // smultt r10, r6, r10 // ....*.......................... + // smlabb r7, r6, r7, r10 // .....*......................... + // smlabt r6, r5, r8, r12 // ........*...................... + // mul r10, r7, r9 // ............*.................. + // smultt r5, r11, r6 // ..........*.................... + // smlabb r5, r11, r14, r5 // ...........*................... + // smlatt r10, r10, r8, r12 // ..............*................ + // smuadx r6, r11, r14 // .........*..................... + // vmov r14, s27 // ...*........................... + // mul r11, r5, r9 // .............*................. + // pkhtb r10, r4, r10, asr #16 // .................*............. + // mul r7, r6, r9 // ...............*............... + // smlatt r11, r11, r8, r12 // ................*.............. + // smlatt r4, r7, r8, r12 // .................*............. + // str r10, [r0], #4 // ..................*............ + // pkhtb r10, r4, r11, asr #16 // ...................*........... + // str r10, [r0], #4 // ...................*........... pop {r4-r11, pc} \ No newline at end of file diff --git a/examples/opt/armv7m/frombytes_mul_16_32_kyber_opt_m7.s b/examples/opt/armv7m/frombytes_mul_16_32_kyber_opt_m7.s index b6aa25b8..d4e4203b 100644 --- a/examples/opt/armv7m/frombytes_mul_16_32_kyber_opt_m7.s +++ b/examples/opt/armv7m/frombytes_mul_16_32_kyber_opt_m7.s @@ -74,186 +74,183 @@ frombytes_mul_asm_16_32_opt_m7: movt qinv, #27560 add ctr, rptr_tmp, #64*4*4 - // Instructions: 6 - // Expected cycles: 5 - // Expected IPC: 1.20 - // - // Cycle bound: 5.0 - // IPC bound: 1.20 - // - // Wall time: 0.01s - // User time: 0.01s - // - // ----- cycle (expected) ------> - // 0 25 - // |------------------------|---- - ldrh.w r7, [r2], #6 // *............................. - ldrb.w r4, [r2, #-4] // .*............................ - ldrh.w r8, [r2, #-3] // ..*........................... - ubfx r6, r7, #12, #4 // ...*.......................... - ldr.w r5, [r3], #4 // ...*.......................... - orr r11, r6, r4, lsl #4 // ....*......................... + // Instructions: 8 + // Expected cycles: 6 + // Expected IPC: 1.33 + // + // Cycle bound: 6.0 + // IPC bound: 1.33 + // + // Wall time: 0.01s + // User time: 0.01s + // + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + ldrh.w r8, [r2], #6 // *............................. + ldrb.w r12, [r2, #-4] // *............................. + ldr.w r5, [r3], #4 // .*............................ + ldrb.w r7, [r2, #-1] // .*............................ + ubfx r4, r8, #12, #4 // ...*.......................... + ubfx r6, r8, #0, #12 // ....*......................... + ldrh.w r8, [r2, #-3] // ....*......................... + orr r12, r4, r12, lsl #4 // .....*........................ - // ------ cycle (expected) ------> - // 0 25 - // |------------------------|----- - // ldrh.w r7, [r2], #6 // *.............................. - // ldrb.w r12, [r2, #-4] // .*............................. - // ubfx r8, r7, #12, #4 // ...*........................... - // orr r11, r8, r12, lsl #4 // ....*.......................... - // ldrh.w r8, [r2, #-3] // ..*............................ - // ldr.w r5, [r3], #4 // ...*........................... + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + // ldrh.w r6, [r2], #6 // *.............................. + // ldrb.w r8, [r2, #-4] // *.............................. + // ubfx r5, r6, #12, #4 // ...*........................... + // orr r12, r5, r8, lsl #4 // .....*......................... + // ubfx r6, r6, #0, #12 // ....*.......................... + // ldrh.w r8, [r2, #-3] // ....*.......................... + // ldr.w r5, [r3], #4 // .*............................. + // ldrb.w r7, [r2, #-1] // .*............................. - sub r14, r14, #16 + sub ctr, ctr, #16 1: - // Instructions: 31 - // Expected cycles: 16 - // Expected IPC: 1.94 + // Instructions: 30 + // Expected cycles: 15 + // Expected IPC: 2.00 // - // Cycle bound: 22.0 - // IPC bound: 1.41 + // Cycle bound: 23.0 + // IPC bound: 1.30 // - // Wall time: 2.27s - // User time: 2.27s + // Wall time: 4.19s + // User time: 4.19s // // ----- cycle (expected) ------> // 0 25 // |------------------------|---- - ubfx r6, r7, #0, #12 // *............................. - ldrb.w r4, [r2, #-1] // *............................. - ldr r12, [r1], #8 // .*............................ - orr r11, r6, r11, lsl #16 // .*............................ - ubfx r6, r8, #12, #4 // ..*........................... - smulwt r7, r5, r11 // ..*........................... - orr r6, r6, r4, lsl #4 // ...*.......................... - smuadx r4, r12, r11 // ...*.......................... - ubfx r8, r8, #0, #12 // ....*......................... - smlabt r7, r7, r9, r10 // ....*......................... - orr r6, r8, r6, lsl #16 // .....*........................ - str r4, [r0, #4] // .....*........................ - neg r5, r5 // ......*....................... - smultt r8, r12, r7 // ......*....................... - ldr r4, [r1, #-4] // .......*...................... - smulwt r5, r5, r6 // .......*...................... - ldrh.w r7, [r2], #6 // ........e..................... - smlabb r11, r12, r11, r8 // ........*..................... - ldrb.w r12, [r2, #-4] // .........e.................... - smuadx r8, r4, r6 // .........*.................... - str r8, [r0, #12] // ..........*................... - cmp.w r0, r14 // ..........*................... - ubfx r8, r7, #12, #4 // ...........e.................. - smlabt r5, r5, r9, r10 // ...........*.................. - str r11, [r0], #16 // ............*................. // @slothy:core - orr r11, r8, r12, lsl #4 // ............e................. - ldrh.w r8, [r2, #-3] // .............e................ - smultt r12, r4, r5 // .............*................ - ldr.w r5, [r3], #4 // ..............e............... - smlabb r12, r4, r6, r12 // ..............*............... - str r12, [r0, #-8] // ...............*.............. + ldr r11, [r1], #8 // *............................. + orr r12, r6, r12, lsl #16 // *............................. + ubfx r4, r8, #12, #4 // .*............................ + smulwt r6, r5, r12 // .*............................ + orr r7, r4, r7, lsl #4 // ..*........................... + smuadx r4, r11, r12 // ..*........................... + ubfx r8, r8, #0, #12 // ...*.......................... + smlabt r6, r6, r9, r10 // ...*.......................... + orr r7, r8, r7, lsl #16 // ....*......................... + str r4, [r0, #4] // ....*......................... + neg r5, r5 // .....*........................ + smultt r4, r11, r6 // .....*........................ + ldrh.w r6, [r2], #6 // ......e....................... + smlabb r12, r11, r12, r4 // ......*....................... + ldrb.w r8, [r2, #-4] // .......e...................... + smulwt r11, r5, r7 // .......*...................... + ldr r4, [r1, #-4] // ........*..................... + str r12, [r0], #16 // ........*..................... // @slothy:core + ubfx r5, r6, #12, #4 // .........e.................... + smlabt r11, r11, r9, r10 // .........*.................... + orr r12, r5, r8, lsl #4 // ..........e................... + smuadx r5, r4, r7 // ..........*................... + ubfx r6, r6, #0, #12 // ...........e.................. + smultt r11, r4, r11 // ...........*.................. + ldrh.w r8, [r2, #-3] // ............e................. + str r5, [r0, #-4] // ............*................. + ldr.w r5, [r3], #4 // .............e................ + smlabb r11, r4, r7, r11 // .............*................ + str r11, [r0, #-8] // ..............*............... + ldrb.w r7, [r2, #-1] // ..............e............... // ------ cycle (expected) ------> // 0 25 // |------------------------|----- - // ldr.w r12, [r3], #4 // ......e.'.............~.'...... - // ldrb.w r6, [r2, #2] // .e......'........~......'...... - // ldrh.w r7, [r2, #3] // .....e..'............~..'...... - // ldrb.w r8, [r2, #5] // ........*...............~...... - // ldrh.w r4, [r2], #6 // e.......'.......~.......'...... - // ubfx r5, r4, #12, #4 // ...e....'..........~....'...... - // ubfx r4, r4, #0, #12 // ........*...............~...... - // orr r5, r5, r6, lsl #4 // ....e...'...........~...'...... - // orr r4, r4, r5, lsl #16 // ........'*..............'~..... - // ubfx r5, r7, #12, #4 // ........'.*.............'.~.... - // ubfx r6, r7, #0, #12 // ........'...*...........'...~.. - // orr r5, r5, r8, lsl #4 // ........'..*............'..~... - // orr r5, r6, r5, lsl #16 // ........'....*..........'....~. - // ldr r6, [r1], #8 // ........'*..............'~..... - // ldr r7, [r1, #-4] // ........'......*........'...... - // smulwt r8, r12, r4 // ........'.*.............'.~.... - // smlabt r8, r8, r9, r10 // ........'...*...........'...~.. - // smultt r8, r6, r8 // ........'.....*.........'...... - // smlabb r8, r6, r4, r8 // ~.......'.......*.......'...... - // str r8, [r0], #16 // ....~...'...........*...'...... - // smuadx r8, r6, r4 // ........'..*............'..~... - // str r8, [r0, #-12] // ........'....*..........'....~. - // neg r12, r12 // ........'.....*.........'...... - // smulwt r8, r12, r5 // ........'......*........'...... - // smlabt r8, r8, r9, r10 // ...~....'..........*....'...... - // smultt r8, r7, r8 // .....~..'............*..'...... - // smlabb r8, r7, r5, r8 // ......~.'.............*.'...... - // str r8, [r0, #-8] // .......~'..............*'...... - // smuadx r8, r7, r5 // .~......'........*......'...... - // str r8, [r0, #-4] // ..~.....'.........*.....'...... - // cmp.w r0, r14 // ..~.....'.........*.....'...... + // ldr.w r12, [r3], #4 // .......e.'............~.'...... + // ldrb.w r6, [r2, #2] // .e.......'......~.......'...... + // ldrh.w r7, [r2, #3] // ......e..'...........~..'...... + // ldrb.w r8, [r2, #5] // ........e'.............~'...... + // ldrh.w r4, [r2], #6 // e........'.....~........'...... + // ubfx r5, r4, #12, #4 // ...e.....'........~.....'...... + // ubfx r4, r4, #0, #12 // .....e...'..........~...'...... + // orr r5, r5, r6, lsl #4 // ....e....'.........~....'...... + // orr r4, r4, r5, lsl #16 // .........*..............~...... + // ubfx r5, r7, #12, #4 // .........'*.............'~..... + // ubfx r6, r7, #0, #12 // .........'..*...........'..~... + // orr r5, r5, r8, lsl #4 // .........'.*............'.~.... + // orr r5, r6, r5, lsl #16 // .........'...*..........'...~.. + // ldr r6, [r1], #8 // .........*..............~...... + // ldr r7, [r1, #-4] // ..~......'.......*......'...... + // smulwt r8, r12, r4 // .........'*.............'~..... + // smlabt r8, r8, r9, r10 // .........'..*...........'..~... + // smultt r8, r6, r8 // .........'....*.........'....~. + // smlabb r8, r6, r4, r8 // ~........'.....*........'...... + // str r8, [r0], #16 // ..~......'.......*......'...... + // smuadx r8, r6, r4 // .........'.*............'.~.... + // str r8, [r0, #-12] // .........'...*..........'...~.. + // neg r12, r12 // .........'....*.........'....~. + // smulwt r8, r12, r5 // .~.......'......*.......'...... + // smlabt r8, r8, r9, r10 // ...~.....'........*.....'...... + // smultt r8, r7, r8 // .....~...'..........*...'...... + // smlabb r8, r7, r5, r8 // .......~.'............*.'...... + // str r8, [r0, #-8] // ........~'.............*'...... + // smuadx r8, r7, r5 // ....~....'.........*....'...... + // str r8, [r0, #-4] // ......~..'...........*..'...... + cmp rptr_tmp, ctr bne 1b - // Instructions: 25 - // Expected cycles: 16 - // Expected IPC: 1.56 - // - // Cycle bound: 16.0 - // IPC bound: 1.56 - // - // Wall time: 0.08s - // User time: 0.08s - // - // ----- cycle (expected) ------> - // 0 25 - // |------------------------|---- - ubfx r12, r7, #0, #12 // *............................. - ldrb.w r4, [r2, #-1] // *............................. - orr r12, r12, r11, lsl #16 // .*............................ - ldr r7, [r1], #8 // .*............................ - ubfx r11, r8, #12, #4 // ..*........................... - smulwt r6, r5, r12 // ..*........................... - orr r11, r11, r4, lsl #4 // ...*.......................... - smuadx r4, r7, r12 // ...*.......................... - ubfx r8, r8, #0, #12 // ....*......................... - smlabt r6, r6, r9, r10 // ....*......................... - neg r5, r5 // .....*........................ - str r4, [r0, #4] // .....*........................ - cmp.w r0, r14 // ......*....................... - smultt r4, r7, r6 // ......*....................... - ldr r6, [r1, #-4] // .......*...................... - smlabb r12, r7, r12, r4 // .......*...................... - str r12, [r0], #16 // ........*..................... // @slothy:core - orr r12, r8, r11, lsl #16 // ........*..................... - smulwt r4, r5, r12 // .........*.................... - smuadx r8, r6, r12 // ..........*................... - smlabt r5, r4, r9, r10 // ...........*.................. - str r8, [r0, #-4] // ............*................. - smultt r5, r6, r5 // .............*................ - smlabb r12, r6, r12, r5 // ..............*............... - str r12, [r0, #-8] // ...............*.............. + // Instructions: 22 + // Expected cycles: 15 + // Expected IPC: 1.47 + // + // Cycle bound: 15.0 + // IPC bound: 1.47 + // + // Wall time: 0.06s + // User time: 0.06s + // + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + orr r4, r6, r12, lsl #16 // *............................. + ldr r6, [r1], #8 // *............................. + ubfx r11, r8, #12, #4 // .*............................ + smulwt r12, r5, r4 // .*............................ + orr r7, r11, r7, lsl #4 // ..*........................... + smuadx r11, r6, r4 // ..*........................... + ubfx r8, r8, #0, #12 // ...*.......................... + smlabt r12, r12, r9, r10 // ...*.......................... + orr r7, r8, r7, lsl #16 // ....*......................... + str r11, [r0, #4] // ....*......................... + neg r5, r5 // .....*........................ + smultt r11, r6, r12 // .....*........................ + ldr r12, [r1, #-4] // ......*....................... + smulwt r5, r5, r7 // ......*....................... + smlabb r8, r6, r4, r11 // .......*...................... + smlabt r5, r5, r9, r10 // ........*..................... + smuadx r4, r12, r7 // .........*.................... + smultt r5, r12, r5 // ..........*................... + smlabb r7, r12, r7, r5 // ...........*.................. + str r8, [r0], #16 // ............*................. // @slothy:core + str r4, [r0, #-4] // .............*................ + str r7, [r0, #-8] // ..............*............... // ------ cycle (expected) ------> // 0 25 // |------------------------|----- - // ubfx r6, r7, #0, #12 // *.............................. - // ldrb.w r4, [r2, #-1] // *.............................. - // ldr r12, [r1], #8 // .*............................. - // orr r11, r6, r11, lsl #16 // .*............................. - // ubfx r6, r8, #12, #4 // ..*............................ - // smulwt r7, r5, r11 // ..*............................ - // orr r6, r6, r4, lsl #4 // ...*........................... - // smuadx r4, r12, r11 // ...*........................... - // ubfx r8, r8, #0, #12 // ....*.......................... - // smlabt r7, r7, r9, r10 // ....*.......................... - // orr r6, r8, r6, lsl #16 // ........*...................... - // str r4, [r0, #4] // .....*......................... + // ldr r11, [r1], #8 // *.............................. + // orr r12, r6, r12, lsl #16 // *.............................. + // ubfx r4, r8, #12, #4 // .*............................. + // smulwt r6, r5, r12 // .*............................. + // orr r7, r4, r7, lsl #4 // ..*............................ + // smuadx r4, r11, r12 // ..*............................ + // ubfx r8, r8, #0, #12 // ...*........................... + // smlabt r6, r6, r9, r10 // ...*........................... + // orr r7, r8, r7, lsl #16 // ....*.......................... + // str r4, [r0, #4] // ....*.......................... // neg r5, r5 // .....*......................... - // smultt r8, r12, r7 // ......*........................ - // ldr r4, [r1, #-4] // .......*....................... - // smulwt r5, r5, r6 // .........*..................... - // smlabb r11, r12, r11, r8 // .......*....................... - // smuadx r8, r4, r6 // ..........*.................... - // str r8, [r0, #12] // ............*.................. - // cmp.w r0, r14 // ......*........................ - // smlabt r5, r5, r9, r10 // ...........*................... - // str r11, [r0], #16 // ........*...................... - // smultt r12, r4, r5 // .............*................. - // smlabb r12, r4, r6, r12 // ..............*................ - // str r12, [r0, #-8] // ...............*............... + // smultt r4, r11, r6 // .....*......................... + // smlabb r12, r11, r12, r4 // .......*....................... + // smulwt r11, r5, r7 // ......*........................ + // ldr r4, [r1, #-4] // ......*........................ + // str r12, [r0], #16 // ............*.................. + // smlabt r11, r11, r9, r10 // ........*...................... + // smuadx r5, r4, r7 // .........*..................... + // smultt r11, r4, r11 // ..........*.................... + // str r5, [r0, #-4] // .............*................. + // smlabb r11, r4, r7, r11 // ...........*................... + // str r11, [r0, #-8] // ..............*................ pop {r4-r11, pc}