From 8c9991dbf7b989f27ef0717ba63176c3af1c202e Mon Sep 17 00:00:00 2001 From: Amin Abdulrahman Date: Fri, 29 Nov 2024 17:26:08 +0100 Subject: [PATCH] Re-optimized Dilithium 769 iNTT * 2966 -> 2544 --- example.py | 6 +- .../opt/armv7m/intt_769_dilithium_opt_m7.s | 2066 +++++++++-------- 2 files changed, 1038 insertions(+), 1034 deletions(-) diff --git a/example.py b/example.py index f0279a26..dfb47002 100644 --- a/example.py +++ b/example.py @@ -1875,7 +1875,7 @@ def core(self, slothy): slothy.config.unsafe_address_offset_fixup = False slothy.fusion_loop("layer1234_loop", ssa=False) - slothy.config.unsafe_address_offset_fixup = True + # slothy.config.unsafe_address_offset_fixup = True slothy.optimize_loop("layer1234_loop") slothy.config.split_heuristic_optimize_seam = 6 slothy.optimize_loop("layer1234_loop") @@ -1889,12 +1889,12 @@ def core(self, slothy): slothy.config.outputs = ["s0", "s2"] slothy.config.unsafe_address_offset_fixup = False slothy.fusion_region(start="layer567_first_start", end="layer567_first_end", ssa=False) - slothy.config.unsafe_address_offset_fixup = True + # slothy.config.unsafe_address_offset_fixup = True slothy.optimize(start="layer567_first_start", end="layer567_first_end") slothy.config.unsafe_address_offset_fixup = False slothy.fusion_loop("layer567_loop", ssa=False) - slothy.config.unsafe_address_offset_fixup = True + # slothy.config.unsafe_address_offset_fixup = True slothy.optimize_loop("layer567_loop") slothy.config.split_heuristic_optimize_seam = 6 slothy.optimize_loop("layer567_loop") diff --git a/examples/opt/armv7m/intt_769_dilithium_opt_m7.s b/examples/opt/armv7m/intt_769_dilithium_opt_m7.s index 56a3947e..4903ee31 100644 --- a/examples/opt/armv7m/intt_769_dilithium_opt_m7.s +++ b/examples/opt/armv7m/intt_769_dilithium_opt_m7.s @@ -185,454 +185,456 @@ small_invntt_asm_769_opt_m7: add.w tmp, poly, #8*strincr vmov s8, tmp layer1234_loop: - // Instructions: 217 - // Expected cycles: 124 - // Expected IPC: 1.75 + // Instructions: 219 + // Expected cycles: 112 + // Expected IPC: 1.96 // - // ---------------------------------------------------- cycle (expected) -----------------------------------------------------> + // ---------------------------------------------- cycle (expected) -----------------------------------------------> // 0 25 50 75 100 - // |------------------------|------------------------|------------------------|------------------------|----------------------- - ldr.w r2, [r0, #36] // *........................................................................................................................... - ldr.w r10, [r0, #32] // *........................................................................................................................... - ldr.w r4, [r0, #40] // .*.......................................................................................................................... - ssub16.w r3, r10, r2 // .*.......................................................................................................................... - sadd16.w r11, r10, r2 // ..*......................................................................................................................... - ldr.w r14, [r0, #44] // ..*......................................................................................................................... - ldr.w r9, [r0, #60] // ...*........................................................................................................................ - sadd16.w r8, r4, r14 // ...*........................................................................................................................ - sadd16.w r6, r11, r8 // ....*....................................................................................................................... - vmov.w r10, s10 // ....*....................................................................................................................... - vmov s23, r0 // .....*...................................................................................................................... - ldr.w r5, [r0, #56] // .....*...................................................................................................................... - sadd16.w r2, r5, r9 // ......*..................................................................................................................... - ldr.w r7, [r0, #48] // ......*..................................................................................................................... - ldr.w r0, [r0, #52] // .......*.................................................................................................................... - ssub16.w r4, r4, r14 // .......*.................................................................................................................... - smulwb r14, r10, r4 // ........*................................................................................................................... - ssub16.w r11, r11, r8 // ........*................................................................................................................... - ssub16.w r9, r5, r9 // .........*.................................................................................................................. - smulwt r5, r10, r4 // .........*.................................................................................................................. - smulwt r8, r10, r9 // ..........*................................................................................................................. - sadd16.w r4, r7, r0 // ..........*................................................................................................................. - ssub16.w r7, r7, r0 // ...........*................................................................................................................ - movw r0, #24608 // ...........*................................................................................................................ - smulwb r9, r10, r9 // ............*............................................................................................................... - sadd16.w r10, r4, r2 // ............*............................................................................................................... - ssub16.w r4, r4, r2 // .............*.............................................................................................................. - smlabt r8, r8, r12, r0 // .............*.............................................................................................................. - smlabt r9, r9, r12, r0 // ..............*............................................................................................................. - sadd16.w r2, r6, r10 // ..............*............................................................................................................. - ssub16.w r6, r6, r10 // ...............*............................................................................................................ - smlabt r14, r14, r12, r0 // ...............*............................................................................................................ - vmov.w r10, s12 // ................*........................................................................................................... - pkhtb r9, r8, r9, asr #16 // ................*........................................................................................................... - smlabt r8, r5, r12, r0 // .................*.......................................................................................................... - uadd16 r5, r7, r9 // .................*.......................................................................................................... - usub16 r9, r7, r9 // ..................*......................................................................................................... - smulwt r7, r10, r5 // ..................*......................................................................................................... - pkhtb r8, r8, r14, asr #16 // ...................*........................................................................................................ - smulwb r14, r10, r5 // ...................*........................................................................................................ - usub16 r5, r3, r8 // ....................*....................................................................................................... - smlabt r7, r7, r12, r0 // ....................*....................................................................................................... - vmov.w r10, s13 // .....................*...................................................................................................... - uadd16 r3, r3, r8 // .....................*...................................................................................................... - smulwt r8, r10, r4 // ......................*..................................................................................................... - smulwb r10, r10, r4 // .......................*.................................................................................................... - vmov.w r4, s14 // ........................*................................................................................................... - smlabt r8, r8, r12, r0 // ........................*................................................................................................... - smlabt r10, r10, r12, r0 // .........................*.................................................................................................. - smlabt r14, r14, r12, r0 // ..........................*................................................................................................. - pkhtb r8, r8, r10, asr #16 // ...........................*................................................................................................ - smulwt r10, r4, r9 // ...........................*................................................................................................ - pkhtb r7, r7, r14, asr #16 // ............................*............................................................................................... - smulwb r14, r4, r9 // ............................*............................................................................................... - uadd16 r9, r3, r7 // .............................*.............................................................................................. - smlabt r10, r10, r12, r0 // .............................*.............................................................................................. - vmov r4, s16 // ..............................*............................................................................................. - smlabt r14, r14, r12, r0 // ..............................*............................................................................................. - usub16 r7, r3, r7 // ...............................*............................................................................................ - smulwb r3, r4, r9 // ................................*........................................................................................... - pkhtb r10, r10, r14, asr #16 // ................................*........................................................................................... - smulwt r14, r4, r9 // .................................*.......................................................................................... - vmov r9, s17 // .................................*.......................................................................................... - smlabt r3, r3, r12, r0 // ..................................*......................................................................................... - sadd16.w r4, r11, r8 // ..................................*......................................................................................... - ssub16.w r11, r11, r8 // ...................................*........................................................................................ - smulwt r8, r9, r4 // ...................................*........................................................................................ - smulwb r4, r9, r4 // ....................................*....................................................................................... - usub16 r9, r5, r10 // ....................................*....................................................................................... - vmov s0, r2 // .....................................*...................................................................................... - smlabt r8, r8, r12, r0 // .....................................*...................................................................................... - smlabt r2, r4, r12, r0 // ......................................*..................................................................................... - vmov r4, s20 // ......................................*..................................................................................... - uadd16 r5, r5, r10 // .......................................*.................................................................................... - smulwt r10, r4, r7 // .......................................*.................................................................................... - pkhtb r8, r8, r2, asr #16 // ........................................*................................................................................... - smlabt r14, r14, r12, r0 // ........................................*................................................................................... - vmov s2, r8 // .........................................*.................................................................................. - smlabt r10, r10, r12, r0 // .........................................*.................................................................................. - vmov r2, s19 // ..........................................*................................................................................. - smulwb r4, r4, r7 // ..........................................*................................................................................. - smlabt r4, r4, r12, r0 // ............................................*............................................................................... - smulwt r8, r2, r6 // .............................................*.............................................................................. - pkhtb r14, r14, r3, asr #16 // .............................................*.............................................................................. - smulwb r3, r2, r6 // ..............................................*............................................................................. - vmov s1, r14 // ..............................................*............................................................................. - vmov r2, s22 // ...............................................*............................................................................ - smlabt r8, r8, r12, r0 // ...............................................*............................................................................ - smlabt r14, r3, r12, r0 // ................................................*........................................................................... - vmov r7, s18 // ................................................*........................................................................... - smulwb r6, r7, r5 // .................................................*.......................................................................... - pkhtb r10, r10, r4, asr #16 // .................................................*.......................................................................... - pkhtb r3, r8, r14, asr #16 // ..................................................*......................................................................... - vmov r8, s23 // ..................................................*......................................................................... - smlabt r6, r6, r12, r0 // ...................................................*........................................................................ - ldr.w r4, [r8, #16] // ...................................................*........................................................................ - smulwt r5, r7, r5 // ....................................................*....................................................................... - vmov s5, r10 // ....................................................*....................................................................... - smulwb r10, r2, r9 // .....................................................*...................................................................... - vmov s4, r3 // .....................................................*...................................................................... - ldr.w r14, [r8, #20] // ......................................................*..................................................................... - smulwt r9, r2, r9 // ......................................................*..................................................................... - ldr.w r3, [r8, #0] // .......................................................*.................................................................... - smlabt r2, r10, r12, r0 // .......................................................*.................................................................... - ssub16.w r7, r4, r14 // ........................................................*................................................................... - smlabt r10, r9, r12, r0 // ........................................................*................................................................... - smlabt r5, r5, r12, r0 // .........................................................*.................................................................. - vmov r9, s21 // .........................................................*.................................................................. - sadd16.w r14, r4, r14 // ..........................................................*................................................................. - smulwb r4, r9, r11 // ..........................................................*................................................................. - pkhtb r5, r5, r6, asr #16 // ...........................................................*................................................................ - smulwt r9, r9, r11 // ...........................................................*................................................................ - smlabt r11, r4, r12, r0 // ............................................................*............................................................... - pkhtb r6, r10, r2, asr #16 // ............................................................*............................................................... - smlabt r10, r9, r12, r0 // .............................................................*.............................................................. - ldr.w r0, [r8, #4] // .............................................................*.............................................................. - ldr.w r4, [r8, #24] // ..............................................................*............................................................. - sadd16.w r9, r3, r0 // ..............................................................*............................................................. - pkhtb r10, r10, r11, asr #16 // ...............................................................*............................................................ - vmov s3, r5 // ...............................................................*............................................................ - ssub16.w r3, r3, r0 // ................................................................*........................................................... - ldr.w r5, [r8, #28] // ................................................................*........................................................... - ldr.w r0, [r8, #12] // .................................................................*.......................................................... - ssub16.w r2, r4, r5 // .................................................................*.......................................................... - ldr.w r11, [r8, #8] // ..................................................................*......................................................... - sadd16.w r8, r4, r5 // ..................................................................*......................................................... - vmov s6, r10 // ...................................................................*........................................................ - ssub16.w r5, r11, r0 // ...................................................................*........................................................ - vmov s7, r6 // ....................................................................*....................................................... - sadd16.w r6, r14, r8 // ....................................................................*....................................................... - ssub16.w r14, r14, r8 // .....................................................................*...................................................... - vmov.w r4, s10 // .....................................................................*...................................................... - sadd16.w r10, r11, r0 // ......................................................................*..................................................... - smulwb r8, r4, r5 // ......................................................................*..................................................... - smulwt r5, r4, r5 // .......................................................................*.................................................... - ssub16.w r11, r9, r10 // .......................................................................*.................................................... - sadd16.w r9, r9, r10 // ........................................................................*................................................... - smulwt r10, r4, r2 // ........................................................................*................................................... - movw r0, #24608 // .........................................................................*.................................................. - smulwb r4, r4, r2 // .........................................................................*.................................................. - smlabt r10, r10, r12, r0 // ..........................................................................*................................................. - smlabt r4, r4, r12, r0 // ...........................................................................*................................................ - sadd16.w r2, r9, r6 // ...........................................................................*................................................ - smlabt r8, r8, r12, r0 // ............................................................................*............................................... - ssub16.w r6, r9, r6 // ............................................................................*............................................... - pkhtb r9, r10, r4, asr #16 // .............................................................................*.............................................. - smlabt r10, r5, r12, r0 // ..............................................................................*............................................. - uadd16 r4, r7, r9 // ..............................................................................*............................................. - usub16 r9, r7, r9 // ...............................................................................*............................................ - vmov.w r5, s12 // ...............................................................................*............................................ - pkhtb r8, r10, r8, asr #16 // ................................................................................*........................................... - smulwb r10, r5, r4 // ................................................................................*........................................... - smulwt r5, r5, r4 // .................................................................................*.......................................... - vmov.w r4, s13 // .................................................................................*.......................................... - smlabt r10, r10, r12, r0 // ..................................................................................*......................................... - smlabt r7, r5, r12, r0 // ...................................................................................*........................................ - usub16 r5, r3, r8 // ...................................................................................*........................................ - uadd16 r3, r3, r8 // ....................................................................................*....................................... - smulwb r8, r4, r14 // ....................................................................................*....................................... - pkhtb r7, r7, r10, asr #16 // .....................................................................................*...................................... - smulwt r14, r4, r14 // .....................................................................................*...................................... - smlabt r4, r8, r12, r0 // ......................................................................................*..................................... - uadd16 r8, r3, r7 // ......................................................................................*..................................... - smlabt r10, r14, r12, r0 // .......................................................................................*.................................... - vmov.w r14, s14 // .......................................................................................*.................................... - usub16 r7, r3, r7 // ........................................................................................*................................... - smulwb r3, r14, r9 // ........................................................................................*................................... - pkhtb r4, r10, r4, asr #16 // .........................................................................................*.................................. - smulwt r10, r14, r9 // .........................................................................................*.................................. - vmov r14, s1 // ..........................................................................................*................................. - smlabt r9, r3, r12, r0 // ..........................................................................................*................................. - usub16 r3, r8, r14 // ...........................................................................................*................................ - uadd16 r8, r8, r14 // ............................................................................................*............................... - smlabt r10, r10, r12, r0 // ............................................................................................*............................... - vmov r0, s23 // .............................................................................................*.............................. - str.w r3, [r0, #36] // .............................................................................................*.............................. - vmov r14, s3 // ..............................................................................................*............................. - pkhtb r10, r10, r9, asr #16 // ..............................................................................................*............................. - str.w r8, [r0, #4] // ...............................................................................................*............................ - uadd16 r9, r5, r10 // ...............................................................................................*............................ - uadd16 r8, r9, r14 // ................................................................................................*........................... - vmov r3, s5 // ................................................................................................*........................... - usub16 r5, r5, r10 // .................................................................................................*.......................... - str.w r8, [r0, #12] // .................................................................................................*.......................... - uadd16 r8, r7, r3 // ..................................................................................................*......................... - vmov r10, s0 // ..................................................................................................*......................... - str.w r8, [r0, #20] // ...................................................................................................*........................ - ssub16.w r8, r11, r4 // ...................................................................................................*........................ - sadd16.w r4, r11, r4 // ....................................................................................................*....................... - usub16 r11, r2, r10 // .....................................................................................................*...................... - str.w r11, [r0, #32] // .....................................................................................................*...................... - vmov r11, s2 // ......................................................................................................*..................... - usub16 r3, r7, r3 // ......................................................................................................*..................... - str.w r3, [r0, #52] // .......................................................................................................*.................... - vmov r7, s7 // .......................................................................................................*.................... - usub16 r14, r9, r14 // ........................................................................................................*................... - uadd16 r3, r5, r7 // .........................................................................................................*.................. - str.w r3, [r0, #28] // .........................................................................................................*.................. - usub16 r3, r5, r7 // ..........................................................................................................*................. - str.w r3, [r0, #60] // ...........................................................................................................*................ - vmov r3, s4 // ...........................................................................................................*................ - usub16 r9, r6, r3 // ............................................................................................................*............... - str.w r14, [r0, #44] // .............................................................................................................*.............. - uadd16 r3, r6, r3 // .............................................................................................................*.............. - str.w r3, [r0, #16] // ...............................................................................................................*............ - usub16 r3, r4, r11 // ...............................................................................................................*............ - uadd16 r11, r4, r11 // ................................................................................................................*........... - str.w r3, [r0, #40] // .................................................................................................................*.......... - uadd16 r3, r2, r10 // .................................................................................................................*.......... - str.w r3, [r0], #64 // ...................................................................................................................*........ // @slothy:core - vmov r3, s6 // ...................................................................................................................*........ - usub16 r10, r8, r3 // ....................................................................................................................*....... - str.w r11, [r0, #-56] // .....................................................................................................................*...... - uadd16 r11, r8, r3 // .....................................................................................................................*...... - str.w r9, [r0, #-16] // .......................................................................................................................*.... - str.w r10, [r0, #-8] // .........................................................................................................................*.. - str.w r11, [r0, #-40] // ...........................................................................................................................* + // |------------------------|------------------------|------------------------|------------------------|----------- + ldr.w r7, [r0, #48] // *............................................................................................................... + ldr.w r14, [r0, #52] // *............................................................................................................... + sadd16.w r4, r7, r14 // .*.............................................................................................................. + ldr.w r11, [r0, #44] // .*.............................................................................................................. + ssub16.w r7, r7, r14 // ..*............................................................................................................. + ldr.w r8, [r0, #40] // ..*............................................................................................................. + ssub16.w r10, r8, r11 // ...*............................................................................................................ + ldr.w r3, [r0, #36] // ...*............................................................................................................ + sadd16.w r5, r8, r11 // ....*........................................................................................................... + ldr.w r2, [r0, #32] // ....*........................................................................................................... + sadd16.w r8, r2, r3 // .....*.......................................................................................................... + vmov s23, r0 // .....*.......................................................................................................... + ssub16.w r11, r8, r5 // ......*......................................................................................................... + ldr.w r9, [r0, #56] // ......*......................................................................................................... + ssub16.w r3, r2, r3 // .......*........................................................................................................ + ldr.w r14, [r0, #60] // .......*........................................................................................................ + sadd16.w r0, r9, r14 // ........*....................................................................................................... + vmov.w r2, s10 // ........*....................................................................................................... + ssub16.w r9, r9, r14 // .........*...................................................................................................... + smulwb r14, r2, r10 // .........*...................................................................................................... + sadd16.w r6, r8, r5 // ..........*..................................................................................................... + smulwt r8, r2, r10 // ..........*..................................................................................................... + sadd16.w r5, r4, r0 // ...........*.................................................................................................... + smulwb r10, r2, r9 // ...........*.................................................................................................... + ssub16.w r4, r4, r0 // ............*................................................................................................... + smulwt r9, r2, r9 // ............*................................................................................................... + movw r0, #24608 // .............*.................................................................................................. + smlabt r10, r10, r12, r0 // .............*.................................................................................................. + sadd16.w r2, r6, r5 // ..............*................................................................................................. + smlabt r9, r9, r12, r0 // ..............*................................................................................................. + ssub16.w r6, r6, r5 // ...............*................................................................................................ + smlabt r5, r8, r12, r0 // ...............*................................................................................................ + pkhtb r9, r9, r10, asr #16 // ................*............................................................................................... + smlabt r10, r14, r12, r0 // ................*............................................................................................... + uadd16 r8, r7, r9 // .................*.............................................................................................. + vmov.w r14, s12 // .................*.............................................................................................. + usub16 r9, r7, r9 // ..................*............................................................................................. + smulwb r7, r14, r8 // ..................*............................................................................................. + pkhtb r10, r5, r10, asr #16 // ...................*............................................................................................ + smulwt r14, r14, r8 // ...................*............................................................................................ + usub16 r5, r3, r10 // ....................*........................................................................................... + smlabt r7, r7, r12, r0 // ....................*........................................................................................... + vmov.w r8, s13 // .....................*.......................................................................................... + smlabt r14, r14, r12, r0 // .....................*.......................................................................................... + uadd16 r10, r3, r10 // ......................*......................................................................................... + smulwb r3, r8, r4 // ......................*......................................................................................... + pkhtb r14, r14, r7, asr #16 // .......................*........................................................................................ + smulwt r4, r8, r4 // .......................*........................................................................................ + usub16 r7, r10, r14 // ........................*....................................................................................... + smlabt r3, r3, r12, r0 // ........................*....................................................................................... + vmov.w r8, s14 // .........................*...................................................................................... + smlabt r4, r4, r12, r0 // .........................*...................................................................................... + uadd16 r14, r10, r14 // ..........................*..................................................................................... + smulwb r10, r8, r9 // ..........................*..................................................................................... + pkhtb r4, r4, r3, asr #16 // ...........................*.................................................................................... + smulwt r3, r8, r9 // ...........................*.................................................................................... + vmov r9, s16 // ............................*................................................................................... + smlabt r10, r10, r12, r0 // ............................*................................................................................... + ssub16.w r8, r11, r4 // .............................*.................................................................................. + smlabt r3, r3, r12, r0 // .............................*.................................................................................. + sadd16.w r4, r11, r4 // ..............................*................................................................................. + smulwb r11, r9, r14 // ..............................*................................................................................. + pkhtb r10, r3, r10, asr #16 // ...............................*................................................................................ + smulwt r9, r9, r14 // ...............................*................................................................................ + smlabt r14, r11, r12, r0 // ................................*............................................................................... + vmov r11, s17 // .................................*.............................................................................. + smlabt r3, r9, r12, r0 // .................................*.............................................................................. + smulwb r9, r11, r4 // ..................................*............................................................................. + pkhtb r3, r3, r14, asr #16 // ...................................*............................................................................ + smulwt r4, r11, r4 // ...................................*............................................................................ + uadd16 r11, r5, r10 // ....................................*........................................................................... + smlabt r9, r9, r12, r0 // ....................................*........................................................................... + vmov r14, s18 // .....................................*.......................................................................... + smlabt r4, r4, r12, r0 // .....................................*.......................................................................... + usub16 r5, r5, r10 // ......................................*......................................................................... + smulwt r10, r14, r11 // ......................................*......................................................................... + pkhtb r4, r4, r9, asr #16 // .......................................*........................................................................ + smulwb r14, r14, r11 // .......................................*........................................................................ + vmov r9, s19 // ........................................*....................................................................... + smlabt r10, r10, r12, r0 // ........................................*....................................................................... + vmov s2, r4 // .........................................*...................................................................... + smulwt r4, r9, r6 // .........................................*...................................................................... + vmov s1, r3 // ..........................................*..................................................................... + smlabt r14, r14, r12, r0 // ..........................................*..................................................................... + vmov r3, s22 // ...........................................*.................................................................... + smulwb r9, r9, r6 // ...........................................*.................................................................... + vmov r6, s20 // ............................................*................................................................... + smlabt r11, r4, r12, r0 // ............................................*................................................................... + pkhtb r14, r10, r14, asr #16 // .............................................*.................................................................. + smulwb r4, r6, r7 // .............................................*.................................................................. + vmov r10, s21 // ..............................................*................................................................. + smlabt r9, r9, r12, r0 // ..............................................*................................................................. + vmov s3, r14 // ...............................................*................................................................ + smulwt r6, r6, r7 // ...............................................*................................................................ + vmov r7, s23 // ................................................*............................................................... + smlabt r14, r4, r12, r0 // ................................................*............................................................... + pkhtb r11, r11, r9, asr #16 // .................................................*.............................................................. + smlabt r6, r6, r12, r0 // .................................................*.............................................................. + vmov s4, r11 // ..................................................*............................................................. + smulwb r9, r3, r5 // ..................................................*............................................................. + pkhtb r6, r6, r14, asr #16 // ...................................................*............................................................ + smulwt r4, r10, r8 // ...................................................*............................................................ + ldr.w r11, [r7, #4] // ....................................................*........................................................... + smlabt r14, r9, r12, r0 // ....................................................*........................................................... + vmov s5, r6 // .....................................................*.......................................................... + smulwt r3, r3, r5 // .....................................................*.......................................................... + ldr.w r9, [r7, #24] // ......................................................*......................................................... + smulwb r6, r10, r8 // ......................................................*......................................................... + vmov s0, r2 // .......................................................*........................................................ + smlabt r10, r3, r12, r0 // .......................................................*........................................................ + ldr.w r5, [r7, #28] // ........................................................*....................................................... + smlabt r8, r4, r12, r0 // ........................................................*....................................................... + sadd16.w r4, r9, r5 // .........................................................*...................................................... + smlabt r6, r6, r12, r0 // .........................................................*...................................................... + ssub16.w r9, r9, r5 // ..........................................................*..................................................... + ldr.w r2, [r7, #0] // ..........................................................*..................................................... + ssub16.w r3, r2, r11 // ...........................................................*.................................................... + ldr.w r0, [r7, #16] // ...........................................................*.................................................... + pkhtb r5, r10, r14, asr #16 // ............................................................*................................................... + ldr.w r10, [r7, #20] // ............................................................*................................................... + sadd16.w r11, r2, r11 // .............................................................*.................................................. + ldr.w r2, [r7, #8] // .............................................................*.................................................. + pkhtb r8, r8, r6, asr #16 // ..............................................................*................................................. + ldr.w r6, [r7, #12] // ..............................................................*................................................. + ssub16.w r7, r0, r10 // ...............................................................*................................................ + vmov s7, r5 // ...............................................................*................................................ + sadd16.w r5, r2, r6 // ................................................................*............................................... + vmov s6, r8 // ................................................................*............................................... + sadd16.w r8, r11, r5 // .................................................................*.............................................. + sadd16.w r10, r0, r10 // ..................................................................*............................................. + vmov.w r14, s10 // ..................................................................*............................................. + ssub16.w r6, r2, r6 // ...................................................................*............................................ + smulwb r2, r14, r9 // ...................................................................*............................................ + ssub16.w r11, r11, r5 // ....................................................................*........................................... + smulwt r5, r14, r9 // ....................................................................*........................................... + movw r0, #24608 // .....................................................................*.......................................... + smlabt r9, r2, r12, r0 // .....................................................................*.......................................... + sadd16.w r2, r10, r4 // ......................................................................*......................................... + smlabt r5, r5, r12, r0 // ......................................................................*......................................... + ssub16.w r4, r10, r4 // .......................................................................*........................................ + smulwb r10, r14, r6 // .......................................................................*........................................ + pkhtb r9, r5, r9, asr #16 // ........................................................................*....................................... + smulwt r14, r14, r6 // ........................................................................*....................................... + ssub16.w r6, r8, r2 // .........................................................................*...................................... + smlabt r5, r10, r12, r0 // .........................................................................*...................................... + sadd16.w r2, r8, r2 // ..........................................................................*..................................... + smlabt r10, r14, r12, r0 // ..........................................................................*..................................... + uadd16 r14, r7, r9 // ...........................................................................*.................................... + vmov.w r8, s12 // ...........................................................................*.................................... + pkhtb r5, r10, r5, asr #16 // ............................................................................*................................... + smulwb r10, r8, r14 // ............................................................................*................................... + usub16 r7, r7, r9 // .............................................................................*.................................. + smulwt r8, r8, r14 // .............................................................................*.................................. + vmov.w r9, s14 // ..............................................................................*................................. + smlabt r10, r10, r12, r0 // ..............................................................................*................................. + uadd16 r14, r3, r5 // ...............................................................................*................................ + smlabt r8, r8, r12, r0 // ...............................................................................*................................ + usub16 r5, r3, r5 // ................................................................................*............................... + smulwb r3, r9, r7 // ................................................................................*............................... + pkhtb r8, r8, r10, asr #16 // .................................................................................*.............................. + smulwt r10, r9, r7 // .................................................................................*.............................. + usub16 r7, r14, r8 // ..................................................................................*............................. + smlabt r9, r3, r12, r0 // ..................................................................................*............................. + vmov.w r3, s13 // ...................................................................................*............................ + smlabt r10, r10, r12, r0 // ...................................................................................*............................ + uadd16 r8, r14, r8 // ....................................................................................*........................... + smulwt r14, r3, r4 // ....................................................................................*........................... + pkhtb r9, r10, r9, asr #16 // .....................................................................................*.......................... + smulwb r10, r3, r4 // .....................................................................................*.......................... + uadd16 r4, r5, r9 // ......................................................................................*......................... + vmov r3, s1 // ......................................................................................*......................... + usub16 r9, r5, r9 // .......................................................................................*........................ + smlabt r10, r10, r12, r0 // .......................................................................................*........................ + usub16 r5, r8, r3 // ........................................................................................*....................... + smlabt r14, r14, r12, r0 // ........................................................................................*....................... + uadd16 r3, r8, r3 // .........................................................................................*...................... + vmov r0, s23 // .........................................................................................*...................... + pkhtb r10, r14, r10, asr #16 // ..........................................................................................*..................... + vmov r14, s3 // ..........................................................................................*..................... + usub16 r8, r4, r14 // ...........................................................................................*.................... + str.w r8, [r0, #44] // ...........................................................................................*.................... + ssub16.w r8, r11, r10 // ............................................................................................*................... + str.w r5, [r0, #36] // ............................................................................................*................... + sadd16.w r10, r11, r10 // .............................................................................................*.................. + str.w r3, [r0, #4] // .............................................................................................*.................. + uadd16 r11, r4, r14 // ..............................................................................................*................. + vmov r14, s7 // ..............................................................................................*................. + uadd16 r5, r9, r14 // ...............................................................................................*................ + str.w r11, [r0, #12] // ...............................................................................................*................ + usub16 r14, r9, r14 // ................................................................................................*............... + vmov r11, s5 // ................................................................................................*............... + str.w r14, [r0, #60] // .................................................................................................*.............. + vmov r14, s4 // .................................................................................................*.............. + uadd16 r9, r7, r11 // ..................................................................................................*............. + str.w r5, [r0, #28] // ..................................................................................................*............. + usub16 r7, r7, r11 // ...................................................................................................*............ + str.w r7, [r0, #52] // ...................................................................................................*............ + str.w r9, [r0, #20] // ....................................................................................................*........... + vmov r7, s2 // ....................................................................................................*........... + uadd16 r5, r10, r7 // .....................................................................................................*.......... + str.w r5, [r0, #8] // .....................................................................................................*.......... + usub16 r7, r10, r7 // ......................................................................................................*......... + vmov r4, s0 // ......................................................................................................*......... + uadd16 r10, r2, r4 // .......................................................................................................*........ + str.w r7, [r0, #40] // .......................................................................................................*........ + usub16 r7, r2, r4 // ........................................................................................................*....... + vmov r9, s6 // ........................................................................................................*....... + uadd16 r2, r8, r9 // .........................................................................................................*...... + str.w r2, [r0, #24] // .........................................................................................................*...... + usub16 r4, r8, r9 // ..........................................................................................................*..... + str.w r4, [r0, #56] // ..........................................................................................................*..... + uadd16 r8, r6, r14 // ...........................................................................................................*.... + str.w r7, [r0, #32] // ...........................................................................................................*.... + usub16 r7, r6, r14 // ............................................................................................................*... + str.w r8, [r0, #16] // ............................................................................................................*... + str.w r7, [r0, #48] // .............................................................................................................*.. + str.w r10, [r0], #64 // ..............................................................................................................*. // @slothy:core + vmov r10, s8 // ..............................................................................................................*. + cmp.w r0, r10 // ...............................................................................................................* - // ---------------------------------------------------- cycle (expected) -----------------------------------------------------> + // ---------------------------------------------- cycle (expected) -----------------------------------------------> // 0 25 50 75 100 - // |------------------------|------------------------|------------------------|------------------------|----------------------- - // ldr.w r8, [r0, #32] // *........................................................................................................................... - // ldr.w r11, [r0, #36] // *........................................................................................................................... - // ssub16.w r3, r8, r11 // .*.......................................................................................................................... - // ldr.w r2, [r0, #60] // ...*........................................................................................................................ - // ldr.w r7, [r0, #52] // .......*.................................................................................................................... - // ldr.w r10, [r0, #56] // .....*...................................................................................................................... - // ssub16.w r9, r10, r2 // .........*.................................................................................................................. - // ldr.w r14, [r0, #48] // ......*..................................................................................................................... - // vmov s23, r0 // .....*...................................................................................................................... - // sadd16.w r5, r10, r2 // ......*..................................................................................................................... - // sadd16.w r4, r14, r7 // ..........*................................................................................................................. - // ldr.w r2, [r0, #40] // .*.......................................................................................................................... - // sadd16.w r6, r4, r5 // ............*............................................................................................................... - // ssub16.w r7, r14, r7 // ...........*................................................................................................................ - // ldr.w r14, [r0, #44] // ..*......................................................................................................................... - // vmov.w r10, s10 // ....*....................................................................................................................... - // ssub16.w r0, r2, r14 // .......*.................................................................................................................... - // sadd16.w r2, r2, r14 // ...*........................................................................................................................ - // smulwb r14, r10, r0 // ........*................................................................................................................... - // ssub16.w r4, r4, r5 // .............*.............................................................................................................. - // smulwt r5, r10, r0 // .........*.................................................................................................................. - // movw r0, #24608 // ...........*................................................................................................................ - // sadd16.w r11, r8, r11 // ..*......................................................................................................................... - // smulwt r8, r10, r9 // ..........*................................................................................................................. - // smulwb r9, r10, r9 // ............*............................................................................................................... - // sadd16.w r10, r11, r2 // ....*....................................................................................................................... - // ssub16.w r11, r11, r2 // ........*................................................................................................................... - // smlabt r8, r8, r12, r0 // .............*.............................................................................................................. - // sadd16.w r2, r10, r6 // ..............*............................................................................................................. - // smlabt r9, r9, r12, r0 // ..............*............................................................................................................. - // smlabt r14, r14, r12, r0 // ...............*............................................................................................................ - // ssub16.w r6, r10, r6 // ...............*............................................................................................................ - // pkhtb r9, r8, r9, asr #16 // ................*........................................................................................................... - // vmov.w r8, s12 // ................*........................................................................................................... - // smlabt r5, r5, r12, r0 // .................*.......................................................................................................... - // uadd16 r10, r7, r9 // .................*.......................................................................................................... - // usub16 r9, r7, r9 // ..................*......................................................................................................... - // smulwb r7, r8, r10 // ...................*........................................................................................................ - // pkhtb r5, r5, r14, asr #16 // ...................*........................................................................................................ - // smulwt r14, r8, r10 // ..................*......................................................................................................... - // smlabt r8, r7, r12, r0 // ..........................*................................................................................................. - // uadd16 r7, r3, r5 // .....................*...................................................................................................... - // usub16 r5, r3, r5 // ....................*....................................................................................................... - // smlabt r10, r14, r12, r0 // ....................*....................................................................................................... - // vmov.w r14, s13 // .....................*...................................................................................................... - // smulwb r3, r14, r4 // .......................*.................................................................................................... - // pkhtb r8, r10, r8, asr #16 // ............................*............................................................................................... - // smulwt r14, r14, r4 // ......................*..................................................................................................... - // smlabt r4, r3, r12, r0 // .........................*.................................................................................................. - // uadd16 r10, r7, r8 // .............................*.............................................................................................. - // smlabt r14, r14, r12, r0 // ........................*................................................................................................... - // vmov.w r3, s14 // ........................*................................................................................................... - // usub16 r7, r7, r8 // ...............................*............................................................................................ - // smulwb r8, r3, r9 // ............................*............................................................................................... - // smulwt r9, r3, r9 // ...........................*................................................................................................ - // pkhtb r4, r14, r4, asr #16 // ...........................*................................................................................................ - // smlabt r3, r8, r12, r0 // ..............................*............................................................................................. - // ssub16.w r8, r11, r4 // ...................................*........................................................................................ - // smlabt r9, r9, r12, r0 // .............................*.............................................................................................. - // vmov r14, s16 // ..............................*............................................................................................. - // sadd16.w r4, r11, r4 // ..................................*......................................................................................... - // smulwb r11, r14, r10 // ................................*........................................................................................... - // smulwt r14, r14, r10 // .................................*.......................................................................................... - // pkhtb r10, r9, r3, asr #16 // ................................*........................................................................................... - // smlabt r3, r11, r12, r0 // ..................................*......................................................................................... - // smlabt r9, r14, r12, r0 // ........................................*................................................................................... - // vmov r14, s17 // .................................*.......................................................................................... - // smulwb r11, r14, r4 // ....................................*....................................................................................... - // smulwt r4, r14, r4 // ...................................*........................................................................................ - // pkhtb r3, r9, r3, asr #16 // .............................................*.............................................................................. - // smlabt r14, r11, r12, r0 // ......................................*..................................................................................... - // usub16 r9, r5, r10 // ....................................*....................................................................................... - // smlabt r4, r4, r12, r0 // .....................................*...................................................................................... - // vmov r11, s19 // ..........................................*................................................................................. - // uadd16 r5, r5, r10 // .......................................*.................................................................................... - // smulwb r10, r11, r6 // ..............................................*............................................................................. - // smulwt r6, r11, r6 // .............................................*.............................................................................. - // vmov r11, s20 // ......................................*..................................................................................... - // pkhtb r4, r4, r14, asr #16 // ........................................*................................................................................... - // smulwb r14, r11, r7 // ..........................................*................................................................................. - // smlabt r6, r6, r12, r0 // ...............................................*............................................................................ - // smlabt r10, r10, r12, r0 // ................................................*........................................................................... - // smulwt r7, r11, r7 // .......................................*.................................................................................... - // vmov r11, s18 // ................................................*........................................................................... - // pkhtb r6, r6, r10, asr #16 // ..................................................*......................................................................... - // smulwt r10, r11, r5 // ....................................................*....................................................................... - // smulwb r11, r11, r5 // .................................................*.......................................................................... - // smlabt r5, r10, r12, r0 // .........................................................*.................................................................. - // vmov s0, r2 // .....................................*...................................................................................... - // smlabt r11, r11, r12, r0 // ...................................................*........................................................................ - // vmov s1, r3 // ..............................................*............................................................................. - // smlabt r3, r14, r12, r0 // ............................................*............................................................................... - // vmov s2, r4 // .........................................*.................................................................................. - // smlabt r10, r7, r12, r0 // .........................................*.................................................................................. - // vmov r4, s21 // .........................................................*.................................................................. - // vmov s4, r6 // .....................................................*...................................................................... - // smulwb r6, r4, r8 // ..........................................................*................................................................. - // smulwt r8, r4, r8 // ...........................................................*................................................................ - // vmov r4, s22 // ...............................................*............................................................................ - // pkhtb r5, r5, r11, asr #16 // ...........................................................*................................................................ - // smulwb r11, r4, r9 // .....................................................*...................................................................... - // smulwt r4, r4, r9 // ......................................................*..................................................................... - // pkhtb r3, r10, r3, asr #16 // .................................................*.......................................................................... - // smlabt r10, r6, r12, r0 // ............................................................*............................................................... - // vmov s3, r5 // ...............................................................*............................................................ - // smlabt r5, r8, r12, r0 // .............................................................*.............................................................. - // vmov s5, r3 // ....................................................*....................................................................... - // smlabt r3, r11, r12, r0 // .......................................................*.................................................................... - // smlabt r7, r4, r12, r0 // ........................................................*................................................................... - // vmov r8, s23 // ..................................................*......................................................................... - // ldr.w r2, [r8, #20] // ......................................................*..................................................................... - // pkhtb r11, r5, r10, asr #16 // ...............................................................*............................................................ - // pkhtb r14, r7, r3, asr #16 // ............................................................*............................................................... - // ldr.w r4, [r8, #16] // ...................................................*........................................................................ - // ldr.w r3, [r8, #0] // .......................................................*.................................................................... - // ssub16.w r7, r4, r2 // ........................................................*................................................................... - // sadd16.w r4, r4, r2 // ..........................................................*................................................................. - // ldr.w r9, [r8, #4] // .............................................................*.............................................................. - // ldr.w r0, [r8, #24] // ..............................................................*............................................................. - // sadd16.w r10, r3, r9 // ..............................................................*............................................................. - // ssub16.w r3, r3, r9 // ................................................................*........................................................... - // ldr.w r5, [r8, #28] // ................................................................*........................................................... - // ldr.w r6, [r8, #12] // .................................................................*.......................................................... - // ssub16.w r9, r0, r5 // .................................................................*.......................................................... - // ldr.w r2, [r8, #8] // ..................................................................*......................................................... - // sadd16.w r0, r0, r5 // ..................................................................*......................................................... - // vmov s6, r11 // ...................................................................*........................................................ - // ssub16.w r5, r2, r6 // ...................................................................*........................................................ - // vmov s7, r14 // ....................................................................*....................................................... - // sadd16.w r14, r2, r6 // ......................................................................*..................................................... - // sadd16.w r6, r4, r0 // ....................................................................*....................................................... - // vmov.w r2, s10 // .....................................................................*...................................................... - // ssub16.w r4, r4, r0 // .....................................................................*...................................................... - // smulwb r8, r2, r5 // ......................................................................*..................................................... - // smulwt r5, r2, r5 // .......................................................................*.................................................... - // ssub16.w r11, r10, r14 // .......................................................................*.................................................... - // sadd16.w r10, r10, r14 // ........................................................................*................................................... - // smulwb r14, r2, r9 // .........................................................................*.................................................. - // smulwt r9, r2, r9 // ........................................................................*................................................... - // movw r0, #24608 // .........................................................................*.................................................. - // smlabt r14, r14, r12, r0 // ...........................................................................*................................................ - // sadd16.w r2, r10, r6 // ...........................................................................*................................................ - // ssub16.w r6, r10, r6 // ............................................................................*............................................... - // smlabt r9, r9, r12, r0 // ..........................................................................*................................................. - // smlabt r8, r8, r12, r0 // ............................................................................*............................................... - // vmov.w r10, s12 // ...............................................................................*............................................ - // smlabt r5, r5, r12, r0 // ..............................................................................*............................................. - // pkhtb r9, r9, r14, asr #16 // .............................................................................*.............................................. - // uadd16 r14, r7, r9 // ..............................................................................*............................................. - // usub16 r9, r7, r9 // ...............................................................................*............................................ - // smulwb r7, r10, r14 // ................................................................................*........................................... - // pkhtb r8, r5, r8, asr #16 // ................................................................................*........................................... - // smulwt r14, r10, r14 // .................................................................................*.......................................... - // usub16 r5, r3, r8 // ...................................................................................*........................................ - // smlabt r10, r7, r12, r0 // ..................................................................................*......................................... - // uadd16 r8, r3, r8 // ....................................................................................*....................................... - // smlabt r14, r14, r12, r0 // ...................................................................................*........................................ - // vmov.w r7, s13 // .................................................................................*.......................................... - // smulwt r3, r7, r4 // .....................................................................................*...................................... - // pkhtb r10, r14, r10, asr #16 // .....................................................................................*...................................... - // smulwb r14, r7, r4 // ....................................................................................*....................................... - // smlabt r4, r3, r12, r0 // .......................................................................................*.................................... - // usub16 r7, r8, r10 // ........................................................................................*................................... - // uadd16 r3, r8, r10 // ......................................................................................*..................................... - // smlabt r10, r14, r12, r0 // ......................................................................................*..................................... - // vmov.w r14, s14 // .......................................................................................*.................................... - // pkhtb r8, r4, r10, asr #16 // .........................................................................................*.................................. - // smulwt r10, r14, r9 // .........................................................................................*.................................. - // smulwb r14, r14, r9 // ........................................................................................*................................... - // sadd16.w r4, r11, r8 // ....................................................................................................*....................... - // ssub16.w r8, r11, r8 // ...................................................................................................*........................ - // smlabt r10, r10, r12, r0 // ............................................................................................*............................... - // smlabt r9, r14, r12, r0 // ..........................................................................................*................................. - // vmov r11, s1 // ..........................................................................................*................................. - // vmov r0, s23 // .............................................................................................*.............................. - // uadd16 r14, r3, r11 // ............................................................................................*............................... - // str.w r14, [r0, #4] // ...............................................................................................*............................ - // vmov r14, s3 // ..............................................................................................*............................. - // usub16 r3, r3, r11 // ...........................................................................................*................................ - // str.w r3, [r0, #36] // .............................................................................................*.............................. - // vmov r3, s5 // ................................................................................................*........................... - // uadd16 r11, r7, r3 // ..................................................................................................*......................... - // str.w r11, [r0, #20] // ...................................................................................................*........................ - // usub16 r7, r7, r3 // ......................................................................................................*..................... - // str.w r7, [r0, #52] // .......................................................................................................*.................... - // pkhtb r3, r10, r9, asr #16 // ..............................................................................................*............................. - // uadd16 r11, r5, r3 // ...............................................................................................*............................ - // usub16 r7, r11, r14 // ........................................................................................................*................... - // str.w r7, [r0, #44] // .............................................................................................................*.............. - // usub16 r5, r5, r3 // .................................................................................................*.......................... - // vmov r3, s7 // .......................................................................................................*.................... - // uadd16 r7, r5, r3 // .........................................................................................................*.................. - // str.w r7, [r0, #28] // .........................................................................................................*.................. - // usub16 r3, r5, r3 // ..........................................................................................................*................. - // str.w r3, [r0, #60] // ...........................................................................................................*................ - // uadd16 r10, r11, r14 // ................................................................................................*........................... - // str.w r10, [r0, #12] // .................................................................................................*.......................... - // vmov r11, s2 // ......................................................................................................*..................... - // usub16 r3, r4, r11 // ...............................................................................................................*............ - // str.w r3, [r0, #40] // .................................................................................................................*.......... - // vmov r3, s0 // ..................................................................................................*......................... - // uadd16 r5, r2, r3 // .................................................................................................................*.......... - // str.w r5, [r0], #64 // ...................................................................................................................*........ - // usub16 r3, r2, r3 // .....................................................................................................*...................... - // vmov r7, s6 // ...................................................................................................................*........ - // uadd16 r9, r8, r7 // .....................................................................................................................*...... - // str.w r3, [r0, #-32] // .....................................................................................................*...................... - // usub16 r3, r8, r7 // ....................................................................................................................*....... - // str.w r3, [r0, #-8] // .........................................................................................................................*.. - // vmov r3, s4 // ...........................................................................................................*................ - // usub16 r2, r6, r3 // ............................................................................................................*............... - // str.w r9, [r0, #-40] // ...........................................................................................................................* - // uadd16 r3, r6, r3 // .............................................................................................................*.............. - // str.w r3, [r0, #-48] // ...............................................................................................................*............ - // uadd16 r3, r4, r11 // ................................................................................................................*........... - // str.w r3, [r0, #-56] // .....................................................................................................................*...... - // str.w r2, [r0, #-16] // .......................................................................................................................*.... + // |------------------------|------------------------|------------------------|------------------------|----------- + // ldr.w r10, [r0, #56] // ......*......................................................................................................... + // ldr.w r9, [r0, #60] // .......*........................................................................................................ + // sadd16.w r14, r10, r9 // ........*....................................................................................................... + // ldr.w r11, [r0, #36] // ...*............................................................................................................ + // ssub16.w r9, r10, r9 // .........*...................................................................................................... + // ldr.w r6, [r0, #32] // ....*........................................................................................................... + // sadd16.w r2, r6, r11 // .....*.......................................................................................................... + // ldr.w r10, [r0, #48] // *............................................................................................................... + // ssub16.w r3, r6, r11 // .......*........................................................................................................ + // ldr.w r7, [r0, #52] // *............................................................................................................... + // sadd16.w r11, r10, r7 // .*.............................................................................................................. + // vmov s23, r0 // .....*.......................................................................................................... + // ssub16.w r4, r11, r14 // ............*................................................................................................... + // ldr.w r8, [r0, #40] // ..*............................................................................................................. + // ssub16.w r7, r10, r7 // ..*............................................................................................................. + // vmov.w r10, s10 // ........*....................................................................................................... + // sadd16.w r6, r11, r14 // ...........*.................................................................................................... + // ldr.w r5, [r0, #44] // .*.............................................................................................................. + // ssub16.w r14, r8, r5 // ...*............................................................................................................ + // sadd16.w r11, r8, r5 // ....*........................................................................................................... + // smulwt r5, r10, r14 // ..........*..................................................................................................... + // movw r0, #24608 // .............*.................................................................................................. + // sadd16.w r8, r2, r11 // ..........*..................................................................................................... + // smulwb r14, r10, r14 // .........*...................................................................................................... + // ssub16.w r11, r2, r11 // ......*......................................................................................................... + // smulwb r2, r10, r9 // ...........*.................................................................................................... + // smulwt r9, r10, r9 // ............*................................................................................................... + // smlabt r10, r2, r12, r0 // .............*.................................................................................................. + // sadd16.w r2, r8, r6 // ..............*................................................................................................. + // smlabt r9, r9, r12, r0 // ..............*................................................................................................. + // ssub16.w r6, r8, r6 // ...............*................................................................................................ + // smlabt r8, r5, r12, r0 // ...............*................................................................................................ + // pkhtb r9, r9, r10, asr #16 // ................*............................................................................................... + // smlabt r14, r14, r12, r0 // ................*............................................................................................... + // uadd16 r5, r7, r9 // .................*.............................................................................................. + // vmov.w r10, s12 // .................*.............................................................................................. + // usub16 r9, r7, r9 // ..................*............................................................................................. + // smulwb r7, r10, r5 // ..................*............................................................................................. + // pkhtb r14, r8, r14, asr #16 // ...................*............................................................................................ + // smulwt r8, r10, r5 // ...................*............................................................................................ + // usub16 r5, r3, r14 // ....................*........................................................................................... + // smlabt r7, r7, r12, r0 // ....................*........................................................................................... + // uadd16 r3, r3, r14 // ......................*......................................................................................... + // smlabt r14, r8, r12, r0 // .....................*.......................................................................................... + // pkhtb r10, r14, r7, asr #16 // .......................*........................................................................................ + // vmov.w r8, s13 // .....................*.......................................................................................... + // usub16 r7, r3, r10 // ........................*....................................................................................... + // smulwb r14, r8, r4 // ......................*......................................................................................... + // uadd16 r10, r3, r10 // ..........................*..................................................................................... + // smulwt r3, r8, r4 // .......................*........................................................................................ + // vmov.w r8, s14 // .........................*...................................................................................... + // smlabt r4, r14, r12, r0 // ........................*....................................................................................... + // smlabt r14, r3, r12, r0 // .........................*...................................................................................... + // smulwt r3, r8, r9 // ...........................*.................................................................................... + // pkhtb r4, r14, r4, asr #16 // ...........................*.................................................................................... + // smulwb r14, r8, r9 // ..........................*..................................................................................... + // ssub16.w r8, r11, r4 // .............................*.................................................................................. + // smlabt r9, r3, r12, r0 // .............................*.................................................................................. + // vmov r3, s16 // ............................*................................................................................... + // smlabt r14, r14, r12, r0 // ............................*................................................................................... + // sadd16.w r4, r11, r4 // ..............................*................................................................................. + // smulwb r11, r3, r10 // ..............................*................................................................................. + // smulwt r10, r3, r10 // ...............................*................................................................................ + // pkhtb r3, r9, r14, asr #16 // ...............................*................................................................................ + // smlabt r14, r11, r12, r0 // ................................*............................................................................... + // usub16 r9, r5, r3 // ......................................*......................................................................... + // uadd16 r11, r5, r3 // ....................................*........................................................................... + // smlabt r5, r10, r12, r0 // .................................*.............................................................................. + // vmov r10, s18 // .....................................*.......................................................................... + // pkhtb r3, r5, r14, asr #16 // ...................................*............................................................................ + // smulwt r14, r10, r11 // ......................................*......................................................................... + // vmov r5, s17 // .................................*.............................................................................. + // smulwb r11, r10, r11 // .......................................*........................................................................ + // smulwt r10, r5, r4 // ...................................*............................................................................ + // smulwb r4, r5, r4 // ..................................*............................................................................. + // smlabt r10, r10, r12, r0 // .....................................*.......................................................................... + // smlabt r5, r11, r12, r0 // ..........................................*..................................................................... + // smlabt r14, r14, r12, r0 // ........................................*....................................................................... + // vmov r11, s19 // ........................................*....................................................................... + // smlabt r4, r4, r12, r0 // ....................................*........................................................................... + // pkhtb r5, r14, r5, asr #16 // .............................................*.................................................................. + // smulwb r14, r11, r6 // ...........................................*.................................................................... + // smulwt r6, r11, r6 // .........................................*...................................................................... + // pkhtb r4, r10, r4, asr #16 // .......................................*........................................................................ + // smlabt r10, r14, r12, r0 // ..............................................*................................................................. + // vmov r11, s20 // ............................................*................................................................... + // smlabt r6, r6, r12, r0 // ............................................*................................................................... + // smulwb r14, r11, r7 // .............................................*.................................................................. + // pkhtb r6, r6, r10, asr #16 // .................................................*.............................................................. + // smulwt r10, r11, r7 // ...............................................*................................................................ + // vmov s2, r4 // .........................................*...................................................................... + // smlabt r14, r14, r12, r0 // ................................................*............................................................... + // vmov r4, s22 // ...........................................*.................................................................... + // smlabt r11, r10, r12, r0 // .................................................*.............................................................. + // vmov s4, r6 // ..................................................*............................................................. + // smulwb r7, r4, r9 // ..................................................*............................................................. + // pkhtb r11, r11, r14, asr #16 // ...................................................*............................................................ + // smulwt r6, r4, r9 // .....................................................*.......................................................... + // vmov s5, r11 // .....................................................*.......................................................... + // smlabt r11, r7, r12, r0 // ....................................................*........................................................... + // vmov r7, s21 // ..............................................*................................................................. + // smlabt r9, r6, r12, r0 // .......................................................*........................................................ + // vmov s3, r5 // ...............................................*................................................................ + // smulwb r5, r7, r8 // ......................................................*......................................................... + // pkhtb r11, r9, r11, asr #16 // ............................................................*................................................... + // smulwt r14, r7, r8 // ...................................................*............................................................ + // vmov s7, r11 // ...............................................................*................................................ + // smlabt r4, r5, r12, r0 // .........................................................*...................................................... + // vmov s1, r3 // ..........................................*..................................................................... + // smlabt r11, r14, r12, r0 // ........................................................*....................................................... + // pkhtb r4, r11, r4, asr #16 // ..............................................................*................................................. + // vmov r7, s23 // ................................................*............................................................... + // ldr.w r6, [r7, #28] // ........................................................*....................................................... + // ldr.w r14, [r7, #24] // ......................................................*......................................................... + // ssub16.w r9, r14, r6 // ..........................................................*..................................................... + // ldr.w r11, [r7, #4] // ....................................................*........................................................... + // sadd16.w r10, r14, r6 // .........................................................*...................................................... + // ldr.w r5, [r7, #0] // ..........................................................*..................................................... + // sadd16.w r0, r5, r11 // .............................................................*.................................................. + // ldr.w r6, [r7, #8] // .............................................................*.................................................. + // ssub16.w r3, r5, r11 // ...........................................................*.................................................... + // ldr.w r5, [r7, #12] // ..............................................................*................................................. + // sadd16.w r8, r6, r5 // ................................................................*............................................... + // ldr.w r11, [r7, #16] // ...........................................................*.................................................... + // ssub16.w r5, r6, r5 // ...................................................................*............................................ + // ldr.w r6, [r7, #20] // ............................................................*................................................... + // ssub16.w r7, r11, r6 // ...............................................................*................................................ + // vmov s0, r2 // .......................................................*........................................................ + // sadd16.w r11, r11, r6 // ..................................................................*............................................. + // vmov s6, r4 // ................................................................*............................................... + // sadd16.w r6, r11, r10 // ......................................................................*......................................... + // ssub16.w r4, r11, r10 // .......................................................................*........................................ + // vmov.w r14, s10 // ..................................................................*............................................. + // ssub16.w r11, r0, r8 // ....................................................................*........................................... + // smulwb r10, r14, r9 // ...................................................................*............................................ + // sadd16.w r8, r0, r8 // .................................................................*.............................................. + // smulwt r9, r14, r9 // ....................................................................*........................................... + // movw r0, #24608 // .....................................................................*.......................................... + // smlabt r10, r10, r12, r0 // .....................................................................*.......................................... + // sadd16.w r2, r8, r6 // ..........................................................................*..................................... + // smlabt r9, r9, r12, r0 // ......................................................................*......................................... + // ssub16.w r6, r8, r6 // .........................................................................*...................................... + // smulwb r8, r14, r5 // .......................................................................*........................................ + // pkhtb r9, r9, r10, asr #16 // ........................................................................*....................................... + // smulwt r14, r14, r5 // ........................................................................*....................................... + // uadd16 r5, r7, r9 // ...........................................................................*.................................... + // smlabt r8, r8, r12, r0 // .........................................................................*...................................... + // vmov.w r10, s12 // ...........................................................................*.................................... + // smlabt r14, r14, r12, r0 // ..........................................................................*..................................... + // usub16 r9, r7, r9 // .............................................................................*.................................. + // smulwb r7, r10, r5 // ............................................................................*................................... + // pkhtb r14, r14, r8, asr #16 // ............................................................................*................................... + // smulwt r8, r10, r5 // .............................................................................*.................................. + // uadd16 r10, r3, r14 // ...............................................................................*................................ + // smlabt r7, r7, r12, r0 // ..............................................................................*................................. + // vmov.w r5, s14 // ..............................................................................*................................. + // smlabt r8, r8, r12, r0 // ...............................................................................*................................ + // usub16 r3, r3, r14 // ................................................................................*............................... + // smulwb r14, r5, r9 // ................................................................................*............................... + // pkhtb r8, r8, r7, asr #16 // .................................................................................*.............................. + // smulwt r9, r5, r9 // .................................................................................*.............................. + // usub16 r7, r10, r8 // ..................................................................................*............................. + // smlabt r14, r14, r12, r0 // ..................................................................................*............................. + // vmov.w r5, s13 // ...................................................................................*............................ + // smlabt r9, r9, r12, r0 // ...................................................................................*............................ + // uadd16 r8, r10, r8 // ....................................................................................*........................... + // smulwt r10, r5, r4 // ....................................................................................*........................... + // pkhtb r9, r9, r14, asr #16 // .....................................................................................*.......................... + // smulwb r14, r5, r4 // .....................................................................................*.......................... + // uadd16 r5, r3, r9 // ......................................................................................*......................... + // vmov r4, s1 // ......................................................................................*......................... + // usub16 r9, r3, r9 // .......................................................................................*........................ + // smlabt r14, r14, r12, r0 // .......................................................................................*........................ + // usub16 r3, r8, r4 // ........................................................................................*....................... + // smlabt r10, r10, r12, r0 // ........................................................................................*....................... + // uadd16 r8, r8, r4 // .........................................................................................*...................... + // vmov r0, s23 // .........................................................................................*...................... + // pkhtb r14, r10, r14, asr #16 // ..........................................................................................*..................... + // str.w r3, [r0, #36] // ............................................................................................*................... + // sadd16.w r10, r11, r14 // .............................................................................................*.................. + // str.w r8, [r0, #4] // .............................................................................................*.................. + // ssub16.w r8, r11, r14 // ............................................................................................*................... + // vmov r11, s3 // ..........................................................................................*..................... + // uadd16 r14, r5, r11 // ..............................................................................................*................. + // str.w r14, [r0, #12] // ...............................................................................................*................ + // usub16 r11, r5, r11 // ...........................................................................................*.................... + // vmov r14, s7 // ..............................................................................................*................. + // uadd16 r5, r9, r14 // ...............................................................................................*................ + // str.w r5, [r0, #28] // ..................................................................................................*............. + // usub16 r9, r9, r14 // ................................................................................................*............... + // vmov r14, s5 // ................................................................................................*............... + // uadd16 r5, r7, r14 // ..................................................................................................*............. + // str.w r11, [r0, #44] // ...........................................................................................*.................... + // usub16 r11, r7, r14 // ...................................................................................................*............ + // str.w r11, [r0, #52] // ...................................................................................................*............ + // str.w r5, [r0, #20] // ....................................................................................................*........... + // vmov r11, s2 // ....................................................................................................*........... + // uadd16 r14, r10, r11 // .....................................................................................................*.......... + // str.w r14, [r0, #8] // .....................................................................................................*.......... + // usub16 r3, r10, r11 // ......................................................................................................*......... + // vmov r11, s6 // ........................................................................................................*....... + // str.w r9, [r0, #60] // .................................................................................................*.............. + // vmov r5, s4 // .................................................................................................*.............. + // uadd16 r10, r8, r11 // .........................................................................................................*...... + // str.w r3, [r0, #40] // .......................................................................................................*........ + // usub16 r3, r8, r11 // ..........................................................................................................*..... + // str.w r10, [r0, #24] // .........................................................................................................*...... + // str.w r3, [r0, #56] // ..........................................................................................................*..... + // vmov r11, s0 // ......................................................................................................*......... + // uadd16 r4, r6, r5 // ...........................................................................................................*.... + // str.w r4, [r0, #16] // ............................................................................................................*... + // usub16 r6, r6, r5 // ............................................................................................................*... + // str.w r6, [r0, #48] // .............................................................................................................*.. + // usub16 r7, r2, r11 // ........................................................................................................*....... + // str.w r7, [r0, #32] // ...........................................................................................................*.... + // uadd16 r11, r2, r11 // .......................................................................................................*........ + // str.w r11, [r0], #64 // ..............................................................................................................*. + // vmov r11, s8 // ..............................................................................................................*. + // cmp.w r0, r11 // ...............................................................................................................* - vmov tmp, s8 - cmp poly, tmp bne layer1234_loop sub.w poly, #8*strincr @@ -645,270 +647,270 @@ layer1234_loop: // ITER 0 layer567_first_start: // Instructions: 127 - // Expected cycles: 83 - // Expected IPC: 1.53 + // Expected cycles: 74 + // Expected IPC: 1.72 // - // -------------------------------- cycle (expected) --------------------------------> - // 0 25 50 75 - // |------------------------|------------------------|------------------------|------- - ldr.w r6, [r0, #448] // *.................................................................................. - vldr s4, [r1, #16] // *.................................................................................. - vldr s5, [r1, #20] // .*................................................................................. - ldr.w r3, [r0, #320] // .*................................................................................. - vldr s17, [r1, #4] // ..*................................................................................ - ldr.w r5, [r0, #256] // ..*................................................................................ - sadd16.w r2, r5, r3 // ...*............................................................................... - ldr.w r8, [r0, #0] // ...*............................................................................... - vmov s6, r0 // ....*.............................................................................. - ldr.w r10, [r0, #384] // ....*.............................................................................. - ssub16.w r7, r5, r3 // .....*............................................................................. - sadd16.w r5, r10, r6 // ......*............................................................................ - vldr s0, [r1, #0] // ......*............................................................................ - ssub16.w r10, r10, r6 // .......*........................................................................... - ldr.w r9, [r0, #192] // .......*........................................................................... - vmov.w r11, s17 // ........*.......................................................................... - ldr.w r14, [r0, #128] // ........*.......................................................................... - ldr.w r3, [r0, #64] // .........*......................................................................... - sadd16.w r6, r2, r5 // .........*......................................................................... - movw r0, #24608 // ..........*........................................................................ - ssub16.w r4, r2, r5 // ..........*........................................................................ - ssub16.w r2, r14, r9 // ...........*....................................................................... - vldr s2, [r1, #8] // ...........*....................................................................... - smulwb r5, r11, r2 // ............*...................................................................... - sadd16.w r14, r14, r9 // ............*...................................................................... - smulwt r2, r11, r2 // .............*..................................................................... - sadd16.w r9, r8, r3 // .............*..................................................................... - smlabt r5, r5, r12, r0 // ..............*.................................................................... - ssub16.w r8, r8, r3 // ..............*.................................................................... - smlabt r2, r2, r12, r0 // ...............*................................................................... - ssub16.w r3, r9, r14 // ...............*................................................................... - sadd16.w r14, r9, r14 // ................*.................................................................. - vldr s17, [r1, #12] // .................*................................................................. - smulwb r9, r11, r10 // .................*................................................................. - pkhtb r5, r2, r5, asr #16 // ..................*................................................................ - smulwt r10, r11, r10 // ..................*................................................................ - sadd16.w r2, r14, r6 // ...................*............................................................... - smlabt r9, r9, r12, r0 // ...................*............................................................... - smlabt r11, r10, r12, r0 // ....................*.............................................................. - vmov.w r10, s4 // ....................*.............................................................. - ssub16.w r6, r14, r6 // .....................*............................................................. - smulwb r14, r10, r4 // .....................*............................................................. - pkhtb r11, r11, r9, asr #16 // ......................*............................................................ - smulwt r4, r10, r4 // ......................*............................................................ - smlabt r14, r14, r12, r0 // .......................*........................................................... - add r1, r1, #24 // .......................*........................................................... - uadd16 r9, r7, r11 // ........................*.......................................................... - vmov.w r10, s17 // ........................*.......................................................... - smlabt r4, r4, r12, r0 // .........................*......................................................... - usub16 r7, r7, r11 // .........................*......................................................... - smulwt r11, r10, r9 // ..........................*........................................................ - pkhtb r4, r4, r14, asr #16 // ...........................*....................................................... - smulwb r14, r10, r9 // ...........................*....................................................... - vmov.w r9, s5 // ............................*...................................................... - smlabt r10, r11, r12, r0 // ............................*...................................................... - smlabt r14, r14, r12, r0 // .............................*..................................................... - usub16 r11, r8, r5 // .............................*..................................................... - uadd16 r8, r8, r5 // ..............................*.................................................... - smulwt r5, r9, r7 // ..............................*.................................................... - pkhtb r14, r10, r14, asr #16 // ...............................*................................................... - smulwb r9, r9, r7 // ...............................*................................................... - smlabt r5, r5, r12, r0 // ................................*.................................................. - ldr r10, [r1], #8 // ................................*.................................................. - usub16 r7, r8, r14 // .................................*................................................. - smlabt r9, r9, r12, r0 // .................................*................................................. - uadd16 r8, r8, r14 // ..................................*................................................ - smulwb r14, r10, r2 // ..................................*................................................ - smulwt r2, r10, r2 // ...................................*............................................... - pkhtb r10, r5, r9, asr #16 // ...................................*............................................... - smlabt r14, r14, r12, r0 // ....................................*.............................................. - ldr r5, [r1, #-4] // ....................................*.............................................. - usub16 r9, r11, r10 // .....................................*............................................. - smlabt r2, r2, r12, r0 // .....................................*............................................. - uadd16 r10, r11, r10 // ......................................*............................................ - smulwt r11, r5, r8 // ......................................*............................................ - smulwb r5, r5, r8 // .......................................*........................................... - ldr r8, [r1, #12] // .......................................*........................................... - pkhtb r2, r2, r14, asr #16 // ........................................*.......................................... - smlabt r11, r11, r12, r0 // ........................................*.......................................... - smulwb r14, r8, r7 // .........................................*......................................... - smulwt r7, r8, r7 // ..........................................*........................................ - ssub16.w r8, r3, r4 // ..........................................*........................................ - smlabt r14, r14, r12, r0 // ...........................................*....................................... - sadd16.w r4, r3, r4 // ...........................................*....................................... - smlabt r7, r7, r12, r0 // ............................................*...................................... - ldr r3, [r1], #8 // ............................................*...................................... - smlabt r5, r5, r12, r0 // .............................................*..................................... - pkhtb r7, r7, r14, asr #16 // ..............................................*.................................... - pkhtb r11, r11, r5, asr #16 // ...............................................*................................... - smulwt r14, r3, r4 // ...............................................*................................... - smulwb r3, r3, r4 // ................................................*.................................. - ldr r5, [r1, #12] // ................................................*.................................. - smlabt r4, r14, r12, r0 // .................................................*................................. - smulwb r14, r5, r9 // ..................................................*................................ - smlabt r3, r3, r12, r0 // ...................................................*............................... - smulwt r5, r5, r9 // ....................................................*.............................. - smlabt r5, r5, r12, r0 // ......................................................*............................ - ldr r9, [r1, #-4] // ......................................................*............................ - pkhtb r3, r4, r3, asr #16 // .......................................................*........................... - smlabt r4, r14, r12, r0 // .......................................................*........................... - smulwb r14, r9, r10 // ........................................................*.......................... - smulwt r10, r9, r10 // .........................................................*......................... - pkhtb r4, r5, r4, asr #16 // .........................................................*......................... - smlabt r9, r14, r12, r0 // ..........................................................*........................ - ldr r5, [r1], #8 // ..........................................................*........................ - smlabt r10, r10, r12, r0 // ...........................................................*....................... - smulwt r14, r5, r6 // ............................................................*...................... - pkhtb r10, r10, r9, asr #16 // .............................................................*..................... - smulwb r9, r5, r6 // .............................................................*..................... - smlabt r6, r14, r12, r0 // ..............................................................*.................... - ldr r14, [r1], #8 // ..............................................................*.................... - smlabt r5, r9, r12, r0 // ...............................................................*................... - smulwt r9, r14, r8 // ................................................................*.................. - smulwb r8, r14, r8 // .................................................................*................. - pkhtb r6, r6, r5, asr #16 // .................................................................*................. - smlabt r14, r9, r12, r0 // ..................................................................*................ - smlabt r8, r8, r12, r0 // ...................................................................*............... - vmov r0, s6 // ...................................................................*............... - str.w r3, [r0, #128] // ....................................................................*.............. - pkhtb r8, r14, r8, asr #16 // .....................................................................*............. - str.w r2, [r0], #4 // ......................................................................*............ - str.w r11, [r0, #60] // ........................................................................*.......... - str.w r6, [r0, #252] // ..........................................................................*........ - str.w r7, [r0, #316] // ............................................................................*...... - str.w r8, [r0, #380] // ..............................................................................*.... - str.w r4, [r0, #444] // ................................................................................*.. - str.w r10, [r0, #188] // ..................................................................................* + // --------------------------- cycle (expected) ----------------------------> + // 0 25 50 + // |------------------------|------------------------|----------------------- + vldr s26, [r1, #4] // *......................................................................... + ldr.w r2, [r0, #0] // *......................................................................... + ldr.w r6, [r0, #64] // .*........................................................................ + vmov s6, r0 // .*........................................................................ + sadd16.w r11, r2, r6 // ..*....................................................................... + ldr.w r7, [r0, #128] // ..*....................................................................... + ssub16.w r3, r2, r6 // ...*...................................................................... + vldr s4, [r1, #16] // ....*..................................................................... + ldr.w r2, [r0, #384] // ....*..................................................................... + ldr.w r9, [r0, #448] // .....*.................................................................... + vldr s0, [r1, #0] // .....*.................................................................... + ssub16.w r14, r2, r9 // ......*................................................................... + ldr.w r5, [r0, #192] // ......*................................................................... + sadd16.w r6, r7, r5 // .......*.................................................................. + ldr.w r10, [r0, #256] // .......*.................................................................. + ssub16.w r5, r7, r5 // ........*................................................................. + ldr.w r8, [r0, #320] // ........*................................................................. + sadd16.w r2, r2, r9 // .........*................................................................ + vldr s2, [r1, #8] // .........*................................................................ + sadd16.w r9, r11, r6 // ..........*............................................................... + vmov.w r0, s26 // ..........*............................................................... + vldr s12, [r1, #12] // ...........*.............................................................. + smulwb r7, r0, r14 // ...........*.............................................................. + ssub16.w r11, r11, r6 // ............*............................................................. + smulwt r6, r0, r14 // ............*............................................................. + vldr s5, [r1, #20] // .............*............................................................ + smulwt r14, r0, r5 // .............*............................................................ + sadd16.w r4, r10, r8 // ..............*........................................................... + smulwb r5, r0, r5 // ..............*........................................................... + movw r0, #24608 // ...............*.......................................................... + smlabt r7, r7, r12, r0 // ...............*.......................................................... + ssub16.w r10, r10, r8 // ................*......................................................... + add r1, r1, #24 // .................*........................................................ + smlabt r8, r6, r12, r0 // .................*........................................................ + sadd16.w r6, r4, r2 // ..................*....................................................... + smlabt r5, r5, r12, r0 // ..................*....................................................... + pkhtb r7, r8, r7, asr #16 // ...................*...................................................... + smlabt r14, r14, r12, r0 // ...................*...................................................... + ssub16.w r4, r4, r2 // ....................*..................................................... + vmov.w r8, s4 // ....................*..................................................... + pkhtb r14, r14, r5, asr #16 // .....................*.................................................... + smulwb r5, r8, r4 // .....................*.................................................... + uadd16 r2, r10, r7 // ......................*................................................... + smulwt r8, r8, r4 // ......................*................................................... + vmov.w r4, s12 // .......................*.................................................. + smlabt r5, r5, r12, r0 // .......................*.................................................. + usub16 r10, r10, r7 // ........................*................................................. + smlabt r8, r8, r12, r0 // ........................*................................................. + smulwb r7, r4, r2 // .........................*................................................ + pkhtb r8, r8, r5, asr #16 // ..........................*............................................... + smulwt r4, r4, r2 // ..........................*............................................... + usub16 r2, r3, r14 // ...........................*.............................................. + smlabt r7, r7, r12, r0 // ...........................*.............................................. + uadd16 r5, r3, r14 // ............................*............................................. + smlabt r4, r4, r12, r0 // ............................*............................................. + sadd16.w r14, r9, r6 // .............................*............................................ + vmov.w r3, s5 // .............................*............................................ + pkhtb r7, r4, r7, asr #16 // ..............................*........................................... + smulwb r4, r3, r10 // ..............................*........................................... + ssub16.w r6, r9, r6 // ...............................*.......................................... + smulwt r10, r3, r10 // ...............................*.......................................... + ldr r3, [r1, #4] // ................................*......................................... + smlabt r9, r4, r12, r0 // ................................*......................................... + uadd16 r4, r5, r7 // .................................*........................................ + smlabt r10, r10, r12, r0 // .................................*........................................ + usub16 r7, r5, r7 // ..................................*....................................... + smulwb r5, r3, r4 // ..................................*....................................... + pkhtb r9, r10, r9, asr #16 // ...................................*...................................... + smulwt r3, r3, r4 // ...................................*...................................... + ldr r4, [r1], #8 // ....................................*..................................... + smlabt r5, r5, r12, r0 // ....................................*..................................... + uadd16 r10, r2, r9 // .....................................*.................................... + smlabt r3, r3, r12, r0 // .....................................*.................................... + usub16 r9, r2, r9 // ......................................*................................... + smulwt r2, r4, r14 // ......................................*................................... + pkhtb r3, r3, r5, asr #16 // .......................................*.................................. + smulwb r4, r4, r14 // .......................................*.................................. + ldr r5, [r1, #4] // ........................................*................................. + smlabt r2, r2, r12, r0 // ........................................*................................. + sadd16.w r14, r11, r8 // .........................................*................................ + smlabt r4, r4, r12, r0 // .........................................*................................ + ssub16.w r8, r11, r8 // ..........................................*............................... + smulwb r11, r5, r10 // ..........................................*............................... + smulwt r5, r5, r10 // ...........................................*.............................. + ldr r10, [r1], #8 // ............................................*............................. + smlabt r11, r11, r12, r0 // ............................................*............................. + pkhtb r2, r2, r4, asr #16 // .............................................*............................ + smlabt r5, r5, r12, r0 // .............................................*............................ + smulwb r4, r10, r14 // ..............................................*........................... + pkhtb r5, r5, r11, asr #16 // ...............................................*.......................... + smulwt r10, r10, r14 // ...............................................*.......................... + ldr r11, [r1, #4] // ................................................*......................... + smlabt r14, r4, r12, r0 // ................................................*......................... + smlabt r4, r10, r12, r0 // .................................................*........................ + smulwb r10, r11, r7 // ..................................................*....................... + pkhtb r14, r4, r14, asr #16 // ...................................................*...................... + smulwt r7, r11, r7 // ...................................................*...................... + ldr r4, [r1], #8 // ....................................................*..................... + smlabt r11, r10, r12, r0 // ....................................................*..................... + smlabt r10, r7, r12, r0 // .....................................................*.................... + smulwt r7, r4, r6 // ......................................................*................... + pkhtb r10, r10, r11, asr #16 // .......................................................*.................. + smulwb r11, r4, r6 // .......................................................*.................. + ldr r6, [r1, #4] // ........................................................*................. + smlabt r4, r7, r12, r0 // ........................................................*................. + smlabt r11, r11, r12, r0 // .........................................................*................ + smulwt r7, r6, r9 // ..........................................................*............... + smulwb r6, r6, r9 // ...........................................................*.............. + ldr r9, [r1], #8 // ............................................................*............. + smlabt r7, r7, r12, r0 // ............................................................*............. + pkhtb r11, r4, r11, asr #16 // .............................................................*............ + smlabt r4, r6, r12, r0 // .............................................................*............ + smulwt r6, r9, r8 // ..............................................................*........... + smulwb r8, r9, r8 // ...............................................................*.......... + smlabt r6, r6, r12, r0 // ................................................................*......... + smlabt r8, r8, r12, r0 // .................................................................*........ + vmov r0, s6 // ..................................................................*....... + str.w r5, [r0, #192] // ..................................................................*....... + str.w r14, [r0, #128] // ...................................................................*...... + str.w r3, [r0, #64] // ....................................................................*..... + str.w r10, [r0, #320] // .....................................................................*.... + pkhtb r3, r6, r8, asr #16 // ......................................................................*... + str.w r11, [r0, #256] // ......................................................................*... + pkhtb r8, r7, r4, asr #16 // .......................................................................*.. + str.w r3, [r0, #384] // .......................................................................*.. + str.w r8, [r0, #448] // ........................................................................*. + str.w r2, [r0], #4 // .........................................................................* - // -------------------------------- cycle (expected) --------------------------------> - // 0 25 50 75 - // |------------------------|------------------------|------------------------|------- - // vmov s6, r0 // ....*.............................................................................. - // ldr.w r2, [r0, #0] // ...*............................................................................... - // ldr.w r3, [r0, #64] // .........*......................................................................... - // ldr.w r4, [r0, #128] // ........*.......................................................................... - // ldr.w r5, [r0, #192] // .......*........................................................................... - // ldr.w r6, [r0, #256] // ..*................................................................................ - // ldr.w r7, [r0, #320] // .*................................................................................. - // ldr.w r8, [r0, #384] // ....*.............................................................................. - // ldr.w r9, [r0, #448] // *.................................................................................. - // vldr s0, [r1, #0] // ......*............................................................................ - // vldr s1, [r1, #4] // ..*................................................................................ - // vldr s2, [r1, #8] // ...........*....................................................................... - // vldr s3, [r1, #12] // .................*................................................................. - // vldr s4, [r1, #16] // *.................................................................................. - // vldr s5, [r1, #20] // .*................................................................................. - // add r1, r1, #24 // .......................*........................................................... - // movw r0, #24608 // ..........*........................................................................ - // sadd16.w r14, r2, r3 // .............*..................................................................... - // ssub16.w r3, r2, r3 // ..............*.................................................................... - // sadd16.w r11, r4, r5 // ............*...................................................................... - // ssub16.w r5, r4, r5 // ...........*....................................................................... - // sadd16.w r2, r6, r7 // ...*............................................................................... - // ssub16.w r7, r6, r7 // .....*............................................................................. - // sadd16.w r4, r8, r9 // ......*............................................................................ - // ssub16.w r9, r8, r9 // .......*........................................................................... - // sadd16.w r8, r14, r11 // ................*.................................................................. - // ssub16.w r11, r14, r11 // ...............*................................................................... - // sadd16.w r6, r2, r4 // .........*......................................................................... - // ssub16.w r4, r2, r4 // ..........*........................................................................ - // vmov.w r10, s1 // ........*.......................................................................... - // smulwb r14, r10, r5 // ............*...................................................................... - // smulwt r5, r10, r5 // .............*..................................................................... - // smlabt r14, r14, r12, r0 // ..............*.................................................................... - // smlabt r5, r5, r12, r0 // ...............*................................................................... - // pkhtb r14, r5, r14, asr #16 // ..................*................................................................ - // usub16 r5, r3, r14 // .............................*..................................................... - // uadd16 r3, r3, r14 // ..............................*.................................................... - // smulwb r14, r10, r9 // .................*................................................................. - // smulwt r9, r10, r9 // ..................*................................................................ - // smlabt r14, r14, r12, r0 // ...................*............................................................... - // smlabt r9, r9, r12, r0 // ....................*.............................................................. - // pkhtb r14, r9, r14, asr #16 // ......................*............................................................ - // usub16 r9, r7, r14 // .........................*......................................................... - // uadd16 r7, r7, r14 // ........................*.......................................................... - // sadd16.w r2, r8, r6 // ...................*............................................................... - // ssub16.w r6, r8, r6 // .....................*............................................................. - // vmov.w r10, s3 // ........................*.......................................................... - // smulwb r14, r10, r7 // ...........................*....................................................... - // smulwt r7, r10, r7 // ..........................*........................................................ - // smlabt r14, r14, r12, r0 // .............................*..................................................... - // smlabt r7, r7, r12, r0 // ............................*...................................................... - // pkhtb r14, r7, r14, asr #16 // ...............................*................................................... - // usub16 r7, r3, r14 // .................................*................................................. - // uadd16 r3, r3, r14 // ..................................*................................................ - // vmov.w r10, s4 // ....................*.............................................................. - // smulwb r14, r10, r4 // .....................*............................................................. - // smulwt r4, r10, r4 // ......................*............................................................ - // smlabt r14, r14, r12, r0 // .......................*........................................................... - // smlabt r4, r4, r12, r0 // .........................*......................................................... - // pkhtb r14, r4, r14, asr #16 // ...........................*....................................................... - // ssub16.w r8, r11, r14 // ..........................................*........................................ - // sadd16.w r4, r11, r14 // ...........................................*....................................... - // vmov.w r10, s5 // ............................*...................................................... - // smulwb r14, r10, r9 // ...............................*................................................... - // smulwt r9, r10, r9 // ..............................*.................................................... - // smlabt r14, r14, r12, r0 // .................................*................................................. - // smlabt r9, r9, r12, r0 // ................................*.................................................. - // pkhtb r14, r9, r14, asr #16 // ...................................*............................................... - // usub16 r9, r5, r14 // .....................................*............................................. - // uadd16 r5, r5, r14 // ......................................*............................................ - // ldr r11, [r1, #4] // ....................................*.............................................. - // ldr r10, [r1], #8 // ................................*.................................................. - // smulwb r14, r10, r2 // ..................................*................................................ - // smulwt r2, r10, r2 // ...................................*............................................... - // smlabt r14, r14, r12, r0 // ....................................*.............................................. - // smlabt r2, r2, r12, r0 // .....................................*............................................. - // pkhtb r2, r2, r14, asr #16 // ........................................*.......................................... - // smulwb r14, r11, r3 // .......................................*........................................... - // smulwt r3, r11, r3 // ......................................*............................................ - // smlabt r14, r14, r12, r0 // .............................................*..................................... - // smlabt r3, r3, r12, r0 // ........................................*.......................................... - // pkhtb r3, r3, r14, asr #16 // ...............................................*................................... - // ldr r11, [r1, #4] // ......................................................*............................ - // ldr r10, [r1], #8 // ............................................*...................................... - // smulwb r14, r10, r4 // ................................................*.................................. - // smulwt r4, r10, r4 // ...............................................*................................... - // smlabt r14, r14, r12, r0 // ...................................................*............................... - // smlabt r4, r4, r12, r0 // .................................................*................................. - // pkhtb r4, r4, r14, asr #16 // .......................................................*........................... - // smulwb r14, r11, r5 // ........................................................*.......................... - // smulwt r5, r11, r5 // .........................................................*......................... - // smlabt r14, r14, r12, r0 // ..........................................................*........................ - // smlabt r5, r5, r12, r0 // ...........................................................*....................... - // pkhtb r5, r5, r14, asr #16 // .............................................................*..................... - // ldr r11, [r1, #4] // .......................................*........................................... - // ldr r10, [r1], #8 // ..........................................................*........................ - // smulwb r14, r10, r6 // .............................................................*..................... - // smulwt r6, r10, r6 // ............................................................*...................... - // smlabt r14, r14, r12, r0 // ...............................................................*................... - // smlabt r6, r6, r12, r0 // ..............................................................*.................... - // pkhtb r6, r6, r14, asr #16 // .................................................................*................. - // smulwb r14, r11, r7 // .........................................*......................................... - // smulwt r7, r11, r7 // ..........................................*........................................ - // smlabt r14, r14, r12, r0 // ...........................................*....................................... - // smlabt r7, r7, r12, r0 // ............................................*...................................... - // pkhtb r7, r7, r14, asr #16 // ..............................................*.................................... - // ldr r11, [r1, #4] // ................................................*.................................. - // ldr r10, [r1], #8 // ..............................................................*.................... - // smulwb r14, r10, r8 // .................................................................*................. - // smulwt r8, r10, r8 // ................................................................*.................. - // smlabt r14, r14, r12, r0 // ...................................................................*............... - // smlabt r8, r8, r12, r0 // ..................................................................*................ - // pkhtb r8, r8, r14, asr #16 // .....................................................................*............. - // smulwb r14, r11, r9 // ..................................................*................................ - // smulwt r9, r11, r9 // ....................................................*.............................. - // smlabt r14, r14, r12, r0 // .......................................................*........................... - // smlabt r9, r9, r12, r0 // ......................................................*............................ - // pkhtb r9, r9, r14, asr #16 // .........................................................*......................... - // vmov r0, s6 // ...................................................................*............... - // str.w r6, [r0, #256] // ..........................................................................*........ - // str.w r7, [r0, #320] // ............................................................................*...... - // str.w r8, [r0, #384] // ..............................................................................*.... - // str.w r9, [r0, #448] // ................................................................................*.. - // str.w r3, [r0, #64] // ........................................................................*.......... - // str.w r4, [r0, #128] // ....................................................................*.............. - // str.w r5, [r0, #192] // ..................................................................................* - // str.w r2, [r0], #4 // ......................................................................*............ + // --------------------------- cycle (expected) ----------------------------> + // 0 25 50 + // |------------------------|------------------------|----------------------- + // vmov s6, r0 // .*........................................................................ + // ldr.w r2, [r0, #0] // *......................................................................... + // ldr.w r3, [r0, #64] // .*........................................................................ + // ldr.w r4, [r0, #128] // ..*....................................................................... + // ldr.w r5, [r0, #192] // ......*................................................................... + // ldr.w r6, [r0, #256] // .......*.................................................................. + // ldr.w r7, [r0, #320] // ........*................................................................. + // ldr.w r8, [r0, #384] // ....*..................................................................... + // ldr.w r9, [r0, #448] // .....*.................................................................... + // vldr s0, [r1, #0] // .....*.................................................................... + // vldr s1, [r1, #4] // *......................................................................... + // vldr s2, [r1, #8] // .........*................................................................ + // vldr s3, [r1, #12] // ...........*.............................................................. + // vldr s4, [r1, #16] // ....*..................................................................... + // vldr s5, [r1, #20] // .............*............................................................ + // add r1, r1, #24 // .................*........................................................ + // movw r0, #24608 // ...............*.......................................................... + // sadd16.w r14, r2, r3 // ..*....................................................................... + // ssub16.w r3, r2, r3 // ...*...................................................................... + // sadd16.w r11, r4, r5 // .......*.................................................................. + // ssub16.w r5, r4, r5 // ........*................................................................. + // sadd16.w r2, r6, r7 // ..............*........................................................... + // ssub16.w r7, r6, r7 // ................*......................................................... + // sadd16.w r4, r8, r9 // .........*................................................................ + // ssub16.w r9, r8, r9 // ......*................................................................... + // sadd16.w r8, r14, r11 // ..........*............................................................... + // ssub16.w r11, r14, r11 // ............*............................................................. + // sadd16.w r6, r2, r4 // ..................*....................................................... + // ssub16.w r4, r2, r4 // ....................*..................................................... + // vmov.w r10, s1 // ..........*............................................................... + // smulwb r14, r10, r5 // ..............*........................................................... + // smulwt r5, r10, r5 // .............*............................................................ + // smlabt r14, r14, r12, r0 // ..................*....................................................... + // smlabt r5, r5, r12, r0 // ...................*...................................................... + // pkhtb r14, r5, r14, asr #16 // .....................*.................................................... + // usub16 r5, r3, r14 // ...........................*.............................................. + // uadd16 r3, r3, r14 // ............................*............................................. + // smulwb r14, r10, r9 // ...........*.............................................................. + // smulwt r9, r10, r9 // ............*............................................................. + // smlabt r14, r14, r12, r0 // ...............*.......................................................... + // smlabt r9, r9, r12, r0 // .................*........................................................ + // pkhtb r14, r9, r14, asr #16 // ...................*...................................................... + // usub16 r9, r7, r14 // ........................*................................................. + // uadd16 r7, r7, r14 // ......................*................................................... + // sadd16.w r2, r8, r6 // .............................*............................................ + // ssub16.w r6, r8, r6 // ...............................*.......................................... + // vmov.w r10, s3 // .......................*.................................................. + // smulwb r14, r10, r7 // .........................*................................................ + // smulwt r7, r10, r7 // ..........................*............................................... + // smlabt r14, r14, r12, r0 // ...........................*.............................................. + // smlabt r7, r7, r12, r0 // ............................*............................................. + // pkhtb r14, r7, r14, asr #16 // ..............................*........................................... + // usub16 r7, r3, r14 // ..................................*....................................... + // uadd16 r3, r3, r14 // .................................*........................................ + // vmov.w r10, s4 // ....................*..................................................... + // smulwb r14, r10, r4 // .....................*.................................................... + // smulwt r4, r10, r4 // ......................*................................................... + // smlabt r14, r14, r12, r0 // .......................*.................................................. + // smlabt r4, r4, r12, r0 // ........................*................................................. + // pkhtb r14, r4, r14, asr #16 // ..........................*............................................... + // ssub16.w r8, r11, r14 // ..........................................*............................... + // sadd16.w r4, r11, r14 // .........................................*................................ + // vmov.w r10, s5 // .............................*............................................ + // smulwb r14, r10, r9 // ..............................*........................................... + // smulwt r9, r10, r9 // ...............................*.......................................... + // smlabt r14, r14, r12, r0 // ................................*......................................... + // smlabt r9, r9, r12, r0 // .................................*........................................ + // pkhtb r14, r9, r14, asr #16 // ...................................*...................................... + // usub16 r9, r5, r14 // ......................................*................................... + // uadd16 r5, r5, r14 // .....................................*.................................... + // ldr r11, [r1, #4] // ................................*......................................... + // ldr r10, [r1], #8 // ....................................*..................................... + // smulwb r14, r10, r2 // .......................................*.................................. + // smulwt r2, r10, r2 // ......................................*................................... + // smlabt r14, r14, r12, r0 // .........................................*................................ + // smlabt r2, r2, r12, r0 // ........................................*................................. + // pkhtb r2, r2, r14, asr #16 // .............................................*............................ + // smulwb r14, r11, r3 // ..................................*....................................... + // smulwt r3, r11, r3 // ...................................*...................................... + // smlabt r14, r14, r12, r0 // ....................................*..................................... + // smlabt r3, r3, r12, r0 // .....................................*.................................... + // pkhtb r3, r3, r14, asr #16 // .......................................*.................................. + // ldr r11, [r1, #4] // ........................................*................................. + // ldr r10, [r1], #8 // ............................................*............................. + // smulwb r14, r10, r4 // ..............................................*........................... + // smulwt r4, r10, r4 // ...............................................*.......................... + // smlabt r14, r14, r12, r0 // ................................................*......................... + // smlabt r4, r4, r12, r0 // .................................................*........................ + // pkhtb r4, r4, r14, asr #16 // ...................................................*...................... + // smulwb r14, r11, r5 // ..........................................*............................... + // smulwt r5, r11, r5 // ...........................................*.............................. + // smlabt r14, r14, r12, r0 // ............................................*............................. + // smlabt r5, r5, r12, r0 // .............................................*............................ + // pkhtb r5, r5, r14, asr #16 // ...............................................*.......................... + // ldr r11, [r1, #4] // ................................................*......................... + // ldr r10, [r1], #8 // ....................................................*..................... + // smulwb r14, r10, r6 // .......................................................*.................. + // smulwt r6, r10, r6 // ......................................................*................... + // smlabt r14, r14, r12, r0 // .........................................................*................ + // smlabt r6, r6, r12, r0 // ........................................................*................. + // pkhtb r6, r6, r14, asr #16 // .............................................................*............ + // smulwb r14, r11, r7 // ..................................................*....................... + // smulwt r7, r11, r7 // ...................................................*...................... + // smlabt r14, r14, r12, r0 // ....................................................*..................... + // smlabt r7, r7, r12, r0 // .....................................................*.................... + // pkhtb r7, r7, r14, asr #16 // .......................................................*.................. + // ldr r11, [r1, #4] // ........................................................*................. + // ldr r10, [r1], #8 // ............................................................*............. + // smulwb r14, r10, r8 // ...............................................................*.......... + // smulwt r8, r10, r8 // ..............................................................*........... + // smlabt r14, r14, r12, r0 // .................................................................*........ + // smlabt r8, r8, r12, r0 // ................................................................*......... + // pkhtb r8, r8, r14, asr #16 // ......................................................................*... + // smulwb r14, r11, r9 // ...........................................................*.............. + // smulwt r9, r11, r9 // ..........................................................*............... + // smlabt r14, r14, r12, r0 // .............................................................*............ + // smlabt r9, r9, r12, r0 // ............................................................*............. + // pkhtb r9, r9, r14, asr #16 // .......................................................................*.. + // vmov r0, s6 // ..................................................................*....... + // str.w r6, [r0, #256] // ......................................................................*... + // str.w r7, [r0, #320] // .....................................................................*.... + // str.w r8, [r0, #384] // .......................................................................*.. + // str.w r9, [r0, #448] // ........................................................................*. + // str.w r3, [r0, #64] // ....................................................................*..... + // str.w r4, [r0, #128] // ...................................................................*...... + // str.w r5, [r0, #192] // ..................................................................*....... + // str.w r2, [r0], #4 // .........................................................................* layer567_first_end: @@ -918,336 +920,338 @@ layer1234_loop: add.w tmp, poly, #strincr2*3*(5) vmov s14, tmp layer567_loop: - // Instructions: 158 - // Expected cycles: 100 - // Expected IPC: 1.58 + // Instructions: 160 + // Expected cycles: 94 + // Expected IPC: 1.70 // - // ---------------------------------------- cycle (expected) -----------------------------------------> + // ------------------------------------- cycle (expected) --------------------------------------> // 0 25 50 75 - // |------------------------|------------------------|------------------------| - ldr.w r3, [r1], #4 // *................................................................................................... - ldr.w r4, [r0, #320] // *................................................................................................... - ldr.w r8, [r0, #384] // .*.................................................................................................. - vmov s6, r0 // .*.................................................................................................. - smulwb r7, r3, r4 // ..*................................................................................................. - ldr.w r11, [r0, #64] // ..*................................................................................................. - smulwt r6, r3, r4 // ...*................................................................................................ - ldr.w r10, [r0, #128] // ...*................................................................................................ - smulwt r2, r3, r11 // ....*............................................................................................... - ldr.w r14, [r0, #256] // ....*............................................................................................... - smulwb r5, r3, r11 // .....*.............................................................................................. - ldr.w r11, [r0, #448] // .....*.............................................................................................. - ldr.w r4, [r0, #0] // ......*............................................................................................. - ldr.w r9, [r0, #192] // .......*............................................................................................ - movw r0, #24608 // .......*............................................................................................ - smlabt r6, r6, r12, r0 // ........*........................................................................................... - smlabt r7, r7, r12, r0 // .........*.......................................................................................... - smlabt r5, r5, r12, r0 // ..........*......................................................................................... - pkhtb r7, r6, r7, asr #16 // ...........*........................................................................................ - smlabt r2, r2, r12, r0 // ...........*........................................................................................ - smulwb r6, r3, r9 // ............*....................................................................................... - smulwt r9, r3, r9 // .............*...................................................................................... - pkhtb r5, r2, r5, asr #16 // .............*...................................................................................... - smlabt r6, r6, r12, r0 // ..............*..................................................................................... - uadd16 r2, r4, r5 // ..............*..................................................................................... - usub16 r5, r4, r5 // ...............*.................................................................................... - smlabt r4, r9, r12, r0 // ...............*.................................................................................... - smulwb r9, r3, r11 // ................*................................................................................... - pkhtb r6, r4, r6, asr #16 // .................*.................................................................................. - smulwt r4, r3, r11 // .................*.................................................................................. - uadd16 r3, r10, r6 // ..................*................................................................................. - smlabt r11, r9, r12, r0 // ..................*................................................................................. - usub16 r9, r10, r6 // ...................*................................................................................ - ldr r10, [r1], #8 // ...................*................................................................................ - uadd16 r6, r14, r7 // ....................*............................................................................... - smlabt r4, r4, r12, r0 // ....................*............................................................................... - usub16 r14, r14, r7 // .....................*.............................................................................. - smulwt r7, r10, r3 // .....................*.............................................................................. - pkhtb r11, r4, r11, asr #16 // ......................*............................................................................. - smulwb r3, r10, r3 // ......................*............................................................................. - smlabt r7, r7, r12, r0 // .......................*............................................................................ - uadd16 r4, r8, r11 // .......................*............................................................................ - usub16 r11, r8, r11 // ........................*........................................................................... - smlabt r8, r3, r12, r0 // ........................*........................................................................... - smulwb r3, r10, r4 // .........................*.......................................................................... - smulwt r4, r10, r4 // ..........................*......................................................................... - pkhtb r10, r7, r8, asr #16 // ..........................*......................................................................... - smlabt r3, r3, r12, r0 // ...........................*........................................................................ - ldr r7, [r1, #-4] // ...........................*........................................................................ - smlabt r8, r4, r12, r0 // ............................*....................................................................... - usub16 r4, r2, r10 // ............................*....................................................................... - uadd16 r2, r2, r10 // .............................*...................................................................... - smulwt r10, r7, r11 // .............................*...................................................................... - pkhtb r8, r8, r3, asr #16 // ..............................*..................................................................... - smulwb r11, r7, r11 // ..............................*..................................................................... - smlabt r10, r10, r12, r0 // ...............................*.................................................................... - uadd16 r3, r6, r8 // ...............................*.................................................................... - smlabt r11, r11, r12, r0 // ................................*................................................................... - usub16 r6, r6, r8 // ................................*................................................................... - smulwt r8, r7, r9 // .................................*.................................................................. - smulwb r9, r7, r9 // ..................................*................................................................. - pkhtb r7, r10, r11, asr #16 // ..................................*................................................................. - ldr r10, [r1], #8 // ...................................*................................................................ - smlabt r8, r8, r12, r0 // ...................................*................................................................ - uadd16 r11, r14, r7 // ....................................*............................................................... - smlabt r9, r9, r12, r0 // ....................................*............................................................... - usub16 r7, r14, r7 // .....................................*.............................................................. - smulwb r14, r10, r3 // .....................................*.............................................................. - pkhtb r9, r8, r9, asr #16 // ......................................*............................................................. - smulwt r8, r10, r3 // ......................................*............................................................. - smlabt r3, r14, r12, r0 // .......................................*............................................................ - ldr r10, [r1, #-4] // .......................................*............................................................ - smlabt r14, r8, r12, r0 // ........................................*........................................................... - uadd16 r8, r5, r9 // ........................................*........................................................... - usub16 r9, r5, r9 // .........................................*.......................................................... - smulwb r5, r10, r11 // .........................................*.......................................................... - smulwt r10, r10, r11 // ..........................................*......................................................... - pkhtb r14, r14, r3, asr #16 // ..........................................*......................................................... - smlabt r11, r5, r12, r0 // ...........................................*........................................................ - ldr r5, [r1], #8 // ...........................................*........................................................ - smlabt r10, r10, r12, r0 // ............................................*....................................................... - smulwt r3, r5, r6 // .............................................*...................................................... - pkhtb r11, r10, r11, asr #16 // ..............................................*..................................................... - smulwb r10, r5, r6 // ..............................................*..................................................... - smlabt r3, r3, r12, r0 // ...............................................*.................................................... - ldr r5, [r1, #-4] // ...............................................*.................................................... - smlabt r10, r10, r12, r0 // ................................................*................................................... - usub16 r6, r2, r14 // ................................................*................................................... - uadd16 r2, r2, r14 // .................................................*.................................................. - smulwt r14, r5, r7 // .................................................*.................................................. - pkhtb r10, r3, r10, asr #16 // ..................................................*................................................. - smulwb r3, r5, r7 // ..................................................*................................................. - smlabt r7, r14, r12, r0 // ...................................................*................................................ - ldr r5, [r1, #20] // ...................................................*................................................ - smlabt r3, r3, r12, r0 // ....................................................*............................................... - usub16 r14, r8, r11 // ....................................................*............................................... - uadd16 r11, r8, r11 // .....................................................*.............................................. - smulwt r8, r5, r14 // .....................................................*.............................................. - smulwb r14, r5, r14 // ......................................................*............................................. - ldr r5, [r1, #4] // ......................................................*............................................. - smlabt r8, r8, r12, r0 // .......................................................*............................................ - pkhtb r7, r7, r3, asr #16 // .......................................................*............................................ - smulwb r3, r5, r11 // ........................................................*........................................... - smulwt r11, r5, r11 // .........................................................*.......................................... - smlabt r14, r14, r12, r0 // ..........................................................*......................................... - smlabt r11, r11, r12, r0 // ...........................................................*........................................ - ldr r5, [r1], #8 // ...........................................................*........................................ - smlabt r3, r3, r12, r0 // ............................................................*....................................... - pkhtb r14, r8, r14, asr #16 // ............................................................*....................................... - smulwt r8, r5, r2 // .............................................................*...................................... - smulwb r2, r5, r2 // ..............................................................*..................................... - ldr r5, [r1, #20] // ..............................................................*..................................... - smlabt r8, r8, r12, r0 // ...............................................................*.................................... - pkhtb r3, r11, r3, asr #16 // ...............................................................*.................................... - usub16 r11, r9, r7 // ................................................................*................................... - smlabt r2, r2, r12, r0 // ................................................................*................................... - uadd16 r7, r9, r7 // .................................................................*.................................. - smulwb r9, r5, r11 // .................................................................*.................................. - pkhtb r8, r8, r2, asr #16 // ..................................................................*................................. - smulwt r5, r5, r11 // ..................................................................*................................. - smlabt r2, r9, r12, r0 // ...................................................................*................................ - ldr r11, [r1], #8 // ...................................................................*................................ - smlabt r9, r5, r12, r0 // ....................................................................*............................... - uadd16 r5, r4, r10 // ....................................................................*............................... - usub16 r10, r4, r10 // .....................................................................*.............................. - smulwb r4, r11, r5 // .....................................................................*.............................. - pkhtb r9, r9, r2, asr #16 // ......................................................................*............................. - smulwt r2, r11, r5 // ......................................................................*............................. - ldr r5, [r1, #-4] // .......................................................................*............................ - smlabt r4, r4, r12, r0 // .......................................................................*............................ - smlabt r11, r2, r12, r0 // ........................................................................*........................... - smulwb r2, r5, r7 // .........................................................................*.......................... - pkhtb r11, r11, r4, asr #16 // ..........................................................................*......................... - smulwt r7, r5, r7 // ..........................................................................*......................... - smlabt r4, r2, r12, r0 // ...........................................................................*........................ - ldr r5, [r1], #8 // ...........................................................................*........................ - smlabt r7, r7, r12, r0 // ............................................................................*....................... - smulwt r2, r5, r6 // .............................................................................*...................... - pkhtb r4, r7, r4, asr #16 // ..............................................................................*..................... - smulwb r7, r5, r6 // ..............................................................................*..................... - smlabt r6, r2, r12, r0 // ...............................................................................*.................... - ldr r2, [r1], #8 // ...............................................................................*.................... - smlabt r5, r7, r12, r0 // ................................................................................*................... - smulwt r7, r2, r10 // .................................................................................*.................. - pkhtb r6, r6, r5, asr #16 // ..................................................................................*................. - smulwb r2, r2, r10 // ..................................................................................*................. - smlabt r10, r7, r12, r0 // ...................................................................................*................ - smlabt r2, r2, r12, r0 // ....................................................................................*............... - vmov r0, s6 // ....................................................................................*............... - str.w r4, [r0, #192] // .....................................................................................*.............. - pkhtb r5, r10, r2, asr #16 // ......................................................................................*............. - str.w r8, [r0], #4 // .......................................................................................*............ // @slothy:core - str.w r3, [r0, #60] // .........................................................................................*.......... - str.w r14, [r0, #316] // ...........................................................................................*........ - str.w r5, [r0, #380] // .............................................................................................*...... - str.w r11, [r0, #124] // ...............................................................................................*.... - str.w r9, [r0, #444] // .................................................................................................*.. - str.w r6, [r0, #252] // ...................................................................................................* + // |------------------------|------------------------|------------------------|------------------ + ldr.w r7, [r1], #4 // *............................................................................................. + ldr.w r9, [r0, #320] // *............................................................................................. + ldr.w r14, [r0, #64] // .*............................................................................................ + ldr.w r11, [r0, #0] // ..*........................................................................................... + smulwt r10, r7, r9 // ..*........................................................................................... + ldr.w r8, [r0, #448] // ...*.......................................................................................... + smulwt r2, r7, r14 // ...*.......................................................................................... + ldr.w r5, [r0, #256] // ....*......................................................................................... + vmov s6, r0 // ....*......................................................................................... + ldr.w r6, [r0, #192] // .....*........................................................................................ + smulwb r9, r7, r9 // .....*........................................................................................ + ldr.w r4, [r0, #384] // ......*....................................................................................... + smulwb r14, r7, r14 // ......*....................................................................................... + ldr.w r3, [r0, #128] // .......*...................................................................................... + movw r0, #24608 // ........*..................................................................................... + smlabt r9, r9, r12, r0 // ........*..................................................................................... + smlabt r10, r10, r12, r0 // .........*.................................................................................... + smlabt r14, r14, r12, r0 // ..........*................................................................................... + pkhtb r9, r10, r9, asr #16 // ...........*.................................................................................. + smlabt r2, r2, r12, r0 // ...........*.................................................................................. + smulwb r10, r7, r6 // ............*................................................................................. + pkhtb r14, r2, r14, asr #16 // .............*................................................................................ + smulwt r6, r7, r6 // .............*................................................................................ + uadd16 r2, r11, r14 // ..............*............................................................................... + smlabt r10, r10, r12, r0 // ..............*............................................................................... + usub16 r14, r11, r14 // ...............*.............................................................................. + smlabt r6, r6, r12, r0 // ...............*.............................................................................. + uadd16 r11, r5, r9 // ................*............................................................................. + pkhtb r10, r6, r10, asr #16 // .................*............................................................................ + smulwt r6, r7, r8 // .................*............................................................................ + usub16 r9, r5, r9 // ..................*........................................................................... + smulwb r7, r7, r8 // ..................*........................................................................... + ldr r8, [r1, #4] // ...................*.......................................................................... + smlabt r6, r6, r12, r0 // ...................*.......................................................................... + usub16 r5, r3, r10 // ....................*......................................................................... + smlabt r7, r7, r12, r0 // ....................*......................................................................... + uadd16 r10, r3, r10 // .....................*........................................................................ + smulwb r3, r8, r5 // .....................*........................................................................ + smulwt r5, r8, r5 // ......................*....................................................................... + pkhtb r7, r6, r7, asr #16 // .......................*...................................................................... + smlabt r6, r3, r12, r0 // .......................*...................................................................... + usub16 r3, r4, r7 // ........................*..................................................................... + smlabt r5, r5, r12, r0 // ........................*..................................................................... + uadd16 r7, r4, r7 // .........................*.................................................................... + smulwb r4, r8, r3 // .........................*.................................................................... + pkhtb r5, r5, r6, asr #16 // ..........................*................................................................... + smulwt r8, r8, r3 // ..........................*................................................................... + ldr r6, [r1], #8 // ...........................*.................................................................. + smlabt r4, r4, r12, r0 // ...........................*.................................................................. + uadd16 r3, r14, r5 // ............................*................................................................. + smlabt r8, r8, r12, r0 // ............................*................................................................. + usub16 r5, r14, r5 // .............................*................................................................ + smulwb r14, r6, r10 // .............................*................................................................ + pkhtb r8, r8, r4, asr #16 // ..............................*............................................................... + smulwt r10, r6, r10 // ..............................*............................................................... + uadd16 r4, r9, r8 // ...............................*.............................................................. + smlabt r14, r14, r12, r0 // ...............................*.............................................................. + usub16 r9, r9, r8 // ................................*............................................................. + smlabt r10, r10, r12, r0 // ................................*............................................................. + smulwb r8, r6, r7 // .................................*............................................................ + pkhtb r10, r10, r14, asr #16 // ..................................*........................................................... + smulwt r6, r6, r7 // ..................................*........................................................... + ldr r14, [r1, #4] // ...................................*.......................................................... + smlabt r7, r8, r12, r0 // ...................................*.......................................................... + uadd16 r8, r2, r10 // ....................................*......................................................... + smlabt r6, r6, r12, r0 // ....................................*......................................................... + usub16 r10, r2, r10 // .....................................*........................................................ + smulwb r2, r14, r4 // .....................................*........................................................ + pkhtb r7, r6, r7, asr #16 // ......................................*....................................................... + smulwt r4, r14, r4 // ......................................*....................................................... + ldr r6, [r1], #8 // .......................................*...................................................... + smlabt r2, r2, r12, r0 // .......................................*...................................................... + uadd16 r14, r11, r7 // ........................................*..................................................... + smlabt r4, r4, r12, r0 // ........................................*..................................................... + usub16 r11, r11, r7 // .........................................*.................................................... + smulwb r7, r6, r14 // .........................................*.................................................... + pkhtb r2, r4, r2, asr #16 // ..........................................*................................................... + smulwt r6, r6, r14 // ..........................................*................................................... + ldr r14, [r1, #4] // ...........................................*.................................................. + smlabt r4, r7, r12, r0 // ...........................................*.................................................. + usub16 r7, r3, r2 // ............................................*................................................. + smlabt r6, r6, r12, r0 // ............................................*................................................. + uadd16 r3, r3, r2 // .............................................*................................................ + smulwb r2, r14, r9 // .............................................*................................................ + pkhtb r6, r6, r4, asr #16 // ..............................................*............................................... + smulwt r14, r14, r9 // ..............................................*............................................... + ldr r9, [r1], #8 // ...............................................*.............................................. + smlabt r2, r2, r12, r0 // ...............................................*.............................................. + uadd16 r4, r8, r6 // ................................................*............................................. + smlabt r14, r14, r12, r0 // ................................................*............................................. + usub16 r6, r8, r6 // .................................................*............................................ + smulwb r8, r9, r11 // .................................................*............................................ + pkhtb r2, r14, r2, asr #16 // ..................................................*........................................... + smulwt r9, r9, r11 // ..................................................*........................................... + ldr r11, [r1, #4] // ...................................................*.......................................... + smlabt r14, r8, r12, r0 // ...................................................*.......................................... + smlabt r8, r9, r12, r0 // ....................................................*......................................... + smulwb r9, r11, r3 // .....................................................*........................................ + pkhtb r8, r8, r14, asr #16 // ......................................................*....................................... + smulwt r3, r11, r3 // ......................................................*....................................... + ldr r14, [r1], #8 // .......................................................*...................................... + smlabt r11, r9, r12, r0 // .......................................................*...................................... + uadd16 r9, r5, r2 // ........................................................*..................................... + smlabt r3, r3, r12, r0 // ........................................................*..................................... + usub16 r5, r5, r2 // .........................................................*.................................... + smulwb r2, r14, r4 // .........................................................*.................................... + pkhtb r3, r3, r11, asr #16 // ..........................................................*................................... + smulwt r11, r14, r4 // ..........................................................*................................... + ldr r4, [r1, #4] // ...........................................................*.................................. + smlabt r2, r2, r12, r0 // ...........................................................*.................................. + uadd16 r14, r10, r8 // ............................................................*................................. + smlabt r11, r11, r12, r0 // ............................................................*................................. + usub16 r10, r10, r8 // .............................................................*................................ + smulwt r8, r4, r9 // .............................................................*................................ + pkhtb r11, r11, r2, asr #16 // ..............................................................*............................... + smulwb r4, r4, r9 // ..............................................................*............................... + ldr r9, [r1], #8 // ...............................................................*.............................. + smlabt r2, r8, r12, r0 // ...............................................................*.............................. + smlabt r4, r4, r12, r0 // ................................................................*............................. + smulwb r8, r9, r14 // .................................................................*............................ + smulwt r14, r9, r14 // ..................................................................*........................... + ldr r9, [r1, #4] // ...................................................................*.......................... + smlabt r8, r8, r12, r0 // ...................................................................*.......................... + pkhtb r2, r2, r4, asr #16 // ....................................................................*......................... + smlabt r4, r14, r12, r0 // ....................................................................*......................... + smulwt r14, r9, r7 // .....................................................................*........................ + pkhtb r8, r4, r8, asr #16 // ......................................................................*....................... + smulwb r9, r9, r7 // ......................................................................*....................... + ldr r7, [r1], #8 // .......................................................................*...................... + smlabt r14, r14, r12, r0 // .......................................................................*...................... + smlabt r9, r9, r12, r0 // ........................................................................*..................... + smulwt r4, r7, r6 // .........................................................................*.................... + smulwb r6, r7, r6 // ..........................................................................*................... + ldr r7, [r1, #4] // ...........................................................................*.................. + smlabt r4, r4, r12, r0 // ...........................................................................*.................. + pkhtb r14, r14, r9, asr #16 // ............................................................................*................. + smlabt r9, r6, r12, r0 // ............................................................................*................. + smulwb r6, r7, r5 // .............................................................................*................ + smulwt r7, r7, r5 // ..............................................................................*............... + ldr r5, [r1], #8 // ...............................................................................*.............. + smlabt r6, r6, r12, r0 // ...............................................................................*.............. + pkhtb r4, r4, r9, asr #16 // ................................................................................*............. + smlabt r7, r7, r12, r0 // ................................................................................*............. + smulwb r9, r5, r10 // .................................................................................*............ + smulwt r5, r5, r10 // ..................................................................................*........... + smlabt r10, r9, r12, r0 // ...................................................................................*.......... + pkhtb r9, r7, r6, asr #16 // ....................................................................................*......... + smlabt r6, r5, r12, r0 // ....................................................................................*......... + vmov r0, s6 // .....................................................................................*........ + str.w r9, [r0, #448] // .....................................................................................*........ + str.w r8, [r0, #128] // ......................................................................................*....... + str.w r3, [r0, #64] // .......................................................................................*...... + pkhtb r9, r6, r10, asr #16 // ........................................................................................*..... + str.w r9, [r0, #384] // ........................................................................................*..... + str.w r2, [r0, #192] // .........................................................................................*.... + str.w r14, [r0, #320] // ..........................................................................................*... + str.w r4, [r0, #256] // ...........................................................................................*.. + str.w r11, [r0], #4 // ............................................................................................*. // @slothy:core + vmov r11, s14 // ............................................................................................*. + cmp.w r0, r11 // .............................................................................................* - // ---------------------------------------- cycle (expected) -----------------------------------------> - // 0 25 50 75 - // |------------------------|------------------------|------------------------|------------------------ - // ldr.w r2, [r1], #4 // *................................................................................................... - // ldr.w r7, [r0, #192] // .......*............................................................................................ - // ldr.w r11, [r0, #64] // ..*................................................................................................. - // vmov s6, r0 // .*.................................................................................................. - // smulwb r9, r2, r7 // ............*....................................................................................... - // ldr.w r4, [r0, #448] // .....*.............................................................................................. - // smulwt r6, r2, r7 // .............*...................................................................................... - // ldr.w r5, [r0, #320] // *................................................................................................... - // ldr.w r7, [r0, #256] // ....*............................................................................................... - // smulwt r14, r2, r11 // ....*............................................................................................... - // ldr.w r8, [r0, #384] // .*.................................................................................................. - // smulwb r3, r2, r11 // .....*.............................................................................................. - // ldr.w r11, [r0, #0] // ......*............................................................................................. - // ldr.w r10, [r0, #128] // ...*................................................................................................ - // movw r0, #24608 // .......*............................................................................................ - // smlabt r14, r14, r12, r0 // ...........*........................................................................................ - // smlabt r3, r3, r12, r0 // ..........*......................................................................................... - // smlabt r9, r9, r12, r0 // ..............*..................................................................................... - // pkhtb r3, r14, r3, asr #16 // .............*...................................................................................... - // smlabt r14, r6, r12, r0 // ...............*.................................................................................... - // smulwb r6, r2, r5 // ..*................................................................................................. - // pkhtb r9, r14, r9, asr #16 // .................*.................................................................................. - // smulwt r14, r2, r5 // ...*................................................................................................ - // usub16 r5, r11, r3 // ...............*.................................................................................... - // smlabt r6, r6, r12, r0 // .........*.......................................................................................... - // uadd16 r3, r11, r3 // ..............*..................................................................................... - // smlabt r11, r14, r12, r0 // ........*........................................................................................... - // smulwb r14, r2, r4 // ................*................................................................................... - // smulwt r4, r2, r4 // .................*.................................................................................. - // pkhtb r2, r11, r6, asr #16 // ...........*........................................................................................ - // ldr r6, [r1], #8 // ...................*................................................................................ - // smlabt r14, r14, r12, r0 // ..................*................................................................................. - // usub16 r11, r10, r9 // ...................*................................................................................ - // smlabt r4, r4, r12, r0 // ....................*............................................................................... - // uadd16 r9, r10, r9 // ..................*................................................................................. - // pkhtb r4, r4, r14, asr #16 // ......................*............................................................................. - // smulwt r10, r6, r9 // .....................*.............................................................................. - // usub16 r14, r7, r2 // .....................*.............................................................................. - // smulwb r9, r6, r9 // ......................*............................................................................. - // smlabt r10, r10, r12, r0 // .......................*............................................................................ - // uadd16 r7, r7, r2 // ....................*............................................................................... - // uadd16 r2, r8, r4 // .......................*............................................................................ - // smlabt r9, r9, r12, r0 // ........................*........................................................................... - // usub16 r8, r8, r4 // ........................*........................................................................... - // smulwb r4, r6, r2 // .........................*.......................................................................... - // smulwt r6, r6, r2 // ..........................*......................................................................... - // pkhtb r2, r10, r9, asr #16 // ..........................*......................................................................... - // smlabt r10, r4, r12, r0 // ...........................*........................................................................ - // ldr r9, [r1, #-4] // ...........................*........................................................................ - // smlabt r6, r6, r12, r0 // ............................*....................................................................... - // usub16 r4, r3, r2 // ............................*....................................................................... - // uadd16 r2, r3, r2 // .............................*...................................................................... - // smulwt r3, r9, r8 // .............................*...................................................................... - // pkhtb r10, r6, r10, asr #16 // ..............................*..................................................................... - // smulwb r6, r9, r8 // ..............................*..................................................................... - // smlabt r8, r3, r12, r0 // ...............................*.................................................................... - // uadd16 r3, r7, r10 // ...............................*.................................................................... - // usub16 r10, r7, r10 // ................................*................................................................... - // smlabt r7, r6, r12, r0 // ................................*................................................................... - // smulwt r6, r9, r11 // .................................*.................................................................. - // smulwb r9, r9, r11 // ..................................*................................................................. - // pkhtb r8, r8, r7, asr #16 // ..................................*................................................................. - // ldr r11, [r1], #8 // ...................................*................................................................ - // smlabt r7, r6, r12, r0 // ...................................*................................................................ - // smlabt r6, r9, r12, r0 // ....................................*............................................................... - // usub16 r9, r14, r8 // .....................................*.............................................................. - // uadd16 r14, r14, r8 // ....................................*............................................................... - // smulwb r8, r11, r3 // .....................................*.............................................................. - // smulwt r11, r11, r3 // ......................................*............................................................. - // pkhtb r6, r7, r6, asr #16 // ......................................*............................................................. - // smlabt r7, r8, r12, r0 // .......................................*............................................................ - // ldr r8, [r1, #-4] // .......................................*............................................................ - // uadd16 r3, r5, r6 // ........................................*........................................................... - // smlabt r11, r11, r12, r0 // ........................................*........................................................... - // usub16 r5, r5, r6 // .........................................*.......................................................... - // smulwt r6, r8, r14 // ..........................................*......................................................... - // smulwb r8, r8, r14 // .........................................*.......................................................... - // ldr r14, [r1], #8 // ...........................................*........................................................ - // pkhtb r11, r11, r7, asr #16 // ..........................................*......................................................... - // smlabt r6, r6, r12, r0 // ............................................*....................................................... - // smlabt r8, r8, r12, r0 // ...........................................*........................................................ - // smulwt r7, r14, r10 // .............................................*...................................................... - // pkhtb r8, r6, r8, asr #16 // ..............................................*..................................................... - // smulwb r10, r14, r10 // ..............................................*..................................................... - // ldr r14, [r1, #-4] // ...............................................*.................................................... - // smlabt r7, r7, r12, r0 // ...............................................*.................................................... - // smlabt r10, r10, r12, r0 // ................................................*................................................... - // usub16 r6, r2, r11 // ................................................*................................................... - // uadd16 r2, r2, r11 // .................................................*.................................................. - // smulwt r11, r14, r9 // .................................................*.................................................. - // smulwb r9, r14, r9 // ..................................................*................................................. - // pkhtb r10, r7, r10, asr #16 // ..................................................*................................................. - // ldr r14, [r1], #8 // ...........................................................*........................................ - // smlabt r11, r11, r12, r0 // ...................................................*................................................ - // usub16 r7, r3, r8 // ....................................................*............................................... - // smlabt r9, r9, r12, r0 // ....................................................*............................................... - // uadd16 r8, r3, r8 // .....................................................*.............................................. - // smulwt r3, r14, r2 // .............................................................*...................................... - // pkhtb r9, r11, r9, asr #16 // .......................................................*............................................ - // smulwb r11, r14, r2 // ..............................................................*..................................... - // ldr r14, [r1, #12] // ...................................................*................................................ - // smlabt r2, r3, r12, r0 // ...............................................................*.................................... - // smlabt r11, r11, r12, r0 // ................................................................*................................... - // uadd16 r3, r5, r9 // .................................................................*.................................. - // usub16 r9, r5, r9 // ................................................................*................................... - // smulwb r5, r14, r7 // ......................................................*............................................. - // smulwt r7, r14, r7 // .....................................................*.............................................. - // smlabt r14, r5, r12, r0 // ..........................................................*......................................... - // ldr r5, [r1, #4] // .......................................................................*............................ - // pkhtb r2, r2, r11, asr #16 // ..................................................................*................................. - // smlabt r7, r7, r12, r0 // .......................................................*............................................ - // smulwb r11, r5, r3 // .........................................................................*.......................... - // smulwt r3, r5, r3 // ..........................................................................*......................... - // ldr r5, [r1, #20] // ..............................................................*..................................... - // smlabt r11, r11, r12, r0 // ...........................................................................*........................ - // pkhtb r14, r7, r14, asr #16 // ............................................................*....................................... - // smulwb r7, r5, r9 // .................................................................*.................................. - // smulwt r9, r5, r9 // ..................................................................*................................. - // smlabt r5, r3, r12, r0 // ............................................................................*....................... - // smlabt r3, r9, r12, r0 // ....................................................................*............................... - // ldr r9, [r1, #-4] // ......................................................*............................................. - // pkhtb r5, r5, r11, asr #16 // ..............................................................................*..................... - // smlabt r7, r7, r12, r0 // ...................................................................*................................ - // smulwb r11, r9, r8 // ........................................................*........................................... - // pkhtb r7, r3, r7, asr #16 // ......................................................................*............................. - // smulwt r3, r9, r8 // .........................................................*.......................................... - // smlabt r9, r11, r12, r0 // ............................................................*....................................... - // ldr r8, [r1], #8 // ...................................................................*................................ - // uadd16 r11, r4, r10 // ....................................................................*............................... - // smlabt r3, r3, r12, r0 // ...........................................................*........................................ - // usub16 r4, r4, r10 // .....................................................................*.............................. - // smulwt r10, r8, r11 // ......................................................................*............................. - // smulwb r11, r8, r11 // .....................................................................*.............................. - // pkhtb r3, r3, r9, asr #16 // ...............................................................*.................................... - // ldr r8, [r1], #8 // ...........................................................................*........................ - // smlabt r9, r10, r12, r0 // ........................................................................*........................... - // smlabt r11, r11, r12, r0 // .......................................................................*............................ - // smulwt r10, r8, r6 // .............................................................................*...................... - // smulwb r6, r8, r6 // ..............................................................................*..................... - // pkhtb r11, r9, r11, asr #16 // ..........................................................................*......................... - // ldr r9, [r1], #8 // ...............................................................................*.................... - // smlabt r8, r10, r12, r0 // ...............................................................................*.................... - // smlabt r6, r6, r12, r0 // ................................................................................*................... - // smulwt r10, r9, r4 // .................................................................................*.................. - // smulwb r9, r9, r4 // ..................................................................................*................. - // pkhtb r4, r8, r6, asr #16 // ..................................................................................*................. - // smlabt r8, r10, r12, r0 // ...................................................................................*................ - // smlabt r6, r9, r12, r0 // ....................................................................................*............... - // vmov r0, s6 // ....................................................................................*............... - // str.w r14, [r0, #320] // ...........................................................................................*........ - // str.w r3, [r0, #64] // .........................................................................................*.......... - // str.w r4, [r0, #256] // ...................................................................................................* - // pkhtb r6, r8, r6, asr #16 // ......................................................................................*............. - // str.w r6, [r0, #384] // .............................................................................................*...... - // str.w r7, [r0, #448] // .................................................................................................*.. - // str.w r2, [r0], #4 // .......................................................................................*............ - // str.w r5, [r0, #188] // .....................................................................................*.............. - // str.w r11, [r0, #124] // ...............................................................................................*.... + // ------------------------------------- cycle (expected) --------------------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------|------------------ + // ldr.w r6, [r1], #4 // *............................................................................................. + // ldr.w r3, [r0, #64] // .*............................................................................................ + // ldr.w r9, [r0, #192] // .....*........................................................................................ + // vmov s6, r0 // ....*......................................................................................... + // ldr.w r4, [r0, #448] // ...*.......................................................................................... + // smulwb r2, r6, r3 // ......*....................................................................................... + // ldr.w r5, [r0, #0] // ..*........................................................................................... + // smulwt r14, r6, r3 // ...*.......................................................................................... + // ldr.w r3, [r0, #320] // *............................................................................................. + // ldr.w r7, [r0, #256] // ....*......................................................................................... + // ldr.w r10, [r0, #128] // .......*...................................................................................... + // smulwb r11, r6, r3 // .....*........................................................................................ + // ldr.w r8, [r0, #384] // ......*....................................................................................... + // smulwt r3, r6, r3 // ..*........................................................................................... + // movw r0, #24608 // ........*..................................................................................... + // smlabt r2, r2, r12, r0 // ..........*................................................................................... + // smlabt r14, r14, r12, r0 // ...........*.................................................................................. + // smlabt r11, r11, r12, r0 // ........*..................................................................................... + // pkhtb r2, r14, r2, asr #16 // .............*................................................................................ + // smlabt r3, r3, r12, r0 // .........*.................................................................................... + // smulwb r14, r6, r9 // ............*................................................................................. + // pkhtb r3, r3, r11, asr #16 // ...........*.................................................................................. + // smulwt r9, r6, r9 // .............*................................................................................ + // smlabt r11, r14, r12, r0 // ..............*............................................................................... + // usub16 r14, r5, r2 // ...............*.............................................................................. + // smlabt r9, r9, r12, r0 // ...............*.............................................................................. + // uadd16 r2, r5, r2 // ..............*............................................................................... + // smulwt r5, r6, r4 // .................*............................................................................ + // pkhtb r11, r9, r11, asr #16 // .................*............................................................................ + // smulwb r9, r6, r4 // ..................*........................................................................... + // uadd16 r6, r10, r11 // .....................*........................................................................ + // smlabt r4, r5, r12, r0 // ...................*.......................................................................... + // usub16 r5, r10, r11 // ....................*......................................................................... + // ldr r10, [r1, #4] // ...................*.......................................................................... + // uadd16 r11, r7, r3 // ................*............................................................................. + // smlabt r9, r9, r12, r0 // ....................*......................................................................... + // usub16 r7, r7, r3 // ..................*........................................................................... + // smulwb r3, r10, r5 // .....................*........................................................................ + // smulwt r5, r10, r5 // ......................*....................................................................... + // pkhtb r4, r4, r9, asr #16 // .......................*...................................................................... + // smlabt r9, r3, r12, r0 // .......................*...................................................................... + // usub16 r3, r8, r4 // ........................*..................................................................... + // smlabt r5, r5, r12, r0 // ........................*..................................................................... + // uadd16 r8, r8, r4 // .........................*.................................................................... + // smulwb r4, r10, r3 // .........................*.................................................................... + // pkhtb r5, r5, r9, asr #16 // ..........................*................................................................... + // smulwt r9, r10, r3 // ..........................*................................................................... + // ldr r10, [r1], #8 // ...........................*.................................................................. + // smlabt r4, r4, r12, r0 // ...........................*.................................................................. + // uadd16 r3, r14, r5 // ............................*................................................................. + // smlabt r9, r9, r12, r0 // ............................*................................................................. + // usub16 r5, r14, r5 // .............................*................................................................ + // smulwb r14, r10, r8 // .................................*............................................................ + // pkhtb r9, r9, r4, asr #16 // ..............................*............................................................... + // smulwt r4, r10, r8 // ..................................*........................................................... + // smlabt r8, r14, r12, r0 // ...................................*.......................................................... + // uadd16 r14, r7, r9 // ...............................*.............................................................. + // smlabt r4, r4, r12, r0 // ....................................*......................................................... + // usub16 r9, r7, r9 // ................................*............................................................. + // smulwb r7, r10, r6 // .............................*................................................................ + // pkhtb r4, r4, r8, asr #16 // ......................................*....................................................... + // smulwt r10, r10, r6 // ..............................*............................................................... + // ldr r8, [r1, #4] // ...................................*.......................................................... + // smlabt r7, r7, r12, r0 // ...............................*.............................................................. + // uadd16 r6, r11, r4 // ........................................*..................................................... + // smlabt r10, r10, r12, r0 // ................................*............................................................. + // usub16 r4, r11, r4 // .........................................*.................................................... + // smulwb r11, r8, r14 // .....................................*........................................................ + // pkhtb r10, r10, r7, asr #16 // ..................................*........................................................... + // smulwt r7, r8, r14 // ......................................*....................................................... + // ldr r14, [r1], #8 // .......................................*...................................................... + // smlabt r11, r11, r12, r0 // .......................................*...................................................... + // usub16 r8, r2, r10 // .....................................*........................................................ + // smlabt r7, r7, r12, r0 // ........................................*..................................................... + // uadd16 r2, r2, r10 // ....................................*......................................................... + // smulwb r10, r14, r6 // .........................................*.................................................... + // pkhtb r7, r7, r11, asr #16 // ..........................................*................................................... + // smulwt r14, r14, r6 // ..........................................*................................................... + // ldr r6, [r1, #4] // ...........................................*.................................................. + // smlabt r10, r10, r12, r0 // ...........................................*.................................................. + // smlabt r11, r14, r12, r0 // ............................................*................................................. + // smulwb r14, r6, r9 // .............................................*................................................ + // pkhtb r10, r11, r10, asr #16 // ..............................................*............................................... + // smulwt r9, r6, r9 // ..............................................*............................................... + // ldr r11, [r1], #8 // ...............................................*.............................................. + // smlabt r14, r14, r12, r0 // ...............................................*.............................................. + // usub16 r6, r2, r10 // .................................................*............................................ + // smlabt r9, r9, r12, r0 // ................................................*............................................. + // uadd16 r2, r2, r10 // ................................................*............................................. + // smulwb r10, r11, r4 // .................................................*............................................ + // pkhtb r9, r9, r14, asr #16 // ..................................................*........................................... + // smulwt r4, r11, r4 // ..................................................*........................................... + // ldr r11, [r1, #4] // ...................................................*.......................................... + // smlabt r10, r10, r12, r0 // ...................................................*.......................................... + // uadd16 r14, r3, r7 // .............................................*................................................ + // smlabt r4, r4, r12, r0 // ....................................................*......................................... + // usub16 r7, r3, r7 // ............................................*................................................. + // smulwb r3, r11, r14 // .....................................................*........................................ + // pkhtb r10, r4, r10, asr #16 // ......................................................*....................................... + // smulwt r4, r11, r14 // ......................................................*....................................... + // ldr r11, [r1], #8 // .......................................................*...................................... + // smlabt r3, r3, r12, r0 // .......................................................*...................................... + // uadd16 r14, r8, r10 // ............................................................*................................. + // smlabt r4, r4, r12, r0 // ........................................................*..................................... + // usub16 r8, r8, r10 // .............................................................*................................ + // smulwb r10, r11, r2 // .........................................................*.................................... + // pkhtb r3, r4, r3, asr #16 // ..........................................................*................................... + // smulwt r11, r11, r2 // ..........................................................*................................... + // ldr r4, [r1, #4] // ...........................................................*.................................. + // smlabt r2, r10, r12, r0 // ...........................................................*.................................. + // uadd16 r10, r5, r9 // ........................................................*..................................... + // smlabt r11, r11, r12, r0 // ............................................................*................................. + // usub16 r5, r5, r9 // .........................................................*.................................... + // smulwb r9, r4, r10 // ..............................................................*............................... + // pkhtb r11, r11, r2, asr #16 // ..............................................................*............................... + // smulwt r2, r4, r10 // .............................................................*................................ + // ldr r4, [r1], #8 // ...............................................................*.............................. + // smlabt r10, r9, r12, r0 // ................................................................*............................. + // smlabt r2, r2, r12, r0 // ...............................................................*.............................. + // smulwb r9, r4, r14 // .................................................................*............................ + // pkhtb r10, r2, r10, asr #16 // ....................................................................*......................... + // smulwt r2, r4, r14 // ..................................................................*........................... + // ldr r14, [r1, #4] // ...................................................................*.......................... + // smlabt r9, r9, r12, r0 // ...................................................................*.......................... + // smlabt r4, r2, r12, r0 // ....................................................................*......................... + // smulwt r2, r14, r7 // .....................................................................*........................ + // pkhtb r9, r4, r9, asr #16 // ......................................................................*....................... + // smulwb r14, r14, r7 // ......................................................................*....................... + // ldr r7, [r1], #8 // .......................................................................*...................... + // smlabt r4, r2, r12, r0 // .......................................................................*...................... + // smlabt r2, r14, r12, r0 // ........................................................................*..................... + // smulwt r14, r7, r6 // .........................................................................*.................... + // smulwb r6, r7, r6 // ..........................................................................*................... + // ldr r7, [r1, #4] // ...........................................................................*.................. + // smlabt r14, r14, r12, r0 // ...........................................................................*.................. + // smlabt r6, r6, r12, r0 // ............................................................................*................. + // pkhtb r2, r4, r2, asr #16 // ............................................................................*................. + // smulwb r4, r7, r5 // .............................................................................*................ + // smulwt r7, r7, r5 // ..............................................................................*............... + // ldr r5, [r1], #8 // ...............................................................................*.............. + // smlabt r4, r4, r12, r0 // ...............................................................................*.............. + // pkhtb r14, r14, r6, asr #16 // ................................................................................*............. + // smlabt r6, r7, r12, r0 // ................................................................................*............. + // smulwb r7, r5, r8 // .................................................................................*............ + // pkhtb r4, r6, r4, asr #16 // ....................................................................................*......... + // smulwt r8, r5, r8 // ..................................................................................*........... + // vmov r6, s14 // ............................................................................................*. + // smlabt r5, r7, r12, r0 // ...................................................................................*.......... + // smlabt r8, r8, r12, r0 // ....................................................................................*......... + // vmov r0, s6 // .....................................................................................*........ + // str.w r2, [r0, #320] // ..........................................................................................*... + // str.w r14, [r0, #256] // ...........................................................................................*.. + // str.w r9, [r0, #128] // ......................................................................................*....... + // str.w r3, [r0, #64] // .......................................................................................*...... + // str.w r4, [r0, #448] // .....................................................................................*........ + // pkhtb r3, r8, r5, asr #16 // ........................................................................................*..... + // str.w r3, [r0, #384] // ........................................................................................*..... + // str.w r10, [r0, #192] // .........................................................................................*.... + // str.w r11, [r0], #4 // ............................................................................................*. + // cmp.w r0, r6 // .............................................................................................* - vmov tmp, s14 - cmp poly, tmp bne layer567_loop vpop.w {s16-s23}