From 3ecb6194452b209fcb460a36ff8f06d6b98be2da Mon Sep 17 00:00:00 2001 From: "Matthias J. Kannwischer" Date: Tue, 5 Nov 2024 17:36:33 +0800 Subject: [PATCH 1/3] Add support for .equ in asm https://github.com/slothy-optimizer/slothy/commit/b5f0aa9c9e83b1e51d1e526abfb1411f157b007e https://github.com/slothy-optimizer/slothy/commit/3c00fc88875a83fa9cc2198c52daae1fbaa40f30 https://github.com/slothy-optimizer/slothy/commit/c39ac3c9d9b02fe90aec73e4e053377335ae2fe4 --- slothy/helper.py | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/slothy/helper.py b/slothy/helper.py index 0aebad98..5ef6c50c 100644 --- a/slothy/helper.py +++ b/slothy/helper.py @@ -589,9 +589,14 @@ def _extract_core(source, lbl_start=None, lbl_end=None): class AsmAllocation(): """Helper for tracking register aliases via .req and .unreq""" + # TODO: This is conceptually different and should be + # handled in its own class. + _REGEXP_EQU_TXT = r"\s*\.equ\s+(?P<key>[A-Za-z0-9\_]+)\s*,\s*(?P<val>[A-Za-z0-9()*/+-]+)" + _REGEXP_REQ_TXT = r"\s*(?P<alias>\w+)\s+\.req\s+(?P<reg>\w+)" _REGEXP_UNREQ_TXT = r"\s*\.unreq\s+(?P<alias>\w+)" + _REGEXP_EQU = re.compile(_REGEXP_EQU_TXT) _REGEXP_REQ = re.compile(_REGEXP_REQ_TXT) _REGEXP_UNREQ = re.compile(_REGEXP_UNREQ_TXT) @@ -625,6 +630,12 @@ def check_allocation(line): reg = p.group("reg") return alias, reg + p = AsmAllocation._REGEXP_EQU.match(line.text) + if p is not None: + key = p.group("key") + val = p.group("val") + return key, val + return None @staticmethod @@ -683,10 +694,17 @@ def parse_allocs(src): def unfold_all_aliases(aliases, src): """Unfold aliases in assembly source""" def _apply_single_alias_to_line(alias_from, alias_to, src): - return re.sub(f"(\\W){alias_from}(\\W|\\Z)", f"\\1{alias_to}\\2", src) + res = re.sub(f"(\\W){alias_from}(\\W|\\Z)", f"\\g<1>{alias_to}\\2", src) + return res def _apply_multiple_aliases_to_line(line): - for (alias_from, alias_to) in aliases.items(): - line = _apply_single_alias_to_line(alias_from, alias_to, line) 
+ do_again = True + while do_again: + do_again = False + for (alias_from, alias_to) in aliases.items(): + line_new = _apply_single_alias_to_line(alias_from, alias_to, line) + if line_new != line: + do_again = True + line = line_new return line res = [] for line in src: From 535fac247355305131d8ad484e0e1f680ffd8ffd Mon Sep 17 00:00:00 2001 From: "Matthias J. Kannwischer" Date: Tue, 5 Nov 2024 17:46:06 +0800 Subject: [PATCH 2/3] remove redundant .equ's --- examples/naive/intt_dilithium_12_34_56_78.s | 1 - examples/naive/intt_n256_l6_s32_bar.s | 1 - examples/naive/intt_n256_l6_s32_mont.s | 1 - examples/naive/intt_n256_l8_s32_bar.s | 1 - examples/naive/intt_n256_l8_s32_mont.s | 1 - examples/opt/intt_n256_l6_s32_bar.s | 1 - examples/opt/intt_n256_l6_s32_mont.s | 1 - examples/opt/intt_n256_l8_s32_bar.s | 1 - examples/opt/intt_n256_l8_s32_mont.s | 1 - 9 files changed, 9 deletions(-) diff --git a/examples/naive/intt_dilithium_12_34_56_78.s b/examples/naive/intt_dilithium_12_34_56_78.s index b170cd3e..11f1fda0 100644 --- a/examples/naive/intt_dilithium_12_34_56_78.s +++ b/examples/naive/intt_dilithium_12_34_56_78.s @@ -213,7 +213,6 @@ layer34_loop: // the scope of our work to optimize this: We only want to demonstrate the // ability of Helight to optimize the core loops. 
barrett_const .req r1 - .equ const_barrett, 63 movw barrett_const, #:lower16:const_barrett movt barrett_const, #:upper16:const_barrett mov lr, #64 diff --git a/examples/naive/intt_n256_l6_s32_bar.s b/examples/naive/intt_n256_l6_s32_bar.s index e03446cf..8ccfe65c 100644 --- a/examples/naive/intt_n256_l6_s32_bar.s +++ b/examples/naive/intt_n256_l6_s32_bar.s @@ -160,7 +160,6 @@ layer34_loop: // TEMPORARY: Barrett reduction barrett_const .req r1 - .equ const_barrett, 63 movw barrett_const, #:lower16:const_barrett movt barrett_const, #:upper16:const_barrett mov lr, #64 diff --git a/examples/naive/intt_n256_l6_s32_mont.s b/examples/naive/intt_n256_l6_s32_mont.s index c48b4089..bc21d2d7 100644 --- a/examples/naive/intt_n256_l6_s32_mont.s +++ b/examples/naive/intt_n256_l6_s32_mont.s @@ -290,7 +290,6 @@ layer34_loop: modulus_neg .req r10 neg modulus_neg, modulus barrett_const .req r1 - .equ const_barrett, 63 movw barrett_const, #:lower16:const_barrett movt barrett_const, #:upper16:const_barrett mov lr, #64 diff --git a/examples/naive/intt_n256_l8_s32_bar.s b/examples/naive/intt_n256_l8_s32_bar.s index 0d29d216..41d2a33e 100644 --- a/examples/naive/intt_n256_l8_s32_bar.s +++ b/examples/naive/intt_n256_l8_s32_bar.s @@ -209,7 +209,6 @@ layer34_loop: // TEMPORARY: Barrett reduction barrett_const .req r1 - .equ const_barrett, 63 movw barrett_const, #:lower16:const_barrett movt barrett_const, #:upper16:const_barrett mov lr, #64 diff --git a/examples/naive/intt_n256_l8_s32_mont.s b/examples/naive/intt_n256_l8_s32_mont.s index 86fdd73a..b54e54a3 100644 --- a/examples/naive/intt_n256_l8_s32_mont.s +++ b/examples/naive/intt_n256_l8_s32_mont.s @@ -819,7 +819,6 @@ layer34_loop: modulus_neg .req r10 neg modulus_neg, modulus barrett_const .req r1 - .equ const_barrett, 63 movw barrett_const, #:lower16:const_barrett movt barrett_const, #:upper16:const_barrett mov lr, #64 diff --git a/examples/opt/intt_n256_l6_s32_bar.s b/examples/opt/intt_n256_l6_s32_bar.s index 88fb2980..ff912b1d 100644 
--- a/examples/opt/intt_n256_l6_s32_bar.s +++ b/examples/opt/intt_n256_l6_s32_bar.s @@ -388,7 +388,6 @@ layer34_loop_end: // TEMPORARY: Barrett reduction barrett_const .req r1 - .equ const_barrett, 63 movw barrett_const, #:lower16:const_barrett movt barrett_const, #:upper16:const_barrett mov lr, #64 diff --git a/examples/opt/intt_n256_l6_s32_mont.s b/examples/opt/intt_n256_l6_s32_mont.s index bbda3ee5..9f06fe0f 100644 --- a/examples/opt/intt_n256_l6_s32_mont.s +++ b/examples/opt/intt_n256_l6_s32_mont.s @@ -518,7 +518,6 @@ layer34_loop_end: modulus_neg .req r10 neg modulus_neg, modulus barrett_const .req r1 - .equ const_barrett, 63 movw barrett_const, #:lower16:const_barrett movt barrett_const, #:upper16:const_barrett mov lr, #64 diff --git a/examples/opt/intt_n256_l8_s32_bar.s b/examples/opt/intt_n256_l8_s32_bar.s index 769d6bf1..008496d0 100644 --- a/examples/opt/intt_n256_l8_s32_bar.s +++ b/examples/opt/intt_n256_l8_s32_bar.s @@ -573,7 +573,6 @@ layer34_loop_end: // TEMPORARY: Barrett reduction barrett_const .req r1 - .equ const_barrett, 63 movw barrett_const, #:lower16:const_barrett movt barrett_const, #:upper16:const_barrett mov lr, #64 diff --git a/examples/opt/intt_n256_l8_s32_mont.s b/examples/opt/intt_n256_l8_s32_mont.s index 315b7a82..f06d5408 100644 --- a/examples/opt/intt_n256_l8_s32_mont.s +++ b/examples/opt/intt_n256_l8_s32_mont.s @@ -1190,7 +1190,6 @@ layer34_loop_end: modulus_neg .req r10 neg modulus_neg, modulus barrett_const .req r1 - .equ const_barrett, 63 movw barrett_const, #:lower16:const_barrett movt barrett_const, #:upper16:const_barrett mov lr, #64 From bc3b4aa447574546cabd4913f748ceb84204f36d Mon Sep 17 00:00:00 2001 From: "Matthias J. 
Kannwischer" Date: Wed, 6 Nov 2024 15:24:46 +0800 Subject: [PATCH 3/3] add example for .equ --- example.py | 19 ++++++ examples/naive/aarch64/aarch64_simple0_equ.s | 28 +++++++++ .../opt/aarch64/aarch64_simple0_equ_opt_a55.s | 62 +++++++++++++++++++ 3 files changed, 109 insertions(+) create mode 100644 examples/naive/aarch64/aarch64_simple0_equ.s create mode 100644 examples/opt/aarch64/aarch64_simple0_equ_opt_a55.s diff --git a/example.py b/example.py index c5d1e06a..aa3ea54b 100644 --- a/example.py +++ b/example.py @@ -547,6 +547,24 @@ def core(self,slothy): slothy.config.constraints.stalls_first_attempt=32 slothy.optimize() +class AArch64Example0Equ(Example): + def __init__(self, var="", arch=AArch64_Neon, target=Target_CortexA55): + name = "aarch64_simple0_equ" + infile = name + + if var != "": + name += f"_{var}" + infile += f"_{var}" + name += f"_{target_label_dict[target]}" + + super().__init__(infile, name, rename=True, arch=arch, target=target) + + def core(self,slothy): + slothy.config.variable_size=True + slothy.config.constraints.stalls_first_attempt=32 + slothy.optimize(start="start", end="end") + + class AArch64Example1(Example): def __init__(self, var="", arch=AArch64_Neon, target=Target_CortexA55): name = "aarch64_simple0_macros" @@ -1370,6 +1388,7 @@ def main(): AArch64Example0(), AArch64Example0(target=Target_CortexA72), + AArch64Example0Equ(), AArch64Example1(), AArch64Example1(target=Target_CortexA72), AArch64Example2(), diff --git a/examples/naive/aarch64/aarch64_simple0_equ.s b/examples/naive/aarch64/aarch64_simple0_equ.s new file mode 100644 index 00000000..3ebb1167 --- /dev/null +++ b/examples/naive/aarch64/aarch64_simple0_equ.s @@ -0,0 +1,28 @@ + .equ dist, 16 + +start: +ldr q0, [x1, #0] +ldr q1, [x2, #0] + +ldr q8, [x0] +ldr q9, [x0, #1*dist] +ldr q10, [x0, #2*dist] +ldr q11, [x0, #3*dist] + +mul v24.8h, v9.8h, v0.h[0] +sqrdmulh v9.8h, v9.8h, v0.h[1] +mls v24.8h, v9.8h, v1.h[0] +sub v9.8h, v8.8h, v24.8h +add v8.8h, v8.8h, v24.8h + +mul 
v24.8h, v11.8h, v0.h[0] +sqrdmulh v11.8h, v11.8h, v0.h[1] +mls v24.8h, v11.8h, v1.h[0] +sub v11.8h, v10.8h, v24.8h +add v10.8h, v10.8h, v24.8h + +str q8, [x0], #4*dist +str q9, [x0, #-3*dist] +str q10, [x0, #-2*dist] +str q11, [x0, #-1*dist] +end: \ No newline at end of file diff --git a/examples/opt/aarch64/aarch64_simple0_equ_opt_a55.s b/examples/opt/aarch64/aarch64_simple0_equ_opt_a55.s new file mode 100644 index 00000000..09381cbb --- /dev/null +++ b/examples/opt/aarch64/aarch64_simple0_equ_opt_a55.s @@ -0,0 +1,62 @@ + .equ dist, 16 + + start: + // Instructions: 20 + // Expected cycles: 28 + // Expected IPC: 0.71 + // + // Cycle bound: 28.0 + // IPC bound: 0.71 + // + // Wall time: 0.25s + // User time: 0.25s + // + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + ldr q0, [x1, #0] // *............................. + ldr q7, [x0, #16] // ..*........................... + ldr q13, [x2, #0] // ....*......................... + ldr q24, [x0, #48] // ......*....................... + mul v30.8H, v7.8H, v0.H[0] // ........*..................... + sqrdmulh v14.8H, v7.8H, v0.H[1] // .........*.................... + sqrdmulh v27.8H, v24.8H, v0.H[1] // ..........*................... + mul v20.8H, v24.8H, v0.H[0] // ...........*.................. + ldr q17, [x0] // ............*................. + mls v30.8H, v14.8H, v13.H[0] // ..............*............... + mls v20.8H, v27.8H, v13.H[0] // ...............*.............. + ldr q13, [x0, #32] // ................*............. + sub v10.8H, v17.8H, v30.8H // ..................*........... + add v27.8H, v17.8H, v30.8H // ...................*.......... + sub v0.8H, v13.8H, v20.8H // ....................*......... + str q10, [x0, #16] // .....................*........ + add v8.8H, v13.8H, v20.8H // ......................*....... + str q0, [x0, #48] // .......................*...... + str q27, [x0], #4*16 // .........................*.... + str q8, [x0, #-32] // ...........................*.. 
+ + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + // ldr q0, [x1, #0] // *.............................. + // ldr q1, [x2, #0] // ....*.......................... + // ldr q8, [x0] // ............*.................. + // ldr q9, [x0, #1*16] // ..*............................ + // ldr q10, [x0, #2*16] // ................*.............. + // ldr q11, [x0, #3*16] // ......*........................ + // mul v24.8h, v9.8h, v0.h[0] // ........*...................... + // sqrdmulh v9.8h, v9.8h, v0.h[1] // .........*..................... + // mls v24.8h, v9.8h, v1.h[0] // ..............*................ + // sub v9.8h, v8.8h, v24.8h // ..................*............ + // add v8.8h, v8.8h, v24.8h // ...................*........... + // mul v24.8h, v11.8h, v0.h[0] // ...........*................... + // sqrdmulh v11.8h, v11.8h, v0.h[1] // ..........*.................... + // mls v24.8h, v11.8h, v1.h[0] // ...............*............... + // sub v11.8h, v10.8h, v24.8h // ....................*.......... + // add v10.8h, v10.8h, v24.8h // ......................*........ + // str q8, [x0], #4*16 // .........................*..... + // str q9, [x0, #-3*16] // .....................*......... + // str q10, [x0, #-2*16] // ...........................*... + // str q11, [x0, #-1*16] // .......................*....... + + end: