From 4e374ad97c62ba57f69ac83b000252be8504a1e5 Mon Sep 17 00:00:00 2001
From: "Matthias J. Kannwischer"
Date: Wed, 13 Nov 2024 20:44:42 +0800
Subject: [PATCH] Add AArch64 example for assembly conditional

---
Reviewer notes (this text sits between the "---" separator and the
diffstat; it is commentary only and not part of the change itself):

* example.py: adds the AArch64IfElse example class. The constructor starts
  from the name "aarch64_ifelse", appends "_<var>" to both the input and
  output names when var is non-empty, and appends the target label to the
  output name. core() only calls slothy.optimize(), with no additional
  configuration. One instance with default arguments is added inside
  main(), after the AArch64Example2 entries.
* examples/naive/aarch64/aarch64_ifelse.s: new input containing two nested
  assembler conditionals (".if 5 != 0" and ".if 5 > 2"), each with an
  ".else" arm.
* examples/opt/aarch64/aarch64_ifelse_opt_a55.s: new reference output. It
  reports 20 instructions; the quoted original at the bottom of that file
  contains no .if/.else/.endif lines and neither of the two instructions
  from the ".else" arms.
* NOTE(review): "add r0, r1" in the outer ".else" arm does not use the
  x/v/q register names seen in the rest of the input. It is in the arm
  that is not selected while "5 != 0" holds - confirm it is an intentional
  placeholder.
* NOTE(review): the naive input is committed without a trailing newline
  (see the "\ No newline at end of file" marker) - confirm intended.
* NOTE(review): line breaks, indentation and column alignment below were
  reconstructed from a whitespace-collapsed copy of this patch; verify
  them against the original commit before applying.

 example.py                                    | 16 ++++++
 examples/naive/aarch64/aarch64_ifelse.s       | 30 +++++++++++
 examples/opt/aarch64/aarch64_ifelse_opt_a55.s | 54 +++++++++++++++++++
 3 files changed, 100 insertions(+)
 create mode 100644 examples/naive/aarch64/aarch64_ifelse.s
 create mode 100644 examples/opt/aarch64/aarch64_ifelse_opt_a55.s

diff --git a/example.py b/example.py
index 9f2a360e..b4527aaf 100644
--- a/example.py
+++ b/example.py
@@ -706,6 +706,21 @@ def core(self,slothy):
         slothy.config.outputs = ["r6"]
         slothy.optimize_loop("start")
 
+class AArch64IfElse(Example):
+    def __init__(self, var="", arch=AArch64_Neon, target=Target_CortexA55):
+        name = "aarch64_ifelse"
+        infile = name
+
+        if var != "":
+            name += f"_{var}"
+            infile += f"_{var}"
+        name += f"_{target_label_dict[target]}"
+
+        super().__init__(infile, name, rename=True, arch=arch, target=target)
+
+    def core(self,slothy):
+        slothy.optimize()
+
 class ntt_kyber_123_4567(Example):
     def __init__(self, var="", arch=AArch64_Neon, target=Target_CortexA55, timeout=None):
         name = "ntt_kyber_123_4567"
@@ -1494,6 +1509,7 @@ def main():
                  AArch64Example1(target=Target_CortexA72),
                  AArch64Example2(),
                  AArch64Example2(target=Target_CortexA72),
+                 AArch64IfElse(),
 
                  # Armv7m examples
                  Armv7mExample0(),
diff --git a/examples/naive/aarch64/aarch64_ifelse.s b/examples/naive/aarch64/aarch64_ifelse.s
new file mode 100644
index 00000000..7c21475e
--- /dev/null
+++ b/examples/naive/aarch64/aarch64_ifelse.s
@@ -0,0 +1,30 @@
+ldr q0, [x1, #0]
+ldr q1, [x2, #0]
+
+ldr q8, [x0]
+ldr q9, [x0, #1*16]
+ldr q10, [x0, #2*16]
+ldr q11, [x0, #3*16]
+.if 5 != 0
+    mul v24.8h, v9.8h, v0.h[0]
+    sqrdmulh v9.8h, v9.8h, v0.h[1]
+    mls v24.8h, v9.8h, v1.h[0]
+    sub v9.8h, v8.8h, v24.8h
+    add v8.8h, v8.8h, v24.8h
+
+    .if 5 > 2
+        mul v24.8h, v11.8h, v0.h[0]
+        sqrdmulh v11.8h, v11.8h, v0.h[1]
+        mls v24.8h, v11.8h, v1.h[0]
+        sub v11.8h, v10.8h, v24.8h
+        add v10.8h, v10.8h, v24.8h
+    .else
+        add v10.8h, v10.8h, v11.8h
+    .endif
+.else
+    add r0, r1
+.endif
+str q8, [x0], #4*16
+str q9, [x0, #-3*16]
+str q10, [x0, #-2*16]
+str q11, [x0, #-1*16]
\ No newline at end of file
diff --git a/examples/opt/aarch64/aarch64_ifelse_opt_a55.s b/examples/opt/aarch64/aarch64_ifelse_opt_a55.s
new file mode 100644
index 00000000..ffb154cf
--- /dev/null
+++ b/examples/opt/aarch64/aarch64_ifelse_opt_a55.s
@@ -0,0 +1,54 @@
+        // Instructions:    20
+        // Expected cycles: 28
+        // Expected IPC:    0.71
+        //
+        // Wall time:     0.25s
+        // User time:     0.25s
+        //
+        // ----- cycle (expected) ------>
+        // 0                        25
+        // |------------------------|----
+        ldr q2, [x0, #48]                     // *.............................
+        ldr q4, [x1, #0]                      // ..*...........................
+        ldr q13, [x0, #16]                    // ....*.........................
+        mul v27.8H, v2.8H, v4.H[0]            // ......*.......................
+        sqrdmulh v6.8H, v2.8H, v4.H[1]        // .......*......................
+        ldr q3, [x2, #0]                      // ........*.....................
+        mul v24.8H, v13.8H, v4.H[0]           // ..........*...................
+        ldr q28, [x0, #32]                    // ...........*..................
+        mls v27.8H, v6.8H, v3.H[0]            // .............*................
+        sqrdmulh v14.8H, v13.8H, v4.H[1]      // ..............*...............
+        ldr q2, [x0]                          // ...............*..............
+        sub v18.8H, v28.8H, v27.8H            // .................*............
+        mls v24.8H, v14.8H, v3.H[0]           // ..................*...........
+        add v9.8H, v28.8H, v27.8H             // ....................*.........
+        str q18, [x0, #48]                    // .....................*........
+        add v12.8H, v2.8H, v24.8H             // ......................*.......
+        str q9, [x0, #32]                     // .......................*......
+        sub v3.8H, v2.8H, v24.8H              // ........................*.....
+        str q12, [x0], #4*16                  // .........................*....
+        str q3, [x0, #-48]                    // ...........................*..
+
+        // ------ cycle (expected) ------>
+        // 0                        25
+        // |------------------------|-----
+        // ldr q0, [x1, #0]                    // ..*............................
+        // ldr q1, [x2, #0]                    // ........*......................
+        // ldr q8, [x0]                        // ...............*...............
+        // ldr q9, [x0, #1*16]                 // ....*..........................
+        // ldr q10, [x0, #2*16]                // ...........*...................
+        // ldr q11, [x0, #3*16]                // *..............................
+        // mul v24.8h, v9.8h, v0.h[0]          // ..........*....................
+        // sqrdmulh v9.8h, v9.8h, v0.h[1]      // ..............*................
+        // mls v24.8h, v9.8h, v1.h[0]          // ..................*............
+        // sub v9.8h, v8.8h, v24.8h            // ........................*......
+        // add v8.8h, v8.8h, v24.8h            // ......................*........
+        // mul v24.8h, v11.8h, v0.h[0]         // ......*........................
+        // sqrdmulh v11.8h, v11.8h, v0.h[1]    // .......*.......................
+        // mls v24.8h, v11.8h, v1.h[0]         // .............*.................
+        // sub v11.8h, v10.8h, v24.8h          // .................*.............
+        // add v10.8h, v10.8h, v24.8h          // ....................*..........
+        // str q8, [x0], #4*16                 // .........................*.....
+        // str q9, [x0, #-3*16]                // ...........................*...
+        // str q10, [x0, #-2*16]               // .......................*.......
+        // str q11, [x0, #-1*16]               // .....................*.........