NOTE(review): line breaks and indentation of this patch were reconstructed from a
whitespace-collapsed copy; verify context whitespace against the tree before applying.
NOTE(review): the `index` blob ids below were recorded for the original patch contents
and have not been recomputed after the review fixes.

diff --git a/example.py b/example.py
index 5dbceef4..0fefa758 100644
--- a/example.py
+++ b/example.py
@@ -819,6 +819,27 @@ def core(self,slothy):
         slothy.config.outputs = ["r6"]
         slothy.optimize_loop("start")
 
+class AArch64IfElse(Example):
+    """Example whose input (aarch64_ifelse.s) contains nested .if/.else/.endif blocks.
+
+    A non-empty `var` is appended to both `infile` and `name`; `name`
+    additionally gets the `target_label_dict[target]` suffix.
+    """
+    def __init__(self, var="", arch=AArch64_Neon, target=Target_CortexA55):
+        name = "aarch64_ifelse"
+        infile = name
+
+        if var != "":
+            name += f"_{var}"
+            infile += f"_{var}"
+        name += f"_{target_label_dict[target]}"
+
+        super().__init__(infile, name, rename=True, arch=arch, target=target)
+
+    def core(self,slothy):
+        """Run `slothy.optimize()` on the input; no loop label is passed."""
+        slothy.optimize()
+
 class ntt_kyber_123_4567(Example):
     def __init__(self, var="", arch=AArch64_Neon, target=Target_CortexA55, timeout=None):
         name = "ntt_kyber_123_4567"
@@ -2765,6 +2786,7 @@ def main():
                  AArch64Example1(target=Target_CortexA72),
                  AArch64Example2(),
                  AArch64Example2(target=Target_CortexA72),
+                 AArch64IfElse(),
                  AArch64Split0(),
 
                  # Armv7m examples
diff --git a/examples/naive/aarch64/aarch64_ifelse.s b/examples/naive/aarch64/aarch64_ifelse.s
new file mode 100644
index 00000000..f621859f
--- /dev/null
+++ b/examples/naive/aarch64/aarch64_ifelse.s
@@ -0,0 +1,30 @@
+ldr q0, [x1, #0]
+ldr q1, [x2, #0]
+
+ldr q8, [x0]
+ldr q9, [x0, #1*16]
+ldr q10, [x0, #2*16]
+ldr q11, [x0, #3*16]
+.if 5 != 0
+    mul v24.8h, v9.8h, v0.h[0]
+    sqrdmulh v9.8h, v9.8h, v0.h[1]
+    mls v24.8h, v9.8h, v1.h[0]
+    sub v9.8h, v8.8h, v24.8h
+    add v8.8h, v8.8h, v24.8h
+
+    .if 5 > 2
+        mul v24.8h, v11.8h, v0.h[0]
+        sqrdmulh v11.8h, v11.8h, v0.h[1]
+        mls v24.8h, v11.8h, v1.h[0]
+        sub v11.8h, v10.8h, v24.8h
+        add v10.8h, v10.8h, v24.8h
+    .else
+        add v10.8h, v10.8h, v11.8h
+    .endif
+.else
+    add x0, x0, #4
+.endif
+str q8, [x0], #4*16
+str q9, [x0, #-3*16]
+str q10, [x0, #-2*16]
+str q11, [x0, #-1*16]
diff --git a/examples/opt/aarch64/aarch64_ifelse_opt_a55.s b/examples/opt/aarch64/aarch64_ifelse_opt_a55.s
new file mode 100644
index 00000000..ffb154cf
--- /dev/null
+++ b/examples/opt/aarch64/aarch64_ifelse_opt_a55.s
@@ -0,0 +1,54 @@
+        // Instructions: 20
+        // Expected cycles: 28
+        // Expected IPC: 0.71
+        //
+        // Wall time: 0.25s
+        // User time: 0.25s
+        //
+        // ----- cycle (expected) ------>
+        // 0                        25
+        // |------------------------|----
+        ldr q2, [x0, #48]                 // *.............................
+        ldr q4, [x1, #0]                  // ..*...........................
+        ldr q13, [x0, #16]                // ....*.........................
+        mul v27.8H, v2.8H, v4.H[0]        // ......*.......................
+        sqrdmulh v6.8H, v2.8H, v4.H[1]    // .......*......................
+        ldr q3, [x2, #0]                  // ........*.....................
+        mul v24.8H, v13.8H, v4.H[0]       // ..........*...................
+        ldr q28, [x0, #32]                // ...........*..................
+        mls v27.8H, v6.8H, v3.H[0]        // .............*................
+        sqrdmulh v14.8H, v13.8H, v4.H[1]  // ..............*...............
+        ldr q2, [x0]                      // ...............*..............
+        sub v18.8H, v28.8H, v27.8H        // .................*............
+        mls v24.8H, v14.8H, v3.H[0]       // ..................*...........
+        add v9.8H, v28.8H, v27.8H         // ....................*.........
+        str q18, [x0, #48]                // .....................*........
+        add v12.8H, v2.8H, v24.8H         // ......................*.......
+        str q9, [x0, #32]                 // .......................*......
+        sub v3.8H, v2.8H, v24.8H          // ........................*.....
+        str q12, [x0], #4*16              // .........................*....
+        str q3, [x0, #-48]                // ...........................*..
+
+        // ------ cycle (expected) ------>
+        // 0                        25
+        // |------------------------|-----
+        // ldr q0, [x1, #0]                  // ..*............................
+        // ldr q1, [x2, #0]                  // ........*......................
+        // ldr q8, [x0]                      // ...............*...............
+        // ldr q9, [x0, #1*16]               // ....*..........................
+        // ldr q10, [x0, #2*16]              // ...........*...................
+        // ldr q11, [x0, #3*16]              // *..............................
+        // mul v24.8h, v9.8h, v0.h[0]        // ..........*....................
+        // sqrdmulh v9.8h, v9.8h, v0.h[1]    // ..............*................
+        // mls v24.8h, v9.8h, v1.h[0]        // ..................*............
+        // sub v9.8h, v8.8h, v24.8h          // ........................*......
+        // add v8.8h, v8.8h, v24.8h          // ......................*........
+        // mul v24.8h, v11.8h, v0.h[0]       // ......*........................
+        // sqrdmulh v11.8h, v11.8h, v0.h[1]  // .......*.......................
+        // mls v24.8h, v11.8h, v1.h[0]       // .............*.................
+        // sub v11.8h, v10.8h, v24.8h        // .................*.............
+        // add v10.8h, v10.8h, v24.8h        // ....................*..........
+        // str q8, [x0], #4*16               // .........................*.....
+        // str q9, [x0, #-3*16]              // ...........................*...
+        // str q10, [x0, #-2*16]             // .......................*.......
+        // str q11, [x0, #-1*16]             // .....................*.........
diff --git a/slothy/helper.py b/slothy/helper.py
index e0ae835c..d4b583c0 100644
--- a/slothy/helper.py
+++ b/slothy/helper.py
@@ -30,9 +30,10 @@
 import logging
 from sympy import simplify
 from abc import ABC, abstractmethod
 
 from slothy.targets.common import *
 
+
 class SourceLine:
     """Representation of a single line of source code"""