diff --git a/example.py b/example.py index dfb47002..e818f423 100644 --- a/example.py +++ b/example.py @@ -748,11 +748,76 @@ def __init__(self, var="", arch=AArch64_Neon, target=Target_CortexA55): infile += f"_{var}" name += f"_{target_label_dict[target]}" + super().__init__(infile, name, rename=True, arch=arch, target=target) + def core(self,slothy): + slothy.config.allow_useless_instructions = True + slothy.fusion_region("start", "end", ssa=False) + +class Armv7mExample0(Example): + def __init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7): + name = "armv7m_simple0" + infile = name + + if var != "": + name += f"_{var}" + infile += f"_{var}" + name += f"_{target_label_dict[target]}" + super().__init__(infile, name, rename=True, arch=arch, target=target) def core(self,slothy): slothy.config.allow_useless_instructions = True slothy.fusion_region("start", "end", ssa=False) + +class Armv7mLoopSubs(Example): + def __init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7): + name = "loop_subs" + infile = name + + if var != "": + name += f"_{var}" + infile += f"_{var}" + name += f"_{target_label_dict[target]}" + + super().__init__(infile, name, rename=True, arch=arch, target=target) + + def core(self,slothy): + slothy.config.variable_size=True + slothy.optimize_loop("start") + +class Armv7mLoopCmp(Example): + def __init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7): + name = "loop_cmp" + infile = name + + if var != "": + name += f"_{var}" + infile += f"_{var}" + name += f"_{target_label_dict[target]}" + + super().__init__(infile, name, rename=True, arch=arch, target=target) + + def core(self,slothy): + slothy.config.variable_size=True + slothy.config.outputs = ["r6"] + slothy.optimize_loop("start") + +class Armv7mLoopVmovCmp(Example): + def __init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7): + name = "loop_vmov_cmp" + infile = name + + if var != "": + name += f"_{var}" + infile += f"_{var}" + name += 
f"_{target_label_dict[target]}" + + super().__init__(infile, name, rename=True, arch=arch, target=target) + + def core(self,slothy): + slothy.config.variable_size=True + slothy.config.outputs = ["r6"] + slothy.optimize_loop("start") class ntt_kyber_123_4567(Example): def __init__(self, var="", arch=AArch64_Neon, target=Target_CortexA55, timeout=None): @@ -2690,12 +2755,15 @@ def main(): AArch64Example2(target=Target_CortexA72), AArch64Split0(), + # Armv7m examples + Armv7mExample0(), # Loop examples AArch64LoopSubs(), LoopLe(), Armv7mLoopSubs(), Armv7mLoopCmp(), + Armv7mLoopVmovCmp(), CRT(), diff --git a/examples/naive/armv7m/armv7m_simple0.s b/examples/naive/armv7m/armv7m_simple0.s new file mode 100644 index 00000000..1b3e77c4 --- /dev/null +++ b/examples/naive/armv7m/armv7m_simple0.s @@ -0,0 +1,9 @@ + +start: +ldr r1, [r0, #4] +add r1, r2, r1 +eor.w r1, r1, r3 +smlabt r3, r2, r2, r1 +asrs r3, r3, #1 +str r3, [r0, #4] +end: \ No newline at end of file diff --git a/examples/opt/armv7m/armv7m_simple0_opt_m7.s b/examples/opt/armv7m/armv7m_simple0_opt_m7.s new file mode 100644 index 00000000..98abb396 --- /dev/null +++ b/examples/opt/armv7m/armv7m_simple0_opt_m7.s @@ -0,0 +1,33 @@ + + start: + // Instructions: 6 + // Expected cycles: 5 + // Expected IPC: 1.20 + // + // Cycle bound: 5.0 + // IPC bound: 1.20 + // + // Wall time: 0.02s + // User time: 0.02s + // + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + ldr r6, [r0, #4] // *............................. + add r6, r2, r6 // .*............................ + eor.w r3, r6, r3 // ..*........................... + smlabt r12, r2, r2, r3 // ..*........................... + asrs r3, r12, #1 // ....*......................... + str r3, [r0, #4] // ....*......................... + + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + // ldr r1, [r0, #4] // *.............................. + // add r1, r2, r1 // .*............................. 
+ // eor.w r1, r1, r3 // ..*............................ + // smlabt r3, r2, r2, r1 // ..*............................ + // asrs r3, r3, #1 // ....*.......................... + // str r3, [r0, #4] // ....*.......................... + + end: diff --git a/examples/opt/armv7m/loop_cmp_opt_m7.s b/examples/opt/armv7m/loop_cmp_opt_m7.s new file mode 100644 index 00000000..4524a7e2 --- /dev/null +++ b/examples/opt/armv7m/loop_cmp_opt_m7.s @@ -0,0 +1,29 @@ +/* For example, r5 represents an address where we will stop iterating and r6 is +the actual pointer which is incremented inside the loop. */ + +mov.w r6, #0 +add.w r5, r6, #64 + +1: + // Instructions: 1 + // Expected cycles: 1 + // Expected IPC: 1.00 + // + // Cycle bound: 1.0 + // IPC bound: 1.00 + // + // Wall time: 0.02s + // User time: 0.02s + // + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + add r6, r6, #4 // *............................. + + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + // add r6, r6, #4 // *.............................. + + cmp r6, r5 + bne 1b \ No newline at end of file diff --git a/examples/opt/armv7m/loop_subs_opt_m7.s b/examples/opt/armv7m/loop_subs_opt_m7.s new file mode 100644 index 00000000..f1bcc451 --- /dev/null +++ b/examples/opt/armv7m/loop_subs_opt_m7.s @@ -0,0 +1,11 @@ +movw r5, #16 +start: + // Instructions: 0 + // Expected cycles: 0 + // Expected IPC: 0.00 + // + // Wall time: 0.00s + // User time: 0.00s + // + subs r5, #1 + bne start \ No newline at end of file diff --git a/examples/opt/armv7m/loop_vmov_cmp_opt_m7.s b/examples/opt/armv7m/loop_vmov_cmp_opt_m7.s new file mode 100644 index 00000000..c75f8b9b --- /dev/null +++ b/examples/opt/armv7m/loop_vmov_cmp_opt_m7.s @@ -0,0 +1,31 @@ +/* For example, r5 represents an address where we will stop iterating and r6 is +the actual pointer which is incremented inside the loop. 
*/ + +mov.w r6, #0 +add.w r5, r6, #64 +vmov s0, r5 + +start: + // Instructions: 1 + // Expected cycles: 1 + // Expected IPC: 1.00 + // + // Cycle bound: 1.0 + // IPC bound: 1.00 + // + // Wall time: 0.02s + // User time: 0.02s + // + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + add r6, r6, #4 // *............................. + + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + // add r6, r6, #4 // *.............................. + + vmov r5, s0 + cmp r6, r5 + bne start \ No newline at end of file diff --git a/slothy/core/config.py b/slothy/core/config.py index 12b036e5..1057f09c 100644 --- a/slothy/core/config.py +++ b/slothy/core/config.py @@ -172,6 +172,12 @@ def unsafe_address_offset_fixup(self): str instructions with increment reordered with instructions depending on the address register). + By default, this is enabled for backwards compatibility. + + LIMITATION: For historical reasons, this feature cannot be disabled for + the Armv8.1-M architecture model. A refactoring of that model is needed + to make address offset fixup configurable. + Note: The user-imposed safety constraint is not a necessity -- in principle, SLOTHY could detect when it is safe to reorder ldr/str instructions with increment. It just hasn't been implemented yet. 
@@ -1291,6 +1297,8 @@ def allow_useless_instructions(self,val): self._allow_useless_instructions = val @unsafe_address_offset_fixup.setter def unsafe_address_offset_fixup(self,val): + if val is False and self.arch.arch_name == "Arm_v81M": + raise InvalidConfig("unsafe address offset fixup must be set for Armv8.1-M") self._unsafe_address_offset_fixup = val @locked_registers.setter def locked_registers(self,val): diff --git a/slothy/core/core.py b/slothy/core/core.py index ebeeed41..13d51b79 100644 --- a/slothy/core/core.py +++ b/slothy/core/core.py @@ -1483,8 +1483,6 @@ def optimize(self, source, prefix_len=0, suffix_len=0, log_model=None, retry=Fal self.result.success = self._solve() self.result.valid = True - # - Export (optional) - self._export_model() if not retry and self.success: self.logger.info("Booleans in result: %d", self._model.cp_solver.NumBooleans()) @@ -3450,6 +3448,9 @@ def is_good_enough( cur, bound ): ok = self._model.cp_model.status in [cp_model.FEASIBLE, cp_model.OPTIMAL] + # - Export (optional) + self._export_model() + if ok: # Remember solution in case we want to retry with an(other) objective self._model.cp_model.ClearHints() diff --git a/slothy/core/slothy.py b/slothy/core/slothy.py index ce710f94..69acd90a 100644 --- a/slothy/core/slothy.py +++ b/slothy/core/slothy.py @@ -385,7 +385,9 @@ def fusion_loop(self, loop_lbl, **kwargs): assert SourceLine.is_source(self.source) def optimize_loop(self, loop_lbl, postamble_label=None): - """Optimize the loop starting at a given label""" + """Optimize the loop starting at a given label + The postamble_label marks the end of the loop kernel. 
+ """ logger = self.logger.getChild(loop_lbl) diff --git a/slothy/helper.py b/slothy/helper.py index 3ae380ce..e0ae835c 100644 --- a/slothy/helper.py +++ b/slothy/helper.py @@ -1222,11 +1222,11 @@ def _extract(self, source, lbl): pre = [] body = [] post = [] + # candidate lines for the end of the loop loop_end_candidates = [] loop_lbl_regexp_txt = self.lbl_regex loop_lbl_regexp = re.compile(loop_lbl_regexp_txt) - # TODO: Allow other forms of looping # end_regex shall contain group cnt as the counter variable loop_end_regexp_txt = self.end_regex loop_end_regexp = [re.compile(txt) for txt in loop_end_regexp_txt] @@ -1255,6 +1255,7 @@ def _extract(self, source, lbl): if state == 1: p = loop_end_regexp[loop_end_ctr].match(l_str) if p is not None: + # Case: We may have encountered part of the loop end # collect all named groups self.additional_data = self.additional_data | p.groupdict() loop_end_ctr += 1 @@ -1263,6 +1264,11 @@ def _extract(self, source, lbl): state = 2 continue elif loop_end_ctr > 0 and l_str != "": + # Case: The sequence of loop end candidates was interrupted + # i.e., we found a false-positive or this is not a proper loop + + # The loop end candidates are not part of the loop, meaning + # they belonged to the body body += loop_end_candidates self.additional_data = {} loop_end_ctr = 0 diff --git a/slothy/targets/aarch64/aarch64_neon.py b/slothy/targets/aarch64/aarch64_neon.py index 842e41e6..725a3a18 100644 --- a/slothy/targets/aarch64/aarch64_neon.py +++ b/slothy/targets/aarch64/aarch64_neon.py @@ -50,6 +50,7 @@ class which generates instruction parsers and writers from instruction templates from slothy.targets.common import * from slothy.helper import Loop +arch_name = "Arm_AArch64" llvm_mca_arch = "aarch64" class RegisterType(Enum): @@ -181,7 +182,7 @@ class SubsLoop(Loop): ``` loop_lbl: {code} - sub[s] , , #1 + sub[s] , , # (cbnz|bnz|bne) , loop_lbl ``` where cnt is the loop counter in lr. 
@@ -191,7 +192,7 @@ def __init__(self, lbl="lbl", lbl_start="1", lbl_end="2", loop_init="lr") -> Non # The group naming in the regex should be consistent; give same group # names to the same registers self.lbl_regex = r"^\s*(?P