diff --git a/example.py b/example.py
index dfb47002..e818f423 100644
--- a/example.py
+++ b/example.py
@@ -748,11 +748,76 @@ def __init__(self, var="", arch=AArch64_Neon, target=Target_CortexA55):
             infile += f"_{var}"
         name += f"_{target_label_dict[target]}"
 
+        super().__init__(infile, name, rename=True, arch=arch, target=target)
+    def core(self,slothy):
+        slothy.config.allow_useless_instructions = True
+        slothy.fusion_region("start", "end", ssa=False)
+
+class Armv7mExample0(Example):
+    def __init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7):
+        name = "armv7m_simple0"
+        infile = name
+
+        if var != "":
+            name += f"_{var}"
+            infile += f"_{var}"
+        name += f"_{target_label_dict[target]}"
+
         super().__init__(infile, name, rename=True, arch=arch, target=target)
 
     def core(self,slothy):
         slothy.config.allow_useless_instructions = True
         slothy.fusion_region("start", "end", ssa=False)
+        
+class Armv7mLoopSubs(Example):
+    """Armv7-M loop example `loop_subs`: loop closed by `subs` and `bne`."""
+
+    def __init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7):
+        # An optional variant suffix applies to both infile and name;
+        # the target label is appended to name only.
+        suffix = f"_{var}" if var != "" else ""
+        infile = f"loop_subs{suffix}"
+        name = f"{infile}_{target_label_dict[target]}"
+        super().__init__(infile, name, rename=True, arch=arch, target=target)
+
+    def core(self, slothy):
+        """Optimize the loop at label `start` with `variable_size` enabled."""
+        slothy.config.variable_size = True
+        slothy.optimize_loop("start")
+
+class Armv7mLoopCmp(Example):
+    """Armv7-M loop example `loop_cmp`: loop closed by `cmp` and `bne`."""
+
+    def __init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7):
+        # An optional variant suffix applies to both infile and name;
+        # the target label is appended to name only.
+        suffix = f"_{var}" if var != "" else ""
+        infile = f"loop_cmp{suffix}"
+        name = f"{infile}_{target_label_dict[target]}"
+        super().__init__(infile, name, rename=True, arch=arch, target=target)
+
+    def core(self, slothy):
+        """Optimize the loop at label `start`, declaring r6 as an output."""
+        slothy.config.variable_size = True
+        slothy.config.outputs = ["r6"]
+        slothy.optimize_loop("start")
+        
+class Armv7mLoopVmovCmp(Example):
+    """Armv7-M loop example `loop_vmov_cmp`: ends in vmov, cmp and bne."""
+
+    def __init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7):
+        # An optional variant suffix applies to both infile and name;
+        # the target label is appended to name only.
+        suffix = f"_{var}" if var != "" else ""
+        infile = f"loop_vmov_cmp{suffix}"
+        name = f"{infile}_{target_label_dict[target]}"
+        super().__init__(infile, name, rename=True, arch=arch, target=target)
+
+    def core(self, slothy):
+        """Optimize the loop at label `start`, declaring r6 as an output."""
+        slothy.config.variable_size = True
+        slothy.config.outputs = ["r6"]
+        slothy.optimize_loop("start")
 
 class ntt_kyber_123_4567(Example):
     def __init__(self, var="", arch=AArch64_Neon, target=Target_CortexA55, timeout=None):
@@ -2690,12 +2755,15 @@ def main():
                  AArch64Example2(target=Target_CortexA72),
 
                  AArch64Split0(),
+                # Armv7m examples
+                 Armv7mExample0(),
 
                 # Loop examples
                  AArch64LoopSubs(),
                  LoopLe(),
                  Armv7mLoopSubs(),
                  Armv7mLoopCmp(),
+                 Armv7mLoopVmovCmp(),
 
                  CRT(),
 
diff --git a/examples/naive/armv7m/armv7m_simple0.s b/examples/naive/armv7m/armv7m_simple0.s
new file mode 100644
index 00000000..1b3e77c4
--- /dev/null
+++ b/examples/naive/armv7m/armv7m_simple0.s
@@ -0,0 +1,9 @@
+
+start:
+ldr r1, [r0, #4]
+add r1, r2, r1
+eor.w r1, r1, r3
+smlabt r3, r2, r2, r1
+asrs r3, r3, #1
+str r3, [r0, #4]
+end:
\ No newline at end of file
diff --git a/examples/opt/armv7m/armv7m_simple0_opt_m7.s b/examples/opt/armv7m/armv7m_simple0_opt_m7.s
new file mode 100644
index 00000000..98abb396
--- /dev/null
+++ b/examples/opt/armv7m/armv7m_simple0_opt_m7.s
@@ -0,0 +1,33 @@
+
+        start:
+                                      // Instructions:    6
+                                      // Expected cycles: 5
+                                      // Expected IPC:    1.20
+                                      //
+                                      // Cycle bound:     5.0
+                                      // IPC bound:       1.20
+                                      //
+                                      // Wall time:     0.02s
+                                      // User time:     0.02s
+                                      //
+                                      // ----- cycle (expected) ------>
+                                      // 0                        25
+                                      // |------------------------|----
+        ldr r6, [r0, #4]              // *.............................
+        add r6, r2, r6                // .*............................
+        eor.w r3, r6, r3              // ..*...........................
+        smlabt r12, r2, r2, r3        // ..*...........................
+        asrs r3, r12, #1              // ....*.........................
+        str r3, [r0, #4]              // ....*.........................
+
+                                      // ------ cycle (expected) ------>
+                                      // 0                        25
+                                      // |------------------------|-----
+        // ldr r1, [r0, #4]           // *..............................
+        // add r1, r2, r1             // .*.............................
+        // eor.w r1, r1, r3           // ..*............................
+        // smlabt r3, r2, r2, r1      // ..*............................
+        // asrs r3, r3, #1            // ....*..........................
+        // str r3, [r0, #4]           // ....*..........................
+
+        end:
diff --git a/examples/opt/armv7m/loop_cmp_opt_m7.s b/examples/opt/armv7m/loop_cmp_opt_m7.s
new file mode 100644
index 00000000..4524a7e2
--- /dev/null
+++ b/examples/opt/armv7m/loop_cmp_opt_m7.s
@@ -0,0 +1,29 @@
+/* For example, r5 represents an address where we will stop iterating and r6 is
+the actual pointer which is incremented inside the loop. */
+
+mov.w r6, #0
+add.w r5, r6, #64
+
+1:
+                              // Instructions:    1
+                              // Expected cycles: 1
+                              // Expected IPC:    1.00
+                              //
+                              // Cycle bound:     1.0
+                              // IPC bound:       1.00
+                              //
+                              // Wall time:     0.02s
+                              // User time:     0.02s
+                              //
+                              // ----- cycle (expected) ------>
+                              // 0                        25
+                              // |------------------------|----
+        add r6, r6, #4        // *.............................
+
+                               // ------ cycle (expected) ------>
+                               // 0                        25
+                               // |------------------------|-----
+        // add r6, r6, #4      // *..............................
+
+        cmp r6, r5
+        bne 1b
\ No newline at end of file
diff --git a/examples/opt/armv7m/loop_subs_opt_m7.s b/examples/opt/armv7m/loop_subs_opt_m7.s
new file mode 100644
index 00000000..f1bcc451
--- /dev/null
+++ b/examples/opt/armv7m/loop_subs_opt_m7.s
@@ -0,0 +1,11 @@
+movw r5, #16
+start:
+                // Instructions:    0
+                // Expected cycles: 0
+                // Expected IPC:    0.00
+                //
+                // Wall time:     0.00s
+                // User time:     0.00s
+                //
+        subs r5, #1
+        bne start
\ No newline at end of file
diff --git a/examples/opt/armv7m/loop_vmov_cmp_opt_m7.s b/examples/opt/armv7m/loop_vmov_cmp_opt_m7.s
new file mode 100644
index 00000000..c75f8b9b
--- /dev/null
+++ b/examples/opt/armv7m/loop_vmov_cmp_opt_m7.s
@@ -0,0 +1,31 @@
+/* For example, r5 represents an address where we will stop iterating and r6 is
+the actual pointer which is incremented inside the loop. */
+
+mov.w r6, #0
+add.w r5, r6, #64
+vmov s0, r5
+
+start:
+                              // Instructions:    1
+                              // Expected cycles: 1
+                              // Expected IPC:    1.00
+                              //
+                              // Cycle bound:     1.0
+                              // IPC bound:       1.00
+                              //
+                              // Wall time:     0.02s
+                              // User time:     0.02s
+                              //
+                              // ----- cycle (expected) ------>
+                              // 0                        25
+                              // |------------------------|----
+        add r6, r6, #4        // *.............................
+
+                               // ------ cycle (expected) ------>
+                               // 0                        25
+                               // |------------------------|-----
+        // add r6, r6, #4      // *..............................
+
+        vmov r5, s0
+        cmp r6, r5
+        bne start
\ No newline at end of file
diff --git a/slothy/core/config.py b/slothy/core/config.py
index 12b036e5..1057f09c 100644
--- a/slothy/core/config.py
+++ b/slothy/core/config.py
@@ -172,6 +172,12 @@ def unsafe_address_offset_fixup(self):
         str instructions with increment reordered with instructions depending
         on the address register).
 
+        By default, this is enabled for backwards compatibility.
+
+        LIMITATION: For historical reasons, this feature cannot be disabled for
+        the Armv8.1-M architecture model. A refactoring of that model is needed
+        to make address offset fixup configurable.
+
         Note: The user-imposed safety constraint is not a necessity -- in principle,
         SLOTHY could detect when it is safe to reorder ldr/str instructions with increment.
         It just hasn't been implemented yet.
@@ -1291,6 +1297,8 @@ def allow_useless_instructions(self,val):
         self._allow_useless_instructions = val
     @unsafe_address_offset_fixup.setter
     def unsafe_address_offset_fixup(self,val):
+        if val is False and self.arch.arch_name == "Arm_v81M":
+            raise InvalidConfig("unsafe address offset fixup must be set for Armv8.1-M")
         self._unsafe_address_offset_fixup = val
     @locked_registers.setter
     def locked_registers(self,val):
diff --git a/slothy/core/core.py b/slothy/core/core.py
index ebeeed41..13d51b79 100644
--- a/slothy/core/core.py
+++ b/slothy/core/core.py
@@ -1483,8 +1483,6 @@ def optimize(self, source, prefix_len=0, suffix_len=0, log_model=None, retry=Fal
         self.result.success = self._solve()
         self.result.valid = True
 
-        # - Export (optional)
-        self._export_model()
 
         if not retry and self.success:
             self.logger.info("Booleans in result: %d", self._model.cp_solver.NumBooleans())
@@ -3450,6 +3448,9 @@ def is_good_enough( cur, bound ):
 
         ok = self._model.cp_model.status in [cp_model.FEASIBLE, cp_model.OPTIMAL]
 
+        # - Export (optional)
+        self._export_model()
+
         if ok:
             # Remember solution in case we want to retry with an(other) objective
             self._model.cp_model.ClearHints()
diff --git a/slothy/core/slothy.py b/slothy/core/slothy.py
index ce710f94..69acd90a 100644
--- a/slothy/core/slothy.py
+++ b/slothy/core/slothy.py
@@ -385,7 +385,9 @@ def fusion_loop(self, loop_lbl, **kwargs):
         assert SourceLine.is_source(self.source)
 
     def optimize_loop(self, loop_lbl, postamble_label=None):
-        """Optimize the loop starting at a given label"""
+        """Optimize the loop starting at a given label.
+        The postamble_label marks the end of the loop kernel.
+        """
 
         logger = self.logger.getChild(loop_lbl)
 
diff --git a/slothy/helper.py b/slothy/helper.py
index 3ae380ce..e0ae835c 100644
--- a/slothy/helper.py
+++ b/slothy/helper.py
@@ -1222,11 +1222,11 @@ def _extract(self, source, lbl):
         pre  = []
         body = []
         post = []
+        # candidate lines for the end of the loop
         loop_end_candidates = []
         loop_lbl_regexp_txt = self.lbl_regex
         loop_lbl_regexp = re.compile(loop_lbl_regexp_txt)
 
-        # TODO: Allow other forms of looping
         # end_regex shall contain group cnt as the counter variable
         loop_end_regexp_txt = self.end_regex
         loop_end_regexp = [re.compile(txt) for txt in loop_end_regexp_txt]
@@ -1255,6 +1255,7 @@ def _extract(self, source, lbl):
             if state == 1:
                 p = loop_end_regexp[loop_end_ctr].match(l_str)
                 if p is not None:
+                    # Case: We may have encountered part of the loop end
                     # collect all named groups
                     self.additional_data = self.additional_data | p.groupdict()
                     loop_end_ctr += 1
@@ -1263,6 +1264,11 @@ def _extract(self, source, lbl):
                         state = 2
                     continue
                 elif loop_end_ctr > 0 and l_str != "":
+                    # Case: The sequence of loop end candidates was interrupted
+                    #       i.e., we found a false-positive or this is not a proper loop
+                    
+                    # The loop end candidates are not part of the loop, meaning
+                    # they belonged to the body
                     body += loop_end_candidates
                     self.additional_data = {}
                     loop_end_ctr = 0
diff --git a/slothy/targets/aarch64/aarch64_neon.py b/slothy/targets/aarch64/aarch64_neon.py
index 842e41e6..725a3a18 100644
--- a/slothy/targets/aarch64/aarch64_neon.py
+++ b/slothy/targets/aarch64/aarch64_neon.py
@@ -50,6 +50,7 @@ class which generates instruction parsers and writers from instruction templates
 from slothy.targets.common import *
 from slothy.helper import Loop
 
+arch_name = "Arm_AArch64"
 llvm_mca_arch = "aarch64"
 
 class RegisterType(Enum):
@@ -181,7 +182,7 @@ class SubsLoop(Loop):
     ```
            loop_lbl:
                {code}
-               sub[s] <cnt>, <cnt>, #1
+               sub[s] <cnt>, <cnt>, #<imm>
                (cbnz|bnz|bne) <cnt>, loop_lbl
     ```
     where cnt is the loop counter in lr.
@@ -191,7 +192,7 @@ def __init__(self, lbl="lbl", lbl_start="1", lbl_end="2", loop_init="lr") -> Non
         # The group naming in the regex should be consistent; give same group
         # names to the same registers
         self.lbl_regex = r"^\s*(?P<label>\w+)\s*:(?P<remainder>.*)$"
-        self.end_regex = (r"^\s*sub[s]?\s+(?P<cnt>\w+),\s*(?P<reg1>\w+),\s*(?P<imm>#1)",
+        self.end_regex = (r"^\s*sub[s]?\s+(?P<cnt>\w+),\s*(?P<reg1>\w+),\s*#(?P<imm>\d+)",
                                rf"^\s*(cbnz|bnz|bne)\s+(?P<cnt>\w+),\s*{lbl}")
 
     def start(self, loop_cnt, indentation=0, fixup=0, unroll=1, jump_if_empty=None, preamble_code=None, body_code=None, postamble_code=None, register_aliases=None):
@@ -201,6 +202,9 @@ def start(self, loop_cnt, indentation=0, fixup=0, unroll=1, jump_if_empty=None,
             assert unroll in [1,2,4,8,16,32]
             yield f"{indent}lsr {loop_cnt}, {loop_cnt}, #{int(math.log2(unroll))}"
         if fixup != 0:
+            # Scale the fixup by the loop step so that decrements other than 1
+            # work. The step is captured by end_regex as a string: convert it.
+            fixup *= int(self.additional_data['imm'])
             yield f"{indent}sub {loop_cnt}, {loop_cnt}, #{fixup}"
         if jump_if_empty is not None:
             yield f"cbz {loop_cnt}, {jump_if_empty}"
@@ -893,7 +897,7 @@ def make(cls, src):
         obj.addr = obj.args_in[0]
         return obj
 
-class q_ldr_with_inc_hint(Ldr_Q): # pylint: disable=missing-docstring,invalid-name
+class q_ldr_with_imm_hint(Ldr_Q): # pylint: disable=missing-docstring,invalid-name
     pattern = "ldrh <Qa>, <Xc>, <imm>, <Th>"
     inputs = ["Xc", "Th"]
     outputs = ["Qa"]
@@ -1122,7 +1126,7 @@ def make(cls, src):
         obj.addr = obj.args_in[1]
         return obj
 
-class q_str_with_inc_hint(Str_Q): # pylint: disable=missing-docstring,invalid-name
+class q_str_with_imm_hint(Str_Q): # pylint: disable=missing-docstring,invalid-name
     pattern = "strh <Qa>, <Xc>, <imm>, <Th>"
     inputs = ["Qa", "Xc"]
     outputs = ["Th"]
@@ -1485,7 +1489,7 @@ def make(cls, src):
         obj.addr = obj.args_in_out[0]
         return obj
 
-class x_ldp_with_inc_hint(Ldp_X): # pylint: disable=missing-docstring,invalid-name
+class x_ldp_with_imm_hint(Ldp_X): # pylint: disable=missing-docstring,invalid-name
     pattern = "ldph <Xa>, <Xb>, <Xc>, <imm>, <Th>"
     inputs = ["Xc", "Th"]
     outputs = ["Xa", "Xb"]
@@ -1501,7 +1505,7 @@ def write(self):
         self.immediate = simplify(self.pre_index)
         return super().write()
 
-class x_ldp_sp_with_inc_hint(Ldp_X): # pylint: disable=missing-docstring,invalid-name
+class x_ldp_sp_with_imm_hint(Ldp_X): # pylint: disable=missing-docstring,invalid-name
     pattern = "ldph <Xa>, <Xb>, sp, <imm>, <Th>"
     inputs = ["Th"]
     outputs = ["Xa", "Xb"]
@@ -1517,7 +1521,7 @@ def write(self):
         self.immediate = simplify(self.pre_index)
         return super().write()
 
-class x_ldp_sp_with_inc_hint2(Ldp_X): # pylint: disable=missing-docstring,invalid-name
+class x_ldp_sp_with_imm_hint2(Ldp_X): # pylint: disable=missing-docstring,invalid-name
     pattern = "ldphp <Xa>, <Xb>, sp, <imm>, <Th0>, <Th1>"
     inputs = ["Th0", "Th1"]
     outputs = ["Xa", "Xb"]
@@ -1533,7 +1537,7 @@ def write(self):
         self.immediate = simplify(self.pre_index)
         return super().write()
 
-class x_ldp_with_inc_hint2(Ldp_X): # pylint: disable=missing-docstring,invalid-name
+class x_ldp_with_imm_hint2(Ldp_X): # pylint: disable=missing-docstring,invalid-name
     pattern = "ldphp <Xa>, <Xb>, <Xc>, <imm>, <Th0>, <Th1>"
     inputs = ["Xc", "Th0", "Th1"]
     outputs = ["Xa", "Xb"]
@@ -2799,7 +2803,7 @@ def make(cls, src):
         obj.addr = obj.args_in_out[0]
         return obj
 
-class x_stp_with_inc_hint(Stp_X): # pylint: disable=missing-docstring,invalid-name
+class x_stp_with_imm_hint(Stp_X): # pylint: disable=missing-docstring,invalid-name
     pattern = "stph <Xa>, <Xb>, <Xc>, <imm>, <Th>"
     inputs = ["Xc", "Xa", "Xb"]
     outputs = ["Th"]
@@ -2815,7 +2819,7 @@ def write(self):
         self.immediate = simplify(self.pre_index)
         return super().write()
 
-class x_stp_sp_with_inc_hint(Stp_X): # pylint: disable=missing-docstring,invalid-name
+class x_stp_sp_with_imm_hint(Stp_X): # pylint: disable=missing-docstring,invalid-name
     pattern = "stph <Xa>, <Xb>, sp, <imm>, <Th>"
     inputs = ["Xa", "Xb"]
     outputs = ["Th"]
@@ -2831,7 +2835,7 @@ def write(self):
         self.immediate = simplify(self.pre_index)
         return super().write()
 
-class x_stp_sp_with_inc_hint2(Stp_X): # pylint: disable=missing-docstring,invalid-name
+class x_stp_sp_with_imm_hint2(Stp_X): # pylint: disable=missing-docstring,invalid-name
     pattern = "stphp <Xa>, <Xb>, sp, <imm>, <Th0>, <Th1>"
     inputs = ["Xa", "Xb"]
     outputs = ["Th0", "Th1"]
@@ -2847,7 +2851,7 @@ def write(self):
         self.immediate = simplify(self.pre_index)
         return super().write()
 
-class x_stp_with_inc_hint2(Stp_X): # pylint: disable=missing-docstring,invalid-name
+class x_stp_with_imm_hint2(Stp_X): # pylint: disable=missing-docstring,invalid-name
     pattern = "stphp <Xa>, <Xb>, <Xc>, <imm>, <Th0>, <Th1>"
     inputs = ["Xa", "Xb", "Xc"]
     outputs = ["Th0", "Th1"]
diff --git a/slothy/targets/arm_v7m/arch_v7m.py b/slothy/targets/arm_v7m/arch_v7m.py
index eeda2312..8cd3ab76 100644
--- a/slothy/targets/arm_v7m/arch_v7m.py
+++ b/slothy/targets/arm_v7m/arch_v7m.py
@@ -14,10 +14,8 @@
 class RegisterType(Enum):
     GPR = 1
     FPR = 2
-    STACK_FPR = 3
-    STACK_GPR = 4
-    FLAGS = 5
-    HINT = 6
+    FLAGS = 3
+    HINT = 4
 
     def __str__(self):
         return self.name
@@ -34,9 +32,6 @@ def spillable(reg_type):
     def list_registers(reg_type, only_extra=False, only_normal=False, with_variants=False):
         """Return the list of all registers of a given type"""
 
-        stack_locations  = [ f"STACK{i}"  for i in range(8) ]
-        fpstack_locations  = [ f"STACK{i}"  for i in range(8) ]
-
         gprs_normal  = [ f"r{i}" for i in range(15) ]
         fprs_normal  = [ f"s{i}" for i in range(31) ]
 
@@ -59,9 +54,7 @@ def list_registers(reg_type, only_extra=False, only_normal=False, with_variants=
             fprs += fprs_extra
 
         return { RegisterType.GPR       : gprs,
-                 RegisterType.STACK_GPR : stack_locations,
                  RegisterType.FPR       : fprs,
-                 RegisterType.STACK_FPR : fpstack_locations,
                  RegisterType.HINT      : hints,
                  RegisterType.FLAGS     : flags}[reg_type]
 
@@ -89,9 +82,7 @@ def is_renamed(ty):
     def from_string(string):
         """Find registe type from string"""
         string = string.lower()
-        return { "fprstack"    : RegisterType.STACK_FPR,
-                 "stack"     : RegisterType.STACK_GPR,
-                 "fpr"      : RegisterType.FPR,
+        return { "fpr"      : RegisterType.FPR,
                  "gpr"       : RegisterType.GPR,
                  "hint"      : RegisterType.HINT,
                  "flags"     : RegisterType.FLAGS}.get(string,None)
@@ -212,7 +203,30 @@ def end(self, other, indentation=0):
             lbl_start += "b"
 
         yield f'{indent}bne {lbl_start}'
+
 class VmovCmpLoop(Loop):
+    """
+    Loop ending in a vmov, a compare, and a branch.
+    
+    The modification to the value we compare against happens inside the loop
+    body. The value that is being compared to is stashed to a floating point
+    register before the loop starts and therefore needs to be recovered before
+    the comparison. 
+    
+    WARNING: This type of loop is experimental as slothy has no knowledge about
+    what happens inside the loop boundary! Especially, a register is written
+    inside the boundary which may be used for renaming by slothy. Use with
+    caution.
+
+    Example (where cnt is the loop counter in lr):
+    ```
+           loop_lbl:
+               {code}
+               vmov <end>, <endf>
+               cmp <cnt>, <end>
+               (cbnz|bnz|bne) loop_lbl
+    ```
+    """
     def __init__(self, lbl="lbl", lbl_start="1", lbl_end="2", loop_init="lr") -> None:
         super().__init__(lbl_start=lbl_start, lbl_end=lbl_end, loop_init=loop_init)
         self.lbl = lbl
@@ -283,6 +297,21 @@ def end(self, other, indentation=0):
         yield f'{indent}bne {lbl_start}'
 
 class CmpLoop(Loop):
+    """
+    Loop ending in a compare and a branch.
+    The modification to the value we compare against happens inside the loop body.
+    WARNING: This type of loop is experimental as slothy has no knowledge about 
+    what happens inside the loop boundary! Use with caution.
+
+    Example:
+    ```
+           loop_lbl:
+               {code}
+               cmp <cnt>, <end>
+               (cbnz|bnz|bne) loop_lbl
+    ```
+    where cnt is the loop counter in lr.
+    """
     def __init__(self, lbl="lbl", lbl_start="1", lbl_end="2", loop_init="lr") -> None:
         super().__init__(lbl_start=lbl_start, lbl_end=lbl_end, loop_init=loop_init)
         self.lbl_regex = r"^\s*(?P<label>\w+)\s*:(?P<remainder>.*)$"
@@ -344,6 +373,18 @@ def end(self, other, indentation=0):
         yield f'{indent}bne {lbl_start}'
 
 class SubsLoop(Loop):
+    """
+    Loop ending in a flag setting subtraction and a branch.
+
+    Example:
+    ```
+           loop_lbl:
+               {code}
+               sub[s] <cnt>, <cnt>, #1
+               (cbnz|bnz|bne) loop_lbl
+    ```
+    where cnt is the loop counter in lr.
+    """
     def __init__(self, lbl_start="1", lbl_end="2", loop_init="lr") -> None:
         super().__init__(lbl_start=lbl_start, lbl_end=lbl_end, loop_init=loop_init)
         self.lbl_regex = r"^\s*(?P<label>\w+)\s*:(?P<remainder>.*)$"
diff --git a/slothy/targets/arm_v7m/cortex_m7.py b/slothy/targets/arm_v7m/cortex_m7.py
index f2705b10..4140721f 100644
--- a/slothy/targets/arm_v7m/cortex_m7.py
+++ b/slothy/targets/arm_v7m/cortex_m7.py
@@ -32,11 +32,10 @@ class ExecutionUnit(Enum):
     ALU0 = 1
     ALU1 = 2
     MAC = 5
-    FPU0 = 6
-    FPU1 = 7
-    LOAD0 = 8
-    LOAD1 = 9
-    SIMD = 10
+    FPU = 6
+    LOAD0 = 7
+    LOAD1 = 8
+    SIMD = 9
 
     def __repr__(self):
         return self.name
@@ -160,8 +159,7 @@ def get_min_max_objective(slothy):
     (ror, ror_short, rors_short, lsl, asr, asrs): [[ExecutionUnit.ALU0], [ExecutionUnit.ALU1]],
     (mul, mul_short, smull, smlal, mla, mls, smulwb, smulwt, smultb, smultt,
      smulbb, smlabt, smlabb, smlatt, smlad, smladx, smuad, smuadx, smmulr): [ExecutionUnit.MAC],
-    (vmov_gpr, vmov_gpr2): [ExecutionUnit.FPU0, ExecutionUnit.FPU1],
-    (vmov_gpr2_dual): [[ExecutionUnit.FPU0, ExecutionUnit.FPU1]],
+    (vmov_gpr, vmov_gpr2, vmov_gpr2_dual): [ExecutionUnit.FPU],
     (uadd16, sadd16, usub16, ssub16): list(map(list, product(ExecutionUnit.ALU(), [ExecutionUnit.SIMD]))),
     (pkhbt, pkhtb, pkhbt_shifted, ubfx_imm): [[ExecutionUnit.ALU0, ExecutionUnit.SIMD]],
     (Armv7mShiftedArithmetic): [[ExecutionUnit.ALU0]],
@@ -207,6 +205,7 @@ def get_min_max_objective(slothy):
         ror, ror_short, rors_short, lsl, asr, asrs,
         cmp, cmp_imm,
         vmov_gpr,
+        vmov_gpr2, vmov_gpr2_dual,  # TODO: dual is also listed under 2 below; verify
         pkhbt, pkhtb, pkhbt_shifted, ubfx_imm,
         str_with_imm,
         str_with_imm_stack,
@@ -218,7 +217,7 @@ def get_min_max_objective(slothy):
     ): 1,
     (
         stm_interval_inc_writeback,  # actually not, just placeholder
-        vmov_gpr2, vmov_gpr2_dual): 2
+        vmov_gpr2_dual): 2
 }
 
 default_latencies = {
diff --git a/slothy/targets/arm_v81m/arch_v81m.py b/slothy/targets/arm_v81m/arch_v81m.py
index 74b93355..e7290411 100644
--- a/slothy/targets/arm_v81m/arch_v81m.py
+++ b/slothy/targets/arm_v81m/arch_v81m.py
@@ -43,6 +43,7 @@
 from slothy.targets.common import *
 from slothy.helper import Loop
 
+arch_name = "Arm_v81M"
 llvm_mca_arch = "arm"
 
 class RegisterType(Enum):