Merge pull request #119 from slothy-optimizer/equiv_test_fixup

Selftest: Add support for Armv7-M thumb
slothy-optimizer · Dec 11, 2024 · e6ec7ec · e6ec7ec
2 parents 4c7b177 + 4317ce7
commit e6ec7ec
Show file tree

Hide file tree

Showing 4 changed files with 55 additions and 11 deletions.
diff --git a/examples/naive/armv7m/armv7m_simple0_func.s b/examples/naive/armv7m/armv7m_simple0_func.s
@@ -1,10 +1,10 @@
 .syntax unified
 //.cpu cortex-m4 // llvm-mc does not like this...
-//.thumb // unicorn seems to get confused by this...
+.thumb // unicorn seems to get confused by this...
 
 .align 2
 .global my_func
-// .type my_func, %function // llvm-mc does not like this...
+.type my_func, %function
 my_func:
   push {r4-r11, lr}
 

diff --git a/slothy/core/core.py b/slothy/core/core.py
@@ -897,7 +897,12 @@ def run_code(code, txt=None):
             mu.mem_map(RAM_BASE, RAM_SZ)
             mu.mem_write(RAM_BASE, initial_memory)
             # Run emulator
-            mu.emu_start(CODE_BASE + offset, CODE_BASE + len(objcode))
+            try:
+                mu.emu_start(CODE_BASE + offset, CODE_BASE + len(objcode))
+            except:
+                log.error("Failed to emulate code using unicorn engine")
+                log.error("Code")
+                log.error(SouceLine.write_multiline(code))
 
             final_register_contents = {}
             for r in regs:

diff --git a/slothy/helper.py b/slothy/helper.py
@@ -1146,18 +1146,51 @@ def parse_as_int(s):
             raise LLVM_Mc_Error(f"Could not find unambiguous text section in object file. Sections: {sections}")
         return sections_with_offsets[text_section[0]]
 
+    @staticmethod
+    def llvm_mc_output_extract_symbol(objfile, symbol):
+        """Extracts symbol from an objectfile emitted by llvm-mc"""
+
+        # Pipe the object file bytes into `llvm-readobj -s -` and split its textual output into lines
+        r = subprocess.run(["llvm-readobj", "-s", "-"], input=objfile, capture_output=True, check=True)
+        objfile_txt = r.stdout.decode().split("\n")
+
+        # Collect the "Name: ..." lines and the "Value: ..." lines; they are paired up by order of appearance.
+        def parse_as_int(s):
+            if s.startswith("0x"):
+                return int(s, base=16)
+            else:
+                return int(s,base=10)
+
+        symbols = filter(lambda l: l.strip().startswith("Name: "), objfile_txt)
+        symbols = list(map(lambda l: l.strip().removeprefix("Name: ").split(' ')[0].strip(), symbols))
+        values = filter(lambda l: l.strip().startswith("Value: "), objfile_txt)
+        values = map(lambda l: parse_as_int(l.strip().removeprefix("Value: ")), values)
+        symbols_with_values = { s:val for (s,val) in zip(symbols, values) }
+        matching_symbols = list(filter(lambda s: s.endswith(symbol), symbols))
+        # Sometimes assembly functions are named both `_foo` and `foo`, in which case we find
+        # multiple matching symbols -- however, they have the same value. Hence, only fail if the
+        # matching symbols do not share exactly one value (this also covers the case of no match).
+        if len({ symbols_with_values[s] for s in matching_symbols }) != 1:
+            raise LLVM_Mc_Error(f"Could not find unambiguous symbol {symbol} in object file. Symbols: {symbols}")
+        return symbols_with_values[matching_symbols[0]]
+
     @staticmethod
     def assemble(source, arch, attr, log, symbol=None, preprocessor=None, include_paths=None):
         """Runs LLVM-MC tool to assemble `source`, returning byte code"""
 
+        thumb = "thumb" in arch or (attr is not None and "thumb" in attr)
+
         # Unfortunately, there is no option to directly extract byte code
         # from LLVM-MC: One either gets a textual description, or an object file.
         # To not introduce another binary dependency, we just extract the byte
         # code directly from the textual output, which for every assembly line
         # has a "encoding: [byte0, byte1, ...]" comment at the end.
 
         if symbol is None:
+            if thumb is True:
+                source = [SourceLine(".thumb")] + source
             source = [SourceLine(".global harness"),
+                      SourceLine(".type harness, %function"),
                       SourceLine("harness:")] + source
             symbol = "harness"
 
@@ -1167,8 +1200,13 @@ def assemble(source, arch, attr, log, symbol=None, preprocessor=None, include_pa
                 source = CPreprocessor.unfold([], source, [], preprocessor,
                                               include=include_paths)
             except subprocess.CalledProcessError as exc:
+                log.error("CPreprocessor failed on the following input")
+                log.error(SouceLine.write_multiline(source))
                 raise LLVM_Mc_Error from exc
 
+        if platform.system() == "Darwin":
+            source = list(filter(lambda s: s.text.strip().startswith(".type") is False, source))
+
         code = SourceLine.write_multiline(source)
 
         log.debug(f"Calling LLVM MC assmelber on the following code")
@@ -1184,6 +1222,8 @@ def assemble(source, arch, attr, log, symbol=None, preprocessor=None, include_pa
             r = subprocess.run(["llvm-mc"] + args,
                                input=code.encode(), capture_output=True, check=True)
         except subprocess.CalledProcessError as exc:
+            log.error("llvm-mc failed to handle the following code")
+            log.error(code)
             raise LLVM_Mc_Error from exc
 
         args = [f"--arch={arch}", "--assemble", "--filetype=obj"]
@@ -1201,11 +1241,10 @@ def assemble(source, arch, attr, log, symbol=None, preprocessor=None, include_pa
         offset, sz = LLVM_Mc.llvm_mc_output_extract_text_section(objfile)
         code = objfile[offset:offset+sz]
 
-        # Extract symbol table
-        r = subprocess.run(["llvm-nm","-"], input=objfile, capture_output=True)
-        out = r.stdout.decode()
-        symbol = next(filter(lambda l: symbol in l, out.split("\n")))
-        offset = int(symbol.split(" ")[0], base=16)
+        offset = LLVM_Mc.llvm_mc_output_extract_symbol(objfile, symbol)
+
+        if platform.system() == "Darwin" and thumb is True:
+            offset += 1
 
         return code, offset
 

diff --git a/slothy/targets/arm_v7m/arch_v7m.py b/slothy/targets/arm_v7m/arch_v7m.py
@@ -13,11 +13,11 @@
 from sympy import simplify
 
 llvm_mca_arch = "arm"
-llvm_mc_arch = "arm" ### TODO: What to put here?
-llvm_mc_attr = "armv5te,thumb2,dsp" ### TODO: What to put here?
+llvm_mc_arch = "thumb"
+llvm_mc_attr = "armv7e-m,thumb2,dsp,fpregs"
 
 unicorn_arch = UC_ARCH_ARM
-unicorn_mode = UC_MODE_ARM
+unicorn_mode = UC_MODE_THUMB | UC_MODE_MCLASS
 
 class RegisterType(Enum):
     GPR = 1