class Armv7mExample0Func(Example):
    """Armv7-M example optimizing a straight-line region inside a function.

    After optimizing the region between the `start` and `end` labels, the
    whole function `my_func` is emulated to check the behaviour is unchanged.
    """

    def __init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7):
        base = "armv7m_simple0_func"
        # The optional variant suffix applies to both the input file name
        # and the example name; the target label only to the example name.
        suffix = f"_{var}" if var != "" else ""
        infile = base + suffix
        name = f"{base}{suffix}_{target_label_dict[target]}"

        super().__init__(infile, name, rename=True, arch=arch, target=target)

    def core(self, slothy):
        slothy.config.variable_size = True
        slothy.config.inputs_are_outputs = True
        slothy.optimize(start="start", end="end")
        # r0 points to a 1024-byte buffer when my_func is called
        buffers = {"r0": 1024}
        slothy.global_selftest("my_func", buffers)
slothy.config.constraints.stalls_first_attempt = 110 slothy.optimize_loop("layer123_start") - + class ntt_dilithium_123(Example): @@ -1349,7 +1370,7 @@ def core(self, slothy): slothy.optimize_loop("layer5678_start") slothy.config = conf.copy() - + if self.timeout is not None: slothy.config.timeout = self.timeout // 12 @@ -1366,7 +1387,7 @@ def core(self, slothy): slothy.config.split_heuristic_stepsize = 0.1 slothy.config.constraints.stalls_first_attempt = 14 slothy.optimize_loop("layer1234_start") - + class ntt_dilithium_1234(Example): def __init__(self, var="", arch=AArch64_Neon, target=Target_CortexA72): @@ -1513,6 +1534,7 @@ def main(): # Armv7m examples Armv7mExample0(), + Armv7mExample0Func(), # Loop examples AArch64LoopSubs(), diff --git a/examples/naive/aarch64/ntt_kyber_123_4567.s b/examples/naive/aarch64/ntt_kyber_123_4567.s index 0f2c9ae1..a92b887b 100644 --- a/examples/naive/aarch64/ntt_kyber_123_4567.s +++ b/examples/naive/aarch64/ntt_kyber_123_4567.s @@ -23,8 +23,13 @@ /// SOFTWARE. /// +// Commented out for simple standalone emulation not +// requiring correct constant data +// +// Should be commented when used. +// // Needed to provide ASM_LOAD directive -#include +// #include .macro mulmodq dst, src, const, idx0, idx1 sqrdmulh t2.8h, \src\().8h, \const\().h[\idx1] @@ -154,7 +159,12 @@ .data .p2align 4 roots: - #include "ntt_kyber_123_45_67_twiddles.s" +// Commented out for simple standalone emulation not +// requiring correct constant data +// +// Should be commented when used. +// +// #include "ntt_kyber_123_45_67_twiddles.s" in .req x0 inp .req x1 @@ -223,9 +233,14 @@ ntt_kyber_123_4567: _ntt_kyber_123_4567: push_stack - ASM_LOAD(r_ptr0, roots) - ASM_LOAD(r_ptr1, roots_l56) - ASM_LOAD(xtmp, const_addr) +// Commented out for simple standalone emulation not +// requiring correct constant data. +// +// Should be commented when used. 
+// +// ASM_LOAD(r_ptr0, roots) +// ASM_LOAD(r_ptr1, roots_l56) +// ASM_LOAD(xtmp, const_addr) ld1 {consts.8h}, [xtmp] diff --git a/slothy/core/config.py b/slothy/core/config.py index 1140a2bb..b14e69f2 100644 --- a/slothy/core/config.py +++ b/slothy/core/config.py @@ -122,8 +122,8 @@ def selftest(self): equivalence-check the loop-form (including the compare+branch instructions at the loop boundary) rather than the unrolled code. - DEPENDENCY: To run this, you need `llvm-mc` the binary in your path or configured - as via `llvm_mc_binary`, and `unicorn-engine` Python bindings setup. + DEPENDENCY: To run this, you need `llvm-nm`, `llvm-readobj`, `llvm-mc` + in your PATH. Those are part of a standard LLVM setup. NOTE: This is so far implemented as a repeated randomized test -- nothing clever. """ @@ -469,21 +469,6 @@ def compiler_include_paths(self): or `with_llvm_mca_after` are set.""" return self._compiler_include_paths - @property - def llvm_mca_binary(self): - """The llvm-mca binary to be used for estimated performance annotations - - This is only relevant if `with_llvm_mca_before` or `with_llvm_mca_after` - is set.""" - return self._llvm_mca_binary - - @property - def llvm_mc_binary(self): - """The llvm-mc binary to be used for assembling output data - - This is only relevant if `selftest` is set.""" - return self._llvm_mc_binary - @property def timeout(self): """The timeout in seconds after which the underlying constraint solver stops @@ -1228,8 +1213,6 @@ def __init__(self, Arch, Target): self._compiler_binary = "gcc" self._compiler_include_paths = None - self._llvm_mca_binary = "llvm-mca" - self._llvm_mc_binary = "llvm-mc" self.keep_tags = True self.inherit_macro_comments = False @@ -1377,12 +1360,6 @@ def compiler_binary(self, val): @compiler_include_paths.setter def compiler_include_paths(self, val): self._compiler_include_paths = val - @llvm_mca_binary.setter - def llvm_mca_binary(self, val): - self._llvm_mca_binary = val - @llvm_mc_binary.setter - def 
llvm_mc_binary(self, val): - self._llvm_mc_binary = val @timeout.setter def timeout(self, val): self._timeout = val diff --git a/slothy/core/core.py b/slothy/core/core.py index 8f4bcff1..d635db28 100644 --- a/slothy/core/core.py +++ b/slothy/core/core.py @@ -877,11 +877,10 @@ def selftest(self, log): self._config.arch.RegisterType.list_registers(ty)] def run_code(code, txt=None): - objcode = LLVM_Mc.assemble(code, self._config.llvm_mc_binary, + objcode, offset = LLVM_Mc.assemble(code, self._config.arch.llvm_mc_arch, self._config.arch.llvm_mc_attr, log) - # Setup emulator mu = Uc(self.config.arch.unicorn_arch, self.config.arch.unicorn_mode) # Copy initial register contents into emulator @@ -937,7 +936,7 @@ def run_code(code, txt=None): if final_regs_old[r] != final_regs_new[r]: raise SlothySelfTestException(f"Selftest failed: Register mismatch for {r}: {hex(final_regs_old[r])} != {hex(final_regs_new[r])}") - log.info("Selftest: OK") + log.info("Local selftest: OK") def selfcheck_with_fixup(self, log): """Do selfcheck, and consider preamble/postamble fixup in case of SW pipelining diff --git a/slothy/core/slothy.py b/slothy/core/slothy.py index 76cbc1ae..dcd2e33e 100644 --- a/slothy/core/slothy.py +++ b/slothy/core/slothy.py @@ -45,6 +45,7 @@ This module provides the Slothy class, which is a stateful interface to both one-shot and heuristic optimiations using SLOTHY.""" +import os import logging from types import SimpleNamespace @@ -54,7 +55,16 @@ from slothy.core.heuristics import Heuristics from slothy.helper import CPreprocessor, SourceLine from slothy.helper import AsmAllocation, AsmMacro, AsmHelper, AsmIfElse -from slothy.helper import CPreprocessor, LLVM_Mca, LLVM_Mca_Error +from slothy.helper import CPreprocessor, LLVM_Mca, LLVM_Mc, LLVM_Mca_Error + +try: + from unicorn import * + from unicorn.arm64_const import * +except ImportError: + Uc = None + +class SlothyGlobalSelfTestException(Exception): + """Exception thrown upon global selftest failures""" class 
def global_selftest(self, funcname, address_gprs, iterations=5):
    """Conduct a function-level selftest

    Assembles the original and the current source, then calls `funcname`
    in an emulator on identical random register, memory and stack contents
    and checks that both versions leave the same RAM contents and the same
    values in the callee-saved registers.

    - funcname: Name of function to be called. Must be exposed as a symbol
    - address_gprs: Dictionary indicating which GPRs are pointers to buffers of which size.
      For example, `{ "x0": 1024, "x4": 1024 }` would indicate that both x0 and x4
      point to buffers of size 1024 bytes. The global selftest needs to know this to
      setup valid calls to the assembly routine.
    - iterations: Number of randomized emulation runs.

    Raises SlothyGlobalSelfTestException if unicorn-engine is unavailable,
    no source is loaded, the buffers do not fit into the emulated RAM, or
    the two versions of the code behave differently.

    DEPENDENCY: To run this, you need `llvm-nm`, `llvm-readobj`, `llvm-mc`
                in your PATH. Those are part of a standard LLVM setup.
    """

    log = self.logger.getChild(f"global_selftest_{funcname}")

    if Uc is None:
        raise SlothyGlobalSelfTestException("Cannot run selftest -- unicorn-engine is not available.")

    if self.config.arch.unicorn_arch is None or \
       self.config.arch.llvm_mc_arch is None:
        log.warning("Selftest not supported on target architecture")
        return

    old_source = self.original_source
    new_source = self.source
    if old_source is None or new_source is None:
        raise SlothyGlobalSelfTestException("Cannot run selftest -- no source code loaded.")

    CODE_BASE = 0x010000
    CODE_SZ = 0x010000
    CODE_END = CODE_BASE + CODE_SZ
    RAM_BASE = 0x030000
    RAM_SZ = 0x010000
    STACK_BASE = 0x040000
    STACK_SZ = 0x010000
    STACK_TOP = STACK_BASE + STACK_SZ

    # Buffers are laid out back to back from RAM_BASE; reject requests
    # which would make pointers run past the mapped RAM window.
    if sum(address_gprs.values()) > RAM_SZ:
        raise SlothyGlobalSelfTestException(
            f"Cannot run selftest -- buffers of total size {sum(address_gprs.values())} "
            f"do not fit into {RAM_SZ} bytes of emulated RAM.")

    reg_type = self.config.arch.RegisterType
    regs = [r for ty in reg_type for r in reg_type.list_registers(ty)]

    def assemble(source):
        """Preprocess + assemble `source`; returns (text bytes, entry offset)"""
        return LLVM_Mc.assemble(source,
                                self.config.arch.llvm_mc_arch,
                                self.config.arch.llvm_mc_attr,
                                log, symbol=funcname,
                                preprocessor=self.config.compiler_binary,
                                include_paths=self.config.compiler_include_paths)

    def emulate(objcode, offset, init_regs, init_memory, init_stack):
        """Run one call of the function; returns (final registers, final RAM)"""
        # Setup emulator
        mu = Uc(self.config.arch.unicorn_arch, self.config.arch.unicorn_mode)
        # Copy initial register contents into emulator
        for r, v in init_regs.items():
            ur = reg_type.unicorn_reg_by_name(r)
            if ur is None:
                continue
            mu.reg_write(ur, v)
        # Put a valid address in the LR that serves as the marker to terminate emulation
        mu.reg_write(reg_type.unicorn_link_register(), CODE_END)
        # Setup stack pointer
        mu.reg_write(reg_type.unicorn_stack_pointer(), STACK_TOP)
        # Copy code into emulator
        mu.mem_map(CODE_BASE, CODE_SZ)
        mu.mem_write(CODE_BASE, objcode)
        # Copy initial memory contents into emulator
        mu.mem_map(RAM_BASE, RAM_SZ)
        mu.mem_write(RAM_BASE, init_memory)
        # Setup stack contents
        mu.mem_map(STACK_BASE, STACK_SZ)
        mu.mem_write(STACK_BASE, init_stack)
        # Run emulator until the function returns to the LR marker
        mu.emu_start(CODE_BASE + offset, CODE_END)

        final_regs = {}
        for r in regs:
            ur = reg_type.unicorn_reg_by_name(r)
            if ur is None:
                continue
            final_regs[r] = mu.reg_read(ur)
        return final_regs, mu.mem_read(RAM_BASE, RAM_SZ)

    # Assembling does not depend on the random test data, so do it once
    # up front instead of once per iteration.
    old_code, old_offset = assemble(old_source)
    new_code, new_offset = assemble(new_source)

    regs_expected = reg_type.callee_saved_registers()

    for _ in range(iterations):
        initial_memory = os.urandom(RAM_SZ)
        initial_stack = os.urandom(STACK_SZ)
        # Set initial register contents arbitrarily, except for registers
        # which must hold valid memory addresses.
        # NOTE: byteorder must be given explicitly before Python 3.11.
        initial_register_contents = {
            r: int.from_bytes(os.urandom(16), byteorder="big") for r in regs}
        cur_ram = RAM_BASE
        for (reg, sz) in address_gprs.items():
            initial_register_contents[reg] = cur_ram
            cur_ram += sz

        final_regs_old, final_mem_old = emulate(
            old_code, old_offset, initial_register_contents,
            initial_memory, initial_stack)
        final_regs_new, final_mem_new = emulate(
            new_code, new_offset, initial_register_contents,
            initial_memory, initial_stack)

        # Check if memory contents are the same
        if final_mem_old != final_mem_new:
            raise SlothyGlobalSelfTestException("Selftest failed: Memory mismatch")

        # Check that callee-saved registers are the same
        for r in regs_expected:
            if final_regs_old[r] != final_regs_new[r]:
                raise SlothyGlobalSelfTestException(f"Selftest failed: Register mismatch for {r}: {hex(final_regs_old[r])} != {hex(final_regs_new[r])}")

    log.info(f"Global selftest for {funcname}: OK")
class LLVM_Mc():
    """Helper class for the application of the LLVM MC tool"""

    @staticmethod
    def _parse_int(s):
        """Parse a number printed by llvm-readobj (decimal or 0x-prefixed hex)"""
        s = s.strip()
        if s.lower().startswith("0x"):
            return int(s, base=16)
        return int(s, base=10)

    @staticmethod
    def _parse_sections(readobj_output):
        """Parse the text output of `llvm-readobj -S`

        Returns a list of `(name, offset, size)` tuples, one per section for
        which both an `Offset:` and a `Size:` line were found. Works for the
        ELF and the Mach-O flavour of the output, which list the two fields
        in different order."""
        sections = []
        name, offset, size = None, None, None

        def flush():
            if name is not None and offset is not None and size is not None:
                sections.append((name, offset, size))

        for line in readobj_output.splitlines():
            line = line.strip()
            if line.startswith("Name: "):
                # A `Name:` line opens the description of the next section
                flush()
                name = line.removeprefix("Name: ").split(' ')[0].strip()
                offset, size = None, None
            elif line.startswith("Offset: "):
                offset = LLVM_Mc._parse_int(line.removeprefix("Offset: "))
            elif line.startswith("Size: "):
                size = LLVM_Mc._parse_int(line.removeprefix("Size: "))
        flush()
        return sections

    @staticmethod
    def _find_text_section(readobj_output):
        """Return `(offset, size)` of the text section from `llvm-readobj -S` output

        Raises LLVM_Mc_Error if no unambiguous text section can be found."""
        sections = LLVM_Mc._parse_sections(readobj_output)
        # Prefer the exact section names used by ELF (.text) and Mach-O
        # (__text). Matching on the substring "text" alone is ambiguous as
        # soon as there is e.g. a relocation section `.rela.text`.
        candidates = [s for s in sections if s[0] in (".text", "__text")]
        if not candidates:
            candidates = [s for s in sections if "text" in s[0]]
        if len(candidates) != 1:
            names = [s[0] for s in sections]
            raise LLVM_Mc_Error(f"Could not find unambiguous text section in object file. Sections: {names}")
        _, offset, size = candidates[0]
        return offset, size

    @staticmethod
    def _symbol_offset(nm_output, symbol):
        """Look up the address of `symbol` in the text output of `llvm-nm`

        Only defined symbols whose name is exactly `symbol` are considered;
        as a fallback, `_symbol` is accepted since Mach-O prefixes symbols
        with an underscore. Raises LLVM_Mc_Error if the symbol is missing."""
        fallback = None
        for line in nm_output.splitlines():
            fields = line.split()
            # Defined symbols read "<address> <type> <name>"; undefined
            # ones have no address column and are skipped.
            if len(fields) != 3:
                continue
            addr, _, name = fields
            try:
                value = int(addr, base=16)
            except ValueError:
                continue
            if name == symbol:
                return value
            if name == "_" + symbol and fallback is None:
                fallback = value
        if fallback is None:
            raise LLVM_Mc_Error(f"Could not find symbol {symbol} in object file")
        return fallback

    @staticmethod
    def llvm_mc_output_extract_text_section(objfile):
        """Extracts offset and size of .text section from an objectfile
        emitted by llvm-mc.

        - objfile: Contents of the object file, as bytes.

        Returns a pair `(offset, size)` locating the text section within
        `objfile`. Raises LLVM_Mc_Error if llvm-readobj fails or no
        unambiguous text section is found."""

        # We use llvm-readobj to inspect the objectfile, which works
        # for both ELF and Mach-O object files. Unfortunately, however,
        # the output formats for the two are not the same. Moreover,
        # the output when selecting JSON as the output format is not valid
        # JSON. So we're left to hacky string munging, looking for lines
        # of the form "Name: ...", "Offset: ..." and "Size: ..." within
        # each "Section { ... }" entry.
        try:
            r = subprocess.run(["llvm-readobj", "-S", "-"], input=objfile,
                               capture_output=True, check=True)
        except subprocess.CalledProcessError as exc:
            raise LLVM_Mc_Error from exc
        return LLVM_Mc._find_text_section(r.stdout.decode())

    @staticmethod
    def assemble(source, arch, attr, log, symbol=None, preprocessor=None, include_paths=None):
        """Runs LLVM-MC tool to assemble `source`, returning byte code

        - source: Assembly to be assembled, as a list of SourceLine objects.
        - arch, attr: Values for the `--arch` and `--mattr` options of llvm-mc.
          `attr` may be None.
        - log: Logger to be used.
        - symbol: Name of the symbol whose offset should be returned. If None,
          a global label `harness` is put in front of `source` and used.
        - preprocessor: If not None, the C preprocessor binary to run on
          `source` before assembling.
        - include_paths: Include paths passed to the preprocessor.

        Returns a pair `(code, offset)` of the contents of the text section
        and the address of `symbol` as reported by llvm-nm.

        Raises LLVM_Mc_Error if any of the underlying tools fails or the
        text section or symbol cannot be located."""

        # There is no option to make llvm-mc emit raw byte code: We ask for
        # an object file, and then locate the text section in it with
        # llvm-readobj and the entry symbol with llvm-nm.

        if symbol is None:
            source = [SourceLine(".global harness"),
                      SourceLine("harness:")] + source
            symbol = "harness"

        if preprocessor is not None:
            # First, run the C preprocessor on the code
            try:
                source = CPreprocessor.unfold([], source, [], preprocessor,
                                              include=include_paths)
            except subprocess.CalledProcessError as exc:
                raise LLVM_Mc_Error from exc

        code = SourceLine.write_multiline(source)

        log.debug("Calling LLVM MC assmelber on the following code")
        log.debug(code)

        args = [f"--arch={arch}", "--assemble", "--filetype=obj"]
        if attr is not None:
            args.append(f"--mattr={attr}")
        try:
            r = subprocess.run(["llvm-mc"] + args,
                               input=code.encode(), capture_output=True, check=True)
        except subprocess.CalledProcessError as exc:
            raise LLVM_Mc_Error from exc

        # TODO: If there are relocations remaining, we should fail at this point

        objfile = r.stdout
        text_offset, text_size = LLVM_Mc.llvm_mc_output_extract_text_section(objfile)
        objcode = objfile[text_offset:text_offset + text_size]

        # Extract symbol table
        try:
            r = subprocess.run(["llvm-nm", "-"], input=objfile,
                               capture_output=True, check=True)
        except subprocess.CalledProcessError as exc:
            raise LLVM_Mc_Error from exc
        symbol_offset = LLVM_Mc._symbol_offset(r.stdout.decode(), symbol)

        return objcode, symbol_offset
@staticmethod
def callee_saved_registers():
    """Return the names of the registers a callee must preserve.

    Following the Arm procedure call standard (AAPCS32), these are the
    core registers r4-r11 and the floating point registers s16-s31;
    s0-s15 are caller-saved and may legitimately be clobbered."""
    return [f"r{i}" for i in range(4, 12)] + [f"s{i}" for i in range(16, 32)]