From ce8cb55fc22ac1f9e89165f1344cbfe5330f98a6 Mon Sep 17 00:00:00 2001 From: "Matthias J. Kannwischer" Date: Mon, 9 Dec 2024 17:49:25 +0800 Subject: [PATCH] Make ldm/stm range parsing more flexible. Currently the Armv7-M arch model is limited to register lists consisting of a single range: e.g., ldm r0, {r1-r7}. This is not actually the correct restriction from the manual. Actually, the registers don't have to be consecutive - the only limitation is that the lowest register is loaded from the lowest address. Or in other words: the registers need to be listed in ascending order. Also, the range is merely syntactic sugar and one can also just list all registers. This commit changes our parsing to be closer to the real world. It allows writing any register list, including ranges. Internally, a range gets expanded to the actual list. Instructions are always written as the full list. The main motivation for this change is that we were recently optimizing code containing ldm r0!, {r1-r3,r14} which cannot be parsed with the current model. With this commit, it parses correctly. This does in theory give SLOTHY more freedom in choosing register allocations, but it is yet to be seen if that is useful. args_out_combinations can become very big in this case and we may actually have to restrict that if we are running into performance issues later. 
--- examples/naive/armv7m/armv7m_simple0.s | 23 ++++ examples/opt/armv7m/armv7m_simple0_opt_m7.s | 78 ++++++++---- slothy/targets/arm_v7m/arch_v7m.py | 133 ++++++++++++-------- slothy/targets/arm_v7m/cortex_m7.py | 4 +- 4 files changed, 163 insertions(+), 75 deletions(-) diff --git a/examples/naive/armv7m/armv7m_simple0.s b/examples/naive/armv7m/armv7m_simple0.s index f250650f..937c22f2 100644 --- a/examples/naive/armv7m/armv7m_simple0.s +++ b/examples/naive/armv7m/armv7m_simple0.s @@ -6,4 +6,27 @@ eor.w r1,r1, r3 smlabt r3,r2, r2, r1 asrs r3, r3,#1 str r3, [r0,#4] + +ldm r0, {r1-r2,r14} +add r1, r2,r1 +eor.w r1,r1, r14 +smlabt r3,r2, r2, r1 +asrs r3, r3,#1 +str r3, [r0,#4] + + +ldm r0, {r1-r3} +add r1, r2,r1 +eor.w r1,r1, r3 +smlabt r3,r2, r2, r1 +asrs r3, r3,#1 +str r3, [r0,#4] + +ldm r0, {r1,r2,r3} +add r1, r2,r1 +eor.w r1,r1, r3 +smlabt r3,r2, r2, r1 +asrs r3, r3,#1 +str r3, [r0,#4] + end: \ No newline at end of file diff --git a/examples/opt/armv7m/armv7m_simple0_opt_m7.s b/examples/opt/armv7m/armv7m_simple0_opt_m7.s index d9e1be33..72754c35 100644 --- a/examples/opt/armv7m/armv7m_simple0_opt_m7.s +++ b/examples/opt/armv7m/armv7m_simple0_opt_m7.s @@ -1,24 +1,42 @@ start: - // Instructions: 6 - // Expected cycles: 5 - // Expected IPC: 1.20 - // - // Cycle bound: 5.0 - // IPC bound: 1.20 - // - // Wall time: 0.01s - // User time: 0.01s - // - // ----- cycle (expected) ------> - // 0 25 - // |------------------------|---- - ldr r12, [r0, #4] // *............................. - add r12,r2,r12 // .*............................ - eor.w r10,r12,r3 // ..*........................... - smlabt r10,r2,r2,r10 // ..*........................... - asrs r3,r10,#1 // ....*......................... - str r3,[r0,#4] // ....*......................... 
+ // Instructions: 24 + // Expected cycles: 14 + // Expected IPC: 1.71 + // + // Cycle bound: 14.0 + // IPC bound: 1.71 + // + // Wall time: 0.39s + // User time: 0.39s + // + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + ldr r11,[r0,#4] // *............................. + ldm r0,{r7,r9,r14} // .*............................ + add r8,r2,r11 // .*............................ + eor.w r11,r8,r3 // ..*........................... + smlabt r3,r2,r2,r11 // ..*........................... + add r11,r9,r7 // ....*......................... + ldm r0,{r1,r2,r7} // ....*......................... + eor.w r14,r11,r14 // .....*........................ + smlabt r8,r9,r9,r14 // .....*........................ + asrs r4,r3,#1 // ......*....................... + str r4,[r0,#4] // ......*....................... + add r3,r2,r1 // .......*...................... + ldm r0,{r11,r12,r14} // .......*...................... + eor.w r3,r3,r7 // ........*..................... + smlabt r7,r2,r2,r3 // ........*..................... + asrs r9,r8,#1 // .........*.................... + str r9,[r0,#4] // .........*.................... + asrs r3,r7,#1 // ..........*................... + add r7,r12,r11 // ..........*................... + eor.w r14,r7,r14 // ...........*.................. + smlabt r14,r12,r12,r14 // ...........*.................. + str r3,[r0,#4] // ............*................. + asrs r14,r14,#1 // .............*................ + str r14,[r0,#4] // .............*................ // ------ cycle (expected) ------> // 0 25 @@ -27,7 +45,25 @@ // add r1, r2,r1 // .*............................. // eor.w r1,r1, r3 // ..*............................ // smlabt r3,r2, r2, r1 // ..*............................ - // asrs r3, r3,#1 // ....*.......................... - // str r3, [r0,#4] // ....*.......................... + // asrs r3, r3,#1 // ......*........................ + // str r3, [r0,#4] // ......*........................ 
+ // ldm r0, {r1-r2,r14} // .*............................. + // add r1, r2,r1 // ....*.......................... + // eor.w r1,r1, r14 // .....*......................... + // smlabt r3,r2, r2, r1 // .....*......................... + // asrs r3, r3,#1 // .........*..................... + // str r3, [r0,#4] // .........*..................... + // ldm r0, {r1-r3} // .......*....................... + // add r1, r2,r1 // ..........*.................... + // eor.w r1,r1, r3 // ...........*................... + // smlabt r3,r2, r2, r1 // ...........*................... + // asrs r3, r3,#1 // .............*................. + // str r3, [r0,#4] // .............*................. + // ldm r0, {r1,r2,r3} // ....*.......................... + // add r1, r2,r1 // .......*....................... + // eor.w r1,r1, r3 // ........*...................... + // smlabt r3,r2, r2, r1 // ........*...................... + // asrs r3, r3,#1 // ..........*.................... + // str r3, [r0,#4] // ............*.................. end: diff --git a/slothy/targets/arm_v7m/arch_v7m.py b/slothy/targets/arm_v7m/arch_v7m.py index deefdee7..be18e01b 100644 --- a/slothy/targets/arm_v7m/arch_v7m.py +++ b/slothy/targets/arm_v7m/arch_v7m.py @@ -2,6 +2,7 @@ import inspect import re import math +import itertools from enum import Enum from functools import cache @@ -125,12 +126,12 @@ def unconditional(lbl): class VmovCmpLoop(Loop): """ Loop ending in a vmov, a compare, and a branch. - + The modification to the value we compare against happens inside the loop body. The value that is being compared to is stashed to a floating point register before the loop starts and therefore needs to be recovered before - the comparison. - + the comparison. + WARNING: This type of loop is experimental as slothy has no knowledge about what happens inside the loop boundary! Especially, a register is written inside the boundary which may be used for renaming by slothy. 
Use with @@ -218,7 +219,7 @@ class CmpLoop(Loop): """ Loop ending in a compare and a branch. The modification to the value we compare against happens inside the loop body. - WARNING: This type of loop is experimental as slothy has no knowledge about + WARNING: This type of loop is experimental as slothy has no knowledge about what happens inside the loop boundary! Use with caution. Example: @@ -397,7 +398,7 @@ def __init__(self, *, mnemonic, self.flag = None self.width = None self.barrel = None - self.range = None + self.reg_list = None def extract_read_writes(self): """Extracts 'reads'/'writes' clauses from the source line of the instruction""" @@ -651,7 +652,11 @@ def pattern_i(i): index_pattern = "[0-9]+" width_pattern = "(?:\.w|\.n|)" barrel_pattern = "(?:lsl|ror|lsr|asr)\\\\s*" - range_pattern = "\{(?P[rs])(?P\\\\d+)-[rs](?P\\\\d+)\}" + + # reg_list is (,)* + # range is [rs]NN(-rsMM)? + range_pat = "([rs]\\\\d+)(-[rs](\\\\d+))?" + reg_list_pattern = "\{"+ range_pat + "(," + range_pat + ")*" +"\}" src = re.sub(" ", "\\\\s+", src) src = re.sub(",", "\\\\s*,\\\\s*", src) @@ -662,7 +667,7 @@ def pattern_i(i): src = replace_placeholders(src, "flag", flag_pattern, "flag") # TODO: Are any changes required for IT syntax? 
src = replace_placeholders(src, "width", width_pattern, "width") src = replace_placeholders(src, "barrel", barrel_pattern, "barrel") - src = replace_placeholders(src, "range", range_pattern, "range") + src = replace_placeholders(src, "reg_list", reg_list_pattern, "reg_list") src = r"\s*" + src + r"\s*(//.*)?\Z" return src @@ -789,6 +794,30 @@ def _instantiate_pattern(s, ty, arg, out): raise FatalParsingException(f"Failed to replace <{s}> by {rep} in {out}!") return res + @staticmethod + def _expand_reg_list(reg_list): + """Expanding list of registers that may contain ranges + Examples: + r1,r2,r3 + s1-s7 + r1-r3,r14 + """ + reg_list = reg_list.replace("{", "") + reg_list = reg_list.replace("}", "") + + reg_list_type = reg_list[0] + regs = [] + for reg_range in reg_list.split(","): + if "-" in reg_range: + start = reg_range.split("-")[0] + end = reg_range.split("-")[1] + start = int(start.replace(reg_list_type, "")) + end = int(end.replace(reg_list_type, "")) + regs += [f"{reg_list_type}{i}" for i in range(start, end+1)] + else: # not a range, just a register + regs += [reg_range] + return reg_list_type, regs + @staticmethod def build_core(obj, res): @@ -815,10 +844,7 @@ def group_name_i(i): group_to_attribute('flag', 'flag') group_to_attribute('width', 'width') group_to_attribute('barrel', 'barrel') - group_to_attribute('range', 'range') - group_to_attribute('range_start', 'range_start', int) - group_to_attribute('range_end', 'range_end', int) - group_to_attribute('range_type', 'range_type') + group_to_attribute('reg_list', 'reg_list') for s, ty in obj.pattern_inputs: if ty == RegisterType.FLAGS: @@ -891,7 +917,7 @@ def t_default(x): out = replace_pattern(out, "index", "index", str) out = replace_pattern(out, "width", "width", lambda x: x.lower()) out = replace_pattern(out, "barrel", "barrel", lambda x: x.lower()) - out = replace_pattern(out, "range", "range", lambda x: x.lower()) + out = replace_pattern(out, "reg_list", "reg_list", lambda x: x.lower()) out = 
out.replace("\\[", "[") out = out.replace("\\]", "]") @@ -1417,53 +1443,53 @@ def make(cls, src): return obj class ldm_interval(Armv7mLoadInstruction): # pylint: disable=missing-docstring,invalid-name - pattern = "ldm ," + pattern = "ldm ," inputs = ["Ra"] outputs = [] def write(self): - reg_from = self.args_out[0] - reg_to = self.args_out[-1] - self.range = f"{{{reg_from}-{reg_to}}}" + regs = ",".join(self.args_out) + self.reg_list = f"{{{regs}}}" return super().write() @classmethod def make(cls, src): obj = Armv7mLoadInstruction.build(cls, src) - reg_type = Armv7mInstruction._infer_register_type(obj.range_type) - num_regs = len(RegisterType.list_registers(reg_type)) - obj.increment = (obj.range_end-obj.range_start+1) * 4 # word sized loads - obj.args_out = [f"{obj.range_type}{i}" for i in range(obj.range_start, obj.range_end+1)] + reg_list_type, reg_list = Armv7mInstruction._expand_reg_list(obj.reg_list) + + obj.args_out = reg_list obj.num_out = len(obj.args_out) obj.arg_types_out = [RegisterType.GPR] * obj.num_out - obj.args_out_restrictions = [[ f"r{i+j}" for j in range(0, num_regs-obj.num_out)] for i in range(0, obj.num_out) ] - obj.args_out_combinations = [ ( list(range(0, obj.num_out)), [ [ f"r{i+j}" for i in range(0, obj.num_out)] for j in range(0, num_regs-obj.num_out) ] )] + available_regs = RegisterType.list_registers(RegisterType.GPR) + obj.args_out_combinations = [ (list(range(0, obj.num_out)), [list(a) for a in itertools.combinations(available_regs, obj.num_out)])] + obj.args_out_restrictions = [ None for _ in range(obj.num_out) ] return obj class ldm_interval_inc_writeback(Armv7mLoadInstruction): # pylint: disable=missing-docstring,invalid-name - pattern = "ldm !," + pattern = "ldm !," in_outs = ["Ra"] outputs = [] def write(self): - reg_from = self.args_out[0] - reg_to = self.args_out[-1] - self.range = f"{{{reg_from}-{reg_to}}}" + regs = ",".join(self.args_out) + self.reg_list = f"{{{regs}}}" return super().write() @classmethod def make(cls, src): 
obj = Armv7mLoadInstruction.build(cls, src) - reg_type = Armv7mInstruction._infer_register_type(obj.range_type) - num_regs = len(RegisterType.list_registers(reg_type)) - obj.increment = (obj.range_end-obj.range_start+1) * 4 # word sized loads - obj.args_out = [f"{obj.range_type}{i}" for i in range(obj.range_start, obj.range_end+1)] + reg_list_type, reg_list = Armv7mInstruction._expand_reg_list(obj.reg_list) + + obj.args_out = reg_list obj.num_out = len(obj.args_out) obj.arg_types_out = [RegisterType.GPR] * obj.num_out - obj.args_out_restrictions = [[ f"r{i+j}" for j in range(0, num_regs-obj.num_out)] for i in range(0, obj.num_out) ] - obj.args_out_combinations = [ ( list(range(0, obj.num_out)), [ [ f"r{i+j}" for i in range(0, obj.num_out)] for j in range(0, num_regs-obj.num_out) ] )] + obj.increment = obj.num_out * 4 + + available_regs = RegisterType.list_registers(RegisterType.GPR) + obj.args_out_combinations = [ (list(range(0, obj.num_out)), [list(a) for a in itertools.combinations(available_regs, obj.num_out)])] + obj.args_out_restrictions = [ None for _ in range(obj.num_out) ] return obj class vldr_with_imm(Armv7mLoadInstruction): # pylint: disable=missing-docstring,invalid-name @@ -1496,27 +1522,28 @@ def make(cls, src): return obj class vldm_interval_inc_writeback(Armv7mLoadInstruction): # pylint: disable=missing-docstring,invalid-name - pattern = "vldm !," + pattern = "vldm !," in_outs = ["Ra"] outputs = [] def write(self): - reg_from = self.args_out[0] - reg_to = self.args_out[-1] - self.range = f"{{{reg_from}-{reg_to}}}" + regs = ",".join(self.args_out) + self.reg_list = f"{{{regs}}}" return super().write() @classmethod def make(cls, src): obj = Armv7mLoadInstruction.build(cls, src) - reg_type = Armv7mInstruction._infer_register_type(obj.range_type) - num_regs = len(RegisterType.list_registers(reg_type)) - obj.increment = (obj.range_end-obj.range_start+1) * 4 # word sized loads - obj.args_out = [f"{obj.range_type}{i}" for i in range(obj.range_start, 
obj.range_end+1)] + reg_list_type, reg_list = Armv7mInstruction._expand_reg_list(obj.reg_list) + + obj.args_out = reg_list obj.num_out = len(obj.args_out) obj.arg_types_out = [RegisterType.FPR] * obj.num_out - obj.args_out_restrictions = [[ f"s{i+j}" for j in range(0, num_regs-obj.num_out)] for i in range(0, obj.num_out) ] - obj.args_out_combinations = [ ( list(range(0, obj.num_out)), [ [ f"s{i+j}" for i in range(0, obj.num_out)] for j in range(0, num_regs-obj.num_out) ] )] + obj.increment = obj.num_out * 4 + + available_regs = RegisterType.list_registers(RegisterType.FPR) + obj.args_out_combinations = [ (list(range(0, obj.num_out)), [list(a) for a in itertools.combinations(available_regs, obj.num_out)])] + obj.args_out_restrictions = [ None for _ in range(obj.num_out) ] return obj # Store @@ -1611,27 +1638,29 @@ def make(cls, src): return obj class stm_interval_inc_writeback(Armv7mLoadInstruction): # pylint: disable=missing-docstring,invalid-name - pattern = "stm !," + pattern = "stm !," in_outs = ["Ra"] outputs = [] def write(self): - reg_from = self.args_in[0] - reg_to = self.args_in[-1] - self.range = f"{{{reg_from}-{reg_to}}}" + regs = ",".join(self.args_out) + self.reg_list = f"{{{regs}}}" return super().write() @classmethod def make(cls, src): obj = Armv7mLoadInstruction.build(cls, src) - reg_type = Armv7mInstruction._infer_register_type(obj.range_type) - num_regs = len(RegisterType.list_registers(reg_type)) - obj.increment = (obj.range_end-obj.range_start+1) * 4 # word sized loads - obj.args_in = [f"{obj.range_type}{i}" for i in range(obj.range_start, obj.range_end+1)] + + reg_list_type, reg_list = Armv7mInstruction._expand_reg_list(obj.reg_list) + + obj.args_in = reg_list obj.num_in = len(obj.args_in) obj.arg_types_in = [RegisterType.GPR] * obj.num_in - obj.args_in_restrictions = [[ f"r{i+j}" for j in range(0, num_regs-obj.num_in)] for i in range(0, obj.num_in) ] - obj.args_in_combinations = [ ( list(range(0, obj.num_in)), [ [ f"r{i+j}" for i in range(0, 
obj.num_in)] for j in range(0, num_regs-obj.num_in) ] )] + obj.increment = obj.num_in * 4 + + available_regs = RegisterType.list_registers(RegisterType.GPR) + obj.args_in_combinations = [ (list(range(0, obj.num_in)), [list(a) for a in itertools.combinations(available_regs, obj.num_in)])] + obj.args_in_restrictions = [ None for _ in range(obj.num_in) ] return obj # Other class cmp(Armv7mBasicArithmetic): # pylint: disable=missing-docstring,invalid-name diff --git a/slothy/targets/arm_v7m/cortex_m7.py b/slothy/targets/arm_v7m/cortex_m7.py index 33c9d5f3..57cc8f11 100644 --- a/slothy/targets/arm_v7m/cortex_m7.py +++ b/slothy/targets/arm_v7m/cortex_m7.py @@ -313,7 +313,7 @@ def get_latency(src, out_idx, dst): # Load and store multiples take a long time to complete if instclass_src in [ldm_interval, ldm_interval_inc_writeback, stm_interval_inc_writeback, vldm_interval_inc_writeback]: - latency = (src.range_end - src.range_start) + 1 + latency = src.num_out # Can always store result in the same cycle # TODO: double-check this @@ -352,6 +352,6 @@ def evaluate_immediate(string_expr): def get_inverse_throughput(src): itp = lookup_multidict(inverse_throughput, src) if find_class(src) in [ldm_interval, ldm_interval_inc_writeback, stm_interval_inc_writeback, vldm_interval_inc_writeback]: - itp = (src.range_end - src.range_start) + 1 + itp = src.num_out return itp