From 01c567f53c04cb91d0af7e66a0db47e33dae261d Mon Sep 17 00:00:00 2001 From: SharzyL Date: Wed, 31 Jan 2024 18:23:07 +0800 Subject: [PATCH 1/2] [build system] dedup run-test.py --- scripts/run-test.py | 175 +++++++++++++++++++++----------------------- 1 file changed, 83 insertions(+), 92 deletions(-) diff --git a/scripts/run-test.py b/scripts/run-test.py index 8b951c005..3efb848e3 100755 --- a/scripts/run-test.py +++ b/scripts/run-test.py @@ -17,13 +17,50 @@ def main(): parser = ArgumentParser() - subparsers = parser.add_subparsers(help="sub-commands help") + subparsers = parser.add_subparsers(help="sub-commands help", required=True) - # Set verilator emulator arg handler - verilator_args_parser = subparsers.add_parser( - "verilate", help="Run verilator emulator" - ) - verilator_args_parser.add_argument("case", help="name alias for loading test case") + # Add sub-commands + verilator_args_parser = subparsers.add_parser("verilate", help="ip emulator help") # TODO: rename to ip + verilator_args_parser.set_defaults(func=run_ip) + soc_args_parser = subparsers.add_parser("soc", help="soc emulator help") + soc_args_parser.set_defaults(func=run_soc) + + # Register common args + for subparser in (verilator_args_parser, soc_args_parser): + subparser.add_argument("case", help="Case name alias or a path to ELF file") + subparser.add_argument( + "-c", + "--config", + default="v1024-l8-b2", + help="config name, as filename in ./configs. default to v1024-l8-b2", + ) + subparser.add_argument( + "--trace", action="store_true", help="enable trace file dumping" + ) + subparser.add_argument( + "--emulator-path", + default=None, + help="path to the soc emulator, use nix generated one if unspecified", + ) + subparser.add_argument( + "--cases-dir", help="path to testcases, default to TEST_CASES_DIR environment" + ) + subparser.add_argument( + "--use-individual-drv", help="use .#t1.rvv-testcases.. 
instead of .#t1.rvv-testcases.all", + action="store_true", + ) + subparser.add_argument( + "--out-dir", + default=None, + help="path to save results", # TODO: give a consistent behavior for both verilate and soc emulator + ) + subparser.add_argument( + "--base-out-dir", + default=None, + help="save result files in {base_out_dir}/{config}/{case}", + ) + + # Register verilator emulator args verilator_args_parser.add_argument( "-d", "--dramsim3-cfg", @@ -36,15 +73,6 @@ def main(): default=2000, type=float, ) - verilator_args_parser.add_argument( - "-c", - "--config", - default="v1024-l8-b2", - help="configuration name, as filenames in ./configs", - ) - verilator_args_parser.add_argument( - "--trace", action="store_true", help="use emulator with trace support" - ) verilator_args_parser.add_argument( "--cosim-timeout", default=100000, help="set cosim timeout" ) @@ -69,95 +97,46 @@ def main(): help="prevent emulator print log to console", ) - verilator_args_parser.add_argument( - "--cases-dir", help="path to testcases, default to TEST_CASES_DIR environment" - ) - verilator_args_parser.add_argument( - "--out-dir", default=None, help="path to save wave file and perf result file" - ) - verilator_args_parser.add_argument( - "--base-out-dir", - default=None, - help="save result files in {base_out_dir}/{config}/{case}", - ) - verilator_args_parser.add_argument( - "--emulator-path", default=None, help="path to emulator" - ) - # Set verilator emulator args handler - verilator_args_parser.set_defaults(func=run_verilator_emulator) - - # Set soc runner arg handler - soc_args_parser = subparsers.add_parser("soc", help="Run soc emulator") - soc_args_parser.add_argument("case", help="Case name alias or a path to ELF file") - soc_args_parser.add_argument( - "-c", - "--config", - default="v1024-l8-b2", - help="config name, as filename in ./configs. 
default to v1024-l8-b2", - ) - soc_args_parser.add_argument( - "--output-dir", - default=None, - help="path to save results, default to ./testrun/soc-emulator///", - ) + # Register soc emulator args soc_args_parser.add_argument( - "--trace", action="store_true", help="enable trace file dumping" - ) - soc_args_parser.add_argument( - "--trace-output-file", + "--trace-out-file", default="None", help="path for storing trace file, default to /trace.fst", ) - soc_args_parser.add_argument( - "--emulator-path", - default=None, - help="path to the soc emulator, default using nix generated one", - ) - soc_args_parser.add_argument( - "--cases-dir", help="path to testcases, default to TEST_CASES_DIR environment" - ) - - # Set soc args handler - soc_args_parser.set_defaults(func=run_soc) # Run args = parser.parse_args() args.func(args) -def run_verilator_emulator(args): - if args.verbose: - logger.setLevel(logging.DEBUG) - else: - logger.setLevel(logging.INFO) - - if args.out_dir is None: - if args.base_out_dir is not None: - args.out_dir = f"{args.base_out_dir}/{args.config}/{args.case}" - else: - args.out_dir = f"./testrun/{args.config}/{args.case}" - Path(args.out_dir).mkdir(exist_ok=True, parents=True) - - execute_verilator_emulator(args) - - # Try to search ELF from the given directory -def load_elf_from_dir(cases_dir, case_name): +def load_elf_from_dir(cases_dir, case_name, use_individual_drv=False): if cases_dir is None: if env_case_dir := os.environ.get("TEST_CASES_DIR"): cases_dir = env_case_dir else: - cases_dir = ( - subprocess.check_output( - "nix build .#t1.rvv-testcases.all --max-jobs 16 --no-link --print-out-paths".split() + if use_individual_drv: + split_idx = case_name.rfind('-') + case_true_name, case_type = case_name[:split_idx], case_name[split_idx+1:] + cases_dir = ( + subprocess.check_output( + f"nix build .#t1.rvv-testcases.{case_type}.{case_true_name} --max-jobs 16 --no-link --print-out-paths".split() + ) + .strip() + .decode("UTF-8") + ) + else: + 
cases_dir = ( + subprocess.check_output( + "nix build .#t1.rvv-testcases.all --max-jobs 16 --no-link --print-out-paths".split() + ) + .strip() + .decode("UTF-8") ) - .strip() - .decode("UTF-8") - ) cases_dir = Path(cases_dir) - case_config_path = cases_dir / "configs" / f"{case_name}.json" + case_config_path = cases_dir / f"{case_name}.json" if use_individual_drv else cases_dir / "configs" / f"{case_name}.json" assert case_config_path.exists(), f"cannot find case config in {case_config_path}" config = json.loads(case_config_path.read_text()) @@ -167,11 +146,23 @@ def load_elf_from_dir(cases_dir, case_name): return case_elf_path -def execute_verilator_emulator(args): +def run_ip(args): + if args.verbose: + logger.setLevel(logging.DEBUG) + else: + logger.setLevel(logging.INFO) + + if args.out_dir is None: + if args.base_out_dir is not None: + args.out_dir = f"{args.base_out_dir}/{args.config}/{args.case}" + else: + args.out_dir = f"./testrun/{args.config}/{args.case}" + Path(args.out_dir).mkdir(exist_ok=True, parents=True) + case_elf_path = ( args.case if Path(args.case).exists() - else load_elf_from_dir(args.cases_dir, args.case) + else load_elf_from_dir(args.cases_dir, args.case, args.use_individual_drv) ) dramsim3_cfg = args.dramsim3_cfg @@ -267,16 +258,16 @@ def run_soc(args): elf_path = ( args.case if Path(args.case).exists() - else load_elf_from_dir(args.cases_dir, args.case) + else load_elf_from_dir(args.cases_dir, args.case, args.use_individual_drv) ) process_args.append(f"+init_file={elf_path}") elf_filename = os.path.splitext(os.path.basename(elf_path))[0] - if args.output_dir is None: - args.output_dir = f"./testrun/soc-emulator/{args.config}/{elf_filename}/" - logger.info(f"Output dir set to {args.output_dir}") + if args.out_dir is None: + args.out_dir = f"./testrun/soc-emulator/{args.config}/{elf_filename}/" + logger.info(f"Output dir set to {args.out_dir}") - trace_filepath = args.trace_output_file or f"{args.trace_output_dir}/trace.fst" + 
trace_filepath = args.trace_out_file or f"{args.out_dir}/trace.fst" process_args.append( f"+trace_file={trace_filepath}" if args.trace From 689e822318c756608ebe467662d51bf5d7dc5d48 Mon Sep 17 00:00:00 2001 From: SharzyL Date: Wed, 31 Jan 2024 18:23:45 +0800 Subject: [PATCH 2/2] [WIP] [testcases] add mmm_mem cases --- tests/asm/mmm_mem/default.nix | 9 + tests/asm/mmm_mem/mmm.S | 343 ++++++++++++++++++++++ tests/asm/mmm_mem/mmm.c | 32 ++ tests/asm/mmm_mem_scratchpad/default.nix | 9 + tests/asm/mmm_mem_scratchpad/mmm.S | 353 +++++++++++++++++++++++ tests/asm/mmm_mem_scratchpad/mmm.c | 32 ++ 6 files changed, 778 insertions(+) create mode 100644 tests/asm/mmm_mem/default.nix create mode 100644 tests/asm/mmm_mem/mmm.S create mode 100644 tests/asm/mmm_mem/mmm.c create mode 100644 tests/asm/mmm_mem_scratchpad/default.nix create mode 100644 tests/asm/mmm_mem_scratchpad/mmm.S create mode 100644 tests/asm/mmm_mem_scratchpad/mmm.c diff --git a/tests/asm/mmm_mem/default.nix b/tests/asm/mmm_mem/default.nix new file mode 100644 index 000000000..1f9515b90 --- /dev/null +++ b/tests/asm/mmm_mem/default.nix @@ -0,0 +1,9 @@ +{ testcase-env }: +testcase-env.mkAsmCase { + caseName = "mmm_mem"; + srcs = [ + ./mmm.S + ./mmm.c + ../main.S + ]; +} diff --git a/tests/asm/mmm_mem/mmm.S b/tests/asm/mmm_mem/mmm.S new file mode 100644 index 000000000..6999596ef --- /dev/null +++ b/tests/asm/mmm_mem/mmm.S @@ -0,0 +1,343 @@ +.text +.balign 16 +.globl mmm +.type mmm,@function +# assume VLEN >= 128, BN = 4096, SEW = 16 * 2 = 32 +# we only support LMUL = 1 for now +# P, A, B, AB should have 260 elements +mmm: + # quite SIMD + li t0, 4 # in case way > 31 + vsetvli zero, t0, e32, m1, ta, ma + # stride + li t1, 260 + # start loop of niter + 1 times + li t4,0 +1: + # AB = B_i*A + AB + # !!!!!! important: lw here assumes SEW = 32 + # T0 is used in vmacc, do not use for temp now!
+ lw t0, 0(a2) + addi a2, a2, 4 # advance B by a SEW + + # carry for ABV_0 + vmv.v.i v30,0 + # loop variable + li t5,0 +2: + # load one group of values from arg + # offset of one group + # !!! important: assume nreg = 8 and sew = 32 + # log(8) + log(32/8) = 5 + slli t2,t5,5 + add t3,t2,a1 + vlsseg8e32.v v10, (t3), t1 + add t3,t2,a0 + vlsseg8e32.v v20, (t3), t1 + vmacc.vx v20, t0, v10 + vmacc.vx v21, t0, v11 + vmacc.vx v22, t0, v12 + vmacc.vx v23, t0, v13 + vmacc.vx v24, t0, v14 + vmacc.vx v25, t0, v15 + vmacc.vx v26, t0, v16 + vmacc.vx v27, t0, v17 + # store one group of AB + vssseg8e32.v v20, (t3), t1 + addi t5,t5,1 + # reuse T0 for special treatment + li t2,8 + bne t5,t2,2b + # load one group of values from arg + # offset of one group + # !!! important: assume nreg = 8 and sew = 32 + # log(8) + log(32/8) = 5 + slli t2,t5,5 + add t3,t2,a1 + vlse32.v v10, (t3), t1 + add t3,t2,a0 + vlse32.v v20, (t3), t1 + vmacc.vx v20, t0, v10 + # store one group of AB + vsse32.v v20, (t3), t1 + # start loop of niter + 1 times + # use T2 as outer loop index + li t2,0 +9: + # mask + # set TV2 for every propagate() + # set TV2 every time (see slide1up below) + li t0,65535 + vmv.v.x v31,t0 + + # carry for ABV_0 + vmv.v.i v30,0 + + # loop variable + li t5,0 +10: + # load one group of values from arg + # offset of one group + # !!! 
important: assume nreg = 8 and sew = 32 + # log(8) + log(32/8) = 5 + slli t3,t5,5 + add t3,t3,a0 + vlsseg8e32.v v20, (t3), t1 + vadd.vv v20, v20, v30 + # save carry in TV + vsrl.vi v30, v20, 16 + # mod 2 ** 16 + vand.vv v20, v20, v31 + vadd.vv v21, v21, v30 + # save carry in TV + vsrl.vi v30, v21, 16 + # mod 2 ** 16 + vand.vv v21, v21, v31 + vadd.vv v22, v22, v30 + # save carry in TV + vsrl.vi v30, v22, 16 + # mod 2 ** 16 + vand.vv v22, v22, v31 + vadd.vv v23, v23, v30 + # save carry in TV + vsrl.vi v30, v23, 16 + # mod 2 ** 16 + vand.vv v23, v23, v31 + vadd.vv v24, v24, v30 + # save carry in TV + vsrl.vi v30, v24, 16 + # mod 2 ** 16 + vand.vv v24, v24, v31 + vadd.vv v25, v25, v30 + # save carry in TV + vsrl.vi v30, v25, 16 + # mod 2 ** 16 + vand.vv v25, v25, v31 + vadd.vv v26, v26, v30 + # save carry in TV + vsrl.vi v30, v26, 16 + # mod 2 ** 16 + vand.vv v26, v26, v31 + vadd.vv v27, v27, v30 + # save carry in TV + vsrl.vi v30, v27, 16 + # mod 2 ** 16 + vand.vv v27, v27, v31 + # store one group of AB + vssseg8e32.v v20, (t3), t1 + + addi t5,t5,1 + li t0,8 + bne t5,t0,10b + # load last group of values from arg + # offset of last group + # !!! important: assume nreg = 8 and sew = 32 + # log(8) + log(32/8) = 5 + # LOOP2 is now ngroup - 1 + slli t3,t5,5 + add t3,t3,a0 + vlse32.v v20, (t3), t1 + vadd.vv v20, v20, v30 + # save carry in TV + vsrl.vi v30, v20, 16 + # mod 2 ** 16 + vand.vv v20, v20, v31 + # store last group of AB + vsse32.v v20, (t3), t1 + + # update carry of AB_{ntotalreg - 1} to AB_0 + vlse32.v v20, (a0), t1 + vslide1up.vx v31, v30, zero + vadd.vv v20, v20, v31 + vsse32.v v20, (a0), t1 + addi t2,t2,1 + li t0,4 + bne t2,t0,9b + # !!!!!! important: lw here assumes SEW = 32 + # T0 is used in vmacc, do not use for temp now! + lw t0, 0(a0) + mul t0, t0, a4 + # mod 2 ** 16 + # !!!! 
important: here we assume SEW = 32 and XLEN = 64 + sll t0, t0, 16 + srl t0, t0, 16 + + # loop variable + li t5,0 +2: + # load one group of values from arg + # offset of one group + # !!! important: assume nreg = 8 and sew = 32 + # log(8) + log(32/8) = 5 + slli t2,t5,5 + add t3,t2,a3 + vlsseg8e32.v v0, (t3), t1 + add t3,t2,a0 + vlsseg8e32.v v20, (t3), t1 + vmacc.vx v20, t0, v0 + vmacc.vx v21, t0, v1 + vmacc.vx v22, t0, v2 + vmacc.vx v23, t0, v3 + vmacc.vx v24, t0, v4 + vmacc.vx v25, t0, v5 + vmacc.vx v26, t0, v6 + vmacc.vx v27, t0, v7 + # store one group of AB + vssseg8e32.v v20, (t3), t1 + addi t5,t5,1 + # reuse T0 for special treatment + li t2,8 + bne t5,t2,2b + # load one group of values from arg + # offset of one group + # !!! important: assume nreg = 8 and sew = 32 + # log(8) + log(32/8) = 5 + slli t2,t5,5 + add t3,t2,a3 + vlse32.v v0, (t3), t1 + add t3,t2,a0 + vlse32.v v20, (t3), t1 + vmacc.vx v20, t0, v0 + # store one group of AB + vsse32.v v20, (t3), t1 + # start loop of niter + 1 times + # use T2 as outer loop index + li t2,0 +9: + # mask + # set TV2 for every propagate() + # set TV2 every time (see slide1up below) + li t0,65535 + vmv.v.x v31,t0 + + # carry for ABV_0 + vmv.v.i v30,0 + + # loop variable + li t5,0 +10: + # load one group of values from arg + # offset of one group + # !!! 
important: assume nreg = 8 and sew = 32 + # log(8) + log(32/8) = 5 + slli t3,t5,5 + add t3,t3,a0 + vlsseg8e32.v v20, (t3), t1 + vadd.vv v20, v20, v30 + # save carry in TV + vsrl.vi v30, v20, 16 + # mod 2 ** 16 + vand.vv v20, v20, v31 + vadd.vv v21, v21, v30 + # save carry in TV + vsrl.vi v30, v21, 16 + # mod 2 ** 16 + vand.vv v21, v21, v31 + vadd.vv v22, v22, v30 + # save carry in TV + vsrl.vi v30, v22, 16 + # mod 2 ** 16 + vand.vv v22, v22, v31 + vadd.vv v23, v23, v30 + # save carry in TV + vsrl.vi v30, v23, 16 + # mod 2 ** 16 + vand.vv v23, v23, v31 + vadd.vv v24, v24, v30 + # save carry in TV + vsrl.vi v30, v24, 16 + # mod 2 ** 16 + vand.vv v24, v24, v31 + vadd.vv v25, v25, v30 + # save carry in TV + vsrl.vi v30, v25, 16 + # mod 2 ** 16 + vand.vv v25, v25, v31 + vadd.vv v26, v26, v30 + # save carry in TV + vsrl.vi v30, v26, 16 + # mod 2 ** 16 + vand.vv v26, v26, v31 + vadd.vv v27, v27, v30 + # save carry in TV + vsrl.vi v30, v27, 16 + # mod 2 ** 16 + vand.vv v27, v27, v31 + # store one group of AB + vssseg8e32.v v20, (t3), t1 + + addi t5,t5,1 + li t0,8 + bne t5,t0,10b + # load last group of values from arg + # offset of last group + # !!! important: assume nreg = 8 and sew = 32 + # log(8) + log(32/8) = 5 + # LOOP2 is now ngroup - 1 + slli t3,t5,5 + add t3,t3,a0 + vlse32.v v20, (t3), t1 + vadd.vv v20, v20, v30 + # save carry in TV + vsrl.vi v30, v20, 16 + # mod 2 ** 16 + vand.vv v20, v20, v31 + # store last group of AB + vsse32.v v20, (t3), t1 + + # update carry of AB_{ntotalreg - 1} to AB_0 + vlse32.v v20, (a0), t1 + vslide1up.vx v31, v30, zero + vadd.vv v20, v20, v31 + vsse32.v v20, (a0), t1 + addi t2,t2,1 + li t0,4 + bne t2,t0,9b + # update carry of AB_{ntotalreg - 1} to AB_0 + # since we need to subtract AB_0 + vlse32.v v20, (a0), t1 + # AB / word + vslide1down.vx v30, v20, zero + # do not need vsse now + # just store it in TV for move + # loop variable + li t5,0 +2: + # load one offset group of values from arg + # offset of one group + # !!!
important: assume nreg = 8 and sew = 32 + # log(8) + log(32/8) = 5 + slli t2,t5,5 + + # then offset by 1 element + addi t2,t2,4 + add t3,t2,a0 + vlsseg8e32.v v20, (t3), t1 + + # back to original offset + addi t3,t3,-4 + vssseg8e32.v v20, (t3), t1 + + addi t5,t5,1 + li t2,8 + bne t5,t2,2b + # load last group of values from arg + # offset of last group + # !!! important: assume nreg = 8 and sew = 32 + # log(8) + log(32/8) = 5 + # LOOP2 is now ngroup - 1 + slli t2,t5,5 + # then offset by 1 element + addi t2,t2,4 + add t3,t2,a0 + # move AB_0 to AB_{ntotalreg-1} + vmv.v.v v20, v30 + + # back to original offset + addi t3,t3,-4 + vsse32.v v20, (t3), t1 + addi t4,t4,1 + li t0,257 + bne t4,t0,1b + + ret diff --git a/tests/asm/mmm_mem/mmm.c b/tests/asm/mmm_mem/mmm.c new file mode 100644 index 000000000..77280a564 --- /dev/null +++ b/tests/asm/mmm_mem/mmm.c @@ -0,0 +1,32 @@ +#include +#include +#include + +const int vl = 128; + +const int max_bits = 4096; +const int word_bits = 16; +const int way = vl / word_bits / 2; // 4 +const int s = 65; // ceil((max_bits / word_bits + 1) / way) + +// 4096 / 16 = 64 +// 64 + 4 comes from s * way = 260 +const uint32_t a[256 + 4] = {0xFFFF, 0xFFFF}; +const uint32_t b[256 + 4] = {0xFFFF, 0xFFFF}; +// for mmm_mem.pl, we need to ensure abr1 = 0 +uint32_t abr1[256 + 4] = {0}; +const uint32_t p[256 + 4] = {0xa285, 0xfbcb, 0x2b62, 0xc04c, 0x08e6, 0x5b0b, 0x44c7, 0x7403, 0x1291, 0x1a5f, 0x3100, 0x34d4, 0x7006, 0xa589, 0x5ee5, 0xa841, 0xc2ef, 0x5c18, 0x70c9, 0x3b54, 0x2d96, 0xb89c, 0xaff1, 0x466a, 0x9efa, 0xb6dd, 0x7749, 0x4bbc, 0x38b3, 0x93fc, 0xa629, 0x3f00, 0x88b9, 0xcbb2, 0xa694, 0x1a52, 0xc96b, 0xbe5d, 0x2fe8, 0xadfd, 0xa394, 0x57ed, 0x4b79, 0x9b5f, 0xd37e, 0xa94f, 0x5559, 0xea7d, 0xd1a9, 0x753b, 0xe079, 0x81de, 0xfe08, 0x738a, 0x9438, 0x845c, 0x3358, 0x7ab4, 0xd10c, 0xfe88, 0xfc76, 0x81cb, 0xfd86, 0x0eb3, 0x7887, 0x64e1, 0x7a8f, 0x5dd8, 0xb6df, 0x97cd, 0x01af, 0x0b77, 0xb51b, 0xb900, 0xd6ba, 0xd89b, 0xe71c, 0x8eba, 0xc1e7, 0x91b1, 0xb0db, 
0x0c73, 0xe5e2, 0x1df4, 0x87c7, 0xd94a, 0x57bb, 0xcb1f, 0x85ae, 0xbd7e, 0xb359, 0x753c, 0x41e3, 0xb515, 0x9896, 0xdddd, 0x8ee9, 0x90b1, 0xdd10, 0x0d41, 0x5496, 0x7aef, 0xfaf3, 0x5ff1, 0x249c, 0x9549, 0xd7cf, 0xab30, 0x5a92, 0x6532, 0x9f61, 0x471d, 0x212f, 0x7ad1, 0x7034, 0x4fe8, 0x7a6b, 0x793d, 0x09d8, 0x1dcd, 0xb60b, 0xe2fe, 0x01e0, 0xd2db, 0xea7f, 0xf5d0, 0x7759, 0x2242, 0xe701, 0x1a09, 0xf53a, 0x710f, 0x5224, 0x56f4, 0xf439, 0xb1e3, 0xffdd, 0xac3b, 0x78ee, 0x4ade, 0x20a5, 0x962d, 0x6fa4, 0x1f4c, 0x13e6, 0x4476, 0x9d8f, 0x2ea2, 0x2769, 0xd6d6, 0x88a1, 0xf82d, 0x2540, 0xb340, 0x03de, 0x7565, 0x7418, 0xd207, 0x55e1, 0x0c8d, 0x3547, 0xf16b, 0xf073, 0x0c77, 0xe3db, 0xc58b, 0x7073, 0xb415, 0x7f44, 0xa294, 0xdae5, 0x69e4, 0x76cf, 0x169e, 0xa3fc, 0x9412, 0x0795, 0x80b3, 0x3b23, 0xede4, 0xa2e5, 0x7a61, 0xbe6f, 0x0b36, 0x033a, 0x1a06, 0x4b08, 0x3232, 0x1a5b, 0xb722, 0x4ee3, 0x12ab, 0xe4d9, 0xe94c, 0x9e3c, 0xcf07, 0x16c0, 0x3dee, 0xb38c, 0x067c, 0x0c58, 0x7868, 0x3705, 0x915b, 0x5084, 0x6db6, 0x0a94, 0x3c3a, 0x0f02, 0x123c, 0xfcc9, 0xa81f, 0x9a09, 0xa3e5, 0xbd77, 0x324b, 0x4b30, 0x501f, 0xf4c3, 0xd3f8, 0x659c, 0x3cc0, 0xf7d4, 0x2fc2, 0xe326, 0x276d, 0xca2f, 0xbb0d, 0x04a4, 0x814f, 0x8db7, 0xf4e7, 0x467f, 0x0686, 0x42ff, 0xb319, 0xaf84, 0xf027, 0x0ff5, 0x5558, 0x5565, 0xe9fd, 0x4573, 0x873d, 0x3491, 0xa800, 0xbc9d, 0x484f, 0x34bd, 0x34bf, 0x6d55, 0xb44e, 0x5eb5, 0xc68c, 0x50a7, 0xc996}; +const uint32_t mu = 0xf9b3; + +// 32 = 2 * word_bits +// mu is of 16 bits +// R is 2 ** (max_bits + word_bits) +void mmm(uint32_t* r, const uint32_t* a, const uint32_t* b, const uint32_t* p, const uint32_t mu); + +int test() { + mmm(abr1, a, b, p, mu); + // for(int i = 0; i != 260; ++i) { + // printf("%04lX ", abr1[i]); + // } + return 0; +} diff --git a/tests/asm/mmm_mem_scratchpad/default.nix b/tests/asm/mmm_mem_scratchpad/default.nix new file mode 100644 index 000000000..5d8e02562 --- /dev/null +++ b/tests/asm/mmm_mem_scratchpad/default.nix @@ -0,0 +1,9 @@ +{ testcase-env }: 
+testcase-env.mkAsmCase { + caseName = "mmm_mem_scratchpad"; + srcs = [ + ./mmm.S + ./mmm.c + ../main.S + ]; +} diff --git a/tests/asm/mmm_mem_scratchpad/mmm.S b/tests/asm/mmm_mem_scratchpad/mmm.S new file mode 100644 index 000000000..b3005278f --- /dev/null +++ b/tests/asm/mmm_mem_scratchpad/mmm.S @@ -0,0 +1,353 @@ +.text +.balign 16 +.globl mmm +.type mmm,@function +# assume VLEN >= 128, BN = 4096, SEW = 16 * 2 = 32 +# we only support LMUL = 1 for now +# P, A, B, AB should have 260 elements +mmm: + # quite SIMD + li t0, 4 # in case way > 31 + vsetvli zero, t0, e32, m1, ta, ma + # stride + li t1, 260 + # start loop of niter + 1 times + li t4,0 +1: + # AB = B_i*A + AB + # !!!!!! important: lw here assumes SEW = 32 + # T0 is used in vmacc, do not use for temp now! + lw t0, 0(a2) + addi a2, a2, 4 # advance B by a SEW + + # carry for ABV_0 + vmv.v.i v30,0 + # loop variable + li t5,0 +2: + # load one group of values from arg + # offset of one group + # !!! important: assume nreg = 8 and sew = 32 + # log(8) + log(32/8) = 5 + slli t2,t5,5 + add t3,t2,a1 + vlsseg8e32.v v10, (t3), t1 + add t3,t2,a0 + vlsseg8e32.v v20, (t3), t1 + vmacc.vx v20, t0, v10 + vmacc.vx v21, t0, v11 + vmacc.vx v22, t0, v12 + vmacc.vx v23, t0, v13 + vmacc.vx v24, t0, v14 + vmacc.vx v25, t0, v15 + vmacc.vx v26, t0, v16 + vmacc.vx v27, t0, v17 + # store one group of AB + vssseg8e32.v v20, (t3), t1 + addi t5,t5,1 + # reuse T0 for special treatment + li t2,8 + bne t5,t2,2b + # load one group of values from arg + # offset of one group + # !!! 
important: assume nreg = 8 and sew = 32 + # log(8) + log(32/8) = 5 + slli t2,t5,5 + add t3,t2,a1 + vlse32.v v10, (t3), t1 + add t3,t2,a0 + vlse32.v v20, (t3), t1 + vmacc.vx v20, t0, v10 + # store one group of AB + vsse32.v v20, (t3), t1 + # start loop of niter + 1 times + # use T2 as outer loop index + li t2,0 +9: + # mask + # set TV2 for every propagate() + # set TV2 every time (see slide1up below) + li t0,65535 + vmv.v.x v31,t0 + + # carry for ABV_0 + vmv.v.i v30,0 + + # loop variable + li t5,0 +10: + # load one group of values from arg + # offset of one group + # !!! important: assume nreg = 8 and sew = 32 + # log(8) + log(32/8) = 5 + slli t3,t5,5 + add t3,t3,a0 + vlsseg8e32.v v20, (t3), t1 + vadd.vv v20, v20, v30 + # save carry in TV + vsrl.vi v30, v20, 16 + # mod 2 ** 16 + vand.vv v20, v20, v31 + vadd.vv v21, v21, v30 + # save carry in TV + vsrl.vi v30, v21, 16 + # mod 2 ** 16 + vand.vv v21, v21, v31 + vadd.vv v22, v22, v30 + # save carry in TV + vsrl.vi v30, v22, 16 + # mod 2 ** 16 + vand.vv v22, v22, v31 + vadd.vv v23, v23, v30 + # save carry in TV + vsrl.vi v30, v23, 16 + # mod 2 ** 16 + vand.vv v23, v23, v31 + vadd.vv v24, v24, v30 + # save carry in TV + vsrl.vi v30, v24, 16 + # mod 2 ** 16 + vand.vv v24, v24, v31 + vadd.vv v25, v25, v30 + # save carry in TV + vsrl.vi v30, v25, 16 + # mod 2 ** 16 + vand.vv v25, v25, v31 + vadd.vv v26, v26, v30 + # save carry in TV + vsrl.vi v30, v26, 16 + # mod 2 ** 16 + vand.vv v26, v26, v31 + vadd.vv v27, v27, v30 + # save carry in TV + vsrl.vi v30, v27, 16 + # mod 2 ** 16 + vand.vv v27, v27, v31 + # store one group of AB + vssseg8e32.v v20, (t3), t1 + + addi t5,t5,1 + li t0,8 + bne t5,t0,10b + # load last group of values from arg + # offset of last group + # !!! 
important: assume nreg = 8 and sew = 32 + # log(8) + log(32/8) = 5 + # LOOP2 is now ngroup - 1 + slli t3,t5,5 + add t3,t3,a0 + vlse32.v v20, (t3), t1 + vadd.vv v20, v20, v30 + # save carry in TV + vsrl.vi v30, v20, 16 + # mod 2 ** 16 + vand.vv v20, v20, v31 + # store last group of AB + vsse32.v v20, (t3), t1 + + # update carry of AB_{ntotalreg - 1} to AB_0 + vlse32.v v20, (a0), t1 + addi t0, a5, 4 + vse32.v v30, (t0) + vle32.v v31, (a5) + + vadd.vv v20, v20, v31 + vsse32.v v20, (a0), t1 + addi t2,t2,1 + li t0,4 + bne t2,t0,9b + # !!!!!! important: lw here assumes SEW = 32 + # T0 is used in vmacc, do not use for temp now! + lw t0, 0(a0) + mul t0, t0, a4 + # mod 2 ** 16 + # !!!! important: here we assume SEW = 32 and XLEN = 64 + sll t0, t0, 16 + srl t0, t0, 16 + + # loop variable + li t5,0 +2: + # load one group of values from arg + # offset of one group + # !!! important: assume nreg = 8 and sew = 32 + # log(8) + log(32/8) = 5 + slli t2,t5,5 + add t3,t2,a3 + vlsseg8e32.v v0, (t3), t1 + add t3,t2,a0 + vlsseg8e32.v v20, (t3), t1 + vmacc.vx v20, t0, v0 + vmacc.vx v21, t0, v1 + vmacc.vx v22, t0, v2 + vmacc.vx v23, t0, v3 + vmacc.vx v24, t0, v4 + vmacc.vx v25, t0, v5 + vmacc.vx v26, t0, v6 + vmacc.vx v27, t0, v7 + # store one group of AB + vssseg8e32.v v20, (t3), t1 + addi t5,t5,1 + # reuse T0 for special treatment + li t2,8 + bne t5,t2,2b + # load one group of values from arg + # offset of one group + # !!! 
important: assume nreg = 8 and sew = 32 + # log(8) + log(32/8) = 5 + slli t2,t5,5 + add t3,t2,a3 + vlse32.v v0, (t3), t1 + add t3,t2,a0 + vlse32.v v20, (t3), t1 + vmacc.vx v20, t0, v0 + # store one group of AB + vsse32.v v20, (t3), t1 + # start loop of niter + 1 times + # use T2 as outer loop index + li t2,0 +9: + # mask + # set TV2 for every propagate() + # set TV2 every time (see slide1up below) + li t0,65535 + vmv.v.x v31,t0 + + # carry for ABV_0 + vmv.v.i v30,0 + + # loop variable + li t5,0 +10: + # load one group of values from arg + # offset of one group + # !!! important: assume nreg = 8 and sew = 32 + # log(8) + log(32/8) = 5 + slli t3,t5,5 + add t3,t3,a0 + vlsseg8e32.v v20, (t3), t1 + vadd.vv v20, v20, v30 + # save carry in TV + vsrl.vi v30, v20, 16 + # mod 2 ** 16 + vand.vv v20, v20, v31 + vadd.vv v21, v21, v30 + # save carry in TV + vsrl.vi v30, v21, 16 + # mod 2 ** 16 + vand.vv v21, v21, v31 + vadd.vv v22, v22, v30 + # save carry in TV + vsrl.vi v30, v22, 16 + # mod 2 ** 16 + vand.vv v22, v22, v31 + vadd.vv v23, v23, v30 + # save carry in TV + vsrl.vi v30, v23, 16 + # mod 2 ** 16 + vand.vv v23, v23, v31 + vadd.vv v24, v24, v30 + # save carry in TV + vsrl.vi v30, v24, 16 + # mod 2 ** 16 + vand.vv v24, v24, v31 + vadd.vv v25, v25, v30 + # save carry in TV + vsrl.vi v30, v25, 16 + # mod 2 ** 16 + vand.vv v25, v25, v31 + vadd.vv v26, v26, v30 + # save carry in TV + vsrl.vi v30, v26, 16 + # mod 2 ** 16 + vand.vv v26, v26, v31 + vadd.vv v27, v27, v30 + # save carry in TV + vsrl.vi v30, v27, 16 + # mod 2 ** 16 + vand.vv v27, v27, v31 + # store one group of AB + vssseg8e32.v v20, (t3), t1 + + addi t5,t5,1 + li t0,8 + bne t5,t0,10b + # load last group of values from arg + # offset of last group + # !!! 
important: assume nreg = 8 and sew = 32 + # log(8) + log(32/8) = 5 + # LOOP2 is now ngroup - 1 + slli t3,t5,5 + add t3,t3,a0 + vlse32.v v20, (t3), t1 + vadd.vv v20, v20, v30 + # save carry in TV + vsrl.vi v30, v20, 16 + # mod 2 ** 16 + vand.vv v20, v20, v31 + # store last group of AB + vsse32.v v20, (t3), t1 + + # update carry of AB_{ntotalreg - 1} to AB_0 + vlse32.v v20, (a0), t1 + addi t0, a5, 4 + vse32.v v30, (t0) + vle32.v v31, (a5) + + vadd.vv v20, v20, v31 + vsse32.v v20, (a0), t1 + addi t2,t2,1 + li t0,4 + bne t2,t0,9b + # update carry of AB_{ntotalreg - 1} to AB_0 + # since we need to subtract AB_0 + vlse32.v v20, (a0), t1 + # AB / word + addi t0, a5, 4 + vse32.v v20, (t0) + addi t0, a5, 8 + vle32.v v30, (t0) + + # do not need vsse now + # just store it in TV for move + # loop variable + li t5,0 +2: + # load one offset group of values from arg + # offset of one group + # !!! important: assume nreg = 8 and sew = 32 + # log(8) + log(32/8) = 5 + slli t2,t5,5 + + # then offset by 1 element + addi t2,t2,4 + add t3,t2,a0 + vlsseg8e32.v v20, (t3), t1 + + # back to original offset + addi t3,t3,-4 + vssseg8e32.v v20, (t3), t1 + + addi t5,t5,1 + li t2,8 + bne t5,t2,2b + # load last group of values from arg + # offset of last group + # !!!
important: assume nreg = 8 and sew = 32 + # log(8) + log(32/8) = 5 + # LOOP2 is now ngroup - 1 + slli t2,t5,5 + # then offset by 1 element + addi t2,t2,4 + add t3,t2,a0 + # move AB_0 to AB_{ntotalreg-1} + vmv.v.v v20, v30 + + # back to original offset + addi t3,t3,-4 + vsse32.v v20, (t3), t1 + addi t4,t4,1 + li t0,257 + bne t4,t0,1b + + ret diff --git a/tests/asm/mmm_mem_scratchpad/mmm.c b/tests/asm/mmm_mem_scratchpad/mmm.c new file mode 100644 index 000000000..683cd3900 --- /dev/null +++ b/tests/asm/mmm_mem_scratchpad/mmm.c @@ -0,0 +1,32 @@ +#include +#include +#include + +const int vl = 128; + +const int max_bits = 4096; +const int word_bits = 16; +const int way = vl / word_bits / 2; // 4 +const int s = 65; // ceil((max_bits / word_bits + 1) / way) + +// 4096 / 16 = 64 +// 64 + 4 comes from s * way = 260 +const uint32_t a[256 + 4] = {0xFFFF, 0xFFFF}; +const uint32_t b[256 + 4] = {0xFFFF, 0xFFFF}; +// for mmm_mem.pl, we need to ensure abr1 = 0 +uint32_t abr1[256 + 4] = {0}; +const uint32_t p[256 + 4] = {0xa285, 0xfbcb, 0x2b62, 0xc04c, 0x08e6, 0x5b0b, 0x44c7, 0x7403, 0x1291, 0x1a5f, 0x3100, 0x34d4, 0x7006, 0xa589, 0x5ee5, 0xa841, 0xc2ef, 0x5c18, 0x70c9, 0x3b54, 0x2d96, 0xb89c, 0xaff1, 0x466a, 0x9efa, 0xb6dd, 0x7749, 0x4bbc, 0x38b3, 0x93fc, 0xa629, 0x3f00, 0x88b9, 0xcbb2, 0xa694, 0x1a52, 0xc96b, 0xbe5d, 0x2fe8, 0xadfd, 0xa394, 0x57ed, 0x4b79, 0x9b5f, 0xd37e, 0xa94f, 0x5559, 0xea7d, 0xd1a9, 0x753b, 0xe079, 0x81de, 0xfe08, 0x738a, 0x9438, 0x845c, 0x3358, 0x7ab4, 0xd10c, 0xfe88, 0xfc76, 0x81cb, 0xfd86, 0x0eb3, 0x7887, 0x64e1, 0x7a8f, 0x5dd8, 0xb6df, 0x97cd, 0x01af, 0x0b77, 0xb51b, 0xb900, 0xd6ba, 0xd89b, 0xe71c, 0x8eba, 0xc1e7, 0x91b1, 0xb0db, 0x0c73, 0xe5e2, 0x1df4, 0x87c7, 0xd94a, 0x57bb, 0xcb1f, 0x85ae, 0xbd7e, 0xb359, 0x753c, 0x41e3, 0xb515, 0x9896, 0xdddd, 0x8ee9, 0x90b1, 0xdd10, 0x0d41, 0x5496, 0x7aef, 0xfaf3, 0x5ff1, 0x249c, 0x9549, 0xd7cf, 0xab30, 0x5a92, 0x6532, 0x9f61, 0x471d, 0x212f, 0x7ad1, 0x7034, 0x4fe8, 0x7a6b, 0x793d, 0x09d8, 0x1dcd, 0xb60b, 0xe2fe, 
0x01e0, 0xd2db, 0xea7f, 0xf5d0, 0x7759, 0x2242, 0xe701, 0x1a09, 0xf53a, 0x710f, 0x5224, 0x56f4, 0xf439, 0xb1e3, 0xffdd, 0xac3b, 0x78ee, 0x4ade, 0x20a5, 0x962d, 0x6fa4, 0x1f4c, 0x13e6, 0x4476, 0x9d8f, 0x2ea2, 0x2769, 0xd6d6, 0x88a1, 0xf82d, 0x2540, 0xb340, 0x03de, 0x7565, 0x7418, 0xd207, 0x55e1, 0x0c8d, 0x3547, 0xf16b, 0xf073, 0x0c77, 0xe3db, 0xc58b, 0x7073, 0xb415, 0x7f44, 0xa294, 0xdae5, 0x69e4, 0x76cf, 0x169e, 0xa3fc, 0x9412, 0x0795, 0x80b3, 0x3b23, 0xede4, 0xa2e5, 0x7a61, 0xbe6f, 0x0b36, 0x033a, 0x1a06, 0x4b08, 0x3232, 0x1a5b, 0xb722, 0x4ee3, 0x12ab, 0xe4d9, 0xe94c, 0x9e3c, 0xcf07, 0x16c0, 0x3dee, 0xb38c, 0x067c, 0x0c58, 0x7868, 0x3705, 0x915b, 0x5084, 0x6db6, 0x0a94, 0x3c3a, 0x0f02, 0x123c, 0xfcc9, 0xa81f, 0x9a09, 0xa3e5, 0xbd77, 0x324b, 0x4b30, 0x501f, 0xf4c3, 0xd3f8, 0x659c, 0x3cc0, 0xf7d4, 0x2fc2, 0xe326, 0x276d, 0xca2f, 0xbb0d, 0x04a4, 0x814f, 0x8db7, 0xf4e7, 0x467f, 0x0686, 0x42ff, 0xb319, 0xaf84, 0xf027, 0x0ff5, 0x5558, 0x5565, 0xe9fd, 0x4573, 0x873d, 0x3491, 0xa800, 0xbc9d, 0x484f, 0x34bd, 0x34bf, 0x6d55, 0xb44e, 0x5eb5, 0xc68c, 0x50a7, 0xc996}; +const uint32_t mu = 0xf9b3; + +// 32 = 2 * word_bits +// mu is of 16 bits +// R is 2 ** (max_bits + word_bits) +void mmm(uint32_t* r, const uint32_t* a, const uint32_t* b, const uint32_t* p, const uint32_t mu, uint32_t *scratchpad); + +int test() { + mmm(abr1, a, b, p, mu, (uint32_t *) 0x200000); + // for(int i = 0; i != 260; ++i) { + // printf("%04lX ", abr1[i]); + // } + return 0; +}