From 41237b36f7615fd6d17030962582268902af3156 Mon Sep 17 00:00:00 2001
From: "Lim, Thing-han" <15379156+potsrevennil@users.noreply.github.com>
Date: Tue, 2 Jul 2024 16:50:39 +0800
Subject: [PATCH] Benchmarking on A55 (#84)

* add exec_wrapper for tests script

Signed-off-by: Thing-han, Lim <15379156+potsrevennil@users.noreply.github.com>

* add ci benchmark on a55 runner

Signed-off-by: Thing-han, Lim <15379156+potsrevennil@users.noreply.github.com>

* fix if condition for the benchmark workflow

Signed-off-by: Thing-han, Lim <15379156+potsrevennil@users.noreply.github.com>

* make parsing of results more robust

Signed-off-by: Matthias J. Kannwischer <matthias@kannwischer.eu>

* log cmd on failure

Signed-off-by: Matthias J. Kannwischer <matthias@kannwischer.eu>

* remove taskpolicy and replace by exec_wrapper

Signed-off-by: Matthias J. Kannwischer <matthias@kannwischer.eu>

* refactor benchmarking yml

Signed-off-by: Matthias J. Kannwischer <matthias@kannwischer.eu>

* fix exec wrapper

Signed-off-by: Matthias J. Kannwischer <matthias@kannwischer.eu>

* add name of job

Signed-off-by: Matthias J. Kannwischer <matthias@kannwischer.eu>

* always turn exec wrapper into a list

Signed-off-by: Matthias J. Kannwischer <matthias@kannwischer.eu>

* remove duplicate test script

Signed-off-by: Matthias J. Kannwischer <matthias@kannwischer.eu>

* move splitting of exec wrapper

Signed-off-by: Matthias J. Kannwischer <matthias@kannwischer.eu>

---------

Signed-off-by: Thing-han, Lim <15379156+potsrevennil@users.noreply.github.com>
Signed-off-by: Matthias J. Kannwischer <matthias@kannwischer.eu>
Co-authored-by: Matthias J. Kannwischer <matthias@kannwischer.eu>
---
 .github/workflows/bench.yml |  19 ++++-
 scripts/tests               | 147 ++++++++++++++++++++----------------
 2 files changed, 95 insertions(+), 71 deletions(-)

diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml
index 04adcbe35..5f903415d 100644
--- a/.github/workflows/bench.yml
+++ b/.github/workflows/bench.yml
@@ -9,7 +9,18 @@ on:
     types: [ "labeled" ]
 jobs:
   bench:
-    runs-on: self-hosted-rpi4
+    name: ${{ matrix.target.name }}
+    strategy:
+      fail-fast: true
+      matrix:
+       target:
+        - system: rpi4
+          name: Arm Cortex-A72 (Raspberry Pi 4) benchmarks
+          cmd: tests bench -c PMU --cflags -mcpu=cortex-a72 -v --output output.json
+        - system: a55
+          name: Arm Cortex-A55 (Snapdragon 888) benchmarks
+          cmd: tests bench -c PERF --cflags "-static -mcpu=cortex-a55" --arch-flags -march=armv8.2-a -w exec-on-a55 -v --output output.json
+    runs-on: self-hosted-${{ matrix.target.system }}
     permissions:
       contents: write
     if: github.repository_owner == 'pq-code-package' && (github.event.label.name == 'benchmark' || github.ref == 'refs/heads/main')
@@ -27,7 +38,7 @@ jobs:
               - $(uname -a)
               - $(nix --version)
               - $(astyle --version)
-              - $(${{ matrix.cross_prefix }}gcc --version | grep -m1 "")
+              - $(${{ matrix.target.cross_prefix }}gcc --version | grep -m1 "")
               - $(bash --version | grep -m1 "")
 
               ## CPU Info
@@ -36,12 +47,12 @@ jobs:
       - name: Run benchmark
         shell: nix develop .#ci -c bash -e {0}
         run: |
-          tests bench -c PMU --cflags -mcpu=cortex-a72 -v --output output.json
+          ${{ matrix.target.cmd }}
       - name: Store benchmark result
         if: github.repository_owner == 'pq-code-package' && github.ref == 'refs/heads/main'
         uses: benchmark-action/github-action-benchmark@v1
         with:
-          name: Arm Cortex-A72 (Raspberry Pi 4) benchmarks
+          name: ${{ matrix.target.name }}
           tool: 'customSmallerIsBetter'
           output-file-path: output.json
           github-token: ${{ secrets.GITHUB_TOKEN }}
diff --git a/scripts/tests b/scripts/tests
index 7e8b7ad5c..63019bb30 100755
--- a/scripts/tests
+++ b/scripts/tests
@@ -25,23 +25,22 @@ def sha256sum(result):
     return m.hexdigest()
 
 
-def base_run(
+def base_compile(
     bin,
-    force_qemu,
     verbose,
-    run_as_root=False,
-    mac_taskpolicy=None,
     extra_make_envs={},
     extra_make_args=[],
 ):
+    """Build `bin` with make (cross-compiling on x86_64 Linux hosts), passing extra environment variables and make arguments."""
+
     def dict2str(dict):
         s = ""
         for k, v in dict.items():
             s += f"{k}={v} "
         return s
 
-    if force_qemu or (platform.system() == "Linux" and platform.machine() == "x86_64"):
-        logging.debug(f"Emulating {bin} with QEMU")
+    if platform.system() == "Linux" and platform.machine() == "x86_64":
+        logging.debug(f"Cross compiling {bin}")
 
         args = [
             "make",
@@ -49,66 +48,59 @@ def base_run(
             f"{bin}",
         ] + extra_make_args
 
-        logging.info(dict2str(extra_make_envs) + " ".join(args))
-
-        p = subprocess.run(
-            args,
-            stdout=subprocess.DEVNULL if not verbose else None,
-            env=os.environ.copy() | extra_make_envs,
-        )
-        if p.returncode != 0:
-            logging.error(f"make failed: {p.returncode}")
-            sys.exit(1)
-
-        result = subprocess.run(
-            ["qemu-aarch64", f"{bin}"],
-            capture_output=True,
-            universal_newlines=False,
-        )
-
-        if result.returncode != 0:
-            logging.error(
-                f"Emulating {bin} failed: {result.returncode} {result.stderr.decode()}"
-            )
-            sys.exit(1)
-
     else:
-        logging.debug(f"Running {bin} natively")
+        logging.debug(f"Compiling {bin} natively")
 
         args = ["make", f"{bin}"] + extra_make_args
         logging.info(dict2str(extra_make_envs) + " ".join(args))
 
-        p = subprocess.run(
-            args,
-            stdout=subprocess.DEVNULL if not verbose else None,
-            env=os.environ.copy() | extra_make_envs,
-        )
+    p = subprocess.run(
+        args,
+        stdout=subprocess.DEVNULL if not verbose else None,
+        env=os.environ.copy() | extra_make_envs,
+    )
 
-        if p.returncode != 0:
-            logging.error(f"make failed: {p.returncode}")
-            sys.exit(1)
+    if p.returncode != 0:
+        logging.error(f"make failed: {p.returncode}")
+        sys.exit(1)
 
-        cmd = [f"./{bin}"]
-        if run_as_root:
-            logging.info(
-                "Running benchmarks as root -- you may need to enter your root password."
-            )
-            cmd = ["sudo"] + cmd
 
-        if mac_taskpolicy is not None:
-            cmd = ["taskpolicy", "-c", mac_taskpolicy] + cmd
+def base_run(
+    bin,
+    force_qemu,
+    verbose,
+    run_as_root=False,
+    exec_wrapper=None,
+):
+    """Run the binary, optionally prefixed with QEMU, sudo and/or a custom exec wrapper, and return its stdout."""
+    cmd = [f"./{bin}"]
+    if force_qemu or (platform.system() == "Linux" and platform.machine() == "x86_64"):
+        logging.info(f"Emulating {bin} with QEMU")
+        cmd = ["qemu-aarch64"] + cmd
 
-        result = subprocess.run(
-            cmd,
-            capture_output=True,
-            universal_newlines=False,
+    if run_as_root:
+        logging.info(
+            f"Running {bin} as root -- you may need to enter your root password."
         )
+        cmd = ["sudo"] + cmd
 
-        if result.returncode != 0:
-            logging.error(
-                f"Running {bin} natively failed: {result.returncode} {result.stderr.decode()}"
-            )
-            sys.exit(1)
+    if exec_wrapper:
+        logging.info(f"Running {bin} with customized wrapper.")
+        exec_wrapper = exec_wrapper.split(" ")
+        cmd = exec_wrapper + cmd
+
+    logging.info(" ".join(cmd))
+    result = subprocess.run(
+        cmd,
+        capture_output=True,
+        universal_newlines=False,
+    )
+
+    if result.returncode != 0:
+        logging.error(
+            f"Running '{cmd}' failed: {result.returncode} {result.stderr.decode()}"
+        )
+        sys.exit(1)
 
     return result.stdout
 
@@ -149,7 +141,7 @@ def test_schemes(
     force_qemu,
     verbose,
     run_as_root=False,
-    mac_taskpolicy=None,
+    exec_wrapper=None,
     extra_make_envs={},
     extra_make_args=[],
 ):
@@ -165,14 +157,13 @@ def test_schemes(
     results = {}
     for scheme in SCHEME:
         bin = scheme2file(scheme)
+        base_compile(bin, verbose, extra_make_envs, extra_make_args)
         result = base_run(
             bin,
             force_qemu,
             verbose,
             run_as_root,
-            mac_taskpolicy,
-            extra_make_envs,
-            extra_make_args,
+            exec_wrapper,
         )
         results[scheme] = result
 
@@ -279,11 +270,11 @@ def add_options(options):
 def run(bin, force_qemu, verbose, cflags, arch_flags):
     config_logger(verbose)
 
+    base_compile(bin, verbose, process_make_envs(cflags, arch_flags))
     result = base_run(
         bin,
         force_qemu,
         verbose,
-        process_make_envs(cflags, arch_flags),
     )
     logging.info(str(result, encoding="utf-8"))
 
@@ -388,20 +379,41 @@ def kat(force_qemu, verbose, cflags, arch_flags):
     type=bool,
     help="Benchmarking binary is run with sudo.",
 )
+@click.option(
+    "-w",
+    "--exec-wrapper",
+    help="Run the benchmark binary with the user-customized wrapper.",
+)
 @click.option(
     "-t",
     "--mac-taskpolicy",
     nargs=1,
     type=click.Choice(["utility", "background", "maintenance"]),
+    hidden=platform.system() != "Darwin",
     show_default=True,
     default=None,
-    help="Run the program using the specified QoS clamp. Applies to MacOS only. Setting this flag to 'background' guarantees running on E-cores.",
+    help="Run the program using the specified QoS clamp. Applies to MacOS only. Setting this flag to 'background' guarantees running on E-cores. This is an abbreviation of --exec-wrapper 'taskpolicy -c {mac_taskpolicy}'.",
 )
 def bench(
-    force_qemu, verbose, cycles, cflags, arch_flags, output, run_as_root, mac_taskpolicy
+    force_qemu,
+    verbose,
+    cycles,
+    cflags,
+    arch_flags,
+    output,
+    run_as_root,
+    exec_wrapper,
+    mac_taskpolicy,
 ):
     config_logger(verbose)
 
+    if mac_taskpolicy:
+        if exec_wrapper:
+            logging.error(f"cannot set both --mac-taskpolicy and --exec-wrapper")
+            sys.exit(1)
+        else:
+            exec_wrapper = f"taskpolicy -c {mac_taskpolicy}"
+
     results = test_schemes(
         "benchmark",
         lambda scheme: scheme.name.replace("MLKEM", "test/bin/bench_kyber"),
@@ -411,7 +423,7 @@ def bench(
         force_qemu,
         verbose,
         run_as_root,
-        mac_taskpolicy,
+        exec_wrapper,
         process_make_envs(cflags, arch_flags),
         [f"CYCLES={cycles}"],
     )
@@ -430,10 +442,11 @@ def bench(
                 # encaps cycles=X
                 # decaps cycles=X
 
-                d = {
-                    k: int(v)
-                    for k, v in (l.decode().split("=") for l in r.splitlines()[:3])
-                }
+                lines = [
+                    line.decode() for line in r.splitlines() if "=" in line.decode()
+                ]
+
+                d = {k: int(v) for k, v in (l.split("=") for l in lines)}
                 for primitive in ["keypair", "encaps", "decaps"]:
                     v.append(
                         {