Run performance comparison in CI using Kani's Benchcomp
Use the proof harnesses in AWS C Common as an (initial) benchmark to guard
against performance regressions introduced in PRs. See the "Summary"
behind any of the executions of
https://github.com/model-checking/kani/actions/workflows/bench.yml for
examples of what the output will look like.

Co-authored-by: Kareem Khazem <[email protected]>
tautschnig and karkhaz committed Feb 6, 2024
1 parent 2d96b83 commit 3ac5219
Showing 3 changed files with 361 additions and 0 deletions.
105 changes: 105 additions & 0 deletions .github/workflows/benchcomp-config.yaml
@@ -0,0 +1,105 @@
# Compare CBMC performance of selected benchmarks across two versions of CBMC,
# erroring out on regression. This config file is to be used together with the
# set-up in performance.yaml, because it assumes that there are two CBMC
# checkouts in the 'old' and 'new' directories; benchcomp compares the
# performance of these two checkouts.
#
# This configuration file is based on Benchcomp's perf-regression.yaml that
# ships with Kani.

variants:
  aws-c-common@old:
    config:
      command_line: cd verification/cbmc/proofs && export PATH=/home/runner/work/cbmc/cbmc/old/build/bin:$PATH &&
        ./run-cbmc-proofs.py
      directory: /home/runner/work/cbmc/cbmc/aws-c-common.git
      env: {}
  aws-c-common@new:
    config:
      command_line: cd verification/cbmc/proofs && export PATH=/home/runner/work/cbmc/cbmc/new/build/bin:$PATH &&
        ./run-cbmc-proofs.py
      directory: /home/runner/work/cbmc/cbmc/aws-c-common.git
      env: {}

run:
  suites:
    aws-c-common:
      parser:
        command: /home/runner/work/cbmc/cbmc/new/.github/workflows/benchcomp-parse_cbmc.py
      variants:
        - aws-c-common@old
        - aws-c-common@new

visualize:
  - type: dump_yaml
    out_file: '-'

  - type: dump_markdown_results_table
    out_file: '-'
    extra_columns:

      # For these two metrics, display the difference between old and new and
      # embolden if the absolute difference is more than 10% of the old value
      number_vccs:
        - column_name: diff old → new
          text: >
            lambda b: "" if b["aws-c-common@new"] == b["aws-c-common@old"]
            else ("**" if abs((b["aws-c-common@new"]-b["aws-c-common@old"])/b["aws-c-common@old"]) > 0.1 else "")
            + ("+" if b["aws-c-common@new"] > b["aws-c-common@old"] else "")
            + str(b["aws-c-common@new"] - b["aws-c-common@old"])
            + ("**" if abs((b["aws-c-common@new"]-b["aws-c-common@old"])/b["aws-c-common@old"]) > 0.1 else "")
      number_program_steps:
        - column_name: diff old → new
          text: >
            lambda b: "" if b["aws-c-common@new"] == b["aws-c-common@old"]
            else ("**" if abs((b["aws-c-common@new"]-b["aws-c-common@old"])/b["aws-c-common@old"]) > 0.1 else "")
            + ("+" if b["aws-c-common@new"] > b["aws-c-common@old"] else "")
            + str(b["aws-c-common@new"] - b["aws-c-common@old"])
            + ("**" if abs((b["aws-c-common@new"]-b["aws-c-common@old"])/b["aws-c-common@old"]) > 0.1 else "")
      # For 'runtime' metrics, display the % change from old to new, emboldening
      # cells whose absolute change is >50%
      solver_runtime:
        - column_name: "% change old → new"
          text: >
            lambda b: "" if b["aws-c-common@new"] == b["aws-c-common@old"]
            else ("**" if abs((b["aws-c-common@new"]-b["aws-c-common@old"])/b["aws-c-common@old"]) > 0.5 else "")
            + ("+" if b["aws-c-common@new"] > b["aws-c-common@old"] else "")
            + "%.3f%%" % ((b["aws-c-common@new"] - b["aws-c-common@old"]) * 100 / b["aws-c-common@old"])
            + ("**" if abs((b["aws-c-common@new"]-b["aws-c-common@old"])/b["aws-c-common@old"]) > 0.5 else "")
      verification_time:
        - column_name: "% change old → new"
          text: >
            lambda b: "" if b["aws-c-common@new"] == b["aws-c-common@old"]
            else ("**" if abs((b["aws-c-common@new"]-b["aws-c-common@old"])/b["aws-c-common@old"]) > 0.5 else "")
            + ("+" if b["aws-c-common@new"] > b["aws-c-common@old"] else "")
            + "%.3f%%" % ((b["aws-c-common@new"] - b["aws-c-common@old"]) * 100 / b["aws-c-common@old"])
            + ("**" if abs((b["aws-c-common@new"]-b["aws-c-common@old"])/b["aws-c-common@old"]) > 0.5 else "")
      symex_runtime:
        - column_name: "% change old → new"
          text: >
            lambda b: "" if b["aws-c-common@new"] == b["aws-c-common@old"]
            else ("**" if abs((b["aws-c-common@new"]-b["aws-c-common@old"])/b["aws-c-common@old"]) > 0.5 else "")
            + ("+" if b["aws-c-common@new"] > b["aws-c-common@old"] else "")
            + "%.3f%%" % ((b["aws-c-common@new"] - b["aws-c-common@old"]) * 100 / b["aws-c-common@old"])
            + ("**" if abs((b["aws-c-common@new"]-b["aws-c-common@old"])/b["aws-c-common@old"]) > 0.5 else "")
      # For success metric, display some text if success has changed
      success:
        - column_name: change
          text: >
            lambda b: "" if b["aws-c-common@new"] == b["aws-c-common@old"]
            else "❌ newly failing" if b["aws-c-common@old"]
            else "✅ newly passing"
  - type: error_on_regression
    variant_pairs: [[aws-c-common@old, aws-c-common@new]]
    checks:
      - metric: success
        # Compare the old and new variants of each benchmark. The
        # benchmark has regressed if the lambda returns true.
        test: "lambda old, new: False if not old else not new"
      - metric: solver_runtime
        test: "lambda old, new: False if new < 10 else new/old > 1.5"
      - metric: symex_runtime
        test: "lambda old, new: False if new < 10 else new/old > 1.5"
140 changes: 140 additions & 0 deletions .github/workflows/benchcomp-parse_cbmc.py
@@ -0,0 +1,140 @@
#!/usr/bin/env python3
# Copyright Kani Contributors
# SPDX-License-Identifier: Apache-2.0 OR MIT


import json
import logging
import os
import pathlib
import re
import sys
import textwrap

import yaml


def get_description():
    return textwrap.dedent("""\
        Read CBMC statistics from a Litani run.json file.
        """)


def _get_metrics():
    return {
        "solver_runtime": {
            "pat": re.compile(r"Runtime Solver: (?P<value>[-e\d\.]+)s"),
            "parse": float,
        },
        "removed_program_steps": {
            "pat": re.compile(r"slicing removed (?P<value>\d+) assignments"),
            "parse": int,
        },
        "number_program_steps": {
            "pat": re.compile(r"size of program expression: (?P<value>\d+) steps"),
            "parse": int,
        },
        "number_vccs": {
            "pat": re.compile(
                r"Generated \d+ VCC\(s\), (?P<value>\d+) remaining after simplification"),
            "parse": int,
        },
        "symex_runtime": {
            "pat": re.compile(r"Runtime Symex: (?P<value>[-e\d\.]+)s"),
            "parse": float,
        },
        "success": {
            "pat": re.compile(r"VERIFICATION:- (?P<value>\w+)"),
            "parse": lambda v: v == "SUCCESSFUL",
        },
    }


def get_metrics():
    metrics = dict(_get_metrics())
    for _, info in metrics.items():
        for field in ("pat", "parse"):
            info.pop(field)

    # This is not a metric we return; it is used to find the correct value for
    # the number_program_steps metric
    metrics.pop("removed_program_steps", None)

    # We don't parse this directly from the output
    metrics["verification_time"] = {}

    return metrics


def get_run(root_dir):
    for fyle in pathlib.Path(root_dir).rglob("run.json"):
        with open(fyle) as handle:
            return json.load(handle)
    logging.error("No run.json found in %s", root_dir)
    sys.exit(1)


def main(root_dir):
    root_dir = pathlib.Path(root_dir)
    run = get_run(root_dir)
    metrics = _get_metrics()
    benchmarks = {}
    for pipe in run["pipelines"]:
        for stage in pipe["ci_stages"]:
            if stage["name"] != "test":
                continue
            for job in stage["jobs"]:
                if not job["wrapper_arguments"]["command"].startswith("cbmc "):
                    continue
                if "cover" in job["wrapper_arguments"]["command"]:
                    continue
                if "--show-properties" in job["wrapper_arguments"]["command"]:
                    continue

                bench_name = f"{run['project']}::{pipe['name']}"
                if not job["complete"]:
                    logging.warning(
                        "Found an incomplete CBMC benchmark '%s'",
                        bench_name)
                    continue

                benchmarks[bench_name] = {
                    "metrics": {
                        "success": job["outcome"] == "success",
                        "verification_time": float(job["duration_ms"]),
                    }}

                for line in job["stdout"]:
                    for metric, metric_info in metrics.items():
                        m = metric_info["pat"].search(line)
                        if not m:
                            continue

                        parse = metric_info["parse"]
                        try:
                            # CBMC prints out some metrics more than once, e.g.
                            # "Solver" and "decision procedure". Add those
                            # values together
                            benchmarks[bench_name]["metrics"][metric] += parse(m["value"])
                        except (KeyError, TypeError):
                            benchmarks[bench_name]["metrics"][metric] = parse(m["value"])
                        break

    for bench_name, bench_info in benchmarks.items():
        try:
            n_steps = bench_info["metrics"]["number_program_steps"]
            rm_steps = bench_info["metrics"]["removed_program_steps"]
            bench_info["metrics"]["number_program_steps"] = n_steps - rm_steps
            bench_info["metrics"].pop("removed_program_steps", None)
        except KeyError:
            pass

    return {
        "metrics": get_metrics(),
        "benchmarks": benchmarks,
    }


if __name__ == "__main__":
    result = main(os.getcwd())
    print(yaml.dump(result, default_flow_style=False))
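
To illustrate how the regex table in _get_metrics() drives the parsing in main(), here is a minimal sketch. It assumes _get_metrics() from the script above is in scope, and the sample log lines are fabricated to match the patterns rather than copied from a real CBMC run:

# Illustrative sketch: feed a few fabricated log lines (formatted to match the
# regexes above) through the same match-and-accumulate logic as main().
sample_stdout = [
    "size of program expression: 2500 steps",
    "slicing removed 400 assignments",
    "Generated 120 VCC(s), 85 remaining after simplification",
    "Runtime Symex: 0.42s",
    "Runtime Solver: 1.7s",
    "Runtime Solver: 0.3s",   # repeated metric; values get summed, as in main()
    "VERIFICATION:- SUCCESSFUL",
]

metric_table = _get_metrics()  # assumes the function defined above is importable or in scope
parsed = {}
for line in sample_stdout:
    for metric, info in metric_table.items():
        m = info["pat"].search(line)
        if not m:
            continue
        value = info["parse"](m["value"])
        if metric in parsed:
            parsed[metric] += value  # accumulate repeats, mirroring the try/except in main()
        else:
            parsed[metric] = value
        break

# main() would additionally subtract removed_program_steps from number_program_steps.
print(parsed)  # e.g. solver_runtime == 2.0, number_vccs == 85, success == True
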
116 changes: 116 additions & 0 deletions .github/workflows/performance.yaml
@@ -0,0 +1,116 @@
# Run performance benchmarks comparing two different CBMC versions.
name: Performance Benchmarking
on:
  push:
    branches: [ develop ]
  pull_request:
    branches: [ develop ]

jobs:
  perf-benchcomp:
    runs-on: ubuntu-20.04
    steps:
      - name: Save push event HEAD and HEAD~ to environment variables
        if: ${{ github.event_name == 'push' }}
        run: |
          echo "NEW_REF=${{ github.event.after}}" | tee -a "$GITHUB_ENV"
          echo "OLD_REF=${{ github.event.before }}" | tee -a "$GITHUB_ENV"
      - name: Save pull request HEAD and base to environment variables
        if: ${{ contains(fromJSON('["pull_request", "pull_request_target"]'), github.event_name) }}
        run: |
          echo "OLD_REF=${{ github.event.pull_request.base.sha }}" | tee -a "$GITHUB_ENV"
          echo "NEW_REF=${{ github.event.pull_request.head.sha }}" | tee -a "$GITHUB_ENV"
      - name: Check out CBMC (old variant)
        uses: actions/checkout@v4
        with:
          submodules: recursive
          path: ./old
          ref: ${{ env.OLD_REF }}
          fetch-depth: 2

      - name: Check out CBMC (new variant)
        uses: actions/checkout@v4
        with:
          submodules: recursive
          path: ./new
          ref: ${{ env.NEW_REF }}
          fetch-depth: 1

      - name: Fetch dependencies
        env:
          # This is needed in addition to -yq to prevent apt-get from asking for
          # user input
          DEBIAN_FRONTEND: noninteractive
        run: |
          sudo apt-get update
          sudo apt-get install --no-install-recommends -yq cmake ninja-build gcc g++ maven flex bison ccache
      - name: Prepare ccache
        uses: actions/cache/restore@v4
        with:
          path: .ccache
          key: ${{ runner.os }}-20.04-Release-${{ github.ref }}-${{ github.sha }}-PR
          restore-keys: |
            ${{ runner.os }}-20.04-Release-${{ github.ref }}
            ${{ runner.os }}-20.04-Release
      - name: ccache environment
        run: echo "CCACHE_DIR=$PWD/.ccache" >> $GITHUB_ENV
      - name: Zero ccache stats and limit in size
        run: ccache -z --max-size=500M

      - name: Configure using CMake (new variant)
        run: cmake -S new/ -B new/build -G Ninja -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER=/usr/bin/gcc -DCMAKE_CXX_COMPILER=/usr/bin/g++ -Dsat_impl="minisat2;cadical"
      - name: ccache environment (new variant)
        run: echo "CCACHE_BASEDIR=$PWD/new" >> $GITHUB_ENV
      - name: Build with Ninja (new variant)
        run: ninja -C new/build -j2
      - name: Print ccache stats (new variant)
        run: ccache -s

      - name: Configure using CMake (old variant)
        run: cmake -S old/ -B old/build -G Ninja -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER=/usr/bin/gcc -DCMAKE_CXX_COMPILER=/usr/bin/g++ -Dsat_impl="minisat2;cadical"
      - name: ccache environment (old variant)
        run: echo "CCACHE_BASEDIR=$PWD/old" >> $GITHUB_ENV
      - name: Build with Ninja (old variant)
        run: ninja -C old/build -j2
      - name: Print ccache stats (old variant)
        run: ccache -s

      - name: Obtain benchcomp
        run: git clone --depth 1 https://github.com/model-checking/kani kani.git

      - name: Fetch benchmarks
        run: |
          git clone --depth 1 https://github.com/awslabs/aws-c-common aws-c-common.git
          pushd aws-c-common.git/verification/cbmc/proofs/
          # keep 6 proofs of each kind of data structure
          for c in $(ls -d aws_* | cut -f2 -d_ | uniq) ; do rm -rf $(ls -d aws_${c}_* | tail -n+7) ; done
          popd
          sudo apt-get install --no-install-recommends -yq gnuplot graphviz universal-ctags python3-pip
          curl -L --remote-name \
            https://github.com/awslabs/aws-build-accumulator/releases/download/1.29.0/litani-1.29.0.deb
          sudo dpkg -i litani-1.29.0.deb
          sudo python3 -m pip install cbmc-viewer
      - name: Run benchcomp
        run: |
          kani.git/tools/benchcomp/bin/benchcomp \
            --config new/.github/workflows/benchcomp-config.yaml \
            run
          kani.git/tools/benchcomp/bin/benchcomp \
            --config new/.github/workflows/benchcomp-config.yaml \
            collate
      - name: Perf Regression Results Table
        run: |
          kani.git/tools/benchcomp/bin/benchcomp \
            --config new/.github/workflows/benchcomp-config.yaml \
            visualize --only dump_markdown_results_table >> "$GITHUB_STEP_SUMMARY"
      - name: Run other visualizations
        run: |
          kani.git/tools/benchcomp/bin/benchcomp \
            --config new/.github/workflows/benchcomp-config.yaml \
            visualize --except dump_markdown_results_table
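
The proof-pruning one-liner in the "Fetch benchmarks" step is terse; the following hedged Python sketch spells out the same policy. It assumes the working directory is verification/cbmc/proofs and that proof directories are named aws_<kind>_... as the shell command expects:

# Hedged sketch of the "keep 6 proofs of each kind" pruning step above:
# group proof directories by the second '_'-separated component of their
# name and delete everything beyond the first six in each group.
import pathlib
import shutil

proofs = pathlib.Path(".")  # assumption: cwd is verification/cbmc/proofs
by_kind = {}
for entry in sorted(p for p in proofs.glob("aws_*") if p.is_dir()):
    kind = entry.name.split("_")[1]
    by_kind.setdefault(kind, []).append(entry)

for kind, dirs in by_kind.items():
    for extra in dirs[6:]:  # mirrors `tail -n+7`: drop all but the first six
        shutil.rmtree(extra)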
