From d618c6b1794943b8ea6b98f44a20995c4238e1b3 Mon Sep 17 00:00:00 2001
From: Michael Tautschnig
Date: Thu, 25 Jan 2024 13:53:48 +0000
Subject: [PATCH] Run performance comparison in CI using Kani's Benchcomp

Use proof harnesses in AWS C Common as (initial) benchmark to guard
against performance regressions introduced in PRs.

See the "Summary" behind any of the executions of
https://github.com/model-checking/kani/actions/workflows/bench.yml for
examples of what the output will look like.

Co-authored-by: Kareem Khazem
---
 .github/workflows/benchcomp-config.yaml   | 105 +++++++++++++++++
 .github/workflows/benchcomp-parse_cbmc.py | 137 ++++++++++++++++++++++
 .github/workflows/performance.yaml        | 111 ++++++++++++++++++
 3 files changed, 353 insertions(+)
 create mode 100644 .github/workflows/benchcomp-config.yaml
 create mode 100755 .github/workflows/benchcomp-parse_cbmc.py
 create mode 100644 .github/workflows/performance.yaml

diff --git a/.github/workflows/benchcomp-config.yaml b/.github/workflows/benchcomp-config.yaml
new file mode 100644
index 000000000000..3e4ef43d4d8e
--- /dev/null
+++ b/.github/workflows/benchcomp-config.yaml
@@ -0,0 +1,105 @@
+# Compare CBMC performance of selected benchmarks across two versions of CBMC,
+# erroring out on regression. This config file is to be used together with the
+# set-up in performance.yaml, because it assumes that there are two CBMC
+# checkouts in the 'old' and 'new' directories; benchcomp compares the
+# performance of these two checkouts.
+#
+# This configuration file is based on Benchcomp's perf-regression.yaml that
+# ships with Kani.
+
+variants:
+  aws-c-common@old:
+    config:
+      command_line: cd verification/cbmc/proofs && export PATH=/home/runner/work/cbmc/cbmc/old/build/bin:$PATH &&
+        ./run-cbmc-proofs.py
+      directory: /home/runner/work/cbmc/cbmc/aws-c-common.git
+      env: {}
+  aws-c-common@new:
+    config:
+      command_line: cd verification/cbmc/proofs && export PATH=/home/runner/work/cbmc/cbmc/new/build/bin:$PATH &&
+        ./run-cbmc-proofs.py
+      directory: /home/runner/work/cbmc/cbmc/aws-c-common.git
+      env: {}
+
+run:
+  suites:
+    aws-c-common:
+      parser:
+        command: /home/runner/work/cbmc/cbmc/new/.github/workflows/benchcomp-parse_cbmc.py
+      variants:
+        - aws-c-common@old
+        - aws-c-common@new
+
+visualize:
+  - type: dump_yaml
+    out_file: '-'
+
+  - type: dump_markdown_results_table
+    out_file: '-'
+    extra_columns:
+
+      # For these two metrics, display the difference between old and new and
+      # embolden it if the absolute difference is more than 10% of the old value
+      number_vccs:
+        - column_name: diff old → new
+          text: >
+            lambda b: "" if b["aws-c-common@new"] == b["aws-c-common@old"]
+            else ("**" if abs((b["aws-c-common@new"]-b["aws-c-common@old"])/b["aws-c-common@old"]) > 0.1 else "")
+            + ("+" if b["aws-c-common@new"] > b["aws-c-common@old"] else "")
+            + str(b["aws-c-common@new"] - b["aws-c-common@old"])
+            + ("**" if abs((b["aws-c-common@new"]-b["aws-c-common@old"])/b["aws-c-common@old"]) > 0.1 else "")
+      number_program_steps:
+        - column_name: diff old → new
+          text: >
+            lambda b: "" if b["aws-c-common@new"] == b["aws-c-common@old"]
+            else ("**" if abs((b["aws-c-common@new"]-b["aws-c-common@old"])/b["aws-c-common@old"]) > 0.1 else "")
+            + ("+" if b["aws-c-common@new"] > b["aws-c-common@old"] else "")
+            + str(b["aws-c-common@new"] - b["aws-c-common@old"])
+            + ("**" if abs((b["aws-c-common@new"]-b["aws-c-common@old"])/b["aws-c-common@old"]) > 0.1 else "")
+
+      # For 'runtime' metrics, display the % change from old to new, emboldening
+      # cells whose absolute change is >50%
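+      #
+      # Worked example (illustrative values, not taken from a real run):
+      # old = 10.0s and new = 20.0s renders as "**+100.000%**", since
+      # (20.0 - 10.0) * 100 / 10.0 = 100.000 and the relative change of
+      # 1.0 exceeds the 0.5 emboldening threshold.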
+      solver_runtime:
+        - column_name: "% change old → new"
+          text: >
+            lambda b: "" if b["aws-c-common@new"] == b["aws-c-common@old"]
+            else ("**" if abs((b["aws-c-common@new"]-b["aws-c-common@old"])/b["aws-c-common@old"]) > 0.5 else "")
+            + ("+" if b["aws-c-common@new"] > b["aws-c-common@old"] else "")
+            + "%.3f%%" % ((b["aws-c-common@new"] - b["aws-c-common@old"]) * 100 / b["aws-c-common@old"])
+            + ("**" if abs((b["aws-c-common@new"]-b["aws-c-common@old"])/b["aws-c-common@old"]) > 0.5 else "")
+      verification_time:
+        - column_name: "% change old → new"
+          text: >
+            lambda b: "" if b["aws-c-common@new"] == b["aws-c-common@old"]
+            else ("**" if abs((b["aws-c-common@new"]-b["aws-c-common@old"])/b["aws-c-common@old"]) > 0.5 else "")
+            + ("+" if b["aws-c-common@new"] > b["aws-c-common@old"] else "")
+            + "%.3f%%" % ((b["aws-c-common@new"] - b["aws-c-common@old"]) * 100 / b["aws-c-common@old"])
+            + ("**" if abs((b["aws-c-common@new"]-b["aws-c-common@old"])/b["aws-c-common@old"]) > 0.5 else "")
+      symex_runtime:
+        - column_name: "% change old → new"
+          text: >
+            lambda b: "" if b["aws-c-common@new"] == b["aws-c-common@old"]
+            else ("**" if abs((b["aws-c-common@new"]-b["aws-c-common@old"])/b["aws-c-common@old"]) > 0.5 else "")
+            + ("+" if b["aws-c-common@new"] > b["aws-c-common@old"] else "")
+            + "%.3f%%" % ((b["aws-c-common@new"] - b["aws-c-common@old"]) * 100 / b["aws-c-common@old"])
+            + ("**" if abs((b["aws-c-common@new"]-b["aws-c-common@old"])/b["aws-c-common@old"]) > 0.5 else "")
+
+      # For the success metric, display some text if success has changed
+      success:
+        - column_name: change
+          text: >
+            lambda b: "" if b["aws-c-common@new"] == b["aws-c-common@old"]
+            else "❌ newly failing" if b["aws-c-common@old"]
+            else "✅ newly passing"
+
+  - type: error_on_regression
+    variant_pairs: [[aws-c-common@old, aws-c-common@new]]
+    checks:
+      - metric: success
+        # Compare the old and new variants of each benchmark. The
+        # benchmark has regressed if the lambda returns true.
+        test: "lambda old, new: False if not old else not new"
+      - metric: solver_runtime
+        test: "lambda old, new: False if new < 10 else new/old > 1.5"
+      - metric: symex_runtime
+        test: "lambda old, new: False if new < 10 else new/old > 1.5"

diff --git a/.github/workflows/benchcomp-parse_cbmc.py b/.github/workflows/benchcomp-parse_cbmc.py
new file mode 100755
index 000000000000..2eb61054e2fb
--- /dev/null
+++ b/.github/workflows/benchcomp-parse_cbmc.py
@@ -0,0 +1,137 @@
+#!/usr/bin/env python3
+# Copyright Kani Contributors
+# SPDX-License-Identifier: Apache-2.0 OR MIT
+
+
+import json
+import logging
+import os
+import pathlib
+import re
+import sys
+import textwrap
+
+import yaml
+
+
+def get_description():
+    return textwrap.dedent("""\
+        Read CBMC statistics from a Litani run.json file.
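+
+        The script recursively searches the working directory for a Litani
+        run.json file, extracts per-proof metrics (success, solver and symex
+        runtimes, program steps, VCC counts), and prints them to stdout as
+        YAML in the format benchcomp expects from a suite parser.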
+        """)
+
+
+def _get_metrics():
+    return {
+        "solver_runtime": {
+            "pat": re.compile(r"Runtime Solver: (?P<value>[-e\d\.]+)s"),
+            "parse": float,
+        },
+        "removed_program_steps": {
+            "pat": re.compile(r"slicing removed (?P<value>\d+) assignments"),
+            "parse": int,
+        },
+        "number_program_steps": {
+            "pat": re.compile(r"size of program expression: (?P<value>\d+) steps"),
+            "parse": int,
+        },
+        "number_vccs": {
+            "pat": re.compile(
+                r"Generated \d+ VCC\(s\), (?P<value>\d+) remaining after simplification"),
+            "parse": int,
+        },
+        "symex_runtime": {
+            "pat": re.compile(r"Runtime Symex: (?P<value>[-e\d\.]+)s"),
+            "parse": float,
+        },
+        "success": {
+            "pat": re.compile(r"VERIFICATION:- (?P<value>\w+)"),
+            "parse": lambda v: v == "SUCCESSFUL",
+        },
+    }
+
+
+def get_metrics():
+    metrics = dict(_get_metrics())
+    for _, info in metrics.items():
+        for field in ("pat", "parse"):
+            info.pop(field)
+
+    # This is not a metric we return; it is used to find the correct value for
+    # the number_program_steps metric
+    metrics.pop("removed_program_steps", None)
+
+    return metrics
+
+
+def get_run(root_dir):
+    for fyle in pathlib.Path(root_dir).rglob("run.json"):
+        with open(fyle) as handle:
+            return json.load(handle)
+    logging.error("No run.json found in %s", root_dir)
+    sys.exit(1)
+
+
+def main(root_dir):
+    root_dir = pathlib.Path(root_dir)
+    run = get_run(root_dir)
+    metrics = _get_metrics()
+    benchmarks = {}
+    for pipe in run["pipelines"]:
+        for stage in pipe["ci_stages"]:
+            if stage["name"] != "test":
+                continue
+            for job in stage["jobs"]:
+                if not job["wrapper_arguments"]["command"].startswith("cbmc "):
+                    continue
+                if "cover" in job["wrapper_arguments"]["command"]:
+                    continue
+                if "--show-properties" in job["wrapper_arguments"]["command"]:
+                    continue
+
+                bench_name = f"{run['project']}::{pipe['name']}"
+                if not job["complete"]:
+                    logging.warning(
+                        "Found an incomplete CBMC benchmark '%s'",
+                        bench_name)
+                    continue
+
+                benchmarks[bench_name] = {
+                    "metrics": {
+                        "success": job["outcome"] == "success",
+                        "verification_time": job["duration_ms"],
+                    }}
+
+                for line in job["stdout"]:
+                    for metric, metric_info in metrics.items():
+                        m = metric_info["pat"].search(line)
+                        if not m:
+                            continue
+
+                        parse = metric_info["parse"]
+                        try:
+                            # CBMC prints out some metrics more than once, e.g.
+                            # "Solver" and "decision procedure". Add those
+                            # values together
+                            benchmarks[bench_name]["metrics"][metric] += parse(m["value"])
+                        except (KeyError, TypeError):
+                            benchmarks[bench_name]["metrics"][metric] = parse(m["value"])
+                        break
+
+    for bench_name, bench_info in benchmarks.items():
+        try:
+            n_steps = bench_info["metrics"]["number_program_steps"]
+            rm_steps = bench_info["metrics"]["removed_program_steps"]
+            bench_info["metrics"]["number_program_steps"] = n_steps - rm_steps
+            bench_info["metrics"].pop("removed_program_steps", None)
+        except KeyError:
+            pass
+
+    return {
+        "metrics": get_metrics(),
+        "benchmarks": benchmarks,
+    }
+
+
+if __name__ == "__main__":
+    result = main(os.getcwd())
+    print(yaml.dump(result, default_flow_style=False))

diff --git a/.github/workflows/performance.yaml b/.github/workflows/performance.yaml
new file mode 100644
index 000000000000..abbd17bf2b98
--- /dev/null
+++ b/.github/workflows/performance.yaml
@@ -0,0 +1,111 @@
+# Run performance benchmarks comparing two different CBMC versions.
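+#
+# For push events the comparison is between the pushed commit and its
+# predecessor; for pull requests it is between the PR head and the PR base.
+# Both versions are built from source; benchcomp then runs the AWS C Common
+# proof harnesses against each build, appends a comparison table to the job
+# summary, and fails on the regressions defined in benchcomp-config.yaml.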
+name: Performance Benchmarking
+on:
+  push:
+    branches: [ develop ]
+  pull_request:
+    branches: [ develop ]
+
+jobs:
+  perf-benchcomp:
+    runs-on: ubuntu-20.04
+    steps:
+      - name: Save push event HEAD and HEAD~ to environment variables
+        if: ${{ github.event_name == 'push' }}
+        run: |
+          echo "NEW_REF=${{ github.event.after }}" | tee -a "$GITHUB_ENV"
+          echo "OLD_REF=${{ github.event.before }}" | tee -a "$GITHUB_ENV"
+
+      - name: Save pull request HEAD and base to environment variables
+        if: ${{ contains(fromJSON('["pull_request", "pull_request_target"]'), github.event_name) }}
+        run: |
+          echo "OLD_REF=${{ github.event.pull_request.base.sha }}" | tee -a "$GITHUB_ENV"
+          echo "NEW_REF=${{ github.event.pull_request.head.sha }}" | tee -a "$GITHUB_ENV"
+
+      - name: Check out CBMC (old variant)
+        uses: actions/checkout@v4
+        with:
+          submodules: recursive
+          path: ./old
+          ref: ${{ env.OLD_REF }}
+          fetch-depth: 2
+
+      - name: Check out CBMC (new variant)
+        uses: actions/checkout@v4
+        with:
+          submodules: recursive
+          path: ./new
+          ref: ${{ env.NEW_REF }}
+          fetch-depth: 1
+
+      - name: Fetch dependencies
+        env:
+          # This is needed in addition to -yq to prevent apt-get from asking for
+          # user input
+          DEBIAN_FRONTEND: noninteractive
+        run: |
+          sudo apt-get update
+          sudo apt-get install --no-install-recommends -yq cmake ninja-build gcc g++ maven flex bison ccache
+
+      - name: Prepare ccache
+        uses: actions/cache/restore@v4
+        with:
+          path: .ccache
+          key: ${{ runner.os }}-20.04-Release-${{ github.ref }}-${{ github.sha }}-PR
+          restore-keys: |
+            ${{ runner.os }}-20.04-Release-${{ github.ref }}
+            ${{ runner.os }}-20.04-Release
+      - name: ccache environment
+        run: echo "CCACHE_DIR=$PWD/.ccache" >> $GITHUB_ENV
+      - name: Zero ccache stats and limit cache size
+        run: ccache -z --max-size=500M
+
+      - name: Configure using CMake (new variant)
+        run: cmake -S new/ -B new/build -G Ninja -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER=/usr/bin/gcc -DCMAKE_CXX_COMPILER=/usr/bin/g++ -Dsat_impl="minisat2;cadical"
+      - name: ccache environment (new variant)
+        run: echo "CCACHE_BASEDIR=$PWD/new" >> $GITHUB_ENV
+      - name: Build with Ninja (new variant)
+        run: ninja -C new/build -j2
+      - name: Print ccache stats (new variant)
+        run: ccache -s
+
+      - name: Configure using CMake (old variant)
+        run: cmake -S old/ -B old/build -G Ninja -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER=/usr/bin/gcc -DCMAKE_CXX_COMPILER=/usr/bin/g++ -Dsat_impl="minisat2;cadical"
+      - name: ccache environment (old variant)
+        run: echo "CCACHE_BASEDIR=$PWD/old" >> $GITHUB_ENV
+      - name: Build with Ninja (old variant)
+        run: ninja -C old/build -j2
+      - name: Print ccache stats (old variant)
+        run: ccache -s
+
+      - name: Obtain benchcomp
+        run: git clone --depth 1 https://github.com/model-checking/kani kani.git
+
+      - name: Fetch benchmarks
+        run: |
+          git clone --depth 1 https://github.com/awslabs/aws-c-common aws-c-common.git
+          sudo apt-get install --no-install-recommends -yq gnuplot graphviz
+          curl -L --remote-name \
+            https://github.com/awslabs/aws-build-accumulator/releases/download/1.29.0/litani-1.29.0.deb
+          sudo dpkg -i litani-1.29.0.deb
+
+      - name: Run benchcomp
+        run: |
+          kani.git/tools/benchcomp/bin/benchcomp \
+            --config new/.github/workflows/benchcomp-config.yaml \
+            run
+          kani.git/tools/benchcomp/bin/benchcomp \
+            --config new/.github/workflows/benchcomp-config.yaml \
+            collate
+
+      - name: Perf Regression Results Table
+        run: |
+          kani.git/tools/benchcomp/bin/benchcomp \
+            --config new/.github/workflows/benchcomp-config.yaml \
+            visualize --only dump_markdown_results_table >> "$GITHUB_STEP_SUMMARY"
+
+      - name: Run other visualizations
+        run: |
+          kani.git/tools/benchcomp/bin/benchcomp \
+            --config new/.github/workflows/benchcomp-config.yaml \
+            visualize --except dump_markdown_results_table
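+
+      # To reproduce the comparison locally (a sketch; it assumes 'old' and
+      # 'new' CBMC checkouts built side by side, Litani installed, and the
+      # absolute /home/runner/work/cbmc/cbmc paths in benchcomp-config.yaml
+      # adjusted to the local layout):
+      #
+      #   git clone --depth 1 https://github.com/model-checking/kani kani.git
+      #   git clone --depth 1 https://github.com/awslabs/aws-c-common aws-c-common.git
+      #   kani.git/tools/benchcomp/bin/benchcomp \
+      #     --config new/.github/workflows/benchcomp-config.yaml run
+      #   kani.git/tools/benchcomp/bin/benchcomp \
+      #     --config new/.github/workflows/benchcomp-config.yaml collate
+      #   kani.git/tools/benchcomp/bin/benchcomp \
+      #     --config new/.github/workflows/benchcomp-config.yaml visualize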