From f4a00081d46cc8c8964ce8deae3012ee9a0b376c Mon Sep 17 00:00:00 2001 From: Marko Grujic Date: Tue, 5 Mar 2024 10:36:27 +0100 Subject: [PATCH] Try running a basic comparison benchmark between base and pr commit --- .github/workflows/pr_benchmarks.yml | 59 ++++++++++++++++++++++ benchmarks/bench.sh | 12 +++-- datafusion/physical-plan/src/sorts/sort.rs | 3 ++ 3 files changed, 69 insertions(+), 5 deletions(-) create mode 100644 .github/workflows/pr_benchmarks.yml diff --git a/.github/workflows/pr_benchmarks.yml b/.github/workflows/pr_benchmarks.yml new file mode 100644 index 0000000000000..93021406bc6eb --- /dev/null +++ b/.github/workflows/pr_benchmarks.yml @@ -0,0 +1,59 @@ +name: Run and Cache Benchmarks + +on: + pull_request: + types: [labeled, opened, reopened, synchronize] + +jobs: + benchmark: + name: Run Benchmarks + runs-on: ubuntu-latest + steps: + - name: Dump GitHub context + env: + GITHUB_CONTEXT: ${{ toJSON(github) }} + run: echo "$GITHUB_CONTEXT" + + - name: Checkout PR changes + uses: actions/checkout@v4 + + - name: Setup data and generate unique result names + run: | + cd benchmarks + mkdir data + + # Setup the TPC-H data set with a scale factor of 10. + # Hack: pipe the output and explicitly pre-prend a carriage + # return to avoid the output staircase effect (e.g. percentage + # updates appearing on subsequent lines). + ./bench.sh data tpch | | awk '{printf "\r%s", $0; fflush();}' + + # Generate a unique-ish identifier for the results using + # branch name and commit sha + short_ref=$(echo "${{ github.head_ref }}" | cut -c1-20) + short_sha=$(echo "${{ github.sha }}" | cut -c1-7) + echo "HEAD_REF_SHA=$short_ref-$short_sha" >> "$GITHUB_ENV" + + short_sha=$(echo "${{ github.pull_request.base.sha }}" | cut -c1-7) + echo "BASE_REF_SHA=${{ github.base_ref }}-$short_sha" >> "$GITHUB_ENV" + + - name: Benchmark PR changes + run: | + cd benchmarks + + RESULTS_NAME=$HEAD_REF_SHA ./bench.sh run tpch + + - name: Checkout base commit + uses: actions/checkout@v4 + with: + ref: ${{ github.event.pull_request.base.sha }} + clean: false + + - name: Benchmark baseline and compare + run: | + cd benchmarks + + RESULTS_NAME=$BASE_REF_SHA ./bench.sh run tpch + + pip3 install rich + ./bench.sh compare $BASE_REF_SHA $HEAD_REF_SHA diff --git a/benchmarks/bench.sh b/benchmarks/bench.sh index 2ecd42920e096..5acdde6977756 100755 --- a/benchmarks/bench.sh +++ b/benchmarks/bench.sh @@ -82,6 +82,7 @@ clickbench_extended: ClickBench "inspired" queries against a single parquet ( DATA_DIR directory to store datasets CARGO_COMMAND command that runs the benchmark binary DATAFUSION_DIR directory to use (default $DATAFUSION_DIR) +RESULTS_NAME folder where the benchmark files are stored " exit 1 } @@ -166,18 +167,19 @@ main() { esac ;; run) - # Parse positional paraleters + # Parse positional parameters BENCHMARK=${ARG2:-"${BENCHMARK}"} BRANCH_NAME=$(cd ${DATAFUSION_DIR} && git rev-parse --abbrev-ref HEAD) BRANCH_NAME=${BRANCH_NAME//\//_} # mind blowing syntax to replace / with _ - RESULTS_DIR=${RESULTS_DIR:-"$SCRIPT_DIR/results/$BRANCH_NAME"} + RESULTS_NAME=${RESULTS_NAME:-"${BRANCH_NAME}"} + RESULTS_DIR=${RESULTS_DIR:-"$SCRIPT_DIR/results/$RESULTS_NAME"} echo "***************************" echo "DataFusion Benchmark Script" echo "COMMAND: ${COMMAND}" echo "BENCHMARK: ${BENCHMARK}" echo "DATAFUSION_DIR: ${DATAFUSION_DIR}" - echo "BRACH_NAME: ${BRANCH_NAME}" + echo "BRANCH_NAME: ${BRANCH_NAME}" echo "DATA_DIR: ${DATA_DIR}" echo "RESULTS_DIR: ${RESULTS_DIR}" echo "CARGO_COMMAND: ${CARGO_COMMAND}" @@ -278,7 +280,7 @@ data_tpch() { echo " tbl files exist ($FILE exists)." else echo " creating tbl files with tpch_dbgen..." - docker run -v "${TPCH_DIR}":/data -it --rm ghcr.io/scalytics/tpch-docker:main -vf -s ${SCALE_FACTOR} + docker run -v "${TPCH_DIR}":/data --rm ghcr.io/scalytics/tpch-docker:main -vf -s ${SCALE_FACTOR} fi # Copy expected answers into the ./data/answers directory if it does not already exist @@ -288,7 +290,7 @@ data_tpch() { else echo " Copying answers to ${TPCH_DIR}/answers" mkdir -p "${TPCH_DIR}/answers" - docker run -v "${TPCH_DIR}":/data -it --entrypoint /bin/bash --rm ghcr.io/scalytics/tpch-docker:main -c "cp -f /opt/tpch/2.18.0_rc2/dbgen/answers/* /data/answers/" + docker run -v "${TPCH_DIR}":/data --entrypoint /bin/bash --rm ghcr.io/scalytics/tpch-docker:main -c "cp -f /opt/tpch/2.18.0_rc2/dbgen/answers/* /data/answers/" fi # Create 'parquet' files from tbl diff --git a/datafusion/physical-plan/src/sorts/sort.rs b/datafusion/physical-plan/src/sorts/sort.rs index db352bb2c86f3..61639f42f0c0c 100644 --- a/datafusion/physical-plan/src/sorts/sort.rs +++ b/datafusion/physical-plan/src/sorts/sort.rs @@ -980,6 +980,9 @@ impl ExecutionPlan for SortExec { let batch = batch?; sorter.insert_batch(batch).await?; } + // Test whether benchmarks catch this + // TODO: remove before merge! + tokio::time::sleep(std::time::Duration::from_secs(1)).await; sorter.sort() }) .try_flatten(),