From 8fe7ffa763c615c495396d62000a5cc49819e0b4 Mon Sep 17 00:00:00 2001 From: Marko Grujic Date: Tue, 5 Mar 2024 10:36:27 +0100 Subject: [PATCH 1/3] Try running a basic comparison benchmark between base and pr commit --- .github/workflows/pr_benchmarks.yml | 63 ++++++++++++++++++++++ benchmarks/bench.sh | 12 +++-- datafusion/physical-plan/src/sorts/sort.rs | 3 ++ 3 files changed, 73 insertions(+), 5 deletions(-) create mode 100644 .github/workflows/pr_benchmarks.yml diff --git a/.github/workflows/pr_benchmarks.yml b/.github/workflows/pr_benchmarks.yml new file mode 100644 index 0000000000000..c3b0b914d2f97 --- /dev/null +++ b/.github/workflows/pr_benchmarks.yml @@ -0,0 +1,63 @@ +name: Run and Cache Benchmarks + +on: + pull_request: + types: [labeled, opened, reopened, synchronize] + +jobs: + benchmark: + name: Run Benchmarks + runs-on: ubuntu-latest + steps: + - name: Dump GitHub context + env: + GITHUB_CONTEXT: ${{ toJSON(github) }} + run: echo "$GITHUB_CONTEXT" + + - name: Checkout PR changes + uses: actions/checkout@v4 + + - name: Setup data and generate unique result names + run: | + cd benchmarks + mkdir data + + # Setup the TPC-H data set with a scale factor of 10 + ./bench.sh data tpch + + # Generate a unique-ish identifier for the results using + # branch name and commit sha + short_ref=$(echo "${{ github.head_ref }}" | cut -c1-20) + short_sha=$(echo "${{ github.sha }}" | cut -c1-7) + echo "HEAD_REF_SHA=$short_ref-$short_sha" >> "$GITHUB_ENV" + + short_sha=$(echo "${{ github.event.pull_request.base.sha }}" | cut -c1-7) + echo "BASE_REF_SHA=${{ github.base_ref }}-$short_sha" >> "$GITHUB_ENV" + + - name: Benchmark PR changes + env: + RESULTS_NAME: ${{ env.HEAD_REF_SHA }} + run: | + cd benchmarks + + ./bench.sh run tpch + + - name: Checkout base commit + uses: actions/checkout@v4 + with: + ref: ${{ github.event.pull_request.base.sha }} + clean: false + + - name: Benchmark baseline and compare + env: + RESULTS_NAME: ${{ env.BASE_REF_SHA }} + run: | + cd benchmarks + + ./bench.sh run tpch + + # Temporary workaround, until `RESULTS_NAME` var lands into main + mv -f results/HEAD results/${{ env.BASE_REF_SHA }} + + pip3 install rich + ./bench.sh compare ${{ env.BASE_REF_SHA }} ${{ env.HEAD_REF_SHA }} diff --git a/benchmarks/bench.sh b/benchmarks/bench.sh index 2ecd42920e096..5acdde6977756 100755 --- a/benchmarks/bench.sh +++ b/benchmarks/bench.sh @@ -82,6 +82,7 @@ clickbench_extended: ClickBench "inspired" queries against a single parquet ( DATA_DIR directory to store datasets CARGO_COMMAND command that runs the benchmark binary DATAFUSION_DIR directory to use (default $DATAFUSION_DIR) +RESULTS_NAME folder where the benchmark files are stored " exit 1 } @@ -166,18 +167,19 @@ main() { esac ;; run) - # Parse positional paraleters + # Parse positional parameters BENCHMARK=${ARG2:-"${BENCHMARK}"} BRANCH_NAME=$(cd ${DATAFUSION_DIR} && git rev-parse --abbrev-ref HEAD) BRANCH_NAME=${BRANCH_NAME//\//_} # mind blowing syntax to replace / with _ - RESULTS_DIR=${RESULTS_DIR:-"$SCRIPT_DIR/results/$BRANCH_NAME"} + RESULTS_NAME=${RESULTS_NAME:-"${BRANCH_NAME}"} + RESULTS_DIR=${RESULTS_DIR:-"$SCRIPT_DIR/results/$RESULTS_NAME"} echo "***************************" echo "DataFusion Benchmark Script" echo "COMMAND: ${COMMAND}" echo "BENCHMARK: ${BENCHMARK}" echo "DATAFUSION_DIR: ${DATAFUSION_DIR}" - echo "BRACH_NAME: ${BRANCH_NAME}" + echo "BRANCH_NAME: ${BRANCH_NAME}" echo "DATA_DIR: ${DATA_DIR}" echo "RESULTS_DIR: ${RESULTS_DIR}" echo "CARGO_COMMAND: ${CARGO_COMMAND}" @@ -278,7 +280,7 @@ data_tpch() { echo " tbl files exist ($FILE exists)." else echo " creating tbl files with tpch_dbgen..." - docker run -v "${TPCH_DIR}":/data -it --rm ghcr.io/scalytics/tpch-docker:main -vf -s ${SCALE_FACTOR} + docker run -v "${TPCH_DIR}":/data --rm ghcr.io/scalytics/tpch-docker:main -vf -s ${SCALE_FACTOR} fi # Copy expected answers into the ./data/answers directory if it does not already exist @@ -288,7 +290,7 @@ data_tpch() { else echo " Copying answers to ${TPCH_DIR}/answers" mkdir -p "${TPCH_DIR}/answers" - docker run -v "${TPCH_DIR}":/data -it --entrypoint /bin/bash --rm ghcr.io/scalytics/tpch-docker:main -c "cp -f /opt/tpch/2.18.0_rc2/dbgen/answers/* /data/answers/" + docker run -v "${TPCH_DIR}":/data --entrypoint /bin/bash --rm ghcr.io/scalytics/tpch-docker:main -c "cp -f /opt/tpch/2.18.0_rc2/dbgen/answers/* /data/answers/" fi # Create 'parquet' files from tbl diff --git a/datafusion/physical-plan/src/sorts/sort.rs b/datafusion/physical-plan/src/sorts/sort.rs index db352bb2c86f3..61639f42f0c0c 100644 --- a/datafusion/physical-plan/src/sorts/sort.rs +++ b/datafusion/physical-plan/src/sorts/sort.rs @@ -980,6 +980,9 @@ impl ExecutionPlan for SortExec { let batch = batch?; sorter.insert_batch(batch).await?; } + // Test whether benchmarks catch this + // TODO: remove before merge! + tokio::time::sleep(std::time::Duration::from_secs(1)).await; sorter.sort() }) .try_flatten(), From 831c6eae3b3f524308c4b17a663551af7d357c60 Mon Sep 17 00:00:00 2001 From: Marko Grujic Date: Tue, 5 Mar 2024 23:20:45 +0100 Subject: [PATCH 2/3] Add job for commenting benchmark results on the PR --- .github/workflows/pr_benchmarks.yml | 68 +++++++++++++++++++++++++++-- .github/workflows/pr_comment.yml | 53 ++++++++++++++++++++++ 2 files changed, 117 insertions(+), 4 deletions(-) create mode 100644 .github/workflows/pr_comment.yml diff --git a/.github/workflows/pr_benchmarks.yml b/.github/workflows/pr_benchmarks.yml index c3b0b914d2f97..a8c3d4a5473f4 100644 --- a/.github/workflows/pr_benchmarks.yml +++ b/.github/workflows/pr_benchmarks.yml @@ -1,8 +1,7 @@ -name: Run and Cache Benchmarks +name: Benchmarks on: pull_request: - types: [labeled, opened, reopened, synchronize] jobs: benchmark: @@ -48,7 +47,7 @@ jobs: ref: ${{ github.event.pull_request.base.sha }} clean: false - - name: Benchmark baseline and compare + - name: Benchmark baseline and generate comparison message env: RESULTS_NAME: ${{ env.BASE_REF_SHA }} run: | @@ -59,5 +58,66 @@ jobs: # Temporary workaround, until `RESULTS_NAME` var lands into main mv -f results/HEAD results/${{ env.BASE_REF_SHA }} + echo ${{ github.event.pull_request.number }} > pr + pip3 install rich - ./bench.sh compare ${{ env.BASE_REF_SHA }} ${{ env.HEAD_REF_SHA }} + cat > message.md < + Benchmarks comparing ${{ github.event.pull_request.base.sha }} and ${{ github.sha }} + + \`\`\` + $(./bench.sh compare ${{ env.BASE_REF_SHA }} ${{ env.HEAD_REF_SHA }}) + \`\`\` + + + EOF + + cat message.md + + - name: Upload benchmark comparison message + uses: actions/upload-artifact@v4 + with: + name: message + path: benchmarks/message.md + + - name: Upload PR number + uses: actions/upload-artifact@v4 + with: + name: pr + path: benchmarks/pr + + comment: + name: Post benchmarks comment + runs-on: ubuntu-latest + needs: [ benchmark ] + steps: + - name: Download comment message + uses: actions/download-artifact@v4 + with: + name: message + + - name: Download pr number + uses: actions/download-artifact@v4 + with: + name: pr + + - name: Print message and pr number + run: | + cat pr + echo "PR_NUMBER=$(cat pr)" >> "$GITHUB_ENV" + cat message.md + + - name: Post a comment + uses: actions/github-script@v7 + with: + script: | + const fs = require('fs'); + const content = fs.readFileSync('message.md', 'utf8'); + github.rest.issues.createComment({ + issue_number: process.env.PR_NUMBER, + owner: context.repo.owner, + repo: context.repo.repo, + body: content, + }) diff --git a/.github/workflows/pr_comment.yml b/.github/workflows/pr_comment.yml new file mode 100644 index 0000000000000..440063565288a --- /dev/null +++ b/.github/workflows/pr_comment.yml @@ -0,0 +1,53 @@ +name: PR Comment + +on: + workflow_run: + workflows: ["Benchmarks"] + types: + - completed + +jobs: + comment: + name: PR Comment + runs-on: ubuntu-latest + if: > + github.event.workflow_run.event == 'pull_request' && + github.event.workflow_run.conclusion == 'success' + steps: + - name: Dump GitHub context + env: + GITHUB_CONTEXT: ${{ toJSON(github) }} + run: echo "$GITHUB_CONTEXT" + + - name: Download comment message + uses: actions/download-artifact@v4 + with: + name: message + run-id: ${{ github.event.workflow_run.id }} + github-token: ${{ secrets.GITHUB_TOKEN }} + + - name: Download pr number + uses: actions/download-artifact@v4 + with: + name: pr + run-id: ${{ github.event.workflow_run.id }} + github-token: ${{ secrets.GITHUB_TOKEN }} + + - name: Print message and pr number + run: | + cat pr + echo "PR_NUMBER=$(cat pr)" >> "$GITHUB_ENV" + cat message.md + + - name: Post the comment + uses: actions/github-script@v7 + with: + script: | + const fs = require('fs'); + const content = fs.readFileSync('message.md', 'utf8'); + github.rest.issues.createComment({ + issue_number: process.env.PR_NUMBER, + owner: context.repo.owner, + repo: context.repo.repo, + body: content, + }) From d01f5592665aa79defd46f04ff7edae8d1f54523 Mon Sep 17 00:00:00 2001 From: Marko Grujic Date: Thu, 7 Mar 2024 18:53:30 +0100 Subject: [PATCH 3/3] Trigger benchmark workflow on a pr comment to avoid noise --- .github/workflows/pr_benchmarks.yml | 56 +++++----------------- .github/workflows/pr_comment.yml | 4 +- datafusion/physical-plan/src/sorts/sort.rs | 3 -- 3 files changed, 12 insertions(+), 51 deletions(-) diff --git a/.github/workflows/pr_benchmarks.yml b/.github/workflows/pr_benchmarks.yml index a8c3d4a5473f4..d930a989d8ca3 100644 --- a/.github/workflows/pr_benchmarks.yml +++ b/.github/workflows/pr_benchmarks.yml @@ -1,12 +1,13 @@ name: Benchmarks on: - pull_request: + issue_comment: jobs: benchmark: name: Run Benchmarks runs-on: ubuntu-latest + if: github.event.issue.pull_request && contains(github.event.comment.body, '/benchmark') steps: - name: Dump GitHub context env: @@ -15,6 +16,8 @@ jobs: - name: Checkout PR changes uses: actions/checkout@v4 + with: + ref: refs/pull/${{ github.event.issue.number }}/head - name: Setup data and generate unique result names run: | @@ -24,14 +27,11 @@ jobs: # Setup the TPC-H data set with a scale factor of 10 ./bench.sh data tpch - # Generate a unique-ish identifier for the results using - # branch name and commit sha - short_ref=$(echo "${{ github.head_ref }}" | cut -c1-20) - short_sha=$(echo "${{ github.sha }}" | cut -c1-7) - echo "HEAD_REF_SHA=$short_ref-$short_sha" >> "$GITHUB_ENV" + # Generate a unique-ish identifiers for the results + echo "HEAD_REF_SHA=pr-${{ github.event.issue.number }}" >> "$GITHUB_ENV" - short_sha=$(echo "${{ github.event.pull_request.base.sha }}" | cut -c1-7) - echo "BASE_REF_SHA=${{ github.base_ref }}-$short_sha" >> "$GITHUB_ENV" + short_sha=$(echo "${{ github.sha }}" | cut -c1-7) + echo "BASE_REF_SHA=main-$short_sha" >> "$GITHUB_ENV" - name: Benchmark PR changes env: @@ -44,7 +44,7 @@ jobs: - name: Checkout base commit uses: actions/checkout@v4 with: - ref: ${{ github.event.pull_request.base.sha }} + ref: ${{ github.sha }} clean: false - name: Benchmark baseline and generate comparison message @@ -58,14 +58,14 @@ jobs: # Temporary workaround, until `RESULTS_NAME` var lands into main mv -f results/HEAD results/${{ env.BASE_REF_SHA }} - echo ${{ github.event.pull_request.number }} > pr + echo ${{ github.event.issue.number }} > pr pip3 install rich cat > message.md < - Benchmarks comparing ${{ github.event.pull_request.base.sha }} and ${{ github.sha }} + Benchmarks comparing ${{ github.sha }} and PR ${{ github.event.issue.number }} \`\`\` $(./bench.sh compare ${{ env.BASE_REF_SHA }} ${{ env.HEAD_REF_SHA }}) @@ -87,37 +87,3 @@ jobs: with: name: pr path: benchmarks/pr - - comment: - name: Post benchmarks comment - runs-on: ubuntu-latest - needs: [ benchmark ] - steps: - - name: Download comment message - uses: actions/download-artifact@v4 - with: - name: message - - - name: Download pr number - uses: actions/download-artifact@v4 - with: - name: pr - - - name: Print message and pr number - run: | - cat pr - echo "PR_NUMBER=$(cat pr)" >> "$GITHUB_ENV" - cat message.md - - - name: Post a comment - uses: actions/github-script@v7 - with: - script: | - const fs = require('fs'); - const content = fs.readFileSync('message.md', 'utf8'); - github.rest.issues.createComment({ - issue_number: process.env.PR_NUMBER, - owner: context.repo.owner, - repo: context.repo.repo, - body: content, - }) diff --git a/.github/workflows/pr_comment.yml b/.github/workflows/pr_comment.yml index 440063565288a..4b7b80632d13a 100644 --- a/.github/workflows/pr_comment.yml +++ b/.github/workflows/pr_comment.yml @@ -10,9 +10,7 @@ jobs: comment: name: PR Comment runs-on: ubuntu-latest - if: > - github.event.workflow_run.event == 'pull_request' && - github.event.workflow_run.conclusion == 'success' + if: github.event.workflow_run.conclusion == 'success' steps: - name: Dump GitHub context env: diff --git a/datafusion/physical-plan/src/sorts/sort.rs b/datafusion/physical-plan/src/sorts/sort.rs index 61639f42f0c0c..db352bb2c86f3 100644 --- a/datafusion/physical-plan/src/sorts/sort.rs +++ b/datafusion/physical-plan/src/sorts/sort.rs @@ -980,9 +980,6 @@ impl ExecutionPlan for SortExec { let batch = batch?; sorter.insert_batch(batch).await?; } - // Test whether benchmarks catch this - // TODO: remove before merge! - tokio::time::sleep(std::time::Duration::from_secs(1)).await; sorter.sort() }) .try_flatten(),