diff --git a/.github/workflows/pr_benchmarks.yml b/.github/workflows/pr_benchmarks.yml
new file mode 100644
index 0000000000000..9fca68f58ba0d
--- /dev/null
+++ b/.github/workflows/pr_benchmarks.yml
@@ -0,0 +1,90 @@
+# Runs the TPC-H benchmarks for a pull request when a PR comment contains
+# `/benchmark`, then benchmarks the base commit and uploads the comparison
+# (plus the PR number) as artifacts for the "PR Comment" workflow to post.
+name: Benchmarks
+
+on:
+  issue_comment:
+
+jobs:
+  benchmark:
+    name: Run Benchmarks
+    runs-on: ubuntu-latest
+    # issue_comment also fires for plain issues; only react to PR comments.
+    if: github.event.issue.pull_request && contains(github.event.comment.body, '/benchmark')
+    steps:
+      - name: Dump GitHub context
+        env:
+          GITHUB_CONTEXT: ${{ toJSON(github) }}
+        run: echo "$GITHUB_CONTEXT"
+
+      - name: Checkout PR changes
+        uses: actions/checkout@v4
+        with:
+          ref: refs/pull/${{ github.event.issue.number }}/head
+
+      - name: Setup data and generate unique result names
+        run: |
+          cd benchmarks
+          mkdir data
+
+          # Set up the TPC-H data set (NOTE(review): confirm the scale factor used by `data tpch`)
+          ./bench.sh data tpch
+
+          # Generate unique-ish identifiers for the results
+          echo "HEAD_REF_SHA=pr-${{ github.event.issue.number }}" >> "$GITHUB_ENV"
+
+          short_sha=$(echo "${{ github.sha }}" | cut -c1-7)
+          echo "BASE_REF_SHA=main-$short_sha" >> "$GITHUB_ENV"
+
+      - name: Benchmark PR changes
+        env:
+          RESULTS_NAME: ${{ env.HEAD_REF_SHA }}
+        run: |
+          cd benchmarks
+
+          ./bench.sh run tpch
+
+      - name: Checkout base commit
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ github.sha }}
+          # Skip the pre-checkout clean so files produced by earlier steps are kept.
+          clean: false
+
+      - name: Benchmark baseline and generate comparison message
+        env:
+          RESULTS_NAME: ${{ env.BASE_REF_SHA }}
+        run: |
+          cd benchmarks
+
+          ./bench.sh run tpch
+
+          echo ${{ github.event.issue.number }} > pr
+
+          pip3 install rich
+          # Unquoted delimiter so $(...) is expanded; backticks are escaped to stay literal.
+          cat > message.md <<EOF
+          <details>
+          <summary>Benchmarks comparing ${{ github.sha }} and PR ${{ github.event.issue.number }}</summary>
+
+          \`\`\`
+          $(./bench.sh compare ${{ env.BASE_REF_SHA }} ${{ env.HEAD_REF_SHA }})
+          \`\`\`
+
+          </details>
+          EOF
+
+          cat message.md
+
+      - name: Upload benchmark comparison message
+        uses: actions/upload-artifact@v4
+        with:
+          name: message
+          path: benchmarks/message.md
+
+      - name: Upload PR number
+        uses: actions/upload-artifact@v4
+        with:
+          name: pr
+          path: benchmarks/pr
diff --git a/.github/workflows/pr_comment.yml b/.github/workflows/pr_comment.yml
new file mode 100644
index 0000000000000..4b7b80632d13a
--- /dev/null
+++ b/.github/workflows/pr_comment.yml
@@ -0,0 +1,61 @@
+# Posts the benchmark comparison produced by the "Benchmarks" workflow as a
+# comment on the pull request whose number was uploaded alongside it.
+name: PR Comment
+
+on:
+  workflow_run:
+    workflows: ["Benchmarks"]
+    types:
+      - completed
+
+jobs:
+  comment:
+    name: PR Comment
+    runs-on: ubuntu-latest
+    if: github.event.workflow_run.conclusion == 'success'
+    steps:
+      - name: Dump GitHub context
+        env:
+          GITHUB_CONTEXT: ${{ toJSON(github) }}
+        run: echo "$GITHUB_CONTEXT"
+
+      - name: Download comment message
+        uses: actions/download-artifact@v4
+        with:
+          name: message
+          run-id: ${{ github.event.workflow_run.id }}
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Download pr number
+        uses: actions/download-artifact@v4
+        with:
+          name: pr
+          run-id: ${{ github.event.workflow_run.id }}
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Print message and pr number
+        run: |
+          cat pr
+          # The artifact comes from a run that executed PR code, so only
+          # accept a plain number before writing it to GITHUB_ENV.
+          pr_number=$(cat pr)
+          if ! [[ "$pr_number" =~ ^[0-9]+$ ]]; then
+            echo "Unexpected PR number: $pr_number" >&2
+            exit 1
+          fi
+          echo "PR_NUMBER=$pr_number" >> "$GITHUB_ENV"
+          cat message.md
+
+      - name: Post the comment
+        uses: actions/github-script@v7
+        with:
+          script: |
+            const fs = require('fs');
+            const content = fs.readFileSync('message.md', 'utf8');
+            // Await the request so a failed API call fails this step.
+            await github.rest.issues.createComment({
+              issue_number: process.env.PR_NUMBER,
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              body: content,
+            });
diff --git a/benchmarks/bench.sh b/benchmarks/bench.sh
index 2ecd42920e096..5acdde6977756 100755
--- a/benchmarks/bench.sh
+++ b/benchmarks/bench.sh
@@ -82,6 +82,7 @@ clickbench_extended: ClickBench "inspired" queries against a single parquet (
 DATA_DIR directory to store datasets
 CARGO_COMMAND command that runs the benchmark binary
 DATAFUSION_DIR directory to use (default $DATAFUSION_DIR)
+RESULTS_NAME folder where the benchmark files are stored
 "
     exit 1
 }
@@ -166,18 +167,19 @@ main() {
             esac
             ;;
         run)
-            # Parse positional paraleters
+            # Parse positional parameters
             BENCHMARK=${ARG2:-"${BENCHMARK}"}
             BRANCH_NAME=$(cd ${DATAFUSION_DIR} && git rev-parse --abbrev-ref HEAD)
             BRANCH_NAME=${BRANCH_NAME//\//_} # mind blowing syntax to replace / with _
-            RESULTS_DIR=${RESULTS_DIR:-"$SCRIPT_DIR/results/$BRANCH_NAME"}
+            RESULTS_NAME=${RESULTS_NAME:-"${BRANCH_NAME}"}
+            RESULTS_DIR=${RESULTS_DIR:-"$SCRIPT_DIR/results/$RESULTS_NAME"}
 
             echo "***************************"
             echo "DataFusion Benchmark Script"
             echo "COMMAND: ${COMMAND}"
             echo "BENCHMARK: ${BENCHMARK}"
             echo "DATAFUSION_DIR: ${DATAFUSION_DIR}"
-            echo "BRACH_NAME: ${BRANCH_NAME}"
+            echo "BRANCH_NAME: ${BRANCH_NAME}"
             echo "DATA_DIR: ${DATA_DIR}"
             echo "RESULTS_DIR: ${RESULTS_DIR}"
             echo "CARGO_COMMAND: ${CARGO_COMMAND}"
@@ -278,7 +280,7 @@ data_tpch() {
         echo " tbl files exist ($FILE exists)."
     else
         echo " creating tbl files with tpch_dbgen..."
-        docker run -v "${TPCH_DIR}":/data -it --rm ghcr.io/scalytics/tpch-docker:main -vf -s ${SCALE_FACTOR}
+        docker run -v "${TPCH_DIR}":/data --rm ghcr.io/scalytics/tpch-docker:main -vf -s ${SCALE_FACTOR}
     fi
 
     # Copy expected answers into the ./data/answers directory if it does not already exist
@@ -288,7 +290,7 @@ data_tpch() {
     else
         echo " Copying answers to ${TPCH_DIR}/answers"
         mkdir -p "${TPCH_DIR}/answers"
-        docker run -v "${TPCH_DIR}":/data -it --entrypoint /bin/bash --rm ghcr.io/scalytics/tpch-docker:main -c "cp -f /opt/tpch/2.18.0_rc2/dbgen/answers/* /data/answers/"
+        docker run -v "${TPCH_DIR}":/data --entrypoint /bin/bash --rm ghcr.io/scalytics/tpch-docker:main -c "cp -f /opt/tpch/2.18.0_rc2/dbgen/answers/* /data/answers/"
     fi
 
     # Create 'parquet' files from tbl