Skip to content

[FEAT] [Join Optimizations] Add sort-merge join. #5547

[FEAT] [Join Optimizations] Add sort-merge join.

[FEAT] [Join Optimizations] Add sort-merge join. #5547

Workflow file for this run

# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
name: daft
on:
push:
branches: [main]
pull_request:
branches: [main]
env:
DAFT_ANALYTICS_ENABLED: '0'
jobs:
unit-tests-with-coverage:
runs-on: ${{ matrix.os }}-latest
timeout-minutes: 30
strategy:
fail-fast: false
matrix:
python-version: ['3.7', '3.10']
daft-runner: [py, ray]
pyarrow-version: [6.0.1, 12.0]
os: [ubuntu, windows]
micropartitions: [1, 0]
exclude:
- daft-runner: ray
pyarrow-version: 6.0.1
os: ubuntu
- daft-runner: py
python-version: '3.10'
pyarrow-version: 6.0.1
os: ubuntu
- os: windows
pyarrow-version: 12.0
steps:
- uses: actions/checkout@v4
- uses: moonrepo/setup-rust@v1
with:
cache: false
- name: Install cargo-llvm-cov
uses: taiki-e/install-action@cargo-llvm-cov
- uses: Swatinem/rust-cache@v2
with:
key: ${{ runner.os }}-build
cache-all-crates: 'true'
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
cache: pip
cache-dependency-path: |
pyproject.toml
requirements-dev.txt
- name: Setup Virtual Env
run: |
python -m venv venv
echo "$GITHUB_WORKSPACE/venv/bin" >> $GITHUB_PATH
- name: Install dependencies
if: ${{ (matrix.os != 'windows') }}
run: |
pip install --upgrade pip
pip install -r requirements-dev.txt
- name: Install dependencies (Windows)
if: ${{ (matrix.os == 'windows') }}
run: |
.\venv\Scripts\activate
python -m pip install --upgrade pip
python -m pip install -r requirements-dev.txt
- name: Override pyarrow
if: ${{ (matrix.pyarrow-version) && (matrix.os != 'windows') }}
run: pip install pyarrow==${{ matrix.pyarrow-version }}
- name: Build library and Test with pytest (unix)
if: ${{ (matrix.os != 'windows') }}
run: |
source activate
# source <(cargo llvm-cov show-env --export-prefix)
# export CARGO_TARGET_DIR=$CARGO_LLVM_COV_TARGET_DIR
# export CARGO_INCREMENTAL=1
# cargo llvm-cov clean --workspace
maturin develop
mkdir -p report-output && pytest --cov=daft --ignore tests/integration --durations=50
coverage combine -a --data-file='.coverage' || true
coverage xml -o ./report-output/coverage-${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.daft-runner }}-${{ matrix.pyarrow-version }}.xml
# cargo llvm-cov --no-run --lcov --output-path report-output/rust-coverage-${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.daft-runner }}-${{ matrix.pyarrow-version }}.lcov
env:
DAFT_RUNNER: ${{ matrix.daft-runner }}
DAFT_MICROPARTITIONS: ${{ matrix.micropartitions }}
- name: Build library and Test with pytest (windows)
if: ${{ (matrix.os == 'windows') }}
run: |
.\venv\Scripts\activate
# source <(cargo llvm-cov show-env --export-prefix)
# export CARGO_TARGET_DIR=$CARGO_LLVM_COV_TARGET_DIR
# export CARGO_INCREMENTAL=1
# cargo llvm-cov clean --workspace
maturin develop
mkdir -p report-output && pytest --cov=daft --ignore tests\integration --durations=50
coverage combine -a --data-file='.coverage' || true
coverage xml -o .\report-output\coverage-${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.daft-runner }}-${{ matrix.pyarrow-version }}.xml
# cargo llvm-cov --no-run --lcov --output-path report-output\rust-coverage-${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.daft-runner }}-${{ matrix.pyarrow-version }}.lcov
env:
DAFT_RUNNER: ${{ matrix.daft-runner }}
- name: Upload coverage report
uses: actions/upload-artifact@v3
with:
name: coverage-reports
path: ./report-output
- name: Send Slack notification on failure
uses: slackapi/[email protected]
if: ${{ failure() && (github.ref == 'refs/heads/main') }}
with:
payload: |
{
"blocks": [
{
"type": "section",
"text": {
"type": "mrkdwn",
"text": ":rotating_light: [CI] Pytest Unit Tests <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|workflow> *FAILED on main* :rotating_light:"
}
}
]
}
env:
SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
SLACK_WEBHOOK_TYPE: INCOMING_WEBHOOK
integration-test-build:
runs-on: ubuntu-latest
timeout-minutes: 30
env:
package-name: getdaft
strategy:
matrix:
python-version: ['3.7']
steps:
- uses: actions/checkout@v4
with:
submodules: true
fetch-depth: 0
- uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
architecture: x64
- run: pip install -U twine toml maturin
- uses: moonrepo/setup-rust@v1
with:
cache: false
- uses: Swatinem/rust-cache@v2
with:
key: ${{ runner.os }}-integration-build
cache-all-crates: 'true'
# NOTE: we don't build with all the actual release optimizations to avoid hellish CI times
- name: Build wheels
run: maturin build --release --compatibility linux --out dist
- name: Upload wheels
uses: actions/upload-artifact@v3
with:
name: wheels
path: dist
integration-test-tpch:
runs-on: ubuntu-latest
timeout-minutes: 30
needs:
- integration-test-build
env:
package-name: getdaft
strategy:
fail-fast: false
matrix:
python-version: ['3.7']
daft-runner: [py, ray]
micropartitions: [1, 0]
steps:
- uses: actions/checkout@v4
with:
submodules: true
fetch-depth: 0
- uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
architecture: x64
cache: pip
cache-dependency-path: |
pyproject.toml
requirements-dev.txt
- name: Download built wheels
uses: actions/download-artifact@v3
with:
name: wheels
path: dist
- name: Setup Virtual Env
run: |
python -m venv venv
echo "$GITHUB_WORKSPACE/venv/bin" >> $GITHUB_PATH
- name: Install Daft and dev dependencies
run: |
pip install --upgrade pip
pip install -r requirements-dev.txt dist/${{ env.package-name }}-*x86_64*.whl --force-reinstall
rm -rf daft
- uses: actions/cache@v3
env:
cache-name: cache-tpch-data
with:
path: data/tpch-dbgen
key: ${{ runner.os }}-build-${{ env.cache-name }}-${{ hashFiles('tests/integration/test_tpch.py', 'benchmarking/tpch/**') }}
- name: Run TPCH integration tests
run: |
pytest tests/integration/test_tpch.py --durations=50
env:
DAFT_RUNNER: ${{ matrix.daft-runner }}
DAFT_MICROPARTITIONS: ${{ matrix.micropartitions }}
- name: Send Slack notification on failure
uses: slackapi/[email protected]
if: ${{ failure() && (github.ref == 'refs/heads/main') }}
with:
payload: |
{
"blocks": [
{
"type": "section",
"text": {
"type": "mrkdwn",
"text": ":rotating_light: [CI] TPCH Integration Tests <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|workflow> *FAILED on main* :rotating_light:"
}
}
]
}
env:
SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
SLACK_WEBHOOK_TYPE: INCOMING_WEBHOOK
integration-test-io:
runs-on: ubuntu-latest
timeout-minutes: 30
needs:
- integration-test-build
env:
package-name: getdaft
strategy:
fail-fast: false
matrix:
python-version: ['3.8'] # can't use 3.7 due to requiring anon mode for adlfs
daft-runner: [py, ray]
micropartitions: [1, 0]
steps:
- uses: actions/checkout@v4
with:
submodules: true
fetch-depth: 0
- uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
architecture: x64
cache: pip
cache-dependency-path: |
pyproject.toml
requirements-dev.txt
- name: Download built wheels
uses: actions/download-artifact@v3
with:
name: wheels
path: dist
- name: Setup Virtual Env
run: |
python -m venv venv
echo "$GITHUB_WORKSPACE/venv/bin" >> $GITHUB_PATH
- name: Install Daft and dev dependencies
run: |
pip install --upgrade pip
pip install -r requirements-dev.txt dist/${{ env.package-name }}-*x86_64*.whl --force-reinstall
rm -rf daft
- name: Prepare tmpdirs for IO services
run: |
mkdir -p /tmp/daft-integration-testing/nginx
chmod +rw /tmp/daft-integration-testing/nginx
- name: Spin up IO services
uses: isbang/[email protected]
with:
compose-file: ./tests/integration/io/docker-compose/docker-compose.yml
down-flags: --volumes
- name: Run IO integration tests
run: |
pytest tests/integration/io -m 'integration' --durations=50
env:
DAFT_RUNNER: ${{ matrix.daft-runner }}
DAFT_MICROPARTITIONS: ${{ matrix.micropartitions }}
- name: Send Slack notification on failure
uses: slackapi/[email protected]
if: ${{ failure() && (github.ref == 'refs/heads/main') }}
with:
payload: |
{
"blocks": [
{
"type": "section",
"text": {
"type": "mrkdwn",
"text": ":rotating_light: [CI] IO Integration Tests <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|workflow> *FAILED on main* :rotating_light:"
}
}
]
}
env:
SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
SLACK_WEBHOOK_TYPE: INCOMING_WEBHOOK
# Same as integration-test-io but runs the tests that require credentials, only on `main`
integration-test-io-credentialed:
if: ${{ github.ref == 'refs/heads/main' }}
runs-on: ubuntu-latest
timeout-minutes: 30
needs:
- integration-test-build
env:
package-name: getdaft
strategy:
fail-fast: false
matrix:
python-version: ['3.8'] # can't use 3.7 due to requiring anon mode for adlfs
daft-runner: [py, ray]
micropartitions: [1, 0]
# These permissions are needed to interact with GitHub's OIDC Token endpoint.
# This is used in the step "Assume GitHub Actions AWS Credentials"
permissions:
id-token: write
contents: read
steps:
- uses: actions/checkout@v4
with:
submodules: true
fetch-depth: 0
- uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
architecture: x64
cache: pip
cache-dependency-path: |
pyproject.toml
requirements-dev.txt
- name: Download built wheels
uses: actions/download-artifact@v3
with:
name: wheels
path: dist
- name: Setup Virtual Env
run: |
python -m venv venv
echo "$GITHUB_WORKSPACE/venv/bin" >> $GITHUB_PATH
- name: Install Daft and dev dependencies
run: |
pip install --upgrade pip
pip install -r requirements-dev.txt dist/${{ env.package-name }}-*x86_64*.whl --force-reinstall
rm -rf daft
- name: Prepare tmpdirs for IO services
run: |
mkdir -p /tmp/daft-integration-testing/nginx
chmod +rw /tmp/daft-integration-testing/nginx
- name: Assume GitHub Actions AWS Credentials
uses: aws-actions/configure-aws-credentials@v4
with:
aws-region: us-west-2
role-to-assume: ${{ secrets.ACTIONS_AWS_ROLE_ARN }}
role-session-name: DaftPythonPackageGitHubWorkflow
- name: Assume GitHub Actions GCloud Credentials
uses: google-github-actions/auth@v2
with:
credentials_json: ${{ secrets.ACTIONS_GCP_SERVICE_ACCOUNT_JSON }}
# NOTE: Workload Identity seems to be having problems with our Rust crate, so we use JSON instead
# See issue: https://github.com/yoshidan/google-cloud-rust/issues/171#issuecomment-1730511655
# workload_identity_provider: ${{ secrets.ACTIONS_GCP_WORKLOAD_IDENTITY_PROVIDER }}
# service_account: ${{ secrets.ACTIONS_GCP_SERVICE_ACCOUNT }}
- name: Spin up IO services
uses: isbang/[email protected]
with:
compose-file: ./tests/integration/io/docker-compose/docker-compose.yml
down-flags: --volumes
- name: Run IO integration tests
run: |
pytest tests/integration/io -m 'integration' --credentials --durations=50
env:
DAFT_RUNNER: ${{ matrix.daft-runner }}
DAFT_MICROPARTITIONS: ${{ matrix.micropartitions }}
- name: Send Slack notification on failure
uses: slackapi/[email protected]
if: ${{ failure() }}
with:
payload: |
{
"blocks": [
{
"type": "section",
"text": {
"type": "mrkdwn",
"text": ":rotating_light: [CI] IO Integration Tests <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|workflow> *FAILED on main* :rotating_light:"
}
}
]
}
env:
SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
SLACK_WEBHOOK_TYPE: INCOMING_WEBHOOK
integration-test-iceberg:
# Dependabot-triggered PRs don't have access to the required secrets for this to run successfully
if: ${{ github.triggering_actor != 'dependabot[bot]' }}
runs-on: ubuntu-latest
timeout-minutes: 30
needs:
- integration-test-build
env:
package-name: getdaft
strategy:
fail-fast: false
matrix:
python-version: ['3.8'] # can't use 3.7 due to requiring anon mode for adlfs
daft-runner: [py, ray]
micropartitions: [1]
steps:
- uses: actions/checkout@v4
with:
submodules: true
fetch-depth: 0
- uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
architecture: x64
cache: pip
cache-dependency-path: |
pyproject.toml
requirements-dev.txt
- name: Download built wheels
uses: actions/download-artifact@v3
with:
name: wheels
path: dist
- name: Setup Virtual Env
run: |
python -m venv venv
echo "$GITHUB_WORKSPACE/venv/bin" >> $GITHUB_PATH
- name: Install Daft and dev dependencies
run: |
pip install --upgrade pip
pip install -r requirements-dev.txt dist/${{ env.package-name }}-*x86_64*.whl --force-reinstall
rm -rf daft
- uses: docker/setup-buildx-action@v3
- uses: docker/build-push-action@v5
with:
context: ./tests/integration/iceberg/docker-compose/
file: ./tests/integration/iceberg/docker-compose/Dockerfile
cache-from: type=gha
cache-to: type=gha,mode=min
- name: Spin up services
run: |
pushd ./tests/integration/iceberg/docker-compose/
docker-compose -f ./docker-compose.yml up -d
popd
- name: Run Iceberg integration tests
run: |
pytest tests/integration/iceberg -m 'integration' --durations=50
env:
DAFT_RUNNER: ${{ matrix.daft-runner }}
DAFT_MICROPARTITIONS: ${{ matrix.micropartitions }}
- name: Send Slack notification on failure
uses: slackapi/[email protected]
if: ${{ failure() && (github.ref == 'refs/heads/main') }}
with:
payload: |
{
"blocks": [
{
"type": "section",
"text": {
"type": "mrkdwn",
"text": ":rotating_light: [CI] Iceberg Integration Tests <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|workflow> *FAILED on main* :rotating_light:"
}
}
]
}
env:
SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
SLACK_WEBHOOK_TYPE: INCOMING_WEBHOOK
rust-tests:
runs-on: ${{ matrix.os }}-latest
timeout-minutes: 30
strategy:
fail-fast: false
matrix:
os: [ubuntu, windows]
steps:
- uses: actions/checkout@v4
- uses: moonrepo/setup-rust@v1
with:
cache: false
- uses: Swatinem/rust-cache@v2
with:
key: ${{ runner.os }}-rust-build
cache-all-crates: 'true'
- name: Install cargo-llvm-cov
uses: taiki-e/install-action@cargo-llvm-cov
- name: Generate code coverage
run: mkdir -p report-output && cargo llvm-cov --no-default-features --workspace --lcov --output-path ./report-output/lcov.info
- name: Upload coverage report
uses: actions/upload-artifact@v3
with:
name: coverage-reports
path: ./report-output
- name: Send Slack notification on failure
uses: slackapi/[email protected]
if: ${{ failure() && (github.ref == 'refs/heads/main') }}
with:
payload: |
{
"blocks": [
{
"type": "section",
"text": {
"type": "mrkdwn",
"text": ":rotating_light: [CI] Rust Unit Tests <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|workflow> *FAILED on main* :rotating_light:"
}
}
]
}
env:
SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
SLACK_WEBHOOK_TYPE: INCOMING_WEBHOOK
publish-coverage-reports:
name: Publish coverage reports to CodeCov
runs-on: ubuntu-latest
needs:
- unit-tests-with-coverage
- rust-tests
steps:
- uses: actions/checkout@v4
- uses: actions/download-artifact@v3
with:
name: coverage-reports
path: ./report-output
- name: Upload coverage reports to Codecov with GitHub Action
uses: codecov/codecov-action@v3
env:
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
files: ./report-output/*
- name: Send Slack notification on failure
uses: slackapi/[email protected]
if: ${{ failure() && (github.ref == 'refs/heads/main') }}
with:
payload: |
{
"blocks": [
{
"type": "section",
"text": {
"type": "mrkdwn",
"text": ":rotating_light: [CI] Codecov Uploads <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|workflow> *FAILED on main* :rotating_light:"
}
}
]
}
env:
SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
SLACK_WEBHOOK_TYPE: INCOMING_WEBHOOK
test-imports:
runs-on: ${{ matrix.os }}-latest
timeout-minutes: 30
strategy:
fail-fast: false
matrix:
os: [ubuntu, windows]
python-version: ['3.7']
steps:
- uses: actions/checkout@v4
- uses: moonrepo/setup-rust@v1
with:
cache: false
- uses: Swatinem/rust-cache@v2
with:
key: ${{ runner.os }}-build
cache-all-crates: 'true'
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
- name: Unix Build
if: ${{ (matrix.os != 'windows') }}
run: |
python -m venv venv
source venv/bin/activate
python -m pip install maturin
maturin build --out dist
- name: Windows Build
if: ${{ (matrix.os == 'windows') }}
run: |
python -m venv venv
.\venv\Scripts\activate
python -m pip install maturin
maturin build --out dist
- name: Test Imports in Clean Env (Unix)
if: ${{ (matrix.os != 'windows') }}
run: |
rm -rf daft
rm -rf venv
python -m venv venv
source venv/bin/activate
ls -R ./dist
pip install dist/*.whl
python -c 'import daft; from daft import *'
- name: Test Imports in Clean Env (Windows)
if: ${{ (matrix.os == 'windows') }}
run: |
rd -r daft
rd -r venv
python -m venv venv
.\venv\Scripts\activate
$FILES = Get-ChildItem -Path .\dist\*.whl -Force -Recurse
python -m pip install $FILES[0].FullName
python -c 'import daft; from daft import *'
- name: Send Slack notification on failure
uses: slackapi/[email protected]
if: ${{ failure() && (github.ref == 'refs/heads/main') }}
with:
payload: |
{
"blocks": [
{
"type": "section",
"text": {
"type": "mrkdwn",
"text": ":rotating_light: [CI] Python Import Checks <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|workflow> *FAILED on main* :rotating_light:"
}
}
]
}
env:
SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
SLACK_WEBHOOK_TYPE: INCOMING_WEBHOOK
style:
runs-on: ubuntu-latest
timeout-minutes: 15
strategy:
fail-fast: false
matrix:
python-version: ['3.8']
steps:
- uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
- name: Install pre-commit
run: |
pip install --upgrade pip
pip install pre-commit
- uses: moonrepo/setup-rust@v1
with:
cache: false
- uses: Swatinem/rust-cache@v2
with:
key: ${{ runner.os }}-build
cache-all-crates: 'true'
- uses: actions/cache@v3
id: pre-commit-cache
with:
path: ~/.cache/pre-commit/
key: ${{ runner.os }}-python-${{ steps.setup-python.outputs.python-version }}-pre-commit-${{ hashFiles('.pre-commit-config.yaml') }}
- name: Python And Rust Style Check
run: |
pre-commit run --all-files
- name: Send Slack notification on failure
uses: slackapi/[email protected]
if: ${{ failure() && (github.ref == 'refs/heads/main') }}
with:
payload: |
{
"blocks": [
{
"type": "section",
"text": {
"type": "mrkdwn",
"text": ":rotating_light: [CI] Style Checks <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|workflow> *FAILED on main* :rotating_light:"
}
}
]
}
env:
SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
SLACK_WEBHOOK_TYPE: INCOMING_WEBHOOK