[FEAT] GHA workflow to perform tpch benchmarking #3184

Closed · wants to merge 3 commits
29 changes: 29 additions & 0 deletions .github/actions/install/action.yaml
@@ -0,0 +1,29 @@
name: Install uv, rust, and python
description: Install uv, rust, and python
inputs:
  python_version:
    description: The version of python to install
    required: false
    default: '3.9'
runs:
  using: composite
  steps:
  - shell: bash
    run: |
      curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
      CARGO_BIN="$HOME/.cargo/bin"
      echo 'export PATH="$CARGO_BIN:$PATH"' >> $HOME/.bashrc
      echo "$CARGO_BIN" >> $GITHUB_PATH
  - shell: bash
    run: |
      curl -LsSf https://astral.sh/uv/install.sh | sh
      UV_BIN="$HOME/.local/bin"
      echo 'export PATH="$UV_BIN:$PATH"' >> $HOME/.bashrc
      echo "$UV_BIN" >> $GITHUB_PATH
  - shell: bash
    run: |
      source $HOME/.bashrc
  - shell: bash
    run: |
      uv python install ${{ inputs.python_version }}
      uv python pin ${{ inputs.python_version }}
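
After this composite action runs, later steps in the same job can rely on rust, uv, and the pinned Python being on PATH (the action appends to both $GITHUB_PATH and ~/.bashrc). Purely as an illustrative sanity check, a consuming workflow could follow it with a step like:

    rustc --version
    cargo --version
    uv --version
    uv python find   # should resolve to the pinned inputs.python_version
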
65 changes: 65 additions & 0 deletions .github/assets/benchmarking_ray_config.yaml
@@ -0,0 +1,65 @@
cluster_name: performance-comparisons

provider:
  type: aws
  region: us-west-2
  cache_stopped_nodes: true
  security_group:
    GroupName: ray-autoscaler-c1

auth:
  ssh_user: ubuntu
  ssh_private_key: ~/.ssh/ci-github-actions-ray-cluster-key.pem

max_workers: 2
available_node_types:
  ray.head.default:
    resources: {"CPU": 0}
    node_config:
      KeyName: ci-github-actions-ray-cluster-key
      InstanceType: i3.2xlarge
      ImageId: ami-04dd23e62ed049936
      IamInstanceProfile:
        Name: ray-autoscaler-v1

  ray.worker.default:
    min_workers: 2
    max_workers: 2
    resources: {}
    node_config:
      KeyName: ci-github-actions-ray-cluster-key
      InstanceType: i3.2xlarge
      ImageId: ami-04dd23e62ed049936
      IamInstanceProfile:
        Name: ray-autoscaler-v1

setup_commands:
  # Mount drive
  - |
    findmnt /tmp 1> /dev/null
    code=$?
    if [ $code -ne 0 ]; then
      sudo mkfs.ext4 /dev/nvme0n1
      sudo mount -t ext4 /dev/nvme0n1 /tmp
      sudo chmod 777 /tmp
    fi
  # Install dependencies
  - sudo snap install aws-cli --classic
  - curl -LsSf https://astral.sh/uv/install.sh | sh
  - echo 'export PATH="$HOME/.local/bin:$PATH"' >> ~/.bashrc
  - source ~/.bashrc
  - uv python install <<PYTHON_VERSION>>
  - uv python pin <<PYTHON_VERSION>>
  - uv v
  - echo "source $HOME/.venv/bin/activate" >> $HOME/.bashrc
  - source .venv/bin/activate
  - uv pip install pip ray[default] py-spy
  # The GitHub Actions workflow replaces every parameter between `<<...>>` with the
  # actual value determined at runtime of the workflow.
  - uv pip install https://github-actions-artifacts-bucket.s3.us-west-2.amazonaws.com/builds/<<SHA>>/<<WHEEL>>
Contributor

nit: I think {{SHA}} and {{WHEEL}} are more common here for templating

Contributor Author

Can update.

  # Download benchmarking fixtures
  - |
    aws s3 sync \
      s3://eventual-dev-benchmarking-fixtures/uncompressed/tpch-dbgen/<<SCALE_FACTOR>>/<<PARTITION_SIZE>>/parquet/ \
      /tmp/data/<<SCALE_FACTOR>>/<<PARTITION_SIZE>>/parquet/ \
      --quiet
Contributor

Is this intended behavior? Downloading 1TB of data is going to kill the machine. Are we benchmarking off s3 or local?
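
The `<<...>>` placeholders in this config are stamped with concrete values by the calling workflow before the Ray cluster is launched. The exact substitution step is not part of this diff; a minimal sketch, assuming a sed-based replacement and illustrative environment variable names:

    # Hypothetical pre-processing before `ray up`; the variable names are assumptions.
    sed -i \
      -e "s|<<PYTHON_VERSION>>|${PYTHON_VERSION}|g" \
      -e "s|<<SHA>>|${GITHUB_SHA}|g" \
      -e "s|<<WHEEL>>|${WHEEL}|g" \
      -e "s|<<SCALE_FACTOR>>|${SCALE_FACTOR}|g" \
      -e "s|<<PARTITION_SIZE>>|${PARTITION_SIZE}|g" \
      .github/assets/benchmarking_ray_config.yaml
    ray up -y .github/assets/benchmarking_ray_config.yaml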

29 changes: 29 additions & 0 deletions .github/scripts/csv_to_md.py
@@ -0,0 +1,29 @@
import csv
import sys
from pathlib import Path

# Convert a CSV file into a Markdown table.
# Usage: python csv_to_md.py <input.csv> <output.md>

csv_path = Path(sys.argv[1])
assert csv_path.exists()

output_path = Path(sys.argv[2])
assert not output_path.exists()


def make_md_row(row: list[str]) -> str:
    return f'|{"|".join(row)}|\n'


with open(csv_path) as csv_file:
    with open(output_path, "w+") as md_file:
        csv_reader = csv.reader(csv_file)
        header = next(csv_reader)

        # Write the header row, then the Markdown separator row.
        md_file.write(make_md_row(header))
        md_file.write(make_md_row(["-"] * len(header)))

        for row in csv_reader:
            md_file.write(make_md_row(row))
Contributor

Ok, but note that this was a nice-to-have that really should have been in a follow-on
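
For context, a plausible way the benchmark job would use this script, assuming the TPC-H runner writes its timings to a CSV (the file names here are illustrative):

    # Convert the results CSV to a Markdown table and surface it in the job summary.
    python .github/scripts/csv_to_md.py benchmark_results.csv benchmark_results.md
    cat benchmark_results.md >> "$GITHUB_STEP_SUMMARY"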

54 changes: 54 additions & 0 deletions .github/workflows/build-commit-run-tpch.yaml
@@ -0,0 +1,54 @@
name: Build commit and run tpch benchmarks

on:
  workflow_dispatch:
    inputs:
      skip_questions:
        type: string
        description: The TPC-H questions to skip
        required: false
        default: ""
      scale_factor:
        type: choice
        options:
        - '2'
        - '10'
        - '100'
        - '1000'
        description: Which scale factor to use
        required: false
        default: '2'
      partition_size:
        type: choice
        options:
        - '2'
        - '32'
Contributor

I think we should combine SF with partition size, since not all permutations have datasets?

I.e. SF10_part32, SF100_part512...

Or even simpler, just let people select the URL

Contributor Author

Ya, I was thinking of this as well.

Options could be:

  • 2-2
  • 10-32
  • etc.

Contributor Author

Will update.

        - '100'
        - '300'
        - '320'
        - '512'
        description: Which partition size to use
        required: false
        default: '2'
      python_version:
        type: string
        description: The version of python to use
        required: false
        default: '3.9'

jobs:
  build:
    uses: ./.github/workflows/build-commit.yaml
    secrets:
      ACTIONS_AWS_ROLE_ARN: ${{ secrets.ACTIONS_AWS_ROLE_ARN }}

  run:
    needs: build
    uses: ./.github/workflows/run-tpch.yaml
    with:
      wheel: ${{ needs.build.outputs.wheel }}
      skip_questions: ${{ inputs.skip_questions }}
      scale_factor: ${{ inputs.scale_factor }}
      partition_size: ${{ inputs.partition_size }}
      python_version: ${{ inputs.python_version }}
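
Since this workflow is triggered via workflow_dispatch, it can also be started from the command line. An illustrative invocation with the GitHub CLI, using the input names defined above (the values and branch are examples only):

    gh workflow run build-commit-run-tpch.yaml \
      --ref main \
      -f scale_factor=2 \
      -f partition_size=2 \
      -f python_version=3.9
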
68 changes: 68 additions & 0 deletions .github/workflows/build-commit.yaml
@@ -0,0 +1,68 @@
name: Build a Daft commit and store the outputted wheel in AWS S3

on:
  workflow_dispatch:
  workflow_call:
    secrets:
      ACTIONS_AWS_ROLE_ARN:
        description: The ARN of the AWS role to assume
        required: true
    outputs:
      wheel:
        description: The wheel file that was built
        value: ${{ jobs.build-commit.outputs.wheel }}

jobs:
  build-commit:
    runs-on: buildjet-8vcpu-ubuntu-2004
    timeout-minutes: 15 # Remove for ssh debugging
    permissions:
      id-token: write
      contents: read
    outputs:
      wheel: ${{ steps.build_and_upload.outputs.wheel }}
    steps:
    - uses: actions/checkout@v4
      with:
        fetch-depth: 1
    - uses: aws-actions/configure-aws-credentials@v4
      with:
        aws-region: us-west-2
        role-session-name: build-commit-workflow
        role-to-assume: ${{ secrets.ACTIONS_AWS_ROLE_ARN }}
    - uses: ./.github/actions/install
    - uses: buildjet/cache@v4
      with:
        path: ~/target
        key: ${{ runner.os }}-cargo-deps-${{ hashFiles('**/Cargo.lock') }}
        restore-keys: ${{ runner.os }}-cargo-deps-
    - id: build_and_upload
      run: |
        export CARGO_TARGET_DIR=~/target
        uv v
        source .venv/bin/activate
        uv pip install pip maturin boto3

        if ! ls ~/target/wheels/*.whl 1> /dev/null 2>&1; then
          # Build wheel
          maturin build --release
        fi

        count=$(ls ~/target/wheels/*.whl 2> /dev/null | wc -l)
        if [ "$count" -gt 1 ]; then
          echo "Found more than 1 wheel"
          exit 1
        elif [ "$count" -eq 0 ]; then
          echo "Found no wheel files"
          exit 1
        fi

        # Upload wheel
        for file in ~/target/wheels/*.whl; do
          aws s3 cp $file s3://github-actions-artifacts-bucket/builds/${{ github.sha }}/ --acl public-read --no-progress;
          file_basename=$(basename $file)
          echo "wheel=$file_basename" >> $GITHUB_OUTPUT
          echo "Output wheel has been built and stored in S3 at the following location:" >> $GITHUB_STEP_SUMMARY
          echo "https://us-west-2.console.aws.amazon.com/s3/buckets/github-actions-artifacts-bucket?prefix=builds/${{ github.sha }}/" >> $GITHUB_STEP_SUMMARY
        done
        python tools/generate_whl_html_manifest.py
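
The upload step records the wheel filename in $GITHUB_OUTPUT and stores the file under builds/<sha>/ with a public-read ACL, which is what lets the Ray setup_commands install it directly over HTTPS. As a quick, purely illustrative check that the artifact is reachable (the SHA and wheel name below are placeholders):

    SHA=<commit-sha>        # placeholder
    WHEEL=<wheel-filename>  # placeholder
    curl -sfI "https://github-actions-artifacts-bucket.s3.us-west-2.amazonaws.com/builds/${SHA}/${WHEEL}" \
      && echo "wheel is reachable"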